├── bs4
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_htmlparser.py
│   │   ├── test_docs.py
│   │   ├── test_html5lib.py
│   │   ├── test_lxml.py
│   │   ├── test_builder_registry.py
│   │   └── test_soup.py
│   ├── diagnose.py
│   ├── builder
│   │   ├── _lxml.py
│   │   ├── _htmlparser.py
│   │   ├── _html5lib.py
│   │   └── __init__.py
│   ├── __init__.py
│   ├── testing.py
│   └── dammit.py
├── .gitignore
├── CHANGELOG
├── getcookie.user.coffee
├── getcookie.user.js
├── LICENSE
├── README.md
└── tenkou.py

/bs4/tests/__init__.py:
--------------------------------------------------------------------------------
1 | "The beautifulsoup tests."
2 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.css
3 | *.htm
4 | *.txt
5 | *auth*
--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
1 | v0.0.2
2 | + Batch import from local files back into bangumi (the restore feature)
3 | 
4 | v0.0.1
5 | + Export bangumi entries to local files
6 | + Batch delete entries
--------------------------------------------------------------------------------
/bs4/tests/test_htmlparser.py:
--------------------------------------------------------------------------------
1 | """Tests to ensure that the html.parser tree builder generates good
2 | trees."""
3 | 
4 | from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
5 | from bs4.builder import HTMLParserTreeBuilder
6 | 
7 | class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
8 | 
9 |     @property
10 |     def default_builder(self):
11 |         return HTMLParserTreeBuilder()
12 | 
13 |     def test_namespaced_system_doctype(self):
14 |         # html.parser can't handle namespaced doctypes, so skip this one.
15 |         pass
16 | 
17 |     def test_namespaced_public_doctype(self):
18 |         # html.parser can't handle namespaced doctypes, so skip this one.
19 | pass 20 | -------------------------------------------------------------------------------- /getcookie.user.coffee: -------------------------------------------------------------------------------- 1 | ### 2 | // ==UserScript== 3 | // @name getbgmcookie 4 | // @namespace https://github.com/hentaiPanda 5 | // @author niR 6 | // @version 0.0.1 7 | // @license MIT License 8 | // @encoding utf-8 9 | // @grant GM_setClipboard 10 | // @grant GM_registerMenuCommand 11 | // @include http://bangumi.tv/* 12 | // @include http://bgm.tv/* 13 | // @include http://chii.in/* 14 | // ==/UserScript== 15 | ### 16 | 17 | 18 | show = -> 19 | # alert(document.cookie) 20 | cks = document.cookie.split(';') 21 | for i in cks 22 | i = i.trim() 23 | if i.indexOf('chii_auth') is 0 24 | auth = i.split('=')[1] 25 | break 26 | ua = navigator.userAgent 27 | data = ua + '\n' + auth 28 | alert(data) 29 | console.log(data) 30 | GM_setClipboard(data) 31 | alert('已复制到剪贴板') 32 | 33 | 34 | GM_registerMenuCommand('显示UA和AUTH', show) -------------------------------------------------------------------------------- /getcookie.user.js: -------------------------------------------------------------------------------- 1 | /* 2 | // ==UserScript== 3 | // @name getbgmcookie 4 | // @namespace https://github.com/hentaiPanda 5 | // @author niR 6 | // @version 0.0.1 7 | // @license MIT License 8 | // @encoding utf-8 9 | // @grant GM_setClipboard 10 | // @grant GM_registerMenuCommand 11 | // @include http://bangumi.tv/* 12 | // @include http://bgm.tv/* 13 | // @include http://chii.in/* 14 | // ==/UserScript== 15 | */ 16 | var show; 17 | 18 | show = function() { 19 | var auth, cks, data, i, ua, _i, _len; 20 | cks = document.cookie.split(';'); 21 | for (_i = 0, _len = cks.length; _i < _len; _i++) { 22 | i = cks[_i]; 23 | i = i.trim(); 24 | if (i.indexOf('chii_auth') === 0) { 25 | auth = i.split('=')[1]; 26 | break; 27 | } 28 | } 29 | ua = navigator.userAgent; 30 | data = ua + '\n' + auth; 31 | alert(data); 32 | console.log(data); 33 | GM_setClipboard(data); 34 | return alert('已复制到剪贴板'); 35 | }; 36 | 37 | GM_registerMenuCommand('显示UA和AUTH', show); -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 niR 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
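Both userscripts above do the same thing: split `document.cookie` on ';', find the entry that starts with `chii_auth`, and keep everything after the '='. For reference, a rough Python equivalent of that loop (the sample cookie value below is a made-up placeholder):

```
def extract_auth(raw_cookie):
    """Pull the chii_auth value out of a raw cookie string,
    mirroring the loop in getcookie.user.js."""
    for part in raw_cookie.split(';'):
        part = part.strip()
        if part.startswith('chii_auth'):
            # Split on the first '=' only, in case the value
            # itself contains '=' characters.
            return part.split('=', 1)[1]
    return None

print(extract_auth('chii_sid=xyz; chii_auth=LFJDSLAF%LFASJD'))  # LFJDSLAF%LFASJD
```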
-------------------------------------------------------------------------------- /bs4/tests/test_docs.py: -------------------------------------------------------------------------------- 1 | "Test harness for doctests." 2 | 3 | # pylint: disable-msg=E0611,W0142 4 | 5 | __metaclass__ = type 6 | __all__ = [ 7 | 'additional_tests', 8 | ] 9 | 10 | import atexit 11 | import doctest 12 | import os 13 | #from pkg_resources import ( 14 | # resource_filename, resource_exists, resource_listdir, cleanup_resources) 15 | import unittest 16 | 17 | DOCTEST_FLAGS = ( 18 | doctest.ELLIPSIS | 19 | doctest.NORMALIZE_WHITESPACE | 20 | doctest.REPORT_NDIFF) 21 | 22 | 23 | # def additional_tests(): 24 | # "Run the doc tests (README.txt and docs/*, if any exist)" 25 | # doctest_files = [ 26 | # os.path.abspath(resource_filename('bs4', 'README.txt'))] 27 | # if resource_exists('bs4', 'docs'): 28 | # for name in resource_listdir('bs4', 'docs'): 29 | # if name.endswith('.txt'): 30 | # doctest_files.append( 31 | # os.path.abspath( 32 | # resource_filename('bs4', 'docs/%s' % name))) 33 | # kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) 34 | # atexit.register(cleanup_resources) 35 | # return unittest.TestSuite(( 36 | # doctest.DocFileSuite(*doctest_files, **kwargs))) 37 | -------------------------------------------------------------------------------- /bs4/tests/test_html5lib.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html5lib tree builder generates good trees.""" 2 | 3 | import warnings 4 | 5 | try: 6 | from bs4.builder import HTML5TreeBuilder 7 | HTML5LIB_PRESENT = True 8 | except ImportError as e: 9 | HTML5LIB_PRESENT = False 10 | from bs4.element import SoupStrainer 11 | from bs4.testing import ( 12 | HTML5TreeBuilderSmokeTest, 13 | SoupTest, 14 | skipIf, 15 | ) 16 | 17 | @skipIf( 18 | not HTML5LIB_PRESENT, 19 | "html5lib seems not to be present, not testing its tree builder.") 20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): 21 | """See ``HTML5TreeBuilderSmokeTest``.""" 22 | 23 | @property 24 | def default_builder(self): 25 | return HTML5TreeBuilder() 26 | 27 | def test_soupstrainer(self): 28 | # The html5lib tree builder does not support SoupStrainers. 29 | strainer = SoupStrainer("b") 30 | markup = "
<p>A <b>bold</b> statement.</p>"
31 |         with warnings.catch_warnings(record=True) as w:
32 |             soup = self.soup(markup, parse_only=strainer)
33 |         self.assertEqual(
34 |             soup.decode(), self.document_for(markup))
35 | 
36 |         self.assertTrue(
37 |             "the html5lib tree builder doesn't support parse_only" in
38 |             str(w[0].message))
39 | 
40 |     def test_correctly_nested_tables(self):
41 |         """html5lib inserts <tbody> tags where other parsers don't."""
42 |         markup = ('<table id="1">'
43 |                   '<tr>'
44 |                   "<td>Here's another table:"
45 |                   '<table id="2">'
46 |                   '<tr><td>foo</td></tr>'
47 |                   '</table></td>')
48 | 
49 |         self.assertSoupEquals(
50 |             markup,
51 |             '<table id="1"><tbody><tr><td>Here\'s another table:'
52 |             '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
53 |             '</td></tr></tbody></table>')
54 | 
55 |         self.assertSoupEquals(
56 |             "<table><thead><tr><td>Foo</td></tr></thead>"
57 |             "<tbody><tr><td>Bar</td></tr></tbody>"
58 |             "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
59 | 
60 |     def test_xml_declaration_followed_by_doctype(self):
61 |         markup = '''<?xml version="1.0" encoding="utf-8"?>
62 | <!DOCTYPE html>
63 | <html>
64 |   <head>
65 |   </head>
66 |   <body>
67 |    <p>foo</p>
68 |   </body>
69 | </html>'''
70 |         soup = self.soup(markup)
71 |         # Verify that we can reach the <p> tag; this means the tree is connected.
72 |         self.assertEqual(b"<p>foo</p>", soup.p.encode())
73 | 
74 |     def test_reparented_markup(self):
75 |         markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
76 |         soup = self.soup(markup)
77 |         self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
78 |         self.assertEqual(2, len(soup.find_all('p')))
79 | 
80 | 
81 |     def test_reparented_markup_ends_with_whitespace(self):
82 |         markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
83 |         soup = self.soup(markup)
84 |         self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
85 |         self.assertEqual(2, len(soup.find_all('p')))
86 | 
--------------------------------------------------------------------------------
/bs4/tests/test_lxml.py:
--------------------------------------------------------------------------------
1 | """Tests to ensure that the lxml tree builder generates good trees."""
2 | 
3 | import re
4 | import warnings
5 | 
6 | try:
7 |     import lxml.etree
8 |     LXML_PRESENT = True
9 |     LXML_VERSION = lxml.etree.LXML_VERSION
10 | except ImportError as e:
11 |     LXML_PRESENT = False
12 |     LXML_VERSION = (0,)
13 | 
14 | if LXML_PRESENT:
15 |     from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
16 | 
17 | from bs4 import (
18 |     BeautifulSoup,
19 |     BeautifulStoneSoup,
20 |     )
21 | from bs4.element import Comment, Doctype, SoupStrainer
22 | from bs4.testing import skipIf
23 | from bs4.tests import test_htmlparser
24 | from bs4.testing import (
25 |     HTMLTreeBuilderSmokeTest,
26 |     XMLTreeBuilderSmokeTest,
27 |     SoupTest,
28 |     skipIf,
29 | )
30 | 
31 | @skipIf(
32 |     not LXML_PRESENT,
33 |     "lxml seems not to be present, not testing its tree builder.")
34 | class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
35 |     """See ``HTMLTreeBuilderSmokeTest``."""
36 | 
37 |     @property
38 |     def default_builder(self):
39 |         return LXMLTreeBuilder()
40 | 
41 |     def test_out_of_range_entity(self):
42 |         self.assertSoupEquals(
43 |             "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
44 |         self.assertSoupEquals(
45 |             "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
46 |         self.assertSoupEquals(
47 |             "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
48 | 
49 |     # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
50 |     # test if an old version of lxml is installed.
51 | 
52 |     @skipIf(
53 |         not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
54 |         "Skipping doctype test for old version of lxml to avoid segfault.")
55 |     def test_empty_doctype(self):
56 |         soup = self.soup("<!DOCTYPE>")
57 |         doctype = soup.contents[0]
58 |         self.assertEqual("", doctype.strip())
59 | 
60 |     def test_beautifulstonesoup_is_xml_parser(self):
61 |         # Make sure that the deprecated BSS class uses an xml builder
62 |         # if one is installed.
63 |         with warnings.catch_warnings(record=True) as w:
64 |             soup = BeautifulStoneSoup("<b />")
65 |         self.assertEqual("<b/>", str(soup.b))
66 |         self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
67 | 
68 |     def test_real_xhtml_document(self):
69 |         """lxml strips the XML definition from an XHTML doc, which is fine."""
70 |         markup = b"""<?xml version="1.0" encoding="utf-8"?>
71 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
72 | <html xmlns="http://www.w3.org/1999/xhtml">
73 | <head><title>Hello.</title></head>
74 | <body>Goodbye.</body>
75 | </html>"""
76 |         soup = self.soup(markup)
77 |         self.assertEqual(
78 |             soup.encode("utf-8").replace(b"\n", b''),
79 |             markup.replace(b'\n', b'').replace(
80 |                 b'<?xml version="1.0" encoding="utf-8"?>', b''))
81 | 
82 | 
83 | @skipIf(
84 |     not LXML_PRESENT,
85 |     "lxml seems not to be present, not testing its XML tree builder.")
86 | class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
87 |     """See ``XMLTreeBuilderSmokeTest``."""
88 | 
89 |     @property
90 |     def default_builder(self):
91 |         return LXMLTreeBuilderForXML()
92 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ##tenkou転校##
2 | 
3 | ###Batch export / delete / restore your collection entries###
4 | 
5 | Bangumi's connectivity is sometimes poor, and the script can appear stuck in one place
6 | 
7 | Don't panic!
8 | 
9 | If it is only a bad connection, try again at another time
10 | 
11 | ###Usage###
12 | 
13 | A Python 3 script
14 | 
15 | Arguments:
16 | 
17 | ```
18 | tenkou.py [-h] [-d {chii.in,bgm.tv,bangumi.tv}] [-u UID]
19 |           [--password PASSWORD] [--wipe] [-p PATH] [--auth AUTH]
20 |           [--useragent USERAGENT] [--authfile AUTHFILE] [-v]
21 | ```
22 | 
23 | ```
24 | -h, --help                  show this help
25 | -d DOMAIN, --domain DOMAIN  choose the domain; defaults to bgm.tv, with bangumi.tv and chii.in as alternatives
26 | -u UID, --uid UID           your user id
27 | --password PASSWORD         your login password for the site
28 | -p PATH, --path PATH        local output directory; it must already exist
29 | --auth AUTH                 the auth string
30 | --useragent USERAGENT       your browser's User-Agent
31 | --authfile AUTHFILE         location of the file (authfile) holding the User-Agent and the auth string
32 | --wipe                      delete ALL entries! Use with caution!
33 | -r, --restore               restore entries
34 | -v, --version               program version
35 | ```
36 | 
37 | Only the id is required; if you don't need to export watching progress, there is no need to enter a password
38 | 
39 | ###Examples###
40 | 
41 | These are only examples; combine the options as you like
42 | 
43 | ```
44 | 1. tenkou.py -u 9999999    export the entries of the user whose id is 9999999
45 | 
46 | 2. tenkou.py -u 9999999 -p ./bgm_backup    back up into the bgm_backup directory at the current location
47 | 
48 | 3. tenkou.py -u 9999999 --password 123
49 |    log in with the password and export user 9999999's entries, including watching progress
50 | 
51 | 4. tenkou.py -u 9999999 --useragent Mozilla --auth LFJDSLAF%LFASJD
52 |    use the auth string and export user 9999999's entries, including watching progress
53 | 
54 | 5. tenkou.py -u 9999999 --authfile ./authfile.txt
55 |    use an authfile and export user 9999999's entries, including watching progress
56 | 
57 | 6. tenkou.py -u 9999999 --password 123 --wipe
58 |    use the password, export user 9999999's entries including watching progress,
59 |    then delete the entries completely
60 | 
61 | 7. tenkou.py -u 9999999 --useragent Mozilla --auth LFJDSLAF%LFASJD --wipe
62 |    use the auth string, export user 9999999's entries including watching progress,
63 |    then delete the entries completely
64 | 
65 | 8. tenkou.py -u 9999999 --authfile ./authfile.txt --wipe
66 |    use an authfile, export user 9999999's entries including watching progress,
67 |    then delete the entries completely
68 | 
69 | 9. tenkou.py -u 9999999 --password 12345 -p ./backup -r
70 |    use the password, read the files from the backup directory under the current
71 |    working directory, and restore user 9999999's entries
72 | ```
73 | 
74 | **Note: while restoring or deleting entries in bulk, errors caused by network conditions (your own connection, or the server's protection mechanism, if it has one) are fairly likely. During a restore this is not serious, since the local files are still there. But if an error occurs while deleting, you can lose entry records or end up with an incomplete deletion. So although deleting can be combined with exporting in a single run, it is recommended to export once first and delete afterwards.**
75 | 
76 | ###How to get the Auth and the User-Agent###
77 | 
78 | Just look in your browser's developer tools; this Auth is the auth value inside the cookie
79 | 
80 | ###What is an Authfile###
81 | 
82 | Just a plain text file that stores the User-Agent and the auth string
83 | 
84 | The first line is the User-Agent; the second line is the auth string
85 | 
86 | The authfile can have any name; authfile.txt above is only an example
87 | 
88 | ###Do I need to provide both the password and the Auth?###
89 | 
90 | No. The password, the User-Agent plus Auth, and the Authfile are alternatives; any one of the three is enough
91 | 
92 | See the usage examples above
93 | 
94 | ###Is there an easier way to get the Auth and the User-Agent?###
95 | 
96 | A greasemonkey/tampermonkey userscript, getcookie.user.js, is provided alongside tenkou.py
97 | 
98 | Once installed, it shows and automatically copies your User-Agent/Auth on any bangumi page; you can split the two values up and pass them in manually, or simply save them as a new authfile
99 | 
100 | As shown here:
101 | 
102 | Firefox GM
103 | 
104 | ![Firefox menu entry](http://i.imgur.com/2GdaRSn.jpg)
105 | 
106 | Chrome Tampermonkey
107 | 
108 | ![Chrome menu entry](http://i.imgur.com/Qwk6ff0.jpg)
109 | 
110 | The result
111 | 
112 | ![Result](http://i.imgur.com/NW3IYnc.jpg)
113 | 
114 | ###Security? Will you learn my password?###
115 | 
116 | No. Everything is communication between your machine and the site; no information is sent to me
117 | 
118 | As for the password versus the Auth method, there is no big difference, since this only uses the auth value from the cookie anyway
119 | 
120 | 
121 | ###About the backup files that are produced###
122 | 
123 | The backup files are named after
124 | 
125 | part A:
126 | 
127 | * anime
128 | * music
129 | * game
130 | * book
131 | * real (live-action)
132 | 
133 | and part B:
134 | 
135 | * do (watching)
136 | * collect (watched)
137 | * wish (want to watch)
138 | * on_hold (shelved)
139 | * dropped
140 | 
141 | in the form ```bangumi_A_B.txt```
142 | 
143 | For example
144 | 
145 | ```
146 | bangumi_anime_do.txt
147 | bangumi_book_collect.txt
148 | ```
--------------------------------------------------------------------------------
/bs4/tests/test_builder_registry.py:
--------------------------------------------------------------------------------
1 | """Tests of the builder registry."""
2 | 
3 | import unittest
4 | 
5 | from bs4 import BeautifulSoup
6 | from bs4.builder import (
7 |     builder_registry as registry,
8 |     HTMLParserTreeBuilder,
9 |     TreeBuilderRegistry,
10 | )
11 | 
12 | try:
13 |     from bs4.builder import HTML5TreeBuilder
14 |     HTML5LIB_PRESENT = True
15 | except ImportError:
16 |     HTML5LIB_PRESENT = False
17 | 
18 | try:
19 |     from bs4.builder import (
20 |         LXMLTreeBuilderForXML,
21 |         LXMLTreeBuilder,
22 |         )
23 |     LXML_PRESENT = True
24 | except ImportError:
25 |     LXML_PRESENT = False
26 | 
27 | 
28 | class BuiltInRegistryTest(unittest.TestCase):
29 |     """Test the built-in registry with the default builders registered."""
30 | 
31 |     def test_combination(self):
32 |         if LXML_PRESENT:
33 |             self.assertEqual(registry.lookup('fast', 'html'),
34 |                              LXMLTreeBuilder)
35 | 
36 |         if LXML_PRESENT:
37 |             self.assertEqual(registry.lookup('permissive', 'xml'),
38 |                              LXMLTreeBuilderForXML)
39 |         self.assertEqual(registry.lookup('strict', 'html'),
40 |                          HTMLParserTreeBuilder)
41 |         if HTML5LIB_PRESENT:
42 |             self.assertEqual(registry.lookup('html5lib', 'html'),
43 |                              HTML5TreeBuilder)
44 | 
45 |     def test_lookup_by_markup_type(self):
46 |         if LXML_PRESENT:
47 |             self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
48 |             self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
49 |         else:
50 |             self.assertEqual(registry.lookup('xml'), None)
51 |             if HTML5LIB_PRESENT:
52 |                 self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
53 |             else:
54 |                 self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
55 | 
56 |     def test_named_library(self):
57 |         if LXML_PRESENT:
58 |             self.assertEqual(registry.lookup('lxml', 'xml'),
59 |                              LXMLTreeBuilderForXML)
60 |             self.assertEqual(registry.lookup('lxml', 'html'),
61 |                              LXMLTreeBuilder)
62 |         if HTML5LIB_PRESENT:
63 |             self.assertEqual(registry.lookup('html5lib'),
64 |                              HTML5TreeBuilder)
65 | 
66 |         self.assertEqual(registry.lookup('html.parser'),
67 |                          HTMLParserTreeBuilder)
68 | 
69 |     def test_beautifulsoup_constructor_does_lookup(self):
70 |         # You can pass in a string.
71 |         BeautifulSoup("", features="html")
72 |         # Or a list of strings.
73 |         BeautifulSoup("", features=["html", "fast"])
74 | 
75 |         # You'll get an exception if BS can't find an appropriate
76 |         # builder.
77 |         self.assertRaises(ValueError, BeautifulSoup,
78 |                           "", features="no-such-feature")
79 | 
80 | class RegistryTest(unittest.TestCase):
81 |     """Test the TreeBuilderRegistry class in general."""
82 | 
83 |     def setUp(self):
84 |         self.registry = TreeBuilderRegistry()
85 | 
86 |     def builder_for_features(self, *feature_list):
87 |         cls = type('Builder_' + '_'.join(feature_list),
88 |                    (object,), {'features' : feature_list})
89 | 
90 |         self.registry.register(cls)
91 |         return cls
92 | 
93 |     def test_register_with_no_features(self):
94 |         builder = self.builder_for_features()
95 | 
96 |         # Since the builder advertises no features, you can't find it
97 |         # by looking up features.
98 |         self.assertEqual(self.registry.lookup('foo'), None)
99 | 
100 |         # But you can find it by doing a lookup with no features, if
101 |         # this happens to be the only registered builder.
102 |         self.assertEqual(self.registry.lookup(), builder)
103 | 
104 |     def test_register_with_features_makes_lookup_succeed(self):
105 |         builder = self.builder_for_features('foo', 'bar')
106 |         self.assertEqual(self.registry.lookup('foo'), builder)
107 |         self.assertEqual(self.registry.lookup('bar'), builder)
108 | 
109 |     def test_lookup_fails_when_no_builder_implements_feature(self):
110 |         builder = self.builder_for_features('foo', 'bar')
111 |         self.assertEqual(self.registry.lookup('baz'), None)
112 | 
113 |     def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
114 |         builder1 = self.builder_for_features('foo')
115 |         builder2 = self.builder_for_features('bar')
116 |         self.assertEqual(self.registry.lookup(), builder2)
117 | 
118 |     def test_lookup_fails_when_no_tree_builders_registered(self):
119 |         self.assertEqual(self.registry.lookup(), None)
120 | 
121 |     def test_lookup_gets_most_recent_builder_supporting_all_features(self):
122 |         has_one = self.builder_for_features('foo')
123 |         has_the_other = self.builder_for_features('bar')
124 |         has_both_early = self.builder_for_features('foo', 'bar', 'baz')
125 |         has_both_late = self.builder_for_features('foo', 'bar', 'quux')
126 |         lacks_one = self.builder_for_features('bar')
127 |         has_the_other = self.builder_for_features('foo')
128 | 
129 |         # There are two builders featuring 'foo' and 'bar', but
130 |         # the one that also features 'quux' was registered later.
131 |         self.assertEqual(self.registry.lookup('foo', 'bar'),
132 |                          has_both_late)
133 | 
134 |         # There is only one builder featuring 'foo', 'bar', and 'baz'.
135 |         self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
136 |                          has_both_early)
137 | 
138 |     def test_lookup_fails_when_cannot_reconcile_requested_features(self):
139 |         builder1 = self.builder_for_features('foo', 'bar')
140 |         builder2 = self.builder_for_features('foo', 'baz')
141 |         self.assertEqual(self.registry.lookup('bar', 'baz'), None)
142 | 
--------------------------------------------------------------------------------
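The registration and lookup rules these tests document can be exercised standalone; a short sketch with two made-up builder classes:

```
from bs4.builder import TreeBuilderRegistry

registry = TreeBuilderRegistry()

class HtmlBuilder(object):
    features = ['fast', 'html']

class XmlBuilder(object):
    features = ['fast', 'xml']

registry.register(HtmlBuilder)
registry.register(XmlBuilder)

print(registry.lookup('html'))         # HtmlBuilder
print(registry.lookup('fast'))         # XmlBuilder -- most recent registration wins
print(registry.lookup('html', 'xml'))  # None -- no single builder has both features
```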
/bs4/diagnose.py:
--------------------------------------------------------------------------------
1 | """Diagnostic functions, mainly for use when doing tech support."""
2 | import cProfile
3 | from io import StringIO
4 | from html.parser import HTMLParser
5 | import bs4
6 | from bs4 import BeautifulSoup, __version__
7 | from bs4.builder import builder_registry
8 | 
9 | import os
10 | import pstats
11 | import random
12 | import tempfile
13 | import time
14 | import traceback
15 | import sys
16 | import cProfile
17 | 
18 | def diagnose(data):
19 |     """Diagnostic suite for isolating common problems."""
20 |     print("Diagnostic running on Beautiful Soup %s" % __version__)
21 |     print("Python version %s" % sys.version)
22 | 
23 |     basic_parsers = ["html.parser", "html5lib", "lxml"]
24 |     for name in list(basic_parsers):  # iterate over a copy; the loop removes names
25 |         for builder in builder_registry.builders:
26 |             if name in builder.features:
27 |                 break
28 |         else:
29 |             basic_parsers.remove(name)
30 |             print((
31 |                 "I noticed that %s is not installed. Installing it may help." %
32 |                 name))
33 | 
34 |     if 'lxml' in basic_parsers:
35 |         basic_parsers.append(["lxml", "xml"])
36 |         from lxml import etree
37 |         print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
38 | 
39 |     if 'html5lib' in basic_parsers:
40 |         import html5lib
41 |         print("Found html5lib version %s" % html5lib.__version__)
42 | 
43 |     if hasattr(data, 'read'):
44 |         data = data.read()
45 |     elif os.path.exists(data):
46 |         print('"%s" looks like a filename. Reading data from the file.' % data)
47 |         data = open(data).read()
48 |     elif data.startswith("http:") or data.startswith("https:"):
49 |         print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
50 |         print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
51 |         return
52 |     print()
53 | 
54 |     for parser in basic_parsers:
55 |         print("Trying to parse your markup with %s" % parser)
56 |         success = False
57 |         try:
58 |             soup = BeautifulSoup(data, parser)
59 |             success = True
60 |         except Exception as e:
61 |             print("%s could not parse the markup." % parser)
62 |             traceback.print_exc()
63 |         if success:
64 |             print("Here's what %s did with the markup:" % parser)
65 |             print(soup.prettify())
66 | 
67 |         print("-" * 80)
68 | 
69 | def lxml_trace(data, html=True, **kwargs):
70 |     """Print out the lxml events that occur during parsing.
71 | 
72 |     This lets you see how lxml parses a document when no Beautiful
73 |     Soup code is running.
74 | """ 75 | from lxml import etree 76 | for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): 77 | print(("%s, %4s, %s" % (event, element.tag, element.text))) 78 | 79 | class AnnouncingParser(HTMLParser): 80 | """Announces HTMLParser parse events, without doing anything else.""" 81 | 82 | def _p(self, s): 83 | print(s) 84 | 85 | def handle_starttag(self, name, attrs): 86 | self._p("%s START" % name) 87 | 88 | def handle_endtag(self, name): 89 | self._p("%s END" % name) 90 | 91 | def handle_data(self, data): 92 | self._p("%s DATA" % data) 93 | 94 | def handle_charref(self, name): 95 | self._p("%s CHARREF" % name) 96 | 97 | def handle_entityref(self, name): 98 | self._p("%s ENTITYREF" % name) 99 | 100 | def handle_comment(self, data): 101 | self._p("%s COMMENT" % data) 102 | 103 | def handle_decl(self, data): 104 | self._p("%s DECL" % data) 105 | 106 | def unknown_decl(self, data): 107 | self._p("%s UNKNOWN-DECL" % data) 108 | 109 | def handle_pi(self, data): 110 | self._p("%s PI" % data) 111 | 112 | def htmlparser_trace(data): 113 | """Print out the HTMLParser events that occur during parsing. 114 | 115 | This lets you see how HTMLParser parses a document when no 116 | Beautiful Soup code is running. 117 | """ 118 | parser = AnnouncingParser() 119 | parser.feed(data) 120 | 121 | _vowels = "aeiou" 122 | _consonants = "bcdfghjklmnpqrstvwxyz" 123 | 124 | def rword(length=5): 125 | "Generate a random word-like string." 126 | s = '' 127 | for i in range(length): 128 | if i % 2 == 0: 129 | t = _consonants 130 | else: 131 | t = _vowels 132 | s += random.choice(t) 133 | return s 134 | 135 | def rsentence(length=4): 136 | "Generate a random sentence-like string." 137 | return " ".join(rword(random.randint(4,9)) for i in range(length)) 138 | 139 | def rdoc(num_elements=1000): 140 | """Randomly generate an invalid HTML document.""" 141 | tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] 142 | elements = [] 143 | for i in range(num_elements): 144 | choice = random.randint(0,3) 145 | if choice == 0: 146 | # New tag. 147 | tag_name = random.choice(tag_names) 148 | elements.append("<%s>" % tag_name) 149 | elif choice == 1: 150 | elements.append(rsentence(random.randint(1,4))) 151 | elif choice == 2: 152 | # Close a tag. 153 | tag_name = random.choice(tag_names) 154 | elements.append("" % tag_name) 155 | return "" + "\n".join(elements) + "" 156 | 157 | def benchmark_parsers(num_elements=100000): 158 | """Very basic head-to-head performance benchmark.""" 159 | print("Comparative parser benchmark on Beautiful Soup %s" % __version__) 160 | data = rdoc(num_elements) 161 | print("Generated a large invalid HTML document (%d bytes)." % len(data)) 162 | 163 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: 164 | success = False 165 | try: 166 | a = time.time() 167 | soup = BeautifulSoup(data, parser) 168 | b = time.time() 169 | success = True 170 | except Exception as e: 171 | print("%s could not parse the markup." % parser) 172 | traceback.print_exc() 173 | if success: 174 | print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) 175 | 176 | from lxml import etree 177 | a = time.time() 178 | etree.HTML(data) 179 | b = time.time() 180 | print("Raw lxml parsed the markup in %.2fs." % (b-a)) 181 | 182 | import html5lib 183 | parser = html5lib.HTMLParser() 184 | a = time.time() 185 | parser.parse(data) 186 | b = time.time() 187 | print("Raw html5lib parsed the markup in %.2fs." 
% (b-a)) 188 | 189 | def profile(num_elements=100000, parser="lxml"): 190 | 191 | filehandle = tempfile.NamedTemporaryFile() 192 | filename = filehandle.name 193 | 194 | data = rdoc(num_elements) 195 | vars = dict(bs4=bs4, data=data, parser=parser) 196 | cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) 197 | 198 | stats = pstats.Stats(filename) 199 | # stats.strip_dirs() 200 | stats.sort_stats("cumulative") 201 | stats.print_stats('_html5lib|bs4', 50) 202 | 203 | if __name__ == '__main__': 204 | diagnose(sys.stdin.read()) 205 | -------------------------------------------------------------------------------- /bs4/builder/_lxml.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'LXMLTreeBuilderForXML', 3 | 'LXMLTreeBuilder', 4 | ] 5 | 6 | from io import BytesIO 7 | from io import StringIO 8 | import collections 9 | from lxml import etree 10 | from bs4.element import Comment, Doctype, NamespacedAttribute 11 | from bs4.builder import ( 12 | FAST, 13 | HTML, 14 | HTMLTreeBuilder, 15 | PERMISSIVE, 16 | ParserRejectedMarkup, 17 | TreeBuilder, 18 | XML) 19 | from bs4.dammit import EncodingDetector 20 | 21 | LXML = 'lxml' 22 | 23 | class LXMLTreeBuilderForXML(TreeBuilder): 24 | DEFAULT_PARSER_CLASS = etree.XMLParser 25 | 26 | is_xml = True 27 | 28 | # Well, it's permissive by XML parser standards. 29 | features = [LXML, XML, FAST, PERMISSIVE] 30 | 31 | CHUNK_SIZE = 512 32 | 33 | # This namespace mapping is specified in the XML Namespace 34 | # standard. 35 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 36 | 37 | def default_parser(self, encoding): 38 | # This can either return a parser object or a class, which 39 | # will be instantiated with default arguments. 40 | if self._default_parser is not None: 41 | return self._default_parser 42 | return etree.XMLParser( 43 | target=self, strip_cdata=False, recover=True, encoding=encoding) 44 | 45 | def parser_for(self, encoding): 46 | # Use the default parser. 47 | parser = self.default_parser(encoding) 48 | 49 | if isinstance(parser, collections.Callable): 50 | # Instantiate the parser with default arguments 51 | parser = parser(target=self, strip_cdata=False, encoding=encoding) 52 | return parser 53 | 54 | def __init__(self, parser=None, empty_element_tags=None): 55 | # TODO: Issue a warning if parser is present but not a 56 | # callable, since that means there's no way to create new 57 | # parsers for different encodings. 58 | self._default_parser = parser 59 | if empty_element_tags is not None: 60 | self.empty_element_tags = set(empty_element_tags) 61 | self.soup = None 62 | self.nsmaps = [self.DEFAULT_NSMAPS] 63 | 64 | def _getNsTag(self, tag): 65 | # Split the namespace URL out of a fully-qualified lxml tag 66 | # name. Copied from lxml's src/lxml/sax.py. 67 | if tag[0] == '{': 68 | return tuple(tag[1:].split('}', 1)) 69 | else: 70 | return (None, tag) 71 | 72 | def prepare_markup(self, markup, user_specified_encoding=None, 73 | document_declared_encoding=None): 74 | """ 75 | :yield: A series of 4-tuples. 76 | (markup, encoding, declared encoding, 77 | has undergone character replacement) 78 | 79 | Each 4-tuple represents a strategy for parsing the document. 80 | """ 81 | if isinstance(markup, str): 82 | # We were given Unicode. Maybe lxml can parse Unicode on 83 | # this system? 84 | yield markup, None, document_declared_encoding, False 85 | 86 | if isinstance(markup, str): 87 | # No, apparently not. 
Convert the Unicode to UTF-8 and 88 | # tell lxml to parse it as UTF-8. 89 | yield (markup.encode("utf8"), "utf8", 90 | document_declared_encoding, False) 91 | 92 | # Instead of using UnicodeDammit to convert the bytestring to 93 | # Unicode using different encodings, use EncodingDetector to 94 | # iterate over the encodings, and tell lxml to try to parse 95 | # the document as each one in turn. 96 | is_html = not self.is_xml 97 | try_encodings = [user_specified_encoding, document_declared_encoding] 98 | detector = EncodingDetector(markup, try_encodings, is_html) 99 | for encoding in detector.encodings: 100 | yield (detector.markup, encoding, document_declared_encoding, False) 101 | 102 | def feed(self, markup): 103 | if isinstance(markup, bytes): 104 | markup = BytesIO(markup) 105 | elif isinstance(markup, str): 106 | markup = StringIO(markup) 107 | 108 | # Call feed() at least once, even if the markup is empty, 109 | # or the parser won't be initialized. 110 | data = markup.read(self.CHUNK_SIZE) 111 | try: 112 | self.parser = self.parser_for(self.soup.original_encoding) 113 | self.parser.feed(data) 114 | while len(data) != 0: 115 | # Now call feed() on the rest of the data, chunk by chunk. 116 | data = markup.read(self.CHUNK_SIZE) 117 | if len(data) != 0: 118 | self.parser.feed(data) 119 | self.parser.close() 120 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 121 | raise ParserRejectedMarkup(str(e)) 122 | 123 | def close(self): 124 | self.nsmaps = [self.DEFAULT_NSMAPS] 125 | 126 | def start(self, name, attrs, nsmap={}): 127 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 128 | attrs = dict(attrs) 129 | nsprefix = None 130 | # Invert each namespace map as it comes in. 131 | if len(self.nsmaps) > 1: 132 | # There are no new namespaces for this tag, but 133 | # non-default namespaces are in play, so we need a 134 | # separate tag stack to know when they end. 135 | self.nsmaps.append(None) 136 | elif len(nsmap) > 0: 137 | # A new namespace mapping has come into play. 138 | inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) 139 | self.nsmaps.append(inverted_nsmap) 140 | # Also treat the namespace mapping as a set of attributes on the 141 | # tag, so we can recreate it later. 142 | attrs = attrs.copy() 143 | for prefix, namespace in list(nsmap.items()): 144 | attribute = NamespacedAttribute( 145 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 146 | attrs[attribute] = namespace 147 | 148 | # Namespaces are in play. Find any attributes that came in 149 | # from lxml with namespaces attached to their names, and 150 | # turn then into NamespacedAttribute objects. 
151 | new_attrs = {} 152 | for attr, value in list(attrs.items()): 153 | namespace, attr = self._getNsTag(attr) 154 | if namespace is None: 155 | new_attrs[attr] = value 156 | else: 157 | nsprefix = self._prefix_for_namespace(namespace) 158 | attr = NamespacedAttribute(nsprefix, attr, namespace) 159 | new_attrs[attr] = value 160 | attrs = new_attrs 161 | 162 | namespace, name = self._getNsTag(name) 163 | nsprefix = self._prefix_for_namespace(namespace) 164 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) 165 | 166 | def _prefix_for_namespace(self, namespace): 167 | """Find the currently active prefix for the given namespace.""" 168 | if namespace is None: 169 | return None 170 | for inverted_nsmap in reversed(self.nsmaps): 171 | if inverted_nsmap is not None and namespace in inverted_nsmap: 172 | return inverted_nsmap[namespace] 173 | return None 174 | 175 | def end(self, name): 176 | self.soup.endData() 177 | completed_tag = self.soup.tagStack[-1] 178 | namespace, name = self._getNsTag(name) 179 | nsprefix = None 180 | if namespace is not None: 181 | for inverted_nsmap in reversed(self.nsmaps): 182 | if inverted_nsmap is not None and namespace in inverted_nsmap: 183 | nsprefix = inverted_nsmap[namespace] 184 | break 185 | self.soup.handle_endtag(name, nsprefix) 186 | if len(self.nsmaps) > 1: 187 | # This tag, or one of its parents, introduced a namespace 188 | # mapping, so pop it off the stack. 189 | self.nsmaps.pop() 190 | 191 | def pi(self, target, data): 192 | pass 193 | 194 | def data(self, content): 195 | self.soup.handle_data(content) 196 | 197 | def doctype(self, name, pubid, system): 198 | self.soup.endData() 199 | doctype = Doctype.for_name_and_ids(name, pubid, system) 200 | self.soup.object_was_parsed(doctype) 201 | 202 | def comment(self, content): 203 | "Handle comments as Comment objects." 204 | self.soup.endData() 205 | self.soup.handle_data(content) 206 | self.soup.endData(Comment) 207 | 208 | def test_fragment_to_document(self, fragment): 209 | """See `TreeBuilder`.""" 210 | return '\n%s' % fragment 211 | 212 | 213 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): 214 | 215 | features = [LXML, HTML, FAST, PERMISSIVE] 216 | is_xml = False 217 | 218 | def default_parser(self, encoding): 219 | return etree.HTMLParser 220 | 221 | def feed(self, markup): 222 | encoding = self.soup.original_encoding 223 | try: 224 | self.parser = self.parser_for(encoding) 225 | self.parser.feed(markup) 226 | self.parser.close() 227 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 228 | raise ParserRejectedMarkup(str(e)) 229 | 230 | 231 | def test_fragment_to_document(self, fragment): 232 | """See `TreeBuilder`.""" 233 | return '%s' % fragment 234 | -------------------------------------------------------------------------------- /bs4/builder/_htmlparser.py: -------------------------------------------------------------------------------- 1 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" 2 | 3 | __all__ = [ 4 | 'HTMLParserTreeBuilder', 5 | ] 6 | 7 | from html.parser import ( 8 | HTMLParser, 9 | HTMLParseError, 10 | ) 11 | import sys 12 | import warnings 13 | 14 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' 15 | # argument, which we'd like to set to False. Unfortunately, 16 | # http://bugs.python.org/issue13273 makes strict=True a better bet 17 | # before Python 3.2.3. 18 | # 19 | # At the end of this file, we monkeypatch HTMLParser so that 20 | # strict=True works well on Python 3.2.2. 
21 | major, minor, release = sys.version_info[:3] 22 | CONSTRUCTOR_TAKES_STRICT = ( 23 | major > 3 24 | or (major == 3 and minor > 2) 25 | or (major == 3 and minor == 2 and release >= 3)) 26 | 27 | from bs4.element import ( 28 | CData, 29 | Comment, 30 | Declaration, 31 | Doctype, 32 | ProcessingInstruction, 33 | ) 34 | from bs4.dammit import EntitySubstitution, UnicodeDammit 35 | 36 | from bs4.builder import ( 37 | HTML, 38 | HTMLTreeBuilder, 39 | STRICT, 40 | ) 41 | 42 | 43 | HTMLPARSER = 'html.parser' 44 | 45 | class BeautifulSoupHTMLParser(HTMLParser): 46 | def handle_starttag(self, name, attrs): 47 | # XXX namespace 48 | attr_dict = {} 49 | for key, value in attrs: 50 | # Change None attribute values to the empty string 51 | # for consistency with the other tree builders. 52 | if value is None: 53 | value = '' 54 | attr_dict[key] = value 55 | attrvalue = '""' 56 | self.soup.handle_starttag(name, None, None, attr_dict) 57 | 58 | def handle_endtag(self, name): 59 | self.soup.handle_endtag(name) 60 | 61 | def handle_data(self, data): 62 | self.soup.handle_data(data) 63 | 64 | def handle_charref(self, name): 65 | # XXX workaround for a bug in HTMLParser. Remove this once 66 | # it's fixed. 67 | if name.startswith('x'): 68 | real_name = int(name.lstrip('x'), 16) 69 | elif name.startswith('X'): 70 | real_name = int(name.lstrip('X'), 16) 71 | else: 72 | real_name = int(name) 73 | 74 | try: 75 | data = chr(real_name) 76 | except (ValueError, OverflowError) as e: 77 | data = "\N{REPLACEMENT CHARACTER}" 78 | 79 | self.handle_data(data) 80 | 81 | def handle_entityref(self, name): 82 | character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) 83 | if character is not None: 84 | data = character 85 | else: 86 | data = "&%s;" % name 87 | self.handle_data(data) 88 | 89 | def handle_comment(self, data): 90 | self.soup.endData() 91 | self.soup.handle_data(data) 92 | self.soup.endData(Comment) 93 | 94 | def handle_decl(self, data): 95 | self.soup.endData() 96 | if data.startswith("DOCTYPE "): 97 | data = data[len("DOCTYPE "):] 98 | elif data == 'DOCTYPE': 99 | # i.e. "" 100 | data = '' 101 | self.soup.handle_data(data) 102 | self.soup.endData(Doctype) 103 | 104 | def unknown_decl(self, data): 105 | if data.upper().startswith('CDATA['): 106 | cls = CData 107 | data = data[len('CDATA['):] 108 | else: 109 | cls = Declaration 110 | self.soup.endData() 111 | self.soup.handle_data(data) 112 | self.soup.endData(cls) 113 | 114 | def handle_pi(self, data): 115 | self.soup.endData() 116 | if data.endswith("?") and data.lower().startswith("xml"): 117 | # "An XHTML processing instruction using the trailing '?' 118 | # will cause the '?' to be included in data." - HTMLParser 119 | # docs. 120 | # 121 | # Strip the question mark so we don't end up with two 122 | # question marks. 123 | data = data[:-1] 124 | self.soup.handle_data(data) 125 | self.soup.endData(ProcessingInstruction) 126 | 127 | 128 | class HTMLParserTreeBuilder(HTMLTreeBuilder): 129 | 130 | is_xml = False 131 | features = [HTML, STRICT, HTMLPARSER] 132 | 133 | def __init__(self, *args, **kwargs): 134 | if CONSTRUCTOR_TAKES_STRICT: 135 | kwargs['strict'] = False 136 | self.parser_args = (args, kwargs) 137 | 138 | def prepare_markup(self, markup, user_specified_encoding=None, 139 | document_declared_encoding=None): 140 | """ 141 | :return: A 4-tuple (markup, original encoding, encoding 142 | declared within markup, whether any characters had to be 143 | replaced with REPLACEMENT CHARACTER). 
144 | """ 145 | if isinstance(markup, str): 146 | yield (markup, None, None, False) 147 | return 148 | 149 | try_encodings = [user_specified_encoding, document_declared_encoding] 150 | dammit = UnicodeDammit(markup, try_encodings, is_html=True) 151 | yield (dammit.markup, dammit.original_encoding, 152 | dammit.declared_html_encoding, 153 | dammit.contains_replacement_characters) 154 | 155 | def feed(self, markup): 156 | args, kwargs = self.parser_args 157 | parser = BeautifulSoupHTMLParser(*args, **kwargs) 158 | parser.soup = self.soup 159 | try: 160 | parser.feed(markup) 161 | except HTMLParseError as e: 162 | warnings.warn(RuntimeWarning( 163 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 164 | raise e 165 | 166 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some 167 | # 3.2.3 code. This ensures they don't treat markup like
<br/>
as a 168 | # string. 169 | # 170 | # XXX This code can be removed once most Python 3 users are on 3.2.3. 171 | if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: 172 | import re 173 | attrfind_tolerant = re.compile( 174 | r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' 175 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') 176 | HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant 177 | 178 | locatestarttagend = re.compile(r""" 179 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name 180 | (?:\s+ # whitespace before attribute name 181 | (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name 182 | (?:\s*=\s* # value indicator 183 | (?:'[^']*' # LITA-enclosed value 184 | |\"[^\"]*\" # LIT-enclosed value 185 | |[^'\">\s]+ # bare value 186 | ) 187 | )? 188 | ) 189 | )* 190 | \s* # trailing whitespace 191 | """, re.VERBOSE) 192 | BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend 193 | 194 | from html.parser import tagfind, attrfind 195 | 196 | def parse_starttag(self, i): 197 | self.__starttag_text = None 198 | endpos = self.check_for_whole_start_tag(i) 199 | if endpos < 0: 200 | return endpos 201 | rawdata = self.rawdata 202 | self.__starttag_text = rawdata[i:endpos] 203 | 204 | # Now parse the data between i+1 and j into a tag and attrs 205 | attrs = [] 206 | match = tagfind.match(rawdata, i+1) 207 | assert match, 'unexpected call to parse_starttag()' 208 | k = match.end() 209 | self.lasttag = tag = rawdata[i+1:k].lower() 210 | while k < endpos: 211 | if self.strict: 212 | m = attrfind.match(rawdata, k) 213 | else: 214 | m = attrfind_tolerant.match(rawdata, k) 215 | if not m: 216 | break 217 | attrname, rest, attrvalue = m.group(1, 2, 3) 218 | if not rest: 219 | attrvalue = None 220 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 221 | attrvalue[:1] == '"' == attrvalue[-1:]: 222 | attrvalue = attrvalue[1:-1] 223 | if attrvalue: 224 | attrvalue = self.unescape(attrvalue) 225 | attrs.append((attrname.lower(), attrvalue)) 226 | k = m.end() 227 | 228 | end = rawdata[k:endpos].strip() 229 | if end not in (">", "/>"): 230 | lineno, offset = self.getpos() 231 | if "\n" in self.__starttag_text: 232 | lineno = lineno + self.__starttag_text.count("\n") 233 | offset = len(self.__starttag_text) \ 234 | - self.__starttag_text.rfind("\n") 235 | else: 236 | offset = offset + len(self.__starttag_text) 237 | if self.strict: 238 | self.error("junk characters in start tag: %r" 239 | % (rawdata[k:endpos][:20],)) 240 | self.handle_data(rawdata[i:endpos]) 241 | return endpos 242 | if end.endswith('/>'): 243 | # XHTML-style empty tag: 244 | self.handle_startendtag(tag, attrs) 245 | else: 246 | self.handle_starttag(tag, attrs) 247 | if tag in self.CDATA_CONTENT_ELEMENTS: 248 | self.set_cdata_mode(tag) 249 | return endpos 250 | 251 | def set_cdata_mode(self, elem): 252 | self.cdata_elem = elem.lower() 253 | self.interesting = re.compile(r'' % self.cdata_elem, re.I) 254 | 255 | BeautifulSoupHTMLParser.parse_starttag = parse_starttag 256 | BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode 257 | 258 | CONSTRUCTOR_TAKES_STRICT = True 259 | -------------------------------------------------------------------------------- /bs4/builder/_html5lib.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'HTML5TreeBuilder', 3 | ] 4 | 5 | import warnings 6 | from bs4.builder import ( 7 | PERMISSIVE, 8 | HTML, 9 | HTML_5, 10 | HTMLTreeBuilder, 11 | ) 12 | from bs4.element import NamespacedAttribute 13 | import html5lib 14 | from html5lib.constants import namespaces 15 | 
from bs4.element import ( 16 | Comment, 17 | Doctype, 18 | NavigableString, 19 | Tag, 20 | ) 21 | 22 | class HTML5TreeBuilder(HTMLTreeBuilder): 23 | """Use html5lib to build a tree.""" 24 | 25 | features = ['html5lib', PERMISSIVE, HTML_5, HTML] 26 | 27 | def prepare_markup(self, markup, user_specified_encoding): 28 | # Store the user-specified encoding for use later on. 29 | self.user_specified_encoding = user_specified_encoding 30 | yield (markup, None, None, False) 31 | 32 | # These methods are defined by Beautiful Soup. 33 | def feed(self, markup): 34 | if self.soup.parse_only is not None: 35 | warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") 36 | parser = html5lib.HTMLParser(tree=self.create_treebuilder) 37 | doc = parser.parse(markup, encoding=self.user_specified_encoding) 38 | 39 | # Set the character encoding detected by the tokenizer. 40 | if isinstance(markup, str): 41 | # We need to special-case this because html5lib sets 42 | # charEncoding to UTF-8 if it gets Unicode input. 43 | doc.original_encoding = None 44 | else: 45 | doc.original_encoding = parser.tokenizer.stream.charEncoding[0] 46 | 47 | def create_treebuilder(self, namespaceHTMLElements): 48 | self.underlying_builder = TreeBuilderForHtml5lib( 49 | self.soup, namespaceHTMLElements) 50 | return self.underlying_builder 51 | 52 | def test_fragment_to_document(self, fragment): 53 | """See `TreeBuilder`.""" 54 | return '%s' % fragment 55 | 56 | 57 | class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): 58 | 59 | def __init__(self, soup, namespaceHTMLElements): 60 | self.soup = soup 61 | super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) 62 | 63 | def documentClass(self): 64 | self.soup.reset() 65 | return Element(self.soup, self.soup, None) 66 | 67 | def insertDoctype(self, token): 68 | name = token["name"] 69 | publicId = token["publicId"] 70 | systemId = token["systemId"] 71 | 72 | doctype = Doctype.for_name_and_ids(name, publicId, systemId) 73 | self.soup.object_was_parsed(doctype) 74 | 75 | def elementClass(self, name, namespace): 76 | tag = self.soup.new_tag(name, namespace) 77 | return Element(tag, self.soup, namespace) 78 | 79 | def commentClass(self, data): 80 | return TextNode(Comment(data), self.soup) 81 | 82 | def fragmentClass(self): 83 | self.soup = BeautifulSoup("") 84 | self.soup.name = "[document_fragment]" 85 | return Element(self.soup, self.soup, None) 86 | 87 | def appendChild(self, node): 88 | # XXX This code is not covered by the BS4 tests. 
89 | self.soup.append(node.element) 90 | 91 | def getDocument(self): 92 | return self.soup 93 | 94 | def getFragment(self): 95 | return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element 96 | 97 | class AttrList(object): 98 | def __init__(self, element): 99 | self.element = element 100 | self.attrs = dict(self.element.attrs) 101 | def __iter__(self): 102 | return list(self.attrs.items()).__iter__() 103 | def __setitem__(self, name, value): 104 | "set attr", name, value 105 | self.element[name] = value 106 | def items(self): 107 | return list(self.attrs.items()) 108 | def keys(self): 109 | return list(self.attrs.keys()) 110 | def __len__(self): 111 | return len(self.attrs) 112 | def __getitem__(self, name): 113 | return self.attrs[name] 114 | def __contains__(self, name): 115 | return name in list(self.attrs.keys()) 116 | 117 | 118 | class Element(html5lib.treebuilders._base.Node): 119 | def __init__(self, element, soup, namespace): 120 | html5lib.treebuilders._base.Node.__init__(self, element.name) 121 | self.element = element 122 | self.soup = soup 123 | self.namespace = namespace 124 | 125 | def appendChild(self, node): 126 | string_child = child = None 127 | if isinstance(node, str): 128 | # Some other piece of code decided to pass in a string 129 | # instead of creating a TextElement object to contain the 130 | # string. 131 | string_child = child = node 132 | elif isinstance(node, Tag): 133 | # Some other piece of code decided to pass in a Tag 134 | # instead of creating an Element object to contain the 135 | # Tag. 136 | child = node 137 | elif node.element.__class__ == NavigableString: 138 | string_child = child = node.element 139 | else: 140 | child = node.element 141 | 142 | if not isinstance(child, str) and child.parent is not None: 143 | node.element.extract() 144 | 145 | if (string_child and self.element.contents 146 | and self.element.contents[-1].__class__ == NavigableString): 147 | # We are appending a string onto another string. 148 | # TODO This has O(n^2) performance, for input like 149 | # "aaa..." 150 | old_element = self.element.contents[-1] 151 | new_element = self.soup.new_string(old_element + string_child) 152 | old_element.replace_with(new_element) 153 | self.soup._most_recent_element = new_element 154 | else: 155 | if isinstance(node, str): 156 | # Create a brand new NavigableString from this string. 157 | child = self.soup.new_string(node) 158 | 159 | # Tell Beautiful Soup to act as if it parsed this element 160 | # immediately after the parent's last descendant. (Or 161 | # immediately after the parent, if it has no children.) 162 | if self.element.contents: 163 | most_recent_element = self.element._last_descendant(False) 164 | else: 165 | most_recent_element = self.element 166 | 167 | self.soup.object_was_parsed( 168 | child, parent=self.element, 169 | most_recent_element=most_recent_element) 170 | 171 | def getAttributes(self): 172 | return AttrList(self.element) 173 | 174 | def setAttributes(self, attributes): 175 | if attributes is not None and len(attributes) > 0: 176 | 177 | converted_attributes = [] 178 | for name, value in list(attributes.items()): 179 | if isinstance(name, tuple): 180 | new_name = NamespacedAttribute(*name) 181 | del attributes[name] 182 | attributes[new_name] = value 183 | 184 | self.soup.builder._replace_cdata_list_attribute_values( 185 | self.name, attributes) 186 | for name, value in list(attributes.items()): 187 | self.element[name] = value 188 | 189 | # The attributes may contain variables that need substitution. 
190 | # Call set_up_substitutions manually. 191 | # 192 | # The Tag constructor called this method when the Tag was created, 193 | # but we just set/changed the attributes, so call it again. 194 | self.soup.builder.set_up_substitutions(self.element) 195 | attributes = property(getAttributes, setAttributes) 196 | 197 | def insertText(self, data, insertBefore=None): 198 | if insertBefore: 199 | text = TextNode(self.soup.new_string(data), self.soup) 200 | self.insertBefore(data, insertBefore) 201 | else: 202 | self.appendChild(data) 203 | 204 | def insertBefore(self, node, refNode): 205 | index = self.element.index(refNode.element) 206 | if (node.element.__class__ == NavigableString and self.element.contents 207 | and self.element.contents[index-1].__class__ == NavigableString): 208 | # (See comments in appendChild) 209 | old_node = self.element.contents[index-1] 210 | new_str = self.soup.new_string(old_node + node.element) 211 | old_node.replace_with(new_str) 212 | else: 213 | self.element.insert(index, node.element) 214 | node.parent = self 215 | 216 | def removeChild(self, node): 217 | node.element.extract() 218 | 219 | def reparentChildren(self, new_parent): 220 | """Move all of this tag's children into another tag.""" 221 | element = self.element 222 | new_parent_element = new_parent.element 223 | # Determine what this tag's next_element will be once all the children 224 | # are removed. 225 | final_next_element = element.next_sibling 226 | 227 | new_parents_last_descendant = new_parent_element._last_descendant(False, False) 228 | if len(new_parent_element.contents) > 0: 229 | # The new parent already contains children. We will be 230 | # appending this tag's children to the end. 231 | new_parents_last_child = new_parent_element.contents[-1] 232 | new_parents_last_descendant_next_element = new_parents_last_descendant.next_element 233 | else: 234 | # The new parent contains no children. 235 | new_parents_last_child = None 236 | new_parents_last_descendant_next_element = new_parent_element.next_element 237 | 238 | to_append = element.contents 239 | append_after = new_parent.element.contents 240 | if len(to_append) > 0: 241 | # Set the first child's previous_element and previous_sibling 242 | # to elements within the new parent 243 | first_child = to_append[0] 244 | first_child.previous_element = new_parents_last_descendant 245 | first_child.previous_sibling = new_parents_last_child 246 | 247 | # Fix the last child's next_element and next_sibling 248 | last_child = to_append[-1] 249 | last_child.next_element = new_parents_last_descendant_next_element 250 | last_child.next_sibling = None 251 | 252 | for child in to_append: 253 | child.parent = new_parent_element 254 | new_parent_element.contents.append(child) 255 | 256 | # Now that this element has no children, change its .next_element. 
257 | element.contents = [] 258 | element.next_element = final_next_element 259 | 260 | def cloneNode(self): 261 | tag = self.soup.new_tag(self.element.name, self.namespace) 262 | node = Element(tag, self.soup, self.namespace) 263 | for key,value in self.attributes: 264 | node.attributes[key] = value 265 | return node 266 | 267 | def hasContent(self): 268 | return self.element.contents 269 | 270 | def getNameTuple(self): 271 | if self.namespace == None: 272 | return namespaces["html"], self.name 273 | else: 274 | return self.namespace, self.name 275 | 276 | nameTuple = property(getNameTuple) 277 | 278 | class TextNode(Element): 279 | def __init__(self, element, soup): 280 | html5lib.treebuilders._base.Node.__init__(self, None) 281 | self.element = element 282 | self.soup = soup 283 | 284 | def cloneNode(self): 285 | raise NotImplementedError 286 | -------------------------------------------------------------------------------- /bs4/builder/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import itertools 3 | import sys 4 | from bs4.element import ( 5 | CharsetMetaAttributeValue, 6 | ContentMetaAttributeValue, 7 | whitespace_re 8 | ) 9 | 10 | __all__ = [ 11 | 'HTMLTreeBuilder', 12 | 'SAXTreeBuilder', 13 | 'TreeBuilder', 14 | 'TreeBuilderRegistry', 15 | ] 16 | 17 | # Some useful features for a TreeBuilder to have. 18 | FAST = 'fast' 19 | PERMISSIVE = 'permissive' 20 | STRICT = 'strict' 21 | XML = 'xml' 22 | HTML = 'html' 23 | HTML_5 = 'html5' 24 | 25 | 26 | class TreeBuilderRegistry(object): 27 | 28 | def __init__(self): 29 | self.builders_for_feature = defaultdict(list) 30 | self.builders = [] 31 | 32 | def register(self, treebuilder_class): 33 | """Register a treebuilder based on its advertised features.""" 34 | for feature in treebuilder_class.features: 35 | self.builders_for_feature[feature].insert(0, treebuilder_class) 36 | self.builders.insert(0, treebuilder_class) 37 | 38 | def lookup(self, *features): 39 | if len(self.builders) == 0: 40 | # There are no builders at all. 41 | return None 42 | 43 | if len(features) == 0: 44 | # They didn't ask for any features. Give them the most 45 | # recently registered builder. 46 | return self.builders[0] 47 | 48 | # Go down the list of features in order, and eliminate any builders 49 | # that don't match every feature. 50 | features = list(features) 51 | features.reverse() 52 | candidates = None 53 | candidate_set = None 54 | while len(features) > 0: 55 | feature = features.pop() 56 | we_have_the_feature = self.builders_for_feature.get(feature, []) 57 | if len(we_have_the_feature) > 0: 58 | if candidates is None: 59 | candidates = we_have_the_feature 60 | candidate_set = set(candidates) 61 | else: 62 | # Eliminate any candidates that don't have this feature. 63 | candidate_set = candidate_set.intersection( 64 | set(we_have_the_feature)) 65 | 66 | # The only valid candidates are the ones in candidate_set. 67 | # Go through the original list of candidates and pick the first one 68 | # that's in candidate_set. 69 | if candidate_set is None: 70 | return None 71 | for candidate in candidates: 72 | if candidate in candidate_set: 73 | return candidate 74 | return None 75 | 76 | # The BeautifulSoup class will take feature lists from developers and use them 77 | # to look up builders in this registry. 
78 | builder_registry = TreeBuilderRegistry() 79 | 80 | class TreeBuilder(object): 81 | """Turn a document into a Beautiful Soup object tree.""" 82 | 83 | features = [] 84 | 85 | is_xml = False 86 | preserve_whitespace_tags = set() 87 | empty_element_tags = None # A tag will be considered an empty-element 88 | # tag when and only when it has no contents. 89 | 90 | # A value for these tag/attribute combinations is a space- or 91 | # comma-separated list of CDATA, rather than a single CDATA. 92 | cdata_list_attributes = {} 93 | 94 | 95 | def __init__(self): 96 | self.soup = None 97 | 98 | def reset(self): 99 | pass 100 | 101 | def can_be_empty_element(self, tag_name): 102 | """Might a tag with this name be an empty-element tag? 103 | 104 | The final markup may or may not actually present this tag as 105 | self-closing. 106 | 107 | For instance: an HTMLBuilder does not consider a
<p> tag to be 108 | an empty-element tag (it's not in 109 | HTMLBuilder.empty_element_tags). This means an empty <p> tag 110 | will be presented as "<p></p>", not "<p/>
". 111 | 112 | The default implementation has no opinion about which tags are 113 | empty-element tags, so a tag will be presented as an 114 | empty-element tag if and only if it has no contents. 115 | "" will become "", and "bar" will 116 | be left alone. 117 | """ 118 | if self.empty_element_tags is None: 119 | return True 120 | return tag_name in self.empty_element_tags 121 | 122 | def feed(self, markup): 123 | raise NotImplementedError() 124 | 125 | def prepare_markup(self, markup, user_specified_encoding=None, 126 | document_declared_encoding=None): 127 | return markup, None, None, False 128 | 129 | def test_fragment_to_document(self, fragment): 130 | """Wrap an HTML fragment to make it look like a document. 131 | 132 | Different parsers do this differently. For instance, lxml 133 | introduces an empty tag, and html5lib 134 | doesn't. Abstracting this away lets us write simple tests 135 | which run HTML fragments through the parser and compare the 136 | results against other HTML fragments. 137 | 138 | This method should not be used outside of tests. 139 | """ 140 | return fragment 141 | 142 | def set_up_substitutions(self, tag): 143 | return False 144 | 145 | def _replace_cdata_list_attribute_values(self, tag_name, attrs): 146 | """Replaces class="foo bar" with class=["foo", "bar"] 147 | 148 | Modifies its input in place. 149 | """ 150 | if not attrs: 151 | return attrs 152 | if self.cdata_list_attributes: 153 | universal = self.cdata_list_attributes.get('*', []) 154 | tag_specific = self.cdata_list_attributes.get( 155 | tag_name.lower(), None) 156 | for attr in list(attrs.keys()): 157 | if attr in universal or (tag_specific and attr in tag_specific): 158 | # We have a "class"-type attribute whose string 159 | # value is a whitespace-separated list of 160 | # values. Split it into a list. 161 | value = attrs[attr] 162 | if isinstance(value, str): 163 | values = whitespace_re.split(value) 164 | else: 165 | # html5lib sometimes calls setAttributes twice 166 | # for the same tag when rearranging the parse 167 | # tree. On the second call the attribute value 168 | # here is already a list. If this happens, 169 | # leave the value alone rather than trying to 170 | # split it again. 171 | values = value 172 | attrs[attr] = values 173 | return attrs 174 | 175 | class SAXTreeBuilder(TreeBuilder): 176 | """A Beautiful Soup treebuilder that listens for SAX events.""" 177 | 178 | def feed(self, markup): 179 | raise NotImplementedError() 180 | 181 | def close(self): 182 | pass 183 | 184 | def startElement(self, name, attrs): 185 | attrs = dict((key[1], value) for key, value in list(attrs.items())) 186 | #print "Start %s, %r" % (name, attrs) 187 | self.soup.handle_starttag(name, attrs) 188 | 189 | def endElement(self, name): 190 | #print "End %s" % name 191 | self.soup.handle_endtag(name) 192 | 193 | def startElementNS(self, nsTuple, nodeName, attrs): 194 | # Throw away (ns, nodeName) for now. 195 | self.startElement(nodeName, attrs) 196 | 197 | def endElementNS(self, nsTuple, nodeName): 198 | # Throw away (ns, nodeName) for now. 199 | self.endElement(nodeName) 200 | #handler.endElementNS((ns, node.nodeName), node.nodeName) 201 | 202 | def startPrefixMapping(self, prefix, nodeValue): 203 | # Ignore the prefix for now. 204 | pass 205 | 206 | def endPrefixMapping(self, prefix): 207 | # Ignore the prefix for now. 
208 | # handler.endPrefixMapping(prefix) 209 | pass 210 | 211 | def characters(self, content): 212 | self.soup.handle_data(content) 213 | 214 | def startDocument(self): 215 | pass 216 | 217 | def endDocument(self): 218 | pass 219 | 220 | 221 | class HTMLTreeBuilder(TreeBuilder): 222 | """This TreeBuilder knows facts about HTML. 223 | 224 | Such as which tags are empty-element tags. 225 | """ 226 | 227 | preserve_whitespace_tags = set(['pre', 'textarea']) 228 | empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 229 | 'spacer', 'link', 'frame', 'base']) 230 | 231 | # The HTML standard defines these attributes as containing a 232 | # space-separated list of values, not a single value. That is, 233 | # class="foo bar" means that the 'class' attribute has two values, 234 | # 'foo' and 'bar', not the single value 'foo bar'. When we 235 | # encounter one of these attributes, we will parse its value into 236 | # a list of values if possible. Upon output, the list will be 237 | # converted back into a string. 238 | cdata_list_attributes = { 239 | "*" : ['class', 'accesskey', 'dropzone'], 240 | "a" : ['rel', 'rev'], 241 | "link" : ['rel', 'rev'], 242 | "td" : ["headers"], 243 | "th" : ["headers"], 244 | "td" : ["headers"], 245 | "form" : ["accept-charset"], 246 | "object" : ["archive"], 247 | 248 | # These are HTML5 specific, as are *.accesskey and *.dropzone above. 249 | "area" : ["rel"], 250 | "icon" : ["sizes"], 251 | "iframe" : ["sandbox"], 252 | "output" : ["for"], 253 | } 254 | 255 | def set_up_substitutions(self, tag): 256 | # We are only interested in tags 257 | if tag.name != 'meta': 258 | return False 259 | 260 | http_equiv = tag.get('http-equiv') 261 | content = tag.get('content') 262 | charset = tag.get('charset') 263 | 264 | # We are interested in tags that say what encoding the 265 | # document was originally in. This means HTML 5-style 266 | # tags that provide the "charset" attribute. It also means 267 | # HTML 4-style tags that provide the "content" 268 | # attribute and have "http-equiv" set to "content-type". 269 | # 270 | # In both cases we will replace the value of the appropriate 271 | # attribute with a standin object that can take on any 272 | # encoding. 273 | meta_encoding = None 274 | if charset is not None: 275 | # HTML 5 style: 276 | # 277 | meta_encoding = charset 278 | tag['charset'] = CharsetMetaAttributeValue(charset) 279 | 280 | elif (content is not None and http_equiv is not None 281 | and http_equiv.lower() == 'content-type'): 282 | # HTML 4 style: 283 | # 284 | tag['content'] = ContentMetaAttributeValue(content) 285 | 286 | return (meta_encoding is not None) 287 | 288 | def register_treebuilders_from(module): 289 | """Copy TreeBuilders from the given module into this module.""" 290 | # I'm fairly sure this is not the best way to do this. 291 | this_module = sys.modules['bs4.builder'] 292 | for name in module.__all__: 293 | obj = getattr(module, name) 294 | 295 | if issubclass(obj, TreeBuilder): 296 | setattr(this_module, name, obj) 297 | this_module.__all__.append(name) 298 | # Register the builder while we're at it. 299 | this_module.builder_registry.register(obj) 300 | 301 | class ParserRejectedMarkup(Exception): 302 | pass 303 | 304 | # Builders are registered in reverse order of priority, so that custom 305 | # builder registrations will take precedence. In general, we want lxml 306 | # to take precedence over html5lib, because it's faster. And we only 307 | # want to use HTMLParser as a last result. 308 | from . 
import _htmlparser 309 | register_treebuilders_from(_htmlparser) 310 | try: 311 | from . import _html5lib 312 | register_treebuilders_from(_html5lib) 313 | except ImportError: 314 | # They don't have html5lib installed. 315 | pass 316 | try: 317 | from . import _lxml 318 | register_treebuilders_from(_lxml) 319 | except ImportError: 320 | # They don't have lxml installed. 321 | pass 322 | -------------------------------------------------------------------------------- /tenkou.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # version 0.0.2 4 | 5 | import urllib.request, urllib.parse, urllib.error 6 | import argparse, re, os, sys, copy 7 | from bs4 import BeautifulSoup 8 | 9 | 10 | def ptStrLiterally(str): 11 | for i in str: 12 | try: 13 | print(i, end='') 14 | except UnicodeEncodeError as e: 15 | pass 16 | print('') 17 | 18 | 19 | def puts(str): 20 | try: 21 | print(str) 22 | except UnicodeEncodeError as e: 23 | # print(e.reason) 24 | ptStrLiterally(str) 25 | else: 26 | pass 27 | 28 | 29 | def searchSubStr(str, pattern_start, pattern_end, quiet=False): 30 | try: 31 | start = re.search(pattern_start, str).end() 32 | end = re.search(pattern_end, str[start:]).start() 33 | except AttributeError as e: 34 | if not quiet: 35 | print('AttributeError: Can\'t find substring') 36 | return '' 37 | substr = str[start:end+start] 38 | return substr 39 | 40 | 41 | def generateOpener(auth, ua): 42 | opener = urllib.request.build_opener() 43 | if ua: 44 | opener.addheaders = [('User-agent', ua)] 45 | else: 46 | opener.addheaders = [('User-agent', 'Mozilla 5.0')] 47 | if auth: 48 | opener.addheaders.append(('Cookie', 'chii_auth=' + auth)) 49 | return opener 50 | 51 | 52 | def getHtml(url, auth, ua): 53 | opener = generateOpener(auth, ua) 54 | try: 55 | html = opener.open(url).read() 56 | except urllib.error.URLError as e: 57 | print(url) 58 | print('No response...') 59 | return None 60 | else: 61 | return html 62 | 63 | 64 | def getProgress(url, auth, ua): 65 | opener = generateOpener(auth, ua) 66 | try: 67 | html = opener.open(url).read() 68 | soup = BeautifulSoup(html.decode('utf-8')) 69 | p = soup.find('input', id='watchedeps')['value'] 70 | except urllib.error.URLError as e: 71 | print(url) 72 | print('No response...') 73 | return '' 74 | except TypeError as e: 75 | print(url) 76 | print('TyepError: NoneType') 77 | print('Error: the given auth string doesn\'t match the user id') 78 | return '' 79 | else: 80 | return p 81 | 82 | 83 | 84 | def getIDnGh(li): 85 | idngh = li.find('p', class_='collectModify').find_all('a')[1]['onclick'] 86 | # [subid, gh] 87 | return idngh[20:-2].split(", '") 88 | 89 | 90 | def removeItem(domain, subid, auth, ua, gh): 91 | opener = generateOpener(auth, ua) 92 | rmlink = ''.join([domain, '/subject/', subid, '/remove?gh=', gh]) 93 | try: 94 | response = opener.open(rmlink) 95 | except urllib.error.URLError as e: 96 | print(rmlink) 97 | print('Cant erase subject %s' % subid) 98 | return False 99 | else: 100 | return True 101 | 102 | 103 | def export(domain, auth, ua, uid, path, wipe): 104 | cats = ['anime', 'game', 'music', 'book', 'real'] 105 | types = ['do', 'collect', 'wish', 'on_hold', 'dropped'] 106 | # types = ['do', 'wish', 'on_hold', 'dropped'] 107 | # types = ['do', 'on_hold', 'dropped'] 108 | cats_c = {'anime' : '动画', 109 | 'game' : '游戏', 110 | 'music' : '音乐', 111 | 'book' : '书籍', 112 | 'real' : '电视剧'} 113 | types_c = {'do' : '在看', 114 | 'collect' : '看过', 115 | 'wish' : '想看', 
116 | 'on_hold' : '搁置', 117 | 'dropped' : '抛弃'} 118 | cats_types = [(c, t) for c in cats for t in types] 119 | for cat, type in cats_types: 120 | # if cat == 'anime' and type == 'collect': 121 | # continue 122 | # print(types_c[type], '的', cats_c[cat], '\n') 123 | puts(types_c[type] + '的' + cats_c[cat] + '\n') 124 | pg = 1 125 | idx = 1 126 | items = '' 127 | while pg != 0: 128 | url = ''.join( [domain, '/', cat, '/list/', uid, '/', 129 | type, '?page=', str(pg)] ) 130 | html = getHtml(url, auth, ua) 131 | if not html: 132 | break 133 | # # test 134 | # with open("test.html",'w', encoding='utf-8') as ft: 135 | # ft.write(html.decode('utf-8')) 136 | # # test 137 | soup = BeautifulSoup(html.decode('utf-8')) 138 | ul = soup.find(id='browserItemList') 139 | content = '' 140 | for li in ul.children: 141 | inner = li.find('div', class_='inner') 142 | collect_info = inner.find('p', class_='collectInfo') 143 | comment = inner.find('div', id='comment_box') 144 | stars = inner.find('span', class_='starsinfo') 145 | greyname = inner.h3.small 146 | href = domain + inner.h3.a['href'] 147 | iname = str(idx) + '. ' + inner.h3.a.text.strip() + '\n' 148 | iurl = '地址:' + href + '\n' 149 | icollect_info = collect_info.text.strip() + '\n' 150 | if greyname: 151 | igreyname = '原名:' + greyname.text.strip() + '\n' 152 | else: 153 | igreyname = '' 154 | if stars: 155 | istars = '评分:' + stars['class'][0][6:] + '星\n' 156 | else: 157 | istars = '' 158 | if comment: 159 | icomment = ('简评:' 160 | + inner.find('div', 161 | id='comment_box').text.strip() 162 | + '\n') 163 | else: 164 | icomment = '' 165 | if ( (cat == 'anime' or cat == 'real') 166 | and type == 'do' 167 | and auth ): 168 | iprogress = '进度:' + getProgress(href, auth, ua) + '\n' 169 | else: 170 | iprogress = '' 171 | # print(iname) 172 | puts(iname) 173 | content += (iname + igreyname + iurl + istars + icomment 174 | + iprogress + icollect_info + '\n') 175 | idx += 1 176 | if wipe: 177 | # remove item 178 | try: 179 | subid, gh = getIDnGh(li) 180 | removeItem(domain, subid, auth, ua, gh) 181 | except: 182 | print('Error: wrong auth string\n') 183 | if content != '': 184 | items += content 185 | pg += 1 186 | else: 187 | pg = 0 188 | if items == '': 189 | continue 190 | file_name = path + '/bangumi_' + cat + '_' + type + '.txt' 191 | with open(file_name, 'w', encoding='utf-8') as f: 192 | f.write(items) 193 | 194 | 195 | def getAuth(domain, auth, ua, authfile, uid, password): 196 | if auth and ua: 197 | return uid, auth, ua 198 | elif authfile: 199 | with open(authfile, 'r') as af: 200 | user_agent = af.readline() 201 | auth = af.readline() 202 | return uid, auth.strip(), user_agent.strip() 203 | elif not password: 204 | # print('Error: No auth string, no auth file, no password\n') 205 | return uid, auth, ua 206 | url = domain + '/login' 207 | # url = domain + '/FollowTheRabbit' 208 | data = {'cookietime': '2592000', 209 | 'email': uid, 210 | 'password': password, 211 | 'loginsubmit': '登录'} 212 | user_agent = 'Mozilla/5.0 (Elephant 3) Midori 3.5' 213 | data = urllib.parse.urlencode(data).encode('utf-8') 214 | opener = urllib.request.build_opener() 215 | opener.addheaders = [('User-agent', user_agent)] 216 | urllib.request.install_opener(opener) 217 | res = urllib.request.urlopen(url, data) 218 | # print(res.getheaders()) 219 | # print(res.getheader('Set-Cookie')) 220 | cookie = res.getheader('Set-Cookie') 221 | # -- use searchSubStr() -- 222 | # start = re.search('chii_auth=', cookie).end() 223 | # end = re.search('(;|$)', cookie[start:]).start() 224 | # # 
print(cookie[start:end+start]) 225 | # auth = cookie[start:end+start] 226 | # -- use searchSubStr() -- 227 | auth = searchSubStr(cookie, 'chii_auth=', '(;|$)') 228 | return uid, auth, user_agent 229 | 230 | 231 | def post(url, data, auth, ua): 232 | opener = generateOpener(auth, ua) 233 | post_data = urllib.parse.urlencode(data).encode('utf-8') 234 | urllib.request.install_opener(opener) 235 | res = urllib.request.urlopen(url, post_data) 236 | return res 237 | 238 | 239 | def getGH(domain, auth, ua): 240 | opener = generateOpener(auth, ua) 241 | html = opener.open(domain).read().decode('utf-8') 242 | pattern = ' tag), call handle_starttag and then 70 | handle_endtag. 71 | """ 72 | ROOT_TAG_NAME = '[document]' 73 | 74 | # If the end-user gives no indication which tree builder they 75 | # want, look for one with these features. 76 | DEFAULT_BUILDER_FEATURES = ['html', 'fast'] 77 | 78 | ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' 79 | 80 | def __init__(self, markup="", features=None, builder=None, 81 | parse_only=None, from_encoding=None, **kwargs): 82 | """The Soup object is initialized as the 'root tag', and the 83 | provided markup (which can be a string or a file-like object) 84 | is fed into the underlying parser.""" 85 | 86 | if 'convertEntities' in kwargs: 87 | warnings.warn( 88 | "BS4 does not respect the convertEntities argument to the " 89 | "BeautifulSoup constructor. Entities are always converted " 90 | "to Unicode characters.") 91 | 92 | if 'markupMassage' in kwargs: 93 | del kwargs['markupMassage'] 94 | warnings.warn( 95 | "BS4 does not respect the markupMassage argument to the " 96 | "BeautifulSoup constructor. The tree builder is responsible " 97 | "for any necessary markup massage.") 98 | 99 | if 'smartQuotesTo' in kwargs: 100 | del kwargs['smartQuotesTo'] 101 | warnings.warn( 102 | "BS4 does not respect the smartQuotesTo argument to the " 103 | "BeautifulSoup constructor. Smart quotes are always converted " 104 | "to Unicode characters.") 105 | 106 | if 'selfClosingTags' in kwargs: 107 | del kwargs['selfClosingTags'] 108 | warnings.warn( 109 | "BS4 does not respect the selfClosingTags argument to the " 110 | "BeautifulSoup constructor. The tree builder is responsible " 111 | "for understanding self-closing tags.") 112 | 113 | if 'isHTML' in kwargs: 114 | del kwargs['isHTML'] 115 | warnings.warn( 116 | "BS4 does not respect the isHTML argument to the " 117 | "BeautifulSoup constructor. 
You can pass in features='html' " 118 | "or features='xml' to get a builder capable of handling " 119 | "one or the other.") 120 | 121 | def deprecated_argument(old_name, new_name): 122 | if old_name in kwargs: 123 | warnings.warn( 124 | 'The "%s" argument to the BeautifulSoup constructor ' 125 | 'has been renamed to "%s."' % (old_name, new_name)) 126 | value = kwargs[old_name] 127 | del kwargs[old_name] 128 | return value 129 | return None 130 | 131 | parse_only = parse_only or deprecated_argument( 132 | "parseOnlyThese", "parse_only") 133 | 134 | from_encoding = from_encoding or deprecated_argument( 135 | "fromEncoding", "from_encoding") 136 | 137 | if len(kwargs) > 0: 138 | arg = list(kwargs.keys()).pop() 139 | raise TypeError( 140 | "__init__() got an unexpected keyword argument '%s'" % arg) 141 | 142 | if builder is None: 143 | if isinstance(features, str): 144 | features = [features] 145 | if features is None or len(features) == 0: 146 | features = self.DEFAULT_BUILDER_FEATURES 147 | builder_class = builder_registry.lookup(*features) 148 | if builder_class is None: 149 | raise FeatureNotFound( 150 | "Couldn't find a tree builder with the features you " 151 | "requested: %s. Do you need to install a parser library?" 152 | % ",".join(features)) 153 | builder = builder_class() 154 | self.builder = builder 155 | self.is_xml = builder.is_xml 156 | self.builder.soup = self 157 | 158 | self.parse_only = parse_only 159 | 160 | if hasattr(markup, 'read'): # It's a file-type object. 161 | markup = markup.read() 162 | elif len(markup) <= 256: 163 | # Print out warnings for a couple beginner problems 164 | # involving passing non-markup to Beautiful Soup. 165 | # Beautiful Soup will still parse the input as markup, 166 | # just in case that's what the user really wants. 167 | if (isinstance(markup, str) 168 | and not os.path.supports_unicode_filenames): 169 | possible_filename = markup.encode("utf8") 170 | else: 171 | possible_filename = markup 172 | is_file = False 173 | try: 174 | is_file = os.path.exists(possible_filename) 175 | except Exception as e: 176 | # This is almost certainly a problem involving 177 | # characters not valid in filenames on this 178 | # system. Just let it go. 179 | pass 180 | if is_file: 181 | warnings.warn( 182 | '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) 183 | if markup[:5] == "http:" or markup[:6] == "https:": 184 | # TODO: This is ugly but I couldn't get it to work in 185 | # Python 3 otherwise. 186 | if ((isinstance(markup, bytes) and not b' ' in markup) 187 | or (isinstance(markup, str) and not ' ' in markup)): 188 | warnings.warn( 189 | '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) 190 | 191 | for (self.markup, self.original_encoding, self.declared_html_encoding, 192 | self.contains_replacement_characters) in ( 193 | self.builder.prepare_markup(markup, from_encoding)): 194 | self.reset() 195 | try: 196 | self._feed() 197 | break 198 | except ParserRejectedMarkup: 199 | pass 200 | 201 | # Clear out the markup and remove the builder's circular 202 | # reference to this object. 203 | self.markup = None 204 | self.builder.soup = None 205 | 206 | def _feed(self): 207 | # Convert the document to Unicode. 
208 | self.builder.reset() 209 | 210 | self.builder.feed(self.markup) 211 | # Close out any unfinished strings and close all the open tags. 212 | self.endData() 213 | while self.currentTag.name != self.ROOT_TAG_NAME: 214 | self.popTag() 215 | 216 | def reset(self): 217 | Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) 218 | self.hidden = 1 219 | self.builder.reset() 220 | self.current_data = [] 221 | self.currentTag = None 222 | self.tagStack = [] 223 | self.preserve_whitespace_tag_stack = [] 224 | self.pushTag(self) 225 | 226 | def new_tag(self, name, namespace=None, nsprefix=None, **attrs): 227 | """Create a new tag associated with this soup.""" 228 | return Tag(None, self.builder, name, namespace, nsprefix, attrs) 229 | 230 | def new_string(self, s, subclass=NavigableString): 231 | """Create a new NavigableString associated with this soup.""" 232 | navigable = subclass(s) 233 | navigable.setup() 234 | return navigable 235 | 236 | def insert_before(self, successor): 237 | raise NotImplementedError("BeautifulSoup objects don't support insert_before().") 238 | 239 | def insert_after(self, successor): 240 | raise NotImplementedError("BeautifulSoup objects don't support insert_after().") 241 | 242 | def popTag(self): 243 | tag = self.tagStack.pop() 244 | if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: 245 | self.preserve_whitespace_tag_stack.pop() 246 | #print "Pop", tag.name 247 | if self.tagStack: 248 | self.currentTag = self.tagStack[-1] 249 | return self.currentTag 250 | 251 | def pushTag(self, tag): 252 | #print "Push", tag.name 253 | if self.currentTag: 254 | self.currentTag.contents.append(tag) 255 | self.tagStack.append(tag) 256 | self.currentTag = self.tagStack[-1] 257 | if tag.name in self.builder.preserve_whitespace_tags: 258 | self.preserve_whitespace_tag_stack.append(tag) 259 | 260 | def endData(self, containerClass=NavigableString): 261 | if self.current_data: 262 | current_data = ''.join(self.current_data) 263 | # If whitespace is not preserved, and this string contains 264 | # nothing but ASCII spaces, replace it with a single space 265 | # or newline. 266 | if not self.preserve_whitespace_tag_stack: 267 | strippable = True 268 | for i in current_data: 269 | if i not in self.ASCII_SPACES: 270 | strippable = False 271 | break 272 | if strippable: 273 | if '\n' in current_data: 274 | current_data = '\n' 275 | else: 276 | current_data = ' ' 277 | 278 | # Reset the data collector. 279 | self.current_data = [] 280 | 281 | # Should we add this string to the tree at all? 282 | if self.parse_only and len(self.tagStack) <= 1 and \ 283 | (not self.parse_only.text or \ 284 | not self.parse_only.search(current_data)): 285 | return 286 | 287 | o = containerClass(current_data) 288 | self.object_was_parsed(o) 289 | 290 | def object_was_parsed(self, o, parent=None, most_recent_element=None): 291 | """Add an object to the parse tree.""" 292 | parent = parent or self.currentTag 293 | most_recent_element = most_recent_element or self._most_recent_element 294 | o.setup(parent, most_recent_element) 295 | 296 | if most_recent_element is not None: 297 | most_recent_element.next_element = o 298 | self._most_recent_element = o 299 | parent.contents.append(o) 300 | 301 | def _popToTag(self, name, nsprefix=None, inclusivePop=True): 302 | """Pops the tag stack up to and including the most recent 303 | instance of the given tag. 
If inclusivePop is false, pops the tag 304 | stack up to but *not* including the most recent instqance of 305 | the given tag.""" 306 | #print "Popping to %s" % name 307 | if name == self.ROOT_TAG_NAME: 308 | # The BeautifulSoup object itself can never be popped. 309 | return 310 | 311 | most_recently_popped = None 312 | 313 | stack_size = len(self.tagStack) 314 | for i in range(stack_size - 1, 0, -1): 315 | t = self.tagStack[i] 316 | if (name == t.name and nsprefix == t.prefix): 317 | if inclusivePop: 318 | most_recently_popped = self.popTag() 319 | break 320 | most_recently_popped = self.popTag() 321 | 322 | return most_recently_popped 323 | 324 | def handle_starttag(self, name, namespace, nsprefix, attrs): 325 | """Push a start tag on to the stack. 326 | 327 | If this method returns None, the tag was rejected by the 328 | SoupStrainer. You should proceed as if the tag had not occured 329 | in the document. For instance, if this was a self-closing tag, 330 | don't call handle_endtag. 331 | """ 332 | 333 | # print "Start tag %s: %s" % (name, attrs) 334 | self.endData() 335 | 336 | if (self.parse_only and len(self.tagStack) <= 1 337 | and (self.parse_only.text 338 | or not self.parse_only.search_tag(name, attrs))): 339 | return None 340 | 341 | tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, 342 | self.currentTag, self._most_recent_element) 343 | if tag is None: 344 | return tag 345 | if self._most_recent_element: 346 | self._most_recent_element.next_element = tag 347 | self._most_recent_element = tag 348 | self.pushTag(tag) 349 | return tag 350 | 351 | def handle_endtag(self, name, nsprefix=None): 352 | #print "End tag: " + name 353 | self.endData() 354 | self._popToTag(name, nsprefix) 355 | 356 | def handle_data(self, data): 357 | self.current_data.append(data) 358 | 359 | def decode(self, pretty_print=False, 360 | eventual_encoding=DEFAULT_OUTPUT_ENCODING, 361 | formatter="minimal"): 362 | """Returns a string or Unicode representation of this document. 363 | To get Unicode, pass None for encoding.""" 364 | 365 | if self.is_xml: 366 | # Print the XML declaration 367 | encoding_part = '' 368 | if eventual_encoding != None: 369 | encoding_part = ' encoding="%s"' % eventual_encoding 370 | prefix = '\n' % encoding_part 371 | else: 372 | prefix = '' 373 | if not pretty_print: 374 | indent_level = None 375 | else: 376 | indent_level = 0 377 | return prefix + super(BeautifulSoup, self).decode( 378 | indent_level, eventual_encoding, formatter) 379 | 380 | # Alias to make it easier to type import: 'from bs4 import _soup' 381 | _s = BeautifulSoup 382 | _soup = BeautifulSoup 383 | 384 | class BeautifulStoneSoup(BeautifulSoup): 385 | """Deprecated interface to an XML parser.""" 386 | 387 | def __init__(self, *args, **kwargs): 388 | kwargs['features'] = 'xml' 389 | warnings.warn( 390 | 'The BeautifulStoneSoup class is deprecated. Instead of using ' 391 | 'it, pass features="xml" into the BeautifulSoup constructor.') 392 | super(BeautifulStoneSoup, self).__init__(*args, **kwargs) 393 | 394 | 395 | class StopParsing(Exception): 396 | pass 397 | 398 | class FeatureNotFound(ValueError): 399 | pass 400 | 401 | 402 | #By default, act as an HTML pretty-printer. 
403 | if __name__ == '__main__': 404 | import sys 405 | soup = BeautifulSoup(sys.stdin) 406 | print(soup.prettify()) 407 | -------------------------------------------------------------------------------- /bs4/tests/test_soup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Tests of Beautiful Soup as a whole.""" 3 | 4 | import logging 5 | import unittest 6 | import sys 7 | import tempfile 8 | 9 | from bs4 import ( 10 | BeautifulSoup, 11 | BeautifulStoneSoup, 12 | ) 13 | from bs4.element import ( 14 | CharsetMetaAttributeValue, 15 | ContentMetaAttributeValue, 16 | SoupStrainer, 17 | NamespacedAttribute, 18 | ) 19 | import bs4.dammit 20 | from bs4.dammit import ( 21 | EntitySubstitution, 22 | UnicodeDammit, 23 | ) 24 | from bs4.testing import ( 25 | SoupTest, 26 | skipIf, 27 | ) 28 | import warnings 29 | 30 | try: 31 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 32 | LXML_PRESENT = True 33 | except ImportError as e: 34 | LXML_PRESENT = False 35 | 36 | PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) 37 | PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) 38 | 39 | class TestConstructor(SoupTest): 40 | 41 | def test_short_unicode_input(self): 42 | data = "
<h1>éé</h1>
" 43 | soup = self.soup(data) 44 | self.assertEqual("éé", soup.h1.string) 45 | 46 | def test_embedded_null(self): 47 | data = "
<h1>foo\0bar</h1>
" 48 | soup = self.soup(data) 49 | self.assertEqual("foo\0bar", soup.h1.string) 50 | 51 | 52 | class TestDeprecatedConstructorArguments(SoupTest): 53 | 54 | def test_parseOnlyThese_renamed_to_parse_only(self): 55 | with warnings.catch_warnings(record=True) as w: 56 | soup = self.soup("
", parseOnlyThese=SoupStrainer("b")) 57 | msg = str(w[0].message) 58 | self.assertTrue("parseOnlyThese" in msg) 59 | self.assertTrue("parse_only" in msg) 60 | self.assertEqual(b"", soup.encode()) 61 | 62 | def test_fromEncoding_renamed_to_from_encoding(self): 63 | with warnings.catch_warnings(record=True) as w: 64 | utf8 = b"\xc3\xa9" 65 | soup = self.soup(utf8, fromEncoding="utf8") 66 | msg = str(w[0].message) 67 | self.assertTrue("fromEncoding" in msg) 68 | self.assertTrue("from_encoding" in msg) 69 | self.assertEqual("utf8", soup.original_encoding) 70 | 71 | def test_unrecognized_keyword_argument(self): 72 | self.assertRaises( 73 | TypeError, self.soup, "", no_such_argument=True) 74 | 75 | class TestWarnings(SoupTest): 76 | 77 | def test_disk_file_warning(self): 78 | filehandle = tempfile.NamedTemporaryFile() 79 | filename = filehandle.name 80 | try: 81 | with warnings.catch_warnings(record=True) as w: 82 | soup = self.soup(filename) 83 | msg = str(w[0].message) 84 | self.assertTrue("looks like a filename" in msg) 85 | finally: 86 | filehandle.close() 87 | 88 | # The file no longer exists, so Beautiful Soup will no longer issue the warning. 89 | with warnings.catch_warnings(record=True) as w: 90 | soup = self.soup(filename) 91 | self.assertEqual(0, len(w)) 92 | 93 | def test_url_warning(self): 94 | with warnings.catch_warnings(record=True) as w: 95 | soup = self.soup("http://www.crummy.com/") 96 | msg = str(w[0].message) 97 | self.assertTrue("looks like a URL" in msg) 98 | 99 | with warnings.catch_warnings(record=True) as w: 100 | soup = self.soup("http://www.crummy.com/ is great") 101 | self.assertEqual(0, len(w)) 102 | 103 | class TestSelectiveParsing(SoupTest): 104 | 105 | def test_parse_with_soupstrainer(self): 106 | markup = "NoYesNoYes Yes" 107 | strainer = SoupStrainer("b") 108 | soup = self.soup(markup, parse_only=strainer) 109 | self.assertEqual(soup.encode(), b"YesYes Yes") 110 | 111 | 112 | class TestEntitySubstitution(unittest.TestCase): 113 | """Standalone tests of the EntitySubstitution class.""" 114 | def setUp(self): 115 | self.sub = EntitySubstitution 116 | 117 | def test_simple_html_substitution(self): 118 | # Unicode characters corresponding to named HTML entites 119 | # are substituted, and no others. 120 | s = "foo\u2200\N{SNOWMAN}\u00f5bar" 121 | self.assertEqual(self.sub.substitute_html(s), 122 | "foo∀\N{SNOWMAN}õbar") 123 | 124 | def test_smart_quote_substitution(self): 125 | # MS smart quotes are a common source of frustration, so we 126 | # give them a special test. 
127 | quotes = b"\x91\x92foo\x93\x94" 128 | dammit = UnicodeDammit(quotes) 129 | self.assertEqual(self.sub.substitute_html(dammit.markup), 130 | "‘’foo“”") 131 | 132 | def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): 133 | s = 'Welcome to "my bar"' 134 | self.assertEqual(self.sub.substitute_xml(s, False), s) 135 | 136 | def test_xml_attribute_quoting_normally_uses_double_quotes(self): 137 | self.assertEqual(self.sub.substitute_xml("Welcome", True), 138 | '"Welcome"') 139 | self.assertEqual(self.sub.substitute_xml("Bob's Bar", True), 140 | '"Bob\'s Bar"') 141 | 142 | def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self): 143 | s = 'Welcome to "my bar"' 144 | self.assertEqual(self.sub.substitute_xml(s, True), 145 | "'Welcome to \"my bar\"'") 146 | 147 | def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): 148 | s = 'Welcome to "Bob\'s Bar"' 149 | self.assertEqual( 150 | self.sub.substitute_xml(s, True), 151 | '"Welcome to "Bob\'s Bar""') 152 | 153 | def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): 154 | quoted = 'Welcome to "Bob\'s Bar"' 155 | self.assertEqual(self.sub.substitute_xml(quoted), quoted) 156 | 157 | def test_xml_quoting_handles_angle_brackets(self): 158 | self.assertEqual( 159 | self.sub.substitute_xml("foo"), 160 | "foo<bar>") 161 | 162 | def test_xml_quoting_handles_ampersands(self): 163 | self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T") 164 | 165 | def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self): 166 | self.assertEqual( 167 | self.sub.substitute_xml("ÁT&T"), 168 | "&Aacute;T&T") 169 | 170 | def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self): 171 | self.assertEqual( 172 | self.sub.substitute_xml_containing_entities("ÁT&T"), 173 | "ÁT&T") 174 | 175 | def test_quotes_not_html_substituted(self): 176 | """There's no need to do this except inside attribute values.""" 177 | text = 'Bob\'s "bar"' 178 | self.assertEqual(self.sub.substitute_html(text), text) 179 | 180 | 181 | class TestEncodingConversion(SoupTest): 182 | # Test Beautiful Soup's ability to decode and encode from various 183 | # encodings. 184 | 185 | def setUp(self): 186 | super(TestEncodingConversion, self).setUp() 187 | self.unicode_data = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' 188 | self.utf8_data = self.unicode_data.encode("utf-8") 189 | # Just so you know what it looks like. 190 | self.assertEqual( 191 | self.utf8_data, 192 | b'Sacr\xc3\xa9 bleu!') 193 | 194 | def test_ascii_in_unicode_out(self): 195 | # ASCII input is converted to Unicode. The original_encoding 196 | # attribute is set to 'utf-8', a superset of ASCII. 197 | chardet = bs4.dammit.chardet_dammit 198 | logging.disable(logging.WARNING) 199 | try: 200 | def noop(str): 201 | return None 202 | # Disable chardet, which will realize that the ASCII is ASCII. 203 | bs4.dammit.chardet_dammit = noop 204 | ascii = b"a" 205 | soup_from_ascii = self.soup(ascii) 206 | unicode_output = soup_from_ascii.decode() 207 | self.assertTrue(isinstance(unicode_output, str)) 208 | self.assertEqual(unicode_output, self.document_for(ascii.decode())) 209 | self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") 210 | finally: 211 | logging.disable(logging.NOTSET) 212 | bs4.dammit.chardet_dammit = chardet 213 | 214 | def test_unicode_in_unicode_out(self): 215 | # Unicode input is left alone. The original_encoding attribute 216 | # is not set. 
217 | soup_from_unicode = self.soup(self.unicode_data) 218 | self.assertEqual(soup_from_unicode.decode(), self.unicode_data) 219 | self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!') 220 | self.assertEqual(soup_from_unicode.original_encoding, None) 221 | 222 | def test_utf8_in_unicode_out(self): 223 | # UTF-8 input is converted to Unicode. The original_encoding 224 | # attribute is set. 225 | soup_from_utf8 = self.soup(self.utf8_data) 226 | self.assertEqual(soup_from_utf8.decode(), self.unicode_data) 227 | self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!') 228 | 229 | def test_utf8_out(self): 230 | # The internal data structures can be encoded as UTF-8. 231 | soup_from_unicode = self.soup(self.unicode_data) 232 | self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data) 233 | 234 | @skipIf( 235 | PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, 236 | "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") 237 | def test_attribute_name_containing_unicode_characters(self): 238 | markup = '
' 239 | self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) 240 | 241 | class TestUnicodeDammit(unittest.TestCase): 242 | """Standalone tests of UnicodeDammit.""" 243 | 244 | def test_unicode_input(self): 245 | markup = "I'm already Unicode! \N{SNOWMAN}" 246 | dammit = UnicodeDammit(markup) 247 | self.assertEqual(dammit.unicode_markup, markup) 248 | 249 | def test_smart_quotes_to_unicode(self): 250 | markup = b"\x91\x92\x93\x94" 251 | dammit = UnicodeDammit(markup) 252 | self.assertEqual( 253 | dammit.unicode_markup, "\u2018\u2019\u201c\u201d") 254 | 255 | def test_smart_quotes_to_xml_entities(self): 256 | markup = b"\x91\x92\x93\x94" 257 | dammit = UnicodeDammit(markup, smart_quotes_to="xml") 258 | self.assertEqual( 259 | dammit.unicode_markup, "‘’“”") 260 | 261 | def test_smart_quotes_to_html_entities(self): 262 | markup = b"\x91\x92\x93\x94" 263 | dammit = UnicodeDammit(markup, smart_quotes_to="html") 264 | self.assertEqual( 265 | dammit.unicode_markup, "‘’“”") 266 | 267 | def test_smart_quotes_to_ascii(self): 268 | markup = b"\x91\x92\x93\x94" 269 | dammit = UnicodeDammit(markup, smart_quotes_to="ascii") 270 | self.assertEqual( 271 | dammit.unicode_markup, """''""""") 272 | 273 | def test_detect_utf8(self): 274 | utf8 = b"\xc3\xa9" 275 | dammit = UnicodeDammit(utf8) 276 | self.assertEqual(dammit.unicode_markup, '\xe9') 277 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 278 | 279 | def test_convert_hebrew(self): 280 | hebrew = b"\xed\xe5\xec\xf9" 281 | dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) 282 | self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') 283 | self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9') 284 | 285 | def test_dont_see_smart_quotes_where_there_are_none(self): 286 | utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" 287 | dammit = UnicodeDammit(utf_8) 288 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 289 | self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) 290 | 291 | def test_ignore_inappropriate_codecs(self): 292 | utf8_data = "Räksmörgås".encode("utf-8") 293 | dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) 294 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 295 | 296 | def test_ignore_invalid_codecs(self): 297 | utf8_data = "Räksmörgås".encode("utf-8") 298 | for bad_encoding in ['.utf8', '...', 'utF---16.!']: 299 | dammit = UnicodeDammit(utf8_data, [bad_encoding]) 300 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 301 | 302 | def test_detect_html5_style_meta_tag(self): 303 | 304 | for data in ( 305 | b'', 306 | b"", 307 | b"", 308 | b""): 309 | dammit = UnicodeDammit(data, is_html=True) 310 | self.assertEqual( 311 | "euc-jp", dammit.original_encoding) 312 | 313 | def test_last_ditch_entity_replacement(self): 314 | # This is a UTF-8 document that contains bytestrings 315 | # completely incompatible with UTF-8 (ie. encoded with some other 316 | # encoding). 317 | # 318 | # Since there is no consistent encoding for the document, 319 | # Unicode, Dammit will eventually encode the document as UTF-8 320 | # and encode the incompatible characters as REPLACEMENT 321 | # CHARACTER. 322 | # 323 | # If chardet is installed, it will detect that the document 324 | # can be converted into ISO-8859-1 without errors. This happens 325 | # to be the wrong encoding, but it is a consistent encoding, so the 326 | # code we're testing here won't run. 327 | # 328 | # So we temporarily disable chardet if it's present. 
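        # (The three lines of the document below are, in order: a UTF-8 byte
        # order mark, UTF-8-encoded Arabic text, and bytes that form no valid
        # UTF-8 sequence at all.)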
329 | doc = b"""\357\273\277 330 | \330\250\330\252\330\261 331 | \310\322\321\220\312\321\355\344""" 332 | chardet = bs4.dammit.chardet_dammit 333 | logging.disable(logging.WARNING) 334 | try: 335 | def noop(str): 336 | return None 337 | bs4.dammit.chardet_dammit = noop 338 | dammit = UnicodeDammit(doc) 339 | self.assertEqual(True, dammit.contains_replacement_characters) 340 | self.assertTrue("\ufffd" in dammit.unicode_markup) 341 | 342 | soup = BeautifulSoup(doc, "html.parser") 343 | self.assertTrue(soup.contains_replacement_characters) 344 | finally: 345 | logging.disable(logging.NOTSET) 346 | bs4.dammit.chardet_dammit = chardet 347 | 348 | def test_byte_order_mark_removed(self): 349 | # A document written in UTF-16LE will have its byte order marker stripped. 350 | data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' 351 | dammit = UnicodeDammit(data) 352 | self.assertEqual("áé", dammit.unicode_markup) 353 | self.assertEqual("utf-16le", dammit.original_encoding) 354 | 355 | def test_detwingle(self): 356 | # Here's a UTF8 document. 357 | utf8 = ("\N{SNOWMAN}" * 3).encode("utf8") 358 | 359 | # Here's a Windows-1252 document. 360 | windows_1252 = ( 361 | "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" 362 | "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") 363 | 364 | # Through some unholy alchemy, they've been stuck together. 365 | doc = utf8 + windows_1252 + utf8 366 | 367 | # The document can't be turned into UTF-8: 368 | self.assertRaises(UnicodeDecodeError, doc.decode, "utf8") 369 | 370 | # Unicode, Dammit thinks the whole document is Windows-1252, 371 | # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃" 372 | 373 | # But if we run it through fix_embedded_windows_1252, it's fixed: 374 | 375 | fixed = UnicodeDammit.detwingle(doc) 376 | self.assertEqual( 377 | "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) 378 | 379 | def test_detwingle_ignores_multibyte_characters(self): 380 | # Each of these characters has a UTF-8 representation ending 381 | # in \x93. \x93 is a smart quote if interpreted as 382 | # Windows-1252. But our code knows to skip over multibyte 383 | # UTF-8 characters, so they'll survive the process unscathed. 384 | for tricky_unicode_char in ( 385 | "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' 386 | "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' 387 | "\xf0\x90\x90\x93", # This is a CJK character, not sure which one. 388 | ): 389 | input = tricky_unicode_char.encode("utf8") 390 | self.assertTrue(input.endswith(b'\x93')) 391 | output = UnicodeDammit.detwingle(input) 392 | self.assertEqual(output, input) 393 | 394 | class TestNamedspacedAttribute(SoupTest): 395 | 396 | def test_name_may_be_none(self): 397 | a = NamespacedAttribute("xmlns", None) 398 | self.assertEqual(a, "xmlns") 399 | 400 | def test_attribute_is_equivalent_to_colon_separated_string(self): 401 | a = NamespacedAttribute("a", "b") 402 | self.assertEqual("a:b", a) 403 | 404 | def test_attributes_are_equivalent_if_prefix_and_name_identical(self): 405 | a = NamespacedAttribute("a", "b", "c") 406 | b = NamespacedAttribute("a", "b", "c") 407 | self.assertEqual(a, b) 408 | 409 | # The actual namespace is not considered. 410 | c = NamespacedAttribute("a", "b", None) 411 | self.assertEqual(a, c) 412 | 413 | # But name and prefix are important. 
414 | d = NamespacedAttribute("a", "z", "c") 415 | self.assertNotEqual(a, d) 416 | 417 | e = NamespacedAttribute("z", "b", "c") 418 | self.assertNotEqual(a, e) 419 | 420 | 421 | class TestAttributeValueWithCharsetSubstitution(unittest.TestCase): 422 | 423 | def test_content_meta_attribute_value(self): 424 | value = CharsetMetaAttributeValue("euc-jp") 425 | self.assertEqual("euc-jp", value) 426 | self.assertEqual("euc-jp", value.original_value) 427 | self.assertEqual("utf8", value.encode("utf8")) 428 | 429 | 430 | def test_content_meta_attribute_value(self): 431 | value = ContentMetaAttributeValue("text/html; charset=euc-jp") 432 | self.assertEqual("text/html; charset=euc-jp", value) 433 | self.assertEqual("text/html; charset=euc-jp", value.original_value) 434 | self.assertEqual("text/html; charset=utf8", value.encode("utf8")) 435 | -------------------------------------------------------------------------------- /bs4/testing.py: -------------------------------------------------------------------------------- 1 | """Helper classes for tests.""" 2 | 3 | import copy 4 | import functools 5 | import unittest 6 | from unittest import TestCase 7 | from bs4 import BeautifulSoup 8 | from bs4.element import ( 9 | CharsetMetaAttributeValue, 10 | Comment, 11 | ContentMetaAttributeValue, 12 | Doctype, 13 | SoupStrainer, 14 | ) 15 | 16 | from bs4.builder import HTMLParserTreeBuilder 17 | default_builder = HTMLParserTreeBuilder 18 | 19 | 20 | class SoupTest(unittest.TestCase): 21 | 22 | @property 23 | def default_builder(self): 24 | return default_builder() 25 | 26 | def soup(self, markup, **kwargs): 27 | """Build a Beautiful Soup object from markup.""" 28 | builder = kwargs.pop('builder', self.default_builder) 29 | return BeautifulSoup(markup, builder=builder, **kwargs) 30 | 31 | def document_for(self, markup): 32 | """Turn an HTML fragment into a document. 33 | 34 | The details depend on the builder. 35 | """ 36 | return self.default_builder.test_fragment_to_document(markup) 37 | 38 | def assertSoupEquals(self, to_parse, compare_parsed_to=None): 39 | builder = self.default_builder 40 | obj = BeautifulSoup(to_parse, builder=builder) 41 | if compare_parsed_to is None: 42 | compare_parsed_to = to_parse 43 | 44 | self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) 45 | 46 | 47 | class HTMLTreeBuilderSmokeTest(object): 48 | 49 | """A basic test of a treebuilder's competence. 50 | 51 | Any HTML treebuilder, present or future, should be able to pass 52 | these tests. With invalid markup, there's room for interpretation, 53 | and different parsers can handle it differently. But with the 54 | markup in these tests, there's not much room for interpretation. 55 | """ 56 | 57 | def assertDoctypeHandled(self, doctype_fragment): 58 | """Assert that a given doctype string is handled correctly.""" 59 | doctype_str, soup = self._document_with_doctype(doctype_fragment) 60 | 61 | # Make sure a Doctype object was created. 62 | doctype = soup.contents[0] 63 | self.assertEqual(doctype.__class__, Doctype) 64 | self.assertEqual(doctype, doctype_fragment) 65 | self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) 66 | 67 | # Make sure that the doctype was correctly associated with the 68 | # parse tree and that the rest of the document parsed. 69 | self.assertEqual(soup.p.contents[0], 'foo') 70 | 71 | def _document_with_doctype(self, doctype_fragment): 72 | """Generate and parse a document with the given doctype.""" 73 | doctype = '' % doctype_fragment 74 | markup = doctype + '\n
<p>foo</p>
' 75 | soup = self.soup(markup) 76 | return doctype, soup 77 | 78 | def test_normal_doctypes(self): 79 | """Make sure normal, everyday HTML doctypes are handled correctly.""" 80 | self.assertDoctypeHandled("html") 81 | self.assertDoctypeHandled( 82 | 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') 83 | 84 | def test_empty_doctype(self): 85 | soup = self.soup("") 86 | doctype = soup.contents[0] 87 | self.assertEqual("", doctype.strip()) 88 | 89 | def test_public_doctype_with_url(self): 90 | doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' 91 | self.assertDoctypeHandled(doctype) 92 | 93 | def test_system_doctype(self): 94 | self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') 95 | 96 | def test_namespaced_system_doctype(self): 97 | # We can handle a namespaced doctype with a system ID. 98 | self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') 99 | 100 | def test_namespaced_public_doctype(self): 101 | # Test a namespaced doctype with a public id. 102 | self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') 103 | 104 | def test_real_xhtml_document(self): 105 | """A real XHTML document should come out more or less the same as it went in.""" 106 | markup = b""" 107 | 108 | 109 | Hello. 110 | Goodbye. 111 | """ 112 | soup = self.soup(markup) 113 | self.assertEqual( 114 | soup.encode("utf-8").replace(b"\n", b""), 115 | markup.replace(b"\n", b"")) 116 | 117 | def test_deepcopy(self): 118 | """Make sure you can copy the tree builder. 119 | 120 | This is important because the builder is part of a 121 | BeautifulSoup object, and we want to be able to copy that. 122 | """ 123 | copy.deepcopy(self.default_builder) 124 | 125 | def test_p_tag_is_never_empty_element(self): 126 | """A
<p> tag is never designated as an empty-element tag. 127 | 128 | Even if the markup shows it as an empty-element tag, it 129 | shouldn't be presented that way. 130 | """ 131 | soup = self.soup("<p/>") 132 | self.assertFalse(soup.p.is_empty_element) 133 | self.assertEqual(str(soup.p), "<p></p>
") 134 | 135 | def test_unclosed_tags_get_closed(self): 136 | """A tag that's not closed by the end of the document should be closed. 137 | 138 | This applies to all tags except empty-element tags. 139 | """ 140 | self.assertSoupEquals("
<p>", "<p></p>") 141 | self.assertSoupEquals("<b>", "<b></b>") 142 | 143 | self.assertSoupEquals("<br>", "<br/>
") 144 | 145 | def test_br_is_always_empty_element_tag(self): 146 | """A
<br> tag is designated as an empty-element tag. 147 | 148 | Some parsers treat <br></br> as one <br>
tag, some parsers as 149 | two tags, but it should always be an empty-element tag. 150 | """ 151 | soup = self.soup("
<br></br>
") 152 | self.assertTrue(soup.br.is_empty_element) 153 | self.assertEqual(str(soup.br), "
") 154 | 155 | def test_nested_formatting_elements(self): 156 | self.assertSoupEquals("") 157 | 158 | def test_comment(self): 159 | # Comments are represented as Comment objects. 160 | markup = "
<p>foo<!--foobar-->baz</p>
" 161 | self.assertSoupEquals(markup) 162 | 163 | soup = self.soup(markup) 164 | comment = soup.find(text="foobar") 165 | self.assertEqual(comment.__class__, Comment) 166 | 167 | # The comment is properly integrated into the tree. 168 | foo = soup.find(text="foo") 169 | self.assertEqual(comment, foo.next_element) 170 | baz = soup.find(text="baz") 171 | self.assertEqual(comment, baz.previous_element) 172 | 173 | def test_preserved_whitespace_in_pre_and_textarea(self): 174 | """Whitespace must be preserved in
 and ")
177 | 
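    # A companion check one could add here (a sketch, not part of the
    # original suite): the whitespace guarantee above rests on the builder
    # advertising 'pre' and 'textarea' in its preserve_whitespace_tags set.
    def test_whitespace_preserving_tags_are_advertised(self):
        builder = self.default_builder
        self.assertTrue(
            set(['pre', 'textarea']).issubset(builder.preserve_whitespace_tags))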
178 |     def test_nested_inline_elements(self):
179 |         """Inline elements can be nested indefinitely."""
180 |         b_tag = "<b>Inside a B tag</b>"
181 |         self.assertSoupEquals(b_tag)
182 | 
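        # assertSoupEquals (defined on SoupTest above) parses the markup with
        # the builder under test and compares the decoded tree against the
        # original string, so each case here is a full parse/serialize
        # round trip.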
183 |         nested_b_tag = "
<p>A <i>nested <b>tag</b></i></p>
" 184 | self.assertSoupEquals(nested_b_tag) 185 | 186 | double_nested_b_tag = "
<p>A <a>doubly <i>nested <b>tag</b></i></a></p>
" 187 | self.assertSoupEquals(nested_b_tag) 188 | 189 | def test_nested_block_level_elements(self): 190 | """Block elements can be nested.""" 191 | soup = self.soup('
<blockquote><p><b>Foo</b></p></blockquote>
') 192 | blockquote = soup.blockquote 193 | self.assertEqual(blockquote.p.b.string, 'Foo') 194 | self.assertEqual(blockquote.b.string, 'Foo') 195 | 196 | def test_correctly_nested_tables(self): 197 | """One table can go inside another one.""" 198 | markup = ('' 199 | '' 200 | "') 204 | 205 | self.assertSoupEquals( 206 | markup, 207 | '
<td>Here's another table:" 201 | '<table id="2">' 202 | '<tr><td>foo</td></tr>' 203 | '</table></td>') 204 | 205 | self.assertSoupEquals( 206 | markup, 207 | '<table id="1"><tr><td>Here\'s another table:' 208 | '<table id="2"><tr><td>foo</td></tr></table>' 209 | '</td></tr></table>') 210 | 211 | self.assertSoupEquals( 212 | "<table><thead><tr><td>Foo</td></tr></thead>" 213 | "<tbody><tr><td>Bar</td></tr></tbody>" 214 | "<tfoot><tr><td>Baz</td></tr></tfoot></table>
") 215 | 216 | def test_deeply_nested_multivalued_attribute(self): 217 | # html5lib can set the attributes of the same tag many times 218 | # as it rearranges the tree. This has caused problems with 219 | # multivalued attributes. 220 | markup = '
' 221 | soup = self.soup(markup) 222 | self.assertEqual(["css"], soup.div.div['class']) 223 | 224 | def test_angle_brackets_in_attribute_values_are_escaped(self): 225 | self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>') 226 | 227 | def test_entities_in_attributes_converted_to_unicode(self): 228 | expect = '
<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' 229 | self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect) 230 | self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect) 231 | self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect) 232 | self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect) 233 | 234 | def test_entities_in_text_converted_to_unicode(self): 235 | expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' 236 | self.assertSoupEquals("<p>pi&#241;ata</p>", expect) 237 | self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect) 238 | self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect) 239 | self.assertSoupEquals("<p>pi&ntilde;ata</p>
", expect) 240 | 241 | def test_quot_entity_converted_to_quotation_mark(self): 242 | self.assertSoupEquals("
<p>I said &quot;good day!&quot;</p>", 243 | '<p>I said "good day!"</p>
') 244 | 245 | def test_out_of_range_entity(self): 246 | expect = "\N{REPLACEMENT CHARACTER}" 247 | self.assertSoupEquals("�", expect) 248 | self.assertSoupEquals("�", expect) 249 | self.assertSoupEquals("�", expect) 250 | 251 | def test_multipart_strings(self): 252 | "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." 253 | soup = self.soup("
<html><h2>\nfoo</h2><p></p></html>
") 254 | self.assertEqual("p", soup.h2.string.next_element.name) 255 | self.assertEqual("p", soup.p.name) 256 | 257 | def test_basic_namespaces(self): 258 | """Parsers don't need to *understand* namespaces, but at the 259 | very least they should not choke on namespaces or lose 260 | data.""" 261 | 262 | markup = b'4' 263 | soup = self.soup(markup) 264 | self.assertEqual(markup, soup.encode()) 265 | html = soup.html 266 | self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) 267 | self.assertEqual( 268 | 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) 269 | self.assertEqual( 270 | 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) 271 | 272 | def test_multivalued_attribute_value_becomes_list(self): 273 | markup = b'' 274 | soup = self.soup(markup) 275 | self.assertEqual(['foo', 'bar'], soup.a['class']) 276 | 277 | # 278 | # Generally speaking, tests below this point are more tests of 279 | # Beautiful Soup than tests of the tree builders. But parsers are 280 | # weird, so we run these tests separately for every tree builder 281 | # to detect any differences between them. 282 | # 283 | 284 | def test_can_parse_unicode_document(self): 285 | # A seemingly innocuous document... but it's in Unicode! And 286 | # it contains characters that can't be represented in the 287 | # encoding found in the declaration! The horror! 288 | markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' 289 | soup = self.soup(markup) 290 | self.assertEqual('Sacr\xe9 bleu!', soup.body.string) 291 | 292 | def test_soupstrainer(self): 293 | """Parsers should be able to work with SoupStrainers.""" 294 | strainer = SoupStrainer("b") 295 | soup = self.soup("A bold statement", 296 | parse_only=strainer) 297 | self.assertEqual(soup.decode(), "bold") 298 | 299 | def test_single_quote_attribute_values_become_double_quotes(self): 300 | self.assertSoupEquals("", 301 | '') 302 | 303 | def test_attribute_values_with_nested_quotes_are_left_alone(self): 304 | text = """a""" 305 | self.assertSoupEquals(text) 306 | 307 | def test_attribute_values_with_double_nested_quotes_get_quoted(self): 308 | text = """a""" 309 | soup = self.soup(text) 310 | soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' 311 | self.assertSoupEquals( 312 | soup.foo.decode(), 313 | """a""") 314 | 315 | def test_ampersand_in_attribute_value_gets_escaped(self): 316 | self.assertSoupEquals('', 317 | '') 318 | 319 | self.assertSoupEquals( 320 | 'foo', 321 | 'foo') 322 | 323 | def test_escaped_ampersand_in_attribute_value_is_left_alone(self): 324 | self.assertSoupEquals('') 325 | 326 | def test_entities_in_strings_converted_during_parsing(self): 327 | # Both XML and HTML entities are converted to Unicode characters 328 | # during parsing. 329 | text = "
<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>
" 330 | expected = "
<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>
" 331 | self.assertSoupEquals(text, expected) 332 | 333 | def test_smart_quotes_converted_on_the_way_in(self): 334 | # Microsoft smart quotes are converted to Unicode characters during 335 | # parsing. 336 | quote = b"
<p>\x91Foo\x92</p>
" 337 | soup = self.soup(quote) 338 | self.assertEqual( 339 | soup.p.string, 340 | "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") 341 | 342 | def test_non_breaking_spaces_converted_on_the_way_in(self): 343 | soup = self.soup("  ") 344 | self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2) 345 | 346 | def test_entities_converted_on_the_way_out(self): 347 | text = "
<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>
" 348 | expected = "
<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>
".encode("utf-8") 349 | soup = self.soup(text) 350 | self.assertEqual(soup.p.encode("utf-8"), expected) 351 | 352 | def test_real_iso_latin_document(self): 353 | # Smoke test of interrelated functionality, using an 354 | # easy-to-understand document. 355 | 356 | # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. 357 | unicode_html = '
<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>
' 358 | 359 | # That's because we're going to encode it into ISO-Latin-1, and use 360 | # that to test. 361 | iso_latin_html = unicode_html.encode("iso-8859-1") 362 | 363 | # Parse the ISO-Latin-1 HTML. 364 | soup = self.soup(iso_latin_html) 365 | # Encode it to UTF-8. 366 | result = soup.encode("utf-8") 367 | 368 | # What do we expect the result to look like? Well, it would 369 | # look like unicode_html, except that the META tag would say 370 | # UTF-8 instead of ISO-Latin-1. 371 | expected = unicode_html.replace("ISO-Latin-1", "utf-8") 372 | 373 | # And, of course, it would be in UTF-8, not Unicode. 374 | expected = expected.encode("utf-8") 375 | 376 | # Ta-da! 377 | self.assertEqual(result, expected) 378 | 379 | def test_real_shift_jis_document(self): 380 | # Smoke test to make sure the parser can handle a document in 381 | # Shift-JIS encoding, without choking. 382 | shift_jis_html = ( 383 | b'
384 |             b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
385 |             b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
386 |             b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
387 |             b'</pre></body></html>')
388 |         unicode_html = shift_jis_html.decode("shift-jis")
389 |         soup = self.soup(unicode_html)
390 | 
391 |         # Make sure the parse tree is correctly encoded to various
392 |         # encodings.
393 |         self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
394 |         self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
395 | 
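    # The same round trip would be expected to hold for the original
    # codec as well (a sketch, not asserted above):
    #
    #   self.assertEqual(
    #       soup.encode("shift-jis"), unicode_html.encode("shift-jis"))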
396 |     def test_real_hebrew_document(self):
397 |         # A real-world test to make sure we can convert ISO-8859-8 (a
398 |         # Hebrew encoding) to UTF-8.
399 |         hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
400 |         soup = self.soup(
401 |             hebrew_document, from_encoding="iso8859-8")
402 |         self.assertEqual(soup.original_encoding, 'iso8859-8')
403 |         self.assertEqual(
404 |             soup.encode('utf-8'),
405 |             hebrew_document.decode("iso8859-8").encode("utf-8"))
406 | 
407 |     def test_meta_tag_reflects_current_encoding(self):
408 |         # Here's the <meta> tag saying that a document is
409 |         # encoded in Shift-JIS.
410 |         meta_tag = ('<meta content="text/html; charset=x-sjis" '
411 |                     'http-equiv="Content-type"/>')
412 | 
413 |         # Here's a document incorporating that meta tag.
414 |         shift_jis_html = (
415 |             '<html><head>\n%s\n'
416 |             '<meta http-equiv="Content-language" content="ja"/>'
417 |             '</head><body>Shift-JIS markup goes here.') % meta_tag
418 |         soup = self.soup(shift_jis_html)
419 | 
420 |         # Parse the document, and the charset is seemingly unaffected.
421 |         parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
422 |         content = parsed_meta['content']
423 |         self.assertEqual('text/html; charset=x-sjis', content)
424 | 
425 |         # But that value is actually a ContentMetaAttributeValue object.
426 |         self.assertTrue(isinstance(content, ContentMetaAttributeValue))
427 | 
428 |         # And it will take on a value that reflects its current
429 |         # encoding.
430 |         self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
431 | 
432 |         # For the rest of the story, see TestSubstitutions in
433 |         # test_tree.py.
434 | 
435 |     def test_html5_style_meta_tag_reflects_current_encoding(self):
436 |         # Here's the <meta> tag saying that a document is
437 |         # encoded in Shift-JIS.
438 |         meta_tag = ('<meta id="encoding" charset="x-sjis" />')
439 | 
440 |         # Here's a document incorporating that meta tag.
441 |         shift_jis_html = (
442 |             '<html><head>\n%s\n'
443 |             '<meta http-equiv="Content-language" content="ja"/>'
444 |             '</head><body>Shift-JIS markup goes here.') % meta_tag
445 |         soup = self.soup(shift_jis_html)
446 | 
447 |         # Parse the document, and the charset is seemingly unaffected.
448 |         parsed_meta = soup.find('meta', id="encoding")
449 |         charset = parsed_meta['charset']
450 |         self.assertEqual('x-sjis', charset)
451 | 
452 |         # But that value is actually a CharsetMetaAttributeValue object.
453 |         self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
454 | 
455 |         # And it will take on a value that reflects its current
456 |         # encoding.
457 |         self.assertEqual('utf8', charset.encode("utf8"))
458 | 
459 |     def test_tag_with_no_attributes_can_have_attributes_added(self):
460 |         data = self.soup("<a>text</a>")
461 |         data.a['foo'] = 'bar'
462 |         self.assertEqual('<a foo="bar">text</a>', data.a.decode())
463 | 
464 | class XMLTreeBuilderSmokeTest(object):
465 | 
466 |     def test_docstring_generated(self):
467 |         soup = self.soup("<root/>")
468 |         self.assertEqual(
469 |             soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
470 | 
471 |     def test_real_xhtml_document(self):
472 |         """A real XHTML document should come out *exactly* the same as it went in."""
473 |         markup = b"""<?xml version="1.0" encoding="utf-8"?>
474 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
475 | <html xmlns="http://www.w3.org/1999/xhtml">
476 | <head><title>Hello.</title></head>
477 | <body>Goodbye.</body>
478 | </html>"""
479 |         soup = self.soup(markup)
480 |         self.assertEqual(
481 |             soup.encode("utf-8"), markup)
482 | 
483 |     def test_formatter_processes_script_tag_for_xml_documents(self):
484 |         doc = """
485 |   <script type="text/javascript">
486 |   </script>
487 | """
488 |         soup = BeautifulSoup(doc, "xml")
489 |         # lxml would have stripped this while parsing, but we can add
490 |         # it later.
491 |         soup.script.string = 'console.log("< < hey > > ");'
492 |         encoded = soup.encode()
493 |         self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
494 | 
495 |     def test_can_parse_unicode_document(self):
496 |         markup = '<?xml version="1.0" encoding="euc-jp"?><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
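        # (The declaration claims euc-jp, but the markup is passed in
        # as a str, so the parser has to ignore the declared encoding.)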
497 |         soup = self.soup(markup)
498 |         self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
499 | 
500 |     def test_popping_namespaced_tag(self):
501 |         markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
502 |         soup = self.soup(markup)
503 |         self.assertEqual(
504 |             str(soup.rss), markup)
505 | 
506 |     def test_docstring_includes_correct_encoding(self):
507 |         soup = self.soup("<root/>")
508 |         self.assertEqual(
509 |             soup.encode("latin1"),
510 |             b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
511 | 
512 |     def test_large_xml_document(self):
513 |         """A large XML document should come out the same as it went in."""
514 |         markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
515 |                   + b'0' * (2**12)
516 |                   + b'</root>')
517 |         soup = self.soup(markup)
518 |         self.assertEqual(soup.encode("utf-8"), markup)
519 | 
520 | 
521 |     def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
522 |         self.assertSoupEquals("<p>", "<p/>")
523 |         self.assertSoupEquals("<p>foo</p>")
524 | 
525 |     def test_namespaces_are_preserved(self):
526 |         markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
527 |         soup = self.soup(markup)
528 |         root = soup.root
529 |         self.assertEqual("http://example.com/", root['xmlns:a'])
530 |         self.assertEqual("http://example.net/", root['xmlns:b'])
531 | 
532 |     def test_closing_namespaced_tag(self):
533 |         markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
534 |         soup = self.soup(markup)
535 |         self.assertEqual(str(soup.p), markup)
536 | 
537 |     def test_namespaced_attributes(self):
538 |         markup = '<foo xsi:method="POST"></foo>'
539 |         soup = self.soup(markup)
540 |         self.assertEqual(str(soup.foo), markup)
541 | 
542 |     def test_namespaced_attributes_xml_namespace(self):
543 |         markup = '<foo xml:lang="fr">bar</foo>'
544 |         soup = self.soup(markup)
545 |         self.assertEqual(str(soup.foo), markup)
546 | 
547 | class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
548 |     """Smoke test for a tree builder that supports HTML5."""
549 | 
550 |     def test_real_xhtml_document(self):
551 |         # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
552 |         # XHTML documents in any particular way.
553 |         pass
554 | 
555 |     def test_html_tags_have_namespace(self):
556 |         markup = "<a>"
557 |         soup = self.soup(markup)
558 |         self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
559 | 
560 |     def test_svg_tags_have_namespace(self):
561 |         markup = '<svg><circle/></svg>'
562 |         soup = self.soup(markup)
563 |         namespace = "http://www.w3.org/2000/svg"
564 |         self.assertEqual(namespace, soup.svg.namespace)
565 |         self.assertEqual(namespace, soup.circle.namespace)
566 | 
567 | 
568 |     def test_mathml_tags_have_namespace(self):
569 |         markup = '<math><msqrt>5</msqrt></math>'
570 |         soup = self.soup(markup)
571 |         namespace = 'http://www.w3.org/1998/Math/MathML'
572 |         self.assertEqual(namespace, soup.math.namespace)
573 |         self.assertEqual(namespace, soup.msqrt.namespace)
574 | 
575 |     def test_xml_declaration_becomes_comment(self):
576 |         markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
577 |         soup = self.soup(markup)
578 |         self.assertTrue(isinstance(soup.contents[0], Comment))
579 |         self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
580 |         self.assertEqual("html", soup.contents[0].next_element.name)
581 | 
582 | def skipIf(condition, reason):
583 |     def nothing(test, *args, **kwargs):
584 |         return None
585 | 
586 |     def decorator(test_item):
587 |         if condition:
588 |             return nothing
589 |         else:
590 |             return test_item
591 | 
592 |     return decorator
593 | 
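# A usage sketch for skipIf (LXML_PRESENT is illustrative here; the
# real test modules define their own feature flags):
#
#   @skipIf(not LXML_PRESENT, "lxml is not installed")
#   def test_lxml_specific_behavior(self):
#       ...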
--------------------------------------------------------------------------------
/bs4/dammit.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Beautiful Soup bonus library: Unicode, Dammit
3 | 
4 | This library converts a bytestream to Unicode through any means
5 | necessary. It is heavily based on code from Mark Pilgrim's Universal
6 | Feed Parser. It works best on XML and HTML, but it does not rewrite the
7 | XML or HTML to reflect a new encoding; that's the tree builder's job.
8 | """
9 | 
10 | import codecs
11 | from html.entities import codepoint2name
12 | import re
13 | import logging
14 | import string
15 | 
16 | # Import a library to autodetect character encodings.
17 | chardet_type = None
18 | try:
19 |     # First try the fast C implementation.
20 |     #  PyPI package: cchardet
21 |     import cchardet
22 |     def chardet_dammit(s):
23 |         return cchardet.detect(s)['encoding']
24 | except ImportError:
25 |     try:
26 |         # Fall back to the pure Python implementation
27 |         #  Debian package: python-chardet
28 |         #  PyPI package: chardet
29 |         import chardet
30 |         def chardet_dammit(s):
31 |             return chardet.detect(s)['encoding']
32 |         #import chardet.constants
33 |         #chardet.constants._debug = 1
34 |     except ImportError:
35 |         # No chardet available.
36 |         def chardet_dammit(s):
37 |             return None
38 | 
39 | # Available from http://cjkpython.i18n.org/.
40 | try:
41 |     import iconv_codec
42 | except ImportError:
43 |     pass
44 | 
45 | xml_encoding_re = re.compile(
46 |     '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
47 | html_meta_re = re.compile(
48 |     '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
49 | 
50 | class EntitySubstitution(object):
51 | 
52 |     """Substitute XML or HTML entities for the corresponding characters."""
53 | 
54 |     def _populate_class_variables():
55 |         lookup = {}
56 |         reverse_lookup = {}
57 |         characters_for_re = []
58 |         for codepoint, name in list(codepoint2name.items()):
59 |             character = chr(codepoint)
60 |             if codepoint != 34:
61 |                 # There's no point in turning the quotation mark into
62 |                 # &quot;, unless it happens within an attribute value, which
63 |                 # is handled elsewhere.
64 |                 characters_for_re.append(character)
65 |                 lookup[character] = name
66 |             # But we do want to turn &quot; into the quotation mark.
67 |             reverse_lookup[name] = character
68 |         re_definition = "[%s]" % "".join(characters_for_re)
69 |         return lookup, reverse_lookup, re.compile(re_definition)
70 |     (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
71 |      CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
72 | 
73 |     CHARACTER_TO_XML_ENTITY = {
74 |         "'": "apos",
75 |         '"': "quot",
76 |         "&": "amp",
77 |         "<": "lt",
78 |         ">": "gt",
79 |         }
80 | 
81 |     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
82 |                                            "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
83 |                                            ")")
84 | 
85 |     AMPERSAND_OR_BRACKET = re.compile("([<>&])")
86 | 
87 |     @classmethod
88 |     def _substitute_html_entity(cls, matchobj):
89 |         entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
90 |         return "&%s;" % entity
91 | 
92 |     @classmethod
93 |     def _substitute_xml_entity(cls, matchobj):
94 |         """Used with a regular expression to substitute the
95 |         appropriate XML entity for an XML special character."""
96 |         entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
97 |         return "&%s;" % entity
98 | 
99 |     @classmethod
100 |     def quoted_attribute_value(cls, value):
101 |         """Make a value into a quoted XML attribute, possibly escaping it.
102 | 
103 |         Most strings will be quoted using double quotes.
104 | 
105 |             Bob's Bar -> "Bob's Bar"
106 | 
107 |         If a string contains double quotes, it will be quoted using
108 |         single quotes.
109 | 
110 |             Welcome to "my bar" -> 'Welcome to "my bar"'
111 | 
112 |         If a string contains both single and double quotes, the
113 |         double quotes will be escaped, and the string will be quoted
114 |         using double quotes.
115 | 
116 |             Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"
117 |         """
118 |         quote_with = '"'
119 |         if '"' in value:
120 |             if "'" in value:
121 |                 # The string contains both single and double
122 |                 # quotes.  Turn the double quotes into
123 |                 # entities. We quote the double quotes rather than
124 |                 # the single quotes because the entity name is
125 |                 # "&quot;" whether this is HTML or XML.  If we
126 |                 # quoted the single quotes, we'd have to decide
127 |                 # between &apos; and &squot;.
128 |                 replace_with = "&quot;"
129 |                 value = value.replace('"', replace_with)
130 |             else:
131 |                 # There are double quotes but no single quotes.
132 |                 # We can use single quotes to quote the attribute.
133 |                 quote_with = "'"
134 |         return quote_with + value + quote_with
135 | 
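    # The three quoting cases above, as a quick sketch (values are
    # illustrative):
    #
    #   EntitySubstitution.quoted_attribute_value("Bob's Bar")
    #   # => double-quoted: "Bob's Bar"
    #   EntitySubstitution.quoted_attribute_value('Welcome to "my bar"')
    #   # => single-quoted: 'Welcome to "my bar"'
    #   EntitySubstitution.quoted_attribute_value('Welcome to "Bob\'s Bar"')
    #   # => double quotes escaped: "Welcome to &quot;Bob's Bar&quot;"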
136 |     @classmethod
137 |     def substitute_xml(cls, value, make_quoted_attribute=False):
138 |         """Substitute XML entities for special XML characters.
139 | 
140 |         :param value: A string to be substituted. The less-than sign
141 |           will become &lt;, the greater-than sign will become &gt;,
142 |           and any ampersands will become &amp;. If you want ampersands
143 |           that appear to be part of an entity definition to be left
144 |           alone, use substitute_xml_containing_entities() instead.
145 | 
146 |         :param make_quoted_attribute: If True, then the string will be
147 |           quoted, as befits an attribute value.
148 |         """
149 |         # Escape angle brackets and ampersands.
150 |         value = cls.AMPERSAND_OR_BRACKET.sub(
151 |             cls._substitute_xml_entity, value)
152 | 
153 |         if make_quoted_attribute:
154 |             value = cls.quoted_attribute_value(value)
155 |         return value
156 | 
157 |     @classmethod
158 |     def substitute_xml_containing_entities(
159 |         cls, value, make_quoted_attribute=False):
160 |         """Substitute XML entities for special XML characters.
161 | 
162 |         :param value: A string to be substituted. The less-than sign will
163 |           become &lt;, the greater-than sign will become &gt;, and any
164 |           ampersands that are not part of an entity definition will
165 |           become &amp;.
166 | 
167 |         :param make_quoted_attribute: If True, then the string will be
168 |           quoted, as befits an attribute value.
169 |         """
170 |         # Escape angle brackets, and ampersands that aren't part of
171 |         # entities.
172 |         value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
173 |             cls._substitute_xml_entity, value)
174 | 
175 |         if make_quoted_attribute:
176 |             value = cls.quoted_attribute_value(value)
177 |         return value
178 | 
179 |     @classmethod
180 |     def substitute_html(cls, s):
181 |         """Replace certain Unicode characters with named HTML entities.
182 | 
183 |         This differs from data.encode(encoding, 'xmlcharrefreplace')
184 |         in that the goal is to make the result more readable (to those
185 |         with ASCII displays) rather than to recover from
186 |         errors. There's absolutely nothing wrong with a UTF-8 string
187 |         containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
188 |         character with "&eacute;" will make it more readable to some
189 |         people.
190 |         """
191 |         return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
192 |             cls._substitute_html_entity, s)
193 | 
194 | 
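# A quick contrast of the two XML substitution methods above; the
# outputs follow from the regular expressions they use:
#
#   EntitySubstitution.substitute_xml("AT&amp;T")
#   # => 'AT&amp;amp;T' -- every ampersand is escaped
#   EntitySubstitution.substitute_xml_containing_entities("AT&amp;T")
#   # => 'AT&amp;T' -- the existing entity reference is left alone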
195 | class EncodingDetector:
196 |     """Suggests a number of possible encodings for a bytestring.
197 | 
198 |     Order of precedence:
199 | 
200 |     1. Encodings you specifically tell EncodingDetector to try first
201 |     (the override_encodings argument to the constructor).
202 | 
203 |     2. An encoding declared within the bytestring itself, either in an
204 |     XML declaration (if the bytestring is to be interpreted as an XML
205 |     document), or in a <meta> tag (if the bytestring is to be
206 |     interpreted as an HTML document.)
207 | 
208 |     3. An encoding detected through textual analysis by chardet,
209 |     cchardet, or a similar external library.
210 | 
211 |     4. UTF-8.
212 | 
213 |     5. Windows-1252.
214 |     """
215 |     def __init__(self, markup, override_encodings=None, is_html=False):
216 |         self.override_encodings = override_encodings or []
217 |         self.chardet_encoding = None
218 |         self.is_html = is_html
219 |         self.declared_encoding = None
220 | 
221 |         # First order of business: strip a byte-order mark.
222 |         self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
223 | 
224 |     def _usable(self, encoding, tried):
225 |         if encoding is not None:
226 |             encoding = encoding.lower()
227 |             if encoding not in tried:
228 |                 tried.add(encoding)
229 |                 return True
230 |         return False
231 | 
232 |     @property
233 |     def encodings(self):
234 |         """Yield a number of encodings that might work for this markup."""
235 |         tried = set()
236 |         for e in self.override_encodings:
237 |             if self._usable(e, tried):
238 |                 yield e
239 | 
240 |         # Did the document originally start with a byte-order mark
241 |         # that indicated its encoding?
242 |         if self._usable(self.sniffed_encoding, tried):
243 |             yield self.sniffed_encoding
244 | 
245 |         # Look within the document for an XML or HTML encoding
246 |         # declaration.
247 |         if self.declared_encoding is None:
248 |             self.declared_encoding = self.find_declared_encoding(
249 |                 self.markup, self.is_html)
250 |         if self._usable(self.declared_encoding, tried):
251 |             yield self.declared_encoding
252 | 
253 |         # Use third-party character set detection to guess at the
254 |         # encoding.
255 |         if self.chardet_encoding is None:
256 |             self.chardet_encoding = chardet_dammit(self.markup)
257 |         if self._usable(self.chardet_encoding, tried):
258 |             yield self.chardet_encoding
259 | 
260 |         # As a last-ditch effort, try utf-8 and windows-1252.
261 |         for e in ('utf-8', 'windows-1252'):
262 |             if self._usable(e, tried):
263 |                 yield e
264 | 
265 |     @classmethod
266 |     def strip_byte_order_mark(cls, data):
267 |         """If a byte-order mark is present, strip it and return the encoding it implies."""
268 |         encoding = None
269 |         if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
270 |                and (data[2:4] != b'\x00\x00'):
271 |             encoding = 'utf-16be'
272 |             data = data[2:]
273 |         elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
274 |                  and (data[2:4] != b'\x00\x00'):
275 |             encoding = 'utf-16le'
276 |             data = data[2:]
277 |         elif data[:3] == b'\xef\xbb\xbf':
278 |             encoding = 'utf-8'
279 |             data = data[3:]
280 |         elif data[:4] == b'\x00\x00\xfe\xff':
281 |             encoding = 'utf-32be'
282 |             data = data[4:]
283 |         elif data[:4] == b'\xff\xfe\x00\x00':
284 |             encoding = 'utf-32le'
285 |             data = data[4:]
286 |         return data, encoding
287 | 
288 |     @classmethod
289 |     def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
290 |         """Given a document, tries to find its declared encoding.
291 | 
292 |         An XML encoding is declared at the beginning of the document.
293 | 
294 |         An HTML encoding is declared in a <meta> tag, hopefully near the
295 |         beginning of the document.
296 |         """
297 |         if search_entire_document:
298 |             xml_endpos = html_endpos = len(markup)
299 |         else:
300 |             xml_endpos = 1024
301 |             html_endpos = max(2048, int(len(markup) * 0.05))
302 | 
303 |         declared_encoding = None
304 |         declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
305 |         if not declared_encoding_match and is_html:
306 |             declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
307 |         if declared_encoding_match is not None:
308 |             declared_encoding = declared_encoding_match.groups()[0].decode(
309 |                 'ascii')
310 |         if declared_encoding:
311 |             return declared_encoding.lower()
312 |         return None
313 | 
314 | class UnicodeDammit:
315 |     """A class for detecting the encoding of a *ML document and
316 |     converting it to a Unicode string.
If the source encoding is 317 | windows-1252, can replace MS smart quotes with their HTML or XML 318 | equivalents.""" 319 | 320 | # This dictionary maps commonly seen values for "charset" in HTML 321 | # meta tags to the corresponding Python codec names. It only covers 322 | # values that aren't in Python's aliases and can't be determined 323 | # by the heuristics in find_codec. 324 | CHARSET_ALIASES = {"macintosh": "mac-roman", 325 | "x-sjis": "shift-jis"} 326 | 327 | ENCODINGS_WITH_SMART_QUOTES = [ 328 | "windows-1252", 329 | "iso-8859-1", 330 | "iso-8859-2", 331 | ] 332 | 333 | def __init__(self, markup, override_encodings=[], 334 | smart_quotes_to=None, is_html=False): 335 | self.smart_quotes_to = smart_quotes_to 336 | self.tried_encodings = [] 337 | self.contains_replacement_characters = False 338 | self.is_html = is_html 339 | 340 | self.detector = EncodingDetector(markup, override_encodings, is_html) 341 | 342 | # Short-circuit if the data is in Unicode to begin with. 343 | if isinstance(markup, str) or markup == '': 344 | self.markup = markup 345 | self.unicode_markup = str(markup) 346 | self.original_encoding = None 347 | return 348 | 349 | # The encoding detector may have stripped a byte-order mark. 350 | # Use the stripped markup from this point on. 351 | self.markup = self.detector.markup 352 | 353 | u = None 354 | for encoding in self.detector.encodings: 355 | markup = self.detector.markup 356 | u = self._convert_from(encoding) 357 | if u is not None: 358 | break 359 | 360 | if not u: 361 | # None of the encodings worked. As an absolute last resort, 362 | # try them again with character replacement. 363 | 364 | for encoding in self.detector.encodings: 365 | if encoding != "ascii": 366 | u = self._convert_from(encoding, "replace") 367 | if u is not None: 368 | logging.warning( 369 | "Some characters could not be decoded, and were " 370 | "replaced with REPLACEMENT CHARACTER.") 371 | self.contains_replacement_characters = True 372 | break 373 | 374 | # If none of that worked, we could at this point force it to 375 | # ASCII, but that would destroy so much data that I think 376 | # giving up is better. 377 | self.unicode_markup = u 378 | if not u: 379 | self.original_encoding = None 380 | 381 | def _sub_ms_char(self, match): 382 | """Changes a MS smart quote character to an XML or HTML 383 | entity, or an ASCII character.""" 384 | orig = match.group(1) 385 | if self.smart_quotes_to == 'ascii': 386 | sub = self.MS_CHARS_TO_ASCII.get(orig).encode() 387 | else: 388 | sub = self.MS_CHARS.get(orig) 389 | if type(sub) == tuple: 390 | if self.smart_quotes_to == 'xml': 391 | sub = '&#x'.encode() + sub[1].encode() + ';'.encode() 392 | else: 393 | sub = '&'.encode() + sub[0].encode() + ';'.encode() 394 | else: 395 | sub = sub.encode() 396 | return sub 397 | 398 | def _convert_from(self, proposed, errors="strict"): 399 | proposed = self.find_codec(proposed) 400 | if not proposed or (proposed, errors) in self.tried_encodings: 401 | return None 402 | self.tried_encodings.append((proposed, errors)) 403 | markup = self.markup 404 | # Convert smart quotes to HTML if coming from an encoding 405 | # that might have them. 
406 | if (self.smart_quotes_to is not None 407 | and proposed in self.ENCODINGS_WITH_SMART_QUOTES): 408 | smart_quotes_re = b"([\x80-\x9f])" 409 | smart_quotes_compiled = re.compile(smart_quotes_re) 410 | markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) 411 | 412 | try: 413 | #print "Trying to convert document to %s (errors=%s)" % ( 414 | # proposed, errors) 415 | u = self._to_unicode(markup, proposed, errors) 416 | self.markup = u 417 | self.original_encoding = proposed 418 | except Exception as e: 419 | #print "That didn't work!" 420 | #print e 421 | return None 422 | #print "Correct encoding: %s" % proposed 423 | return self.markup 424 | 425 | def _to_unicode(self, data, encoding, errors="strict"): 426 | '''Given a string and its encoding, decodes the string into Unicode. 427 | %encoding is a string recognized by encodings.aliases''' 428 | return str(data, encoding, errors) 429 | 430 | @property 431 | def declared_html_encoding(self): 432 | if not self.is_html: 433 | return None 434 | return self.detector.declared_encoding 435 | 436 | def find_codec(self, charset): 437 | value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) 438 | or (charset and self._codec(charset.replace("-", ""))) 439 | or (charset and self._codec(charset.replace("-", "_"))) 440 | or (charset and charset.lower()) 441 | or charset 442 | ) 443 | if value: 444 | return value.lower() 445 | return None 446 | 447 | def _codec(self, charset): 448 | if not charset: 449 | return charset 450 | codec = None 451 | try: 452 | codecs.lookup(charset) 453 | codec = charset 454 | except (LookupError, ValueError): 455 | pass 456 | return codec 457 | 458 | 459 | # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. 460 | MS_CHARS = {b'\x80': ('euro', '20AC'), 461 | b'\x81': ' ', 462 | b'\x82': ('sbquo', '201A'), 463 | b'\x83': ('fnof', '192'), 464 | b'\x84': ('bdquo', '201E'), 465 | b'\x85': ('hellip', '2026'), 466 | b'\x86': ('dagger', '2020'), 467 | b'\x87': ('Dagger', '2021'), 468 | b'\x88': ('circ', '2C6'), 469 | b'\x89': ('permil', '2030'), 470 | b'\x8A': ('Scaron', '160'), 471 | b'\x8B': ('lsaquo', '2039'), 472 | b'\x8C': ('OElig', '152'), 473 | b'\x8D': '?', 474 | b'\x8E': ('#x17D', '17D'), 475 | b'\x8F': '?', 476 | b'\x90': '?', 477 | b'\x91': ('lsquo', '2018'), 478 | b'\x92': ('rsquo', '2019'), 479 | b'\x93': ('ldquo', '201C'), 480 | b'\x94': ('rdquo', '201D'), 481 | b'\x95': ('bull', '2022'), 482 | b'\x96': ('ndash', '2013'), 483 | b'\x97': ('mdash', '2014'), 484 | b'\x98': ('tilde', '2DC'), 485 | b'\x99': ('trade', '2122'), 486 | b'\x9a': ('scaron', '161'), 487 | b'\x9b': ('rsaquo', '203A'), 488 | b'\x9c': ('oelig', '153'), 489 | b'\x9d': '?', 490 | b'\x9e': ('#x17E', '17E'), 491 | b'\x9f': ('Yuml', ''),} 492 | 493 | # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains 494 | # horrors like stripping diacritical marks to turn á into a, but also 495 | # contains non-horrors like turning “ into ". 
496 | MS_CHARS_TO_ASCII = { 497 | b'\x80' : 'EUR', 498 | b'\x81' : ' ', 499 | b'\x82' : ',', 500 | b'\x83' : 'f', 501 | b'\x84' : ',,', 502 | b'\x85' : '...', 503 | b'\x86' : '+', 504 | b'\x87' : '++', 505 | b'\x88' : '^', 506 | b'\x89' : '%', 507 | b'\x8a' : 'S', 508 | b'\x8b' : '<', 509 | b'\x8c' : 'OE', 510 | b'\x8d' : '?', 511 | b'\x8e' : 'Z', 512 | b'\x8f' : '?', 513 | b'\x90' : '?', 514 | b'\x91' : "'", 515 | b'\x92' : "'", 516 | b'\x93' : '"', 517 | b'\x94' : '"', 518 | b'\x95' : '*', 519 | b'\x96' : '-', 520 | b'\x97' : '--', 521 | b'\x98' : '~', 522 | b'\x99' : '(TM)', 523 | b'\x9a' : 's', 524 | b'\x9b' : '>', 525 | b'\x9c' : 'oe', 526 | b'\x9d' : '?', 527 | b'\x9e' : 'z', 528 | b'\x9f' : 'Y', 529 | b'\xa0' : ' ', 530 | b'\xa1' : '!', 531 | b'\xa2' : 'c', 532 | b'\xa3' : 'GBP', 533 | b'\xa4' : '$', #This approximation is especially parochial--this is the 534 | #generic currency symbol. 535 | b'\xa5' : 'YEN', 536 | b'\xa6' : '|', 537 | b'\xa7' : 'S', 538 | b'\xa8' : '..', 539 | b'\xa9' : '', 540 | b'\xaa' : '(th)', 541 | b'\xab' : '<<', 542 | b'\xac' : '!', 543 | b'\xad' : ' ', 544 | b'\xae' : '(R)', 545 | b'\xaf' : '-', 546 | b'\xb0' : 'o', 547 | b'\xb1' : '+-', 548 | b'\xb2' : '2', 549 | b'\xb3' : '3', 550 | b'\xb4' : ("'", 'acute'), 551 | b'\xb5' : 'u', 552 | b'\xb6' : 'P', 553 | b'\xb7' : '*', 554 | b'\xb8' : ',', 555 | b'\xb9' : '1', 556 | b'\xba' : '(th)', 557 | b'\xbb' : '>>', 558 | b'\xbc' : '1/4', 559 | b'\xbd' : '1/2', 560 | b'\xbe' : '3/4', 561 | b'\xbf' : '?', 562 | b'\xc0' : 'A', 563 | b'\xc1' : 'A', 564 | b'\xc2' : 'A', 565 | b'\xc3' : 'A', 566 | b'\xc4' : 'A', 567 | b'\xc5' : 'A', 568 | b'\xc6' : 'AE', 569 | b'\xc7' : 'C', 570 | b'\xc8' : 'E', 571 | b'\xc9' : 'E', 572 | b'\xca' : 'E', 573 | b'\xcb' : 'E', 574 | b'\xcc' : 'I', 575 | b'\xcd' : 'I', 576 | b'\xce' : 'I', 577 | b'\xcf' : 'I', 578 | b'\xd0' : 'D', 579 | b'\xd1' : 'N', 580 | b'\xd2' : 'O', 581 | b'\xd3' : 'O', 582 | b'\xd4' : 'O', 583 | b'\xd5' : 'O', 584 | b'\xd6' : 'O', 585 | b'\xd7' : '*', 586 | b'\xd8' : 'O', 587 | b'\xd9' : 'U', 588 | b'\xda' : 'U', 589 | b'\xdb' : 'U', 590 | b'\xdc' : 'U', 591 | b'\xdd' : 'Y', 592 | b'\xde' : 'b', 593 | b'\xdf' : 'B', 594 | b'\xe0' : 'a', 595 | b'\xe1' : 'a', 596 | b'\xe2' : 'a', 597 | b'\xe3' : 'a', 598 | b'\xe4' : 'a', 599 | b'\xe5' : 'a', 600 | b'\xe6' : 'ae', 601 | b'\xe7' : 'c', 602 | b'\xe8' : 'e', 603 | b'\xe9' : 'e', 604 | b'\xea' : 'e', 605 | b'\xeb' : 'e', 606 | b'\xec' : 'i', 607 | b'\xed' : 'i', 608 | b'\xee' : 'i', 609 | b'\xef' : 'i', 610 | b'\xf0' : 'o', 611 | b'\xf1' : 'n', 612 | b'\xf2' : 'o', 613 | b'\xf3' : 'o', 614 | b'\xf4' : 'o', 615 | b'\xf5' : 'o', 616 | b'\xf6' : 'o', 617 | b'\xf7' : '/', 618 | b'\xf8' : 'o', 619 | b'\xf9' : 'u', 620 | b'\xfa' : 'u', 621 | b'\xfb' : 'u', 622 | b'\xfc' : 'u', 623 | b'\xfd' : 'y', 624 | b'\xfe' : 'b', 625 | b'\xff' : 'y', 626 | } 627 | 628 | # A map used when removing rogue Windows-1252/ISO-8859-1 629 | # characters in otherwise UTF-8 documents. 630 | # 631 | # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in 632 | # Windows-1252. 
633 |     WINDOWS_1252_TO_UTF8 = {
634 |         0x80 : b'\xe2\x82\xac', # €
635 |         0x82 : b'\xe2\x80\x9a', # ‚
636 |         0x83 : b'\xc6\x92',     # ƒ
637 |         0x84 : b'\xe2\x80\x9e', # „
638 |         0x85 : b'\xe2\x80\xa6', # …
639 |         0x86 : b'\xe2\x80\xa0', # †
640 |         0x87 : b'\xe2\x80\xa1', # ‡
641 |         0x88 : b'\xcb\x86',     # ˆ
642 |         0x89 : b'\xe2\x80\xb0', # ‰
643 |         0x8a : b'\xc5\xa0',     # Š
644 |         0x8b : b'\xe2\x80\xb9', # ‹
645 |         0x8c : b'\xc5\x92',     # Œ
646 |         0x8e : b'\xc5\xbd',     # Ž
647 |         0x91 : b'\xe2\x80\x98', # ‘
648 |         0x92 : b'\xe2\x80\x99', # ’
649 |         0x93 : b'\xe2\x80\x9c', # “
650 |         0x94 : b'\xe2\x80\x9d', # ”
651 |         0x95 : b'\xe2\x80\xa2', # •
652 |         0x96 : b'\xe2\x80\x93', # –
653 |         0x97 : b'\xe2\x80\x94', # —
654 |         0x98 : b'\xcb\x9c',     # ˜
655 |         0x99 : b'\xe2\x84\xa2', # ™
656 |         0x9a : b'\xc5\xa1',     # š
657 |         0x9b : b'\xe2\x80\xba', # ›
658 |         0x9c : b'\xc5\x93',     # œ
659 |         0x9e : b'\xc5\xbe',     # ž
660 |         0x9f : b'\xc5\xb8',     # Ÿ
661 |         0xa0 : b'\xc2\xa0',     #  
662 |         0xa1 : b'\xc2\xa1',     # ¡
663 |         0xa2 : b'\xc2\xa2',     # ¢
664 |         0xa3 : b'\xc2\xa3',     # £
665 |         0xa4 : b'\xc2\xa4',     # ¤
666 |         0xa5 : b'\xc2\xa5',     # ¥
667 |         0xa6 : b'\xc2\xa6',     # ¦
668 |         0xa7 : b'\xc2\xa7',     # §
669 |         0xa8 : b'\xc2\xa8',     # ¨
670 |         0xa9 : b'\xc2\xa9',     # ©
671 |         0xaa : b'\xc2\xaa',     # ª
672 |         0xab : b'\xc2\xab',     # «
673 |         0xac : b'\xc2\xac',     # ¬
674 |         0xad : b'\xc2\xad',     # ­
675 |         0xae : b'\xc2\xae',     # ®
676 |         0xaf : b'\xc2\xaf',     # ¯
677 |         0xb0 : b'\xc2\xb0',     # °
678 |         0xb1 : b'\xc2\xb1',     # ±
679 |         0xb2 : b'\xc2\xb2',     # ²
680 |         0xb3 : b'\xc2\xb3',     # ³
681 |         0xb4 : b'\xc2\xb4',     # ´
682 |         0xb5 : b'\xc2\xb5',     # µ
683 |         0xb6 : b'\xc2\xb6',     # ¶
684 |         0xb7 : b'\xc2\xb7',     # ·
685 |         0xb8 : b'\xc2\xb8',     # ¸
686 |         0xb9 : b'\xc2\xb9',     # ¹
687 |         0xba : b'\xc2\xba',     # º
688 |         0xbb : b'\xc2\xbb',     # »
689 |         0xbc : b'\xc2\xbc',     # ¼
690 |         0xbd : b'\xc2\xbd',     # ½
691 |         0xbe : b'\xc2\xbe',     # ¾
692 |         0xbf : b'\xc2\xbf',     # ¿
693 |         0xc0 : b'\xc3\x80',     # À
694 |         0xc1 : b'\xc3\x81',     # Á
695 |         0xc2 : b'\xc3\x82',     # Â
696 |         0xc3 : b'\xc3\x83',     # Ã
697 |         0xc4 : b'\xc3\x84',     # Ä
698 |         0xc5 : b'\xc3\x85',     # Å
699 |         0xc6 : b'\xc3\x86',     # Æ
700 |         0xc7 : b'\xc3\x87',     # Ç
701 |         0xc8 : b'\xc3\x88',     # È
702 |         0xc9 : b'\xc3\x89',     # É
703 |         0xca : b'\xc3\x8a',     # Ê
704 |         0xcb : b'\xc3\x8b',     # Ë
705 |         0xcc : b'\xc3\x8c',     # Ì
706 |         0xcd : b'\xc3\x8d',     # Í
707 |         0xce : b'\xc3\x8e',     # Î
708 |         0xcf : b'\xc3\x8f',     # Ï
709 |         0xd0 : b'\xc3\x90',     # Ð
710 |         0xd1 : b'\xc3\x91',     # Ñ
711 |         0xd2 : b'\xc3\x92',     # Ò
712 |         0xd3 : b'\xc3\x93',     # Ó
713 |         0xd4 : b'\xc3\x94',     # Ô
714 |         0xd5 : b'\xc3\x95',     # Õ
715 |         0xd6 : b'\xc3\x96',     # Ö
716 |         0xd7 : b'\xc3\x97',     # ×
717 |         0xd8 : b'\xc3\x98',     # Ø
718 |         0xd9 : b'\xc3\x99',     # Ù
719 |         0xda : b'\xc3\x9a',     # Ú
720 |         0xdb : b'\xc3\x9b',     # Û
721 |         0xdc : b'\xc3\x9c',     # Ü
722 |         0xdd : b'\xc3\x9d',     # Ý
723 |         0xde : b'\xc3\x9e',     # Þ
724 |         0xdf : b'\xc3\x9f',     # ß
725 |         0xe0 : b'\xc3\xa0',     # à
726 |         0xe1 : b'\xc3\xa1',     # á
727 |         0xe2 : b'\xc3\xa2',     # â
728 |         0xe3 : b'\xc3\xa3',     # ã
729 |         0xe4 : b'\xc3\xa4',     # ä
730 |         0xe5 : b'\xc3\xa5',     # å
731 |         0xe6 : b'\xc3\xa6',     # æ
732 |         0xe7 : b'\xc3\xa7',     # ç
733 |         0xe8 : b'\xc3\xa8',     # è
734 |         0xe9 : b'\xc3\xa9',     # é
735 |         0xea : b'\xc3\xaa',     # ê
736 |         0xeb : b'\xc3\xab',     # ë
737 |         0xec : b'\xc3\xac',     # ì
738 |         0xed : b'\xc3\xad',     # í
739 |         0xee : b'\xc3\xae',     # î
740 |         0xef : b'\xc3\xaf',     # ï
741 |         0xf0 : b'\xc3\xb0',     # ð
742 |         0xf1 : b'\xc3\xb1',     # ñ
743 |         0xf2 : b'\xc3\xb2',     # ò
744 |         0xf3 : b'\xc3\xb3',     # ó
745 |         0xf4 : b'\xc3\xb4',     # ô
746 |         0xf5 : b'\xc3\xb5',     # õ
747 |         0xf6 : b'\xc3\xb6',     # ö
748 |         0xf7 : b'\xc3\xb7',     # ÷
749 |         0xf8 
: b'\xc3\xb8', # ø 750 | 0xf9 : b'\xc3\xb9', # ù 751 | 0xfa : b'\xc3\xba', # ú 752 | 0xfb : b'\xc3\xbb', # û 753 | 0xfc : b'\xc3\xbc', # ü 754 | 0xfd : b'\xc3\xbd', # ý 755 | 0xfe : b'\xc3\xbe', # þ 756 | } 757 | 758 | MULTIBYTE_MARKERS_AND_SIZES = [ 759 | (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF 760 | (0xe0, 0xef, 3), # 3-byte characters start with E0-EF 761 | (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4 762 | ] 763 | 764 | FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0] 765 | LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1] 766 | 767 | @classmethod 768 | def detwingle(cls, in_bytes, main_encoding="utf8", 769 | embedded_encoding="windows-1252"): 770 | """Fix characters from one encoding embedded in some other encoding. 771 | 772 | Currently the only situation supported is Windows-1252 (or its 773 | subset ISO-8859-1), embedded in UTF-8. 774 | 775 | The input must be a bytestring. If you've already converted 776 | the document to Unicode, you're too late. 777 | 778 | The output is a bytestring in which `embedded_encoding` 779 | characters have been converted to their `main_encoding` 780 | equivalents. 781 | """ 782 | if embedded_encoding.replace('_', '-').lower() not in ( 783 | 'windows-1252', 'windows_1252'): 784 | raise NotImplementedError( 785 | "Windows-1252 and ISO-8859-1 are the only currently supported " 786 | "embedded encodings.") 787 | 788 | if main_encoding.lower() not in ('utf8', 'utf-8'): 789 | raise NotImplementedError( 790 | "UTF-8 is the only currently supported main encoding.") 791 | 792 | byte_chunks = [] 793 | 794 | chunk_start = 0 795 | pos = 0 796 | while pos < len(in_bytes): 797 | byte = in_bytes[pos] 798 | if not isinstance(byte, int): 799 | # Python 2.x 800 | byte = ord(byte) 801 | if (byte >= cls.FIRST_MULTIBYTE_MARKER 802 | and byte <= cls.LAST_MULTIBYTE_MARKER): 803 | # This is the start of a UTF-8 multibyte character. Skip 804 | # to the end. 805 | for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: 806 | if byte >= start and byte <= end: 807 | pos += size 808 | break 809 | elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8: 810 | # We found a Windows-1252 character! 811 | # Save the string up to this point as a chunk. 812 | byte_chunks.append(in_bytes[chunk_start:pos]) 813 | 814 | # Now translate the Windows-1252 character into UTF-8 815 | # and add it as another, one-byte chunk. 816 | byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte]) 817 | pos += 1 818 | chunk_start = pos 819 | else: 820 | # Go on to the next character. 821 | pos += 1 822 | if chunk_start == 0: 823 | # The string is unchanged. 824 | return in_bytes 825 | else: 826 | # Store the final chunk. 827 | byte_chunks.append(in_bytes[chunk_start:]) 828 | return b''.join(byte_chunks) 829 | 830 | --------------------------------------------------------------------------------
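# A minimal usage sketch for UnicodeDammit.detwingle, assuming the bs4
# package in this repository is importable:
#
#   from bs4 import UnicodeDammit
#   snowmen = "\N{SNOWMAN}" * 3
#   quote = "\N{LEFT DOUBLE QUOTATION MARK}Hi!\N{RIGHT DOUBLE QUOTATION MARK}"
#   doc = snowmen.encode("utf8") + quote.encode("windows_1252")
#   fixed = UnicodeDammit.detwingle(doc)
#   print(fixed.decode("utf8"))   # both the snowmen and the smart quotes survive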