├── bs4
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_htmlparser.py
│   │   ├── test_docs.py
│   │   ├── test_html5lib.py
│   │   ├── test_lxml.py
│   │   ├── test_builder_registry.py
│   │   └── test_soup.py
│   ├── diagnose.py
│   ├── builder
│   │   ├── _lxml.py
│   │   ├── _htmlparser.py
│   │   ├── _html5lib.py
│   │   └── __init__.py
│   ├── __init__.py
│   ├── testing.py
│   └── dammit.py
├── .gitignore
├── CHANGELOG
├── getcookie.user.coffee
├── getcookie.user.js
├── LICENSE
├── README.md
└── tenkou.py

/bs4/tests/__init__.py:
--------------------------------------------------------------------------------
1 | "The beautifulsoup tests."
2 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.css
3 | *.htm
4 | *.txt
5 | *auth*
--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
1 | v0.0.2
2 | + Batch import from local files back into bangumi (the restore feature)
3 | 
4 | v0.0.1
5 | + Export bangumi entries to local files
6 | + Batch delete entries
--------------------------------------------------------------------------------
/bs4/tests/test_htmlparser.py:
--------------------------------------------------------------------------------
1 | """Tests to ensure that the html.parser tree builder generates good
2 | trees."""
3 | 
4 | from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
5 | from bs4.builder import HTMLParserTreeBuilder
6 | 
7 | class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
8 | 
9 |     @property
10 |     def default_builder(self):
11 |         return HTMLParserTreeBuilder()
12 | 
13 |     def test_namespaced_system_doctype(self):
14 |         # html.parser can't handle namespaced doctypes, so skip this one.
15 |         pass
16 | 
17 |     def test_namespaced_public_doctype(self):
18 |         # html.parser can't handle namespaced doctypes, so skip this one.
19 | pass 20 | -------------------------------------------------------------------------------- /getcookie.user.coffee: -------------------------------------------------------------------------------- 1 | ### 2 | // ==UserScript== 3 | // @name getbgmcookie 4 | // @namespace https://github.com/hentaiPanda 5 | // @author niR 6 | // @version 0.0.1 7 | // @license MIT License 8 | // @encoding utf-8 9 | // @grant GM_setClipboard 10 | // @grant GM_registerMenuCommand 11 | // @include http://bangumi.tv/* 12 | // @include http://bgm.tv/* 13 | // @include http://chii.in/* 14 | // ==/UserScript== 15 | ### 16 | 17 | 18 | show = -> 19 | # alert(document.cookie) 20 | cks = document.cookie.split(';') 21 | for i in cks 22 | i = i.trim() 23 | if i.indexOf('chii_auth') is 0 24 | auth = i.split('=')[1] 25 | break 26 | ua = navigator.userAgent 27 | data = ua + '\n' + auth 28 | alert(data) 29 | console.log(data) 30 | GM_setClipboard(data) 31 | alert('已复制到剪贴板') 32 | 33 | 34 | GM_registerMenuCommand('显示UA和AUTH', show) -------------------------------------------------------------------------------- /getcookie.user.js: -------------------------------------------------------------------------------- 1 | /* 2 | // ==UserScript== 3 | // @name getbgmcookie 4 | // @namespace https://github.com/hentaiPanda 5 | // @author niR 6 | // @version 0.0.1 7 | // @license MIT License 8 | // @encoding utf-8 9 | // @grant GM_setClipboard 10 | // @grant GM_registerMenuCommand 11 | // @include http://bangumi.tv/* 12 | // @include http://bgm.tv/* 13 | // @include http://chii.in/* 14 | // ==/UserScript== 15 | */ 16 | var show; 17 | 18 | show = function() { 19 | var auth, cks, data, i, ua, _i, _len; 20 | cks = document.cookie.split(';'); 21 | for (_i = 0, _len = cks.length; _i < _len; _i++) { 22 | i = cks[_i]; 23 | i = i.trim(); 24 | if (i.indexOf('chii_auth') === 0) { 25 | auth = i.split('=')[1]; 26 | break; 27 | } 28 | } 29 | ua = navigator.userAgent; 30 | data = ua + '\n' + auth; 31 | alert(data); 32 | console.log(data); 33 | GM_setClipboard(data); 34 | return alert('已复制到剪贴板'); 35 | }; 36 | 37 | GM_registerMenuCommand('显示UA和AUTH', show); -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 niR 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
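Both userscripts above do the same thing: split `document.cookie` on ';', find the entry that starts with `chii_auth`, and keep everything after the '='. For reference, a rough Python equivalent of that loop (the sample cookie value below is a made-up placeholder):

```
def extract_auth(raw_cookie):
    """Pull the chii_auth value out of a raw cookie string,
    mirroring the loop in getcookie.user.js."""
    for part in raw_cookie.split(';'):
        part = part.strip()
        if part.startswith('chii_auth'):
            # Split on the first '=' only, in case the value
            # itself contains '=' characters.
            return part.split('=', 1)[1]
    return None

print(extract_auth('chii_sid=xyz; chii_auth=LFJDSLAF%LFASJD'))  # LFJDSLAF%LFASJD
```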
-------------------------------------------------------------------------------- /bs4/tests/test_docs.py: -------------------------------------------------------------------------------- 1 | "Test harness for doctests." 2 | 3 | # pylint: disable-msg=E0611,W0142 4 | 5 | __metaclass__ = type 6 | __all__ = [ 7 | 'additional_tests', 8 | ] 9 | 10 | import atexit 11 | import doctest 12 | import os 13 | #from pkg_resources import ( 14 | # resource_filename, resource_exists, resource_listdir, cleanup_resources) 15 | import unittest 16 | 17 | DOCTEST_FLAGS = ( 18 | doctest.ELLIPSIS | 19 | doctest.NORMALIZE_WHITESPACE | 20 | doctest.REPORT_NDIFF) 21 | 22 | 23 | # def additional_tests(): 24 | # "Run the doc tests (README.txt and docs/*, if any exist)" 25 | # doctest_files = [ 26 | # os.path.abspath(resource_filename('bs4', 'README.txt'))] 27 | # if resource_exists('bs4', 'docs'): 28 | # for name in resource_listdir('bs4', 'docs'): 29 | # if name.endswith('.txt'): 30 | # doctest_files.append( 31 | # os.path.abspath( 32 | # resource_filename('bs4', 'docs/%s' % name))) 33 | # kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) 34 | # atexit.register(cleanup_resources) 35 | # return unittest.TestSuite(( 36 | # doctest.DocFileSuite(*doctest_files, **kwargs))) 37 | -------------------------------------------------------------------------------- /bs4/tests/test_html5lib.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html5lib tree builder generates good trees.""" 2 | 3 | import warnings 4 | 5 | try: 6 | from bs4.builder import HTML5TreeBuilder 7 | HTML5LIB_PRESENT = True 8 | except ImportError as e: 9 | HTML5LIB_PRESENT = False 10 | from bs4.element import SoupStrainer 11 | from bs4.testing import ( 12 | HTML5TreeBuilderSmokeTest, 13 | SoupTest, 14 | skipIf, 15 | ) 16 | 17 | @skipIf( 18 | not HTML5LIB_PRESENT, 19 | "html5lib seems not to be present, not testing its tree builder.") 20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): 21 | """See ``HTML5TreeBuilderSmokeTest``.""" 22 | 23 | @property 24 | def default_builder(self): 25 | return HTML5TreeBuilder() 26 | 27 | def test_soupstrainer(self): 28 | # The html5lib tree builder does not support SoupStrainers. 29 | strainer = SoupStrainer("b") 30 | markup = "
<p>A <b>bold</b> statement.</p>"
31 |         with warnings.catch_warnings(record=True) as w:
32 |             soup = self.soup(markup, parse_only=strainer)
33 |         self.assertEqual(
34 |             soup.decode(), self.document_for(markup))
35 | 
36 |         self.assertTrue(
37 |             "the html5lib tree builder doesn't support parse_only" in
38 |             str(w[0].message))
39 | 
40 |     def test_correctly_nested_tables(self):
41 |         """html5lib inserts <tbody> tags where other parsers don't."""
42 |         markup = ('<table id="1">'
43 |                   '<tr>'
44 |                   "<td>Here's another table:"
45 |                   '<table id="2">'
46 |                   '<tr><td>foo</td></tr>'
47 |                   '</table></td>')
48 | 
49 |         self.assertSoupEquals(
50 |             markup,
51 |             '<table id="1"><tbody><tr><td>Here\'s another table:'
52 |             '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
53 |             '</td></tr></tbody></table>')
54 | 
55 |         self.assertSoupEquals(
56 |             "<table><thead><tr><td>Foo</td></tr></thead>"
57 |             "<tbody><tr><td>Bar</td></tr></tbody>"
58 |             "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
59 | 
60 |     def test_xml_declaration_followed_by_doctype(self):
61 |         markup = '''<?xml version="1.0" encoding="utf-8"?>
62 | <!DOCTYPE html>
63 | <html>
64 |   <head>
65 |   </head>
66 |   <body>
67 |    <p>foo</p>
68 |   </body>
69 | </html>'''
70 |         soup = self.soup(markup)
71 |         # Verify that we can reach the <p> tag; this means the tree is connected.
72 |         self.assertEqual(b"<p>foo</p>", soup.p.encode())
73 | 
74 |     def test_reparented_markup(self):
75 |         markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
76 |         soup = self.soup(markup)
77 |         self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
78 |         self.assertEqual(2, len(soup.find_all('p')))
79 | 
80 | 
81 |     def test_reparented_markup_ends_with_whitespace(self):
82 |         markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
83 |         soup = self.soup(markup)
84 |         self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
85 |         self.assertEqual(2, len(soup.find_all('p')))
86 | 
--------------------------------------------------------------------------------
/bs4/tests/test_lxml.py:
--------------------------------------------------------------------------------
1 | """Tests to ensure that the lxml tree builder generates good trees."""
2 | 
3 | import re
4 | import warnings
5 | 
6 | try:
7 |     import lxml.etree
8 |     LXML_PRESENT = True
9 |     LXML_VERSION = lxml.etree.LXML_VERSION
10 | except ImportError as e:
11 |     LXML_PRESENT = False
12 |     LXML_VERSION = (0,)
13 | 
14 | if LXML_PRESENT:
15 |     from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
16 | 
17 | from bs4 import (
18 |     BeautifulSoup,
19 |     BeautifulStoneSoup,
20 |     )
21 | from bs4.element import Comment, Doctype, SoupStrainer
22 | from bs4.testing import skipIf
23 | from bs4.tests import test_htmlparser
24 | from bs4.testing import (
25 |     HTMLTreeBuilderSmokeTest,
26 |     XMLTreeBuilderSmokeTest,
27 |     SoupTest,
28 |     skipIf,
29 | )
30 | 
31 | @skipIf(
32 |     not LXML_PRESENT,
33 |     "lxml seems not to be present, not testing its tree builder.")
34 | class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
35 |     """See ``HTMLTreeBuilderSmokeTest``."""
36 | 
37 |     @property
38 |     def default_builder(self):
39 |         return LXMLTreeBuilder()
40 | 
41 |     def test_out_of_range_entity(self):
42 |         self.assertSoupEquals(
43 |             "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
44 |         self.assertSoupEquals(
45 |             "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
46 |         self.assertSoupEquals(
47 |             "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
48 | 
49 |     # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
50 |     # test if an old version of lxml is installed.
51 | 
52 |     @skipIf(
53 |         not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
54 |         "Skipping doctype test for old version of lxml to avoid segfault.")
55 |     def test_empty_doctype(self):
56 |         soup = self.soup("<!DOCTYPE>")
57 |         doctype = soup.contents[0]
58 |         self.assertEqual("", doctype.strip())
59 | 
60 |     def test_beautifulstonesoup_is_xml_parser(self):
61 |         # Make sure that the deprecated BSS class uses an xml builder
62 |         # if one is installed.
63 |         with warnings.catch_warnings(record=True) as w:
64 |             soup = BeautifulStoneSoup("<b />")
65 |         self.assertEqual("<b/>", str(soup.b))
66 |         self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
67 | 
68 |     def test_real_xhtml_document(self):
69 |         """lxml strips the XML definition from an XHTML doc, which is fine."""
70 |         markup = b"""<?xml version="1.0" encoding="utf-8"?>
71 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
72 | <html xmlns="http://www.w3.org/1999/xhtml">
73 | <head><title>Hello.</title></head>
74 | <body>Goodbye.</body>
75 | </html>"""
76 |         soup = self.soup(markup)
77 |         self.assertEqual(
78 |             soup.encode("utf-8").replace(b"\n", b''),
79 |             markup.replace(b'\n', b'').replace(
80 |                 b'<?xml version="1.0" encoding="utf-8"?>', b''))
81 | 
82 | 
83 | @skipIf(
84 |     not LXML_PRESENT,
85 |     "lxml seems not to be present, not testing its XML tree builder.")
86 | class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
87 |     """See ``XMLTreeBuilderSmokeTest``."""
88 | 
89 |     @property
90 |     def default_builder(self):
91 |         return LXMLTreeBuilderForXML()
92 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ##tenkou転校##
2 | 
3 | ###Batch export / delete / restore your collection entries###
4 | 
5 | Bangumi's connectivity is sometimes poor, and the script can appear stuck in one place
6 | 
7 | Don't panic!
8 | 
9 | If it is only a bad connection, try again at another time
10 | 
11 | ###Usage###
12 | 
13 | A Python 3 script
14 | 
15 | Arguments:
16 | 
17 | ```
18 | tenkou.py [-h] [-d {chii.in,bgm.tv,bangumi.tv}] [-u UID]
19 |           [--password PASSWORD] [--wipe] [-p PATH] [--auth AUTH]
20 |           [--useragent USERAGENT] [--authfile AUTHFILE] [-v]
21 | ```
22 | 
23 | ```
24 | -h, --help                  show this help
25 | -d DOMAIN, --domain DOMAIN  choose the domain; defaults to bgm.tv, with bangumi.tv and chii.in as alternatives
26 | -u UID, --uid UID           your user id
27 | --password PASSWORD         your login password for the site
28 | -p PATH, --path PATH        local output directory; it must already exist
29 | --auth AUTH                 the auth string
30 | --useragent USERAGENT       your browser's User-Agent
31 | --authfile AUTHFILE         location of the file (authfile) holding the User-Agent and the auth string
32 | --wipe                      delete ALL entries! Use with caution!
33 | -r, --restore               restore entries
34 | -v, --version               program version
35 | ```
36 | 
37 | Only the id is required; if you don't need to export watching progress, there is no need to enter a password
38 | 
39 | ###Examples###
40 | 
41 | These are only examples; combine the options as you like
42 | 
43 | ```
44 | 1. tenkou.py -u 9999999    export the entries of the user whose id is 9999999
45 | 
46 | 2. tenkou.py -u 9999999 -p ./bgm_backup    back up into the bgm_backup directory at the current location
47 | 
48 | 3. tenkou.py -u 9999999 --password 123
49 |    log in with the password and export user 9999999's entries, including watching progress
50 | 
51 | 4. tenkou.py -u 9999999 --useragent Mozilla --auth LFJDSLAF%LFASJD
52 |    use the auth string and export user 9999999's entries, including watching progress
53 | 
54 | 5. tenkou.py -u 9999999 --authfile ./authfile.txt
55 |    use an authfile and export user 9999999's entries, including watching progress
56 | 
57 | 6. tenkou.py -u 9999999 --password 123 --wipe
58 |    use the password, export user 9999999's entries including watching progress,
59 |    then delete the entries completely
60 | 
61 | 7. tenkou.py -u 9999999 --useragent Mozilla --auth LFJDSLAF%LFASJD --wipe
62 |    use the auth string, export user 9999999's entries including watching progress,
63 |    then delete the entries completely
64 | 
65 | 8. tenkou.py -u 9999999 --authfile ./authfile.txt --wipe
66 |    use an authfile, export user 9999999's entries including watching progress,
67 |    then delete the entries completely
68 | 
69 | 9. tenkou.py -u 9999999 --password 12345 -p ./backup -r
70 |    use the password, read the files from the backup directory under the current
71 |    working directory, and restore user 9999999's entries
72 | ```
73 | 
74 | **Note: while restoring or deleting entries in bulk, errors caused by network conditions (your own connection, or the server's protection mechanism, if it has one) are fairly likely. During a restore this is not serious, since the local files are still there. But if an error occurs while deleting, you can lose entry records or end up with an incomplete deletion. So although deleting can be combined with exporting in a single run, it is recommended to export once first and delete afterwards.**
75 | 
76 | ###How to get the Auth and the User-Agent###
77 | 
78 | Just look in your browser's developer tools; this Auth is the auth value inside the cookie
79 | 
80 | ###What is an Authfile###
81 | 
82 | Just a plain text file that stores the User-Agent and the auth string
83 | 
84 | The first line is the User-Agent; the second line is the auth string
85 | 
86 | The authfile can have any name; authfile.txt above is only an example
87 | 
88 | ###Do I need to provide both the password and the Auth?###
89 | 
90 | No. The password, the User-Agent plus Auth, and the Authfile are alternatives; any one of the three is enough
91 | 
92 | See the usage examples above
93 | 
94 | ###Is there an easier way to get the Auth and the User-Agent?###
95 | 
96 | A greasemonkey/tampermonkey userscript, getcookie.user.js, is provided alongside tenkou.py
97 | 
98 | Once installed, it shows and automatically copies your User-Agent/Auth on any bangumi page; you can split the two values up and pass them in manually, or simply save them as a new authfile
99 | 
100 | As shown here:
101 | 
102 | Firefox GM
103 | 
104 | ![Firefox menu entry](http://i.imgur.com/2GdaRSn.jpg)
105 | 
106 | Chrome Tampermonkey
107 | 
108 | ![Chrome menu entry](http://i.imgur.com/Qwk6ff0.jpg)
109 | 
110 | The result
111 | 
112 | ![Result](http://i.imgur.com/NW3IYnc.jpg)
113 | 
114 | ###Security? Will you learn my password?###
115 | 
116 | No. Everything is communication between your machine and the site; no information is sent to me
117 | 
118 | As for the password versus the Auth method, there is no big difference, since this only uses the auth value from the cookie anyway
119 | 
120 | 
121 | ###About the backup files that are produced###
122 | 
123 | The backup files are named after
124 | 
125 | part A:
126 | 
127 | * anime
128 | * music
129 | * game
130 | * book
131 | * real (live-action)
132 | 
133 | and part B:
134 | 
135 | * do (watching)
136 | * collect (watched)
137 | * wish (want to watch)
138 | * on_hold (shelved)
139 | * dropped
140 | 
141 | in the form ```bangumi_A_B.txt```
142 | 
143 | For example
144 | 
145 | ```
146 | bangumi_anime_do.txt
147 | bangumi_book_collect.txt
148 | ```
--------------------------------------------------------------------------------
/bs4/tests/test_builder_registry.py:
--------------------------------------------------------------------------------
1 | """Tests of the builder registry."""
2 | 
3 | import unittest
4 | 
5 | from bs4 import BeautifulSoup
6 | from bs4.builder import (
7 |     builder_registry as registry,
8 |     HTMLParserTreeBuilder,
9 |     TreeBuilderRegistry,
10 | )
11 | 
12 | try:
13 |     from bs4.builder import HTML5TreeBuilder
14 |     HTML5LIB_PRESENT = True
15 | except ImportError:
16 |     HTML5LIB_PRESENT = False
17 | 
18 | try:
19 |     from bs4.builder import (
20 |         LXMLTreeBuilderForXML,
21 |         LXMLTreeBuilder,
22 |         )
23 |     LXML_PRESENT = True
24 | except ImportError:
25 |     LXML_PRESENT = False
26 | 
27 | 
28 | class BuiltInRegistryTest(unittest.TestCase):
29 |     """Test the built-in registry with the default builders registered."""
30 | 
31 |     def test_combination(self):
32 |         if LXML_PRESENT:
33 |             self.assertEqual(registry.lookup('fast', 'html'),
34 |                              LXMLTreeBuilder)
35 | 
36 |         if LXML_PRESENT:
37 |             self.assertEqual(registry.lookup('permissive', 'xml'),
38 |                              LXMLTreeBuilderForXML)
39 |         self.assertEqual(registry.lookup('strict', 'html'),
40 |                          HTMLParserTreeBuilder)
41 |         if HTML5LIB_PRESENT:
42 |             self.assertEqual(registry.lookup('html5lib', 'html'),
43 |                              HTML5TreeBuilder)
44 | 
45 |     def test_lookup_by_markup_type(self):
46 |         if LXML_PRESENT:
47 |             self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
48 |             self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
49 |         else:
50 |             self.assertEqual(registry.lookup('xml'), None)
51 |             if HTML5LIB_PRESENT:
52 |                 self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
53 |             else:
54 |                 self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
55 | 
56 |     def test_named_library(self):
57 |         if LXML_PRESENT:
58 |             self.assertEqual(registry.lookup('lxml', 'xml'),
59 |                              LXMLTreeBuilderForXML)
60 |             self.assertEqual(registry.lookup('lxml', 'html'),
61 |                              LXMLTreeBuilder)
62 |         if HTML5LIB_PRESENT:
63 |             self.assertEqual(registry.lookup('html5lib'),
64 |                              HTML5TreeBuilder)
65 | 
66 |         self.assertEqual(registry.lookup('html.parser'),
67 |                          HTMLParserTreeBuilder)
68 | 
69 |     def test_beautifulsoup_constructor_does_lookup(self):
70 |         # You can pass in a string.
71 |         BeautifulSoup("", features="html")
72 |         # Or a list of strings.
73 |         BeautifulSoup("", features=["html", "fast"])
74 | 
75 |         # You'll get an exception if BS can't find an appropriate
76 |         # builder.
77 |         self.assertRaises(ValueError, BeautifulSoup,
78 |                           "", features="no-such-feature")
79 | 
80 | class RegistryTest(unittest.TestCase):
81 |     """Test the TreeBuilderRegistry class in general."""
82 | 
83 |     def setUp(self):
84 |         self.registry = TreeBuilderRegistry()
85 | 
86 |     def builder_for_features(self, *feature_list):
87 |         cls = type('Builder_' + '_'.join(feature_list),
88 |                    (object,), {'features' : feature_list})
89 | 
90 |         self.registry.register(cls)
91 |         return cls
92 | 
93 |     def test_register_with_no_features(self):
94 |         builder = self.builder_for_features()
95 | 
96 |         # Since the builder advertises no features, you can't find it
97 |         # by looking up features.
98 |         self.assertEqual(self.registry.lookup('foo'), None)
99 | 
100 |         # But you can find it by doing a lookup with no features, if
101 |         # this happens to be the only registered builder.
102 |         self.assertEqual(self.registry.lookup(), builder)
103 | 
104 |     def test_register_with_features_makes_lookup_succeed(self):
105 |         builder = self.builder_for_features('foo', 'bar')
106 |         self.assertEqual(self.registry.lookup('foo'), builder)
107 |         self.assertEqual(self.registry.lookup('bar'), builder)
108 | 
109 |     def test_lookup_fails_when_no_builder_implements_feature(self):
110 |         builder = self.builder_for_features('foo', 'bar')
111 |         self.assertEqual(self.registry.lookup('baz'), None)
112 | 
113 |     def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
114 |         builder1 = self.builder_for_features('foo')
115 |         builder2 = self.builder_for_features('bar')
116 |         self.assertEqual(self.registry.lookup(), builder2)
117 | 
118 |     def test_lookup_fails_when_no_tree_builders_registered(self):
119 |         self.assertEqual(self.registry.lookup(), None)
120 | 
121 |     def test_lookup_gets_most_recent_builder_supporting_all_features(self):
122 |         has_one = self.builder_for_features('foo')
123 |         has_the_other = self.builder_for_features('bar')
124 |         has_both_early = self.builder_for_features('foo', 'bar', 'baz')
125 |         has_both_late = self.builder_for_features('foo', 'bar', 'quux')
126 |         lacks_one = self.builder_for_features('bar')
127 |         has_the_other = self.builder_for_features('foo')
128 | 
129 |         # There are two builders featuring 'foo' and 'bar', but
130 |         # the one that also features 'quux' was registered later.
131 |         self.assertEqual(self.registry.lookup('foo', 'bar'),
132 |                          has_both_late)
133 | 
134 |         # There is only one builder featuring 'foo', 'bar', and 'baz'.
135 |         self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
136 |                          has_both_early)
137 | 
138 |     def test_lookup_fails_when_cannot_reconcile_requested_features(self):
139 |         builder1 = self.builder_for_features('foo', 'bar')
140 |         builder2 = self.builder_for_features('foo', 'baz')
141 |         self.assertEqual(self.registry.lookup('bar', 'baz'), None)
142 | 
--------------------------------------------------------------------------------
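The registration and lookup rules these tests document can be exercised standalone; a short sketch with two made-up builder classes:

```
from bs4.builder import TreeBuilderRegistry

registry = TreeBuilderRegistry()

class HtmlBuilder(object):
    features = ['fast', 'html']

class XmlBuilder(object):
    features = ['fast', 'xml']

registry.register(HtmlBuilder)
registry.register(XmlBuilder)

print(registry.lookup('html'))         # HtmlBuilder
print(registry.lookup('fast'))         # XmlBuilder -- most recent registration wins
print(registry.lookup('html', 'xml'))  # None -- no single builder has both features
```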
/bs4/diagnose.py:
--------------------------------------------------------------------------------
1 | """Diagnostic functions, mainly for use when doing tech support."""
2 | import cProfile
3 | from io import StringIO
4 | from html.parser import HTMLParser
5 | import bs4
6 | from bs4 import BeautifulSoup, __version__
7 | from bs4.builder import builder_registry
8 | 
9 | import os
10 | import pstats
11 | import random
12 | import tempfile
13 | import time
14 | import traceback
15 | import sys
16 | import cProfile
17 | 
18 | def diagnose(data):
19 |     """Diagnostic suite for isolating common problems."""
20 |     print("Diagnostic running on Beautiful Soup %s" % __version__)
21 |     print("Python version %s" % sys.version)
22 | 
23 |     basic_parsers = ["html.parser", "html5lib", "lxml"]
24 |     for name in list(basic_parsers):  # iterate over a copy; the loop removes names
25 |         for builder in builder_registry.builders:
26 |             if name in builder.features:
27 |                 break
28 |         else:
29 |             basic_parsers.remove(name)
30 |             print((
31 |                 "I noticed that %s is not installed. Installing it may help." %
32 |                 name))
33 | 
34 |     if 'lxml' in basic_parsers:
35 |         basic_parsers.append(["lxml", "xml"])
36 |         from lxml import etree
37 |         print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
38 | 
39 |     if 'html5lib' in basic_parsers:
40 |         import html5lib
41 |         print("Found html5lib version %s" % html5lib.__version__)
42 | 
43 |     if hasattr(data, 'read'):
44 |         data = data.read()
45 |     elif os.path.exists(data):
46 |         print('"%s" looks like a filename. Reading data from the file.' % data)
47 |         data = open(data).read()
48 |     elif data.startswith("http:") or data.startswith("https:"):
49 |         print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
50 |         print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
51 |         return
52 |     print()
53 | 
54 |     for parser in basic_parsers:
55 |         print("Trying to parse your markup with %s" % parser)
56 |         success = False
57 |         try:
58 |             soup = BeautifulSoup(data, parser)
59 |             success = True
60 |         except Exception as e:
61 |             print("%s could not parse the markup." % parser)
62 |             traceback.print_exc()
63 |         if success:
64 |             print("Here's what %s did with the markup:" % parser)
65 |             print(soup.prettify())
66 | 
67 |         print("-" * 80)
68 | 
69 | def lxml_trace(data, html=True, **kwargs):
70 |     """Print out the lxml events that occur during parsing.
71 | 
72 |     This lets you see how lxml parses a document when no Beautiful
73 |     Soup code is running.
74 | """ 75 | from lxml import etree 76 | for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): 77 | print(("%s, %4s, %s" % (event, element.tag, element.text))) 78 | 79 | class AnnouncingParser(HTMLParser): 80 | """Announces HTMLParser parse events, without doing anything else.""" 81 | 82 | def _p(self, s): 83 | print(s) 84 | 85 | def handle_starttag(self, name, attrs): 86 | self._p("%s START" % name) 87 | 88 | def handle_endtag(self, name): 89 | self._p("%s END" % name) 90 | 91 | def handle_data(self, data): 92 | self._p("%s DATA" % data) 93 | 94 | def handle_charref(self, name): 95 | self._p("%s CHARREF" % name) 96 | 97 | def handle_entityref(self, name): 98 | self._p("%s ENTITYREF" % name) 99 | 100 | def handle_comment(self, data): 101 | self._p("%s COMMENT" % data) 102 | 103 | def handle_decl(self, data): 104 | self._p("%s DECL" % data) 105 | 106 | def unknown_decl(self, data): 107 | self._p("%s UNKNOWN-DECL" % data) 108 | 109 | def handle_pi(self, data): 110 | self._p("%s PI" % data) 111 | 112 | def htmlparser_trace(data): 113 | """Print out the HTMLParser events that occur during parsing. 114 | 115 | This lets you see how HTMLParser parses a document when no 116 | Beautiful Soup code is running. 117 | """ 118 | parser = AnnouncingParser() 119 | parser.feed(data) 120 | 121 | _vowels = "aeiou" 122 | _consonants = "bcdfghjklmnpqrstvwxyz" 123 | 124 | def rword(length=5): 125 | "Generate a random word-like string." 126 | s = '' 127 | for i in range(length): 128 | if i % 2 == 0: 129 | t = _consonants 130 | else: 131 | t = _vowels 132 | s += random.choice(t) 133 | return s 134 | 135 | def rsentence(length=4): 136 | "Generate a random sentence-like string." 137 | return " ".join(rword(random.randint(4,9)) for i in range(length)) 138 | 139 | def rdoc(num_elements=1000): 140 | """Randomly generate an invalid HTML document.""" 141 | tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] 142 | elements = [] 143 | for i in range(num_elements): 144 | choice = random.randint(0,3) 145 | if choice == 0: 146 | # New tag. 147 | tag_name = random.choice(tag_names) 148 | elements.append("<%s>" % tag_name) 149 | elif choice == 1: 150 | elements.append(rsentence(random.randint(1,4))) 151 | elif choice == 2: 152 | # Close a tag. 153 | tag_name = random.choice(tag_names) 154 | elements.append("" % tag_name) 155 | return "" + "\n".join(elements) + "" 156 | 157 | def benchmark_parsers(num_elements=100000): 158 | """Very basic head-to-head performance benchmark.""" 159 | print("Comparative parser benchmark on Beautiful Soup %s" % __version__) 160 | data = rdoc(num_elements) 161 | print("Generated a large invalid HTML document (%d bytes)." % len(data)) 162 | 163 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: 164 | success = False 165 | try: 166 | a = time.time() 167 | soup = BeautifulSoup(data, parser) 168 | b = time.time() 169 | success = True 170 | except Exception as e: 171 | print("%s could not parse the markup." % parser) 172 | traceback.print_exc() 173 | if success: 174 | print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) 175 | 176 | from lxml import etree 177 | a = time.time() 178 | etree.HTML(data) 179 | b = time.time() 180 | print("Raw lxml parsed the markup in %.2fs." % (b-a)) 181 | 182 | import html5lib 183 | parser = html5lib.HTMLParser() 184 | a = time.time() 185 | parser.parse(data) 186 | b = time.time() 187 | print("Raw html5lib parsed the markup in %.2fs." 
% (b-a)) 188 | 189 | def profile(num_elements=100000, parser="lxml"): 190 | 191 | filehandle = tempfile.NamedTemporaryFile() 192 | filename = filehandle.name 193 | 194 | data = rdoc(num_elements) 195 | vars = dict(bs4=bs4, data=data, parser=parser) 196 | cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) 197 | 198 | stats = pstats.Stats(filename) 199 | # stats.strip_dirs() 200 | stats.sort_stats("cumulative") 201 | stats.print_stats('_html5lib|bs4', 50) 202 | 203 | if __name__ == '__main__': 204 | diagnose(sys.stdin.read()) 205 | -------------------------------------------------------------------------------- /bs4/builder/_lxml.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'LXMLTreeBuilderForXML', 3 | 'LXMLTreeBuilder', 4 | ] 5 | 6 | from io import BytesIO 7 | from io import StringIO 8 | import collections 9 | from lxml import etree 10 | from bs4.element import Comment, Doctype, NamespacedAttribute 11 | from bs4.builder import ( 12 | FAST, 13 | HTML, 14 | HTMLTreeBuilder, 15 | PERMISSIVE, 16 | ParserRejectedMarkup, 17 | TreeBuilder, 18 | XML) 19 | from bs4.dammit import EncodingDetector 20 | 21 | LXML = 'lxml' 22 | 23 | class LXMLTreeBuilderForXML(TreeBuilder): 24 | DEFAULT_PARSER_CLASS = etree.XMLParser 25 | 26 | is_xml = True 27 | 28 | # Well, it's permissive by XML parser standards. 29 | features = [LXML, XML, FAST, PERMISSIVE] 30 | 31 | CHUNK_SIZE = 512 32 | 33 | # This namespace mapping is specified in the XML Namespace 34 | # standard. 35 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 36 | 37 | def default_parser(self, encoding): 38 | # This can either return a parser object or a class, which 39 | # will be instantiated with default arguments. 40 | if self._default_parser is not None: 41 | return self._default_parser 42 | return etree.XMLParser( 43 | target=self, strip_cdata=False, recover=True, encoding=encoding) 44 | 45 | def parser_for(self, encoding): 46 | # Use the default parser. 47 | parser = self.default_parser(encoding) 48 | 49 | if isinstance(parser, collections.Callable): 50 | # Instantiate the parser with default arguments 51 | parser = parser(target=self, strip_cdata=False, encoding=encoding) 52 | return parser 53 | 54 | def __init__(self, parser=None, empty_element_tags=None): 55 | # TODO: Issue a warning if parser is present but not a 56 | # callable, since that means there's no way to create new 57 | # parsers for different encodings. 58 | self._default_parser = parser 59 | if empty_element_tags is not None: 60 | self.empty_element_tags = set(empty_element_tags) 61 | self.soup = None 62 | self.nsmaps = [self.DEFAULT_NSMAPS] 63 | 64 | def _getNsTag(self, tag): 65 | # Split the namespace URL out of a fully-qualified lxml tag 66 | # name. Copied from lxml's src/lxml/sax.py. 67 | if tag[0] == '{': 68 | return tuple(tag[1:].split('}', 1)) 69 | else: 70 | return (None, tag) 71 | 72 | def prepare_markup(self, markup, user_specified_encoding=None, 73 | document_declared_encoding=None): 74 | """ 75 | :yield: A series of 4-tuples. 76 | (markup, encoding, declared encoding, 77 | has undergone character replacement) 78 | 79 | Each 4-tuple represents a strategy for parsing the document. 80 | """ 81 | if isinstance(markup, str): 82 | # We were given Unicode. Maybe lxml can parse Unicode on 83 | # this system? 84 | yield markup, None, document_declared_encoding, False 85 | 86 | if isinstance(markup, str): 87 | # No, apparently not. 
Convert the Unicode to UTF-8 and 88 | # tell lxml to parse it as UTF-8. 89 | yield (markup.encode("utf8"), "utf8", 90 | document_declared_encoding, False) 91 | 92 | # Instead of using UnicodeDammit to convert the bytestring to 93 | # Unicode using different encodings, use EncodingDetector to 94 | # iterate over the encodings, and tell lxml to try to parse 95 | # the document as each one in turn. 96 | is_html = not self.is_xml 97 | try_encodings = [user_specified_encoding, document_declared_encoding] 98 | detector = EncodingDetector(markup, try_encodings, is_html) 99 | for encoding in detector.encodings: 100 | yield (detector.markup, encoding, document_declared_encoding, False) 101 | 102 | def feed(self, markup): 103 | if isinstance(markup, bytes): 104 | markup = BytesIO(markup) 105 | elif isinstance(markup, str): 106 | markup = StringIO(markup) 107 | 108 | # Call feed() at least once, even if the markup is empty, 109 | # or the parser won't be initialized. 110 | data = markup.read(self.CHUNK_SIZE) 111 | try: 112 | self.parser = self.parser_for(self.soup.original_encoding) 113 | self.parser.feed(data) 114 | while len(data) != 0: 115 | # Now call feed() on the rest of the data, chunk by chunk. 116 | data = markup.read(self.CHUNK_SIZE) 117 | if len(data) != 0: 118 | self.parser.feed(data) 119 | self.parser.close() 120 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 121 | raise ParserRejectedMarkup(str(e)) 122 | 123 | def close(self): 124 | self.nsmaps = [self.DEFAULT_NSMAPS] 125 | 126 | def start(self, name, attrs, nsmap={}): 127 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 128 | attrs = dict(attrs) 129 | nsprefix = None 130 | # Invert each namespace map as it comes in. 131 | if len(self.nsmaps) > 1: 132 | # There are no new namespaces for this tag, but 133 | # non-default namespaces are in play, so we need a 134 | # separate tag stack to know when they end. 135 | self.nsmaps.append(None) 136 | elif len(nsmap) > 0: 137 | # A new namespace mapping has come into play. 138 | inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) 139 | self.nsmaps.append(inverted_nsmap) 140 | # Also treat the namespace mapping as a set of attributes on the 141 | # tag, so we can recreate it later. 142 | attrs = attrs.copy() 143 | for prefix, namespace in list(nsmap.items()): 144 | attribute = NamespacedAttribute( 145 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 146 | attrs[attribute] = namespace 147 | 148 | # Namespaces are in play. Find any attributes that came in 149 | # from lxml with namespaces attached to their names, and 150 | # turn then into NamespacedAttribute objects. 
151 | new_attrs = {} 152 | for attr, value in list(attrs.items()): 153 | namespace, attr = self._getNsTag(attr) 154 | if namespace is None: 155 | new_attrs[attr] = value 156 | else: 157 | nsprefix = self._prefix_for_namespace(namespace) 158 | attr = NamespacedAttribute(nsprefix, attr, namespace) 159 | new_attrs[attr] = value 160 | attrs = new_attrs 161 | 162 | namespace, name = self._getNsTag(name) 163 | nsprefix = self._prefix_for_namespace(namespace) 164 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) 165 | 166 | def _prefix_for_namespace(self, namespace): 167 | """Find the currently active prefix for the given namespace.""" 168 | if namespace is None: 169 | return None 170 | for inverted_nsmap in reversed(self.nsmaps): 171 | if inverted_nsmap is not None and namespace in inverted_nsmap: 172 | return inverted_nsmap[namespace] 173 | return None 174 | 175 | def end(self, name): 176 | self.soup.endData() 177 | completed_tag = self.soup.tagStack[-1] 178 | namespace, name = self._getNsTag(name) 179 | nsprefix = None 180 | if namespace is not None: 181 | for inverted_nsmap in reversed(self.nsmaps): 182 | if inverted_nsmap is not None and namespace in inverted_nsmap: 183 | nsprefix = inverted_nsmap[namespace] 184 | break 185 | self.soup.handle_endtag(name, nsprefix) 186 | if len(self.nsmaps) > 1: 187 | # This tag, or one of its parents, introduced a namespace 188 | # mapping, so pop it off the stack. 189 | self.nsmaps.pop() 190 | 191 | def pi(self, target, data): 192 | pass 193 | 194 | def data(self, content): 195 | self.soup.handle_data(content) 196 | 197 | def doctype(self, name, pubid, system): 198 | self.soup.endData() 199 | doctype = Doctype.for_name_and_ids(name, pubid, system) 200 | self.soup.object_was_parsed(doctype) 201 | 202 | def comment(self, content): 203 | "Handle comments as Comment objects." 204 | self.soup.endData() 205 | self.soup.handle_data(content) 206 | self.soup.endData(Comment) 207 | 208 | def test_fragment_to_document(self, fragment): 209 | """See `TreeBuilder`.""" 210 | return '\n%s' % fragment 211 | 212 | 213 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): 214 | 215 | features = [LXML, HTML, FAST, PERMISSIVE] 216 | is_xml = False 217 | 218 | def default_parser(self, encoding): 219 | return etree.HTMLParser 220 | 221 | def feed(self, markup): 222 | encoding = self.soup.original_encoding 223 | try: 224 | self.parser = self.parser_for(encoding) 225 | self.parser.feed(markup) 226 | self.parser.close() 227 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 228 | raise ParserRejectedMarkup(str(e)) 229 | 230 | 231 | def test_fragment_to_document(self, fragment): 232 | """See `TreeBuilder`.""" 233 | return '%s' % fragment 234 | -------------------------------------------------------------------------------- /bs4/builder/_htmlparser.py: -------------------------------------------------------------------------------- 1 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" 2 | 3 | __all__ = [ 4 | 'HTMLParserTreeBuilder', 5 | ] 6 | 7 | from html.parser import ( 8 | HTMLParser, 9 | HTMLParseError, 10 | ) 11 | import sys 12 | import warnings 13 | 14 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' 15 | # argument, which we'd like to set to False. Unfortunately, 16 | # http://bugs.python.org/issue13273 makes strict=True a better bet 17 | # before Python 3.2.3. 18 | # 19 | # At the end of this file, we monkeypatch HTMLParser so that 20 | # strict=True works well on Python 3.2.2. 
21 | major, minor, release = sys.version_info[:3] 22 | CONSTRUCTOR_TAKES_STRICT = ( 23 | major > 3 24 | or (major == 3 and minor > 2) 25 | or (major == 3 and minor == 2 and release >= 3)) 26 | 27 | from bs4.element import ( 28 | CData, 29 | Comment, 30 | Declaration, 31 | Doctype, 32 | ProcessingInstruction, 33 | ) 34 | from bs4.dammit import EntitySubstitution, UnicodeDammit 35 | 36 | from bs4.builder import ( 37 | HTML, 38 | HTMLTreeBuilder, 39 | STRICT, 40 | ) 41 | 42 | 43 | HTMLPARSER = 'html.parser' 44 | 45 | class BeautifulSoupHTMLParser(HTMLParser): 46 | def handle_starttag(self, name, attrs): 47 | # XXX namespace 48 | attr_dict = {} 49 | for key, value in attrs: 50 | # Change None attribute values to the empty string 51 | # for consistency with the other tree builders. 52 | if value is None: 53 | value = '' 54 | attr_dict[key] = value 55 | attrvalue = '""' 56 | self.soup.handle_starttag(name, None, None, attr_dict) 57 | 58 | def handle_endtag(self, name): 59 | self.soup.handle_endtag(name) 60 | 61 | def handle_data(self, data): 62 | self.soup.handle_data(data) 63 | 64 | def handle_charref(self, name): 65 | # XXX workaround for a bug in HTMLParser. Remove this once 66 | # it's fixed. 67 | if name.startswith('x'): 68 | real_name = int(name.lstrip('x'), 16) 69 | elif name.startswith('X'): 70 | real_name = int(name.lstrip('X'), 16) 71 | else: 72 | real_name = int(name) 73 | 74 | try: 75 | data = chr(real_name) 76 | except (ValueError, OverflowError) as e: 77 | data = "\N{REPLACEMENT CHARACTER}" 78 | 79 | self.handle_data(data) 80 | 81 | def handle_entityref(self, name): 82 | character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) 83 | if character is not None: 84 | data = character 85 | else: 86 | data = "&%s;" % name 87 | self.handle_data(data) 88 | 89 | def handle_comment(self, data): 90 | self.soup.endData() 91 | self.soup.handle_data(data) 92 | self.soup.endData(Comment) 93 | 94 | def handle_decl(self, data): 95 | self.soup.endData() 96 | if data.startswith("DOCTYPE "): 97 | data = data[len("DOCTYPE "):] 98 | elif data == 'DOCTYPE': 99 | # i.e. "" 100 | data = '' 101 | self.soup.handle_data(data) 102 | self.soup.endData(Doctype) 103 | 104 | def unknown_decl(self, data): 105 | if data.upper().startswith('CDATA['): 106 | cls = CData 107 | data = data[len('CDATA['):] 108 | else: 109 | cls = Declaration 110 | self.soup.endData() 111 | self.soup.handle_data(data) 112 | self.soup.endData(cls) 113 | 114 | def handle_pi(self, data): 115 | self.soup.endData() 116 | if data.endswith("?") and data.lower().startswith("xml"): 117 | # "An XHTML processing instruction using the trailing '?' 118 | # will cause the '?' to be included in data." - HTMLParser 119 | # docs. 120 | # 121 | # Strip the question mark so we don't end up with two 122 | # question marks. 123 | data = data[:-1] 124 | self.soup.handle_data(data) 125 | self.soup.endData(ProcessingInstruction) 126 | 127 | 128 | class HTMLParserTreeBuilder(HTMLTreeBuilder): 129 | 130 | is_xml = False 131 | features = [HTML, STRICT, HTMLPARSER] 132 | 133 | def __init__(self, *args, **kwargs): 134 | if CONSTRUCTOR_TAKES_STRICT: 135 | kwargs['strict'] = False 136 | self.parser_args = (args, kwargs) 137 | 138 | def prepare_markup(self, markup, user_specified_encoding=None, 139 | document_declared_encoding=None): 140 | """ 141 | :return: A 4-tuple (markup, original encoding, encoding 142 | declared within markup, whether any characters had to be 143 | replaced with REPLACEMENT CHARACTER). 
144 | """ 145 | if isinstance(markup, str): 146 | yield (markup, None, None, False) 147 | return 148 | 149 | try_encodings = [user_specified_encoding, document_declared_encoding] 150 | dammit = UnicodeDammit(markup, try_encodings, is_html=True) 151 | yield (dammit.markup, dammit.original_encoding, 152 | dammit.declared_html_encoding, 153 | dammit.contains_replacement_characters) 154 | 155 | def feed(self, markup): 156 | args, kwargs = self.parser_args 157 | parser = BeautifulSoupHTMLParser(*args, **kwargs) 158 | parser.soup = self.soup 159 | try: 160 | parser.feed(markup) 161 | except HTMLParseError as e: 162 | warnings.warn(RuntimeWarning( 163 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 164 | raise e 165 | 166 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some 167 | # 3.2.3 code. This ensures they don't treat markup like
<br/>
as a 168 | # string. 169 | # 170 | # XXX This code can be removed once most Python 3 users are on 3.2.3. 171 | if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: 172 | import re 173 | attrfind_tolerant = re.compile( 174 | r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' 175 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') 176 | HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant 177 | 178 | locatestarttagend = re.compile(r""" 179 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name 180 | (?:\s+ # whitespace before attribute name 181 | (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name 182 | (?:\s*=\s* # value indicator 183 | (?:'[^']*' # LITA-enclosed value 184 | |\"[^\"]*\" # LIT-enclosed value 185 | |[^'\">\s]+ # bare value 186 | ) 187 | )? 188 | ) 189 | )* 190 | \s* # trailing whitespace 191 | """, re.VERBOSE) 192 | BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend 193 | 194 | from html.parser import tagfind, attrfind 195 | 196 | def parse_starttag(self, i): 197 | self.__starttag_text = None 198 | endpos = self.check_for_whole_start_tag(i) 199 | if endpos < 0: 200 | return endpos 201 | rawdata = self.rawdata 202 | self.__starttag_text = rawdata[i:endpos] 203 | 204 | # Now parse the data between i+1 and j into a tag and attrs 205 | attrs = [] 206 | match = tagfind.match(rawdata, i+1) 207 | assert match, 'unexpected call to parse_starttag()' 208 | k = match.end() 209 | self.lasttag = tag = rawdata[i+1:k].lower() 210 | while k < endpos: 211 | if self.strict: 212 | m = attrfind.match(rawdata, k) 213 | else: 214 | m = attrfind_tolerant.match(rawdata, k) 215 | if not m: 216 | break 217 | attrname, rest, attrvalue = m.group(1, 2, 3) 218 | if not rest: 219 | attrvalue = None 220 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 221 | attrvalue[:1] == '"' == attrvalue[-1:]: 222 | attrvalue = attrvalue[1:-1] 223 | if attrvalue: 224 | attrvalue = self.unescape(attrvalue) 225 | attrs.append((attrname.lower(), attrvalue)) 226 | k = m.end() 227 | 228 | end = rawdata[k:endpos].strip() 229 | if end not in (">", "/>"): 230 | lineno, offset = self.getpos() 231 | if "\n" in self.__starttag_text: 232 | lineno = lineno + self.__starttag_text.count("\n") 233 | offset = len(self.__starttag_text) \ 234 | - self.__starttag_text.rfind("\n") 235 | else: 236 | offset = offset + len(self.__starttag_text) 237 | if self.strict: 238 | self.error("junk characters in start tag: %r" 239 | % (rawdata[k:endpos][:20],)) 240 | self.handle_data(rawdata[i:endpos]) 241 | return endpos 242 | if end.endswith('/>'): 243 | # XHTML-style empty tag: 244 | self.handle_startendtag(tag, attrs) 245 | else: 246 | self.handle_starttag(tag, attrs) 247 | if tag in self.CDATA_CONTENT_ELEMENTS: 248 | self.set_cdata_mode(tag) 249 | return endpos 250 | 251 | def set_cdata_mode(self, elem): 252 | self.cdata_elem = elem.lower() 253 | self.interesting = re.compile(r'' % self.cdata_elem, re.I) 254 | 255 | BeautifulSoupHTMLParser.parse_starttag = parse_starttag 256 | BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode 257 | 258 | CONSTRUCTOR_TAKES_STRICT = True 259 | -------------------------------------------------------------------------------- /bs4/builder/_html5lib.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'HTML5TreeBuilder', 3 | ] 4 | 5 | import warnings 6 | from bs4.builder import ( 7 | PERMISSIVE, 8 | HTML, 9 | HTML_5, 10 | HTMLTreeBuilder, 11 | ) 12 | from bs4.element import NamespacedAttribute 13 | import html5lib 14 | from html5lib.constants import namespaces 15 | 
from bs4.element import ( 16 | Comment, 17 | Doctype, 18 | NavigableString, 19 | Tag, 20 | ) 21 | 22 | class HTML5TreeBuilder(HTMLTreeBuilder): 23 | """Use html5lib to build a tree.""" 24 | 25 | features = ['html5lib', PERMISSIVE, HTML_5, HTML] 26 | 27 | def prepare_markup(self, markup, user_specified_encoding): 28 | # Store the user-specified encoding for use later on. 29 | self.user_specified_encoding = user_specified_encoding 30 | yield (markup, None, None, False) 31 | 32 | # These methods are defined by Beautiful Soup. 33 | def feed(self, markup): 34 | if self.soup.parse_only is not None: 35 | warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") 36 | parser = html5lib.HTMLParser(tree=self.create_treebuilder) 37 | doc = parser.parse(markup, encoding=self.user_specified_encoding) 38 | 39 | # Set the character encoding detected by the tokenizer. 40 | if isinstance(markup, str): 41 | # We need to special-case this because html5lib sets 42 | # charEncoding to UTF-8 if it gets Unicode input. 43 | doc.original_encoding = None 44 | else: 45 | doc.original_encoding = parser.tokenizer.stream.charEncoding[0] 46 | 47 | def create_treebuilder(self, namespaceHTMLElements): 48 | self.underlying_builder = TreeBuilderForHtml5lib( 49 | self.soup, namespaceHTMLElements) 50 | return self.underlying_builder 51 | 52 | def test_fragment_to_document(self, fragment): 53 | """See `TreeBuilder`.""" 54 | return '%s' % fragment 55 | 56 | 57 | class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): 58 | 59 | def __init__(self, soup, namespaceHTMLElements): 60 | self.soup = soup 61 | super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) 62 | 63 | def documentClass(self): 64 | self.soup.reset() 65 | return Element(self.soup, self.soup, None) 66 | 67 | def insertDoctype(self, token): 68 | name = token["name"] 69 | publicId = token["publicId"] 70 | systemId = token["systemId"] 71 | 72 | doctype = Doctype.for_name_and_ids(name, publicId, systemId) 73 | self.soup.object_was_parsed(doctype) 74 | 75 | def elementClass(self, name, namespace): 76 | tag = self.soup.new_tag(name, namespace) 77 | return Element(tag, self.soup, namespace) 78 | 79 | def commentClass(self, data): 80 | return TextNode(Comment(data), self.soup) 81 | 82 | def fragmentClass(self): 83 | self.soup = BeautifulSoup("") 84 | self.soup.name = "[document_fragment]" 85 | return Element(self.soup, self.soup, None) 86 | 87 | def appendChild(self, node): 88 | # XXX This code is not covered by the BS4 tests. 
89 | self.soup.append(node.element) 90 | 91 | def getDocument(self): 92 | return self.soup 93 | 94 | def getFragment(self): 95 | return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element 96 | 97 | class AttrList(object): 98 | def __init__(self, element): 99 | self.element = element 100 | self.attrs = dict(self.element.attrs) 101 | def __iter__(self): 102 | return list(self.attrs.items()).__iter__() 103 | def __setitem__(self, name, value): 104 | "set attr", name, value 105 | self.element[name] = value 106 | def items(self): 107 | return list(self.attrs.items()) 108 | def keys(self): 109 | return list(self.attrs.keys()) 110 | def __len__(self): 111 | return len(self.attrs) 112 | def __getitem__(self, name): 113 | return self.attrs[name] 114 | def __contains__(self, name): 115 | return name in list(self.attrs.keys()) 116 | 117 | 118 | class Element(html5lib.treebuilders._base.Node): 119 | def __init__(self, element, soup, namespace): 120 | html5lib.treebuilders._base.Node.__init__(self, element.name) 121 | self.element = element 122 | self.soup = soup 123 | self.namespace = namespace 124 | 125 | def appendChild(self, node): 126 | string_child = child = None 127 | if isinstance(node, str): 128 | # Some other piece of code decided to pass in a string 129 | # instead of creating a TextElement object to contain the 130 | # string. 131 | string_child = child = node 132 | elif isinstance(node, Tag): 133 | # Some other piece of code decided to pass in a Tag 134 | # instead of creating an Element object to contain the 135 | # Tag. 136 | child = node 137 | elif node.element.__class__ == NavigableString: 138 | string_child = child = node.element 139 | else: 140 | child = node.element 141 | 142 | if not isinstance(child, str) and child.parent is not None: 143 | node.element.extract() 144 | 145 | if (string_child and self.element.contents 146 | and self.element.contents[-1].__class__ == NavigableString): 147 | # We are appending a string onto another string. 148 | # TODO This has O(n^2) performance, for input like 149 | # "aaa..." 150 | old_element = self.element.contents[-1] 151 | new_element = self.soup.new_string(old_element + string_child) 152 | old_element.replace_with(new_element) 153 | self.soup._most_recent_element = new_element 154 | else: 155 | if isinstance(node, str): 156 | # Create a brand new NavigableString from this string. 157 | child = self.soup.new_string(node) 158 | 159 | # Tell Beautiful Soup to act as if it parsed this element 160 | # immediately after the parent's last descendant. (Or 161 | # immediately after the parent, if it has no children.) 162 | if self.element.contents: 163 | most_recent_element = self.element._last_descendant(False) 164 | else: 165 | most_recent_element = self.element 166 | 167 | self.soup.object_was_parsed( 168 | child, parent=self.element, 169 | most_recent_element=most_recent_element) 170 | 171 | def getAttributes(self): 172 | return AttrList(self.element) 173 | 174 | def setAttributes(self, attributes): 175 | if attributes is not None and len(attributes) > 0: 176 | 177 | converted_attributes = [] 178 | for name, value in list(attributes.items()): 179 | if isinstance(name, tuple): 180 | new_name = NamespacedAttribute(*name) 181 | del attributes[name] 182 | attributes[new_name] = value 183 | 184 | self.soup.builder._replace_cdata_list_attribute_values( 185 | self.name, attributes) 186 | for name, value in list(attributes.items()): 187 | self.element[name] = value 188 | 189 | # The attributes may contain variables that need substitution. 
190 | # Call set_up_substitutions manually. 191 | # 192 | # The Tag constructor called this method when the Tag was created, 193 | # but we just set/changed the attributes, so call it again. 194 | self.soup.builder.set_up_substitutions(self.element) 195 | attributes = property(getAttributes, setAttributes) 196 | 197 | def insertText(self, data, insertBefore=None): 198 | if insertBefore: 199 | text = TextNode(self.soup.new_string(data), self.soup) 200 | self.insertBefore(data, insertBefore) 201 | else: 202 | self.appendChild(data) 203 | 204 | def insertBefore(self, node, refNode): 205 | index = self.element.index(refNode.element) 206 | if (node.element.__class__ == NavigableString and self.element.contents 207 | and self.element.contents[index-1].__class__ == NavigableString): 208 | # (See comments in appendChild) 209 | old_node = self.element.contents[index-1] 210 | new_str = self.soup.new_string(old_node + node.element) 211 | old_node.replace_with(new_str) 212 | else: 213 | self.element.insert(index, node.element) 214 | node.parent = self 215 | 216 | def removeChild(self, node): 217 | node.element.extract() 218 | 219 | def reparentChildren(self, new_parent): 220 | """Move all of this tag's children into another tag.""" 221 | element = self.element 222 | new_parent_element = new_parent.element 223 | # Determine what this tag's next_element will be once all the children 224 | # are removed. 225 | final_next_element = element.next_sibling 226 | 227 | new_parents_last_descendant = new_parent_element._last_descendant(False, False) 228 | if len(new_parent_element.contents) > 0: 229 | # The new parent already contains children. We will be 230 | # appending this tag's children to the end. 231 | new_parents_last_child = new_parent_element.contents[-1] 232 | new_parents_last_descendant_next_element = new_parents_last_descendant.next_element 233 | else: 234 | # The new parent contains no children. 235 | new_parents_last_child = None 236 | new_parents_last_descendant_next_element = new_parent_element.next_element 237 | 238 | to_append = element.contents 239 | append_after = new_parent.element.contents 240 | if len(to_append) > 0: 241 | # Set the first child's previous_element and previous_sibling 242 | # to elements within the new parent 243 | first_child = to_append[0] 244 | first_child.previous_element = new_parents_last_descendant 245 | first_child.previous_sibling = new_parents_last_child 246 | 247 | # Fix the last child's next_element and next_sibling 248 | last_child = to_append[-1] 249 | last_child.next_element = new_parents_last_descendant_next_element 250 | last_child.next_sibling = None 251 | 252 | for child in to_append: 253 | child.parent = new_parent_element 254 | new_parent_element.contents.append(child) 255 | 256 | # Now that this element has no children, change its .next_element. 
257 | element.contents = [] 258 | element.next_element = final_next_element 259 | 260 | def cloneNode(self): 261 | tag = self.soup.new_tag(self.element.name, self.namespace) 262 | node = Element(tag, self.soup, self.namespace) 263 | for key,value in self.attributes: 264 | node.attributes[key] = value 265 | return node 266 | 267 | def hasContent(self): 268 | return self.element.contents 269 | 270 | def getNameTuple(self): 271 | if self.namespace == None: 272 | return namespaces["html"], self.name 273 | else: 274 | return self.namespace, self.name 275 | 276 | nameTuple = property(getNameTuple) 277 | 278 | class TextNode(Element): 279 | def __init__(self, element, soup): 280 | html5lib.treebuilders._base.Node.__init__(self, None) 281 | self.element = element 282 | self.soup = soup 283 | 284 | def cloneNode(self): 285 | raise NotImplementedError 286 | -------------------------------------------------------------------------------- /bs4/builder/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import itertools 3 | import sys 4 | from bs4.element import ( 5 | CharsetMetaAttributeValue, 6 | ContentMetaAttributeValue, 7 | whitespace_re 8 | ) 9 | 10 | __all__ = [ 11 | 'HTMLTreeBuilder', 12 | 'SAXTreeBuilder', 13 | 'TreeBuilder', 14 | 'TreeBuilderRegistry', 15 | ] 16 | 17 | # Some useful features for a TreeBuilder to have. 18 | FAST = 'fast' 19 | PERMISSIVE = 'permissive' 20 | STRICT = 'strict' 21 | XML = 'xml' 22 | HTML = 'html' 23 | HTML_5 = 'html5' 24 | 25 | 26 | class TreeBuilderRegistry(object): 27 | 28 | def __init__(self): 29 | self.builders_for_feature = defaultdict(list) 30 | self.builders = [] 31 | 32 | def register(self, treebuilder_class): 33 | """Register a treebuilder based on its advertised features.""" 34 | for feature in treebuilder_class.features: 35 | self.builders_for_feature[feature].insert(0, treebuilder_class) 36 | self.builders.insert(0, treebuilder_class) 37 | 38 | def lookup(self, *features): 39 | if len(self.builders) == 0: 40 | # There are no builders at all. 41 | return None 42 | 43 | if len(features) == 0: 44 | # They didn't ask for any features. Give them the most 45 | # recently registered builder. 46 | return self.builders[0] 47 | 48 | # Go down the list of features in order, and eliminate any builders 49 | # that don't match every feature. 50 | features = list(features) 51 | features.reverse() 52 | candidates = None 53 | candidate_set = None 54 | while len(features) > 0: 55 | feature = features.pop() 56 | we_have_the_feature = self.builders_for_feature.get(feature, []) 57 | if len(we_have_the_feature) > 0: 58 | if candidates is None: 59 | candidates = we_have_the_feature 60 | candidate_set = set(candidates) 61 | else: 62 | # Eliminate any candidates that don't have this feature. 63 | candidate_set = candidate_set.intersection( 64 | set(we_have_the_feature)) 65 | 66 | # The only valid candidates are the ones in candidate_set. 67 | # Go through the original list of candidates and pick the first one 68 | # that's in candidate_set. 69 | if candidate_set is None: 70 | return None 71 | for candidate in candidates: 72 | if candidate in candidate_set: 73 | return candidate 74 | return None 75 | 76 | # The BeautifulSoup class will take feature lists from developers and use them 77 | # to look up builders in this registry. 
78 | builder_registry = TreeBuilderRegistry() 79 | 80 | class TreeBuilder(object): 81 | """Turn a document into a Beautiful Soup object tree.""" 82 | 83 | features = [] 84 | 85 | is_xml = False 86 | preserve_whitespace_tags = set() 87 | empty_element_tags = None # A tag will be considered an empty-element 88 | # tag when and only when it has no contents. 89 | 90 | # A value for these tag/attribute combinations is a space- or 91 | # comma-separated list of CDATA, rather than a single CDATA. 92 | cdata_list_attributes = {} 93 | 94 | 95 | def __init__(self): 96 | self.soup = None 97 | 98 | def reset(self): 99 | pass 100 | 101 | def can_be_empty_element(self, tag_name): 102 | """Might a tag with this name be an empty-element tag? 103 | 104 | The final markup may or may not actually present this tag as 105 | self-closing. 106 | 107 | For instance: an HTMLBuilder does not consider a
<p> tag to be 108 | an empty-element tag (it's not in 109 | HTMLBuilder.empty_element_tags). This means an empty <p> tag 110 | will be presented as "<p></p>", not "<p/>
". 111 | 112 | The default implementation has no opinion about which tags are 113 | empty-element tags, so a tag will be presented as an 114 | empty-element tag if and only if it has no contents. 115 | "" will become "", and "bar" will 116 | be left alone. 117 | """ 118 | if self.empty_element_tags is None: 119 | return True 120 | return tag_name in self.empty_element_tags 121 | 122 | def feed(self, markup): 123 | raise NotImplementedError() 124 | 125 | def prepare_markup(self, markup, user_specified_encoding=None, 126 | document_declared_encoding=None): 127 | return markup, None, None, False 128 | 129 | def test_fragment_to_document(self, fragment): 130 | """Wrap an HTML fragment to make it look like a document. 131 | 132 | Different parsers do this differently. For instance, lxml 133 | introduces an empty tag, and html5lib 134 | doesn't. Abstracting this away lets us write simple tests 135 | which run HTML fragments through the parser and compare the 136 | results against other HTML fragments. 137 | 138 | This method should not be used outside of tests. 139 | """ 140 | return fragment 141 | 142 | def set_up_substitutions(self, tag): 143 | return False 144 | 145 | def _replace_cdata_list_attribute_values(self, tag_name, attrs): 146 | """Replaces class="foo bar" with class=["foo", "bar"] 147 | 148 | Modifies its input in place. 149 | """ 150 | if not attrs: 151 | return attrs 152 | if self.cdata_list_attributes: 153 | universal = self.cdata_list_attributes.get('*', []) 154 | tag_specific = self.cdata_list_attributes.get( 155 | tag_name.lower(), None) 156 | for attr in list(attrs.keys()): 157 | if attr in universal or (tag_specific and attr in tag_specific): 158 | # We have a "class"-type attribute whose string 159 | # value is a whitespace-separated list of 160 | # values. Split it into a list. 161 | value = attrs[attr] 162 | if isinstance(value, str): 163 | values = whitespace_re.split(value) 164 | else: 165 | # html5lib sometimes calls setAttributes twice 166 | # for the same tag when rearranging the parse 167 | # tree. On the second call the attribute value 168 | # here is already a list. If this happens, 169 | # leave the value alone rather than trying to 170 | # split it again. 171 | values = value 172 | attrs[attr] = values 173 | return attrs 174 | 175 | class SAXTreeBuilder(TreeBuilder): 176 | """A Beautiful Soup treebuilder that listens for SAX events.""" 177 | 178 | def feed(self, markup): 179 | raise NotImplementedError() 180 | 181 | def close(self): 182 | pass 183 | 184 | def startElement(self, name, attrs): 185 | attrs = dict((key[1], value) for key, value in list(attrs.items())) 186 | #print "Start %s, %r" % (name, attrs) 187 | self.soup.handle_starttag(name, attrs) 188 | 189 | def endElement(self, name): 190 | #print "End %s" % name 191 | self.soup.handle_endtag(name) 192 | 193 | def startElementNS(self, nsTuple, nodeName, attrs): 194 | # Throw away (ns, nodeName) for now. 195 | self.startElement(nodeName, attrs) 196 | 197 | def endElementNS(self, nsTuple, nodeName): 198 | # Throw away (ns, nodeName) for now. 199 | self.endElement(nodeName) 200 | #handler.endElementNS((ns, node.nodeName), node.nodeName) 201 | 202 | def startPrefixMapping(self, prefix, nodeValue): 203 | # Ignore the prefix for now. 204 | pass 205 | 206 | def endPrefixMapping(self, prefix): 207 | # Ignore the prefix for now. 
208 | # handler.endPrefixMapping(prefix) 209 | pass 210 | 211 | def characters(self, content): 212 | self.soup.handle_data(content) 213 | 214 | def startDocument(self): 215 | pass 216 | 217 | def endDocument(self): 218 | pass 219 | 220 | 221 | class HTMLTreeBuilder(TreeBuilder): 222 | """This TreeBuilder knows facts about HTML. 223 | 224 | Such as which tags are empty-element tags. 225 | """ 226 | 227 | preserve_whitespace_tags = set(['pre', 'textarea']) 228 | empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 229 | 'spacer', 'link', 'frame', 'base']) 230 | 231 | # The HTML standard defines these attributes as containing a 232 | # space-separated list of values, not a single value. That is, 233 | # class="foo bar" means that the 'class' attribute has two values, 234 | # 'foo' and 'bar', not the single value 'foo bar'. When we 235 | # encounter one of these attributes, we will parse its value into 236 | # a list of values if possible. Upon output, the list will be 237 | # converted back into a string. 238 | cdata_list_attributes = { 239 | "*" : ['class', 'accesskey', 'dropzone'], 240 | "a" : ['rel', 'rev'], 241 | "link" : ['rel', 'rev'], 242 | "td" : ["headers"], 243 | "th" : ["headers"], 244 | "td" : ["headers"], 245 | "form" : ["accept-charset"], 246 | "object" : ["archive"], 247 | 248 | # These are HTML5 specific, as are *.accesskey and *.dropzone above. 249 | "area" : ["rel"], 250 | "icon" : ["sizes"], 251 | "iframe" : ["sandbox"], 252 | "output" : ["for"], 253 | } 254 | 255 | def set_up_substitutions(self, tag): 256 | # We are only interested in tags 257 | if tag.name != 'meta': 258 | return False 259 | 260 | http_equiv = tag.get('http-equiv') 261 | content = tag.get('content') 262 | charset = tag.get('charset') 263 | 264 | # We are interested in tags that say what encoding the 265 | # document was originally in. This means HTML 5-style 266 | # tags that provide the "charset" attribute. It also means 267 | # HTML 4-style tags that provide the "content" 268 | # attribute and have "http-equiv" set to "content-type". 269 | # 270 | # In both cases we will replace the value of the appropriate 271 | # attribute with a standin object that can take on any 272 | # encoding. 273 | meta_encoding = None 274 | if charset is not None: 275 | # HTML 5 style: 276 | # 277 | meta_encoding = charset 278 | tag['charset'] = CharsetMetaAttributeValue(charset) 279 | 280 | elif (content is not None and http_equiv is not None 281 | and http_equiv.lower() == 'content-type'): 282 | # HTML 4 style: 283 | # 284 | tag['content'] = ContentMetaAttributeValue(content) 285 | 286 | return (meta_encoding is not None) 287 | 288 | def register_treebuilders_from(module): 289 | """Copy TreeBuilders from the given module into this module.""" 290 | # I'm fairly sure this is not the best way to do this. 291 | this_module = sys.modules['bs4.builder'] 292 | for name in module.__all__: 293 | obj = getattr(module, name) 294 | 295 | if issubclass(obj, TreeBuilder): 296 | setattr(this_module, name, obj) 297 | this_module.__all__.append(name) 298 | # Register the builder while we're at it. 299 | this_module.builder_registry.register(obj) 300 | 301 | class ParserRejectedMarkup(Exception): 302 | pass 303 | 304 | # Builders are registered in reverse order of priority, so that custom 305 | # builder registrations will take precedence. In general, we want lxml 306 | # to take precedence over html5lib, because it's faster. And we only 307 | # want to use HTMLParser as a last result. 308 | from . 
import _htmlparser 309 | register_treebuilders_from(_htmlparser) 310 | try: 311 | from . import _html5lib 312 | register_treebuilders_from(_html5lib) 313 | except ImportError: 314 | # They don't have html5lib installed. 315 | pass 316 | try: 317 | from . import _lxml 318 | register_treebuilders_from(_lxml) 319 | except ImportError: 320 | # They don't have lxml installed. 321 | pass 322 | -------------------------------------------------------------------------------- /tenkou.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # version 0.0.2 4 | 5 | import urllib.request, urllib.parse, urllib.error 6 | import argparse, re, os, sys, copy 7 | from bs4 import BeautifulSoup 8 | 9 | 10 | def ptStrLiterally(str): 11 | for i in str: 12 | try: 13 | print(i, end='') 14 | except UnicodeEncodeError as e: 15 | pass 16 | print('') 17 | 18 | 19 | def puts(str): 20 | try: 21 | print(str) 22 | except UnicodeEncodeError as e: 23 | # print(e.reason) 24 | ptStrLiterally(str) 25 | else: 26 | pass 27 | 28 | 29 | def searchSubStr(str, pattern_start, pattern_end, quiet=False): 30 | try: 31 | start = re.search(pattern_start, str).end() 32 | end = re.search(pattern_end, str[start:]).start() 33 | except AttributeError as e: 34 | if not quiet: 35 | print('AttributeError: Can\'t find substring') 36 | return '' 37 | substr = str[start:end+start] 38 | return substr 39 | 40 | 41 | def generateOpener(auth, ua): 42 | opener = urllib.request.build_opener() 43 | if ua: 44 | opener.addheaders = [('User-agent', ua)] 45 | else: 46 | opener.addheaders = [('User-agent', 'Mozilla 5.0')] 47 | if auth: 48 | opener.addheaders.append(('Cookie', 'chii_auth=' + auth)) 49 | return opener 50 | 51 | 52 | def getHtml(url, auth, ua): 53 | opener = generateOpener(auth, ua) 54 | try: 55 | html = opener.open(url).read() 56 | except urllib.error.URLError as e: 57 | print(url) 58 | print('No response...') 59 | return None 60 | else: 61 | return html 62 | 63 | 64 | def getProgress(url, auth, ua): 65 | opener = generateOpener(auth, ua) 66 | try: 67 | html = opener.open(url).read() 68 | soup = BeautifulSoup(html.decode('utf-8')) 69 | p = soup.find('input', id='watchedeps')['value'] 70 | except urllib.error.URLError as e: 71 | print(url) 72 | print('No response...') 73 | return '' 74 | except TypeError as e: 75 | print(url) 76 | print('TyepError: NoneType') 77 | print('Error: the given auth string doesn\'t match the user id') 78 | return '' 79 | else: 80 | return p 81 | 82 | 83 | 84 | def getIDnGh(li): 85 | idngh = li.find('p', class_='collectModify').find_all('a')[1]['onclick'] 86 | # [subid, gh] 87 | return idngh[20:-2].split(", '") 88 | 89 | 90 | def removeItem(domain, subid, auth, ua, gh): 91 | opener = generateOpener(auth, ua) 92 | rmlink = ''.join([domain, '/subject/', subid, '/remove?gh=', gh]) 93 | try: 94 | response = opener.open(rmlink) 95 | except urllib.error.URLError as e: 96 | print(rmlink) 97 | print('Cant erase subject %s' % subid) 98 | return False 99 | else: 100 | return True 101 | 102 | 103 | def export(domain, auth, ua, uid, path, wipe): 104 | cats = ['anime', 'game', 'music', 'book', 'real'] 105 | types = ['do', 'collect', 'wish', 'on_hold', 'dropped'] 106 | # types = ['do', 'wish', 'on_hold', 'dropped'] 107 | # types = ['do', 'on_hold', 'dropped'] 108 | cats_c = {'anime' : '动画', 109 | 'game' : '游戏', 110 | 'music' : '音乐', 111 | 'book' : '书籍', 112 | 'real' : '电视剧'} 113 | types_c = {'do' : '在看', 114 | 'collect' : '看过', 115 | 'wish' : '想看', 
116 | 'on_hold' : '搁置', 117 | 'dropped' : '抛弃'} 118 | cats_types = [(c, t) for c in cats for t in types] 119 | for cat, type in cats_types: 120 | # if cat == 'anime' and type == 'collect': 121 | # continue 122 | # print(types_c[type], '的', cats_c[cat], '\n') 123 | puts(types_c[type] + '的' + cats_c[cat] + '\n') 124 | pg = 1 125 | idx = 1 126 | items = '' 127 | while pg != 0: 128 | url = ''.join( [domain, '/', cat, '/list/', uid, '/', 129 | type, '?page=', str(pg)] ) 130 | html = getHtml(url, auth, ua) 131 | if not html: 132 | break 133 | # # test 134 | # with open("test.html",'w', encoding='utf-8') as ft: 135 | # ft.write(html.decode('utf-8')) 136 | # # test 137 | soup = BeautifulSoup(html.decode('utf-8')) 138 | ul = soup.find(id='browserItemList') 139 | content = '' 140 | for li in ul.children: 141 | inner = li.find('div', class_='inner') 142 | collect_info = inner.find('p', class_='collectInfo') 143 | comment = inner.find('div', id='comment_box') 144 | stars = inner.find('span', class_='starsinfo') 145 | greyname = inner.h3.small 146 | href = domain + inner.h3.a['href'] 147 | iname = str(idx) + '. ' + inner.h3.a.text.strip() + '\n' 148 | iurl = '地址:' + href + '\n' 149 | icollect_info = collect_info.text.strip() + '\n' 150 | if greyname: 151 | igreyname = '原名:' + greyname.text.strip() + '\n' 152 | else: 153 | igreyname = '' 154 | if stars: 155 | istars = '评分:' + stars['class'][0][6:] + '星\n' 156 | else: 157 | istars = '' 158 | if comment: 159 | icomment = ('简评:' 160 | + inner.find('div', 161 | id='comment_box').text.strip() 162 | + '\n') 163 | else: 164 | icomment = '' 165 | if ( (cat == 'anime' or cat == 'real') 166 | and type == 'do' 167 | and auth ): 168 | iprogress = '进度:' + getProgress(href, auth, ua) + '\n' 169 | else: 170 | iprogress = '' 171 | # print(iname) 172 | puts(iname) 173 | content += (iname + igreyname + iurl + istars + icomment 174 | + iprogress + icollect_info + '\n') 175 | idx += 1 176 | if wipe: 177 | # remove item 178 | try: 179 | subid, gh = getIDnGh(li) 180 | removeItem(domain, subid, auth, ua, gh) 181 | except: 182 | print('Error: wrong auth string\n') 183 | if content != '': 184 | items += content 185 | pg += 1 186 | else: 187 | pg = 0 188 | if items == '': 189 | continue 190 | file_name = path + '/bangumi_' + cat + '_' + type + '.txt' 191 | with open(file_name, 'w', encoding='utf-8') as f: 192 | f.write(items) 193 | 194 | 195 | def getAuth(domain, auth, ua, authfile, uid, password): 196 | if auth and ua: 197 | return uid, auth, ua 198 | elif authfile: 199 | with open(authfile, 'r') as af: 200 | user_agent = af.readline() 201 | auth = af.readline() 202 | return uid, auth.strip(), user_agent.strip() 203 | elif not password: 204 | # print('Error: No auth string, no auth file, no password\n') 205 | return uid, auth, ua 206 | url = domain + '/login' 207 | # url = domain + '/FollowTheRabbit' 208 | data = {'cookietime': '2592000', 209 | 'email': uid, 210 | 'password': password, 211 | 'loginsubmit': '登录'} 212 | user_agent = 'Mozilla/5.0 (Elephant 3) Midori 3.5' 213 | data = urllib.parse.urlencode(data).encode('utf-8') 214 | opener = urllib.request.build_opener() 215 | opener.addheaders = [('User-agent', user_agent)] 216 | urllib.request.install_opener(opener) 217 | res = urllib.request.urlopen(url, data) 218 | # print(res.getheaders()) 219 | # print(res.getheader('Set-Cookie')) 220 | cookie = res.getheader('Set-Cookie') 221 | # -- use searchSubStr() -- 222 | # start = re.search('chii_auth=', cookie).end() 223 | # end = re.search('(;|$)', cookie[start:]).start() 224 | # # 
print(cookie[start:end+start]) 225 | # auth = cookie[start:end+start] 226 | # -- use searchSubStr() -- 227 | auth = searchSubStr(cookie, 'chii_auth=', '(;|$)') 228 | return uid, auth, user_agent 229 | 230 | 231 | def post(url, data, auth, ua): 232 | opener = generateOpener(auth, ua) 233 | post_data = urllib.parse.urlencode(data).encode('utf-8') 234 | urllib.request.install_opener(opener) 235 | res = urllib.request.urlopen(url, post_data) 236 | return res 237 | 238 | 239 | def getGH(domain, auth, ua): 240 | opener = generateOpener(auth, ua) 241 | html = opener.open(domain).read().decode('utf-8') 242 | pattern = ' tag), call handle_starttag and then 70 | handle_endtag. 71 | """ 72 | ROOT_TAG_NAME = '[document]' 73 | 74 | # If the end-user gives no indication which tree builder they 75 | # want, look for one with these features. 76 | DEFAULT_BUILDER_FEATURES = ['html', 'fast'] 77 | 78 | ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' 79 | 80 | def __init__(self, markup="", features=None, builder=None, 81 | parse_only=None, from_encoding=None, **kwargs): 82 | """The Soup object is initialized as the 'root tag', and the 83 | provided markup (which can be a string or a file-like object) 84 | is fed into the underlying parser.""" 85 | 86 | if 'convertEntities' in kwargs: 87 | warnings.warn( 88 | "BS4 does not respect the convertEntities argument to the " 89 | "BeautifulSoup constructor. Entities are always converted " 90 | "to Unicode characters.") 91 | 92 | if 'markupMassage' in kwargs: 93 | del kwargs['markupMassage'] 94 | warnings.warn( 95 | "BS4 does not respect the markupMassage argument to the " 96 | "BeautifulSoup constructor. The tree builder is responsible " 97 | "for any necessary markup massage.") 98 | 99 | if 'smartQuotesTo' in kwargs: 100 | del kwargs['smartQuotesTo'] 101 | warnings.warn( 102 | "BS4 does not respect the smartQuotesTo argument to the " 103 | "BeautifulSoup constructor. Smart quotes are always converted " 104 | "to Unicode characters.") 105 | 106 | if 'selfClosingTags' in kwargs: 107 | del kwargs['selfClosingTags'] 108 | warnings.warn( 109 | "BS4 does not respect the selfClosingTags argument to the " 110 | "BeautifulSoup constructor. The tree builder is responsible " 111 | "for understanding self-closing tags.") 112 | 113 | if 'isHTML' in kwargs: 114 | del kwargs['isHTML'] 115 | warnings.warn( 116 | "BS4 does not respect the isHTML argument to the " 117 | "BeautifulSoup constructor. 
You can pass in features='html' " 118 | "or features='xml' to get a builder capable of handling " 119 | "one or the other.") 120 | 121 | def deprecated_argument(old_name, new_name): 122 | if old_name in kwargs: 123 | warnings.warn( 124 | 'The "%s" argument to the BeautifulSoup constructor ' 125 | 'has been renamed to "%s."' % (old_name, new_name)) 126 | value = kwargs[old_name] 127 | del kwargs[old_name] 128 | return value 129 | return None 130 | 131 | parse_only = parse_only or deprecated_argument( 132 | "parseOnlyThese", "parse_only") 133 | 134 | from_encoding = from_encoding or deprecated_argument( 135 | "fromEncoding", "from_encoding") 136 | 137 | if len(kwargs) > 0: 138 | arg = list(kwargs.keys()).pop() 139 | raise TypeError( 140 | "__init__() got an unexpected keyword argument '%s'" % arg) 141 | 142 | if builder is None: 143 | if isinstance(features, str): 144 | features = [features] 145 | if features is None or len(features) == 0: 146 | features = self.DEFAULT_BUILDER_FEATURES 147 | builder_class = builder_registry.lookup(*features) 148 | if builder_class is None: 149 | raise FeatureNotFound( 150 | "Couldn't find a tree builder with the features you " 151 | "requested: %s. Do you need to install a parser library?" 152 | % ",".join(features)) 153 | builder = builder_class() 154 | self.builder = builder 155 | self.is_xml = builder.is_xml 156 | self.builder.soup = self 157 | 158 | self.parse_only = parse_only 159 | 160 | if hasattr(markup, 'read'): # It's a file-type object. 161 | markup = markup.read() 162 | elif len(markup) <= 256: 163 | # Print out warnings for a couple beginner problems 164 | # involving passing non-markup to Beautiful Soup. 165 | # Beautiful Soup will still parse the input as markup, 166 | # just in case that's what the user really wants. 167 | if (isinstance(markup, str) 168 | and not os.path.supports_unicode_filenames): 169 | possible_filename = markup.encode("utf8") 170 | else: 171 | possible_filename = markup 172 | is_file = False 173 | try: 174 | is_file = os.path.exists(possible_filename) 175 | except Exception as e: 176 | # This is almost certainly a problem involving 177 | # characters not valid in filenames on this 178 | # system. Just let it go. 179 | pass 180 | if is_file: 181 | warnings.warn( 182 | '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) 183 | if markup[:5] == "http:" or markup[:6] == "https:": 184 | # TODO: This is ugly but I couldn't get it to work in 185 | # Python 3 otherwise. 186 | if ((isinstance(markup, bytes) and not b' ' in markup) 187 | or (isinstance(markup, str) and not ' ' in markup)): 188 | warnings.warn( 189 | '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) 190 | 191 | for (self.markup, self.original_encoding, self.declared_html_encoding, 192 | self.contains_replacement_characters) in ( 193 | self.builder.prepare_markup(markup, from_encoding)): 194 | self.reset() 195 | try: 196 | self._feed() 197 | break 198 | except ParserRejectedMarkup: 199 | pass 200 | 201 | # Clear out the markup and remove the builder's circular 202 | # reference to this object. 203 | self.markup = None 204 | self.builder.soup = None 205 | 206 | def _feed(self): 207 | # Convert the document to Unicode. 
208 | self.builder.reset() 209 | 210 | self.builder.feed(self.markup) 211 | # Close out any unfinished strings and close all the open tags. 212 | self.endData() 213 | while self.currentTag.name != self.ROOT_TAG_NAME: 214 | self.popTag() 215 | 216 | def reset(self): 217 | Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) 218 | self.hidden = 1 219 | self.builder.reset() 220 | self.current_data = [] 221 | self.currentTag = None 222 | self.tagStack = [] 223 | self.preserve_whitespace_tag_stack = [] 224 | self.pushTag(self) 225 | 226 | def new_tag(self, name, namespace=None, nsprefix=None, **attrs): 227 | """Create a new tag associated with this soup.""" 228 | return Tag(None, self.builder, name, namespace, nsprefix, attrs) 229 | 230 | def new_string(self, s, subclass=NavigableString): 231 | """Create a new NavigableString associated with this soup.""" 232 | navigable = subclass(s) 233 | navigable.setup() 234 | return navigable 235 | 236 | def insert_before(self, successor): 237 | raise NotImplementedError("BeautifulSoup objects don't support insert_before().") 238 | 239 | def insert_after(self, successor): 240 | raise NotImplementedError("BeautifulSoup objects don't support insert_after().") 241 | 242 | def popTag(self): 243 | tag = self.tagStack.pop() 244 | if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: 245 | self.preserve_whitespace_tag_stack.pop() 246 | #print "Pop", tag.name 247 | if self.tagStack: 248 | self.currentTag = self.tagStack[-1] 249 | return self.currentTag 250 | 251 | def pushTag(self, tag): 252 | #print "Push", tag.name 253 | if self.currentTag: 254 | self.currentTag.contents.append(tag) 255 | self.tagStack.append(tag) 256 | self.currentTag = self.tagStack[-1] 257 | if tag.name in self.builder.preserve_whitespace_tags: 258 | self.preserve_whitespace_tag_stack.append(tag) 259 | 260 | def endData(self, containerClass=NavigableString): 261 | if self.current_data: 262 | current_data = ''.join(self.current_data) 263 | # If whitespace is not preserved, and this string contains 264 | # nothing but ASCII spaces, replace it with a single space 265 | # or newline. 266 | if not self.preserve_whitespace_tag_stack: 267 | strippable = True 268 | for i in current_data: 269 | if i not in self.ASCII_SPACES: 270 | strippable = False 271 | break 272 | if strippable: 273 | if '\n' in current_data: 274 | current_data = '\n' 275 | else: 276 | current_data = ' ' 277 | 278 | # Reset the data collector. 279 | self.current_data = [] 280 | 281 | # Should we add this string to the tree at all? 282 | if self.parse_only and len(self.tagStack) <= 1 and \ 283 | (not self.parse_only.text or \ 284 | not self.parse_only.search(current_data)): 285 | return 286 | 287 | o = containerClass(current_data) 288 | self.object_was_parsed(o) 289 | 290 | def object_was_parsed(self, o, parent=None, most_recent_element=None): 291 | """Add an object to the parse tree.""" 292 | parent = parent or self.currentTag 293 | most_recent_element = most_recent_element or self._most_recent_element 294 | o.setup(parent, most_recent_element) 295 | 296 | if most_recent_element is not None: 297 | most_recent_element.next_element = o 298 | self._most_recent_element = o 299 | parent.contents.append(o) 300 | 301 | def _popToTag(self, name, nsprefix=None, inclusivePop=True): 302 | """Pops the tag stack up to and including the most recent 303 | instance of the given tag. 
If inclusivePop is false, pops the tag 304 | stack up to but *not* including the most recent instqance of 305 | the given tag.""" 306 | #print "Popping to %s" % name 307 | if name == self.ROOT_TAG_NAME: 308 | # The BeautifulSoup object itself can never be popped. 309 | return 310 | 311 | most_recently_popped = None 312 | 313 | stack_size = len(self.tagStack) 314 | for i in range(stack_size - 1, 0, -1): 315 | t = self.tagStack[i] 316 | if (name == t.name and nsprefix == t.prefix): 317 | if inclusivePop: 318 | most_recently_popped = self.popTag() 319 | break 320 | most_recently_popped = self.popTag() 321 | 322 | return most_recently_popped 323 | 324 | def handle_starttag(self, name, namespace, nsprefix, attrs): 325 | """Push a start tag on to the stack. 326 | 327 | If this method returns None, the tag was rejected by the 328 | SoupStrainer. You should proceed as if the tag had not occured 329 | in the document. For instance, if this was a self-closing tag, 330 | don't call handle_endtag. 331 | """ 332 | 333 | # print "Start tag %s: %s" % (name, attrs) 334 | self.endData() 335 | 336 | if (self.parse_only and len(self.tagStack) <= 1 337 | and (self.parse_only.text 338 | or not self.parse_only.search_tag(name, attrs))): 339 | return None 340 | 341 | tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, 342 | self.currentTag, self._most_recent_element) 343 | if tag is None: 344 | return tag 345 | if self._most_recent_element: 346 | self._most_recent_element.next_element = tag 347 | self._most_recent_element = tag 348 | self.pushTag(tag) 349 | return tag 350 | 351 | def handle_endtag(self, name, nsprefix=None): 352 | #print "End tag: " + name 353 | self.endData() 354 | self._popToTag(name, nsprefix) 355 | 356 | def handle_data(self, data): 357 | self.current_data.append(data) 358 | 359 | def decode(self, pretty_print=False, 360 | eventual_encoding=DEFAULT_OUTPUT_ENCODING, 361 | formatter="minimal"): 362 | """Returns a string or Unicode representation of this document. 363 | To get Unicode, pass None for encoding.""" 364 | 365 | if self.is_xml: 366 | # Print the XML declaration 367 | encoding_part = '' 368 | if eventual_encoding != None: 369 | encoding_part = ' encoding="%s"' % eventual_encoding 370 | prefix = '\n' % encoding_part 371 | else: 372 | prefix = '' 373 | if not pretty_print: 374 | indent_level = None 375 | else: 376 | indent_level = 0 377 | return prefix + super(BeautifulSoup, self).decode( 378 | indent_level, eventual_encoding, formatter) 379 | 380 | # Alias to make it easier to type import: 'from bs4 import _soup' 381 | _s = BeautifulSoup 382 | _soup = BeautifulSoup 383 | 384 | class BeautifulStoneSoup(BeautifulSoup): 385 | """Deprecated interface to an XML parser.""" 386 | 387 | def __init__(self, *args, **kwargs): 388 | kwargs['features'] = 'xml' 389 | warnings.warn( 390 | 'The BeautifulStoneSoup class is deprecated. Instead of using ' 391 | 'it, pass features="xml" into the BeautifulSoup constructor.') 392 | super(BeautifulStoneSoup, self).__init__(*args, **kwargs) 393 | 394 | 395 | class StopParsing(Exception): 396 | pass 397 | 398 | class FeatureNotFound(ValueError): 399 | pass 400 | 401 | 402 | #By default, act as an HTML pretty-printer. 
403 | if __name__ == '__main__': 404 | import sys 405 | soup = BeautifulSoup(sys.stdin) 406 | print(soup.prettify()) 407 | -------------------------------------------------------------------------------- /bs4/tests/test_soup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Tests of Beautiful Soup as a whole.""" 3 | 4 | import logging 5 | import unittest 6 | import sys 7 | import tempfile 8 | 9 | from bs4 import ( 10 | BeautifulSoup, 11 | BeautifulStoneSoup, 12 | ) 13 | from bs4.element import ( 14 | CharsetMetaAttributeValue, 15 | ContentMetaAttributeValue, 16 | SoupStrainer, 17 | NamespacedAttribute, 18 | ) 19 | import bs4.dammit 20 | from bs4.dammit import ( 21 | EntitySubstitution, 22 | UnicodeDammit, 23 | ) 24 | from bs4.testing import ( 25 | SoupTest, 26 | skipIf, 27 | ) 28 | import warnings 29 | 30 | try: 31 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 32 | LXML_PRESENT = True 33 | except ImportError as e: 34 | LXML_PRESENT = False 35 | 36 | PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) 37 | PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) 38 | 39 | class TestConstructor(SoupTest): 40 | 41 | def test_short_unicode_input(self): 42 | data = "
<h1>éé</h1>
" 43 | soup = self.soup(data) 44 | self.assertEqual("éé", soup.h1.string) 45 | 46 | def test_embedded_null(self): 47 | data = "
<h1>foo\0bar</h1>
" 48 | soup = self.soup(data) 49 | self.assertEqual("foo\0bar", soup.h1.string) 50 | 51 | 52 | class TestDeprecatedConstructorArguments(SoupTest): 53 | 54 | def test_parseOnlyThese_renamed_to_parse_only(self): 55 | with warnings.catch_warnings(record=True) as w: 56 | soup = self.soup("
", parseOnlyThese=SoupStrainer("b")) 57 | msg = str(w[0].message) 58 | self.assertTrue("parseOnlyThese" in msg) 59 | self.assertTrue("parse_only" in msg) 60 | self.assertEqual(b"", soup.encode()) 61 | 62 | def test_fromEncoding_renamed_to_from_encoding(self): 63 | with warnings.catch_warnings(record=True) as w: 64 | utf8 = b"\xc3\xa9" 65 | soup = self.soup(utf8, fromEncoding="utf8") 66 | msg = str(w[0].message) 67 | self.assertTrue("fromEncoding" in msg) 68 | self.assertTrue("from_encoding" in msg) 69 | self.assertEqual("utf8", soup.original_encoding) 70 | 71 | def test_unrecognized_keyword_argument(self): 72 | self.assertRaises( 73 | TypeError, self.soup, "", no_such_argument=True) 74 | 75 | class TestWarnings(SoupTest): 76 | 77 | def test_disk_file_warning(self): 78 | filehandle = tempfile.NamedTemporaryFile() 79 | filename = filehandle.name 80 | try: 81 | with warnings.catch_warnings(record=True) as w: 82 | soup = self.soup(filename) 83 | msg = str(w[0].message) 84 | self.assertTrue("looks like a filename" in msg) 85 | finally: 86 | filehandle.close() 87 | 88 | # The file no longer exists, so Beautiful Soup will no longer issue the warning. 89 | with warnings.catch_warnings(record=True) as w: 90 | soup = self.soup(filename) 91 | self.assertEqual(0, len(w)) 92 | 93 | def test_url_warning(self): 94 | with warnings.catch_warnings(record=True) as w: 95 | soup = self.soup("http://www.crummy.com/") 96 | msg = str(w[0].message) 97 | self.assertTrue("looks like a URL" in msg) 98 | 99 | with warnings.catch_warnings(record=True) as w: 100 | soup = self.soup("http://www.crummy.com/ is great") 101 | self.assertEqual(0, len(w)) 102 | 103 | class TestSelectiveParsing(SoupTest): 104 | 105 | def test_parse_with_soupstrainer(self): 106 | markup = "NoYesNoYes Yes" 107 | strainer = SoupStrainer("b") 108 | soup = self.soup(markup, parse_only=strainer) 109 | self.assertEqual(soup.encode(), b"YesYes Yes") 110 | 111 | 112 | class TestEntitySubstitution(unittest.TestCase): 113 | """Standalone tests of the EntitySubstitution class.""" 114 | def setUp(self): 115 | self.sub = EntitySubstitution 116 | 117 | def test_simple_html_substitution(self): 118 | # Unicode characters corresponding to named HTML entites 119 | # are substituted, and no others. 120 | s = "foo\u2200\N{SNOWMAN}\u00f5bar" 121 | self.assertEqual(self.sub.substitute_html(s), 122 | "foo∀\N{SNOWMAN}õbar") 123 | 124 | def test_smart_quote_substitution(self): 125 | # MS smart quotes are a common source of frustration, so we 126 | # give them a special test. 
127 | quotes = b"\x91\x92foo\x93\x94" 128 | dammit = UnicodeDammit(quotes) 129 | self.assertEqual(self.sub.substitute_html(dammit.markup), 130 | "‘’foo“”") 131 | 132 | def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): 133 | s = 'Welcome to "my bar"' 134 | self.assertEqual(self.sub.substitute_xml(s, False), s) 135 | 136 | def test_xml_attribute_quoting_normally_uses_double_quotes(self): 137 | self.assertEqual(self.sub.substitute_xml("Welcome", True), 138 | '"Welcome"') 139 | self.assertEqual(self.sub.substitute_xml("Bob's Bar", True), 140 | '"Bob\'s Bar"') 141 | 142 | def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self): 143 | s = 'Welcome to "my bar"' 144 | self.assertEqual(self.sub.substitute_xml(s, True), 145 | "'Welcome to \"my bar\"'") 146 | 147 | def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): 148 | s = 'Welcome to "Bob\'s Bar"' 149 | self.assertEqual( 150 | self.sub.substitute_xml(s, True), 151 | '"Welcome to "Bob\'s Bar""') 152 | 153 | def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): 154 | quoted = 'Welcome to "Bob\'s Bar"' 155 | self.assertEqual(self.sub.substitute_xml(quoted), quoted) 156 | 157 | def test_xml_quoting_handles_angle_brackets(self): 158 | self.assertEqual( 159 | self.sub.substitute_xml("foo"), 160 | "foo<bar>") 161 | 162 | def test_xml_quoting_handles_ampersands(self): 163 | self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T") 164 | 165 | def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self): 166 | self.assertEqual( 167 | self.sub.substitute_xml("ÁT&T"), 168 | "&Aacute;T&T") 169 | 170 | def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self): 171 | self.assertEqual( 172 | self.sub.substitute_xml_containing_entities("ÁT&T"), 173 | "ÁT&T") 174 | 175 | def test_quotes_not_html_substituted(self): 176 | """There's no need to do this except inside attribute values.""" 177 | text = 'Bob\'s "bar"' 178 | self.assertEqual(self.sub.substitute_html(text), text) 179 | 180 | 181 | class TestEncodingConversion(SoupTest): 182 | # Test Beautiful Soup's ability to decode and encode from various 183 | # encodings. 184 | 185 | def setUp(self): 186 | super(TestEncodingConversion, self).setUp() 187 | self.unicode_data = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' 188 | self.utf8_data = self.unicode_data.encode("utf-8") 189 | # Just so you know what it looks like. 190 | self.assertEqual( 191 | self.utf8_data, 192 | b'Sacr\xc3\xa9 bleu!') 193 | 194 | def test_ascii_in_unicode_out(self): 195 | # ASCII input is converted to Unicode. The original_encoding 196 | # attribute is set to 'utf-8', a superset of ASCII. 197 | chardet = bs4.dammit.chardet_dammit 198 | logging.disable(logging.WARNING) 199 | try: 200 | def noop(str): 201 | return None 202 | # Disable chardet, which will realize that the ASCII is ASCII. 203 | bs4.dammit.chardet_dammit = noop 204 | ascii = b"a" 205 | soup_from_ascii = self.soup(ascii) 206 | unicode_output = soup_from_ascii.decode() 207 | self.assertTrue(isinstance(unicode_output, str)) 208 | self.assertEqual(unicode_output, self.document_for(ascii.decode())) 209 | self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") 210 | finally: 211 | logging.disable(logging.NOTSET) 212 | bs4.dammit.chardet_dammit = chardet 213 | 214 | def test_unicode_in_unicode_out(self): 215 | # Unicode input is left alone. The original_encoding attribute 216 | # is not set. 
217 | soup_from_unicode = self.soup(self.unicode_data) 218 | self.assertEqual(soup_from_unicode.decode(), self.unicode_data) 219 | self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!') 220 | self.assertEqual(soup_from_unicode.original_encoding, None) 221 | 222 | def test_utf8_in_unicode_out(self): 223 | # UTF-8 input is converted to Unicode. The original_encoding 224 | # attribute is set. 225 | soup_from_utf8 = self.soup(self.utf8_data) 226 | self.assertEqual(soup_from_utf8.decode(), self.unicode_data) 227 | self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!') 228 | 229 | def test_utf8_out(self): 230 | # The internal data structures can be encoded as UTF-8. 231 | soup_from_unicode = self.soup(self.unicode_data) 232 | self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data) 233 | 234 | @skipIf( 235 | PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, 236 | "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") 237 | def test_attribute_name_containing_unicode_characters(self): 238 | markup = '
' 239 | self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) 240 | 241 | class TestUnicodeDammit(unittest.TestCase): 242 | """Standalone tests of UnicodeDammit.""" 243 | 244 | def test_unicode_input(self): 245 | markup = "I'm already Unicode! \N{SNOWMAN}" 246 | dammit = UnicodeDammit(markup) 247 | self.assertEqual(dammit.unicode_markup, markup) 248 | 249 | def test_smart_quotes_to_unicode(self): 250 | markup = b"\x91\x92\x93\x94" 251 | dammit = UnicodeDammit(markup) 252 | self.assertEqual( 253 | dammit.unicode_markup, "\u2018\u2019\u201c\u201d") 254 | 255 | def test_smart_quotes_to_xml_entities(self): 256 | markup = b"\x91\x92\x93\x94" 257 | dammit = UnicodeDammit(markup, smart_quotes_to="xml") 258 | self.assertEqual( 259 | dammit.unicode_markup, "‘’“”") 260 | 261 | def test_smart_quotes_to_html_entities(self): 262 | markup = b"\x91\x92\x93\x94" 263 | dammit = UnicodeDammit(markup, smart_quotes_to="html") 264 | self.assertEqual( 265 | dammit.unicode_markup, "‘’“”") 266 | 267 | def test_smart_quotes_to_ascii(self): 268 | markup = b"\x91\x92\x93\x94" 269 | dammit = UnicodeDammit(markup, smart_quotes_to="ascii") 270 | self.assertEqual( 271 | dammit.unicode_markup, """''""""") 272 | 273 | def test_detect_utf8(self): 274 | utf8 = b"\xc3\xa9" 275 | dammit = UnicodeDammit(utf8) 276 | self.assertEqual(dammit.unicode_markup, '\xe9') 277 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 278 | 279 | def test_convert_hebrew(self): 280 | hebrew = b"\xed\xe5\xec\xf9" 281 | dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) 282 | self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') 283 | self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9') 284 | 285 | def test_dont_see_smart_quotes_where_there_are_none(self): 286 | utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" 287 | dammit = UnicodeDammit(utf_8) 288 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 289 | self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) 290 | 291 | def test_ignore_inappropriate_codecs(self): 292 | utf8_data = "Räksmörgås".encode("utf-8") 293 | dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) 294 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 295 | 296 | def test_ignore_invalid_codecs(self): 297 | utf8_data = "Räksmörgås".encode("utf-8") 298 | for bad_encoding in ['.utf8', '...', 'utF---16.!']: 299 | dammit = UnicodeDammit(utf8_data, [bad_encoding]) 300 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 301 | 302 | def test_detect_html5_style_meta_tag(self): 303 | 304 | for data in ( 305 | b'', 306 | b"", 307 | b"", 308 | b""): 309 | dammit = UnicodeDammit(data, is_html=True) 310 | self.assertEqual( 311 | "euc-jp", dammit.original_encoding) 312 | 313 | def test_last_ditch_entity_replacement(self): 314 | # This is a UTF-8 document that contains bytestrings 315 | # completely incompatible with UTF-8 (ie. encoded with some other 316 | # encoding). 317 | # 318 | # Since there is no consistent encoding for the document, 319 | # Unicode, Dammit will eventually encode the document as UTF-8 320 | # and encode the incompatible characters as REPLACEMENT 321 | # CHARACTER. 322 | # 323 | # If chardet is installed, it will detect that the document 324 | # can be converted into ISO-8859-1 without errors. This happens 325 | # to be the wrong encoding, but it is a consistent encoding, so the 326 | # code we're testing here won't run. 327 | # 328 | # So we temporarily disable chardet if it's present. 
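        # (The three lines of the document below are, in order: a UTF-8 byte
        # order mark, UTF-8-encoded Arabic text, and bytes that form no valid
        # UTF-8 sequence at all.)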
329 | doc = b"""\357\273\277 330 | \330\250\330\252\330\261 331 | \310\322\321\220\312\321\355\344""" 332 | chardet = bs4.dammit.chardet_dammit 333 | logging.disable(logging.WARNING) 334 | try: 335 | def noop(str): 336 | return None 337 | bs4.dammit.chardet_dammit = noop 338 | dammit = UnicodeDammit(doc) 339 | self.assertEqual(True, dammit.contains_replacement_characters) 340 | self.assertTrue("\ufffd" in dammit.unicode_markup) 341 | 342 | soup = BeautifulSoup(doc, "html.parser") 343 | self.assertTrue(soup.contains_replacement_characters) 344 | finally: 345 | logging.disable(logging.NOTSET) 346 | bs4.dammit.chardet_dammit = chardet 347 | 348 | def test_byte_order_mark_removed(self): 349 | # A document written in UTF-16LE will have its byte order marker stripped. 350 | data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' 351 | dammit = UnicodeDammit(data) 352 | self.assertEqual("áé", dammit.unicode_markup) 353 | self.assertEqual("utf-16le", dammit.original_encoding) 354 | 355 | def test_detwingle(self): 356 | # Here's a UTF8 document. 357 | utf8 = ("\N{SNOWMAN}" * 3).encode("utf8") 358 | 359 | # Here's a Windows-1252 document. 360 | windows_1252 = ( 361 | "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" 362 | "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") 363 | 364 | # Through some unholy alchemy, they've been stuck together. 365 | doc = utf8 + windows_1252 + utf8 366 | 367 | # The document can't be turned into UTF-8: 368 | self.assertRaises(UnicodeDecodeError, doc.decode, "utf8") 369 | 370 | # Unicode, Dammit thinks the whole document is Windows-1252, 371 | # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃" 372 | 373 | # But if we run it through fix_embedded_windows_1252, it's fixed: 374 | 375 | fixed = UnicodeDammit.detwingle(doc) 376 | self.assertEqual( 377 | "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) 378 | 379 | def test_detwingle_ignores_multibyte_characters(self): 380 | # Each of these characters has a UTF-8 representation ending 381 | # in \x93. \x93 is a smart quote if interpreted as 382 | # Windows-1252. But our code knows to skip over multibyte 383 | # UTF-8 characters, so they'll survive the process unscathed. 384 | for tricky_unicode_char in ( 385 | "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' 386 | "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' 387 | "\xf0\x90\x90\x93", # This is a CJK character, not sure which one. 388 | ): 389 | input = tricky_unicode_char.encode("utf8") 390 | self.assertTrue(input.endswith(b'\x93')) 391 | output = UnicodeDammit.detwingle(input) 392 | self.assertEqual(output, input) 393 | 394 | class TestNamedspacedAttribute(SoupTest): 395 | 396 | def test_name_may_be_none(self): 397 | a = NamespacedAttribute("xmlns", None) 398 | self.assertEqual(a, "xmlns") 399 | 400 | def test_attribute_is_equivalent_to_colon_separated_string(self): 401 | a = NamespacedAttribute("a", "b") 402 | self.assertEqual("a:b", a) 403 | 404 | def test_attributes_are_equivalent_if_prefix_and_name_identical(self): 405 | a = NamespacedAttribute("a", "b", "c") 406 | b = NamespacedAttribute("a", "b", "c") 407 | self.assertEqual(a, b) 408 | 409 | # The actual namespace is not considered. 410 | c = NamespacedAttribute("a", "b", None) 411 | self.assertEqual(a, c) 412 | 413 | # But name and prefix are important. 
414 | d = NamespacedAttribute("a", "z", "c") 415 | self.assertNotEqual(a, d) 416 | 417 | e = NamespacedAttribute("z", "b", "c") 418 | self.assertNotEqual(a, e) 419 | 420 | 421 | class TestAttributeValueWithCharsetSubstitution(unittest.TestCase): 422 | 423 | def test_content_meta_attribute_value(self): 424 | value = CharsetMetaAttributeValue("euc-jp") 425 | self.assertEqual("euc-jp", value) 426 | self.assertEqual("euc-jp", value.original_value) 427 | self.assertEqual("utf8", value.encode("utf8")) 428 | 429 | 430 | def test_content_meta_attribute_value(self): 431 | value = ContentMetaAttributeValue("text/html; charset=euc-jp") 432 | self.assertEqual("text/html; charset=euc-jp", value) 433 | self.assertEqual("text/html; charset=euc-jp", value.original_value) 434 | self.assertEqual("text/html; charset=utf8", value.encode("utf8")) 435 | -------------------------------------------------------------------------------- /bs4/testing.py: -------------------------------------------------------------------------------- 1 | """Helper classes for tests.""" 2 | 3 | import copy 4 | import functools 5 | import unittest 6 | from unittest import TestCase 7 | from bs4 import BeautifulSoup 8 | from bs4.element import ( 9 | CharsetMetaAttributeValue, 10 | Comment, 11 | ContentMetaAttributeValue, 12 | Doctype, 13 | SoupStrainer, 14 | ) 15 | 16 | from bs4.builder import HTMLParserTreeBuilder 17 | default_builder = HTMLParserTreeBuilder 18 | 19 | 20 | class SoupTest(unittest.TestCase): 21 | 22 | @property 23 | def default_builder(self): 24 | return default_builder() 25 | 26 | def soup(self, markup, **kwargs): 27 | """Build a Beautiful Soup object from markup.""" 28 | builder = kwargs.pop('builder', self.default_builder) 29 | return BeautifulSoup(markup, builder=builder, **kwargs) 30 | 31 | def document_for(self, markup): 32 | """Turn an HTML fragment into a document. 33 | 34 | The details depend on the builder. 35 | """ 36 | return self.default_builder.test_fragment_to_document(markup) 37 | 38 | def assertSoupEquals(self, to_parse, compare_parsed_to=None): 39 | builder = self.default_builder 40 | obj = BeautifulSoup(to_parse, builder=builder) 41 | if compare_parsed_to is None: 42 | compare_parsed_to = to_parse 43 | 44 | self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) 45 | 46 | 47 | class HTMLTreeBuilderSmokeTest(object): 48 | 49 | """A basic test of a treebuilder's competence. 50 | 51 | Any HTML treebuilder, present or future, should be able to pass 52 | these tests. With invalid markup, there's room for interpretation, 53 | and different parsers can handle it differently. But with the 54 | markup in these tests, there's not much room for interpretation. 55 | """ 56 | 57 | def assertDoctypeHandled(self, doctype_fragment): 58 | """Assert that a given doctype string is handled correctly.""" 59 | doctype_str, soup = self._document_with_doctype(doctype_fragment) 60 | 61 | # Make sure a Doctype object was created. 62 | doctype = soup.contents[0] 63 | self.assertEqual(doctype.__class__, Doctype) 64 | self.assertEqual(doctype, doctype_fragment) 65 | self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) 66 | 67 | # Make sure that the doctype was correctly associated with the 68 | # parse tree and that the rest of the document parsed. 69 | self.assertEqual(soup.p.contents[0], 'foo') 70 | 71 | def _document_with_doctype(self, doctype_fragment): 72 | """Generate and parse a document with the given doctype.""" 73 | doctype = '' % doctype_fragment 74 | markup = doctype + '\n
<p>foo</p>
' 75 | soup = self.soup(markup) 76 | return doctype, soup 77 | 78 | def test_normal_doctypes(self): 79 | """Make sure normal, everyday HTML doctypes are handled correctly.""" 80 | self.assertDoctypeHandled("html") 81 | self.assertDoctypeHandled( 82 | 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') 83 | 84 | def test_empty_doctype(self): 85 | soup = self.soup("") 86 | doctype = soup.contents[0] 87 | self.assertEqual("", doctype.strip()) 88 | 89 | def test_public_doctype_with_url(self): 90 | doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' 91 | self.assertDoctypeHandled(doctype) 92 | 93 | def test_system_doctype(self): 94 | self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') 95 | 96 | def test_namespaced_system_doctype(self): 97 | # We can handle a namespaced doctype with a system ID. 98 | self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') 99 | 100 | def test_namespaced_public_doctype(self): 101 | # Test a namespaced doctype with a public id. 102 | self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') 103 | 104 | def test_real_xhtml_document(self): 105 | """A real XHTML document should come out more or less the same as it went in.""" 106 | markup = b""" 107 | 108 | 109 | Hello. 110 | Goodbye. 111 | """ 112 | soup = self.soup(markup) 113 | self.assertEqual( 114 | soup.encode("utf-8").replace(b"\n", b""), 115 | markup.replace(b"\n", b"")) 116 | 117 | def test_deepcopy(self): 118 | """Make sure you can copy the tree builder. 119 | 120 | This is important because the builder is part of a 121 | BeautifulSoup object, and we want to be able to copy that. 122 | """ 123 | copy.deepcopy(self.default_builder) 124 | 125 | def test_p_tag_is_never_empty_element(self): 126 | """A
<p> tag is never designated as an empty-element tag. 127 | 128 | Even if the markup shows it as an empty-element tag, it 129 | shouldn't be presented that way. 130 | """ 131 | soup = self.soup("<p/>") 132 | self.assertFalse(soup.p.is_empty_element) 133 | self.assertEqual(str(soup.p), "<p></p>
") 134 | 135 | def test_unclosed_tags_get_closed(self): 136 | """A tag that's not closed by the end of the document should be closed. 137 | 138 | This applies to all tags except empty-element tags. 139 | """ 140 | self.assertSoupEquals("
<p>", "<p></p>") 141 | self.assertSoupEquals("<b>", "<b></b>") 142 | 143 | self.assertSoupEquals("<br>", "<br/>
") 144 | 145 | def test_br_is_always_empty_element_tag(self): 146 | """A
<br> tag is designated as an empty-element tag. 147 | 148 | Some parsers treat <br></br> as one <br>
tag, some parsers as 149 | two tags, but it should always be an empty-element tag. 150 | """ 151 | soup = self.soup("
<br></br>
") 152 | self.assertTrue(soup.br.is_empty_element) 153 | self.assertEqual(str(soup.br), "
") 154 | 155 | def test_nested_formatting_elements(self): 156 | self.assertSoupEquals("") 157 | 158 | def test_comment(self): 159 | # Comments are represented as Comment objects. 160 | markup = "
<p>foo<!--foobar-->baz</p>
" 161 | self.assertSoupEquals(markup) 162 | 163 | soup = self.soup(markup) 164 | comment = soup.find(text="foobar") 165 | self.assertEqual(comment.__class__, Comment) 166 | 167 | # The comment is properly integrated into the tree. 168 | foo = soup.find(text="foo") 169 | self.assertEqual(comment, foo.next_element) 170 | baz = soup.find(text="baz") 171 | self.assertEqual(comment, baz.previous_element) 172 | 173 | def test_preserved_whitespace_in_pre_and_textarea(self): 174 | """Whitespace must be preserved in
 and ")
177 | 
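    # A companion check one could add here (a sketch, not part of the
    # original suite): the whitespace guarantee above rests on the builder
    # advertising 'pre' and 'textarea' in its preserve_whitespace_tags set.
    def test_whitespace_preserving_tags_are_advertised(self):
        builder = self.default_builder
        self.assertTrue(
            set(['pre', 'textarea']).issubset(builder.preserve_whitespace_tags))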
178 |     def test_nested_inline_elements(self):
179 |         """Inline elements can be nested indefinitely."""
180 |         b_tag = "<b>Inside a B tag</b>"
181 |         self.assertSoupEquals(b_tag)
182 | 
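        # assertSoupEquals (defined on SoupTest above) parses the markup with
        # the builder under test and compares the decoded tree against the
        # original string, so each case here is a full parse/serialize
        # round trip.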
183 |         nested_b_tag = "
<p>A <i>nested <b>tag</b></i></p>
" 184 | self.assertSoupEquals(nested_b_tag) 185 | 186 | double_nested_b_tag = "
<p>A <a>doubly <i>nested <b>tag</b></i></a></p>
" 187 | self.assertSoupEquals(nested_b_tag) 188 | 189 | def test_nested_block_level_elements(self): 190 | """Block elements can be nested.""" 191 | soup = self.soup('
<blockquote><p><b>Foo</b></p></blockquote>
') 192 | blockquote = soup.blockquote 193 | self.assertEqual(blockquote.p.b.string, 'Foo') 194 | self.assertEqual(blockquote.b.string, 'Foo') 195 | 196 | def test_correctly_nested_tables(self): 197 | """One table can go inside another one.""" 198 | markup = ('' 199 | '' 200 | "') 204 | 205 | self.assertSoupEquals( 206 | markup, 207 | '
<td>Here's another table:" 201 | '<table id="2">' 202 | '<tr><td>foo</td></tr>' 203 | '</table></td>') 204 | 205 | self.assertSoupEquals( 206 | markup, 207 | '<table id="1"><tr><td>Here\'s another table:' 208 | '<table id="2"><tr><td>foo</td></tr></table>' 209 | '</td></tr></table>') 210 | 211 | self.assertSoupEquals( 212 | "<table><thead><tr><td>Foo</td></tr></thead>" 213 | "<tbody><tr><td>Bar</td></tr></tbody>" 214 | "<tfoot><tr><td>Baz</td></tr></tfoot></table>
") 215 | 216 | def test_deeply_nested_multivalued_attribute(self): 217 | # html5lib can set the attributes of the same tag many times 218 | # as it rearranges the tree. This has caused problems with 219 | # multivalued attributes. 220 | markup = '
' 221 | soup = self.soup(markup) 222 | self.assertEqual(["css"], soup.div.div['class']) 223 | 224 | def test_angle_brackets_in_attribute_values_are_escaped(self): 225 | self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>') 226 | 227 | def test_entities_in_attributes_converted_to_unicode(self): 228 | expect = '
<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' 229 | self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect) 230 | self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect) 231 | self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect) 232 | self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect) 233 | 234 | def test_entities_in_text_converted_to_unicode(self): 235 | expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' 236 | self.assertSoupEquals("<p>pi&#241;ata</p>", expect) 237 | self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect) 238 | self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect) 239 | self.assertSoupEquals("<p>pi&ntilde;ata</p>
", expect) 240 | 241 | def test_quot_entity_converted_to_quotation_mark(self): 242 | self.assertSoupEquals("
<p>I said &quot;good day!&quot;</p>", 243 | '<p>I said "good day!"</p>
') 244 | 245 | def test_out_of_range_entity(self): 246 | expect = "\N{REPLACEMENT CHARACTER}" 247 | self.assertSoupEquals("�", expect) 248 | self.assertSoupEquals("�", expect) 249 | self.assertSoupEquals("�", expect) 250 | 251 | def test_multipart_strings(self): 252 | "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." 253 | soup = self.soup("
<html><h2>\nfoo</h2><p></p></html>
") 254 | self.assertEqual("p", soup.h2.string.next_element.name) 255 | self.assertEqual("p", soup.p.name) 256 | 257 | def test_basic_namespaces(self): 258 | """Parsers don't need to *understand* namespaces, but at the 259 | very least they should not choke on namespaces or lose 260 | data.""" 261 | 262 | markup = b'4' 263 | soup = self.soup(markup) 264 | self.assertEqual(markup, soup.encode()) 265 | html = soup.html 266 | self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) 267 | self.assertEqual( 268 | 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) 269 | self.assertEqual( 270 | 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) 271 | 272 | def test_multivalued_attribute_value_becomes_list(self): 273 | markup = b'' 274 | soup = self.soup(markup) 275 | self.assertEqual(['foo', 'bar'], soup.a['class']) 276 | 277 | # 278 | # Generally speaking, tests below this point are more tests of 279 | # Beautiful Soup than tests of the tree builders. But parsers are 280 | # weird, so we run these tests separately for every tree builder 281 | # to detect any differences between them. 282 | # 283 | 284 | def test_can_parse_unicode_document(self): 285 | # A seemingly innocuous document... but it's in Unicode! And 286 | # it contains characters that can't be represented in the 287 | # encoding found in the declaration! The horror! 288 | markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' 289 | soup = self.soup(markup) 290 | self.assertEqual('Sacr\xe9 bleu!', soup.body.string) 291 | 292 | def test_soupstrainer(self): 293 | """Parsers should be able to work with SoupStrainers.""" 294 | strainer = SoupStrainer("b") 295 | soup = self.soup("A bold statement", 296 | parse_only=strainer) 297 | self.assertEqual(soup.decode(), "bold") 298 | 299 | def test_single_quote_attribute_values_become_double_quotes(self): 300 | self.assertSoupEquals("", 301 | '') 302 | 303 | def test_attribute_values_with_nested_quotes_are_left_alone(self): 304 | text = """a""" 305 | self.assertSoupEquals(text) 306 | 307 | def test_attribute_values_with_double_nested_quotes_get_quoted(self): 308 | text = """a""" 309 | soup = self.soup(text) 310 | soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' 311 | self.assertSoupEquals( 312 | soup.foo.decode(), 313 | """a""") 314 | 315 | def test_ampersand_in_attribute_value_gets_escaped(self): 316 | self.assertSoupEquals('', 317 | '') 318 | 319 | self.assertSoupEquals( 320 | 'foo', 321 | 'foo') 322 | 323 | def test_escaped_ampersand_in_attribute_value_is_left_alone(self): 324 | self.assertSoupEquals('') 325 | 326 | def test_entities_in_strings_converted_during_parsing(self): 327 | # Both XML and HTML entities are converted to Unicode characters 328 | # during parsing. 329 | text = "
<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>
" 330 | expected = "
<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>
" 331 | self.assertSoupEquals(text, expected) 332 | 333 | def test_smart_quotes_converted_on_the_way_in(self): 334 | # Microsoft smart quotes are converted to Unicode characters during 335 | # parsing. 336 | quote = b"
<p>\x91Foo\x92</p>
" 337 | soup = self.soup(quote) 338 | self.assertEqual( 339 | soup.p.string, 340 | "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") 341 | 342 | def test_non_breaking_spaces_converted_on_the_way_in(self): 343 | soup = self.soup("  ") 344 | self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2) 345 | 346 | def test_entities_converted_on_the_way_out(self): 347 | text = "
<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>
" 348 | expected = "
<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>
".encode("utf-8") 349 | soup = self.soup(text) 350 | self.assertEqual(soup.p.encode("utf-8"), expected) 351 | 352 | def test_real_iso_latin_document(self): 353 | # Smoke test of interrelated functionality, using an 354 | # easy-to-understand document. 355 | 356 | # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. 357 | unicode_html = '
<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>
' 358 | 359 | # That's because we're going to encode it into ISO-Latin-1, and use 360 | # that to test. 361 | iso_latin_html = unicode_html.encode("iso-8859-1") 362 | 363 | # Parse the ISO-Latin-1 HTML. 364 | soup = self.soup(iso_latin_html) 365 | # Encode it to UTF-8. 366 | result = soup.encode("utf-8") 367 | 368 | # What do we expect the result to look like? Well, it would 369 | # look like unicode_html, except that the META tag would say 370 | # UTF-8 instead of ISO-Latin-1. 371 | expected = unicode_html.replace("ISO-Latin-1", "utf-8") 372 | 373 | # And, of course, it would be in UTF-8, not Unicode. 374 | expected = expected.encode("utf-8") 375 | 376 | # Ta-da! 377 | self.assertEqual(result, expected) 378 | 379 | def test_real_shift_jis_document(self): 380 | # Smoke test to make sure the parser can handle a document in 381 | # Shift-JIS encoding, without choking. 382 | shift_jis_html = ( 383 | b'
384 |             b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
385 |             b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
386 |             b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
387 |             b'</pre></body></html>')
388 |         unicode_html = shift_jis_html.decode("shift-jis")
389 |         soup = self.soup(unicode_html)
390 | 
391 |         # Make sure the parse tree is correctly encoded to various
392 |         # encodings.
393 |         self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
394 |         self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
395 | 
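    # The same round trip would be expected to hold for the original
    # codec as well (a sketch, not asserted above):
    #
    #   self.assertEqual(
    #       soup.encode("shift-jis"), unicode_html.encode("shift-jis"))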
396 |     def test_real_hebrew_document(self):
397 |         # A real-world test to make sure we can convert ISO-8859-8 (a
398 |         # Hebrew encoding) to UTF-8.
399 |         hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
400 |         soup = self.soup(
401 |             hebrew_document, from_encoding="iso8859-8")
402 |         self.assertEqual(soup.original_encoding, 'iso8859-8')
403 |         self.assertEqual(
404 |             soup.encode('utf-8'),
405 |             hebrew_document.decode("iso8859-8").encode("utf-8"))
406 | 
407 |     def test_meta_tag_reflects_current_encoding(self):
408 |         # Here's the <meta> tag saying that a document is
409 |         # encoded in Shift-JIS.
410 |         meta_tag = ('<meta content="text/html; charset=x-sjis" '
411 |                     'http-equiv="Content-type"/>')
412 | 
413 |         # Here's a document incorporating that meta tag.
414 |         shift_jis_html = (
415 |             '<html><head>\n%s\n'
416 |             '<meta http-equiv="Content-language" content="ja"/>'
417 |             '</head><body>Shift-JIS markup goes here.') % meta_tag
418 |         soup = self.soup(shift_jis_html)
419 | 
420 |         # Parse the document, and the charset is seemingly unaffected.
421 |         parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
422 |         content = parsed_meta['content']
423 |         self.assertEqual('text/html; charset=x-sjis', content)
424 | 
425 |         # But that value is actually a ContentMetaAttributeValue object.
426 |         self.assertTrue(isinstance(content, ContentMetaAttributeValue))
427 | 
428 |         # And it will take on a value that reflects its current
429 |         # encoding.
430 |         self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
431 | 
432 |         # For the rest of the story, see TestSubstitutions in
433 |         # test_tree.py.
434 | 
435 |     def test_html5_style_meta_tag_reflects_current_encoding(self):
436 |         # Here's the <meta> tag saying that a document is
437 |         # encoded in Shift-JIS.
438 |         meta_tag = ('<meta id="encoding" charset="x-sjis" />')
439 | 
440 |         # Here's a document incorporating that meta tag.
441 |         shift_jis_html = (
442 |             '<html><head>\n%s\n'
443 |             '<meta http-equiv="Content-language" content="ja"/>'
444 |             '</head><body>Shift-JIS markup goes here.') % meta_tag
445 |         soup = self.soup(shift_jis_html)
446 | 
447 |         # Parse the document, and the charset is seemingly unaffected.
448 |         parsed_meta = soup.find('meta', id="encoding")
449 |         charset = parsed_meta['charset']
450 |         self.assertEqual('x-sjis', charset)
451 | 
452 |         # But that value is actually a CharsetMetaAttributeValue object.
453 |         self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
454 | 
455 |         # And it will take on a value that reflects its current
456 |         # encoding.
457 |         self.assertEqual('utf8', charset.encode("utf8"))
458 | 
459 |     def test_tag_with_no_attributes_can_have_attributes_added(self):
460 |         data = self.soup("<a>text</a>")
461 |         data.a['foo'] = 'bar'
462 |         self.assertEqual('<a foo="bar">text</a>', data.a.decode())
463 | 
464 | class XMLTreeBuilderSmokeTest(object):
465 | 
466 |     def test_docstring_generated(self):
467 |         soup = self.soup("<root/>")
468 |         self.assertEqual(
469 |             soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
470 | 
471 |     def test_real_xhtml_document(self):
472 |         """A real XHTML document should come out *exactly* the same as it went in."""
473 |         markup = b"""<?xml version="1.0" encoding="utf-8"?>
474 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
475 | <html xmlns="http://www.w3.org/1999/xhtml">
476 | <head><title>Hello.</title></head>
477 | <body>Goodbye.</body>
478 | </html>"""
479 |         soup = self.soup(markup)
480 |         self.assertEqual(
481 |             soup.encode("utf-8"), markup)
482 | 
483 |     def test_formatter_processes_script_tag_for_xml_documents(self):
484 |         doc = """
485 |   <script type="text/javascript">
486 |   </script>
487 | """
488 |         soup = BeautifulSoup(doc, "xml")
489 |         # lxml would have stripped this while parsing, but we can add
490 |         # it later.
491 |         soup.script.string = 'console.log("< < hey > > ");'
492 |         encoded = soup.encode()
493 |         self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
494 | 
495 |     def test_can_parse_unicode_document(self):
496 |         markup = '<?xml version="1.0" encoding="euc-jp"?><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
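        # (The declaration claims euc-jp, but the markup is passed in
        # as a str, so the parser has to ignore the declared encoding.)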
497 |         soup = self.soup(markup)
498 |         self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
499 | 
500 |     def test_popping_namespaced_tag(self):
501 |         markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
502 |         soup = self.soup(markup)
503 |         self.assertEqual(
504 |             str(soup.rss), markup)
505 | 
506 |     def test_docstring_includes_correct_encoding(self):
507 |         soup = self.soup("<root/>")
508 |         self.assertEqual(
509 |             soup.encode("latin1"),
510 |             b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
511 | 
512 |     def test_large_xml_document(self):
513 |         """A large XML document should come out the same as it went in."""
514 |         markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
515 |                   + b'0' * (2**12)
516 |                   + b'</root>')
517 |         soup = self.soup(markup)
518 |         self.assertEqual(soup.encode("utf-8"), markup)
519 | 
520 | 
521 |     def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
522 |         self.assertSoupEquals("<p>", "<p/>")
523 |         self.assertSoupEquals("<p>foo</p>")
524 | 
525 |     def test_namespaces_are_preserved(self):
526 |         markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
527 |         soup = self.soup(markup)
528 |         root = soup.root
529 |         self.assertEqual("http://example.com/", root['xmlns:a'])
530 |         self.assertEqual("http://example.net/", root['xmlns:b'])
531 | 
532 |     def test_closing_namespaced_tag(self):
533 |         markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
534 |         soup = self.soup(markup)
535 |         self.assertEqual(str(soup.p), markup)
536 | 
537 |     def test_namespaced_attributes(self):
538 |         markup = '<foo xsi:method="POST"></foo>'
539 |         soup = self.soup(markup)
540 |         self.assertEqual(str(soup.foo), markup)
541 | 
542 |     def test_namespaced_attributes_xml_namespace(self):
543 |         markup = '<foo xml:lang="fr">bar</foo>'
544 |         soup = self.soup(markup)
545 |         self.assertEqual(str(soup.foo), markup)
546 | 
547 | class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
548 |     """Smoke test for a tree builder that supports HTML5."""
549 | 
550 |     def test_real_xhtml_document(self):
551 |         # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
552 |         # XHTML documents in any particular way.
553 |         pass
554 | 
555 |     def test_html_tags_have_namespace(self):
556 |         markup = "<a>"
557 |         soup = self.soup(markup)
558 |         self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
559 | 
560 |     def test_svg_tags_have_namespace(self):
561 |         markup = '<svg><circle/></svg>'
562 |         soup = self.soup(markup)
563 |         namespace = "http://www.w3.org/2000/svg"
564 |         self.assertEqual(namespace, soup.svg.namespace)
565 |         self.assertEqual(namespace, soup.circle.namespace)
566 | 
567 | 
568 |     def test_mathml_tags_have_namespace(self):
569 |         markup = '<math><msqrt>5</msqrt></math>'
570 |         soup = self.soup(markup)
571 |         namespace = 'http://www.w3.org/1998/Math/MathML'
572 |         self.assertEqual(namespace, soup.math.namespace)
573 |         self.assertEqual(namespace, soup.msqrt.namespace)
574 | 
575 |     def test_xml_declaration_becomes_comment(self):
576 |         markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
577 |         soup = self.soup(markup)
578 |         self.assertTrue(isinstance(soup.contents[0], Comment))
579 |         self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
580 |         self.assertEqual("html", soup.contents[0].next_element.name)
581 | 
582 | def skipIf(condition, reason):
583 |     def nothing(test, *args, **kwargs):
584 |         return None
585 | 
586 |     def decorator(test_item):
587 |         if condition:
588 |             return nothing
589 |         else:
590 |             return test_item
591 | 
592 |     return decorator
593 | 
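# A usage sketch for skipIf (LXML_PRESENT is illustrative here; the
# real test modules define their own feature flags):
#
#   @skipIf(not LXML_PRESENT, "lxml is not installed")
#   def test_lxml_specific_behavior(self):
#       ...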
--------------------------------------------------------------------------------
/bs4/dammit.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Beautiful Soup bonus library: Unicode, Dammit
3 | 
4 | This library converts a bytestream to Unicode through any means
5 | necessary. It is heavily based on code from Mark Pilgrim's Universal
6 | Feed Parser. It works best on XML and HTML, but it does not rewrite the
7 | XML or HTML to reflect a new encoding; that's the tree builder's job.
8 | """
9 | 
10 | import codecs
11 | from html.entities import codepoint2name
12 | import re
13 | import logging
14 | import string
15 | 
16 | # Import a library to autodetect character encodings.
17 | chardet_type = None
18 | try:
19 |     # First try the fast C implementation.
20 |     #  PyPI package: cchardet
21 |     import cchardet
22 |     def chardet_dammit(s):
23 |         return cchardet.detect(s)['encoding']
24 | except ImportError:
25 |     try:
26 |         # Fall back to the pure Python implementation
27 |         #  Debian package: python-chardet
28 |         #  PyPI package: chardet
29 |         import chardet
30 |         def chardet_dammit(s):
31 |             return chardet.detect(s)['encoding']
32 |         #import chardet.constants
33 |         #chardet.constants._debug = 1
34 |     except ImportError:
35 |         # No chardet available.
36 |         def chardet_dammit(s):
37 |             return None
38 | 
39 | # Available from http://cjkpython.i18n.org/.
40 | try:
41 |     import iconv_codec
42 | except ImportError:
43 |     pass
44 | 
45 | xml_encoding_re = re.compile(
46 |     '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
47 | html_meta_re = re.compile(
48 |     '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
49 | 
50 | class EntitySubstitution(object):
51 | 
52 |     """Substitute XML or HTML entities for the corresponding characters."""
53 | 
54 |     def _populate_class_variables():
55 |         lookup = {}
56 |         reverse_lookup = {}
57 |         characters_for_re = []
58 |         for codepoint, name in list(codepoint2name.items()):
59 |             character = chr(codepoint)
60 |             if codepoint != 34:
61 |                 # There's no point in turning the quotation mark into
62 |                 # &quot;, unless it happens within an attribute value, which
63 |                 # is handled elsewhere.
64 |                 characters_for_re.append(character)
65 |                 lookup[character] = name
66 |             # But we do want to turn &quot; into the quotation mark.
67 |             reverse_lookup[name] = character
68 |         re_definition = "[%s]" % "".join(characters_for_re)
69 |         return lookup, reverse_lookup, re.compile(re_definition)
70 |     (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
71 |      CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
72 | 
73 |     CHARACTER_TO_XML_ENTITY = {
74 |         "'": "apos",
75 |         '"': "quot",
76 |         "&": "amp",
77 |         "<": "lt",
78 |         ">": "gt",
79 |         }
80 | 
81 |     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
82 |                                            "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
83 |                                            ")")
84 | 
85 |     AMPERSAND_OR_BRACKET = re.compile("([<>&])")
86 | 
87 |     @classmethod
88 |     def _substitute_html_entity(cls, matchobj):
89 |         entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
90 |         return "&%s;" % entity
91 | 
92 |     @classmethod
93 |     def _substitute_xml_entity(cls, matchobj):
94 |         """Used with a regular expression to substitute the
95 |         appropriate XML entity for an XML special character."""
96 |         entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
97 |         return "&%s;" % entity
98 | 
99 |     @classmethod
100 |     def quoted_attribute_value(cls, value):
101 |         """Make a value into a quoted XML attribute, possibly escaping it.
102 | 
103 |         Most strings will be quoted using double quotes.
104 | 
105 |             Bob's Bar -> "Bob's Bar"
106 | 
107 |         If a string contains double quotes, it will be quoted using
108 |         single quotes.
109 | 
110 |             Welcome to "my bar" -> 'Welcome to "my bar"'
111 | 
112 |         If a string contains both single and double quotes, the
113 |         double quotes will be escaped, and the string will be quoted
114 |         using double quotes.
115 | 
116 |             Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"
117 |         """
118 |         quote_with = '"'
119 |         if '"' in value:
120 |             if "'" in value:
121 |                 # The string contains both single and double
122 |                 # quotes.  Turn the double quotes into
123 |                 # entities. We quote the double quotes rather than
124 |                 # the single quotes because the entity name is
125 |                 # "&quot;" whether this is HTML or XML.  If we
126 |                 # quoted the single quotes, we'd have to decide
127 |                 # between &apos; and &squot;.
128 |                 replace_with = "&quot;"
129 |                 value = value.replace('"', replace_with)
130 |             else:
131 |                 # There are double quotes but no single quotes.
132 |                 # We can use single quotes to quote the attribute.
133 |                 quote_with = "'"
134 |         return quote_with + value + quote_with
135 | 
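    # The three quoting cases above, as a quick sketch (values are
    # illustrative):
    #
    #   EntitySubstitution.quoted_attribute_value("Bob's Bar")
    #   # => double-quoted: "Bob's Bar"
    #   EntitySubstitution.quoted_attribute_value('Welcome to "my bar"')
    #   # => single-quoted: 'Welcome to "my bar"'
    #   EntitySubstitution.quoted_attribute_value('Welcome to "Bob\'s Bar"')
    #   # => double quotes escaped: "Welcome to &quot;Bob's Bar&quot;"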
136 |     @classmethod
137 |     def substitute_xml(cls, value, make_quoted_attribute=False):
138 |         """Substitute XML entities for special XML characters.
139 | 
140 |         :param value: A string to be substituted. The less-than sign
141 |           will become &lt;, the greater-than sign will become &gt;,
142 |           and any ampersands will become &amp;. If you want ampersands
143 |           that appear to be part of an entity definition to be left
144 |           alone, use substitute_xml_containing_entities() instead.
145 | 
146 |         :param make_quoted_attribute: If True, then the string will be
147 |           quoted, as befits an attribute value.
148 |         """
149 |         # Escape angle brackets and ampersands.
150 |         value = cls.AMPERSAND_OR_BRACKET.sub(
151 |             cls._substitute_xml_entity, value)
152 | 
153 |         if make_quoted_attribute:
154 |             value = cls.quoted_attribute_value(value)
155 |         return value
156 | 
157 |     @classmethod
158 |     def substitute_xml_containing_entities(
159 |         cls, value, make_quoted_attribute=False):
160 |         """Substitute XML entities for special XML characters.
161 | 
162 |         :param value: A string to be substituted. The less-than sign will
163 |           become &lt;, the greater-than sign will become &gt;, and any
164 |           ampersands that are not part of an entity definition will
165 |           become &amp;.
166 | 
167 |         :param make_quoted_attribute: If True, then the string will be
168 |           quoted, as befits an attribute value.
169 |         """
170 |         # Escape angle brackets, and ampersands that aren't part of
171 |         # entities.
172 |         value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
173 |             cls._substitute_xml_entity, value)
174 | 
175 |         if make_quoted_attribute:
176 |             value = cls.quoted_attribute_value(value)
177 |         return value
178 | 
179 |     @classmethod
180 |     def substitute_html(cls, s):
181 |         """Replace certain Unicode characters with named HTML entities.
182 | 
183 |         This differs from data.encode(encoding, 'xmlcharrefreplace')
184 |         in that the goal is to make the result more readable (to those
185 |         with ASCII displays) rather than to recover from
186 |         errors. There's absolutely nothing wrong with a UTF-8 string
187 |         containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
188 |         character with "&eacute;" will make it more readable to some
189 |         people.
190 |         """
191 |         return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
192 |             cls._substitute_html_entity, s)
193 | 
194 | 
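# A quick contrast of the two XML substitution methods above; the
# outputs follow from the regular expressions they use:
#
#   EntitySubstitution.substitute_xml("AT&amp;T")
#   # => 'AT&amp;amp;T' -- every ampersand is escaped
#   EntitySubstitution.substitute_xml_containing_entities("AT&amp;T")
#   # => 'AT&amp;T' -- the existing entity reference is left alone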
195 | class EncodingDetector:
196 |     """Suggests a number of possible encodings for a bytestring.
197 | 
198 |     Order of precedence:
199 | 
200 |     1. Encodings you specifically tell EncodingDetector to try first
201 |     (the override_encodings argument to the constructor).
202 | 
203 |     2. An encoding declared within the bytestring itself, either in an
204 |     XML declaration (if the bytestring is to be interpreted as an XML
205 |     document), or in a <meta> tag (if the bytestring is to be
206 |     interpreted as an HTML document.)
207 | 
208 |     3. An encoding detected through textual analysis by chardet,
209 |     cchardet, or a similar external library.
210 | 
211 |     4. UTF-8.
212 | 
213 |     5. Windows-1252.
214 |     """
215 |     def __init__(self, markup, override_encodings=None, is_html=False):
216 |         self.override_encodings = override_encodings or []
217 |         self.chardet_encoding = None
218 |         self.is_html = is_html
219 |         self.declared_encoding = None
220 | 
221 |         # First order of business: strip a byte-order mark.
222 |         self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
223 | 
224 |     def _usable(self, encoding, tried):
225 |         if encoding is not None:
226 |             encoding = encoding.lower()
227 |             if encoding not in tried:
228 |                 tried.add(encoding)
229 |                 return True
230 |         return False
231 | 
232 |     @property
233 |     def encodings(self):
234 |         """Yield a number of encodings that might work for this markup."""
235 |         tried = set()
236 |         for e in self.override_encodings:
237 |             if self._usable(e, tried):
238 |                 yield e
239 | 
240 |         # Did the document originally start with a byte-order mark
241 |         # that indicated its encoding?
242 |         if self._usable(self.sniffed_encoding, tried):
243 |             yield self.sniffed_encoding
244 | 
245 |         # Look within the document for an XML or HTML encoding
246 |         # declaration.
247 |         if self.declared_encoding is None:
248 |             self.declared_encoding = self.find_declared_encoding(
249 |                 self.markup, self.is_html)
250 |         if self._usable(self.declared_encoding, tried):
251 |             yield self.declared_encoding
252 | 
253 |         # Use third-party character set detection to guess at the
254 |         # encoding.
255 |         if self.chardet_encoding is None:
256 |             self.chardet_encoding = chardet_dammit(self.markup)
257 |         if self._usable(self.chardet_encoding, tried):
258 |             yield self.chardet_encoding
259 | 
260 |         # As a last-ditch effort, try utf-8 and windows-1252.
261 |         for e in ('utf-8', 'windows-1252'):
262 |             if self._usable(e, tried):
263 |                 yield e
264 | 
265 |     @classmethod
266 |     def strip_byte_order_mark(cls, data):
267 |         """If a byte-order mark is present, strip it and return the encoding it implies."""
268 |         encoding = None
269 |         if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
270 |                and (data[2:4] != b'\x00\x00'):
271 |             encoding = 'utf-16be'
272 |             data = data[2:]
273 |         elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
274 |                  and (data[2:4] != b'\x00\x00'):
275 |             encoding = 'utf-16le'
276 |             data = data[2:]
277 |         elif data[:3] == b'\xef\xbb\xbf':
278 |             encoding = 'utf-8'
279 |             data = data[3:]
280 |         elif data[:4] == b'\x00\x00\xfe\xff':
281 |             encoding = 'utf-32be'
282 |             data = data[4:]
283 |         elif data[:4] == b'\xff\xfe\x00\x00':
284 |             encoding = 'utf-32le'
285 |             data = data[4:]
286 |         return data, encoding
287 | 
288 |     @classmethod
289 |     def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
290 |         """Given a document, tries to find its declared encoding.
291 | 
292 |         An XML encoding is declared at the beginning of the document.
293 | 
294 |         An HTML encoding is declared in a <meta> tag, hopefully near the
295 |         beginning of the document.
296 |         """
297 |         if search_entire_document:
298 |             xml_endpos = html_endpos = len(markup)
299 |         else:
300 |             xml_endpos = 1024
301 |             html_endpos = max(2048, int(len(markup) * 0.05))
302 | 
303 |         declared_encoding = None
304 |         declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
305 |         if not declared_encoding_match and is_html:
306 |             declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
307 |         if declared_encoding_match is not None:
308 |             declared_encoding = declared_encoding_match.groups()[0].decode(
309 |                 'ascii')
310 |         if declared_encoding:
311 |             return declared_encoding.lower()
312 |         return None
313 | 
314 | class UnicodeDammit:
315 |     """A class for detecting the encoding of a *ML document and
316 |     converting it to a Unicode string.
If the source encoding is 317 | windows-1252, can replace MS smart quotes with their HTML or XML 318 | equivalents.""" 319 | 320 | # This dictionary maps commonly seen values for "charset" in HTML 321 | # meta tags to the corresponding Python codec names. It only covers 322 | # values that aren't in Python's aliases and can't be determined 323 | # by the heuristics in find_codec. 324 | CHARSET_ALIASES = {"macintosh": "mac-roman", 325 | "x-sjis": "shift-jis"} 326 | 327 | ENCODINGS_WITH_SMART_QUOTES = [ 328 | "windows-1252", 329 | "iso-8859-1", 330 | "iso-8859-2", 331 | ] 332 | 333 | def __init__(self, markup, override_encodings=[], 334 | smart_quotes_to=None, is_html=False): 335 | self.smart_quotes_to = smart_quotes_to 336 | self.tried_encodings = [] 337 | self.contains_replacement_characters = False 338 | self.is_html = is_html 339 | 340 | self.detector = EncodingDetector(markup, override_encodings, is_html) 341 | 342 | # Short-circuit if the data is in Unicode to begin with. 343 | if isinstance(markup, str) or markup == '': 344 | self.markup = markup 345 | self.unicode_markup = str(markup) 346 | self.original_encoding = None 347 | return 348 | 349 | # The encoding detector may have stripped a byte-order mark. 350 | # Use the stripped markup from this point on. 351 | self.markup = self.detector.markup 352 | 353 | u = None 354 | for encoding in self.detector.encodings: 355 | markup = self.detector.markup 356 | u = self._convert_from(encoding) 357 | if u is not None: 358 | break 359 | 360 | if not u: 361 | # None of the encodings worked. As an absolute last resort, 362 | # try them again with character replacement. 363 | 364 | for encoding in self.detector.encodings: 365 | if encoding != "ascii": 366 | u = self._convert_from(encoding, "replace") 367 | if u is not None: 368 | logging.warning( 369 | "Some characters could not be decoded, and were " 370 | "replaced with REPLACEMENT CHARACTER.") 371 | self.contains_replacement_characters = True 372 | break 373 | 374 | # If none of that worked, we could at this point force it to 375 | # ASCII, but that would destroy so much data that I think 376 | # giving up is better. 377 | self.unicode_markup = u 378 | if not u: 379 | self.original_encoding = None 380 | 381 | def _sub_ms_char(self, match): 382 | """Changes a MS smart quote character to an XML or HTML 383 | entity, or an ASCII character.""" 384 | orig = match.group(1) 385 | if self.smart_quotes_to == 'ascii': 386 | sub = self.MS_CHARS_TO_ASCII.get(orig).encode() 387 | else: 388 | sub = self.MS_CHARS.get(orig) 389 | if type(sub) == tuple: 390 | if self.smart_quotes_to == 'xml': 391 | sub = '&#x'.encode() + sub[1].encode() + ';'.encode() 392 | else: 393 | sub = '&'.encode() + sub[0].encode() + ';'.encode() 394 | else: 395 | sub = sub.encode() 396 | return sub 397 | 398 | def _convert_from(self, proposed, errors="strict"): 399 | proposed = self.find_codec(proposed) 400 | if not proposed or (proposed, errors) in self.tried_encodings: 401 | return None 402 | self.tried_encodings.append((proposed, errors)) 403 | markup = self.markup 404 | # Convert smart quotes to HTML if coming from an encoding 405 | # that might have them. 
406 | if (self.smart_quotes_to is not None 407 | and proposed in self.ENCODINGS_WITH_SMART_QUOTES): 408 | smart_quotes_re = b"([\x80-\x9f])" 409 | smart_quotes_compiled = re.compile(smart_quotes_re) 410 | markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) 411 | 412 | try: 413 | #print "Trying to convert document to %s (errors=%s)" % ( 414 | # proposed, errors) 415 | u = self._to_unicode(markup, proposed, errors) 416 | self.markup = u 417 | self.original_encoding = proposed 418 | except Exception as e: 419 | #print "That didn't work!" 420 | #print e 421 | return None 422 | #print "Correct encoding: %s" % proposed 423 | return self.markup 424 | 425 | def _to_unicode(self, data, encoding, errors="strict"): 426 | '''Given a string and its encoding, decodes the string into Unicode. 427 | %encoding is a string recognized by encodings.aliases''' 428 | return str(data, encoding, errors) 429 | 430 | @property 431 | def declared_html_encoding(self): 432 | if not self.is_html: 433 | return None 434 | return self.detector.declared_encoding 435 | 436 | def find_codec(self, charset): 437 | value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) 438 | or (charset and self._codec(charset.replace("-", ""))) 439 | or (charset and self._codec(charset.replace("-", "_"))) 440 | or (charset and charset.lower()) 441 | or charset 442 | ) 443 | if value: 444 | return value.lower() 445 | return None 446 | 447 | def _codec(self, charset): 448 | if not charset: 449 | return charset 450 | codec = None 451 | try: 452 | codecs.lookup(charset) 453 | codec = charset 454 | except (LookupError, ValueError): 455 | pass 456 | return codec 457 | 458 | 459 | # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. 460 | MS_CHARS = {b'\x80': ('euro', '20AC'), 461 | b'\x81': ' ', 462 | b'\x82': ('sbquo', '201A'), 463 | b'\x83': ('fnof', '192'), 464 | b'\x84': ('bdquo', '201E'), 465 | b'\x85': ('hellip', '2026'), 466 | b'\x86': ('dagger', '2020'), 467 | b'\x87': ('Dagger', '2021'), 468 | b'\x88': ('circ', '2C6'), 469 | b'\x89': ('permil', '2030'), 470 | b'\x8A': ('Scaron', '160'), 471 | b'\x8B': ('lsaquo', '2039'), 472 | b'\x8C': ('OElig', '152'), 473 | b'\x8D': '?', 474 | b'\x8E': ('#x17D', '17D'), 475 | b'\x8F': '?', 476 | b'\x90': '?', 477 | b'\x91': ('lsquo', '2018'), 478 | b'\x92': ('rsquo', '2019'), 479 | b'\x93': ('ldquo', '201C'), 480 | b'\x94': ('rdquo', '201D'), 481 | b'\x95': ('bull', '2022'), 482 | b'\x96': ('ndash', '2013'), 483 | b'\x97': ('mdash', '2014'), 484 | b'\x98': ('tilde', '2DC'), 485 | b'\x99': ('trade', '2122'), 486 | b'\x9a': ('scaron', '161'), 487 | b'\x9b': ('rsaquo', '203A'), 488 | b'\x9c': ('oelig', '153'), 489 | b'\x9d': '?', 490 | b'\x9e': ('#x17E', '17E'), 491 | b'\x9f': ('Yuml', ''),} 492 | 493 | # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains 494 | # horrors like stripping diacritical marks to turn á into a, but also 495 | # contains non-horrors like turning “ into ". 
496 | MS_CHARS_TO_ASCII = { 497 | b'\x80' : 'EUR', 498 | b'\x81' : ' ', 499 | b'\x82' : ',', 500 | b'\x83' : 'f', 501 | b'\x84' : ',,', 502 | b'\x85' : '...', 503 | b'\x86' : '+', 504 | b'\x87' : '++', 505 | b'\x88' : '^', 506 | b'\x89' : '%', 507 | b'\x8a' : 'S', 508 | b'\x8b' : '<', 509 | b'\x8c' : 'OE', 510 | b'\x8d' : '?', 511 | b'\x8e' : 'Z', 512 | b'\x8f' : '?', 513 | b'\x90' : '?', 514 | b'\x91' : "'", 515 | b'\x92' : "'", 516 | b'\x93' : '"', 517 | b'\x94' : '"', 518 | b'\x95' : '*', 519 | b'\x96' : '-', 520 | b'\x97' : '--', 521 | b'\x98' : '~', 522 | b'\x99' : '(TM)', 523 | b'\x9a' : 's', 524 | b'\x9b' : '>', 525 | b'\x9c' : 'oe', 526 | b'\x9d' : '?', 527 | b'\x9e' : 'z', 528 | b'\x9f' : 'Y', 529 | b'\xa0' : ' ', 530 | b'\xa1' : '!', 531 | b'\xa2' : 'c', 532 | b'\xa3' : 'GBP', 533 | b'\xa4' : '$', #This approximation is especially parochial--this is the 534 | #generic currency symbol. 535 | b'\xa5' : 'YEN', 536 | b'\xa6' : '|', 537 | b'\xa7' : 'S', 538 | b'\xa8' : '..', 539 | b'\xa9' : '', 540 | b'\xaa' : '(th)', 541 | b'\xab' : '<<', 542 | b'\xac' : '!', 543 | b'\xad' : ' ', 544 | b'\xae' : '(R)', 545 | b'\xaf' : '-', 546 | b'\xb0' : 'o', 547 | b'\xb1' : '+-', 548 | b'\xb2' : '2', 549 | b'\xb3' : '3', 550 | b'\xb4' : ("'", 'acute'), 551 | b'\xb5' : 'u', 552 | b'\xb6' : 'P', 553 | b'\xb7' : '*', 554 | b'\xb8' : ',', 555 | b'\xb9' : '1', 556 | b'\xba' : '(th)', 557 | b'\xbb' : '>>', 558 | b'\xbc' : '1/4', 559 | b'\xbd' : '1/2', 560 | b'\xbe' : '3/4', 561 | b'\xbf' : '?', 562 | b'\xc0' : 'A', 563 | b'\xc1' : 'A', 564 | b'\xc2' : 'A', 565 | b'\xc3' : 'A', 566 | b'\xc4' : 'A', 567 | b'\xc5' : 'A', 568 | b'\xc6' : 'AE', 569 | b'\xc7' : 'C', 570 | b'\xc8' : 'E', 571 | b'\xc9' : 'E', 572 | b'\xca' : 'E', 573 | b'\xcb' : 'E', 574 | b'\xcc' : 'I', 575 | b'\xcd' : 'I', 576 | b'\xce' : 'I', 577 | b'\xcf' : 'I', 578 | b'\xd0' : 'D', 579 | b'\xd1' : 'N', 580 | b'\xd2' : 'O', 581 | b'\xd3' : 'O', 582 | b'\xd4' : 'O', 583 | b'\xd5' : 'O', 584 | b'\xd6' : 'O', 585 | b'\xd7' : '*', 586 | b'\xd8' : 'O', 587 | b'\xd9' : 'U', 588 | b'\xda' : 'U', 589 | b'\xdb' : 'U', 590 | b'\xdc' : 'U', 591 | b'\xdd' : 'Y', 592 | b'\xde' : 'b', 593 | b'\xdf' : 'B', 594 | b'\xe0' : 'a', 595 | b'\xe1' : 'a', 596 | b'\xe2' : 'a', 597 | b'\xe3' : 'a', 598 | b'\xe4' : 'a', 599 | b'\xe5' : 'a', 600 | b'\xe6' : 'ae', 601 | b'\xe7' : 'c', 602 | b'\xe8' : 'e', 603 | b'\xe9' : 'e', 604 | b'\xea' : 'e', 605 | b'\xeb' : 'e', 606 | b'\xec' : 'i', 607 | b'\xed' : 'i', 608 | b'\xee' : 'i', 609 | b'\xef' : 'i', 610 | b'\xf0' : 'o', 611 | b'\xf1' : 'n', 612 | b'\xf2' : 'o', 613 | b'\xf3' : 'o', 614 | b'\xf4' : 'o', 615 | b'\xf5' : 'o', 616 | b'\xf6' : 'o', 617 | b'\xf7' : '/', 618 | b'\xf8' : 'o', 619 | b'\xf9' : 'u', 620 | b'\xfa' : 'u', 621 | b'\xfb' : 'u', 622 | b'\xfc' : 'u', 623 | b'\xfd' : 'y', 624 | b'\xfe' : 'b', 625 | b'\xff' : 'y', 626 | } 627 | 628 | # A map used when removing rogue Windows-1252/ISO-8859-1 629 | # characters in otherwise UTF-8 documents. 630 | # 631 | # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in 632 | # Windows-1252. 
633 |     WINDOWS_1252_TO_UTF8 = {
634 |         0x80 : b'\xe2\x82\xac', # €
635 |         0x82 : b'\xe2\x80\x9a', # ‚
636 |         0x83 : b'\xc6\x92',     # ƒ
637 |         0x84 : b'\xe2\x80\x9e', # „
638 |         0x85 : b'\xe2\x80\xa6', # …
639 |         0x86 : b'\xe2\x80\xa0', # †
640 |         0x87 : b'\xe2\x80\xa1', # ‡
641 |         0x88 : b'\xcb\x86',     # ˆ
642 |         0x89 : b'\xe2\x80\xb0', # ‰
643 |         0x8a : b'\xc5\xa0',     # Š
644 |         0x8b : b'\xe2\x80\xb9', # ‹
645 |         0x8c : b'\xc5\x92',     # Œ
646 |         0x8e : b'\xc5\xbd',     # Ž
647 |         0x91 : b'\xe2\x80\x98', # ‘
648 |         0x92 : b'\xe2\x80\x99', # ’
649 |         0x93 : b'\xe2\x80\x9c', # “
650 |         0x94 : b'\xe2\x80\x9d', # ”
651 |         0x95 : b'\xe2\x80\xa2', # •
652 |         0x96 : b'\xe2\x80\x93', # –
653 |         0x97 : b'\xe2\x80\x94', # —
654 |         0x98 : b'\xcb\x9c',     # ˜
655 |         0x99 : b'\xe2\x84\xa2', # ™
656 |         0x9a : b'\xc5\xa1',     # š
657 |         0x9b : b'\xe2\x80\xba', # ›
658 |         0x9c : b'\xc5\x93',     # œ
659 |         0x9e : b'\xc5\xbe',     # ž
660 |         0x9f : b'\xc5\xb8',     # Ÿ
661 |         0xa0 : b'\xc2\xa0',     #  
662 |         0xa1 : b'\xc2\xa1',     # ¡
663 |         0xa2 : b'\xc2\xa2',     # ¢
664 |         0xa3 : b'\xc2\xa3',     # £
665 |         0xa4 : b'\xc2\xa4',     # ¤
666 |         0xa5 : b'\xc2\xa5',     # ¥
667 |         0xa6 : b'\xc2\xa6',     # ¦
668 |         0xa7 : b'\xc2\xa7',     # §
669 |         0xa8 : b'\xc2\xa8',     # ¨
670 |         0xa9 : b'\xc2\xa9',     # ©
671 |         0xaa : b'\xc2\xaa',     # ª
672 |         0xab : b'\xc2\xab',     # «
673 |         0xac : b'\xc2\xac',     # ¬
674 |         0xad : b'\xc2\xad',     # ­
675 |         0xae : b'\xc2\xae',     # ®
676 |         0xaf : b'\xc2\xaf',     # ¯
677 |         0xb0 : b'\xc2\xb0',     # °
678 |         0xb1 : b'\xc2\xb1',     # ±
679 |         0xb2 : b'\xc2\xb2',     # ²
680 |         0xb3 : b'\xc2\xb3',     # ³
681 |         0xb4 : b'\xc2\xb4',     # ´
682 |         0xb5 : b'\xc2\xb5',     # µ
683 |         0xb6 : b'\xc2\xb6',     # ¶
684 |         0xb7 : b'\xc2\xb7',     # ·
685 |         0xb8 : b'\xc2\xb8',     # ¸
686 |         0xb9 : b'\xc2\xb9',     # ¹
687 |         0xba : b'\xc2\xba',     # º
688 |         0xbb : b'\xc2\xbb',     # »
689 |         0xbc : b'\xc2\xbc',     # ¼
690 |         0xbd : b'\xc2\xbd',     # ½
691 |         0xbe : b'\xc2\xbe',     # ¾
692 |         0xbf : b'\xc2\xbf',     # ¿
693 |         0xc0 : b'\xc3\x80',     # À
694 |         0xc1 : b'\xc3\x81',     # Á
695 |         0xc2 : b'\xc3\x82',     # Â
696 |         0xc3 : b'\xc3\x83',     # Ã
697 |         0xc4 : b'\xc3\x84',     # Ä
698 |         0xc5 : b'\xc3\x85',     # Å
699 |         0xc6 : b'\xc3\x86',     # Æ
700 |         0xc7 : b'\xc3\x87',     # Ç
701 |         0xc8 : b'\xc3\x88',     # È
702 |         0xc9 : b'\xc3\x89',     # É
703 |         0xca : b'\xc3\x8a',     # Ê
704 |         0xcb : b'\xc3\x8b',     # Ë
705 |         0xcc : b'\xc3\x8c',     # Ì
706 |         0xcd : b'\xc3\x8d',     # Í
707 |         0xce : b'\xc3\x8e',     # Î
708 |         0xcf : b'\xc3\x8f',     # Ï
709 |         0xd0 : b'\xc3\x90',     # Ð
710 |         0xd1 : b'\xc3\x91',     # Ñ
711 |         0xd2 : b'\xc3\x92',     # Ò
712 |         0xd3 : b'\xc3\x93',     # Ó
713 |         0xd4 : b'\xc3\x94',     # Ô
714 |         0xd5 : b'\xc3\x95',     # Õ
715 |         0xd6 : b'\xc3\x96',     # Ö
716 |         0xd7 : b'\xc3\x97',     # ×
717 |         0xd8 : b'\xc3\x98',     # Ø
718 |         0xd9 : b'\xc3\x99',     # Ù
719 |         0xda : b'\xc3\x9a',     # Ú
720 |         0xdb : b'\xc3\x9b',     # Û
721 |         0xdc : b'\xc3\x9c',     # Ü
722 |         0xdd : b'\xc3\x9d',     # Ý
723 |         0xde : b'\xc3\x9e',     # Þ
724 |         0xdf : b'\xc3\x9f',     # ß
725 |         0xe0 : b'\xc3\xa0',     # à
726 |         0xe1 : b'\xc3\xa1',     # á
727 |         0xe2 : b'\xc3\xa2',     # â
728 |         0xe3 : b'\xc3\xa3',     # ã
729 |         0xe4 : b'\xc3\xa4',     # ä
730 |         0xe5 : b'\xc3\xa5',     # å
731 |         0xe6 : b'\xc3\xa6',     # æ
732 |         0xe7 : b'\xc3\xa7',     # ç
733 |         0xe8 : b'\xc3\xa8',     # è
734 |         0xe9 : b'\xc3\xa9',     # é
735 |         0xea : b'\xc3\xaa',     # ê
736 |         0xeb : b'\xc3\xab',     # ë
737 |         0xec : b'\xc3\xac',     # ì
738 |         0xed : b'\xc3\xad',     # í
739 |         0xee : b'\xc3\xae',     # î
740 |         0xef : b'\xc3\xaf',     # ï
741 |         0xf0 : b'\xc3\xb0',     # ð
742 |         0xf1 : b'\xc3\xb1',     # ñ
743 |         0xf2 : b'\xc3\xb2',     # ò
744 |         0xf3 : b'\xc3\xb3',     # ó
745 |         0xf4 : b'\xc3\xb4',     # ô
746 |         0xf5 : b'\xc3\xb5',     # õ
747 |         0xf6 : b'\xc3\xb6',     # ö
748 |         0xf7 : b'\xc3\xb7',     # ÷
749 |         0xf8 
: b'\xc3\xb8', # ø 750 | 0xf9 : b'\xc3\xb9', # ù 751 | 0xfa : b'\xc3\xba', # ú 752 | 0xfb : b'\xc3\xbb', # û 753 | 0xfc : b'\xc3\xbc', # ü 754 | 0xfd : b'\xc3\xbd', # ý 755 | 0xfe : b'\xc3\xbe', # þ 756 | } 757 | 758 | MULTIBYTE_MARKERS_AND_SIZES = [ 759 | (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF 760 | (0xe0, 0xef, 3), # 3-byte characters start with E0-EF 761 | (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4 762 | ] 763 | 764 | FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0] 765 | LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1] 766 | 767 | @classmethod 768 | def detwingle(cls, in_bytes, main_encoding="utf8", 769 | embedded_encoding="windows-1252"): 770 | """Fix characters from one encoding embedded in some other encoding. 771 | 772 | Currently the only situation supported is Windows-1252 (or its 773 | subset ISO-8859-1), embedded in UTF-8. 774 | 775 | The input must be a bytestring. If you've already converted 776 | the document to Unicode, you're too late. 777 | 778 | The output is a bytestring in which `embedded_encoding` 779 | characters have been converted to their `main_encoding` 780 | equivalents. 781 | """ 782 | if embedded_encoding.replace('_', '-').lower() not in ( 783 | 'windows-1252', 'windows_1252'): 784 | raise NotImplementedError( 785 | "Windows-1252 and ISO-8859-1 are the only currently supported " 786 | "embedded encodings.") 787 | 788 | if main_encoding.lower() not in ('utf8', 'utf-8'): 789 | raise NotImplementedError( 790 | "UTF-8 is the only currently supported main encoding.") 791 | 792 | byte_chunks = [] 793 | 794 | chunk_start = 0 795 | pos = 0 796 | while pos < len(in_bytes): 797 | byte = in_bytes[pos] 798 | if not isinstance(byte, int): 799 | # Python 2.x 800 | byte = ord(byte) 801 | if (byte >= cls.FIRST_MULTIBYTE_MARKER 802 | and byte <= cls.LAST_MULTIBYTE_MARKER): 803 | # This is the start of a UTF-8 multibyte character. Skip 804 | # to the end. 805 | for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: 806 | if byte >= start and byte <= end: 807 | pos += size 808 | break 809 | elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8: 810 | # We found a Windows-1252 character! 811 | # Save the string up to this point as a chunk. 812 | byte_chunks.append(in_bytes[chunk_start:pos]) 813 | 814 | # Now translate the Windows-1252 character into UTF-8 815 | # and add it as another, one-byte chunk. 816 | byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte]) 817 | pos += 1 818 | chunk_start = pos 819 | else: 820 | # Go on to the next character. 821 | pos += 1 822 | if chunk_start == 0: 823 | # The string is unchanged. 824 | return in_bytes 825 | else: 826 | # Store the final chunk. 827 | byte_chunks.append(in_bytes[chunk_start:]) 828 | return b''.join(byte_chunks) 829 | 830 | --------------------------------------------------------------------------------
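# A minimal usage sketch for UnicodeDammit.detwingle, assuming the bs4
# package in this repository is importable:
#
#   from bs4 import UnicodeDammit
#   snowmen = "\N{SNOWMAN}" * 3
#   quote = "\N{LEFT DOUBLE QUOTATION MARK}Hi!\N{RIGHT DOUBLE QUOTATION MARK}"
#   doc = snowmen.encode("utf8") + quote.encode("windows_1252")
#   fixed = UnicodeDammit.detwingle(doc)
#   print(fixed.decode("utf8"))   # both the snowmen and the smart quotes survive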