├── .gitignore
├── workflow
│   ├── version
│   ├── Notify.tgz
│   ├── __init__.py
│   ├── background.py
│   ├── notify.py
│   ├── update.py
│   └── web.py
├── bs4
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_htmlparser.py
│   │   ├── test_docs.py
│   │   ├── test_lxml.py
│   │   ├── test_html5lib.py
│   │   ├── test_builder_registry.py
│   │   └── test_soup.py
│   ├── diagnose.py
│   ├── builder
│   │   ├── _lxml.py
│   │   ├── _htmlparser.py
│   │   ├── __init__.py
│   │   └── _html5lib.py
│   ├── __init__.py
│   └── testing.py
├── doc.png
├── icon.png
├── Mweb-Blog.alfredworkflow
├── README.md
├── ListArticle.py
├── article.py
├── xpinyin
│   └── __init__.py
└── info.plist
/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /workflow/version: -------------------------------------------------------------------------------- 1 | 1.17.2 -------------------------------------------------------------------------------- /bs4/tests/__init__.py: -------------------------------------------------------------------------------- 1 | "The beautifulsoup tests." 2 | -------------------------------------------------------------------------------- /doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haoliplus/alfred3-mweb-workflow/HEAD/doc.png -------------------------------------------------------------------------------- /icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haoliplus/alfred3-mweb-workflow/HEAD/icon.png -------------------------------------------------------------------------------- /workflow/Notify.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haoliplus/alfred3-mweb-workflow/HEAD/workflow/Notify.tgz -------------------------------------------------------------------------------- /Mweb-Blog.alfredworkflow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haoliplus/alfred3-mweb-workflow/HEAD/Mweb-Blog.alfredworkflow -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # An Alfred 3 plugin that assists writing with MWeb
2 |
3 | The Alfred 3 beta came out recently and I wanted to learn how Workflow development works. I had also hit a few problems while using MWeb, so until its author ships proper fixes I use Alfred to assist my writing.
4 |
5 | This tool provides features for working with a locally generated static site.
6 |
7 | [Download link](https://github.com/DarryO/alfred3-mweb-workflow/raw/master/Mweb-Blog.alfredworkflow)
8 |
9 | Features:
10 |
11 | 1. Preview the static blog locally.
12 | >The files MWeb generates on my machine all carry the @ extended-attribute flag and cannot be opened directly in Chrome for preview. This feature removes the extended attributes and previews the blog locally.
13 |
14 | 2. Push to a Git repository.
15 | >If the static-site output directory is already set up as a Git repository and passwordless SSH login is configured, this Workflow can push straight to the remote repository.
16 |
17 | 3. Search articles on the site and generate jump links. Searches the already generated articles (pinyin search thanks to [the pinyin module provided by lxneng](https://github.com/lxneng/xpinyin.git)) and generates an in-site jump link.
18 |
19 | The first two features need no further explanation. As for generating in-site links, here is what feature 3 does:
20 |
21 | ![](http://i.imgur.com/S4ZXYAC.gif)
22 |
23 | Alfred reads archives.html in the static-site output directory, lists every article's title, category, and creation date, and filters them by the pinyin of the user's input (see the sketch at the end of this README).
24 |
25 | I use the environment-variable feature Alfred 3 provides, so users need to configure the path to their own static site before first use.
26 |
27 | ![](http://i.imgur.com/mwBmo56.png)
28 | ![](http://i.imgur.com/J0zb5Dp.png)
29 |
30 |
31 |
32 |
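For reference, the pinyin matching at the heart of feature 3 boils down to the following (a simplified sketch of what `ListArticle.py` does; `article` is one of the dicts `article.py` parses out of archives.html, and `matches` is a helper name introduced here only for illustration):

```python
import xpinyin

py = xpinyin.Pinyin()

def matches(article, query):
    # An article matches when the query is a substring of the pinyin of
    # its title or of its category; an empty query matches everything.
    return (query == ""
            or query in py.get_pinyin(article['title'], '').lower()
            or query in py.get_pinyin(article['category'], '').lower())
```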
-------------------------------------------------------------------------------- /ListArticle.py: --------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | import re,urllib,sys
3 | import article as art
4 | import xpinyin
5 | from workflow import Workflow
6 | def main(wf):
7 |     site_path = wf.args[0]
8 |     articles = art.Articles(site_path)
9 |     py = xpinyin.Pinyin()
10 |     if len(wf.args) == 2:
11 |         query = wf.args[1]
12 |     else:
13 |         query = ""
14 |     for article in articles.articles:
15 |         if not (query == "" or query in py.get_pinyin(article['title'],'').lower() or query in py.get_pinyin(article['category'],'').lower()):
16 |             continue
17 |         wf.add_item(title = article['title'],
18 |                     subtitle = article['date']+" " + article['category'],
19 |                     arg = '[%s](./%s)' % (article['title'],article['link']),
20 |                     valid=True,
21 |                     icon = "doc.png")
22 |
23 |     wf.send_feedback()
24 |
25 | if __name__ == "__main__":
26 |     wf = Workflow()
27 |     sys.exit(wf.run(main))
28 | -------------------------------------------------------------------------------- /article.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import re
4 | import sys,os
5 | from bs4 import BeautifulSoup
6 |
7 | class Articles:
8 |
9 |     def __init__(self, path):
10 |         self.archives_path = os.path.join(path, "archives.html")
11 |         if os.path.isfile(self.archives_path):
12 |             self.archives_content = open(self.archives_path).read()
13 |         else:
14 |             self.archives_content = ""
15 |         soup = BeautifulSoup(self.archives_content,"html.parser")
16 |         self.articles = []
17 |         for item in soup.find_all('div','article'):
18 |             article = {}
19 |             article['title']= item.find('h1').get_text()
20 |             article['link'] = item.find('a','clearlink').get('href')
21 |             article['date']= item.find('span','date').get_text()
22 |             article['category'] = ' '.join([x.get_text() for x in item.find_all('span','posted-in')])
23 |             self.articles.append(article)
24 |
25 | if __name__ == "__main__":
26 |     a = Articles("/Users/hao/blog/MWeb-Blog/Blog")
27 |     for i in a.articles:
28 |         print i['title']
29 |
30 | -------------------------------------------------------------------------------- /bs4/tests/test_htmlparser.py: --------------------------------------------------------------------------------
1 | """Tests to ensure that the html.parser tree builder generates good
2 | trees."""
3 |
4 | from pdb import set_trace
5 | import pickle
6 | from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
7 | from bs4.builder import HTMLParserTreeBuilder
8 |
9 | class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
10 |
11 |     @property
12 |     def default_builder(self):
13 |         return HTMLParserTreeBuilder()
14 |
15 |     def test_namespaced_system_doctype(self):
16 |         # html.parser can't handle namespaced doctypes, so skip this one.
17 |         pass
18 |
19 |     def test_namespaced_public_doctype(self):
20 |         # html.parser can't handle namespaced doctypes, so skip this one.
21 |         pass
22 |
23 |     def test_builder_is_pickled(self):
24 |         """Unlike most tree builders, HTMLParserTreeBuilder can be pickled
25 |         and will be restored after pickling.
26 |         """
27 |         tree = self.soup("<a><b>foo</a>")
28 |         dumped = pickle.dumps(tree, 2)
29 |         loaded = pickle.loads(dumped)
30 |         self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
31 |
32 |
33 | -------------------------------------------------------------------------------- /bs4/tests/test_docs.py: --------------------------------------------------------------------------------
1 | "Test harness for doctests."
2 |
3 | # pylint: disable-msg=E0611,W0142
4 |
5 | __metaclass__ = type
6 | __all__ = [
7 |     'additional_tests',
8 |     ]
9 |
10 | import atexit
11 | import doctest
12 | import os
13 | #from pkg_resources import (
14 | #    resource_filename, resource_exists, resource_listdir, cleanup_resources)
15 | import unittest
16 |
17 | DOCTEST_FLAGS = (
18 |     doctest.ELLIPSIS |
19 |     doctest.NORMALIZE_WHITESPACE |
20 |     doctest.REPORT_NDIFF)
21 |
22 |
23 | # def additional_tests():
24 | #     "Run the doc tests (README.txt and docs/*, if any exist)"
25 | #     doctest_files = [
26 | #         os.path.abspath(resource_filename('bs4', 'README.txt'))]
27 | #     if resource_exists('bs4', 'docs'):
28 | #         for name in resource_listdir('bs4', 'docs'):
29 | #             if name.endswith('.txt'):
30 | #                 doctest_files.append(
31 | #                     os.path.abspath(
32 | #                         resource_filename('bs4', 'docs/%s' % name)))
33 | #     kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
34 | #     atexit.register(cleanup_resources)
35 | #     return unittest.TestSuite((
36 | #         doctest.DocFileSuite(*doctest_files, **kwargs)))
37 | -------------------------------------------------------------------------------- /workflow/__init__.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | #
4 | # Copyright (c) 2014 Dean Jackson <deanishe@deanishe.net>
5 | #
6 | # MIT Licence. See http://opensource.org/licenses/MIT
7 | #
8 | # Created on 2014-02-15
9 | #
10 |
11 | """
12 | A Python helper library for `Alfred 2 <http://www.alfredapp.com/>`_ Workflow
13 | authors.
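A minimal script filter, sketched after the pattern ``ListArticle.py`` in
this repository follows::

    import sys
    from workflow import Workflow

    def main(wf):
        # Add one result row, then send the XML feedback to Alfred.
        wf.add_item(title='Hello', subtitle='World', valid=True)
        wf.send_feedback()

    if __name__ == '__main__':
        wf = Workflow()
        sys.exit(wf.run(main))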
14 | """ 15 | 16 | import os 17 | 18 | __title__ = 'Alfred-Workflow' 19 | __version__ = open(os.path.join(os.path.dirname(__file__), 'version')).read() 20 | __author__ = 'Dean Jackson' 21 | __licence__ = 'MIT' 22 | __copyright__ = 'Copyright 2014 Dean Jackson' 23 | 24 | 25 | # Workflow objects 26 | from .workflow import Workflow, manager 27 | 28 | # Exceptions 29 | from .workflow import PasswordNotFound, KeychainError 30 | 31 | # Icons 32 | from .workflow import ( 33 | ICON_ACCOUNT, 34 | ICON_BURN, 35 | ICON_CLOCK, 36 | ICON_COLOR, 37 | ICON_COLOUR, 38 | ICON_EJECT, 39 | ICON_ERROR, 40 | ICON_FAVORITE, 41 | ICON_FAVOURITE, 42 | ICON_GROUP, 43 | ICON_HELP, 44 | ICON_HOME, 45 | ICON_INFO, 46 | ICON_NETWORK, 47 | ICON_NOTE, 48 | ICON_SETTINGS, 49 | ICON_SWIRL, 50 | ICON_SWITCH, 51 | ICON_SYNC, 52 | ICON_TRASH, 53 | ICON_USER, 54 | ICON_WARNING, 55 | ICON_WEB, 56 | ) 57 | 58 | # Filter matching rules 59 | from .workflow import ( 60 | MATCH_ALL, 61 | MATCH_ALLCHARS, 62 | MATCH_ATOM, 63 | MATCH_CAPITALS, 64 | MATCH_INITIALS, 65 | MATCH_INITIALS_CONTAIN, 66 | MATCH_INITIALS_STARTSWITH, 67 | MATCH_STARTSWITH, 68 | MATCH_SUBSTRING, 69 | ) 70 | 71 | __all__ = [ 72 | 'Workflow', 73 | 'manager', 74 | 'PasswordNotFound', 75 | 'KeychainError', 76 | 'ICON_ACCOUNT', 77 | 'ICON_BURN', 78 | 'ICON_CLOCK', 79 | 'ICON_COLOR', 80 | 'ICON_COLOUR', 81 | 'ICON_EJECT', 82 | 'ICON_ERROR', 83 | 'ICON_FAVORITE', 84 | 'ICON_FAVOURITE', 85 | 'ICON_GROUP', 86 | 'ICON_HELP', 87 | 'ICON_HOME', 88 | 'ICON_INFO', 89 | 'ICON_NETWORK', 90 | 'ICON_NOTE', 91 | 'ICON_SETTINGS', 92 | 'ICON_SWIRL', 93 | 'ICON_SWITCH', 94 | 'ICON_SYNC', 95 | 'ICON_TRASH', 96 | 'ICON_USER', 97 | 'ICON_WARNING', 98 | 'ICON_WEB', 99 | 'MATCH_ALL', 100 | 'MATCH_ALLCHARS', 101 | 'MATCH_ATOM', 102 | 'MATCH_CAPITALS', 103 | 'MATCH_INITIALS', 104 | 'MATCH_INITIALS_CONTAIN', 105 | 'MATCH_INITIALS_STARTSWITH', 106 | 'MATCH_STARTSWITH', 107 | 'MATCH_SUBSTRING', 108 | ] 109 | -------------------------------------------------------------------------------- /bs4/tests/test_lxml.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the lxml tree builder generates good trees.""" 2 | 3 | import re 4 | import warnings 5 | 6 | try: 7 | import lxml.etree 8 | LXML_PRESENT = True 9 | LXML_VERSION = lxml.etree.LXML_VERSION 10 | except ImportError, e: 11 | LXML_PRESENT = False 12 | LXML_VERSION = (0,) 13 | 14 | if LXML_PRESENT: 15 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 16 | 17 | from bs4 import ( 18 | BeautifulSoup, 19 | BeautifulStoneSoup, 20 | ) 21 | from bs4.element import Comment, Doctype, SoupStrainer 22 | from bs4.testing import skipIf 23 | from bs4.tests import test_htmlparser 24 | from bs4.testing import ( 25 | HTMLTreeBuilderSmokeTest, 26 | XMLTreeBuilderSmokeTest, 27 | SoupTest, 28 | skipIf, 29 | ) 30 | 31 | @skipIf( 32 | not LXML_PRESENT, 33 | "lxml seems not to be present, not testing its tree builder.") 34 | class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 35 | """See ``HTMLTreeBuilderSmokeTest``.""" 36 | 37 | @property 38 | def default_builder(self): 39 | return LXMLTreeBuilder() 40 | 41 | def test_out_of_range_entity(self): 42 | self.assertSoupEquals( 43 | "
<p>foo&#10000000000000;bar</p>", "<p>foobar</p>
") 44 | self.assertSoupEquals( 45 | "
<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>
") 46 | self.assertSoupEquals( 47 | "
<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
48 |
49 |     # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
50 |     # test if an old version of lxml is installed.
51 |
52 |     @skipIf(
53 |         not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
54 |         "Skipping doctype test for old version of lxml to avoid segfault.")
55 |     def test_empty_doctype(self):
56 |         soup = self.soup("<!DOCTYPE>")
57 |         doctype = soup.contents[0]
58 |         self.assertEqual("", doctype.strip())
59 |
60 |     def test_beautifulstonesoup_is_xml_parser(self):
61 |         # Make sure that the deprecated BSS class uses an xml builder
62 |         # if one is installed.
63 |         with warnings.catch_warnings(record=True) as w:
64 |             soup = BeautifulStoneSoup("<b />")
65 |         self.assertEqual(u"<b/>", unicode(soup.b))
66 |         self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
67 |
68 | @skipIf(
69 |     not LXML_PRESENT,
70 |     "lxml seems not to be present, not testing its XML tree builder.")
71 | class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
72 |     """See ``HTMLTreeBuilderSmokeTest``."""
73 |
74 |     @property
75 |     def default_builder(self):
76 |         return LXMLTreeBuilderForXML()
77 | -------------------------------------------------------------------------------- /bs4/tests/test_html5lib.py: --------------------------------------------------------------------------------
1 | """Tests to ensure that the html5lib tree builder generates good trees."""
2 |
3 | import warnings
4 |
5 | try:
6 |     from bs4.builder import HTML5TreeBuilder
7 |     HTML5LIB_PRESENT = True
8 | except ImportError, e:
9 |     HTML5LIB_PRESENT = False
10 | from bs4.element import SoupStrainer
11 | from bs4.testing import (
12 |     HTML5TreeBuilderSmokeTest,
13 |     SoupTest,
14 |     skipIf,
15 | )
16 |
17 | @skipIf(
18 |     not HTML5LIB_PRESENT,
19 |     "html5lib seems not to be present, not testing its tree builder.")
20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
21 |     """See ``HTML5TreeBuilderSmokeTest``."""
22 |
23 |     @property
24 |     def default_builder(self):
25 |         return HTML5TreeBuilder()
26 |
27 |     def test_soupstrainer(self):
28 |         # The html5lib tree builder does not support SoupStrainers.
29 |         strainer = SoupStrainer("b")
30 |         markup = "
<p>A <b>bold</b> statement.</p>"
31 |         with warnings.catch_warnings(record=True) as w:
32 |             soup = self.soup(markup, parse_only=strainer)
33 |         self.assertEqual(
34 |             soup.decode(), self.document_for(markup))
35 |
36 |         self.assertTrue(
37 |             "the html5lib tree builder doesn't support parse_only" in
38 |             str(w[0].message))
39 |
40 |     def test_correctly_nested_tables(self):
41 |         """html5lib inserts <tbody> tags where other parsers don't."""
42 |         markup = ('<table id="1">'
43 |                   '<tr>'
44 |                   "<td>Here's another table:"
45 |                   '<table id="2">'
46 |                   '<tr><td>foo</td></tr>'
47 |                   '</table></td>')
48 |
49 |         self.assertSoupEquals(
50 |             markup,
51 |             '<table id="1"><tbody><tr><td>Here\'s another table:'
52 |             '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
53 |             '</td></tr></tbody></table>')
54 |
55 |         self.assertSoupEquals(
56 |             "<table><thead><tr><td>Foo</td></tr></thead>"
57 |             "<tbody><tr><td>Bar</td></tr></tbody>"
58 |             "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
59 |
60 |     def test_xml_declaration_followed_by_doctype(self):
61 |         markup = '''<?xml version="1.0" encoding="utf-8"?>
62 | <!DOCTYPE html>
63 | <html>
64 |   <head>
65 |   </head>
66 |   <body>
67 |    <p>foo</p>
68 |   </body>
69 | </html>'''
70 |         soup = self.soup(markup)
71 |         # Verify that we can reach the <p> tag; this means the tree is connected.
72 |         self.assertEqual(b"<p>foo</p>
", soup.p.encode()) 73 | 74 | def test_reparented_markup(self): 75 | markup = '
<p><em>foo</p>\n<p>bar<a></a></em></p>
' 76 | soup = self.soup(markup) 77 | self.assertEqual(u"
<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>
", soup.body.decode()) 78 | self.assertEqual(2, len(soup.find_all('p'))) 79 | 80 | 81 | def test_reparented_markup_ends_with_whitespace(self): 82 | markup = '
<p><em>foo</p>\n<p>bar<a></a></em></p>
\n' 83 | soup = self.soup(markup) 84 | self.assertEqual(u"
<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>
\n</body>", soup.body.decode())
85 |         self.assertEqual(2, len(soup.find_all('p')))
86 |
87 |     def test_processing_instruction(self):
88 |         """Processing instructions become comments."""
89 |         markup = b"""<?PITarget PIContent?>"""
90 |         soup = self.soup(markup)
91 |         assert str(soup).startswith("<!--?PITarget PIContent?-->")
92 |
93 |     def test_cloned_multivalue_node(self):
94 |         markup = b"""
<a class="my_class"><p></a>
""" 95 | soup = self.soup(markup) 96 | a1, a2 = soup.find_all('a') 97 | self.assertEqual(a1, a2) 98 | assert a1 is not a2 99 | -------------------------------------------------------------------------------- /xpinyin/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | 5 | import os.path 6 | import re 7 | 8 | PinyinToneMark = { 9 | 0: u"aoeiuv\u00fc", 10 | 1: u"\u0101\u014d\u0113\u012b\u016b\u01d6\u01d6", 11 | 2: u"\u00e1\u00f3\u00e9\u00ed\u00fa\u01d8\u01d8", 12 | 3: u"\u01ce\u01d2\u011b\u01d0\u01d4\u01da\u01da", 13 | 4: u"\u00e0\u00f2\u00e8\u00ec\u00f9\u01dc\u01dc", 14 | } 15 | 16 | 17 | class Pinyin(object): 18 | 19 | """translate chinese hanzi to pinyin by python, inspired by flyerhzm’s 20 | `chinese\_pinyin`_ gem 21 | 22 | usage 23 | ----- 24 | :: 25 | 26 | >>> from xpinyin import Pinyin 27 | >>> p = Pinyin() 28 | >>> # default splitter is `-` 29 | >>> p.get_pinyin(u"上海") 30 | 'shang-hai' 31 | >>> # show tone marks 32 | >>> p.get_pinyin(u"上海", show_tone_marks=True) 33 | 'shàng-hǎi' 34 | >>> # remove splitter 35 | >>> p.get_pinyin(u"上海", '') 36 | 'shanghai' 37 | >>> # set splitter as whitespace 38 | >>> p.get_pinyin(u"上海", ' ') 39 | 'shang hai' 40 | >>> p.get_initial(u"上") 41 | 'S' 42 | >>> p.get_initials(u"上海") 43 | 'S-H' 44 | >>> p.get_initials(u"上海", u'') 45 | 'SH' 46 | >>> p.get_initials(u"上海", u' ') 47 | 'S H' 48 | 49 | 请输入utf8编码汉字 50 | .. _chinese\_pinyin: https://github.com/flyerhzm/chinese_pinyin 51 | """ 52 | 53 | data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 54 | 'Mandarin.dat') 55 | 56 | def __init__(self, data_path=data_path): 57 | self.dict = {} 58 | with open(data_path) as f: 59 | for line in f: 60 | k, v = line.split('\t') 61 | self.dict[k] = v 62 | 63 | @staticmethod 64 | def decode_pinyin(s): 65 | s = s.lower() 66 | r = "" 67 | t = "" 68 | for c in s: 69 | if "a" <= c <= 'z': 70 | t += c 71 | elif c == ':': 72 | assert t[-1] == 'u' 73 | t = t[:-1] + "\u00fc" 74 | else: 75 | if '0' <= c <= '5': 76 | tone = int(c) % 5 77 | if tone != 0: 78 | m = re.search("[aoeiuv\u00fc]+", t) 79 | if m is None: 80 | # pass when no vowels find yet 81 | t += c 82 | elif len(m.group(0)) == 1: 83 | # if just find one vowels, put the mark on it 84 | t = t[:m.start(0)] \ 85 | + PinyinToneMark[tone][PinyinToneMark[0].index(m.group(0))] \ 86 | + t[m.end(0):] 87 | else: 88 | # mark on vowels which search with "a, o, e" one by one 89 | # when "i" and "u" stand together, make the vowels behind 90 | for num, vowels in enumerate(("a", "o", "e", "ui", "iu")): 91 | if vowels in t: 92 | t = t.replace(vowels[-1], PinyinToneMark[tone][num]) 93 | break 94 | else: 95 | t += "!" 
96 | r += t 97 | t = "" 98 | r += t 99 | return r 100 | 101 | @staticmethod 102 | def convert_pinyin(word, convert): 103 | if convert == 'capitalize': 104 | return word.capitalize() 105 | if convert == 'lower': 106 | return word.lower() 107 | if convert == 'upper': 108 | return word.upper() 109 | 110 | def get_pinyin(self, chars=u'你好', splitter=u'-', 111 | show_tone_marks=False, convert='lower'): 112 | result = [] 113 | flag = 1 114 | for char in chars: 115 | key = "%X" % ord(char) 116 | try: 117 | if show_tone_marks: 118 | word = self.decode_pinyin(self.dict[key].split()[0].strip()) 119 | else: 120 | word = self.dict[key].split()[0].strip()[:-1] 121 | word = self.convert_pinyin(word, convert) 122 | result.append(word) 123 | flag = 1 124 | except KeyError: 125 | if flag: 126 | result.append(char) 127 | else: 128 | result[-1] += char 129 | flag = 0 130 | return splitter.join(result) 131 | 132 | def get_initial(self, char=u'你'): 133 | try: 134 | return self.dict["%X" % ord(char)].split(" ")[0][0] 135 | except KeyError: 136 | return char 137 | 138 | def get_initials(self, chars=u'你好', splitter=u'-'): 139 | result = [] 140 | flag = 1 141 | for char in chars: 142 | try: 143 | result.append(self.dict["%X" % ord(char)].split(" ")[0][0]) 144 | flag = 1 145 | except KeyError: 146 | if flag: 147 | result.append(char) 148 | else: 149 | result[-1] += char 150 | 151 | return splitter.join(result) 152 | -------------------------------------------------------------------------------- /bs4/tests/test_builder_registry.py: -------------------------------------------------------------------------------- 1 | """Tests of the builder registry.""" 2 | 3 | import unittest 4 | import warnings 5 | 6 | from bs4 import BeautifulSoup 7 | from bs4.builder import ( 8 | builder_registry as registry, 9 | HTMLParserTreeBuilder, 10 | TreeBuilderRegistry, 11 | ) 12 | 13 | try: 14 | from bs4.builder import HTML5TreeBuilder 15 | HTML5LIB_PRESENT = True 16 | except ImportError: 17 | HTML5LIB_PRESENT = False 18 | 19 | try: 20 | from bs4.builder import ( 21 | LXMLTreeBuilderForXML, 22 | LXMLTreeBuilder, 23 | ) 24 | LXML_PRESENT = True 25 | except ImportError: 26 | LXML_PRESENT = False 27 | 28 | 29 | class BuiltInRegistryTest(unittest.TestCase): 30 | """Test the built-in registry with the default builders registered.""" 31 | 32 | def test_combination(self): 33 | if LXML_PRESENT: 34 | self.assertEqual(registry.lookup('fast', 'html'), 35 | LXMLTreeBuilder) 36 | 37 | if LXML_PRESENT: 38 | self.assertEqual(registry.lookup('permissive', 'xml'), 39 | LXMLTreeBuilderForXML) 40 | self.assertEqual(registry.lookup('strict', 'html'), 41 | HTMLParserTreeBuilder) 42 | if HTML5LIB_PRESENT: 43 | self.assertEqual(registry.lookup('html5lib', 'html'), 44 | HTML5TreeBuilder) 45 | 46 | def test_lookup_by_markup_type(self): 47 | if LXML_PRESENT: 48 | self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) 49 | self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) 50 | else: 51 | self.assertEqual(registry.lookup('xml'), None) 52 | if HTML5LIB_PRESENT: 53 | self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) 54 | else: 55 | self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) 56 | 57 | def test_named_library(self): 58 | if LXML_PRESENT: 59 | self.assertEqual(registry.lookup('lxml', 'xml'), 60 | LXMLTreeBuilderForXML) 61 | self.assertEqual(registry.lookup('lxml', 'html'), 62 | LXMLTreeBuilder) 63 | if HTML5LIB_PRESENT: 64 | self.assertEqual(registry.lookup('html5lib'), 65 | HTML5TreeBuilder) 66 | 67 | 
self.assertEqual(registry.lookup('html.parser'), 68 | HTMLParserTreeBuilder) 69 | 70 | def test_beautifulsoup_constructor_does_lookup(self): 71 | 72 | with warnings.catch_warnings(record=True) as w: 73 | # This will create a warning about not explicitly 74 | # specifying a parser, but we'll ignore it. 75 | 76 | # You can pass in a string. 77 | BeautifulSoup("", features="html") 78 | # Or a list of strings. 79 | BeautifulSoup("", features=["html", "fast"]) 80 | 81 | # You'll get an exception if BS can't find an appropriate 82 | # builder. 83 | self.assertRaises(ValueError, BeautifulSoup, 84 | "", features="no-such-feature") 85 | 86 | class RegistryTest(unittest.TestCase): 87 | """Test the TreeBuilderRegistry class in general.""" 88 | 89 | def setUp(self): 90 | self.registry = TreeBuilderRegistry() 91 | 92 | def builder_for_features(self, *feature_list): 93 | cls = type('Builder_' + '_'.join(feature_list), 94 | (object,), {'features' : feature_list}) 95 | 96 | self.registry.register(cls) 97 | return cls 98 | 99 | def test_register_with_no_features(self): 100 | builder = self.builder_for_features() 101 | 102 | # Since the builder advertises no features, you can't find it 103 | # by looking up features. 104 | self.assertEqual(self.registry.lookup('foo'), None) 105 | 106 | # But you can find it by doing a lookup with no features, if 107 | # this happens to be the only registered builder. 108 | self.assertEqual(self.registry.lookup(), builder) 109 | 110 | def test_register_with_features_makes_lookup_succeed(self): 111 | builder = self.builder_for_features('foo', 'bar') 112 | self.assertEqual(self.registry.lookup('foo'), builder) 113 | self.assertEqual(self.registry.lookup('bar'), builder) 114 | 115 | def test_lookup_fails_when_no_builder_implements_feature(self): 116 | builder = self.builder_for_features('foo', 'bar') 117 | self.assertEqual(self.registry.lookup('baz'), None) 118 | 119 | def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): 120 | builder1 = self.builder_for_features('foo') 121 | builder2 = self.builder_for_features('bar') 122 | self.assertEqual(self.registry.lookup(), builder2) 123 | 124 | def test_lookup_fails_when_no_tree_builders_registered(self): 125 | self.assertEqual(self.registry.lookup(), None) 126 | 127 | def test_lookup_gets_most_recent_builder_supporting_all_features(self): 128 | has_one = self.builder_for_features('foo') 129 | has_the_other = self.builder_for_features('bar') 130 | has_both_early = self.builder_for_features('foo', 'bar', 'baz') 131 | has_both_late = self.builder_for_features('foo', 'bar', 'quux') 132 | lacks_one = self.builder_for_features('bar') 133 | has_the_other = self.builder_for_features('foo') 134 | 135 | # There are two builders featuring 'foo' and 'bar', but 136 | # the one that also features 'quux' was registered later. 137 | self.assertEqual(self.registry.lookup('foo', 'bar'), 138 | has_both_late) 139 | 140 | # There is only one builder featuring 'foo', 'bar', and 'baz'. 
141 | self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'), 142 | has_both_early) 143 | 144 | def test_lookup_fails_when_cannot_reconcile_requested_features(self): 145 | builder1 = self.builder_for_features('foo', 'bar') 146 | builder2 = self.builder_for_features('foo', 'baz') 147 | self.assertEqual(self.registry.lookup('bar', 'baz'), None) 148 | -------------------------------------------------------------------------------- /bs4/diagnose.py: -------------------------------------------------------------------------------- 1 | """Diagnostic functions, mainly for use when doing tech support.""" 2 | 3 | __license__ = "MIT" 4 | 5 | import cProfile 6 | from StringIO import StringIO 7 | from HTMLParser import HTMLParser 8 | import bs4 9 | from bs4 import BeautifulSoup, __version__ 10 | from bs4.builder import builder_registry 11 | 12 | import os 13 | import pstats 14 | import random 15 | import tempfile 16 | import time 17 | import traceback 18 | import sys 19 | import cProfile 20 | 21 | def diagnose(data): 22 | """Diagnostic suite for isolating common problems.""" 23 | print "Diagnostic running on Beautiful Soup %s" % __version__ 24 | print "Python version %s" % sys.version 25 | 26 | basic_parsers = ["html.parser", "html5lib", "lxml"] 27 | for name in basic_parsers: 28 | for builder in builder_registry.builders: 29 | if name in builder.features: 30 | break 31 | else: 32 | basic_parsers.remove(name) 33 | print ( 34 | "I noticed that %s is not installed. Installing it may help." % 35 | name) 36 | 37 | if 'lxml' in basic_parsers: 38 | basic_parsers.append(["lxml", "xml"]) 39 | try: 40 | from lxml import etree 41 | print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) 42 | except ImportError, e: 43 | print ( 44 | "lxml is not installed or couldn't be imported.") 45 | 46 | 47 | if 'html5lib' in basic_parsers: 48 | try: 49 | import html5lib 50 | print "Found html5lib version %s" % html5lib.__version__ 51 | except ImportError, e: 52 | print ( 53 | "html5lib is not installed or couldn't be imported.") 54 | 55 | if hasattr(data, 'read'): 56 | data = data.read() 57 | elif os.path.exists(data): 58 | print '"%s" looks like a filename. Reading data from the file.' % data 59 | data = open(data).read() 60 | elif data.startswith("http:") or data.startswith("https:"): 61 | print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data 62 | print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." 63 | return 64 | print 65 | 66 | for parser in basic_parsers: 67 | print "Trying to parse your markup with %s" % parser 68 | success = False 69 | try: 70 | soup = BeautifulSoup(data, parser) 71 | success = True 72 | except Exception, e: 73 | print "%s could not parse the markup." % parser 74 | traceback.print_exc() 75 | if success: 76 | print "Here's what %s did with the markup:" % parser 77 | print soup.prettify() 78 | 79 | print "-" * 80 80 | 81 | def lxml_trace(data, html=True, **kwargs): 82 | """Print out the lxml events that occur during parsing. 83 | 84 | This lets you see how lxml parses a document when no Beautiful 85 | Soup code is running. 
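    Example (a sketch; any small markup string will do)::

        lxml_trace("<p>foo</p>")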
86 |     """
87 |     from lxml import etree
88 |     for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
89 |         print("%s, %4s, %s" % (event, element.tag, element.text))
90 |
91 | class AnnouncingParser(HTMLParser):
92 |     """Announces HTMLParser parse events, without doing anything else."""
93 |
94 |     def _p(self, s):
95 |         print(s)
96 |
97 |     def handle_starttag(self, name, attrs):
98 |         self._p("%s START" % name)
99 |
100 |     def handle_endtag(self, name):
101 |         self._p("%s END" % name)
102 |
103 |     def handle_data(self, data):
104 |         self._p("%s DATA" % data)
105 |
106 |     def handle_charref(self, name):
107 |         self._p("%s CHARREF" % name)
108 |
109 |     def handle_entityref(self, name):
110 |         self._p("%s ENTITYREF" % name)
111 |
112 |     def handle_comment(self, data):
113 |         self._p("%s COMMENT" % data)
114 |
115 |     def handle_decl(self, data):
116 |         self._p("%s DECL" % data)
117 |
118 |     def unknown_decl(self, data):
119 |         self._p("%s UNKNOWN-DECL" % data)
120 |
121 |     def handle_pi(self, data):
122 |         self._p("%s PI" % data)
123 |
124 | def htmlparser_trace(data):
125 |     """Print out the HTMLParser events that occur during parsing.
126 |
127 |     This lets you see how HTMLParser parses a document when no
128 |     Beautiful Soup code is running.
129 |     """
130 |     parser = AnnouncingParser()
131 |     parser.feed(data)
132 |
133 | _vowels = "aeiou"
134 | _consonants = "bcdfghjklmnpqrstvwxyz"
135 |
136 | def rword(length=5):
137 |     "Generate a random word-like string."
138 |     s = ''
139 |     for i in range(length):
140 |         if i % 2 == 0:
141 |             t = _consonants
142 |         else:
143 |             t = _vowels
144 |         s += random.choice(t)
145 |     return s
146 |
147 | def rsentence(length=4):
148 |     "Generate a random sentence-like string."
149 |     return " ".join(rword(random.randint(4,9)) for i in range(length))
150 |
151 | def rdoc(num_elements=1000):
152 |     """Randomly generate an invalid HTML document."""
153 |     tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
154 |     elements = []
155 |     for i in range(num_elements):
156 |         choice = random.randint(0,3)
157 |         if choice == 0:
158 |             # New tag.
159 |             tag_name = random.choice(tag_names)
160 |             elements.append("<%s>" % tag_name)
161 |         elif choice == 1:
162 |             elements.append(rsentence(random.randint(1,4)))
163 |         elif choice == 2:
164 |             # Close a tag.
165 |             tag_name = random.choice(tag_names)
166 |             elements.append("</%s>" % tag_name)
167 |     return "<html>" + "\n".join(elements) + "</html>"
168 |
169 | def benchmark_parsers(num_elements=100000):
170 |     """Very basic head-to-head performance benchmark."""
171 |     print "Comparative parser benchmark on Beautiful Soup %s" % __version__
172 |     data = rdoc(num_elements)
173 |     print "Generated a large invalid HTML document (%d bytes)." % len(data)
174 |
175 |     for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
176 |         success = False
177 |         try:
178 |             a = time.time()
179 |             soup = BeautifulSoup(data, parser)
180 |             b = time.time()
181 |             success = True
182 |         except Exception, e:
183 |             print "%s could not parse the markup." % parser
184 |             traceback.print_exc()
185 |         if success:
186 |             print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
187 |
188 |     from lxml import etree
189 |     a = time.time()
190 |     etree.HTML(data)
191 |     b = time.time()
192 |     print "Raw lxml parsed the markup in %.2fs." % (b-a)
193 |
194 |     import html5lib
195 |     parser = html5lib.HTMLParser()
196 |     a = time.time()
197 |     parser.parse(data)
198 |     b = time.time()
199 |     print "Raw html5lib parsed the markup in %.2fs."
% (b-a) 200 | 201 | def profile(num_elements=100000, parser="lxml"): 202 | 203 | filehandle = tempfile.NamedTemporaryFile() 204 | filename = filehandle.name 205 | 206 | data = rdoc(num_elements) 207 | vars = dict(bs4=bs4, data=data, parser=parser) 208 | cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) 209 | 210 | stats = pstats.Stats(filename) 211 | # stats.strip_dirs() 212 | stats.sort_stats("cumulative") 213 | stats.print_stats('_html5lib|bs4', 50) 214 | 215 | if __name__ == '__main__': 216 | diagnose(sys.stdin.read()) 217 | -------------------------------------------------------------------------------- /workflow/background.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | # 4 | # Copyright (c) 2014 deanishe@deanishe.net 5 | # 6 | # MIT Licence. See http://opensource.org/licenses/MIT 7 | # 8 | # Created on 2014-04-06 9 | # 10 | 11 | """ 12 | Run background tasks 13 | """ 14 | 15 | from __future__ import print_function, unicode_literals 16 | 17 | import sys 18 | import os 19 | import subprocess 20 | import pickle 21 | 22 | from workflow import Workflow 23 | 24 | __all__ = ['is_running', 'run_in_background'] 25 | 26 | _wf = None 27 | 28 | 29 | def wf(): 30 | global _wf 31 | if _wf is None: 32 | _wf = Workflow() 33 | return _wf 34 | 35 | 36 | def _arg_cache(name): 37 | """Return path to pickle cache file for arguments 38 | 39 | :param name: name of task 40 | :type name: ``unicode`` 41 | :returns: Path to cache file 42 | :rtype: ``unicode`` filepath 43 | 44 | """ 45 | 46 | return wf().cachefile('{0}.argcache'.format(name)) 47 | 48 | 49 | def _pid_file(name): 50 | """Return path to PID file for ``name`` 51 | 52 | :param name: name of task 53 | :type name: ``unicode`` 54 | :returns: Path to PID file for task 55 | :rtype: ``unicode`` filepath 56 | 57 | """ 58 | 59 | return wf().cachefile('{0}.pid'.format(name)) 60 | 61 | 62 | def _process_exists(pid): 63 | """Check if a process with PID ``pid`` exists 64 | 65 | :param pid: PID to check 66 | :type pid: ``int`` 67 | :returns: ``True`` if process exists, else ``False`` 68 | :rtype: ``Boolean`` 69 | """ 70 | 71 | try: 72 | os.kill(pid, 0) 73 | except OSError: # not running 74 | return False 75 | return True 76 | 77 | 78 | def is_running(name): 79 | """ 80 | Test whether task is running under ``name`` 81 | 82 | :param name: name of task 83 | :type name: ``unicode`` 84 | :returns: ``True`` if task with name ``name`` is running, else ``False`` 85 | :rtype: ``Boolean`` 86 | 87 | """ 88 | pidfile = _pid_file(name) 89 | if not os.path.exists(pidfile): 90 | return False 91 | 92 | with open(pidfile, 'rb') as file_obj: 93 | pid = int(file_obj.read().strip()) 94 | 95 | if _process_exists(pid): 96 | return True 97 | 98 | elif os.path.exists(pidfile): 99 | os.unlink(pidfile) 100 | 101 | return False 102 | 103 | 104 | def _background(stdin='/dev/null', stdout='/dev/null', 105 | stderr='/dev/null'): # pragma: no cover 106 | """Fork the current process into a background daemon. 107 | 108 | :param stdin: where to read input 109 | :type stdin: filepath 110 | :param stdout: where to write stdout output 111 | :type stdout: filepath 112 | :param stderr: where to write stderr output 113 | :type stderr: filepath 114 | 115 | """ 116 | 117 | # Do first fork. 118 | try: 119 | pid = os.fork() 120 | if pid > 0: 121 | sys.exit(0) # Exit first parent. 
122 | except OSError as e: 123 | wf().logger.critical("fork #1 failed: ({0:d}) {1}".format( 124 | e.errno, e.strerror)) 125 | sys.exit(1) 126 | # Decouple from parent environment. 127 | os.chdir(wf().workflowdir) 128 | os.umask(0) 129 | os.setsid() 130 | # Do second fork. 131 | try: 132 | pid = os.fork() 133 | if pid > 0: 134 | sys.exit(0) # Exit second parent. 135 | except OSError as e: 136 | wf().logger.critical("fork #2 failed: ({0:d}) {1}".format( 137 | e.errno, e.strerror)) 138 | sys.exit(1) 139 | # Now I am a daemon! 140 | # Redirect standard file descriptors. 141 | si = file(stdin, 'r', 0) 142 | so = file(stdout, 'a+', 0) 143 | se = file(stderr, 'a+', 0) 144 | if hasattr(sys.stdin, 'fileno'): 145 | os.dup2(si.fileno(), sys.stdin.fileno()) 146 | if hasattr(sys.stdout, 'fileno'): 147 | os.dup2(so.fileno(), sys.stdout.fileno()) 148 | if hasattr(sys.stderr, 'fileno'): 149 | os.dup2(se.fileno(), sys.stderr.fileno()) 150 | 151 | 152 | def run_in_background(name, args, **kwargs): 153 | """Pickle arguments to cache file, then call this script again via 154 | :func:`subprocess.call`. 155 | 156 | :param name: name of task 157 | :type name: ``unicode`` 158 | :param args: arguments passed as first argument to :func:`subprocess.call` 159 | :param \**kwargs: keyword arguments to :func:`subprocess.call` 160 | :returns: exit code of sub-process 161 | :rtype: ``int`` 162 | 163 | When you call this function, it caches its arguments and then calls 164 | ``background.py`` in a subprocess. The Python subprocess will load the 165 | cached arguments, fork into the background, and then run the command you 166 | specified. 167 | 168 | This function will return as soon as the ``background.py`` subprocess has 169 | forked, returning the exit code of *that* process (i.e. not of the command 170 | you're trying to run). 171 | 172 | If that process fails, an error will be written to the log file. 173 | 174 | If a process is already running under the same name, this function will 175 | return immediately and will not run the specified command. 
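    Example (a sketch; ``export.py`` stands in for any script you want
    to run without blocking Alfred)::

        from workflow.background import run_in_background, is_running

        if not is_running('export'):
            run_in_background('export', ['/usr/bin/python', 'export.py'])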
176 | 177 | """ 178 | 179 | if is_running(name): 180 | wf().logger.info('Task `{0}` is already running'.format(name)) 181 | return 182 | 183 | argcache = _arg_cache(name) 184 | 185 | # Cache arguments 186 | with open(argcache, 'wb') as file_obj: 187 | pickle.dump({'args': args, 'kwargs': kwargs}, file_obj) 188 | wf().logger.debug('Command arguments cached to `{0}`'.format(argcache)) 189 | 190 | # Call this script 191 | cmd = ['/usr/bin/python', __file__, name] 192 | wf().logger.debug('Calling {0!r} ...'.format(cmd)) 193 | retcode = subprocess.call(cmd) 194 | if retcode: # pragma: no cover 195 | wf().logger.error('Failed to call task in background') 196 | else: 197 | wf().logger.debug('Executing task `{0}` in background...'.format(name)) 198 | return retcode 199 | 200 | 201 | def main(wf): # pragma: no cover 202 | """ 203 | Load cached arguments, fork into background, then call 204 | :meth:`subprocess.call` with cached arguments 205 | 206 | """ 207 | 208 | name = wf.args[0] 209 | argcache = _arg_cache(name) 210 | if not os.path.exists(argcache): 211 | wf.logger.critical('No arg cache found : {0!r}'.format(argcache)) 212 | return 1 213 | 214 | # Load cached arguments 215 | with open(argcache, 'rb') as file_obj: 216 | data = pickle.load(file_obj) 217 | 218 | # Cached arguments 219 | args = data['args'] 220 | kwargs = data['kwargs'] 221 | 222 | # Delete argument cache file 223 | os.unlink(argcache) 224 | 225 | pidfile = _pid_file(name) 226 | 227 | # Fork to background 228 | _background() 229 | 230 | # Write PID to file 231 | with open(pidfile, 'wb') as file_obj: 232 | file_obj.write('{0}'.format(os.getpid())) 233 | 234 | # Run the command 235 | try: 236 | wf.logger.debug('Task `{0}` running'.format(name)) 237 | wf.logger.debug('cmd : {0!r}'.format(args)) 238 | 239 | retcode = subprocess.call(args, **kwargs) 240 | 241 | if retcode: 242 | wf.logger.error('Command failed with [{0}] : {1!r}'.format( 243 | retcode, args)) 244 | 245 | finally: 246 | if os.path.exists(pidfile): 247 | os.unlink(pidfile) 248 | wf.logger.debug('Task `{0}` finished'.format(name)) 249 | 250 | 251 | if __name__ == '__main__': # pragma: no cover 252 | wf().run(main) 253 | -------------------------------------------------------------------------------- /info.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | bundleid 6 | 7 | connections 8 | 9 | A27C006C-6D4A-4606-8ADB-7B07B6A0FCA9 10 | 11 | 12 | destinationuid 13 | 0EE06942-74AB-41FF-908C-FFBF35486C5D 14 | modifiers 15 | 0 16 | modifiersubtext 17 | 18 | vitoclose 19 | 20 | 21 | 22 | A89A4365-D0B5-46CE-A7B6-A1640961FC55 23 | 24 | 25 | destinationuid 26 | C79D2345-AEF5-4071-B770-C843BA8BCF93 27 | modifiers 28 | 0 29 | modifiersubtext 30 | 31 | vitoclose 32 | 33 | 34 | 35 | C79D2345-AEF5-4071-B770-C843BA8BCF93 36 | 37 | 38 | destinationuid 39 | EBDBF41A-8F1F-4AB3-A4DD-49A38F7999F6 40 | modifiers 41 | 0 42 | modifiersubtext 43 | 44 | vitoclose 45 | 46 | 47 | 48 | FA518468-89D6-4B36-845A-73865872F078 49 | 50 | 51 | destinationuid 52 | 6A616FEE-259D-4EB5-B155-5D8C64455C66 53 | modifiers 54 | 1048576 55 | modifiersubtext 56 | 推送静态站到origin master 57 | vitoclose 58 | 59 | 60 | 61 | destinationuid 62 | A27C006C-6D4A-4606-8ADB-7B07B6A0FCA9 63 | modifiers 64 | 0 65 | modifiersubtext 66 | 67 | vitoclose 68 | 69 | 70 | 71 | 72 | createdby 73 | Hao Li 74 | description 75 | 76 | disabled 77 | 78 | name 79 | Mweb-Blog 80 | objects 81 | 82 | 83 | config 84 | 85 | argumenttype 86 | 2 87 | keyword 88 | mblog 89 | subtext 90 | 本地浏览MWeb网站 
91 | text 92 | MWeb 辅助工具 93 | withspace 94 | 95 | 96 | type 97 | alfred.workflow.input.keyword 98 | uid 99 | FA518468-89D6-4B36-845A-73865872F078 100 | version 101 | 0 102 | 103 | 104 | config 105 | 106 | concurrently 107 | 108 | escaping 109 | 102 110 | script 111 | #!/bin/bash 112 | DIR="$site_path" 113 | echo "push blog", $DIR 114 | if [ -d "$DIR" ]; then 115 | cd ${DIR} 116 | git add -A 117 | git commit -m "Update" 118 | git push -u origin master 119 | fi 120 | scriptargtype 121 | 0 122 | scriptfile 123 | 124 | type 125 | 0 126 | 127 | type 128 | alfred.workflow.action.script 129 | uid 130 | 6A616FEE-259D-4EB5-B155-5D8C64455C66 131 | version 132 | 1 133 | 134 | 135 | config 136 | 137 | browser 138 | 139 | spaces 140 | 141 | url 142 | {query} 143 | utf8 144 | 145 | 146 | type 147 | alfred.workflow.action.openurl 148 | uid 149 | 0EE06942-74AB-41FF-908C-FFBF35486C5D 150 | version 151 | 0 152 | 153 | 154 | config 155 | 156 | concurrently 157 | 158 | escaping 159 | 102 160 | script 161 | #!/bin/bash 162 | xattr -c $site_path/index.html 163 | echo -n "file://$site_path/index.html" 164 | scriptargtype 165 | 0 166 | scriptfile 167 | 168 | type 169 | 0 170 | 171 | type 172 | alfred.workflow.action.script 173 | uid 174 | A27C006C-6D4A-4606-8ADB-7B07B6A0FCA9 175 | version 176 | 1 177 | 178 | 179 | config 180 | 181 | lastpathcomponent 182 | 183 | onlyshowifquerypopulated 184 | 185 | removeextension 186 | 187 | text 188 | {query} 189 | title 190 | 已复制到剪切版 191 | 192 | type 193 | alfred.workflow.output.notification 194 | uid 195 | EBDBF41A-8F1F-4AB3-A4DD-49A38F7999F6 196 | version 197 | 0 198 | 199 | 200 | config 201 | 202 | alfredfiltersresults 203 | 204 | argumenttype 205 | 1 206 | escaping 207 | 102 208 | keyword 209 | mlist 210 | queuedelaycustom 211 | 3 212 | queuedelayimmediatelyinitially 213 | 214 | queuedelaymode 215 | 0 216 | queuemode 217 | 1 218 | runningsubtext 219 | ... 
220 | script 221 | query=$1 222 | 223 | site_path="$site_path" 224 | 225 | python ListArticle.py "$site_path" "$query" 226 | scriptargtype 227 | 1 228 | scriptfile 229 | 230 | subtext 231 | 使用拼音搜索,选中后复制到剪切版 232 | title 233 | MWeb 静态网页文章列表 234 | type 235 | 0 236 | withspace 237 | 238 | 239 | type 240 | alfred.workflow.input.scriptfilter 241 | uid 242 | A89A4365-D0B5-46CE-A7B6-A1640961FC55 243 | version 244 | 1 245 | 246 | 247 | config 248 | 249 | autopaste 250 | 251 | clipboardtext 252 | {query} 253 | transient 254 | 255 | 256 | type 257 | alfred.workflow.output.clipboard 258 | uid 259 | C79D2345-AEF5-4071-B770-C843BA8BCF93 260 | version 261 | 1 262 | 263 | 264 | readme 265 | 266 | uidata 267 | 268 | 0EE06942-74AB-41FF-908C-FFBF35486C5D 269 | 270 | xpos 271 | 550 272 | ypos 273 | 200 274 | 275 | 6A616FEE-259D-4EB5-B155-5D8C64455C66 276 | 277 | xpos 278 | 550 279 | ypos 280 | 80 281 | 282 | A27C006C-6D4A-4606-8ADB-7B07B6A0FCA9 283 | 284 | xpos 285 | 370 286 | ypos 287 | 200 288 | 289 | A89A4365-D0B5-46CE-A7B6-A1640961FC55 290 | 291 | xpos 292 | 190 293 | ypos 294 | 340 295 | 296 | C79D2345-AEF5-4071-B770-C843BA8BCF93 297 | 298 | xpos 299 | 370 300 | ypos 301 | 340 302 | 303 | EBDBF41A-8F1F-4AB3-A4DD-49A38F7999F6 304 | 305 | xpos 306 | 550 307 | ypos 308 | 340 309 | 310 | FA518468-89D6-4B36-845A-73865872F078 311 | 312 | xpos 313 | 180 314 | ypos 315 | 80 316 | 317 | 318 | variables 319 | 320 | site_path 321 | /Users/hao/blog/MWeb-Blog/Blog 322 | 323 | variablesdontexport 324 | 325 | site_path 326 | 327 | version 328 | 329 | webaddress 330 | 331 | 332 | 333 | -------------------------------------------------------------------------------- /bs4/builder/_lxml.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'LXMLTreeBuilderForXML', 3 | 'LXMLTreeBuilder', 4 | ] 5 | 6 | from io import BytesIO 7 | from StringIO import StringIO 8 | import collections 9 | from lxml import etree 10 | from bs4.element import ( 11 | Comment, 12 | Doctype, 13 | NamespacedAttribute, 14 | ProcessingInstruction, 15 | ) 16 | from bs4.builder import ( 17 | FAST, 18 | HTML, 19 | HTMLTreeBuilder, 20 | PERMISSIVE, 21 | ParserRejectedMarkup, 22 | TreeBuilder, 23 | XML) 24 | from bs4.dammit import EncodingDetector 25 | 26 | LXML = 'lxml' 27 | 28 | class LXMLTreeBuilderForXML(TreeBuilder): 29 | DEFAULT_PARSER_CLASS = etree.XMLParser 30 | 31 | is_xml = True 32 | 33 | NAME = "lxml-xml" 34 | ALTERNATE_NAMES = ["xml"] 35 | 36 | # Well, it's permissive by XML parser standards. 37 | features = [NAME, LXML, XML, FAST, PERMISSIVE] 38 | 39 | CHUNK_SIZE = 512 40 | 41 | # This namespace mapping is specified in the XML Namespace 42 | # standard. 43 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 44 | 45 | def default_parser(self, encoding): 46 | # This can either return a parser object or a class, which 47 | # will be instantiated with default arguments. 48 | if self._default_parser is not None: 49 | return self._default_parser 50 | return etree.XMLParser( 51 | target=self, strip_cdata=False, recover=True, encoding=encoding) 52 | 53 | def parser_for(self, encoding): 54 | # Use the default parser. 
55 | parser = self.default_parser(encoding) 56 | 57 | if isinstance(parser, collections.Callable): 58 | # Instantiate the parser with default arguments 59 | parser = parser(target=self, strip_cdata=False, encoding=encoding) 60 | return parser 61 | 62 | def __init__(self, parser=None, empty_element_tags=None): 63 | # TODO: Issue a warning if parser is present but not a 64 | # callable, since that means there's no way to create new 65 | # parsers for different encodings. 66 | self._default_parser = parser 67 | if empty_element_tags is not None: 68 | self.empty_element_tags = set(empty_element_tags) 69 | self.soup = None 70 | self.nsmaps = [self.DEFAULT_NSMAPS] 71 | 72 | def _getNsTag(self, tag): 73 | # Split the namespace URL out of a fully-qualified lxml tag 74 | # name. Copied from lxml's src/lxml/sax.py. 75 | if tag[0] == '{': 76 | return tuple(tag[1:].split('}', 1)) 77 | else: 78 | return (None, tag) 79 | 80 | def prepare_markup(self, markup, user_specified_encoding=None, 81 | exclude_encodings=None, 82 | document_declared_encoding=None): 83 | """ 84 | :yield: A series of 4-tuples. 85 | (markup, encoding, declared encoding, 86 | has undergone character replacement) 87 | 88 | Each 4-tuple represents a strategy for parsing the document. 89 | """ 90 | if isinstance(markup, unicode): 91 | # We were given Unicode. Maybe lxml can parse Unicode on 92 | # this system? 93 | yield markup, None, document_declared_encoding, False 94 | 95 | if isinstance(markup, unicode): 96 | # No, apparently not. Convert the Unicode to UTF-8 and 97 | # tell lxml to parse it as UTF-8. 98 | yield (markup.encode("utf8"), "utf8", 99 | document_declared_encoding, False) 100 | 101 | # Instead of using UnicodeDammit to convert the bytestring to 102 | # Unicode using different encodings, use EncodingDetector to 103 | # iterate over the encodings, and tell lxml to try to parse 104 | # the document as each one in turn. 105 | is_html = not self.is_xml 106 | try_encodings = [user_specified_encoding, document_declared_encoding] 107 | detector = EncodingDetector( 108 | markup, try_encodings, is_html, exclude_encodings) 109 | for encoding in detector.encodings: 110 | yield (detector.markup, encoding, document_declared_encoding, False) 111 | 112 | def feed(self, markup): 113 | if isinstance(markup, bytes): 114 | markup = BytesIO(markup) 115 | elif isinstance(markup, unicode): 116 | markup = StringIO(markup) 117 | 118 | # Call feed() at least once, even if the markup is empty, 119 | # or the parser won't be initialized. 120 | data = markup.read(self.CHUNK_SIZE) 121 | try: 122 | self.parser = self.parser_for(self.soup.original_encoding) 123 | self.parser.feed(data) 124 | while len(data) != 0: 125 | # Now call feed() on the rest of the data, chunk by chunk. 126 | data = markup.read(self.CHUNK_SIZE) 127 | if len(data) != 0: 128 | self.parser.feed(data) 129 | self.parser.close() 130 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: 131 | raise ParserRejectedMarkup(str(e)) 132 | 133 | def close(self): 134 | self.nsmaps = [self.DEFAULT_NSMAPS] 135 | 136 | def start(self, name, attrs, nsmap={}): 137 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 138 | attrs = dict(attrs) 139 | nsprefix = None 140 | # Invert each namespace map as it comes in. 141 | if len(self.nsmaps) > 1: 142 | # There are no new namespaces for this tag, but 143 | # non-default namespaces are in play, so we need a 144 | # separate tag stack to know when they end. 
145 | self.nsmaps.append(None) 146 | elif len(nsmap) > 0: 147 | # A new namespace mapping has come into play. 148 | inverted_nsmap = dict((value, key) for key, value in nsmap.items()) 149 | self.nsmaps.append(inverted_nsmap) 150 | # Also treat the namespace mapping as a set of attributes on the 151 | # tag, so we can recreate it later. 152 | attrs = attrs.copy() 153 | for prefix, namespace in nsmap.items(): 154 | attribute = NamespacedAttribute( 155 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 156 | attrs[attribute] = namespace 157 | 158 | # Namespaces are in play. Find any attributes that came in 159 | # from lxml with namespaces attached to their names, and 160 | # turn then into NamespacedAttribute objects. 161 | new_attrs = {} 162 | for attr, value in attrs.items(): 163 | namespace, attr = self._getNsTag(attr) 164 | if namespace is None: 165 | new_attrs[attr] = value 166 | else: 167 | nsprefix = self._prefix_for_namespace(namespace) 168 | attr = NamespacedAttribute(nsprefix, attr, namespace) 169 | new_attrs[attr] = value 170 | attrs = new_attrs 171 | 172 | namespace, name = self._getNsTag(name) 173 | nsprefix = self._prefix_for_namespace(namespace) 174 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) 175 | 176 | def _prefix_for_namespace(self, namespace): 177 | """Find the currently active prefix for the given namespace.""" 178 | if namespace is None: 179 | return None 180 | for inverted_nsmap in reversed(self.nsmaps): 181 | if inverted_nsmap is not None and namespace in inverted_nsmap: 182 | return inverted_nsmap[namespace] 183 | return None 184 | 185 | def end(self, name): 186 | self.soup.endData() 187 | completed_tag = self.soup.tagStack[-1] 188 | namespace, name = self._getNsTag(name) 189 | nsprefix = None 190 | if namespace is not None: 191 | for inverted_nsmap in reversed(self.nsmaps): 192 | if inverted_nsmap is not None and namespace in inverted_nsmap: 193 | nsprefix = inverted_nsmap[namespace] 194 | break 195 | self.soup.handle_endtag(name, nsprefix) 196 | if len(self.nsmaps) > 1: 197 | # This tag, or one of its parents, introduced a namespace 198 | # mapping, so pop it off the stack. 199 | self.nsmaps.pop() 200 | 201 | def pi(self, target, data): 202 | self.soup.endData() 203 | self.soup.handle_data(target + ' ' + data) 204 | self.soup.endData(ProcessingInstruction) 205 | 206 | def data(self, content): 207 | self.soup.handle_data(content) 208 | 209 | def doctype(self, name, pubid, system): 210 | self.soup.endData() 211 | doctype = Doctype.for_name_and_ids(name, pubid, system) 212 | self.soup.object_was_parsed(doctype) 213 | 214 | def comment(self, content): 215 | "Handle comments as Comment objects." 
216 |         self.soup.endData()
217 |         self.soup.handle_data(content)
218 |         self.soup.endData(Comment)
219 |
220 |     def test_fragment_to_document(self, fragment):
221 |         """See `TreeBuilder`."""
222 |         return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
223 |
224 |
225 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
226 |
227 |     NAME = LXML
228 |     ALTERNATE_NAMES = ["lxml-html"]
229 |
230 |     features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
231 |     is_xml = False
232 |
233 |     def default_parser(self, encoding):
234 |         return etree.HTMLParser
235 |
236 |     def feed(self, markup):
237 |         encoding = self.soup.original_encoding
238 |         try:
239 |             self.parser = self.parser_for(encoding)
240 |             self.parser.feed(markup)
241 |             self.parser.close()
242 |         except (UnicodeDecodeError, LookupError, etree.ParserError), e:
243 |             raise ParserRejectedMarkup(str(e))
244 |
245 |
246 |     def test_fragment_to_document(self, fragment):
247 |         """See `TreeBuilder`."""
248 |         return u'<html><head></head><body>%s</body></html>' % fragment
249 | -------------------------------------------------------------------------------- /bs4/builder/_htmlparser.py: --------------------------------------------------------------------------------
1 | """Use the HTMLParser library to parse HTML files that aren't too bad."""
2 |
3 | __all__ = [
4 |     'HTMLParserTreeBuilder',
5 |     ]
6 |
7 | from HTMLParser import HTMLParser
8 |
9 | try:
10 |     from HTMLParser import HTMLParseError
11 | except ImportError, e:
12 |     # HTMLParseError is removed in Python 3.5. Since it can never be
13 |     # thrown in 3.5, we can just define our own class as a placeholder.
14 |     class HTMLParseError(Exception):
15 |         pass
16 |
17 | import sys
18 | import warnings
19 |
20 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
21 | # argument, which we'd like to set to False. Unfortunately,
22 | # http://bugs.python.org/issue13273 makes strict=True a better bet
23 | # before Python 3.2.3.
24 | #
25 | # At the end of this file, we monkeypatch HTMLParser so that
26 | # strict=True works well on Python 3.2.2.
27 | major, minor, release = sys.version_info[:3]
28 | CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
29 | CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
30 | CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
31 |
32 |
33 | from bs4.element import (
34 |     CData,
35 |     Comment,
36 |     Declaration,
37 |     Doctype,
38 |     ProcessingInstruction,
39 |     )
40 | from bs4.dammit import EntitySubstitution, UnicodeDammit
41 |
42 | from bs4.builder import (
43 |     HTML,
44 |     HTMLTreeBuilder,
45 |     STRICT,
46 |     )
47 |
48 |
49 | HTMLPARSER = 'html.parser'
50 |
51 | class BeautifulSoupHTMLParser(HTMLParser):
52 |     def handle_starttag(self, name, attrs):
53 |         # XXX namespace
54 |         attr_dict = {}
55 |         for key, value in attrs:
56 |             # Change None attribute values to the empty string
57 |             # for consistency with the other tree builders.
58 |             if value is None:
59 |                 value = ''
60 |             attr_dict[key] = value
61 |             attrvalue = '""'
62 |         self.soup.handle_starttag(name, None, None, attr_dict)
63 |
64 |     def handle_endtag(self, name):
65 |         self.soup.handle_endtag(name)
66 |
67 |     def handle_data(self, data):
68 |         self.soup.handle_data(data)
69 |
70 |     def handle_charref(self, name):
71 |         # XXX workaround for a bug in HTMLParser. Remove this once
72 |         # it's fixed in all supported versions.
73 |         # http://bugs.python.org/issue13633
74 |         if name.startswith('x'):
75 |             real_name = int(name.lstrip('x'), 16)
76 |         elif name.startswith('X'):
77 |             real_name = int(name.lstrip('X'), 16)
78 |         else:
79 |             real_name = int(name)
80 |
81 |         try:
82 |             data = unichr(real_name)
83 |         except (ValueError, OverflowError), e:
84 |             data = u"\N{REPLACEMENT CHARACTER}"
85 |
86 |         self.handle_data(data)
87 |
88 |     def handle_entityref(self, name):
89 |         character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
90 |         if character is not None:
91 |             data = character
92 |         else:
93 |             data = "&%s;" % name
94 |         self.handle_data(data)
95 |
96 |     def handle_comment(self, data):
97 |         self.soup.endData()
98 |         self.soup.handle_data(data)
99 |         self.soup.endData(Comment)
100 |
101 |     def handle_decl(self, data):
102 |         self.soup.endData()
103 |         if data.startswith("DOCTYPE "):
104 |             data = data[len("DOCTYPE "):]
105 |         elif data == 'DOCTYPE':
106 |             # i.e. "<!DOCTYPE>"
107 |             data = ''
108 |         self.soup.handle_data(data)
109 |         self.soup.endData(Doctype)
110 |
111 |     def unknown_decl(self, data):
112 |         if data.upper().startswith('CDATA['):
113 |             cls = CData
114 |             data = data[len('CDATA['):]
115 |         else:
116 |             cls = Declaration
117 |         self.soup.endData()
118 |         self.soup.handle_data(data)
119 |         self.soup.endData(cls)
120 |
121 |     def handle_pi(self, data):
122 |         self.soup.endData()
123 |         self.soup.handle_data(data)
124 |         self.soup.endData(ProcessingInstruction)
125 |
126 |
127 | class HTMLParserTreeBuilder(HTMLTreeBuilder):
128 |
129 |     is_xml = False
130 |     picklable = True
131 |     NAME = HTMLPARSER
132 |     features = [NAME, HTML, STRICT]
133 |
134 |     def __init__(self, *args, **kwargs):
135 |         if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
136 |             kwargs['strict'] = False
137 |         if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
138 |             kwargs['convert_charrefs'] = False
139 |         self.parser_args = (args, kwargs)
140 |
141 |     def prepare_markup(self, markup, user_specified_encoding=None,
142 |                        document_declared_encoding=None, exclude_encodings=None):
143 |         """
144 |         :return: A 4-tuple (markup, original encoding, encoding
145 |         declared within markup, whether any characters had to be
146 |         replaced with REPLACEMENT CHARACTER).
147 |         """
148 |         if isinstance(markup, unicode):
149 |             yield (markup, None, None, False)
150 |             return
151 |
152 |         try_encodings = [user_specified_encoding, document_declared_encoding]
153 |         dammit = UnicodeDammit(markup, try_encodings, is_html=True,
154 |                                exclude_encodings=exclude_encodings)
155 |         yield (dammit.markup, dammit.original_encoding,
156 |                dammit.declared_html_encoding,
157 |                dammit.contains_replacement_characters)
158 |
159 |     def feed(self, markup):
160 |         args, kwargs = self.parser_args
161 |         parser = BeautifulSoupHTMLParser(*args, **kwargs)
162 |         parser.soup = self.soup
163 |         try:
164 |             parser.feed(markup)
165 |         except HTMLParseError, e:
166 |             warnings.warn(RuntimeWarning(
167 |                 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
168 |             raise e
169 |
170 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
171 | # 3.2.3 code. This ensures they don't treat markup like
<a href="f"
as a 172 | # string. 173 | # 174 | # XXX This code can be removed once most Python 3 users are on 3.2.3. 175 | if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: 176 | import re 177 | attrfind_tolerant = re.compile( 178 | r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' 179 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') 180 | HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant 181 | 182 | locatestarttagend = re.compile(r""" 183 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name 184 | (?:\s+ # whitespace before attribute name 185 | (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name 186 | (?:\s*=\s* # value indicator 187 | (?:'[^']*' # LITA-enclosed value 188 | |\"[^\"]*\" # LIT-enclosed value 189 | |[^'\">\s]+ # bare value 190 | ) 191 | )? 192 | ) 193 | )* 194 | \s* # trailing whitespace 195 | """, re.VERBOSE) 196 | BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend 197 | 198 | from html.parser import tagfind, attrfind 199 | 200 | def parse_starttag(self, i): 201 | self.__starttag_text = None 202 | endpos = self.check_for_whole_start_tag(i) 203 | if endpos < 0: 204 | return endpos 205 | rawdata = self.rawdata 206 | self.__starttag_text = rawdata[i:endpos] 207 | 208 | # Now parse the data between i+1 and j into a tag and attrs 209 | attrs = [] 210 | match = tagfind.match(rawdata, i+1) 211 | assert match, 'unexpected call to parse_starttag()' 212 | k = match.end() 213 | self.lasttag = tag = rawdata[i+1:k].lower() 214 | while k < endpos: 215 | if self.strict: 216 | m = attrfind.match(rawdata, k) 217 | else: 218 | m = attrfind_tolerant.match(rawdata, k) 219 | if not m: 220 | break 221 | attrname, rest, attrvalue = m.group(1, 2, 3) 222 | if not rest: 223 | attrvalue = None 224 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 225 | attrvalue[:1] == '"' == attrvalue[-1:]: 226 | attrvalue = attrvalue[1:-1] 227 | if attrvalue: 228 | attrvalue = self.unescape(attrvalue) 229 | attrs.append((attrname.lower(), attrvalue)) 230 | k = m.end() 231 | 232 | end = rawdata[k:endpos].strip() 233 | if end not in (">", "/>"): 234 | lineno, offset = self.getpos() 235 | if "\n" in self.__starttag_text: 236 | lineno = lineno + self.__starttag_text.count("\n") 237 | offset = len(self.__starttag_text) \ 238 | - self.__starttag_text.rfind("\n") 239 | else: 240 | offset = offset + len(self.__starttag_text) 241 | if self.strict: 242 | self.error("junk characters in start tag: %r" 243 | % (rawdata[k:endpos][:20],)) 244 | self.handle_data(rawdata[i:endpos]) 245 | return endpos 246 | if end.endswith('/>'): 247 | # XHTML-style empty tag: 248 | self.handle_startendtag(tag, attrs) 249 | else: 250 | self.handle_starttag(tag, attrs) 251 | if tag in self.CDATA_CONTENT_ELEMENTS: 252 | self.set_cdata_mode(tag) 253 | return endpos 254 | 255 | def set_cdata_mode(self, elem): 256 | self.cdata_elem = elem.lower() 257 | self.interesting = re.compile(r'' % self.cdata_elem, re.I) 258 | 259 | BeautifulSoupHTMLParser.parse_starttag = parse_starttag 260 | BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode 261 | 262 | CONSTRUCTOR_TAKES_STRICT = True 263 | -------------------------------------------------------------------------------- /workflow/notify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | # 4 | # Copyright (c) 2015 deanishe@deanishe.net 5 | # 6 | # MIT Licence. 
See http://opensource.org/licenses/MIT 7 | # 8 | # Created on 2015-11-26 9 | # 10 | 11 | # TODO: Exclude this module from test and code coverage in py2.6 12 | 13 | """ 14 | Post notifications via the OS X Notification Center. This feature 15 | is only available on Mountain Lion (10.8) and later. It will 16 | silently fail on older systems. 17 | 18 | The main API is a single function, :func:`~workflow.notify.notify`. 19 | 20 | It works by copying a simple application to your workflow's data 21 | directory. It replaces the application's icon with your workflow's 22 | icon and then calls the application to post notifications. 23 | """ 24 | 25 | from __future__ import print_function, unicode_literals 26 | 27 | import os 28 | import plistlib 29 | import shutil 30 | import subprocess 31 | import sys 32 | import tarfile 33 | import tempfile 34 | import uuid 35 | 36 | import workflow 37 | 38 | 39 | _wf = None 40 | _log = None 41 | 42 | 43 | #: Available system sounds from System Preferences > Sound > Sound Effects 44 | SOUNDS = ( 45 | 'Basso', 46 | 'Blow', 47 | 'Bottle', 48 | 'Frog', 49 | 'Funk', 50 | 'Glass', 51 | 'Hero', 52 | 'Morse', 53 | 'Ping', 54 | 'Pop', 55 | 'Purr', 56 | 'Sosumi', 57 | 'Submarine', 58 | 'Tink', 59 | ) 60 | 61 | 62 | def wf(): 63 | """Return `Workflow` object for this module. 64 | 65 | Returns: 66 | workflow.Workflow: `Workflow` object for current workflow. 67 | """ 68 | global _wf 69 | if _wf is None: 70 | _wf = workflow.Workflow() 71 | return _wf 72 | 73 | 74 | def log(): 75 | """Return logger for this module. 76 | 77 | Returns: 78 | logging.Logger: Logger for this module. 79 | """ 80 | global _log 81 | if _log is None: 82 | _log = wf().logger 83 | return _log 84 | 85 | 86 | def notifier_program(): 87 | """Return path to notifier applet executable. 88 | 89 | Returns: 90 | unicode: Path to Notify.app `applet` executable. 91 | """ 92 | return wf().datafile('Notify.app/Contents/MacOS/applet') 93 | 94 | 95 | def notifier_icon_path(): 96 | """Return path to icon file in installed Notify.app. 97 | 98 | Returns: 99 | unicode: Path to `applet.icns` within the app bundle. 100 | """ 101 | return wf().datafile('Notify.app/Contents/Resources/applet.icns') 102 | 103 | 104 | def install_notifier(): 105 | """Extract `Notify.app` from the workflow to data directory. 106 | 107 | Changes the bundle ID of the installed app and gives it the 108 | workflow's icon. 109 | """ 110 | archive = os.path.join(os.path.dirname(__file__), 'Notify.tgz') 111 | destdir = wf().datadir 112 | app_path = os.path.join(destdir, 'Notify.app') 113 | n = notifier_program() 114 | log().debug("Installing Notify.app to %r ...", destdir) 115 | # z = zipfile.ZipFile(archive, 'r') 116 | # z.extractall(destdir) 117 | tgz = tarfile.open(archive, 'r:gz') 118 | tgz.extractall(destdir) 119 | assert os.path.exists(n), ( 120 | "Notify.app could not be installed in {0!r}.".format(destdir)) 121 | 122 | # Replace applet icon 123 | icon = notifier_icon_path() 124 | workflow_icon = wf().workflowfile('icon.png') 125 | if os.path.exists(icon): 126 | os.unlink(icon) 127 | 128 | png_to_icns(workflow_icon, icon) 129 | 130 | # Set file icon 131 | # PyObjC isn't available for 2.6, so this is 2.7 only. Actually, 132 | # none of this code will "work" on pre-10.8 systems. Let it run 133 | # until I figure out a better way of excluding this module 134 | # from coverage in py2.6. 
135 | if sys.version_info >= (2, 7): # pragma: no cover 136 | from AppKit import NSWorkspace, NSImage 137 | 138 | ws = NSWorkspace.sharedWorkspace() 139 | img = NSImage.alloc().init() 140 | img.initWithContentsOfFile_(icon) 141 | ws.setIcon_forFile_options_(img, app_path, 0) 142 | 143 | # Change bundle ID of installed app 144 | ip_path = os.path.join(app_path, 'Contents/Info.plist') 145 | bundle_id = '{0}.{1}'.format(wf().bundleid, uuid.uuid4().hex) 146 | data = plistlib.readPlist(ip_path) 147 | log().debug('Changing bundle ID to {0!r}'.format(bundle_id)) 148 | data['CFBundleIdentifier'] = bundle_id 149 | plistlib.writePlist(data, ip_path) 150 | 151 | 152 | def validate_sound(sound): 153 | """Coerce `sound` to valid sound name. 154 | 155 | Returns `None` for invalid sounds. Sound names can be found 156 | in `System Preferences > Sound > Sound Effects`. 157 | 158 | Args: 159 | sound (str): Name of system sound. 160 | 161 | Returns: 162 | str: Proper name of sound or `None`. 163 | """ 164 | if not sound: 165 | return None 166 | 167 | # Case-insensitive comparison of `sound` 168 | if sound.lower() in [s.lower() for s in SOUNDS]: 169 | # Title-case is correct for all system sounds as of OS X 10.11 170 | return sound.title() 171 | return None 172 | 173 | 174 | def notify(title='', text='', sound=None): 175 | """Post notification via Notify.app helper. 176 | 177 | Args: 178 | title (str, optional): Notification title. 179 | text (str, optional): Notification body text. 180 | sound (str, optional): Name of sound to play. 181 | 182 | Raises: 183 | ValueError: Raised if both `title` and `text` are empty. 184 | 185 | Returns: 186 | bool: `True` if notification was posted, else `False`. 187 | """ 188 | if title == text == '': 189 | raise ValueError('Empty notification') 190 | 191 | sound = validate_sound(sound) or '' 192 | 193 | n = notifier_program() 194 | 195 | if not os.path.exists(n): 196 | install_notifier() 197 | 198 | env = os.environ.copy() 199 | enc = 'utf-8' 200 | env['NOTIFY_TITLE'] = title.encode(enc) 201 | env['NOTIFY_MESSAGE'] = text.encode(enc) 202 | env['NOTIFY_SOUND'] = sound.encode(enc) 203 | cmd = [n] 204 | retcode = subprocess.call(cmd, env=env) 205 | if retcode == 0: 206 | return True 207 | 208 | log().error('Notify.app exited with status {0}.'.format(retcode)) 209 | return False 210 | 211 | 212 | def convert_image(inpath, outpath, size): 213 | """Convert an image file using `sips`. 214 | 215 | Args: 216 | inpath (str): Path of source file. 217 | outpath (str): Path to destination file. 218 | size (int): Width and height of destination image in pixels. 219 | 220 | Raises: 221 | RuntimeError: Raised if `sips` exits with non-zero status. 222 | """ 223 | cmd = [ 224 | b'sips', 225 | b'-z', b'{0}'.format(size), b'{0}'.format(size), 226 | inpath, 227 | b'--out', outpath] 228 | # log().debug(cmd) 229 | with open(os.devnull, 'w') as pipe: 230 | retcode = subprocess.call(cmd, stdout=pipe, stderr=subprocess.STDOUT) 231 | 232 | if retcode != 0: 233 | raise RuntimeError('sips exited with {0}'.format(retcode)) 234 | 235 | 236 | def png_to_icns(png_path, icns_path): 237 | """Convert PNG file to ICNS using `iconutil`. 238 | 239 | Create an iconset from the source PNG file. Generate PNG files 240 | in each size required by OS X, then call `iconutil` to turn 241 | them into a single ICNS file. 242 | 243 | Args: 244 | png_path (str): Path to source PNG file. 245 | icns_path (str): Path to destination ICNS file. 246 | 247 | Raises: 248 | RuntimeError: Raised if `iconutil` or `sips` fail. 
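        Example:
            A minimal usage sketch (not from the original module; the
            file paths here are hypothetical):

                png_to_icns('icon.png', '/tmp/icon.icns')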
249 | """ 250 | tempdir = tempfile.mkdtemp(prefix='aw-', dir=wf().datadir) 251 | 252 | try: 253 | iconset = os.path.join(tempdir, 'Icon.iconset') 254 | 255 | assert not os.path.exists(iconset), ( 256 | "Iconset path already exists : {0!r}".format(iconset)) 257 | os.makedirs(iconset) 258 | 259 | # Copy source icon to icon set and generate all the other 260 | # sizes needed 261 | configs = [] 262 | for i in (16, 32, 128, 256, 512): 263 | configs.append(('icon_{0}x{0}.png'.format(i), i)) 264 | configs.append((('icon_{0}x{0}@2x.png'.format(i), i*2))) 265 | 266 | shutil.copy(png_path, os.path.join(iconset, 'icon_256x256.png')) 267 | shutil.copy(png_path, os.path.join(iconset, 'icon_128x128@2x.png')) 268 | 269 | for name, size in configs: 270 | outpath = os.path.join(iconset, name) 271 | if os.path.exists(outpath): 272 | continue 273 | convert_image(png_path, outpath, size) 274 | 275 | cmd = [ 276 | b'iconutil', 277 | b'-c', b'icns', 278 | b'-o', icns_path, 279 | iconset] 280 | 281 | retcode = subprocess.call(cmd) 282 | if retcode != 0: 283 | raise RuntimeError("iconset exited with {0}".format(retcode)) 284 | 285 | assert os.path.exists(icns_path), ( 286 | "Generated ICNS file not found : {0!r}".format(icns_path)) 287 | finally: 288 | try: 289 | shutil.rmtree(tempdir) 290 | except OSError: # pragma: no cover 291 | pass 292 | 293 | 294 | # def notify_native(title='', text='', sound=''): 295 | # """Post notification via the native API (via pyobjc). 296 | 297 | # At least one of `title` or `text` must be specified. 298 | 299 | # This method will *always* show the Python launcher icon (i.e. the 300 | # rocket with the snakes on it). 301 | 302 | # Args: 303 | # title (str, optional): Notification title. 304 | # text (str, optional): Notification body text. 305 | # sound (str, optional): Name of sound to play. 306 | 307 | # """ 308 | 309 | # if title == text == '': 310 | # raise ValueError('Empty notification') 311 | 312 | # import Foundation 313 | 314 | # sound = sound or Foundation.NSUserNotificationDefaultSoundName 315 | 316 | # n = Foundation.NSUserNotification.alloc().init() 317 | # n.setTitle_(title) 318 | # n.setInformativeText_(text) 319 | # n.setSoundName_(sound) 320 | # nc = Foundation.NSUserNotificationCenter.defaultUserNotificationCenter() 321 | # nc.deliverNotification_(n) 322 | 323 | 324 | if __name__ == '__main__': # pragma: nocover 325 | # Simple command-line script to test module with 326 | # This won't work on 2.6, as `argparse` isn't available 327 | # by default. 
328 | import argparse 329 | 330 | from unicodedata import normalize 331 | 332 | def uni(s): 333 | """Coerce `s` to normalised Unicode.""" 334 | ustr = s.decode('utf-8') 335 | return normalize('NFD', ustr) 336 | 337 | p = argparse.ArgumentParser() 338 | p.add_argument('-p', '--png', help="PNG image to convert to ICNS.") 339 | p.add_argument('-l', '--list-sounds', help="Show available sounds.", 340 | action='store_true') 341 | p.add_argument('-t', '--title', 342 | help="Notification title.", type=uni, 343 | default='') 344 | p.add_argument('-s', '--sound', type=uni, 345 | help="Optional notification sound.", default='') 346 | p.add_argument('text', type=uni, 347 | help="Notification body text.", default='', nargs='?') 348 | o = p.parse_args() 349 | 350 | # List available sounds 351 | if o.list_sounds: 352 | for sound in SOUNDS: 353 | print(sound) 354 | sys.exit(0) 355 | 356 | # Convert PNG to ICNS 357 | if o.png: 358 | icns = os.path.join( 359 | os.path.dirname(o.png), 360 | b'{0}{1}'.format(os.path.splitext(os.path.basename(o.png))[0], 361 | '.icns')) 362 | 363 | print('Converting {0!r} to {1!r} ...'.format(o.png, icns), 364 | file=sys.stderr) 365 | 366 | assert not os.path.exists(icns), ( 367 | "Destination file already exists : {0}".format(icns)) 368 | 369 | png_to_icns(o.png, icns) 370 | sys.exit(0) 371 | 372 | # Post notification 373 | if o.title == o.text == '': 374 | print('ERROR: Empty notification.', file=sys.stderr) 375 | sys.exit(1) 376 | else: 377 | notify(o.title, o.text, o.sound) 378 | -------------------------------------------------------------------------------- /bs4/builder/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import itertools 3 | import sys 4 | from bs4.element import ( 5 | CharsetMetaAttributeValue, 6 | ContentMetaAttributeValue, 7 | whitespace_re 8 | ) 9 | 10 | __all__ = [ 11 | 'HTMLTreeBuilder', 12 | 'SAXTreeBuilder', 13 | 'TreeBuilder', 14 | 'TreeBuilderRegistry', 15 | ] 16 | 17 | # Some useful features for a TreeBuilder to have. 18 | FAST = 'fast' 19 | PERMISSIVE = 'permissive' 20 | STRICT = 'strict' 21 | XML = 'xml' 22 | HTML = 'html' 23 | HTML_5 = 'html5' 24 | 25 | 26 | class TreeBuilderRegistry(object): 27 | 28 | def __init__(self): 29 | self.builders_for_feature = defaultdict(list) 30 | self.builders = [] 31 | 32 | def register(self, treebuilder_class): 33 | """Register a treebuilder based on its advertised features.""" 34 | for feature in treebuilder_class.features: 35 | self.builders_for_feature[feature].insert(0, treebuilder_class) 36 | self.builders.insert(0, treebuilder_class) 37 | 38 | def lookup(self, *features): 39 | if len(self.builders) == 0: 40 | # There are no builders at all. 41 | return None 42 | 43 | if len(features) == 0: 44 | # They didn't ask for any features. Give them the most 45 | # recently registered builder. 46 | return self.builders[0] 47 | 48 | # Go down the list of features in order, and eliminate any builders 49 | # that don't match every feature. 50 | features = list(features) 51 | features.reverse() 52 | candidates = None 53 | candidate_set = None 54 | while len(features) > 0: 55 | feature = features.pop() 56 | we_have_the_feature = self.builders_for_feature.get(feature, []) 57 | if len(we_have_the_feature) > 0: 58 | if candidates is None: 59 | candidates = we_have_the_feature 60 | candidate_set = set(candidates) 61 | else: 62 | # Eliminate any candidates that don't have this feature. 
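                # (For example, builder_registry.lookup('html', 'fast')
                # keeps only builders advertising both features,
                # preferring the most recently registered one.)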
63 | candidate_set = candidate_set.intersection( 64 | set(we_have_the_feature)) 65 | 66 | # The only valid candidates are the ones in candidate_set. 67 | # Go through the original list of candidates and pick the first one 68 | # that's in candidate_set. 69 | if candidate_set is None: 70 | return None 71 | for candidate in candidates: 72 | if candidate in candidate_set: 73 | return candidate 74 | return None 75 | 76 | # The BeautifulSoup class will take feature lists from developers and use them 77 | # to look up builders in this registry. 78 | builder_registry = TreeBuilderRegistry() 79 | 80 | class TreeBuilder(object): 81 | """Turn a document into a Beautiful Soup object tree.""" 82 | 83 | NAME = "[Unknown tree builder]" 84 | ALTERNATE_NAMES = [] 85 | features = [] 86 | 87 | is_xml = False 88 | picklable = False 89 | preserve_whitespace_tags = set() 90 | empty_element_tags = None # A tag will be considered an empty-element 91 | # tag when and only when it has no contents. 92 | 93 | # A value for these tag/attribute combinations is a space- or 94 | # comma-separated list of CDATA, rather than a single CDATA. 95 | cdata_list_attributes = {} 96 | 97 | 98 | def __init__(self): 99 | self.soup = None 100 | 101 | def reset(self): 102 | pass 103 | 104 | def can_be_empty_element(self, tag_name): 105 | """Might a tag with this name be an empty-element tag? 106 | 107 | The final markup may or may not actually present this tag as 108 | self-closing. 109 | 110 | For instance: an HTMLBuilder does not consider a
<p> tag to be 111 | an empty-element tag (it's not in 112 | HTMLBuilder.empty_element_tags). This means an empty <p> tag 113 | will be presented as "<p></p>", not "<p/>".
". 114 | 115 | The default implementation has no opinion about which tags are 116 | empty-element tags, so a tag will be presented as an 117 | empty-element tag if and only if it has no contents. 118 | "" will become "", and "bar" will 119 | be left alone. 120 | """ 121 | if self.empty_element_tags is None: 122 | return True 123 | return tag_name in self.empty_element_tags 124 | 125 | def feed(self, markup): 126 | raise NotImplementedError() 127 | 128 | def prepare_markup(self, markup, user_specified_encoding=None, 129 | document_declared_encoding=None): 130 | return markup, None, None, False 131 | 132 | def test_fragment_to_document(self, fragment): 133 | """Wrap an HTML fragment to make it look like a document. 134 | 135 | Different parsers do this differently. For instance, lxml 136 | introduces an empty tag, and html5lib 137 | doesn't. Abstracting this away lets us write simple tests 138 | which run HTML fragments through the parser and compare the 139 | results against other HTML fragments. 140 | 141 | This method should not be used outside of tests. 142 | """ 143 | return fragment 144 | 145 | def set_up_substitutions(self, tag): 146 | return False 147 | 148 | def _replace_cdata_list_attribute_values(self, tag_name, attrs): 149 | """Replaces class="foo bar" with class=["foo", "bar"] 150 | 151 | Modifies its input in place. 152 | """ 153 | if not attrs: 154 | return attrs 155 | if self.cdata_list_attributes: 156 | universal = self.cdata_list_attributes.get('*', []) 157 | tag_specific = self.cdata_list_attributes.get( 158 | tag_name.lower(), None) 159 | for attr in attrs.keys(): 160 | if attr in universal or (tag_specific and attr in tag_specific): 161 | # We have a "class"-type attribute whose string 162 | # value is a whitespace-separated list of 163 | # values. Split it into a list. 164 | value = attrs[attr] 165 | if isinstance(value, basestring): 166 | values = whitespace_re.split(value) 167 | else: 168 | # html5lib sometimes calls setAttributes twice 169 | # for the same tag when rearranging the parse 170 | # tree. On the second call the attribute value 171 | # here is already a list. If this happens, 172 | # leave the value alone rather than trying to 173 | # split it again. 174 | values = value 175 | attrs[attr] = values 176 | return attrs 177 | 178 | class SAXTreeBuilder(TreeBuilder): 179 | """A Beautiful Soup treebuilder that listens for SAX events.""" 180 | 181 | def feed(self, markup): 182 | raise NotImplementedError() 183 | 184 | def close(self): 185 | pass 186 | 187 | def startElement(self, name, attrs): 188 | attrs = dict((key[1], value) for key, value in list(attrs.items())) 189 | #print "Start %s, %r" % (name, attrs) 190 | self.soup.handle_starttag(name, attrs) 191 | 192 | def endElement(self, name): 193 | #print "End %s" % name 194 | self.soup.handle_endtag(name) 195 | 196 | def startElementNS(self, nsTuple, nodeName, attrs): 197 | # Throw away (ns, nodeName) for now. 198 | self.startElement(nodeName, attrs) 199 | 200 | def endElementNS(self, nsTuple, nodeName): 201 | # Throw away (ns, nodeName) for now. 202 | self.endElement(nodeName) 203 | #handler.endElementNS((ns, node.nodeName), node.nodeName) 204 | 205 | def startPrefixMapping(self, prefix, nodeValue): 206 | # Ignore the prefix for now. 207 | pass 208 | 209 | def endPrefixMapping(self, prefix): 210 | # Ignore the prefix for now. 
211 | # handler.endPrefixMapping(prefix) 212 | pass 213 | 214 | def characters(self, content): 215 | self.soup.handle_data(content) 216 | 217 | def startDocument(self): 218 | pass 219 | 220 | def endDocument(self): 221 | pass 222 | 223 | 224 | class HTMLTreeBuilder(TreeBuilder): 225 | """This TreeBuilder knows facts about HTML. 226 | 227 | Such as which tags are empty-element tags. 228 | """ 229 | 230 | preserve_whitespace_tags = set(['pre', 'textarea']) 231 | empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 232 | 'spacer', 'link', 'frame', 'base']) 233 | 234 | # The HTML standard defines these attributes as containing a 235 | # space-separated list of values, not a single value. That is, 236 | # class="foo bar" means that the 'class' attribute has two values, 237 | # 'foo' and 'bar', not the single value 'foo bar'. When we 238 | # encounter one of these attributes, we will parse its value into 239 | # a list of values if possible. Upon output, the list will be 240 | # converted back into a string. 241 | cdata_list_attributes = { 242 | "*" : ['class', 'accesskey', 'dropzone'], 243 | "a" : ['rel', 'rev'], 244 | "link" : ['rel', 'rev'], 245 | "td" : ["headers"], 246 | "th" : ["headers"], 247 | "td" : ["headers"], 248 | "form" : ["accept-charset"], 249 | "object" : ["archive"], 250 | 251 | # These are HTML5 specific, as are *.accesskey and *.dropzone above. 252 | "area" : ["rel"], 253 | "icon" : ["sizes"], 254 | "iframe" : ["sandbox"], 255 | "output" : ["for"], 256 | } 257 | 258 | def set_up_substitutions(self, tag): 259 | # We are only interested in <meta> tags 260 | if tag.name != 'meta': 261 | return False 262 | 263 | http_equiv = tag.get('http-equiv') 264 | content = tag.get('content') 265 | charset = tag.get('charset') 266 | 267 | # We are interested in <meta> tags that say what encoding the 268 | # document was originally in. This means HTML 5-style <meta> 269 | # tags that provide the "charset" attribute. It also means 270 | # HTML 4-style <meta> tags that provide the "content" 271 | # attribute and have "http-equiv" set to "content-type". 272 | # 273 | # In both cases we will replace the value of the appropriate 274 | # attribute with a standin object that can take on any 275 | # encoding. 276 | meta_encoding = None 277 | if charset is not None: 278 | # HTML 5 style: 279 | # <meta charset="utf8"> 280 | meta_encoding = charset 281 | tag['charset'] = CharsetMetaAttributeValue(charset) 282 | 283 | elif (content is not None and http_equiv is not None 284 | and http_equiv.lower() == 'content-type'): 285 | # HTML 4 style: 286 | # <meta http-equiv="content-type" content="text/html; charset=utf8"> 287 | tag['content'] = ContentMetaAttributeValue(content) 288 | 289 | return (meta_encoding is not None) 290 | 291 | def register_treebuilders_from(module): 292 | """Copy TreeBuilders from the given module into this module.""" 293 | # I'm fairly sure this is not the best way to do this. 294 | this_module = sys.modules['bs4.builder'] 295 | for name in module.__all__: 296 | obj = getattr(module, name) 297 | 298 | if issubclass(obj, TreeBuilder): 299 | setattr(this_module, name, obj) 300 | this_module.__all__.append(name) 301 | # Register the builder while we're at it. 302 | this_module.builder_registry.register(obj) 303 | 304 | class ParserRejectedMarkup(Exception): 305 | pass 306 | 307 | # Builders are registered in reverse order of priority, so that custom 308 | # builder registrations will take precedence. In general, we want lxml 309 | # to take precedence over html5lib, because it's faster. And we only 310 | # want to use HTMLParser as a last resort. 311 | from . 
import _htmlparser 312 | register_treebuilders_from(_htmlparser) 313 | try: 314 | from . import _html5lib 315 | register_treebuilders_from(_html5lib) 316 | except ImportError: 317 | # They don't have html5lib installed. 318 | pass 319 | try: 320 | from . import _lxml 321 | register_treebuilders_from(_lxml) 322 | except ImportError: 323 | # They don't have lxml installed. 324 | pass 325 | -------------------------------------------------------------------------------- /workflow/update.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | # 4 | # Copyright (c) 2014 Fabio Niephaus , 5 | # Dean Jackson 6 | # 7 | # MIT Licence. See http://opensource.org/licenses/MIT 8 | # 9 | # Created on 2014-08-16 10 | # 11 | 12 | """ 13 | Self-updating from GitHub 14 | 15 | .. versionadded:: 1.9 16 | 17 | .. note:: 18 | 19 | This module is not intended to be used directly. Automatic updates 20 | are controlled by the ``update_settings`` :class:`dict` passed to 21 | :class:`~workflow.workflow.Workflow` objects. 22 | 23 | """ 24 | 25 | from __future__ import print_function, unicode_literals 26 | 27 | import os 28 | import tempfile 29 | import re 30 | import subprocess 31 | 32 | import workflow 33 | import web 34 | 35 | # __all__ = [] 36 | 37 | 38 | RELEASES_BASE = 'https://api.github.com/repos/{0}/releases' 39 | 40 | 41 | _wf = None 42 | 43 | 44 | def wf(): 45 | global _wf 46 | if _wf is None: 47 | _wf = workflow.Workflow() 48 | return _wf 49 | 50 | 51 | class Version(object): 52 | """Mostly semantic versioning 53 | 54 | The main difference to proper :ref:`semantic versioning ` 55 | is that this implementation doesn't require a minor or patch version. 56 | """ 57 | 58 | #: Match version and pre-release/build information in version strings 59 | match_version = re.compile(r'([0-9\.]+)(.+)?').match 60 | 61 | def __init__(self, vstr): 62 | self.vstr = vstr 63 | self.major = 0 64 | self.minor = 0 65 | self.patch = 0 66 | self.suffix = '' 67 | self.build = '' 68 | self._parse(vstr) 69 | 70 | def _parse(self, vstr): 71 | if vstr.startswith('v'): 72 | m = self.match_version(vstr[1:]) 73 | else: 74 | m = self.match_version(vstr) 75 | if not m: 76 | raise ValueError('Invalid version number: {0}'.format(vstr)) 77 | 78 | version, suffix = m.groups() 79 | parts = self._parse_dotted_string(version) 80 | self.major = parts.pop(0) 81 | if len(parts): 82 | self.minor = parts.pop(0) 83 | if len(parts): 84 | self.patch = parts.pop(0) 85 | if not len(parts) == 0: 86 | raise ValueError('Invalid version (too long) : {0}'.format(vstr)) 87 | 88 | if suffix: 89 | # Build info 90 | idx = suffix.find('+') 91 | if idx > -1: 92 | self.build = suffix[idx+1:] 93 | suffix = suffix[:idx] 94 | if suffix: 95 | if not suffix.startswith('-'): 96 | raise ValueError( 97 | 'Invalid suffix : `{0}`. 
Must start with `-`'.format( 98 | suffix)) 99 | self.suffix = suffix[1:] 100 | 101 | # wf().logger.debug('version str `{}` -> {}'.format(vstr, repr(self))) 102 | 103 | def _parse_dotted_string(self, s): 104 | """Parse string ``s`` into list of ints and strings""" 105 | parsed = [] 106 | parts = s.split('.') 107 | for p in parts: 108 | if p.isdigit(): 109 | p = int(p) 110 | parsed.append(p) 111 | return parsed 112 | 113 | @property 114 | def tuple(self): 115 | """Version number as a tuple of major, minor, patch, pre-release""" 116 | 117 | return (self.major, self.minor, self.patch, self.suffix) 118 | 119 | def __lt__(self, other): 120 | if not isinstance(other, Version): 121 | raise ValueError('Not a Version instance: {0!r}'.format(other)) 122 | t = self.tuple[:3] 123 | o = other.tuple[:3] 124 | if t < o: 125 | return True 126 | if t == o: # We need to compare suffixes 127 | if self.suffix and not other.suffix: 128 | return True 129 | if other.suffix and not self.suffix: 130 | return False 131 | return (self._parse_dotted_string(self.suffix) < 132 | self._parse_dotted_string(other.suffix)) 133 | # t > o 134 | return False 135 | 136 | def __eq__(self, other): 137 | if not isinstance(other, Version): 138 | raise ValueError('Not a Version instance: {0!r}'.format(other)) 139 | return self.tuple == other.tuple 140 | 141 | def __ne__(self, other): 142 | return not self.__eq__(other) 143 | 144 | def __gt__(self, other): 145 | if not isinstance(other, Version): 146 | raise ValueError('Not a Version instance: {0!r}'.format(other)) 147 | return other.__lt__(self) 148 | 149 | def __le__(self, other): 150 | if not isinstance(other, Version): 151 | raise ValueError('Not a Version instance: {0!r}'.format(other)) 152 | return not other.__lt__(self) 153 | 154 | def __ge__(self, other): 155 | return not self.__lt__(other) 156 | 157 | def __str__(self): 158 | vstr = '{0}.{1}.{2}'.format(self.major, self.minor, self.patch) 159 | if self.suffix: 160 | vstr += '-{0}'.format(self.suffix) 161 | if self.build: 162 | vstr += '+{0}'.format(self.build) 163 | return vstr 164 | 165 | def __repr__(self): 166 | return "Version('{0}')".format(str(self)) 167 | 168 | 169 | def download_workflow(url): 170 | """Download workflow at ``url`` to a local temporary file 171 | 172 | :param url: URL to .alfredworkflow file in GitHub repo 173 | :returns: path to downloaded file 174 | 175 | """ 176 | 177 | filename = url.split("/")[-1] 178 | 179 | if (not url.endswith('.alfredworkflow') or 180 | not filename.endswith('.alfredworkflow')): 181 | raise ValueError('Attachment `{0}` not a workflow'.format(filename)) 182 | 183 | local_path = os.path.join(tempfile.gettempdir(), filename) 184 | 185 | wf().logger.debug( 186 | 'Downloading updated workflow from `{0}` to `{1}` ...'.format( 187 | url, local_path)) 188 | 189 | response = web.get(url) 190 | 191 | with open(local_path, 'wb') as output: 192 | output.write(response.content) 193 | 194 | return local_path 195 | 196 | 197 | def build_api_url(slug): 198 | """Generate releases URL from GitHub slug 199 | 200 | :param slug: Repo name in form ``username/repo`` 201 | :returns: URL to the API endpoint for the repo's releases 202 | 203 | """ 204 | 205 | if len(slug.split('/')) != 2: 206 | raise ValueError('Invalid GitHub slug : {0}'.format(slug)) 207 | 208 | return RELEASES_BASE.format(slug) 209 | 210 | 211 | def get_valid_releases(github_slug, prereleases=False): 212 | """Return list of all valid releases 213 | 214 | :param github_slug: ``username/repo`` for workflow's GitHub repo 215 | :param 
prereleases: Whether to include pre-releases. 216 | :returns: list of dicts. Each :class:`dict` has the form 217 | ``{'version': '1.1', 'download_url': 'http://github.com/...', 218 | 'prerelease': False }`` 219 | 220 | 221 | A valid release is one that contains one ``.alfredworkflow`` file. 222 | 223 | If the GitHub version (i.e. tag) is of the form ``v1.1``, the leading 224 | ``v`` will be stripped. 225 | 226 | """ 227 | 228 | api_url = build_api_url(github_slug) 229 | releases = [] 230 | 231 | wf().logger.debug('Retrieving releases list from `{0}` ...'.format( 232 | api_url)) 233 | 234 | def retrieve_releases(): 235 | wf().logger.info( 236 | 'Retrieving releases for `{0}` ...'.format(github_slug)) 237 | return web.get(api_url).json() 238 | 239 | slug = github_slug.replace('/', '-') 240 | for release in wf().cached_data('gh-releases-{0}'.format(slug), 241 | retrieve_releases): 242 | version = release['tag_name'] 243 | download_urls = [] 244 | for asset in release.get('assets', []): 245 | url = asset.get('browser_download_url') 246 | if not url or not url.endswith('.alfredworkflow'): 247 | continue 248 | download_urls.append(url) 249 | 250 | # Validate release 251 | if release['prerelease'] and not prereleases: 252 | wf().logger.warning( 253 | 'Invalid release {0} : pre-release detected'.format(version)) 254 | continue 255 | if not download_urls: 256 | wf().logger.warning( 257 | 'Invalid release {0} : No workflow file'.format(version)) 258 | continue 259 | if len(download_urls) > 1: 260 | wf().logger.warning( 261 | 'Invalid release {0} : multiple workflow files'.format(version)) 262 | continue 263 | 264 | wf().logger.debug('Release `{0}` : {1}'.format(version, url)) 265 | releases.append({ 266 | 'version': version, 267 | 'download_url': download_urls[0], 268 | 'prerelease': release['prerelease'] 269 | }) 270 | 271 | return releases 272 | 273 | 274 | def check_update(github_slug, current_version, prereleases=False): 275 | """Check whether a newer release is available on GitHub 276 | 277 | :param github_slug: ``username/repo`` for workflow's GitHub repo 278 | :param current_version: the currently installed version of the 279 | workflow. :ref:`Semantic versioning ` is required. 280 | :param prereleases: Whether to include pre-releases. 281 | :type current_version: ``unicode`` 282 | :returns: ``True`` if an update is available, else ``False`` 283 | 284 | If an update is available, its version number and download URL will 285 | be cached. 
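    Example (a sketch, not part of the original docstring; the slug and
    version are hypothetical):

        if check_update('someuser/somerepo', '1.0.0'):
            install_update('someuser/somerepo', '1.0.0')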
286 | 287 | """ 288 | 289 | releases = get_valid_releases(github_slug, prereleases) 290 | 291 | wf().logger.info('{0} releases for {1}'.format(len(releases), 292 | github_slug)) 293 | 294 | if not len(releases): 295 | raise ValueError('No valid releases for {0}'.format(github_slug)) 296 | 297 | # GitHub returns releases newest-first 298 | latest_release = releases[0] 299 | 300 | # (latest_version, download_url) = get_latest_release(releases) 301 | vr = Version(latest_release['version']) 302 | vl = Version(current_version) 303 | wf().logger.debug('Latest : {0!r} Installed : {1!r}'.format(vr, vl)) 304 | if vr > vl: 305 | 306 | wf().cache_data('__workflow_update_status', { 307 | 'version': latest_release['version'], 308 | 'download_url': latest_release['download_url'], 309 | 'available': True 310 | }) 311 | 312 | return True 313 | 314 | wf().cache_data('__workflow_update_status', { 315 | 'available': False 316 | }) 317 | return False 318 | 319 | 320 | def install_update(github_slug, current_version): 321 | """If a newer release is available, download and install it 322 | 323 | :param github_slug: ``username/repo`` for workflow's GitHub repo 324 | :param current_version: the currently installed version of the 325 | workflow. :ref:`Semantic versioning ` is required. 326 | :type current_version: ``unicode`` 327 | 328 | If an update is available, it will be downloaded and installed. 329 | 330 | :returns: ``True`` if an update is installed, else ``False`` 331 | 332 | """ 333 | # TODO: `github_slug` and `current_version` are both unusued. 334 | 335 | update_data = wf().cached_data('__workflow_update_status', max_age=0) 336 | 337 | if not update_data or not update_data.get('available'): 338 | wf().logger.info('No update available') 339 | return False 340 | 341 | local_file = download_workflow(update_data['download_url']) 342 | 343 | wf().logger.info('Installing updated workflow ...') 344 | subprocess.call(['open', local_file]) 345 | 346 | update_data['available'] = False 347 | wf().cache_data('__workflow_update_status', update_data) 348 | return True 349 | 350 | 351 | if __name__ == '__main__': # pragma: nocover 352 | import sys 353 | 354 | def show_help(): 355 | print('Usage : update.py (check|install) github_slug version [--prereleases]') 356 | sys.exit(1) 357 | 358 | argv = sys.argv[:] 359 | prereleases = '--prereleases' in argv 360 | 361 | if prereleases: 362 | argv.remove('--prereleases') 363 | 364 | if len(argv) != 4: 365 | show_help() 366 | 367 | action, github_slug, version = argv[1:] 368 | 369 | if action not in ('check', 'install'): 370 | show_help() 371 | 372 | if action == 'check': 373 | check_update(github_slug, version, prereleases) 374 | elif action == 'install': 375 | install_update(github_slug, version) 376 | -------------------------------------------------------------------------------- /bs4/builder/_html5lib.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'HTML5TreeBuilder', 3 | ] 4 | 5 | from pdb import set_trace 6 | import warnings 7 | from bs4.builder import ( 8 | PERMISSIVE, 9 | HTML, 10 | HTML_5, 11 | HTMLTreeBuilder, 12 | ) 13 | from bs4.element import ( 14 | NamespacedAttribute, 15 | whitespace_re, 16 | ) 17 | import html5lib 18 | from html5lib.constants import namespaces 19 | from bs4.element import ( 20 | Comment, 21 | Doctype, 22 | NavigableString, 23 | Tag, 24 | ) 25 | 26 | class HTML5TreeBuilder(HTMLTreeBuilder): 27 | """Use html5lib to build a tree.""" 28 | 29 | NAME = "html5lib" 30 | 31 | features = [NAME, 
PERMISSIVE, HTML_5, HTML] 32 | 33 | def prepare_markup(self, markup, user_specified_encoding, 34 | document_declared_encoding=None, exclude_encodings=None): 35 | # Store the user-specified encoding for use later on. 36 | self.user_specified_encoding = user_specified_encoding 37 | 38 | # document_declared_encoding and exclude_encodings aren't used 39 | # ATM because the html5lib TreeBuilder doesn't use 40 | # UnicodeDammit. 41 | if exclude_encodings: 42 | warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") 43 | yield (markup, None, None, False) 44 | 45 | # These methods are defined by Beautiful Soup. 46 | def feed(self, markup): 47 | if self.soup.parse_only is not None: 48 | warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") 49 | parser = html5lib.HTMLParser(tree=self.create_treebuilder) 50 | doc = parser.parse(markup, encoding=self.user_specified_encoding) 51 | 52 | # Set the character encoding detected by the tokenizer. 53 | if isinstance(markup, unicode): 54 | # We need to special-case this because html5lib sets 55 | # charEncoding to UTF-8 if it gets Unicode input. 56 | doc.original_encoding = None 57 | else: 58 | doc.original_encoding = parser.tokenizer.stream.charEncoding[0] 59 | 60 | def create_treebuilder(self, namespaceHTMLElements): 61 | self.underlying_builder = TreeBuilderForHtml5lib( 62 | self.soup, namespaceHTMLElements) 63 | return self.underlying_builder 64 | 65 | def test_fragment_to_document(self, fragment): 66 | """See `TreeBuilder`.""" 67 | return u'%s' % fragment 68 | 69 | 70 | class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): 71 | 72 | def __init__(self, soup, namespaceHTMLElements): 73 | self.soup = soup 74 | super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) 75 | 76 | def documentClass(self): 77 | self.soup.reset() 78 | return Element(self.soup, self.soup, None) 79 | 80 | def insertDoctype(self, token): 81 | name = token["name"] 82 | publicId = token["publicId"] 83 | systemId = token["systemId"] 84 | 85 | doctype = Doctype.for_name_and_ids(name, publicId, systemId) 86 | self.soup.object_was_parsed(doctype) 87 | 88 | def elementClass(self, name, namespace): 89 | tag = self.soup.new_tag(name, namespace) 90 | return Element(tag, self.soup, namespace) 91 | 92 | def commentClass(self, data): 93 | return TextNode(Comment(data), self.soup) 94 | 95 | def fragmentClass(self): 96 | self.soup = BeautifulSoup("") 97 | self.soup.name = "[document_fragment]" 98 | return Element(self.soup, self.soup, None) 99 | 100 | def appendChild(self, node): 101 | # XXX This code is not covered by the BS4 tests. 102 | self.soup.append(node.element) 103 | 104 | def getDocument(self): 105 | return self.soup 106 | 107 | def getFragment(self): 108 | return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element 109 | 110 | class AttrList(object): 111 | def __init__(self, element): 112 | self.element = element 113 | self.attrs = dict(self.element.attrs) 114 | def __iter__(self): 115 | return list(self.attrs.items()).__iter__() 116 | def __setitem__(self, name, value): 117 | # If this attribute is a multi-valued attribute for this element, 118 | # turn its value into a list. 
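        # For example, class="foo bar" arrives as the string "foo bar"
        # and is stored as ["foo", "bar"], per
        # HTMLTreeBuilder.cdata_list_attributes.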
119 | list_attr = HTML5TreeBuilder.cdata_list_attributes 120 | if (name in list_attr['*'] 121 | or (self.element.name in list_attr 122 | and name in list_attr[self.element.name])): 123 | # A node that is being cloned may have already undergone 124 | # this procedure. 125 | if not isinstance(value, list): 126 | value = whitespace_re.split(value) 127 | self.element[name] = value 128 | def items(self): 129 | return list(self.attrs.items()) 130 | def keys(self): 131 | return list(self.attrs.keys()) 132 | def __len__(self): 133 | return len(self.attrs) 134 | def __getitem__(self, name): 135 | return self.attrs[name] 136 | def __contains__(self, name): 137 | return name in list(self.attrs.keys()) 138 | 139 | 140 | class Element(html5lib.treebuilders._base.Node): 141 | def __init__(self, element, soup, namespace): 142 | html5lib.treebuilders._base.Node.__init__(self, element.name) 143 | self.element = element 144 | self.soup = soup 145 | self.namespace = namespace 146 | 147 | def appendChild(self, node): 148 | string_child = child = None 149 | if isinstance(node, basestring): 150 | # Some other piece of code decided to pass in a string 151 | # instead of creating a TextElement object to contain the 152 | # string. 153 | string_child = child = node 154 | elif isinstance(node, Tag): 155 | # Some other piece of code decided to pass in a Tag 156 | # instead of creating an Element object to contain the 157 | # Tag. 158 | child = node 159 | elif node.element.__class__ == NavigableString: 160 | string_child = child = node.element 161 | else: 162 | child = node.element 163 | 164 | if not isinstance(child, basestring) and child.parent is not None: 165 | node.element.extract() 166 | 167 | if (string_child and self.element.contents 168 | and self.element.contents[-1].__class__ == NavigableString): 169 | # We are appending a string onto another string. 170 | # TODO This has O(n^2) performance, for input like 171 | # "aaa..." 172 | old_element = self.element.contents[-1] 173 | new_element = self.soup.new_string(old_element + string_child) 174 | old_element.replace_with(new_element) 175 | self.soup._most_recent_element = new_element 176 | else: 177 | if isinstance(node, basestring): 178 | # Create a brand new NavigableString from this string. 179 | child = self.soup.new_string(node) 180 | 181 | # Tell Beautiful Soup to act as if it parsed this element 182 | # immediately after the parent's last descendant. (Or 183 | # immediately after the parent, if it has no children.) 184 | if self.element.contents: 185 | most_recent_element = self.element._last_descendant(False) 186 | elif self.element.next_element is not None: 187 | # Something from further ahead in the parse tree is 188 | # being inserted into this earlier element. This is 189 | # very annoying because it means an expensive search 190 | # for the last element in the tree. 
191 | most_recent_element = self.soup._last_descendant() 192 | else: 193 | most_recent_element = self.element 194 | 195 | self.soup.object_was_parsed( 196 | child, parent=self.element, 197 | most_recent_element=most_recent_element) 198 | 199 | def getAttributes(self): 200 | return AttrList(self.element) 201 | 202 | def setAttributes(self, attributes): 203 | 204 | if attributes is not None and len(attributes) > 0: 205 | 206 | converted_attributes = [] 207 | for name, value in list(attributes.items()): 208 | if isinstance(name, tuple): 209 | new_name = NamespacedAttribute(*name) 210 | del attributes[name] 211 | attributes[new_name] = value 212 | 213 | self.soup.builder._replace_cdata_list_attribute_values( 214 | self.name, attributes) 215 | for name, value in attributes.items(): 216 | self.element[name] = value 217 | 218 | # The attributes may contain variables that need substitution. 219 | # Call set_up_substitutions manually. 220 | # 221 | # The Tag constructor called this method when the Tag was created, 222 | # but we just set/changed the attributes, so call it again. 223 | self.soup.builder.set_up_substitutions(self.element) 224 | attributes = property(getAttributes, setAttributes) 225 | 226 | def insertText(self, data, insertBefore=None): 227 | if insertBefore: 228 | text = TextNode(self.soup.new_string(data), self.soup) 229 | self.insertBefore(data, insertBefore) 230 | else: 231 | self.appendChild(data) 232 | 233 | def insertBefore(self, node, refNode): 234 | index = self.element.index(refNode.element) 235 | if (node.element.__class__ == NavigableString and self.element.contents 236 | and self.element.contents[index-1].__class__ == NavigableString): 237 | # (See comments in appendChild) 238 | old_node = self.element.contents[index-1] 239 | new_str = self.soup.new_string(old_node + node.element) 240 | old_node.replace_with(new_str) 241 | else: 242 | self.element.insert(index, node.element) 243 | node.parent = self 244 | 245 | def removeChild(self, node): 246 | node.element.extract() 247 | 248 | def reparentChildren(self, new_parent): 249 | """Move all of this tag's children into another tag.""" 250 | # print "MOVE", self.element.contents 251 | # print "FROM", self.element 252 | # print "TO", new_parent.element 253 | element = self.element 254 | new_parent_element = new_parent.element 255 | # Determine what this tag's next_element will be once all the children 256 | # are removed. 257 | final_next_element = element.next_sibling 258 | 259 | new_parents_last_descendant = new_parent_element._last_descendant(False, False) 260 | if len(new_parent_element.contents) > 0: 261 | # The new parent already contains children. We will be 262 | # appending this tag's children to the end. 263 | new_parents_last_child = new_parent_element.contents[-1] 264 | new_parents_last_descendant_next_element = new_parents_last_descendant.next_element 265 | else: 266 | # The new parent contains no children. 
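            # (In that case the moved children are linked directly
            # after the new parent element itself.)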
267 | new_parents_last_child = None 268 | new_parents_last_descendant_next_element = new_parent_element.next_element 269 | 270 | to_append = element.contents 271 | append_after = new_parent_element.contents 272 | if len(to_append) > 0: 273 | # Set the first child's previous_element and previous_sibling 274 | # to elements within the new parent 275 | first_child = to_append[0] 276 | if new_parents_last_descendant: 277 | first_child.previous_element = new_parents_last_descendant 278 | else: 279 | first_child.previous_element = new_parent_element 280 | first_child.previous_sibling = new_parents_last_child 281 | if new_parents_last_descendant: 282 | new_parents_last_descendant.next_element = first_child 283 | else: 284 | new_parent_element.next_element = first_child 285 | if new_parents_last_child: 286 | new_parents_last_child.next_sibling = first_child 287 | 288 | # Fix the last child's next_element and next_sibling 289 | last_child = to_append[-1] 290 | last_child.next_element = new_parents_last_descendant_next_element 291 | if new_parents_last_descendant_next_element: 292 | new_parents_last_descendant_next_element.previous_element = last_child 293 | last_child.next_sibling = None 294 | 295 | for child in to_append: 296 | child.parent = new_parent_element 297 | new_parent_element.contents.append(child) 298 | 299 | # Now that this element has no children, change its .next_element. 300 | element.contents = [] 301 | element.next_element = final_next_element 302 | 303 | # print "DONE WITH MOVE" 304 | # print "FROM", self.element 305 | # print "TO", new_parent_element 306 | 307 | def cloneNode(self): 308 | tag = self.soup.new_tag(self.element.name, self.namespace) 309 | node = Element(tag, self.soup, self.namespace) 310 | for key,value in self.attributes: 311 | node.attributes[key] = value 312 | return node 313 | 314 | def hasContent(self): 315 | return self.element.contents 316 | 317 | def getNameTuple(self): 318 | if self.namespace == None: 319 | return namespaces["html"], self.name 320 | else: 321 | return self.namespace, self.name 322 | 323 | nameTuple = property(getNameTuple) 324 | 325 | class TextNode(Element): 326 | def __init__(self, element, soup): 327 | html5lib.treebuilders._base.Node.__init__(self, None) 328 | self.element = element 329 | self.soup = soup 330 | 331 | def cloneNode(self): 332 | raise NotImplementedError 333 | -------------------------------------------------------------------------------- /bs4/__init__.py: -------------------------------------------------------------------------------- 1 | """Beautiful Soup 2 | Elixir and Tonic 3 | "The Screen-Scraper's Friend" 4 | http://www.crummy.com/software/BeautifulSoup/ 5 | 6 | Beautiful Soup uses a pluggable XML or HTML parser to parse a 7 | (possibly invalid) document into a tree representation. Beautiful Soup 8 | provides provides methods and Pythonic idioms that make it easy to 9 | navigate, search, and modify the parse tree. 10 | 11 | Beautiful Soup works with Python 2.6 and up. It works better if lxml 12 | and/or html5lib is installed. 
13 | 14 | For more than you ever wanted to know about Beautiful Soup, see the 15 | documentation: 16 | http://www.crummy.com/software/BeautifulSoup/bs4/doc/ 17 | """ 18 | 19 | __author__ = "Leonard Richardson (leonardr@segfault.org)" 20 | __version__ = "4.4.1" 21 | __copyright__ = "Copyright (c) 2004-2015 Leonard Richardson" 22 | __license__ = "MIT" 23 | 24 | __all__ = ['BeautifulSoup'] 25 | 26 | import os 27 | import re 28 | import warnings 29 | 30 | from .builder import builder_registry, ParserRejectedMarkup 31 | from .dammit import UnicodeDammit 32 | from .element import ( 33 | CData, 34 | Comment, 35 | DEFAULT_OUTPUT_ENCODING, 36 | Declaration, 37 | Doctype, 38 | NavigableString, 39 | PageElement, 40 | ProcessingInstruction, 41 | ResultSet, 42 | SoupStrainer, 43 | Tag, 44 | ) 45 | 46 | # The very first thing we do is give a useful error if someone is 47 | # running this code under Python 3 without converting it. 48 | 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' 49 | 50 | class BeautifulSoup(Tag): 51 | """ 52 | This class defines the basic interface called by the tree builders. 53 | 54 | These methods will be called by the parser: 55 | reset() 56 | feed(markup) 57 | 58 | The tree builder may call these methods from its feed() implementation: 59 | handle_starttag(name, attrs) # See note about return value 60 | handle_endtag(name) 61 | handle_data(data) # Appends to the current data node 62 | endData(containerClass=NavigableString) # Ends the current data node 63 | 64 | No matter how complicated the underlying parser is, you should be 65 | able to build a tree using 'start tag' events, 'end tag' events, 66 | 'data' events, and "done with data" events. 67 | 68 | If you encounter an empty-element tag (aka a self-closing tag, 69 | like HTML's
<br>
tag), call handle_starttag and then 70 | handle_endtag. 71 | """ 72 | ROOT_TAG_NAME = u'[document]' 73 | 74 | # If the end-user gives no indication which tree builder they 75 | # want, look for one with these features. 76 | DEFAULT_BUILDER_FEATURES = ['html', 'fast'] 77 | 78 | ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' 79 | 80 | NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" 81 | 82 | def __init__(self, markup="", features=None, builder=None, 83 | parse_only=None, from_encoding=None, exclude_encodings=None, 84 | **kwargs): 85 | """The Soup object is initialized as the 'root tag', and the 86 | provided markup (which can be a string or a file-like object) 87 | is fed into the underlying parser.""" 88 | 89 | if 'convertEntities' in kwargs: 90 | warnings.warn( 91 | "BS4 does not respect the convertEntities argument to the " 92 | "BeautifulSoup constructor. Entities are always converted " 93 | "to Unicode characters.") 94 | 95 | if 'markupMassage' in kwargs: 96 | del kwargs['markupMassage'] 97 | warnings.warn( 98 | "BS4 does not respect the markupMassage argument to the " 99 | "BeautifulSoup constructor. The tree builder is responsible " 100 | "for any necessary markup massage.") 101 | 102 | if 'smartQuotesTo' in kwargs: 103 | del kwargs['smartQuotesTo'] 104 | warnings.warn( 105 | "BS4 does not respect the smartQuotesTo argument to the " 106 | "BeautifulSoup constructor. Smart quotes are always converted " 107 | "to Unicode characters.") 108 | 109 | if 'selfClosingTags' in kwargs: 110 | del kwargs['selfClosingTags'] 111 | warnings.warn( 112 | "BS4 does not respect the selfClosingTags argument to the " 113 | "BeautifulSoup constructor. The tree builder is responsible " 114 | "for understanding self-closing tags.") 115 | 116 | if 'isHTML' in kwargs: 117 | del kwargs['isHTML'] 118 | warnings.warn( 119 | "BS4 does not respect the isHTML argument to the " 120 | "BeautifulSoup constructor. Suggest you use " 121 | "features='lxml' for HTML and features='lxml-xml' for " 122 | "XML.") 123 | 124 | def deprecated_argument(old_name, new_name): 125 | if old_name in kwargs: 126 | warnings.warn( 127 | 'The "%s" argument to the BeautifulSoup constructor ' 128 | 'has been renamed to "%s."' % (old_name, new_name)) 129 | value = kwargs[old_name] 130 | del kwargs[old_name] 131 | return value 132 | return None 133 | 134 | parse_only = parse_only or deprecated_argument( 135 | "parseOnlyThese", "parse_only") 136 | 137 | from_encoding = from_encoding or deprecated_argument( 138 | "fromEncoding", "from_encoding") 139 | 140 | if len(kwargs) > 0: 141 | arg = kwargs.keys().pop() 142 | raise TypeError( 143 | "__init__() got an unexpected keyword argument '%s'" % arg) 144 | 145 | if builder is None: 146 | original_features = features 147 | if isinstance(features, basestring): 148 | features = [features] 149 | if features is None or len(features) == 0: 150 | features = self.DEFAULT_BUILDER_FEATURES 151 | builder_class = builder_registry.lookup(*features) 152 | if builder_class is None: 153 | raise FeatureNotFound( 154 | "Couldn't find a tree builder with the features you " 155 | "requested: %s. 
Do you need to install a parser library?" 156 | % ",".join(features)) 157 | builder = builder_class() 158 | if not (original_features == builder.NAME or 159 | original_features in builder.ALTERNATE_NAMES): 160 | if builder.is_xml: 161 | markup_type = "XML" 162 | else: 163 | markup_type = "HTML" 164 | warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( 165 | parser=builder.NAME, 166 | markup_type=markup_type)) 167 | 168 | self.builder = builder 169 | self.is_xml = builder.is_xml 170 | self.builder.soup = self 171 | 172 | self.parse_only = parse_only 173 | 174 | if hasattr(markup, 'read'): # It's a file-type object. 175 | markup = markup.read() 176 | elif len(markup) <= 256: 177 | # Print out warnings for a couple beginner problems 178 | # involving passing non-markup to Beautiful Soup. 179 | # Beautiful Soup will still parse the input as markup, 180 | # just in case that's what the user really wants. 181 | if (isinstance(markup, unicode) 182 | and not os.path.supports_unicode_filenames): 183 | possible_filename = markup.encode("utf8") 184 | else: 185 | possible_filename = markup 186 | is_file = False 187 | try: 188 | is_file = os.path.exists(possible_filename) 189 | except Exception, e: 190 | # This is almost certainly a problem involving 191 | # characters not valid in filenames on this 192 | # system. Just let it go. 193 | pass 194 | if is_file: 195 | if isinstance(markup, unicode): 196 | markup = markup.encode("utf8") 197 | warnings.warn( 198 | '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) 199 | if markup[:5] == "http:" or markup[:6] == "https:": 200 | # TODO: This is ugly but I couldn't get it to work in 201 | # Python 3 otherwise. 202 | if ((isinstance(markup, bytes) and not b' ' in markup) 203 | or (isinstance(markup, unicode) and not u' ' in markup)): 204 | if isinstance(markup, unicode): 205 | markup = markup.encode("utf8") 206 | warnings.warn( 207 | '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) 208 | 209 | for (self.markup, self.original_encoding, self.declared_html_encoding, 210 | self.contains_replacement_characters) in ( 211 | self.builder.prepare_markup( 212 | markup, from_encoding, exclude_encodings=exclude_encodings)): 213 | self.reset() 214 | try: 215 | self._feed() 216 | break 217 | except ParserRejectedMarkup: 218 | pass 219 | 220 | # Clear out the markup and remove the builder's circular 221 | # reference to this object. 222 | self.markup = None 223 | self.builder.soup = None 224 | 225 | def __copy__(self): 226 | return type(self)(self.encode(), builder=self.builder) 227 | 228 | def __getstate__(self): 229 | # Frequently a tree builder can't be pickled. 230 | d = dict(self.__dict__) 231 | if 'builder' in d and not self.builder.picklable: 232 | del d['builder'] 233 | return d 234 | 235 | def _feed(self): 236 | # Convert the document to Unicode. 237 | self.builder.reset() 238 | 239 | self.builder.feed(self.markup) 240 | # Close out any unfinished strings and close all the open tags. 
241 | self.endData() 242 | while self.currentTag.name != self.ROOT_TAG_NAME: 243 | self.popTag() 244 | 245 | def reset(self): 246 | Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) 247 | self.hidden = 1 248 | self.builder.reset() 249 | self.current_data = [] 250 | self.currentTag = None 251 | self.tagStack = [] 252 | self.preserve_whitespace_tag_stack = [] 253 | self.pushTag(self) 254 | 255 | def new_tag(self, name, namespace=None, nsprefix=None, **attrs): 256 | """Create a new tag associated with this soup.""" 257 | return Tag(None, self.builder, name, namespace, nsprefix, attrs) 258 | 259 | def new_string(self, s, subclass=NavigableString): 260 | """Create a new NavigableString associated with this soup.""" 261 | return subclass(s) 262 | 263 | def insert_before(self, successor): 264 | raise NotImplementedError("BeautifulSoup objects don't support insert_before().") 265 | 266 | def insert_after(self, successor): 267 | raise NotImplementedError("BeautifulSoup objects don't support insert_after().") 268 | 269 | def popTag(self): 270 | tag = self.tagStack.pop() 271 | if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: 272 | self.preserve_whitespace_tag_stack.pop() 273 | #print "Pop", tag.name 274 | if self.tagStack: 275 | self.currentTag = self.tagStack[-1] 276 | return self.currentTag 277 | 278 | def pushTag(self, tag): 279 | #print "Push", tag.name 280 | if self.currentTag: 281 | self.currentTag.contents.append(tag) 282 | self.tagStack.append(tag) 283 | self.currentTag = self.tagStack[-1] 284 | if tag.name in self.builder.preserve_whitespace_tags: 285 | self.preserve_whitespace_tag_stack.append(tag) 286 | 287 | def endData(self, containerClass=NavigableString): 288 | if self.current_data: 289 | current_data = u''.join(self.current_data) 290 | # If whitespace is not preserved, and this string contains 291 | # nothing but ASCII spaces, replace it with a single space 292 | # or newline. 293 | if not self.preserve_whitespace_tag_stack: 294 | strippable = True 295 | for i in current_data: 296 | if i not in self.ASCII_SPACES: 297 | strippable = False 298 | break 299 | if strippable: 300 | if '\n' in current_data: 301 | current_data = '\n' 302 | else: 303 | current_data = ' ' 304 | 305 | # Reset the data collector. 306 | self.current_data = [] 307 | 308 | # Should we add this string to the tree at all? 309 | if self.parse_only and len(self.tagStack) <= 1 and \ 310 | (not self.parse_only.text or \ 311 | not self.parse_only.search(current_data)): 312 | return 313 | 314 | o = containerClass(current_data) 315 | self.object_was_parsed(o) 316 | 317 | def object_was_parsed(self, o, parent=None, most_recent_element=None): 318 | """Add an object to the parse tree.""" 319 | parent = parent or self.currentTag 320 | previous_element = most_recent_element or self._most_recent_element 321 | 322 | next_element = previous_sibling = next_sibling = None 323 | if isinstance(o, Tag): 324 | next_element = o.next_element 325 | next_sibling = o.next_sibling 326 | previous_sibling = o.previous_sibling 327 | if not previous_element: 328 | previous_element = o.previous_element 329 | 330 | o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) 331 | 332 | self._most_recent_element = o 333 | parent.contents.append(o) 334 | 335 | if parent.next_sibling: 336 | # This node is being inserted into an element that has 337 | # already been parsed. Deal with any dangling references. 
338 |                 index = parent.contents.index(o)
339 |                 if index == 0:
340 |                     previous_element = parent
341 |                     previous_sibling = None
342 |                 else:
343 |                     previous_element = previous_sibling = parent.contents[index-1]
344 |                 if index == len(parent.contents)-1:
345 |                     next_element = parent.next_sibling
346 |                     next_sibling = None
347 |                 else:
348 |                     next_element = next_sibling = parent.contents[index+1]
349 | 
350 |             o.previous_element = previous_element
351 |             if previous_element:
352 |                 previous_element.next_element = o
353 |             o.next_element = next_element
354 |             if next_element:
355 |                 next_element.previous_element = o
356 |             o.next_sibling = next_sibling
357 |             if next_sibling:
358 |                 next_sibling.previous_sibling = o
359 |             o.previous_sibling = previous_sibling
360 |             if previous_sibling:
361 |                 previous_sibling.next_sibling = o
362 | 
363 |     def _popToTag(self, name, nsprefix=None, inclusivePop=True):
364 |         """Pops the tag stack up to and including the most recent
365 |         instance of the given tag. If inclusivePop is false, pops the tag
366 |         stack up to but *not* including the most recent instance of
367 |         the given tag."""
368 |         #print "Popping to %s" % name
369 |         if name == self.ROOT_TAG_NAME:
370 |             # The BeautifulSoup object itself can never be popped.
371 |             return
372 | 
373 |         most_recently_popped = None
374 | 
375 |         stack_size = len(self.tagStack)
376 |         for i in range(stack_size - 1, 0, -1):
377 |             t = self.tagStack[i]
378 |             if (name == t.name and nsprefix == t.prefix):
379 |                 if inclusivePop:
380 |                     most_recently_popped = self.popTag()
381 |                 break
382 |             most_recently_popped = self.popTag()
383 | 
384 |         return most_recently_popped
385 | 
386 |     def handle_starttag(self, name, namespace, nsprefix, attrs):
387 |         """Push a start tag on to the stack.
388 | 
389 |         If this method returns None, the tag was rejected by the
390 |         SoupStrainer. You should proceed as if the tag had not occurred
391 |         in the document. For instance, if this was a self-closing tag,
392 |         don't call handle_endtag.
393 |         """
394 | 
395 |         # print "Start tag %s: %s" % (name, attrs)
396 |         self.endData()
397 | 
398 |         if (self.parse_only and len(self.tagStack) <= 1
399 |             and (self.parse_only.text
400 |                  or not self.parse_only.search_tag(name, attrs))):
401 |             return None
402 | 
403 |         tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
404 |                   self.currentTag, self._most_recent_element)
405 |         if tag is None:
406 |             return tag
407 |         if self._most_recent_element:
408 |             self._most_recent_element.next_element = tag
409 |         self._most_recent_element = tag
410 |         self.pushTag(tag)
411 |         return tag
412 | 
413 |     def handle_endtag(self, name, nsprefix=None):
414 |         #print "End tag: " + name
415 |         self.endData()
416 |         self._popToTag(name, nsprefix)
417 | 
418 |     def handle_data(self, data):
419 |         self.current_data.append(data)
420 | 
421 |     def decode(self, pretty_print=False,
422 |                eventual_encoding=DEFAULT_OUTPUT_ENCODING,
423 |                formatter="minimal"):
424 |         """Returns a string or Unicode representation of this document.
425 |         To get Unicode, pass None for encoding."""
426 | 
427 |         if self.is_xml:
428 |             # Print the XML declaration
429 |             encoding_part = ''
430 |             if eventual_encoding != None:
431 |                 encoding_part = ' encoding="%s"' % eventual_encoding
432 |             prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
433 |         else:
434 |             prefix = u''
435 |         if not pretty_print:
436 |             indent_level = None
437 |         else:
438 |             indent_level = 0
439 |         return prefix + super(BeautifulSoup, self).decode(
440 |             indent_level, eventual_encoding, formatter)
441 | 
442 | # Alias to make it easier to type import: 'from bs4 import _soup'
443 | _s = BeautifulSoup
444 | _soup = BeautifulSoup
445 | 
446 | class BeautifulStoneSoup(BeautifulSoup):
447 |     """Deprecated interface to an XML parser."""
448 | 
449 |     def __init__(self, *args, **kwargs):
450 |         kwargs['features'] = 'xml'
451 |         warnings.warn(
452 |             'The BeautifulStoneSoup class is deprecated. Instead of using '
453 |             'it, pass features="xml" into the BeautifulSoup constructor.')
454 |         super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
455 | 
456 | 
457 | class StopParsing(Exception):
458 |     pass
459 | 
460 | class FeatureNotFound(ValueError):
461 |     pass
462 | 
463 | 
464 | #By default, act as an HTML pretty-printer.
465 | if __name__ == '__main__':
466 |     import sys
467 |     soup = BeautifulSoup(sys.stdin)
468 |     print soup.prettify()
469 | 
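bs4/__init__.py ends here. For orientation, this is how the constructor above is typically driven from user code — a minimal sketch (the markup string is illustrative; `html.parser` is the stdlib builder, so no third-party parser is assumed):

```python
from bs4 import BeautifulSoup

# Naming a parser explicitly avoids NO_PARSER_SPECIFIED_WARNING and
# pins behaviour to one tree builder across machines.
soup = BeautifulSoup("<p>Hello</p>", "html.parser")
print(soup.p.string)  # Hello

# With no parser given, the builder registry is searched using
# DEFAULT_BUILDER_FEATURES (['html', 'fast']) and the warning is emitted.
soup = BeautifulSoup("<p>Hello</p>")
```

Passing the parser name positionally fills the `features` argument, which is the path that reaches `builder_registry.lookup()` in the constructor above.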

-------------------------------------------------------------------------------- /bs4/tests/test_soup.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Tests of Beautiful Soup as a whole."""
3 | 
4 | from pdb import set_trace
5 | import logging
6 | import unittest
7 | import sys
8 | import tempfile
9 | 
10 | from bs4 import (
11 |     BeautifulSoup,
12 |     BeautifulStoneSoup,
13 | )
14 | from bs4.element import (
15 |     CharsetMetaAttributeValue,
16 |     ContentMetaAttributeValue,
17 |     SoupStrainer,
18 |     NamespacedAttribute,
19 |     )
20 | import bs4.dammit
21 | from bs4.dammit import (
22 |     EntitySubstitution,
23 |     UnicodeDammit,
24 |     EncodingDetector,
25 | )
26 | from bs4.testing import (
27 |     SoupTest,
28 |     skipIf,
29 | )
30 | import warnings
31 | 
32 | try:
33 |     from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
34 |     LXML_PRESENT = True
35 | except ImportError, e:
36 |     LXML_PRESENT = False
37 | 
38 | PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
39 | PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
40 | 
41 | class TestConstructor(SoupTest):
42 | 
43 |     def test_short_unicode_input(self):
44 |         data = u"<html><h1>éé</h1></html>"
45 |         soup = self.soup(data)
46 |         self.assertEqual(u"éé", soup.h1.string)
47 | 
48 |     def test_embedded_null(self):
49 |         data = u"<html><h1>foo\0bar</h1></html>"
50 |         soup = self.soup(data)
51 |         self.assertEqual(u"foo\0bar", soup.h1.string)
52 | 

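The next test drives the `exclude_encodings` argument handled by `prepare_markup()` in the constructor above; here is the same call from user code — a hedged sketch (the input bytes and the Windows-1252 fallback mirror the test that follows):

```python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

utf8_data = u"Räksmörgås".encode("utf-8")

# Ruling out UTF-8 forces the encoding detector to pick another
# candidate; on this input it falls back to Windows-1252.
soup = BeautifulSoup(utf8_data, "html.parser", exclude_encodings=["utf-8"])
print(soup.original_encoding)  # windows-1252
```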
53 |     def test_exclude_encodings(self):
54 |         utf8_data = u"Räksmörgås".encode("utf-8")
55 |         soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
56 |         self.assertEqual("windows-1252", soup.original_encoding)
57 | 
58 | 
59 | class TestWarnings(SoupTest):
60 | 
61 |     def _assert_no_parser_specified(self, s, is_there=True):
62 |         v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
63 |         self.assertTrue(v)
64 | 
65 |     def test_warning_if_no_parser_specified(self):
66 |         with warnings.catch_warnings(record=True) as w:
67 |             soup = self.soup("<a><b></b></a>")
68 |         msg = str(w[0].message)
69 |         self._assert_no_parser_specified(msg)
70 | 
71 |     def test_warning_if_parser_specified_too_vague(self):
72 |         with warnings.catch_warnings(record=True) as w:
73 |             soup = self.soup("<a><b></b></a>", "html")
74 |         msg = str(w[0].message)
75 |         self._assert_no_parser_specified(msg)
76 | 
77 |     def test_no_warning_if_explicit_parser_specified(self):
78 |         with warnings.catch_warnings(record=True) as w:
79 |             soup = self.soup("<a><b></b></a>", "html.parser")
80 |         self.assertEqual([], w)
81 | 
82 |     def test_parseOnlyThese_renamed_to_parse_only(self):
83 |         with warnings.catch_warnings(record=True) as w:
84 |             soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
85 |         msg = str(w[0].message)
86 |         self.assertTrue("parseOnlyThese" in msg)
87 |         self.assertTrue("parse_only" in msg)
88 |         self.assertEqual(b"<b></b>", soup.encode())
89 | 
90 |     def test_fromEncoding_renamed_to_from_encoding(self):
91 |         with warnings.catch_warnings(record=True) as w:
92 |             utf8 = b"\xc3\xa9"
93 |             soup = self.soup(utf8, fromEncoding="utf8")
94 |         msg = str(w[0].message)
95 |         self.assertTrue("fromEncoding" in msg)
96 |         self.assertTrue("from_encoding" in msg)
97 |         self.assertEqual("utf8", soup.original_encoding)
98 | 
99 |     def test_unrecognized_keyword_argument(self):
100 |         self.assertRaises(
101 |             TypeError, self.soup, "<a>", no_such_argument=True)
102 | 
103 | class TestNonMarkupWarnings(SoupTest):  # was a second "TestWarnings", which shadowed the class above
104 | 
105 |     def test_disk_file_warning(self):
106 |         filehandle = tempfile.NamedTemporaryFile()
107 |         filename = filehandle.name
108 |         try:
109 |             with warnings.catch_warnings(record=True) as w:
110 |                 soup = self.soup(filename)
111 |             msg = str(w[0].message)
112 |             self.assertTrue("looks like a filename" in msg)
113 |         finally:
114 |             filehandle.close()
115 | 
116 |         # The file no longer exists, so Beautiful Soup will no longer issue the warning.
117 |         with warnings.catch_warnings(record=True) as w:
118 |             soup = self.soup(filename)
119 |         self.assertEqual(0, len(w))
120 | 
121 |     def test_url_warning(self):
122 |         with warnings.catch_warnings(record=True) as w:
123 |             soup = self.soup("http://www.crummy.com/")
124 |         msg = str(w[0].message)
125 |         self.assertTrue("looks like a URL" in msg)
126 | 
127 |         with warnings.catch_warnings(record=True) as w:
128 |             soup = self.soup("http://www.crummy.com/ is great")
129 |         self.assertEqual(0, len(w))
130 | 
131 | class TestSelectiveParsing(SoupTest):
132 | 
133 |     def test_parse_with_soupstrainer(self):
134 |         markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
135 |         strainer = SoupStrainer("b")
136 |         soup = self.soup(markup, parse_only=strainer)
137 |         self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
138 | 
139 | 
140 | class TestEntitySubstitution(unittest.TestCase):
141 |     """Standalone tests of the EntitySubstitution class."""
142 |     def setUp(self):
143 |         self.sub = EntitySubstitution
144 | 
145 |     def test_simple_html_substitution(self):
146 |         # Unicode characters corresponding to named HTML entities
147 |         # are substituted, and no others.
148 |         s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
149 |         self.assertEqual(self.sub.substitute_html(s),
150 |                          u"foo&forall;\N{SNOWMAN}&otilde;bar")
151 | 
152 |     def test_smart_quote_substitution(self):
153 |         # MS smart quotes are a common source of frustration, so we
154 |         # give them a special test.
155 |         quotes = b"\x91\x92foo\x93\x94"
156 |         dammit = UnicodeDammit(quotes)
157 |         self.assertEqual(self.sub.substitute_html(dammit.markup),
158 |                          "&lsquo;&rsquo;foo&ldquo;&rdquo;")
159 | 
160 |     def test_xml_conversion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
161 |         s = 'Welcome to "my bar"'
162 |         self.assertEqual(self.sub.substitute_xml(s, False), s)
163 | 
164 |     def test_xml_attribute_quoting_normally_uses_double_quotes(self):
165 |         self.assertEqual(self.sub.substitute_xml("Welcome", True),
166 |                          '"Welcome"')
167 |         self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
168 |                          '"Bob\'s Bar"')
169 | 
170 |     def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
171 |         s = 'Welcome to "my bar"'
172 |         self.assertEqual(self.sub.substitute_xml(s, True),
173 |                          "'Welcome to \"my bar\"'")
174 | 
175 |     def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
176 |         s = 'Welcome to "Bob\'s Bar"'
177 |         self.assertEqual(
178 |             self.sub.substitute_xml(s, True),
179 |             '"Welcome to &quot;Bob\'s Bar&quot;"')
180 | 
181 |     def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
182 |         quoted = 'Welcome to "Bob\'s Bar"'
183 |         self.assertEqual(self.sub.substitute_xml(quoted), quoted)
184 | 
185 |     def test_xml_quoting_handles_angle_brackets(self):
186 |         self.assertEqual(
187 |             self.sub.substitute_xml("foo<bar>"),
188 |             "foo&lt;bar&gt;")
189 | 
190 |     def test_xml_quoting_handles_ampersands(self):
191 |         self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
192 | 
193 |     def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
194 |         self.assertEqual(
195 |             self.sub.substitute_xml("&Aacute;T&T"),
196 |             "&amp;Aacute;T&amp;T")
197 | 
198 |     def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
199 |         self.assertEqual(
200 |             self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
201 |             "&Aacute;T&amp;T")
202 | 
203 |     def test_quotes_not_html_substituted(self):
204 |         """There's no need to do this except inside attribute values."""
205 |         text = 'Bob\'s "bar"'
206 |         self.assertEqual(self.sub.substitute_html(text), text)
207 | 
208 | 
209 | class TestEncodingConversion(SoupTest):
210 |     # Test Beautiful Soup's ability to decode and encode from various
211 |     # encodings.
212 | 
213 |     def setUp(self):
214 |         super(TestEncodingConversion, self).setUp()
215 |         self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
216 |         self.utf8_data = self.unicode_data.encode("utf-8")
217 |         # Just so you know what it looks like.
218 |         self.assertEqual(
219 |             self.utf8_data,
220 |             b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
221 | 
222 |     def test_ascii_in_unicode_out(self):
223 |         # ASCII input is converted to Unicode. The original_encoding
224 |         # attribute is set to 'utf-8', a superset of ASCII.
225 |         chardet = bs4.dammit.chardet_dammit
226 |         logging.disable(logging.WARNING)
227 |         try:
228 |             def noop(str):
229 |                 return None
230 |             # Disable chardet, which will realize that the ASCII is ASCII.
231 | bs4.dammit.chardet_dammit = noop 232 | ascii = b"a" 233 | soup_from_ascii = self.soup(ascii) 234 | unicode_output = soup_from_ascii.decode() 235 | self.assertTrue(isinstance(unicode_output, unicode)) 236 | self.assertEqual(unicode_output, self.document_for(ascii.decode())) 237 | self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") 238 | finally: 239 | logging.disable(logging.NOTSET) 240 | bs4.dammit.chardet_dammit = chardet 241 | 242 | def test_unicode_in_unicode_out(self): 243 | # Unicode input is left alone. The original_encoding attribute 244 | # is not set. 245 | soup_from_unicode = self.soup(self.unicode_data) 246 | self.assertEqual(soup_from_unicode.decode(), self.unicode_data) 247 | self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') 248 | self.assertEqual(soup_from_unicode.original_encoding, None) 249 | 250 | def test_utf8_in_unicode_out(self): 251 | # UTF-8 input is converted to Unicode. The original_encoding 252 | # attribute is set. 253 | soup_from_utf8 = self.soup(self.utf8_data) 254 | self.assertEqual(soup_from_utf8.decode(), self.unicode_data) 255 | self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') 256 | 257 | def test_utf8_out(self): 258 | # The internal data structures can be encoded as UTF-8. 259 | soup_from_unicode = self.soup(self.unicode_data) 260 | self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data) 261 | 262 | @skipIf( 263 | PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, 264 | "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") 265 | def test_attribute_name_containing_unicode_characters(self): 266 | markup = u'
' 267 | self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) 268 | 269 | class TestUnicodeDammit(unittest.TestCase): 270 | """Standalone tests of UnicodeDammit.""" 271 | 272 | def test_unicode_input(self): 273 | markup = u"I'm already Unicode! \N{SNOWMAN}" 274 | dammit = UnicodeDammit(markup) 275 | self.assertEqual(dammit.unicode_markup, markup) 276 | 277 | def test_smart_quotes_to_unicode(self): 278 | markup = b"\x91\x92\x93\x94" 279 | dammit = UnicodeDammit(markup) 280 | self.assertEqual( 281 | dammit.unicode_markup, u"\u2018\u2019\u201c\u201d") 282 | 283 | def test_smart_quotes_to_xml_entities(self): 284 | markup = b"\x91\x92\x93\x94" 285 | dammit = UnicodeDammit(markup, smart_quotes_to="xml") 286 | self.assertEqual( 287 | dammit.unicode_markup, "‘’“”") 288 | 289 | def test_smart_quotes_to_html_entities(self): 290 | markup = b"\x91\x92\x93\x94" 291 | dammit = UnicodeDammit(markup, smart_quotes_to="html") 292 | self.assertEqual( 293 | dammit.unicode_markup, "‘’“”") 294 | 295 | def test_smart_quotes_to_ascii(self): 296 | markup = b"\x91\x92\x93\x94" 297 | dammit = UnicodeDammit(markup, smart_quotes_to="ascii") 298 | self.assertEqual( 299 | dammit.unicode_markup, """''""""") 300 | 301 | def test_detect_utf8(self): 302 | utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83" 303 | dammit = UnicodeDammit(utf8) 304 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 305 | self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}') 306 | 307 | 308 | def test_convert_hebrew(self): 309 | hebrew = b"\xed\xe5\xec\xf9" 310 | dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) 311 | self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') 312 | self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') 313 | 314 | def test_dont_see_smart_quotes_where_there_are_none(self): 315 | utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" 316 | dammit = UnicodeDammit(utf_8) 317 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 318 | self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) 319 | 320 | def test_ignore_inappropriate_codecs(self): 321 | utf8_data = u"Räksmörgås".encode("utf-8") 322 | dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) 323 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 324 | 325 | def test_ignore_invalid_codecs(self): 326 | utf8_data = u"Räksmörgås".encode("utf-8") 327 | for bad_encoding in ['.utf8', '...', 'utF---16.!']: 328 | dammit = UnicodeDammit(utf8_data, [bad_encoding]) 329 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 330 | 331 | def test_exclude_encodings(self): 332 | # This is UTF-8. 333 | utf8_data = u"Räksmörgås".encode("utf-8") 334 | 335 | # But if we exclude UTF-8 from consideration, the guess is 336 | # Windows-1252. 337 | dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"]) 338 | self.assertEqual(dammit.original_encoding.lower(), 'windows-1252') 339 | 340 | # And if we exclude that, there is no valid guess at all. 
341 | dammit = UnicodeDammit( 342 | utf8_data, exclude_encodings=["utf-8", "windows-1252"]) 343 | self.assertEqual(dammit.original_encoding, None) 344 | 345 | def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self): 346 | detected = EncodingDetector( 347 | b'') 348 | encodings = list(detected.encodings) 349 | assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings 350 | 351 | def test_detect_html5_style_meta_tag(self): 352 | 353 | for data in ( 354 | b'', 355 | b"", 356 | b"", 357 | b""): 358 | dammit = UnicodeDammit(data, is_html=True) 359 | self.assertEqual( 360 | "euc-jp", dammit.original_encoding) 361 | 362 | def test_last_ditch_entity_replacement(self): 363 | # This is a UTF-8 document that contains bytestrings 364 | # completely incompatible with UTF-8 (ie. encoded with some other 365 | # encoding). 366 | # 367 | # Since there is no consistent encoding for the document, 368 | # Unicode, Dammit will eventually encode the document as UTF-8 369 | # and encode the incompatible characters as REPLACEMENT 370 | # CHARACTER. 371 | # 372 | # If chardet is installed, it will detect that the document 373 | # can be converted into ISO-8859-1 without errors. This happens 374 | # to be the wrong encoding, but it is a consistent encoding, so the 375 | # code we're testing here won't run. 376 | # 377 | # So we temporarily disable chardet if it's present. 378 | doc = b"""\357\273\277 379 | \330\250\330\252\330\261 380 | \310\322\321\220\312\321\355\344""" 381 | chardet = bs4.dammit.chardet_dammit 382 | logging.disable(logging.WARNING) 383 | try: 384 | def noop(str): 385 | return None 386 | bs4.dammit.chardet_dammit = noop 387 | dammit = UnicodeDammit(doc) 388 | self.assertEqual(True, dammit.contains_replacement_characters) 389 | self.assertTrue(u"\ufffd" in dammit.unicode_markup) 390 | 391 | soup = BeautifulSoup(doc, "html.parser") 392 | self.assertTrue(soup.contains_replacement_characters) 393 | finally: 394 | logging.disable(logging.NOTSET) 395 | bs4.dammit.chardet_dammit = chardet 396 | 397 | def test_byte_order_mark_removed(self): 398 | # A document written in UTF-16LE will have its byte order marker stripped. 399 | data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' 400 | dammit = UnicodeDammit(data) 401 | self.assertEqual(u"áé", dammit.unicode_markup) 402 | self.assertEqual("utf-16le", dammit.original_encoding) 403 | 404 | def test_detwingle(self): 405 | # Here's a UTF8 document. 406 | utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") 407 | 408 | # Here's a Windows-1252 document. 409 | windows_1252 = ( 410 | u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" 411 | u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") 412 | 413 | # Through some unholy alchemy, they've been stuck together. 414 | doc = utf8 + windows_1252 + utf8 415 | 416 | # The document can't be turned into UTF-8: 417 | self.assertRaises(UnicodeDecodeError, doc.decode, "utf8") 418 | 419 | # Unicode, Dammit thinks the whole document is Windows-1252, 420 | # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃" 421 | 422 | # But if we run it through fix_embedded_windows_1252, it's fixed: 423 | 424 | fixed = UnicodeDammit.detwingle(doc) 425 | self.assertEqual( 426 | u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) 427 | 428 | def test_detwingle_ignores_multibyte_characters(self): 429 | # Each of these characters has a UTF-8 representation ending 430 | # in \x93. \x93 is a smart quote if interpreted as 431 | # Windows-1252. 
But our code knows to skip over multibyte
432 |         # UTF-8 characters, so they'll survive the process unscathed.
433 |         for tricky_unicode_char in (
434 |             u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
435 |             u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
436 |             u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
437 |             ):
438 |             input = tricky_unicode_char.encode("utf8")
439 |             self.assertTrue(input.endswith(b'\x93'))
440 |             output = UnicodeDammit.detwingle(input)
441 |             self.assertEqual(output, input)
442 | 
443 | class TestNamespacedAttribute(SoupTest):
444 | 
445 |     def test_name_may_be_none(self):
446 |         a = NamespacedAttribute("xmlns", None)
447 |         self.assertEqual(a, "xmlns")
448 | 
449 |     def test_attribute_is_equivalent_to_colon_separated_string(self):
450 |         a = NamespacedAttribute("a", "b")
451 |         self.assertEqual("a:b", a)
452 | 
453 |     def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
454 |         a = NamespacedAttribute("a", "b", "c")
455 |         b = NamespacedAttribute("a", "b", "c")
456 |         self.assertEqual(a, b)
457 | 
458 |         # The actual namespace is not considered.
459 |         c = NamespacedAttribute("a", "b", None)
460 |         self.assertEqual(a, c)
461 | 
462 |         # But name and prefix are important.
463 |         d = NamespacedAttribute("a", "z", "c")
464 |         self.assertNotEqual(a, d)
465 | 
466 |         e = NamespacedAttribute("z", "b", "c")
467 |         self.assertNotEqual(a, e)
468 | 
469 | 
470 | class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
471 | 
472 |     def test_charset_meta_attribute_value(self):  # renamed: was a duplicate of the test below
473 |         value = CharsetMetaAttributeValue("euc-jp")
474 |         self.assertEqual("euc-jp", value)
475 |         self.assertEqual("euc-jp", value.original_value)
476 |         self.assertEqual("utf8", value.encode("utf8"))
477 | 
478 | 
479 |     def test_content_meta_attribute_value(self):
480 |         value = ContentMetaAttributeValue("text/html; charset=euc-jp")
481 |         self.assertEqual("text/html; charset=euc-jp", value)
482 |         self.assertEqual("text/html; charset=euc-jp", value.original_value)
483 |         self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
484 | 
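That is the end of test_soup.py. As a quick recap of the two helpers it exercises most, here is a hedged sketch of `UnicodeDammit` and `EntitySubstitution` in isolation (the byte string is illustrative; the detected encoding can vary with the installed chardet):

```python
from bs4.dammit import EntitySubstitution, UnicodeDammit

# Detect the encoding of raw bytes and decode MS smart quotes
# to their Unicode equivalents.
dammit = UnicodeDammit(b"\x91\x92foo\x93\x94")
print(dammit.original_encoding)  # e.g. windows-1252
print(dammit.unicode_markup)     # u'\u2018\u2019foo\u201c\u201d'

# substitute_html() replaces only characters that have named
# HTML entities; other text passes through untouched.
print(EntitySubstitution.substitute_html(u"AT&T"))  # AT&amp;T
```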
-------------------------------------------------------------------------------- /workflow/web.py: --------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | #
3 | # Copyright (c) 2014 Dean Jackson <deanishe@deanishe.net>
4 | #
5 | # MIT Licence. See http://opensource.org/licenses/MIT
6 | #
7 | # Created on 2014-02-15
8 | #
9 | 
10 | """
11 | A lightweight HTTP library with a requests-like interface.
12 | """
13 | 
14 | from __future__ import print_function
15 | 
16 | import codecs
17 | import json
18 | import mimetypes
19 | import os
20 | import random
21 | import re
22 | import socket
23 | import string
24 | import unicodedata
25 | import urllib
26 | import urllib2
27 | import urlparse
28 | import zlib
29 | 
30 | 
31 | USER_AGENT = u'Alfred-Workflow/1.17 (+http://www.deanishe.net/alfred-workflow)'
32 | 
33 | # Valid characters for multipart form data boundaries
34 | BOUNDARY_CHARS = string.digits + string.ascii_letters
35 | 
36 | # HTTP response codes
37 | RESPONSES = {
38 |     100: 'Continue',
39 |     101: 'Switching Protocols',
40 |     200: 'OK',
41 |     201: 'Created',
42 |     202: 'Accepted',
43 |     203: 'Non-Authoritative Information',
44 |     204: 'No Content',
45 |     205: 'Reset Content',
46 |     206: 'Partial Content',
47 |     300: 'Multiple Choices',
48 |     301: 'Moved Permanently',
49 |     302: 'Found',
50 |     303: 'See Other',
51 |     304: 'Not Modified',
52 |     305: 'Use Proxy',
53 |     307: 'Temporary Redirect',
54 |     400: 'Bad Request',
55 |     401: 'Unauthorized',
56 |     402: 'Payment Required',
57 |     403: 'Forbidden',
58 |     404: 'Not Found',
59 |     405: 'Method Not Allowed',
60 |     406: 'Not Acceptable',
61 |     407: 'Proxy Authentication Required',
62 |     408: 'Request Timeout',
63 |     409: 'Conflict',
64 |     410: 'Gone',
65 |     411: 'Length Required',
66 |     412: 'Precondition Failed',
67 |     413: 'Request Entity Too Large',
68 |     414: 'Request-URI Too Long',
69 |     415: 'Unsupported Media Type',
70 |     416: 'Requested Range Not Satisfiable',
71 |     417: 'Expectation Failed',
72 |     500: 'Internal Server Error',
73 |     501: 'Not Implemented',
74 |     502: 'Bad Gateway',
75 |     503: 'Service Unavailable',
76 |     504: 'Gateway Timeout',
77 |     505: 'HTTP Version Not Supported'
78 | }
79 | 
80 | 
81 | def str_dict(dic):
82 |     """Convert keys and values in ``dic`` into UTF-8-encoded :class:`str`
83 | 
84 |     :param dic: :class:`dict` of Unicode strings
85 |     :returns: :class:`dict`
86 | 
87 |     """
88 |     if isinstance(dic, CaseInsensitiveDictionary):
89 |         dic2 = CaseInsensitiveDictionary()
90 |     else:
91 |         dic2 = {}
92 |     for k, v in dic.items():
93 |         if isinstance(k, unicode):
94 |             k = k.encode('utf-8')
95 |         if isinstance(v, unicode):
96 |             v = v.encode('utf-8')
97 |         dic2[k] = v
98 |     return dic2
99 | 
100 | 
101 | class NoRedirectHandler(urllib2.HTTPRedirectHandler):
102 |     """Prevent redirections"""
103 | 
104 |     def redirect_request(self, *args):
105 |         return None
106 | 
107 | 
108 | # Adapted from https://gist.github.com/babakness/3901174
109 | class CaseInsensitiveDictionary(dict):
110 |     """
111 |     Dictionary that enables case insensitive searching while preserving
112 |     case sensitivity when keys are listed, i.e., via keys() or items() methods.
113 | 
114 |     Works by storing a lowercase version of the key as the new key and
115 |     stores the original key-value pair as the key's value
116 |     (values become dictionaries).
117 | 118 | """ 119 | 120 | def __init__(self, initval=None): 121 | 122 | if isinstance(initval, dict): 123 | for key, value in initval.iteritems(): 124 | self.__setitem__(key, value) 125 | 126 | elif isinstance(initval, list): 127 | for (key, value) in initval: 128 | self.__setitem__(key, value) 129 | 130 | def __contains__(self, key): 131 | return dict.__contains__(self, key.lower()) 132 | 133 | def __getitem__(self, key): 134 | return dict.__getitem__(self, key.lower())['val'] 135 | 136 | def __setitem__(self, key, value): 137 | return dict.__setitem__(self, key.lower(), {'key': key, 'val': value}) 138 | 139 | def get(self, key, default=None): 140 | try: 141 | v = dict.__getitem__(self, key.lower()) 142 | except KeyError: 143 | return default 144 | else: 145 | return v['val'] 146 | 147 | def update(self, other): 148 | for k, v in other.items(): 149 | self[k] = v 150 | 151 | def items(self): 152 | return [(v['key'], v['val']) for v in dict.itervalues(self)] 153 | 154 | def keys(self): 155 | return [v['key'] for v in dict.itervalues(self)] 156 | 157 | def values(self): 158 | return [v['val'] for v in dict.itervalues(self)] 159 | 160 | def iteritems(self): 161 | for v in dict.itervalues(self): 162 | yield v['key'], v['val'] 163 | 164 | def iterkeys(self): 165 | for v in dict.itervalues(self): 166 | yield v['key'] 167 | 168 | def itervalues(self): 169 | for v in dict.itervalues(self): 170 | yield v['val'] 171 | 172 | 173 | class Response(object): 174 | """ 175 | Returned by :func:`request` / :func:`get` / :func:`post` functions. 176 | 177 | A simplified version of the ``Response`` object in the ``requests`` library. 178 | 179 | >>> r = request('http://www.google.com') 180 | >>> r.status_code 181 | 200 182 | >>> r.encoding 183 | ISO-8859-1 184 | >>> r.content # bytes 185 | ... 186 | >>> r.text # unicode, decoded according to charset in HTTP header/meta tag 187 | u' ...' 188 | >>> r.json() # content parsed as JSON 189 | 190 | """ 191 | 192 | def __init__(self, request, stream=False): 193 | """Call `request` with :mod:`urllib2` and process results. 194 | 195 | :param request: :class:`urllib2.Request` instance 196 | :param stream: Whether to stream response or retrieve it all at once 197 | :type stream: ``bool`` 198 | 199 | """ 200 | 201 | self.request = request 202 | self._stream = stream 203 | self.url = None 204 | self.raw = None 205 | self._encoding = None 206 | self.error = None 207 | self.status_code = None 208 | self.reason = None 209 | self.headers = CaseInsensitiveDictionary() 210 | self._content = None 211 | self._content_loaded = False 212 | self._gzipped = False 213 | 214 | # Execute query 215 | try: 216 | self.raw = urllib2.urlopen(request) 217 | except urllib2.HTTPError as err: 218 | self.error = err 219 | try: 220 | self.url = err.geturl() 221 | # sometimes (e.g. when authentication fails) 222 | # urllib can't get a URL from an HTTPError 223 | # This behaviour changes across Python versions, 224 | # so no test cover (it isn't important). 
225 | except AttributeError: # pragma: no cover 226 | pass 227 | self.status_code = err.code 228 | else: 229 | self.status_code = self.raw.getcode() 230 | self.url = self.raw.geturl() 231 | self.reason = RESPONSES.get(self.status_code) 232 | 233 | # Parse additional info if request succeeded 234 | if not self.error: 235 | headers = self.raw.info() 236 | self.transfer_encoding = headers.getencoding() 237 | self.mimetype = headers.gettype() 238 | for key in headers.keys(): 239 | self.headers[key.lower()] = headers.get(key) 240 | 241 | # Is content gzipped? 242 | # Transfer-Encoding appears to not be used in the wild 243 | # (contrary to the HTTP standard), but no harm in testing 244 | # for it 245 | if ('gzip' in headers.get('content-encoding', '') or 246 | 'gzip' in headers.get('transfer-encoding', '')): 247 | self._gzipped = True 248 | 249 | @property 250 | def stream(self): 251 | return self._stream 252 | 253 | @stream.setter 254 | def stream(self, value): 255 | if self._content_loaded: 256 | raise RuntimeError("`content` has already been read from " 257 | "this Response.") 258 | 259 | self._stream = value 260 | 261 | def json(self): 262 | """Decode response contents as JSON. 263 | 264 | :returns: object decoded from JSON 265 | :rtype: :class:`list` / :class:`dict` 266 | 267 | """ 268 | 269 | return json.loads(self.content, self.encoding or 'utf-8') 270 | 271 | @property 272 | def encoding(self): 273 | """Text encoding of document or ``None`` 274 | 275 | :returns: :class:`str` or ``None`` 276 | 277 | """ 278 | 279 | if not self._encoding: 280 | self._encoding = self._get_encoding() 281 | 282 | return self._encoding 283 | 284 | @property 285 | def content(self): 286 | """Raw content of response (i.e. bytes) 287 | 288 | :returns: Body of HTTP response 289 | :rtype: :class:`str` 290 | 291 | """ 292 | 293 | if not self._content: 294 | 295 | # Decompress gzipped content 296 | if self._gzipped: 297 | decoder = zlib.decompressobj(16 + zlib.MAX_WBITS) 298 | self._content = decoder.decompress(self.raw.read()) 299 | 300 | else: 301 | self._content = self.raw.read() 302 | 303 | self._content_loaded = True 304 | 305 | return self._content 306 | 307 | @property 308 | def text(self): 309 | """Unicode-decoded content of response body. 310 | 311 | If no encoding can be determined from HTTP headers or the content 312 | itself, the encoded response body will be returned instead. 313 | 314 | :returns: Body of HTTP response 315 | :rtype: :class:`unicode` or :class:`str` 316 | 317 | """ 318 | 319 | if self.encoding: 320 | return unicodedata.normalize('NFC', unicode(self.content, 321 | self.encoding)) 322 | return self.content 323 | 324 | def iter_content(self, chunk_size=4096, decode_unicode=False): 325 | """Iterate over response data. 326 | 327 | .. 
versionadded:: 1.6 328 | 329 | :param chunk_size: Number of bytes to read into memory 330 | :type chunk_size: ``int`` 331 | :param decode_unicode: Decode to Unicode using detected encoding 332 | :type decode_unicode: ``Boolean`` 333 | :returns: iterator 334 | 335 | """ 336 | 337 | if not self.stream: 338 | raise RuntimeError("You cannot call `iter_content` on a " 339 | "Response unless you passed `stream=True`" 340 | " to `get()`/`post()`/`request()`.") 341 | 342 | if self._content_loaded: 343 | raise RuntimeError( 344 | "`content` has already been read from this Response.") 345 | 346 | def decode_stream(iterator, r): 347 | 348 | decoder = codecs.getincrementaldecoder(r.encoding)(errors='replace') 349 | 350 | for chunk in iterator: 351 | data = decoder.decode(chunk) 352 | if data: 353 | yield data 354 | 355 | data = decoder.decode(b'', final=True) 356 | if data: # pragma: no cover 357 | yield data 358 | 359 | def generate(): 360 | 361 | if self._gzipped: 362 | decoder = zlib.decompressobj(16 + zlib.MAX_WBITS) 363 | 364 | while True: 365 | chunk = self.raw.read(chunk_size) 366 | if not chunk: 367 | break 368 | 369 | if self._gzipped: 370 | chunk = decoder.decompress(chunk) 371 | 372 | yield chunk 373 | 374 | chunks = generate() 375 | 376 | if decode_unicode and self.encoding: 377 | chunks = decode_stream(chunks, self) 378 | 379 | return chunks 380 | 381 | def save_to_path(self, filepath): 382 | """Save retrieved data to file at ``filepath`` 383 | 384 | .. versionadded: 1.9.6 385 | 386 | :param filepath: Path to save retrieved data. 387 | 388 | """ 389 | 390 | filepath = os.path.abspath(filepath) 391 | dirname = os.path.dirname(filepath) 392 | if not os.path.exists(dirname): 393 | os.makedirs(dirname) 394 | 395 | self.stream = True 396 | 397 | with open(filepath, 'wb') as fileobj: 398 | for data in self.iter_content(): 399 | fileobj.write(data) 400 | 401 | def raise_for_status(self): 402 | """Raise stored error if one occurred. 403 | 404 | error will be instance of :class:`urllib2.HTTPError` 405 | """ 406 | 407 | if self.error is not None: 408 | raise self.error 409 | return 410 | 411 | def _get_encoding(self): 412 | """Get encoding from HTTP headers or content. 
413 | 414 | :returns: encoding or `None` 415 | :rtype: ``unicode`` or ``None`` 416 | 417 | """ 418 | 419 | headers = self.raw.info() 420 | encoding = None 421 | 422 | if headers.getparam('charset'): 423 | encoding = headers.getparam('charset') 424 | 425 | # HTTP Content-Type header 426 | for param in headers.getplist(): 427 | if param.startswith('charset='): 428 | encoding = param[8:] 429 | break 430 | 431 | if not self.stream: # Try sniffing response content 432 | # Encoding declared in document should override HTTP headers 433 | if self.mimetype == 'text/html': # sniff HTML headers 434 | m = re.search("""""", 435 | self.content) 436 | if m: 437 | encoding = m.group(1) 438 | print('sniffed HTML encoding=%r' % encoding) 439 | 440 | elif ((self.mimetype.startswith('application/') or 441 | self.mimetype.startswith('text/')) and 442 | 'xml' in self.mimetype): 443 | m = re.search("""]*\?>""", 444 | self.content) 445 | if m: 446 | encoding = m.group(1) 447 | 448 | # Format defaults 449 | if self.mimetype == 'application/json' and not encoding: 450 | # The default encoding for JSON 451 | encoding = 'utf-8' 452 | 453 | elif self.mimetype == 'application/xml' and not encoding: 454 | # The default for 'application/xml' 455 | encoding = 'utf-8' 456 | 457 | if encoding: 458 | encoding = encoding.lower() 459 | 460 | return encoding 461 | 462 | 463 | def request(method, url, params=None, data=None, headers=None, cookies=None, 464 | files=None, auth=None, timeout=60, allow_redirects=False, 465 | stream=False): 466 | """Initiate an HTTP(S) request. Returns :class:`Response` object. 467 | 468 | :param method: 'GET' or 'POST' 469 | :type method: ``unicode`` 470 | :param url: URL to open 471 | :type url: ``unicode`` 472 | :param params: mapping of URL parameters 473 | :type params: :class:`dict` 474 | :param data: mapping of form data ``{'field_name': 'value'}`` or 475 | :class:`str` 476 | :type data: :class:`dict` or :class:`str` 477 | :param headers: HTTP headers 478 | :type headers: :class:`dict` 479 | :param cookies: cookies to send to server 480 | :type cookies: :class:`dict` 481 | :param files: files to upload (see below). 482 | :type files: :class:`dict` 483 | :param auth: username, password 484 | :type auth: ``tuple`` 485 | :param timeout: connection timeout limit in seconds 486 | :type timeout: ``int`` 487 | :param allow_redirects: follow redirections 488 | :type allow_redirects: ``Boolean`` 489 | :param stream: Stream content instead of fetching it all at once. 490 | :type stream: ``bool`` 491 | :returns: :class:`Response` object 492 | 493 | 494 | The ``files`` argument is a dictionary:: 495 | 496 | {'fieldname' : { 'filename': 'blah.txt', 497 | 'content': '', 498 | 'mimetype': 'text/plain'} 499 | } 500 | 501 | * ``fieldname`` is the name of the field in the HTML form. 502 | * ``mimetype`` is optional. If not provided, :mod:`mimetypes` will 503 | be used to guess the mimetype, or ``application/octet-stream`` 504 | will be used. 
505 | 506 | """ 507 | 508 | # TODO: cookies 509 | socket.setdefaulttimeout(timeout) 510 | 511 | # Default handlers 512 | openers = [] 513 | 514 | if not allow_redirects: 515 | openers.append(NoRedirectHandler()) 516 | 517 | if auth is not None: # Add authorisation handler 518 | username, password = auth 519 | password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() 520 | password_manager.add_password(None, url, username, password) 521 | auth_manager = urllib2.HTTPBasicAuthHandler(password_manager) 522 | openers.append(auth_manager) 523 | 524 | # Install our custom chain of openers 525 | opener = urllib2.build_opener(*openers) 526 | urllib2.install_opener(opener) 527 | 528 | if not headers: 529 | headers = CaseInsensitiveDictionary() 530 | else: 531 | headers = CaseInsensitiveDictionary(headers) 532 | 533 | if 'user-agent' not in headers: 534 | headers['user-agent'] = USER_AGENT 535 | 536 | # Accept gzip-encoded content 537 | encodings = [s.strip() for s in 538 | headers.get('accept-encoding', '').split(',')] 539 | if 'gzip' not in encodings: 540 | encodings.append('gzip') 541 | 542 | headers['accept-encoding'] = ', '.join(encodings) 543 | 544 | # Force POST by providing an empty data string 545 | if method == 'POST' and not data: 546 | data = '' 547 | 548 | if files: 549 | if not data: 550 | data = {} 551 | new_headers, data = encode_multipart_formdata(data, files) 552 | headers.update(new_headers) 553 | elif data and isinstance(data, dict): 554 | data = urllib.urlencode(str_dict(data)) 555 | 556 | # Make sure everything is encoded text 557 | headers = str_dict(headers) 558 | 559 | if isinstance(url, unicode): 560 | url = url.encode('utf-8') 561 | 562 | if params: # GET args (POST args are handled in encode_multipart_formdata) 563 | 564 | scheme, netloc, path, query, fragment = urlparse.urlsplit(url) 565 | 566 | if query: # Combine query string and `params` 567 | url_params = urlparse.parse_qs(query) 568 | # `params` take precedence over URL query string 569 | url_params.update(params) 570 | params = url_params 571 | 572 | query = urllib.urlencode(str_dict(params), doseq=True) 573 | url = urlparse.urlunsplit((scheme, netloc, path, query, fragment)) 574 | 575 | req = urllib2.Request(url, data, headers) 576 | return Response(req, stream) 577 | 578 | 579 | def get(url, params=None, headers=None, cookies=None, auth=None, 580 | timeout=60, allow_redirects=True, stream=False): 581 | """Initiate a GET request. Arguments as for :func:`request`. 582 | 583 | :returns: :class:`Response` instance 584 | 585 | """ 586 | 587 | return request('GET', url, params, headers=headers, cookies=cookies, 588 | auth=auth, timeout=timeout, allow_redirects=allow_redirects, 589 | stream=stream) 590 | 591 | 592 | def post(url, params=None, data=None, headers=None, cookies=None, files=None, 593 | auth=None, timeout=60, allow_redirects=False, stream=False): 594 | """Initiate a POST request. Arguments as for :func:`request`. 595 | 596 | :returns: :class:`Response` instance 597 | 598 | """ 599 | return request('POST', url, params, data, headers, cookies, files, auth, 600 | timeout, allow_redirects, stream) 601 | 602 | 603 | def encode_multipart_formdata(fields, files): 604 | """Encode form data (``fields``) and ``files`` for POST request. 605 | 606 | :param fields: mapping of ``{name : value}`` pairs for normal form fields. 607 | :type fields: :class:`dict` 608 | :param files: dictionary of fieldnames/files elements for file data. 609 | See below for details. 
610 | :type files: :class:`dict` of :class:`dicts` 611 | :returns: ``(headers, body)`` ``headers`` is a :class:`dict` of HTTP headers 612 | :rtype: 2-tuple ``(dict, str)`` 613 | 614 | The ``files`` argument is a dictionary:: 615 | 616 | {'fieldname' : { 'filename': 'blah.txt', 617 | 'content': '', 618 | 'mimetype': 'text/plain'} 619 | } 620 | 621 | - ``fieldname`` is the name of the field in the HTML form. 622 | - ``mimetype`` is optional. If not provided, :mod:`mimetypes` will be used to guess the mimetype, or ``application/octet-stream`` will be used. 623 | 624 | """ 625 | 626 | def get_content_type(filename): 627 | """Return or guess mimetype of ``filename``. 628 | 629 | :param filename: filename of file 630 | :type filename: unicode/string 631 | :returns: mime-type, e.g. ``text/html`` 632 | :rtype: :class::class:`str` 633 | 634 | """ 635 | 636 | return mimetypes.guess_type(filename)[0] or 'application/octet-stream' 637 | 638 | boundary = '-----' + ''.join(random.choice(BOUNDARY_CHARS) 639 | for i in range(30)) 640 | CRLF = '\r\n' 641 | output = [] 642 | 643 | # Normal form fields 644 | for (name, value) in fields.items(): 645 | if isinstance(name, unicode): 646 | name = name.encode('utf-8') 647 | if isinstance(value, unicode): 648 | value = value.encode('utf-8') 649 | output.append('--' + boundary) 650 | output.append('Content-Disposition: form-data; name="%s"' % name) 651 | output.append('') 652 | output.append(value) 653 | 654 | # Files to upload 655 | for name, d in files.items(): 656 | filename = d[u'filename'] 657 | content = d[u'content'] 658 | if u'mimetype' in d: 659 | mimetype = d[u'mimetype'] 660 | else: 661 | mimetype = get_content_type(filename) 662 | if isinstance(name, unicode): 663 | name = name.encode('utf-8') 664 | if isinstance(filename, unicode): 665 | filename = filename.encode('utf-8') 666 | if isinstance(mimetype, unicode): 667 | mimetype = mimetype.encode('utf-8') 668 | output.append('--' + boundary) 669 | output.append('Content-Disposition: form-data; ' 670 | 'name="%s"; filename="%s"' % (name, filename)) 671 | output.append('Content-Type: %s' % mimetype) 672 | output.append('') 673 | output.append(content) 674 | 675 | output.append('--' + boundary + '--') 676 | output.append('') 677 | body = CRLF.join(output) 678 | headers = { 679 | 'Content-Type': 'multipart/form-data; boundary=%s' % boundary, 680 | 'Content-Length': str(len(body)), 681 | } 682 | return (headers, body) 683 | -------------------------------------------------------------------------------- /bs4/testing.py: -------------------------------------------------------------------------------- 1 | """Helper classes for tests.""" 2 | 3 | __license__ = "MIT" 4 | 5 | import pickle 6 | import copy 7 | import functools 8 | import unittest 9 | from unittest import TestCase 10 | from bs4 import BeautifulSoup 11 | from bs4.element import ( 12 | CharsetMetaAttributeValue, 13 | Comment, 14 | ContentMetaAttributeValue, 15 | Doctype, 16 | SoupStrainer, 17 | ) 18 | 19 | from bs4.builder import HTMLParserTreeBuilder 20 | default_builder = HTMLParserTreeBuilder 21 | 22 | 23 | class SoupTest(unittest.TestCase): 24 | 25 | @property 26 | def default_builder(self): 27 | return default_builder() 28 | 29 | def soup(self, markup, **kwargs): 30 | """Build a Beautiful Soup object from markup.""" 31 | builder = kwargs.pop('builder', self.default_builder) 32 | return BeautifulSoup(markup, builder=builder, **kwargs) 33 | 34 | def document_for(self, markup): 35 | """Turn an HTML fragment into a document. 
36 | 37 | The details depend on the builder. 38 | """ 39 | return self.default_builder.test_fragment_to_document(markup) 40 | 41 | def assertSoupEquals(self, to_parse, compare_parsed_to=None): 42 | builder = self.default_builder 43 | obj = BeautifulSoup(to_parse, builder=builder) 44 | if compare_parsed_to is None: 45 | compare_parsed_to = to_parse 46 | 47 | self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) 48 | 49 | def assertConnectedness(self, element): 50 | """Ensure that next_element and previous_element are properly 51 | set for all descendants of the given element. 52 | """ 53 | earlier = None 54 | for e in element.descendants: 55 | if earlier: 56 | self.assertEqual(e, earlier.next_element) 57 | self.assertEqual(earlier, e.previous_element) 58 | earlier = e 59 | 60 | class HTMLTreeBuilderSmokeTest(object): 61 | 62 | """A basic test of a treebuilder's competence. 63 | 64 | Any HTML treebuilder, present or future, should be able to pass 65 | these tests. With invalid markup, there's room for interpretation, 66 | and different parsers can handle it differently. But with the 67 | markup in these tests, there's not much room for interpretation. 68 | """ 69 | 70 | def test_pickle_and_unpickle_identity(self): 71 | # Pickling a tree, then unpickling it, yields a tree identical 72 | # to the original. 73 | tree = self.soup("foo") 74 | dumped = pickle.dumps(tree, 2) 75 | loaded = pickle.loads(dumped) 76 | self.assertEqual(loaded.__class__, BeautifulSoup) 77 | self.assertEqual(loaded.decode(), tree.decode()) 78 | 79 | def assertDoctypeHandled(self, doctype_fragment): 80 | """Assert that a given doctype string is handled correctly.""" 81 | doctype_str, soup = self._document_with_doctype(doctype_fragment) 82 | 83 | # Make sure a Doctype object was created. 84 | doctype = soup.contents[0] 85 | self.assertEqual(doctype.__class__, Doctype) 86 | self.assertEqual(doctype, doctype_fragment) 87 | self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) 88 | 89 | # Make sure that the doctype was correctly associated with the 90 | # parse tree and that the rest of the document parsed. 91 | self.assertEqual(soup.p.contents[0], 'foo') 92 | 93 | def _document_with_doctype(self, doctype_fragment): 94 | """Generate and parse a document with the given doctype.""" 95 | doctype = '' % doctype_fragment 96 | markup = doctype + '\n

foo

' 97 | soup = self.soup(markup) 98 | return doctype, soup 99 | 100 | def test_normal_doctypes(self): 101 | """Make sure normal, everyday HTML doctypes are handled correctly.""" 102 | self.assertDoctypeHandled("html") 103 | self.assertDoctypeHandled( 104 | 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') 105 | 106 | def test_empty_doctype(self): 107 | soup = self.soup("") 108 | doctype = soup.contents[0] 109 | self.assertEqual("", doctype.strip()) 110 | 111 | def test_public_doctype_with_url(self): 112 | doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' 113 | self.assertDoctypeHandled(doctype) 114 | 115 | def test_system_doctype(self): 116 | self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') 117 | 118 | def test_namespaced_system_doctype(self): 119 | # We can handle a namespaced doctype with a system ID. 120 | self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') 121 | 122 | def test_namespaced_public_doctype(self): 123 | # Test a namespaced doctype with a public id. 124 | self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') 125 | 126 | def test_real_xhtml_document(self): 127 | """A real XHTML document should come out more or less the same as it went in.""" 128 | markup = b""" 129 | 130 | 131 | Hello. 132 | Goodbye. 133 | """ 134 | soup = self.soup(markup) 135 | self.assertEqual( 136 | soup.encode("utf-8").replace(b"\n", b""), 137 | markup.replace(b"\n", b"")) 138 | 139 | def test_processing_instruction(self): 140 | markup = b"""""" 141 | soup = self.soup(markup) 142 | self.assertEqual(markup, soup.encode("utf8")) 143 | 144 | def test_deepcopy(self): 145 | """Make sure you can copy the tree builder. 146 | 147 | This is important because the builder is part of a 148 | BeautifulSoup object, and we want to be able to copy that. 149 | """ 150 | copy.deepcopy(self.default_builder) 151 | 152 | def test_p_tag_is_never_empty_element(self): 153 | """A

tag is never designated as an empty-element tag. 154 | 155 | Even if the markup shows it as an empty-element tag, it 156 | shouldn't be presented that way. 157 | """ 158 | soup = self.soup("

") 159 | self.assertFalse(soup.p.is_empty_element) 160 | self.assertEqual(str(soup.p), "

") 161 | 162 | def test_unclosed_tags_get_closed(self): 163 | """A tag that's not closed by the end of the document should be closed. 164 | 165 | This applies to all tags except empty-element tags. 166 | """ 167 | self.assertSoupEquals("

", "

") 168 | self.assertSoupEquals("", "") 169 | 170 | self.assertSoupEquals("
", "
") 171 | 172 | def test_br_is_always_empty_element_tag(self): 173 | """A
tag is designated as an empty-element tag. 174 | 175 | Some parsers treat

as one
tag, some parsers as 176 | two tags, but it should always be an empty-element tag. 177 | """ 178 | soup = self.soup("

") 179 | self.assertTrue(soup.br.is_empty_element) 180 | self.assertEqual(str(soup.br), "
") 181 | 182 | def test_nested_formatting_elements(self): 183 | self.assertSoupEquals("") 184 | 185 | def test_double_head(self): 186 | html = ''' 187 | 188 | 189 | Ordinary HEAD element test 190 | 191 | 194 | 195 | Hello, world! 196 | 197 | 198 | ''' 199 | soup = self.soup(html) 200 | self.assertEqual("text/javascript", soup.find('script')['type']) 201 | 202 | def test_comment(self): 203 | # Comments are represented as Comment objects. 204 | markup = "

foobaz

" 205 | self.assertSoupEquals(markup) 206 | 207 | soup = self.soup(markup) 208 | comment = soup.find(text="foobar") 209 | self.assertEqual(comment.__class__, Comment) 210 | 211 | # The comment is properly integrated into the tree. 212 | foo = soup.find(text="foo") 213 | self.assertEqual(comment, foo.next_element) 214 | baz = soup.find(text="baz") 215 | self.assertEqual(comment, baz.previous_element) 216 | 217 | def test_preserved_whitespace_in_pre_and_textarea(self): 218 | """Whitespace must be preserved in
 and ")
221 | 
222 |     def test_nested_inline_elements(self):
223 |         """Inline elements can be nested indefinitely."""
224 |         b_tag = "Inside a B tag"
225 |         self.assertSoupEquals(b_tag)
226 | 
227 |         nested_b_tag = "

A nested tag

" 228 | self.assertSoupEquals(nested_b_tag) 229 | 230 | double_nested_b_tag = "

A doubly nested tag

" 231 | self.assertSoupEquals(nested_b_tag) 232 | 233 | def test_nested_block_level_elements(self): 234 | """Block elements can be nested.""" 235 | soup = self.soup('

Foo

') 236 | blockquote = soup.blockquote 237 | self.assertEqual(blockquote.p.b.string, 'Foo') 238 | self.assertEqual(blockquote.b.string, 'Foo') 239 | 240 | def test_correctly_nested_tables(self): 241 | """One table can go inside another one.""" 242 | markup = ('' 243 | '' 244 | "') 248 | 249 | self.assertSoupEquals( 250 | markup, 251 | '
Here's another table:" 245 | '' 246 | '' 247 | '
foo
Here\'s another table:' 252 | '
foo
' 253 | '
') 254 | 255 | self.assertSoupEquals( 256 | "" 257 | "" 258 | "
Foo
Bar
Baz
") 259 | 260 | def test_deeply_nested_multivalued_attribute(self): 261 | # html5lib can set the attributes of the same tag many times 262 | # as it rearranges the tree. This has caused problems with 263 | # multivalued attributes. 264 | markup = '
' 265 | soup = self.soup(markup) 266 | self.assertEqual(["css"], soup.div.div['class']) 267 | 268 | def test_multivalued_attribute_on_html(self): 269 | # html5lib uses a different API to set the attributes ot the 270 | # tag. This has caused problems with multivalued 271 | # attributes. 272 | markup = '' 273 | soup = self.soup(markup) 274 | self.assertEqual(["a", "b"], soup.html['class']) 275 | 276 | def test_angle_brackets_in_attribute_values_are_escaped(self): 277 | self.assertSoupEquals('', '') 278 | 279 | def test_entities_in_attributes_converted_to_unicode(self): 280 | expect = u'

' 281 | self.assertSoupEquals('

', expect) 282 | self.assertSoupEquals('

', expect) 283 | self.assertSoupEquals('

', expect) 284 | self.assertSoupEquals('

', expect) 285 | 286 | def test_entities_in_text_converted_to_unicode(self): 287 | expect = u'

pi\N{LATIN SMALL LETTER N WITH TILDE}ata

' 288 | self.assertSoupEquals("

piñata

", expect) 289 | self.assertSoupEquals("

piñata

", expect) 290 | self.assertSoupEquals("

piñata

", expect) 291 | self.assertSoupEquals("

piñata

", expect) 292 | 293 | def test_quot_entity_converted_to_quotation_mark(self): 294 | self.assertSoupEquals("

I said "good day!"

", 295 | '

I said "good day!"

') 296 | 297 | def test_out_of_range_entity(self): 298 | expect = u"\N{REPLACEMENT CHARACTER}" 299 | self.assertSoupEquals("�", expect) 300 | self.assertSoupEquals("�", expect) 301 | self.assertSoupEquals("�", expect) 302 | 303 | def test_multipart_strings(self): 304 | "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." 305 | soup = self.soup("

\nfoo

") 306 | self.assertEqual("p", soup.h2.string.next_element.name) 307 | self.assertEqual("p", soup.p.name) 308 | self.assertConnectedness(soup) 309 | 310 | def test_head_tag_between_head_and_body(self): 311 | "Prevent recurrence of a bug in the html5lib treebuilder." 312 | content = """ 313 | 314 | foo 315 | 316 | """ 317 | soup = self.soup(content) 318 | self.assertNotEqual(None, soup.html.body) 319 | self.assertConnectedness(soup) 320 | 321 | def test_multiple_copies_of_a_tag(self): 322 | "Prevent recurrence of a bug in the html5lib treebuilder." 323 | content = """ 324 | 325 | 326 | 332 | 333 | 334 | """ 335 | soup = self.soup(content) 336 | self.assertConnectedness(soup.article) 337 | 338 | def test_basic_namespaces(self): 339 | """Parsers don't need to *understand* namespaces, but at the 340 | very least they should not choke on namespaces or lose 341 | data.""" 342 | 343 | markup = b'4' 344 | soup = self.soup(markup) 345 | self.assertEqual(markup, soup.encode()) 346 | html = soup.html 347 | self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) 348 | self.assertEqual( 349 | 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) 350 | self.assertEqual( 351 | 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) 352 | 353 | def test_multivalued_attribute_value_becomes_list(self): 354 | markup = b'' 355 | soup = self.soup(markup) 356 | self.assertEqual(['foo', 'bar'], soup.a['class']) 357 | 358 | # 359 | # Generally speaking, tests below this point are more tests of 360 | # Beautiful Soup than tests of the tree builders. But parsers are 361 | # weird, so we run these tests separately for every tree builder 362 | # to detect any differences between them. 363 | # 364 | 365 | def test_can_parse_unicode_document(self): 366 | # A seemingly innocuous document... but it's in Unicode! And 367 | # it contains characters that can't be represented in the 368 | # encoding found in the declaration! The horror! 369 | markup = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' 370 | soup = self.soup(markup) 371 | self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) 372 | 373 | def test_soupstrainer(self): 374 | """Parsers should be able to work with SoupStrainers.""" 375 | strainer = SoupStrainer("b") 376 | soup = self.soup("A bold statement", 377 | parse_only=strainer) 378 | self.assertEqual(soup.decode(), "bold") 379 | 380 | def test_single_quote_attribute_values_become_double_quotes(self): 381 | self.assertSoupEquals("", 382 | '') 383 | 384 | def test_attribute_values_with_nested_quotes_are_left_alone(self): 385 | text = """a""" 386 | self.assertSoupEquals(text) 387 | 388 | def test_attribute_values_with_double_nested_quotes_get_quoted(self): 389 | text = """a""" 390 | soup = self.soup(text) 391 | soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' 392 | self.assertSoupEquals( 393 | soup.foo.decode(), 394 | """a""") 395 | 396 | def test_ampersand_in_attribute_value_gets_escaped(self): 397 | self.assertSoupEquals('', 398 | '') 399 | 400 | self.assertSoupEquals( 401 | 'foo', 402 | 'foo') 403 | 404 | def test_escaped_ampersand_in_attribute_value_is_left_alone(self): 405 | self.assertSoupEquals('') 406 | 407 | def test_entities_in_strings_converted_during_parsing(self): 408 | # Both XML and HTML entities are converted to Unicode characters 409 | # during parsing. 410 | text = "

<<sacré bleu!>>

" 411 | expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" 412 | self.assertSoupEquals(text, expected) 413 | 414 | def test_smart_quotes_converted_on_the_way_in(self): 415 | # Microsoft smart quotes are converted to Unicode characters during 416 | # parsing. 417 | quote = b"

\x91Foo\x92

" 418 | soup = self.soup(quote) 419 | self.assertEqual( 420 | soup.p.string, 421 | u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") 422 | 423 | def test_non_breaking_spaces_converted_on_the_way_in(self): 424 | soup = self.soup("  ") 425 | self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) 426 | 427 | def test_entities_converted_on_the_way_out(self): 428 | text = "

<<sacré bleu!>>

" 429 | expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

".encode("utf-8") 430 | soup = self.soup(text) 431 | self.assertEqual(soup.p.encode("utf-8"), expected) 432 | 433 | def test_real_iso_latin_document(self): 434 | # Smoke test of interrelated functionality, using an 435 | # easy-to-understand document. 436 | 437 | # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. 438 | unicode_html = u'

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

439 | 
440 |         # That's because we're going to encode it into ISO-Latin-1, and use
441 |         # that to test.
442 |         iso_latin_html = unicode_html.encode("iso-8859-1")
443 | 
444 |         # Parse the ISO-Latin-1 HTML.
445 |         soup = self.soup(iso_latin_html)
446 |         # Encode it to UTF-8.
447 |         result = soup.encode("utf-8")
448 | 
449 |         # What do we expect the result to look like? Well, it would
450 |         # look like unicode_html, except that the META tag would say
451 |         # UTF-8 instead of ISO-Latin-1.
452 |         expected = unicode_html.replace("ISO-Latin-1", "utf-8")
453 | 
454 |         # And, of course, it would be in UTF-8, not Unicode.
455 |         expected = expected.encode("utf-8")
456 | 
457 |         # Ta-da!
458 |         self.assertEqual(result, expected)
459 | 
460 |     def test_real_shift_jis_document(self):
461 |         # Smoke test to make sure the parser can handle a document in
462 |         # Shift-JIS encoding, without choking.
463 |         shift_jis_html = (
464 |             b'<html><head></head><body><pre>'
465 |             b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
466 |             b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
467 |             b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
468 |             b'</pre></body></html>')
469 |         unicode_html = shift_jis_html.decode("shift-jis")
470 |         soup = self.soup(unicode_html)
471 | 
472 |         # Make sure the parse tree is correctly encoded to various
473 |         # encodings.
474 |         self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
475 |         self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
476 | 
477 |     def test_real_hebrew_document(self):
478 |         # A real-world test to make sure we can convert ISO-8859-8 (a
479 |         # Hebrew encoding) to UTF-8.
480 |         hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
481 |         soup = self.soup(
482 |             hebrew_document, from_encoding="iso8859-8")
483 |         self.assertEqual(soup.original_encoding, 'iso8859-8')
484 |         self.assertEqual(
485 |             soup.encode('utf-8'),
486 |             hebrew_document.decode("iso8859-8").encode("utf-8"))
487 | 
488 |     def test_meta_tag_reflects_current_encoding(self):
489 |         # Here's the <meta> tag saying that a document is
490 |         # encoded in Shift-JIS.
491 |         meta_tag = ('<meta content="text/html; charset=x-sjis" '
492 |                     'http-equiv="Content-type"/>')
493 | 
494 |         # Here's a document incorporating that meta tag.
495 |         shift_jis_html = (
496 |             '<html><head>\n%s\n'
497 |             '<meta http-equiv="Content-language" content="ja"/>'
498 |             '</head><body>Shift-JIS markup goes here.') % meta_tag
499 |         soup = self.soup(shift_jis_html)
500 | 
501 |         # Parse the document, and the charset is seemingly unaffected.
502 |         parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
503 |         content = parsed_meta['content']
504 |         self.assertEqual('text/html; charset=x-sjis', content)
505 | 
506 |         # But that value is actually a ContentMetaAttributeValue object.
507 |         self.assertTrue(isinstance(content, ContentMetaAttributeValue))
508 | 
509 |         # And it will take on a value that reflects its current
510 |         # encoding.
511 |         self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
512 | 
513 |         # For the rest of the story, see TestSubstitutions in
514 |         # test_tree.py.
515 | 
516 |     def test_html5_style_meta_tag_reflects_current_encoding(self):
517 |         # Here's the <meta> tag saying that a document is
518 |         # encoded in Shift-JIS.
519 |         meta_tag = ('<meta id="encoding" charset="x-sjis" />')
520 | 
521 |         # Here's a document incorporating that meta tag.
522 |         shift_jis_html = (
523 |             '<html><head>\n%s\n'
524 |             '<meta http-equiv="Content-language" content="ja"/>'
525 |             '</head><body>Shift-JIS markup goes here.') % meta_tag
526 |         soup = self.soup(shift_jis_html)
527 | 
528 |         # Parse the document, and the charset is seemingly unaffected.
529 |         parsed_meta = soup.find('meta', id="encoding")
530 |         charset = parsed_meta['charset']
531 |         self.assertEqual('x-sjis', charset)
532 | 
533 |         # But that value is actually a CharsetMetaAttributeValue object.
534 |         self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
535 | 
536 |         # And it will take on a value that reflects its current
537 |         # encoding.
538 |         self.assertEqual('utf8', charset.encode("utf8"))
539 | 
540 |     def test_tag_with_no_attributes_can_have_attributes_added(self):
541 |         data = self.soup("<a>text</a>")
542 |         data.a['foo'] = 'bar'
543 |         self.assertEqual('<a foo="bar">text</a>', data.a.decode())
544 | 
545 | class XMLTreeBuilderSmokeTest(object):
546 | 
547 |     def test_pickle_and_unpickle_identity(self):
548 |         # Pickling a tree, then unpickling it, yields a tree identical
549 |         # to the original.
550 |         tree = self.soup("<a><b>foo</a>")
551 |         dumped = pickle.dumps(tree, 2)
552 |         loaded = pickle.loads(dumped)
553 |         self.assertEqual(loaded.__class__, BeautifulSoup)
554 |         self.assertEqual(loaded.decode(), tree.decode())
555 | 
556 |     def test_docstring_generated(self):
557 |         soup = self.soup("<root/>")
558 |         self.assertEqual(
559 |             soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
560 | 
561 |     def test_xml_declaration(self):
562 |         markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
563 |         soup = self.soup(markup)
564 |         self.assertEqual(markup, soup.encode("utf8"))
565 | 
566 |     def test_real_xhtml_document(self):
567 |         """A real XHTML document should come out *exactly* the same as it went in."""
568 |         markup = b"""<?xml version="1.0" encoding="utf-8"?>
569 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
570 | <html xmlns="http://www.w3.org/1999/xhtml">
571 | <head><title>Hello.</title></head>
572 | <body>Goodbye.</body>
573 | </html>"""
574 |         soup = self.soup(markup)
575 |         self.assertEqual(
576 |             soup.encode("utf-8"), markup)
577 | 
578 |     def test_formatter_processes_script_tag_for_xml_documents(self):
579 |         doc = """
580 |   <script type="text/javascript">
581 |   </script>
582 | """
583 |         soup = BeautifulSoup(doc, "lxml-xml")
584 |         # lxml would have stripped this while parsing, but we can add
585 |         # it later.
586 |         soup.script.string = 'console.log("< < hey > > ");'
587 |         encoded = soup.encode()
588 |         self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
589 | 
590 |     def test_can_parse_unicode_document(self):
591 |         markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
592 |         soup = self.soup(markup)
593 |         self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
594 | 
595 |     def test_popping_namespaced_tag(self):
596 |         markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><data>d</data></rss>'
597 |         soup = self.soup(markup)
598 |         self.assertEqual(
599 |             unicode(soup.rss), markup)
600 | 
601 |     def test_docstring_includes_correct_encoding(self):
602 |         soup = self.soup("<root/>")
603 |         self.assertEqual(
604 |             soup.encode("latin1"),
605 |             b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
606 | 
607 |     def test_large_xml_document(self):
608 |         """A large XML document should come out the same as it went in."""
609 |         markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
610 |                   + b'0' * (2**12)
611 |                   + b'</root>')
612 |         soup = self.soup(markup)
613 |         self.assertEqual(soup.encode("utf-8"), markup)
614 | 
615 | 
616 |     def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
617 |         self.assertSoupEquals("<p>", "<p/>")
") 618 | self.assertSoupEquals("

foo

") 619 | 620 | def test_namespaces_are_preserved(self): 621 | markup = 'This tag is in the a namespaceThis tag is in the b namespace' 622 | soup = self.soup(markup) 623 | root = soup.root 624 | self.assertEqual("http://example.com/", root['xmlns:a']) 625 | self.assertEqual("http://example.net/", root['xmlns:b']) 626 | 627 | def test_closing_namespaced_tag(self): 628 | markup = '

20010504

' 629 | soup = self.soup(markup) 630 | self.assertEqual(unicode(soup.p), markup) 631 | 632 | def test_namespaced_attributes(self): 633 | markup = '' 634 | soup = self.soup(markup) 635 | self.assertEqual(unicode(soup.foo), markup) 636 | 637 | def test_namespaced_attributes_xml_namespace(self): 638 | markup = 'bar' 639 | soup = self.soup(markup) 640 | self.assertEqual(unicode(soup.foo), markup) 641 | 642 | class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): 643 | """Smoke test for a tree builder that supports HTML5.""" 644 | 645 | def test_real_xhtml_document(self): 646 | # Since XHTML is not HTML5, HTML5 parsers are not tested to handle 647 | # XHTML documents in any particular way. 648 | pass 649 | 650 | def test_html_tags_have_namespace(self): 651 | markup = "" 652 | soup = self.soup(markup) 653 | self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) 654 | 655 | def test_svg_tags_have_namespace(self): 656 | markup = '' 657 | soup = self.soup(markup) 658 | namespace = "http://www.w3.org/2000/svg" 659 | self.assertEqual(namespace, soup.svg.namespace) 660 | self.assertEqual(namespace, soup.circle.namespace) 661 | 662 | 663 | def test_mathml_tags_have_namespace(self): 664 | markup = '5' 665 | soup = self.soup(markup) 666 | namespace = 'http://www.w3.org/1998/Math/MathML' 667 | self.assertEqual(namespace, soup.math.namespace) 668 | self.assertEqual(namespace, soup.msqrt.namespace) 669 | 670 | def test_xml_declaration_becomes_comment(self): 671 | markup = '' 672 | soup = self.soup(markup) 673 | self.assertTrue(isinstance(soup.contents[0], Comment)) 674 | self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?') 675 | self.assertEqual("html", soup.contents[0].next_element.name) 676 | 677 | def skipIf(condition, reason): 678 | def nothing(test, *args, **kwargs): 679 | return None 680 | 681 | def decorator(test_item): 682 | if condition: 683 | return nothing 684 | else: 685 | return test_item 686 | 687 | return decorator 688 | --------------------------------------------------------------------------------