├── .gitignore
├── workflow
│   ├── version
│   ├── Notify.tgz
│   ├── __init__.py
│   ├── background.py
│   ├── notify.py
│   ├── update.py
│   └── web.py
├── bs4
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_htmlparser.py
│   │   ├── test_docs.py
│   │   ├── test_lxml.py
│   │   ├── test_html5lib.py
│   │   ├── test_builder_registry.py
│   │   └── test_soup.py
│   ├── diagnose.py
│   ├── builder
│   │   ├── _lxml.py
│   │   ├── _htmlparser.py
│   │   ├── __init__.py
│   │   └── _html5lib.py
│   ├── __init__.py
│   └── testing.py
├── doc.png
├── icon.png
├── Mweb-Blog.alfredworkflow
├── README.md
├── ListArticle.py
├── article.py
├── xpinyin
│   └── __init__.py
└── info.plist
/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /workflow/version: -------------------------------------------------------------------------------- 1 | 1.17.2 -------------------------------------------------------------------------------- /bs4/tests/__init__.py: -------------------------------------------------------------------------------- 1 | "The beautifulsoup tests." 2 | -------------------------------------------------------------------------------- /doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haoliplus/alfred3-mweb-workflow/HEAD/doc.png -------------------------------------------------------------------------------- /icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haoliplus/alfred3-mweb-workflow/HEAD/icon.png -------------------------------------------------------------------------------- /workflow/Notify.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haoliplus/alfred3-mweb-workflow/HEAD/workflow/Notify.tgz -------------------------------------------------------------------------------- /Mweb-Blog.alfredworkflow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haoliplus/alfred3-mweb-workflow/HEAD/Mweb-Blog.alfredworkflow -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # An Alfred 3 plugin that assists writing with MWeb
2 |
3 | The Alfred 3 beta came out recently and I wanted to learn how Workflow development works. I had also hit a few problems while using MWeb, so until its author ships proper fixes I use Alfred to assist my writing.
4 |
5 | This tool provides features for working with a locally generated static site.
6 |
7 | [Download link](https://github.com/DarryO/alfred3-mweb-workflow/raw/master/Mweb-Blog.alfredworkflow)
8 |
9 | Features:
10 |
11 | 1. Preview the static blog locally.
12 | >The files MWeb generates on my machine all carry the @ extended-attribute flag and cannot be opened directly in Chrome for preview. This feature removes the extended attributes and previews the blog locally.
13 |
14 | 2. Push to a Git repository.
15 | >If the static-site output directory is already set up as a Git repository and passwordless SSH login is configured, this Workflow can push straight to the remote repository.
16 |
17 | 3. Search articles on the site and generate jump links. Searches the already generated articles (pinyin search thanks to [the pinyin module provided by lxneng](https://github.com/lxneng/xpinyin.git)) and generates an in-site jump link.
18 |
19 | The first two features need no further explanation. As for generating in-site links, here is what feature 3 does:
20 |
21 | ![](http://i.imgur.com/S4ZXYAC.gif)
22 |
23 | Alfred reads archives.html in the static-site output directory, lists every article's title, category, and creation date, and filters them by the pinyin of the user's input (see the sketch at the end of this README).
24 |
25 | I use the environment-variable feature Alfred 3 provides, so users need to configure the path to their own static site before first use.
26 |
27 | ![](http://i.imgur.com/mwBmo56.png)
28 | ![](http://i.imgur.com/J0zb5Dp.png)
29 |
30 |
31 |
32 |
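For reference, the pinyin matching at the heart of feature 3 boils down to the following (a simplified sketch of what `ListArticle.py` does; `article` is one of the dicts `article.py` parses out of archives.html, and `matches` is a helper name introduced here only for illustration):

```python
import xpinyin

py = xpinyin.Pinyin()

def matches(article, query):
    # An article matches when the query is a substring of the pinyin of
    # its title or of its category; an empty query matches everything.
    return (query == ""
            or query in py.get_pinyin(article['title'], '').lower()
            or query in py.get_pinyin(article['category'], '').lower())
```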
-------------------------------------------------------------------------------- /ListArticle.py: --------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | import re,urllib,sys
3 | import article as art
4 | import xpinyin
5 | from workflow import Workflow
6 | def main(wf):
7 |     site_path = wf.args[0]
8 |     articles = art.Articles(site_path)
9 |     py = xpinyin.Pinyin()
10 |     if len(wf.args) == 2:
11 |         query = wf.args[1]
12 |     else:
13 |         query = ""
14 |     for article in articles.articles:
15 |         if not (query == "" or query in py.get_pinyin(article['title'],'').lower() or query in py.get_pinyin(article['category'],'').lower()):
16 |             continue
17 |         wf.add_item(title = article['title'],
18 |                     subtitle = article['date']+" " + article['category'],
19 |                     arg = '[%s](./%s)' % (article['title'],article['link']),
20 |                     valid=True,
21 |                     icon = "doc.png")
22 |
23 |     wf.send_feedback()
24 |
25 | if __name__ == "__main__":
26 |     wf = Workflow()
27 |     sys.exit(wf.run(main))
28 | -------------------------------------------------------------------------------- /article.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import re
4 | import sys,os
5 | from bs4 import BeautifulSoup
6 |
7 | class Articles:
8 |
9 |     def __init__(self, path):
10 |         self.archives_path = os.path.join(path, "archives.html")
11 |         if os.path.isfile(self.archives_path):
12 |             self.archives_content = open(self.archives_path).read()
13 |         else:
14 |             self.archives_content = ""
15 |         soup = BeautifulSoup(self.archives_content,"html.parser")
16 |         self.articles = []
17 |         for item in soup.find_all('div','article'):
18 |             article = {}
19 |             article['title']= item.find('h1').get_text()
20 |             article['link'] = item.find('a','clearlink').get('href')
21 |             article['date']= item.find('span','date').get_text()
22 |             article['category'] = ' '.join([x.get_text() for x in item.find_all('span','posted-in')])
23 |             self.articles.append(article)
24 |
25 | if __name__ == "__main__":
26 |     a = Articles("/Users/hao/blog/MWeb-Blog/Blog")
27 |     for i in a.articles:
28 |         print i['title']
29 |
30 | -------------------------------------------------------------------------------- /bs4/tests/test_htmlparser.py: --------------------------------------------------------------------------------
1 | """Tests to ensure that the html.parser tree builder generates good
2 | trees."""
3 |
4 | from pdb import set_trace
5 | import pickle
6 | from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
7 | from bs4.builder import HTMLParserTreeBuilder
8 |
9 | class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
10 |
11 |     @property
12 |     def default_builder(self):
13 |         return HTMLParserTreeBuilder()
14 |
15 |     def test_namespaced_system_doctype(self):
16 |         # html.parser can't handle namespaced doctypes, so skip this one.
17 |         pass
18 |
19 |     def test_namespaced_public_doctype(self):
20 |         # html.parser can't handle namespaced doctypes, so skip this one.
21 |         pass
22 |
23 |     def test_builder_is_pickled(self):
24 |         """Unlike most tree builders, HTMLParserTreeBuilder can be pickled
25 |         and will be restored after pickling.
26 |         """
27 |         tree = self.soup("<a><b>foo</a>")
28 |         dumped = pickle.dumps(tree, 2)
29 |         loaded = pickle.loads(dumped)
30 |         self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
31 |
32 |
33 | -------------------------------------------------------------------------------- /bs4/tests/test_docs.py: --------------------------------------------------------------------------------
1 | "Test harness for doctests."
2 |
3 | # pylint: disable-msg=E0611,W0142
4 |
5 | __metaclass__ = type
6 | __all__ = [
7 |     'additional_tests',
8 |     ]
9 |
10 | import atexit
11 | import doctest
12 | import os
13 | #from pkg_resources import (
14 | #    resource_filename, resource_exists, resource_listdir, cleanup_resources)
15 | import unittest
16 |
17 | DOCTEST_FLAGS = (
18 |     doctest.ELLIPSIS |
19 |     doctest.NORMALIZE_WHITESPACE |
20 |     doctest.REPORT_NDIFF)
21 |
22 |
23 | # def additional_tests():
24 | #     "Run the doc tests (README.txt and docs/*, if any exist)"
25 | #     doctest_files = [
26 | #         os.path.abspath(resource_filename('bs4', 'README.txt'))]
27 | #     if resource_exists('bs4', 'docs'):
28 | #         for name in resource_listdir('bs4', 'docs'):
29 | #             if name.endswith('.txt'):
30 | #                 doctest_files.append(
31 | #                     os.path.abspath(
32 | #                         resource_filename('bs4', 'docs/%s' % name)))
33 | #     kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
34 | #     atexit.register(cleanup_resources)
35 | #     return unittest.TestSuite((
36 | #         doctest.DocFileSuite(*doctest_files, **kwargs)))
37 | -------------------------------------------------------------------------------- /workflow/__init__.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | #
4 | # Copyright (c) 2014 Dean Jackson <deanishe@deanishe.net>
5 | #
6 | # MIT Licence. See http://opensource.org/licenses/MIT
7 | #
8 | # Created on 2014-02-15
9 | #
10 |
11 | """
12 | A Python helper library for `Alfred 2 <http://www.alfredapp.com/>`_ Workflow
13 | authors.
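A minimal script filter, sketched after the pattern ``ListArticle.py`` in
this repository follows::

    import sys
    from workflow import Workflow

    def main(wf):
        # Add one result row, then send the XML feedback to Alfred.
        wf.add_item(title='Hello', subtitle='World', valid=True)
        wf.send_feedback()

    if __name__ == '__main__':
        wf = Workflow()
        sys.exit(wf.run(main))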
14 | """ 15 | 16 | import os 17 | 18 | __title__ = 'Alfred-Workflow' 19 | __version__ = open(os.path.join(os.path.dirname(__file__), 'version')).read() 20 | __author__ = 'Dean Jackson' 21 | __licence__ = 'MIT' 22 | __copyright__ = 'Copyright 2014 Dean Jackson' 23 | 24 | 25 | # Workflow objects 26 | from .workflow import Workflow, manager 27 | 28 | # Exceptions 29 | from .workflow import PasswordNotFound, KeychainError 30 | 31 | # Icons 32 | from .workflow import ( 33 | ICON_ACCOUNT, 34 | ICON_BURN, 35 | ICON_CLOCK, 36 | ICON_COLOR, 37 | ICON_COLOUR, 38 | ICON_EJECT, 39 | ICON_ERROR, 40 | ICON_FAVORITE, 41 | ICON_FAVOURITE, 42 | ICON_GROUP, 43 | ICON_HELP, 44 | ICON_HOME, 45 | ICON_INFO, 46 | ICON_NETWORK, 47 | ICON_NOTE, 48 | ICON_SETTINGS, 49 | ICON_SWIRL, 50 | ICON_SWITCH, 51 | ICON_SYNC, 52 | ICON_TRASH, 53 | ICON_USER, 54 | ICON_WARNING, 55 | ICON_WEB, 56 | ) 57 | 58 | # Filter matching rules 59 | from .workflow import ( 60 | MATCH_ALL, 61 | MATCH_ALLCHARS, 62 | MATCH_ATOM, 63 | MATCH_CAPITALS, 64 | MATCH_INITIALS, 65 | MATCH_INITIALS_CONTAIN, 66 | MATCH_INITIALS_STARTSWITH, 67 | MATCH_STARTSWITH, 68 | MATCH_SUBSTRING, 69 | ) 70 | 71 | __all__ = [ 72 | 'Workflow', 73 | 'manager', 74 | 'PasswordNotFound', 75 | 'KeychainError', 76 | 'ICON_ACCOUNT', 77 | 'ICON_BURN', 78 | 'ICON_CLOCK', 79 | 'ICON_COLOR', 80 | 'ICON_COLOUR', 81 | 'ICON_EJECT', 82 | 'ICON_ERROR', 83 | 'ICON_FAVORITE', 84 | 'ICON_FAVOURITE', 85 | 'ICON_GROUP', 86 | 'ICON_HELP', 87 | 'ICON_HOME', 88 | 'ICON_INFO', 89 | 'ICON_NETWORK', 90 | 'ICON_NOTE', 91 | 'ICON_SETTINGS', 92 | 'ICON_SWIRL', 93 | 'ICON_SWITCH', 94 | 'ICON_SYNC', 95 | 'ICON_TRASH', 96 | 'ICON_USER', 97 | 'ICON_WARNING', 98 | 'ICON_WEB', 99 | 'MATCH_ALL', 100 | 'MATCH_ALLCHARS', 101 | 'MATCH_ATOM', 102 | 'MATCH_CAPITALS', 103 | 'MATCH_INITIALS', 104 | 'MATCH_INITIALS_CONTAIN', 105 | 'MATCH_INITIALS_STARTSWITH', 106 | 'MATCH_STARTSWITH', 107 | 'MATCH_SUBSTRING', 108 | ] 109 | -------------------------------------------------------------------------------- /bs4/tests/test_lxml.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the lxml tree builder generates good trees.""" 2 | 3 | import re 4 | import warnings 5 | 6 | try: 7 | import lxml.etree 8 | LXML_PRESENT = True 9 | LXML_VERSION = lxml.etree.LXML_VERSION 10 | except ImportError, e: 11 | LXML_PRESENT = False 12 | LXML_VERSION = (0,) 13 | 14 | if LXML_PRESENT: 15 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 16 | 17 | from bs4 import ( 18 | BeautifulSoup, 19 | BeautifulStoneSoup, 20 | ) 21 | from bs4.element import Comment, Doctype, SoupStrainer 22 | from bs4.testing import skipIf 23 | from bs4.tests import test_htmlparser 24 | from bs4.testing import ( 25 | HTMLTreeBuilderSmokeTest, 26 | XMLTreeBuilderSmokeTest, 27 | SoupTest, 28 | skipIf, 29 | ) 30 | 31 | @skipIf( 32 | not LXML_PRESENT, 33 | "lxml seems not to be present, not testing its tree builder.") 34 | class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 35 | """See ``HTMLTreeBuilderSmokeTest``.""" 36 | 37 | @property 38 | def default_builder(self): 39 | return LXMLTreeBuilder() 40 | 41 | def test_out_of_range_entity(self): 42 | self.assertSoupEquals( 43 | "
<p>foo&#10000000000000;bar</p>", "<p>foobar</p>
") 44 | self.assertSoupEquals( 45 | "
<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>
") 46 | self.assertSoupEquals( 47 | "
<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
48 |
49 |     # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
50 |     # test if an old version of lxml is installed.
51 |
52 |     @skipIf(
53 |         not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
54 |         "Skipping doctype test for old version of lxml to avoid segfault.")
55 |     def test_empty_doctype(self):
56 |         soup = self.soup("<!DOCTYPE>")
57 |         doctype = soup.contents[0]
58 |         self.assertEqual("", doctype.strip())
59 |
60 |     def test_beautifulstonesoup_is_xml_parser(self):
61 |         # Make sure that the deprecated BSS class uses an xml builder
62 |         # if one is installed.
63 |         with warnings.catch_warnings(record=True) as w:
64 |             soup = BeautifulStoneSoup("<b />")
65 |         self.assertEqual(u"<b/>", unicode(soup.b))
66 |         self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
67 |
68 | @skipIf(
69 |     not LXML_PRESENT,
70 |     "lxml seems not to be present, not testing its XML tree builder.")
71 | class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
72 |     """See ``HTMLTreeBuilderSmokeTest``."""
73 |
74 |     @property
75 |     def default_builder(self):
76 |         return LXMLTreeBuilderForXML()
77 | -------------------------------------------------------------------------------- /bs4/tests/test_html5lib.py: --------------------------------------------------------------------------------
1 | """Tests to ensure that the html5lib tree builder generates good trees."""
2 |
3 | import warnings
4 |
5 | try:
6 |     from bs4.builder import HTML5TreeBuilder
7 |     HTML5LIB_PRESENT = True
8 | except ImportError, e:
9 |     HTML5LIB_PRESENT = False
10 | from bs4.element import SoupStrainer
11 | from bs4.testing import (
12 |     HTML5TreeBuilderSmokeTest,
13 |     SoupTest,
14 |     skipIf,
15 | )
16 |
17 | @skipIf(
18 |     not HTML5LIB_PRESENT,
19 |     "html5lib seems not to be present, not testing its tree builder.")
20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
21 |     """See ``HTML5TreeBuilderSmokeTest``."""
22 |
23 |     @property
24 |     def default_builder(self):
25 |         return HTML5TreeBuilder()
26 |
27 |     def test_soupstrainer(self):
28 |         # The html5lib tree builder does not support SoupStrainers.
29 |         strainer = SoupStrainer("b")
30 |         markup = "
<p>A <b>bold</b> statement.</p>"
31 |         with warnings.catch_warnings(record=True) as w:
32 |             soup = self.soup(markup, parse_only=strainer)
33 |         self.assertEqual(
34 |             soup.decode(), self.document_for(markup))
35 |
36 |         self.assertTrue(
37 |             "the html5lib tree builder doesn't support parse_only" in
38 |             str(w[0].message))
39 |
40 |     def test_correctly_nested_tables(self):
41 |         """html5lib inserts <tbody> tags where other parsers don't."""
42 |         markup = ('<table id="1">'
43 |                   '<tr>'
44 |                   "<td>Here's another table:"
45 |                   '<table id="2">'
46 |                   '<tr><td>foo</td></tr>'
47 |                   '</table></td>')
48 |
49 |         self.assertSoupEquals(
50 |             markup,
51 |             '<table id="1"><tbody><tr><td>Here\'s another table:'
52 |             '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
53 |             '</td></tr></tbody></table>')
54 |
55 |         self.assertSoupEquals(
56 |             "<table><thead><tr><td>Foo</td></tr></thead>"
57 |             "<tbody><tr><td>Bar</td></tr></tbody>"
58 |             "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
59 |
60 |     def test_xml_declaration_followed_by_doctype(self):
61 |         markup = '''<?xml version="1.0" encoding="utf-8"?>
62 | <!DOCTYPE html>
63 | <html>
64 |   <head>
65 |   </head>
66 |   <body>
67 |    <p>foo</p>
68 |   </body>
69 | </html>'''
70 |         soup = self.soup(markup)
71 |         # Verify that we can reach the <p> tag; this means the tree is connected.
72 |         self.assertEqual(b"<p>foo</p>
", soup.p.encode()) 73 | 74 | def test_reparented_markup(self): 75 | markup = '
<p><em>foo</p>\n<p>bar<a></a></em></p>
' 76 | soup = self.soup(markup) 77 | self.assertEqual(u"
<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>
", soup.body.decode()) 78 | self.assertEqual(2, len(soup.find_all('p'))) 79 | 80 | 81 | def test_reparented_markup_ends_with_whitespace(self): 82 | markup = '
<p><em>foo</p>\n<p>bar<a></a></em></p>
\n' 83 | soup = self.soup(markup) 84 | self.assertEqual(u"
<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>
\n</body>", soup.body.decode())
85 |         self.assertEqual(2, len(soup.find_all('p')))
86 |
87 |     def test_processing_instruction(self):
88 |         """Processing instructions become comments."""
89 |         markup = b"""<?PITarget PIContent?>"""
90 |         soup = self.soup(markup)
91 |         assert str(soup).startswith("<!--?PITarget PIContent?-->")
92 |
93 |     def test_cloned_multivalue_node(self):
94 |         markup = b"""
<a class="my_class"><p></a>
""" 95 | soup = self.soup(markup) 96 | a1, a2 = soup.find_all('a') 97 | self.assertEqual(a1, a2) 98 | assert a1 is not a2 99 | -------------------------------------------------------------------------------- /xpinyin/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | 5 | import os.path 6 | import re 7 | 8 | PinyinToneMark = { 9 | 0: u"aoeiuv\u00fc", 10 | 1: u"\u0101\u014d\u0113\u012b\u016b\u01d6\u01d6", 11 | 2: u"\u00e1\u00f3\u00e9\u00ed\u00fa\u01d8\u01d8", 12 | 3: u"\u01ce\u01d2\u011b\u01d0\u01d4\u01da\u01da", 13 | 4: u"\u00e0\u00f2\u00e8\u00ec\u00f9\u01dc\u01dc", 14 | } 15 | 16 | 17 | class Pinyin(object): 18 | 19 | """translate chinese hanzi to pinyin by python, inspired by flyerhzm’s 20 | `chinese\_pinyin`_ gem 21 | 22 | usage 23 | ----- 24 | :: 25 | 26 | >>> from xpinyin import Pinyin 27 | >>> p = Pinyin() 28 | >>> # default splitter is `-` 29 | >>> p.get_pinyin(u"上海") 30 | 'shang-hai' 31 | >>> # show tone marks 32 | >>> p.get_pinyin(u"上海", show_tone_marks=True) 33 | 'shàng-hǎi' 34 | >>> # remove splitter 35 | >>> p.get_pinyin(u"上海", '') 36 | 'shanghai' 37 | >>> # set splitter as whitespace 38 | >>> p.get_pinyin(u"上海", ' ') 39 | 'shang hai' 40 | >>> p.get_initial(u"上") 41 | 'S' 42 | >>> p.get_initials(u"上海") 43 | 'S-H' 44 | >>> p.get_initials(u"上海", u'') 45 | 'SH' 46 | >>> p.get_initials(u"上海", u' ') 47 | 'S H' 48 | 49 | 请输入utf8编码汉字 50 | .. _chinese\_pinyin: https://github.com/flyerhzm/chinese_pinyin 51 | """ 52 | 53 | data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 54 | 'Mandarin.dat') 55 | 56 | def __init__(self, data_path=data_path): 57 | self.dict = {} 58 | with open(data_path) as f: 59 | for line in f: 60 | k, v = line.split('\t') 61 | self.dict[k] = v 62 | 63 | @staticmethod 64 | def decode_pinyin(s): 65 | s = s.lower() 66 | r = "" 67 | t = "" 68 | for c in s: 69 | if "a" <= c <= 'z': 70 | t += c 71 | elif c == ':': 72 | assert t[-1] == 'u' 73 | t = t[:-1] + "\u00fc" 74 | else: 75 | if '0' <= c <= '5': 76 | tone = int(c) % 5 77 | if tone != 0: 78 | m = re.search("[aoeiuv\u00fc]+", t) 79 | if m is None: 80 | # pass when no vowels find yet 81 | t += c 82 | elif len(m.group(0)) == 1: 83 | # if just find one vowels, put the mark on it 84 | t = t[:m.start(0)] \ 85 | + PinyinToneMark[tone][PinyinToneMark[0].index(m.group(0))] \ 86 | + t[m.end(0):] 87 | else: 88 | # mark on vowels which search with "a, o, e" one by one 89 | # when "i" and "u" stand together, make the vowels behind 90 | for num, vowels in enumerate(("a", "o", "e", "ui", "iu")): 91 | if vowels in t: 92 | t = t.replace(vowels[-1], PinyinToneMark[tone][num]) 93 | break 94 | else: 95 | t += "!" 
96 | r += t 97 | t = "" 98 | r += t 99 | return r 100 | 101 | @staticmethod 102 | def convert_pinyin(word, convert): 103 | if convert == 'capitalize': 104 | return word.capitalize() 105 | if convert == 'lower': 106 | return word.lower() 107 | if convert == 'upper': 108 | return word.upper() 109 | 110 | def get_pinyin(self, chars=u'你好', splitter=u'-', 111 | show_tone_marks=False, convert='lower'): 112 | result = [] 113 | flag = 1 114 | for char in chars: 115 | key = "%X" % ord(char) 116 | try: 117 | if show_tone_marks: 118 | word = self.decode_pinyin(self.dict[key].split()[0].strip()) 119 | else: 120 | word = self.dict[key].split()[0].strip()[:-1] 121 | word = self.convert_pinyin(word, convert) 122 | result.append(word) 123 | flag = 1 124 | except KeyError: 125 | if flag: 126 | result.append(char) 127 | else: 128 | result[-1] += char 129 | flag = 0 130 | return splitter.join(result) 131 | 132 | def get_initial(self, char=u'你'): 133 | try: 134 | return self.dict["%X" % ord(char)].split(" ")[0][0] 135 | except KeyError: 136 | return char 137 | 138 | def get_initials(self, chars=u'你好', splitter=u'-'): 139 | result = [] 140 | flag = 1 141 | for char in chars: 142 | try: 143 | result.append(self.dict["%X" % ord(char)].split(" ")[0][0]) 144 | flag = 1 145 | except KeyError: 146 | if flag: 147 | result.append(char) 148 | else: 149 | result[-1] += char 150 | 151 | return splitter.join(result) 152 | -------------------------------------------------------------------------------- /bs4/tests/test_builder_registry.py: -------------------------------------------------------------------------------- 1 | """Tests of the builder registry.""" 2 | 3 | import unittest 4 | import warnings 5 | 6 | from bs4 import BeautifulSoup 7 | from bs4.builder import ( 8 | builder_registry as registry, 9 | HTMLParserTreeBuilder, 10 | TreeBuilderRegistry, 11 | ) 12 | 13 | try: 14 | from bs4.builder import HTML5TreeBuilder 15 | HTML5LIB_PRESENT = True 16 | except ImportError: 17 | HTML5LIB_PRESENT = False 18 | 19 | try: 20 | from bs4.builder import ( 21 | LXMLTreeBuilderForXML, 22 | LXMLTreeBuilder, 23 | ) 24 | LXML_PRESENT = True 25 | except ImportError: 26 | LXML_PRESENT = False 27 | 28 | 29 | class BuiltInRegistryTest(unittest.TestCase): 30 | """Test the built-in registry with the default builders registered.""" 31 | 32 | def test_combination(self): 33 | if LXML_PRESENT: 34 | self.assertEqual(registry.lookup('fast', 'html'), 35 | LXMLTreeBuilder) 36 | 37 | if LXML_PRESENT: 38 | self.assertEqual(registry.lookup('permissive', 'xml'), 39 | LXMLTreeBuilderForXML) 40 | self.assertEqual(registry.lookup('strict', 'html'), 41 | HTMLParserTreeBuilder) 42 | if HTML5LIB_PRESENT: 43 | self.assertEqual(registry.lookup('html5lib', 'html'), 44 | HTML5TreeBuilder) 45 | 46 | def test_lookup_by_markup_type(self): 47 | if LXML_PRESENT: 48 | self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) 49 | self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) 50 | else: 51 | self.assertEqual(registry.lookup('xml'), None) 52 | if HTML5LIB_PRESENT: 53 | self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) 54 | else: 55 | self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) 56 | 57 | def test_named_library(self): 58 | if LXML_PRESENT: 59 | self.assertEqual(registry.lookup('lxml', 'xml'), 60 | LXMLTreeBuilderForXML) 61 | self.assertEqual(registry.lookup('lxml', 'html'), 62 | LXMLTreeBuilder) 63 | if HTML5LIB_PRESENT: 64 | self.assertEqual(registry.lookup('html5lib'), 65 | HTML5TreeBuilder) 66 | 67 | 
self.assertEqual(registry.lookup('html.parser'), 68 | HTMLParserTreeBuilder) 69 | 70 | def test_beautifulsoup_constructor_does_lookup(self): 71 | 72 | with warnings.catch_warnings(record=True) as w: 73 | # This will create a warning about not explicitly 74 | # specifying a parser, but we'll ignore it. 75 | 76 | # You can pass in a string. 77 | BeautifulSoup("", features="html") 78 | # Or a list of strings. 79 | BeautifulSoup("", features=["html", "fast"]) 80 | 81 | # You'll get an exception if BS can't find an appropriate 82 | # builder. 83 | self.assertRaises(ValueError, BeautifulSoup, 84 | "", features="no-such-feature") 85 | 86 | class RegistryTest(unittest.TestCase): 87 | """Test the TreeBuilderRegistry class in general.""" 88 | 89 | def setUp(self): 90 | self.registry = TreeBuilderRegistry() 91 | 92 | def builder_for_features(self, *feature_list): 93 | cls = type('Builder_' + '_'.join(feature_list), 94 | (object,), {'features' : feature_list}) 95 | 96 | self.registry.register(cls) 97 | return cls 98 | 99 | def test_register_with_no_features(self): 100 | builder = self.builder_for_features() 101 | 102 | # Since the builder advertises no features, you can't find it 103 | # by looking up features. 104 | self.assertEqual(self.registry.lookup('foo'), None) 105 | 106 | # But you can find it by doing a lookup with no features, if 107 | # this happens to be the only registered builder. 108 | self.assertEqual(self.registry.lookup(), builder) 109 | 110 | def test_register_with_features_makes_lookup_succeed(self): 111 | builder = self.builder_for_features('foo', 'bar') 112 | self.assertEqual(self.registry.lookup('foo'), builder) 113 | self.assertEqual(self.registry.lookup('bar'), builder) 114 | 115 | def test_lookup_fails_when_no_builder_implements_feature(self): 116 | builder = self.builder_for_features('foo', 'bar') 117 | self.assertEqual(self.registry.lookup('baz'), None) 118 | 119 | def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): 120 | builder1 = self.builder_for_features('foo') 121 | builder2 = self.builder_for_features('bar') 122 | self.assertEqual(self.registry.lookup(), builder2) 123 | 124 | def test_lookup_fails_when_no_tree_builders_registered(self): 125 | self.assertEqual(self.registry.lookup(), None) 126 | 127 | def test_lookup_gets_most_recent_builder_supporting_all_features(self): 128 | has_one = self.builder_for_features('foo') 129 | has_the_other = self.builder_for_features('bar') 130 | has_both_early = self.builder_for_features('foo', 'bar', 'baz') 131 | has_both_late = self.builder_for_features('foo', 'bar', 'quux') 132 | lacks_one = self.builder_for_features('bar') 133 | has_the_other = self.builder_for_features('foo') 134 | 135 | # There are two builders featuring 'foo' and 'bar', but 136 | # the one that also features 'quux' was registered later. 137 | self.assertEqual(self.registry.lookup('foo', 'bar'), 138 | has_both_late) 139 | 140 | # There is only one builder featuring 'foo', 'bar', and 'baz'. 
141 | self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'), 142 | has_both_early) 143 | 144 | def test_lookup_fails_when_cannot_reconcile_requested_features(self): 145 | builder1 = self.builder_for_features('foo', 'bar') 146 | builder2 = self.builder_for_features('foo', 'baz') 147 | self.assertEqual(self.registry.lookup('bar', 'baz'), None) 148 | -------------------------------------------------------------------------------- /bs4/diagnose.py: -------------------------------------------------------------------------------- 1 | """Diagnostic functions, mainly for use when doing tech support.""" 2 | 3 | __license__ = "MIT" 4 | 5 | import cProfile 6 | from StringIO import StringIO 7 | from HTMLParser import HTMLParser 8 | import bs4 9 | from bs4 import BeautifulSoup, __version__ 10 | from bs4.builder import builder_registry 11 | 12 | import os 13 | import pstats 14 | import random 15 | import tempfile 16 | import time 17 | import traceback 18 | import sys 19 | import cProfile 20 | 21 | def diagnose(data): 22 | """Diagnostic suite for isolating common problems.""" 23 | print "Diagnostic running on Beautiful Soup %s" % __version__ 24 | print "Python version %s" % sys.version 25 | 26 | basic_parsers = ["html.parser", "html5lib", "lxml"] 27 | for name in basic_parsers: 28 | for builder in builder_registry.builders: 29 | if name in builder.features: 30 | break 31 | else: 32 | basic_parsers.remove(name) 33 | print ( 34 | "I noticed that %s is not installed. Installing it may help." % 35 | name) 36 | 37 | if 'lxml' in basic_parsers: 38 | basic_parsers.append(["lxml", "xml"]) 39 | try: 40 | from lxml import etree 41 | print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) 42 | except ImportError, e: 43 | print ( 44 | "lxml is not installed or couldn't be imported.") 45 | 46 | 47 | if 'html5lib' in basic_parsers: 48 | try: 49 | import html5lib 50 | print "Found html5lib version %s" % html5lib.__version__ 51 | except ImportError, e: 52 | print ( 53 | "html5lib is not installed or couldn't be imported.") 54 | 55 | if hasattr(data, 'read'): 56 | data = data.read() 57 | elif os.path.exists(data): 58 | print '"%s" looks like a filename. Reading data from the file.' % data 59 | data = open(data).read() 60 | elif data.startswith("http:") or data.startswith("https:"): 61 | print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data 62 | print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." 63 | return 64 | print 65 | 66 | for parser in basic_parsers: 67 | print "Trying to parse your markup with %s" % parser 68 | success = False 69 | try: 70 | soup = BeautifulSoup(data, parser) 71 | success = True 72 | except Exception, e: 73 | print "%s could not parse the markup." % parser 74 | traceback.print_exc() 75 | if success: 76 | print "Here's what %s did with the markup:" % parser 77 | print soup.prettify() 78 | 79 | print "-" * 80 80 | 81 | def lxml_trace(data, html=True, **kwargs): 82 | """Print out the lxml events that occur during parsing. 83 | 84 | This lets you see how lxml parses a document when no Beautiful 85 | Soup code is running. 
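    Example (a sketch; any small markup string will do)::

        lxml_trace("<p>foo</p>")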
86 |     """
87 |     from lxml import etree
88 |     for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
89 |         print("%s, %4s, %s" % (event, element.tag, element.text))
90 |
91 | class AnnouncingParser(HTMLParser):
92 |     """Announces HTMLParser parse events, without doing anything else."""
93 |
94 |     def _p(self, s):
95 |         print(s)
96 |
97 |     def handle_starttag(self, name, attrs):
98 |         self._p("%s START" % name)
99 |
100 |     def handle_endtag(self, name):
101 |         self._p("%s END" % name)
102 |
103 |     def handle_data(self, data):
104 |         self._p("%s DATA" % data)
105 |
106 |     def handle_charref(self, name):
107 |         self._p("%s CHARREF" % name)
108 |
109 |     def handle_entityref(self, name):
110 |         self._p("%s ENTITYREF" % name)
111 |
112 |     def handle_comment(self, data):
113 |         self._p("%s COMMENT" % data)
114 |
115 |     def handle_decl(self, data):
116 |         self._p("%s DECL" % data)
117 |
118 |     def unknown_decl(self, data):
119 |         self._p("%s UNKNOWN-DECL" % data)
120 |
121 |     def handle_pi(self, data):
122 |         self._p("%s PI" % data)
123 |
124 | def htmlparser_trace(data):
125 |     """Print out the HTMLParser events that occur during parsing.
126 |
127 |     This lets you see how HTMLParser parses a document when no
128 |     Beautiful Soup code is running.
129 |     """
130 |     parser = AnnouncingParser()
131 |     parser.feed(data)
132 |
133 | _vowels = "aeiou"
134 | _consonants = "bcdfghjklmnpqrstvwxyz"
135 |
136 | def rword(length=5):
137 |     "Generate a random word-like string."
138 |     s = ''
139 |     for i in range(length):
140 |         if i % 2 == 0:
141 |             t = _consonants
142 |         else:
143 |             t = _vowels
144 |         s += random.choice(t)
145 |     return s
146 |
147 | def rsentence(length=4):
148 |     "Generate a random sentence-like string."
149 |     return " ".join(rword(random.randint(4,9)) for i in range(length))
150 |
151 | def rdoc(num_elements=1000):
152 |     """Randomly generate an invalid HTML document."""
153 |     tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
154 |     elements = []
155 |     for i in range(num_elements):
156 |         choice = random.randint(0,3)
157 |         if choice == 0:
158 |             # New tag.
159 |             tag_name = random.choice(tag_names)
160 |             elements.append("<%s>" % tag_name)
161 |         elif choice == 1:
162 |             elements.append(rsentence(random.randint(1,4)))
163 |         elif choice == 2:
164 |             # Close a tag.
165 |             tag_name = random.choice(tag_names)
166 |             elements.append("</%s>" % tag_name)
167 |     return "<html>" + "\n".join(elements) + "</html>"
168 |
169 | def benchmark_parsers(num_elements=100000):
170 |     """Very basic head-to-head performance benchmark."""
171 |     print "Comparative parser benchmark on Beautiful Soup %s" % __version__
172 |     data = rdoc(num_elements)
173 |     print "Generated a large invalid HTML document (%d bytes)." % len(data)
174 |
175 |     for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
176 |         success = False
177 |         try:
178 |             a = time.time()
179 |             soup = BeautifulSoup(data, parser)
180 |             b = time.time()
181 |             success = True
182 |         except Exception, e:
183 |             print "%s could not parse the markup." % parser
184 |             traceback.print_exc()
185 |         if success:
186 |             print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
187 |
188 |     from lxml import etree
189 |     a = time.time()
190 |     etree.HTML(data)
191 |     b = time.time()
192 |     print "Raw lxml parsed the markup in %.2fs." % (b-a)
193 |
194 |     import html5lib
195 |     parser = html5lib.HTMLParser()
196 |     a = time.time()
197 |     parser.parse(data)
198 |     b = time.time()
199 |     print "Raw html5lib parsed the markup in %.2fs."
% (b-a) 200 | 201 | def profile(num_elements=100000, parser="lxml"): 202 | 203 | filehandle = tempfile.NamedTemporaryFile() 204 | filename = filehandle.name 205 | 206 | data = rdoc(num_elements) 207 | vars = dict(bs4=bs4, data=data, parser=parser) 208 | cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) 209 | 210 | stats = pstats.Stats(filename) 211 | # stats.strip_dirs() 212 | stats.sort_stats("cumulative") 213 | stats.print_stats('_html5lib|bs4', 50) 214 | 215 | if __name__ == '__main__': 216 | diagnose(sys.stdin.read()) 217 | -------------------------------------------------------------------------------- /workflow/background.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | # 4 | # Copyright (c) 2014 deanishe@deanishe.net 5 | # 6 | # MIT Licence. See http://opensource.org/licenses/MIT 7 | # 8 | # Created on 2014-04-06 9 | # 10 | 11 | """ 12 | Run background tasks 13 | """ 14 | 15 | from __future__ import print_function, unicode_literals 16 | 17 | import sys 18 | import os 19 | import subprocess 20 | import pickle 21 | 22 | from workflow import Workflow 23 | 24 | __all__ = ['is_running', 'run_in_background'] 25 | 26 | _wf = None 27 | 28 | 29 | def wf(): 30 | global _wf 31 | if _wf is None: 32 | _wf = Workflow() 33 | return _wf 34 | 35 | 36 | def _arg_cache(name): 37 | """Return path to pickle cache file for arguments 38 | 39 | :param name: name of task 40 | :type name: ``unicode`` 41 | :returns: Path to cache file 42 | :rtype: ``unicode`` filepath 43 | 44 | """ 45 | 46 | return wf().cachefile('{0}.argcache'.format(name)) 47 | 48 | 49 | def _pid_file(name): 50 | """Return path to PID file for ``name`` 51 | 52 | :param name: name of task 53 | :type name: ``unicode`` 54 | :returns: Path to PID file for task 55 | :rtype: ``unicode`` filepath 56 | 57 | """ 58 | 59 | return wf().cachefile('{0}.pid'.format(name)) 60 | 61 | 62 | def _process_exists(pid): 63 | """Check if a process with PID ``pid`` exists 64 | 65 | :param pid: PID to check 66 | :type pid: ``int`` 67 | :returns: ``True`` if process exists, else ``False`` 68 | :rtype: ``Boolean`` 69 | """ 70 | 71 | try: 72 | os.kill(pid, 0) 73 | except OSError: # not running 74 | return False 75 | return True 76 | 77 | 78 | def is_running(name): 79 | """ 80 | Test whether task is running under ``name`` 81 | 82 | :param name: name of task 83 | :type name: ``unicode`` 84 | :returns: ``True`` if task with name ``name`` is running, else ``False`` 85 | :rtype: ``Boolean`` 86 | 87 | """ 88 | pidfile = _pid_file(name) 89 | if not os.path.exists(pidfile): 90 | return False 91 | 92 | with open(pidfile, 'rb') as file_obj: 93 | pid = int(file_obj.read().strip()) 94 | 95 | if _process_exists(pid): 96 | return True 97 | 98 | elif os.path.exists(pidfile): 99 | os.unlink(pidfile) 100 | 101 | return False 102 | 103 | 104 | def _background(stdin='/dev/null', stdout='/dev/null', 105 | stderr='/dev/null'): # pragma: no cover 106 | """Fork the current process into a background daemon. 107 | 108 | :param stdin: where to read input 109 | :type stdin: filepath 110 | :param stdout: where to write stdout output 111 | :type stdout: filepath 112 | :param stderr: where to write stderr output 113 | :type stderr: filepath 114 | 115 | """ 116 | 117 | # Do first fork. 118 | try: 119 | pid = os.fork() 120 | if pid > 0: 121 | sys.exit(0) # Exit first parent. 
122 | except OSError as e: 123 | wf().logger.critical("fork #1 failed: ({0:d}) {1}".format( 124 | e.errno, e.strerror)) 125 | sys.exit(1) 126 | # Decouple from parent environment. 127 | os.chdir(wf().workflowdir) 128 | os.umask(0) 129 | os.setsid() 130 | # Do second fork. 131 | try: 132 | pid = os.fork() 133 | if pid > 0: 134 | sys.exit(0) # Exit second parent. 135 | except OSError as e: 136 | wf().logger.critical("fork #2 failed: ({0:d}) {1}".format( 137 | e.errno, e.strerror)) 138 | sys.exit(1) 139 | # Now I am a daemon! 140 | # Redirect standard file descriptors. 141 | si = file(stdin, 'r', 0) 142 | so = file(stdout, 'a+', 0) 143 | se = file(stderr, 'a+', 0) 144 | if hasattr(sys.stdin, 'fileno'): 145 | os.dup2(si.fileno(), sys.stdin.fileno()) 146 | if hasattr(sys.stdout, 'fileno'): 147 | os.dup2(so.fileno(), sys.stdout.fileno()) 148 | if hasattr(sys.stderr, 'fileno'): 149 | os.dup2(se.fileno(), sys.stderr.fileno()) 150 | 151 | 152 | def run_in_background(name, args, **kwargs): 153 | """Pickle arguments to cache file, then call this script again via 154 | :func:`subprocess.call`. 155 | 156 | :param name: name of task 157 | :type name: ``unicode`` 158 | :param args: arguments passed as first argument to :func:`subprocess.call` 159 | :param \**kwargs: keyword arguments to :func:`subprocess.call` 160 | :returns: exit code of sub-process 161 | :rtype: ``int`` 162 | 163 | When you call this function, it caches its arguments and then calls 164 | ``background.py`` in a subprocess. The Python subprocess will load the 165 | cached arguments, fork into the background, and then run the command you 166 | specified. 167 | 168 | This function will return as soon as the ``background.py`` subprocess has 169 | forked, returning the exit code of *that* process (i.e. not of the command 170 | you're trying to run). 171 | 172 | If that process fails, an error will be written to the log file. 173 | 174 | If a process is already running under the same name, this function will 175 | return immediately and will not run the specified command. 
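    Example (a sketch; ``export.py`` stands in for any script you want
    to run without blocking Alfred)::

        from workflow.background import run_in_background, is_running

        if not is_running('export'):
            run_in_background('export', ['/usr/bin/python', 'export.py'])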
176 | 177 | """ 178 | 179 | if is_running(name): 180 | wf().logger.info('Task `{0}` is already running'.format(name)) 181 | return 182 | 183 | argcache = _arg_cache(name) 184 | 185 | # Cache arguments 186 | with open(argcache, 'wb') as file_obj: 187 | pickle.dump({'args': args, 'kwargs': kwargs}, file_obj) 188 | wf().logger.debug('Command arguments cached to `{0}`'.format(argcache)) 189 | 190 | # Call this script 191 | cmd = ['/usr/bin/python', __file__, name] 192 | wf().logger.debug('Calling {0!r} ...'.format(cmd)) 193 | retcode = subprocess.call(cmd) 194 | if retcode: # pragma: no cover 195 | wf().logger.error('Failed to call task in background') 196 | else: 197 | wf().logger.debug('Executing task `{0}` in background...'.format(name)) 198 | return retcode 199 | 200 | 201 | def main(wf): # pragma: no cover 202 | """ 203 | Load cached arguments, fork into background, then call 204 | :meth:`subprocess.call` with cached arguments 205 | 206 | """ 207 | 208 | name = wf.args[0] 209 | argcache = _arg_cache(name) 210 | if not os.path.exists(argcache): 211 | wf.logger.critical('No arg cache found : {0!r}'.format(argcache)) 212 | return 1 213 | 214 | # Load cached arguments 215 | with open(argcache, 'rb') as file_obj: 216 | data = pickle.load(file_obj) 217 | 218 | # Cached arguments 219 | args = data['args'] 220 | kwargs = data['kwargs'] 221 | 222 | # Delete argument cache file 223 | os.unlink(argcache) 224 | 225 | pidfile = _pid_file(name) 226 | 227 | # Fork to background 228 | _background() 229 | 230 | # Write PID to file 231 | with open(pidfile, 'wb') as file_obj: 232 | file_obj.write('{0}'.format(os.getpid())) 233 | 234 | # Run the command 235 | try: 236 | wf.logger.debug('Task `{0}` running'.format(name)) 237 | wf.logger.debug('cmd : {0!r}'.format(args)) 238 | 239 | retcode = subprocess.call(args, **kwargs) 240 | 241 | if retcode: 242 | wf.logger.error('Command failed with [{0}] : {1!r}'.format( 243 | retcode, args)) 244 | 245 | finally: 246 | if os.path.exists(pidfile): 247 | os.unlink(pidfile) 248 | wf.logger.debug('Task `{0}` finished'.format(name)) 249 | 250 | 251 | if __name__ == '__main__': # pragma: no cover 252 | wf().run(main) 253 | -------------------------------------------------------------------------------- /info.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | bundleid 6 | 7 | connections 8 | 9 | A27C006C-6D4A-4606-8ADB-7B07B6A0FCA9 10 | 11 | 12 | destinationuid 13 | 0EE06942-74AB-41FF-908C-FFBF35486C5D 14 | modifiers 15 | 0 16 | modifiersubtext 17 | 18 | vitoclose 19 | 20 | 21 | 22 | A89A4365-D0B5-46CE-A7B6-A1640961FC55 23 | 24 | 25 | destinationuid 26 | C79D2345-AEF5-4071-B770-C843BA8BCF93 27 | modifiers 28 | 0 29 | modifiersubtext 30 | 31 | vitoclose 32 | 33 | 34 | 35 | C79D2345-AEF5-4071-B770-C843BA8BCF93 36 | 37 | 38 | destinationuid 39 | EBDBF41A-8F1F-4AB3-A4DD-49A38F7999F6 40 | modifiers 41 | 0 42 | modifiersubtext 43 | 44 | vitoclose 45 | 46 | 47 | 48 | FA518468-89D6-4B36-845A-73865872F078 49 | 50 | 51 | destinationuid 52 | 6A616FEE-259D-4EB5-B155-5D8C64455C66 53 | modifiers 54 | 1048576 55 | modifiersubtext 56 | 推送静态站到origin master 57 | vitoclose 58 | 59 | 60 | 61 | destinationuid 62 | A27C006C-6D4A-4606-8ADB-7B07B6A0FCA9 63 | modifiers 64 | 0 65 | modifiersubtext 66 | 67 | vitoclose 68 | 69 | 70 | 71 | 72 | createdby 73 | Hao Li 74 | description 75 | 76 | disabled 77 | 78 | name 79 | Mweb-Blog 80 | objects 81 | 82 | 83 | config 84 | 85 | argumenttype 86 | 2 87 | keyword 88 | mblog 89 | subtext 90 | 本地浏览MWeb网站 
91 | text 92 | MWeb 辅助工具 93 | withspace 94 | 95 | 96 | type 97 | alfred.workflow.input.keyword 98 | uid 99 | FA518468-89D6-4B36-845A-73865872F078 100 | version 101 | 0 102 | 103 | 104 | config 105 | 106 | concurrently 107 | 108 | escaping 109 | 102 110 | script 111 | #!/bin/bash 112 | DIR="$site_path" 113 | echo "push blog", $DIR 114 | if [ -d "$DIR" ]; then 115 | cd ${DIR} 116 | git add -A 117 | git commit -m "Update" 118 | git push -u origin master 119 | fi 120 | scriptargtype 121 | 0 122 | scriptfile 123 | 124 | type 125 | 0 126 | 127 | type 128 | alfred.workflow.action.script 129 | uid 130 | 6A616FEE-259D-4EB5-B155-5D8C64455C66 131 | version 132 | 1 133 | 134 | 135 | config 136 | 137 | browser 138 | 139 | spaces 140 | 141 | url 142 | {query} 143 | utf8 144 | 145 | 146 | type 147 | alfred.workflow.action.openurl 148 | uid 149 | 0EE06942-74AB-41FF-908C-FFBF35486C5D 150 | version 151 | 0 152 | 153 | 154 | config 155 | 156 | concurrently 157 | 158 | escaping 159 | 102 160 | script 161 | #!/bin/bash 162 | xattr -c $site_path/index.html 163 | echo -n "file://$site_path/index.html" 164 | scriptargtype 165 | 0 166 | scriptfile 167 | 168 | type 169 | 0 170 | 171 | type 172 | alfred.workflow.action.script 173 | uid 174 | A27C006C-6D4A-4606-8ADB-7B07B6A0FCA9 175 | version 176 | 1 177 | 178 | 179 | config 180 | 181 | lastpathcomponent 182 | 183 | onlyshowifquerypopulated 184 | 185 | removeextension 186 | 187 | text 188 | {query} 189 | title 190 | 已复制到剪切版 191 | 192 | type 193 | alfred.workflow.output.notification 194 | uid 195 | EBDBF41A-8F1F-4AB3-A4DD-49A38F7999F6 196 | version 197 | 0 198 | 199 | 200 | config 201 | 202 | alfredfiltersresults 203 | 204 | argumenttype 205 | 1 206 | escaping 207 | 102 208 | keyword 209 | mlist 210 | queuedelaycustom 211 | 3 212 | queuedelayimmediatelyinitially 213 | 214 | queuedelaymode 215 | 0 216 | queuemode 217 | 1 218 | runningsubtext 219 | ... 
220 | script 221 | query=$1 222 | 223 | site_path="$site_path" 224 | 225 | python ListArticle.py "$site_path" "$query" 226 | scriptargtype 227 | 1 228 | scriptfile 229 | 230 | subtext 231 | 使用拼音搜索,选中后复制到剪切版 232 | title 233 | MWeb 静态网页文章列表 234 | type 235 | 0 236 | withspace 237 | 238 | 239 | type 240 | alfred.workflow.input.scriptfilter 241 | uid 242 | A89A4365-D0B5-46CE-A7B6-A1640961FC55 243 | version 244 | 1 245 | 246 | 247 | config 248 | 249 | autopaste 250 | 251 | clipboardtext 252 | {query} 253 | transient 254 | 255 | 256 | type 257 | alfred.workflow.output.clipboard 258 | uid 259 | C79D2345-AEF5-4071-B770-C843BA8BCF93 260 | version 261 | 1 262 | 263 | 264 | readme 265 | 266 | uidata 267 | 268 | 0EE06942-74AB-41FF-908C-FFBF35486C5D 269 | 270 | xpos 271 | 550 272 | ypos 273 | 200 274 | 275 | 6A616FEE-259D-4EB5-B155-5D8C64455C66 276 | 277 | xpos 278 | 550 279 | ypos 280 | 80 281 | 282 | A27C006C-6D4A-4606-8ADB-7B07B6A0FCA9 283 | 284 | xpos 285 | 370 286 | ypos 287 | 200 288 | 289 | A89A4365-D0B5-46CE-A7B6-A1640961FC55 290 | 291 | xpos 292 | 190 293 | ypos 294 | 340 295 | 296 | C79D2345-AEF5-4071-B770-C843BA8BCF93 297 | 298 | xpos 299 | 370 300 | ypos 301 | 340 302 | 303 | EBDBF41A-8F1F-4AB3-A4DD-49A38F7999F6 304 | 305 | xpos 306 | 550 307 | ypos 308 | 340 309 | 310 | FA518468-89D6-4B36-845A-73865872F078 311 | 312 | xpos 313 | 180 314 | ypos 315 | 80 316 | 317 | 318 | variables 319 | 320 | site_path 321 | /Users/hao/blog/MWeb-Blog/Blog 322 | 323 | variablesdontexport 324 | 325 | site_path 326 | 327 | version 328 | 329 | webaddress 330 | 331 | 332 | 333 | -------------------------------------------------------------------------------- /bs4/builder/_lxml.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'LXMLTreeBuilderForXML', 3 | 'LXMLTreeBuilder', 4 | ] 5 | 6 | from io import BytesIO 7 | from StringIO import StringIO 8 | import collections 9 | from lxml import etree 10 | from bs4.element import ( 11 | Comment, 12 | Doctype, 13 | NamespacedAttribute, 14 | ProcessingInstruction, 15 | ) 16 | from bs4.builder import ( 17 | FAST, 18 | HTML, 19 | HTMLTreeBuilder, 20 | PERMISSIVE, 21 | ParserRejectedMarkup, 22 | TreeBuilder, 23 | XML) 24 | from bs4.dammit import EncodingDetector 25 | 26 | LXML = 'lxml' 27 | 28 | class LXMLTreeBuilderForXML(TreeBuilder): 29 | DEFAULT_PARSER_CLASS = etree.XMLParser 30 | 31 | is_xml = True 32 | 33 | NAME = "lxml-xml" 34 | ALTERNATE_NAMES = ["xml"] 35 | 36 | # Well, it's permissive by XML parser standards. 37 | features = [NAME, LXML, XML, FAST, PERMISSIVE] 38 | 39 | CHUNK_SIZE = 512 40 | 41 | # This namespace mapping is specified in the XML Namespace 42 | # standard. 43 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 44 | 45 | def default_parser(self, encoding): 46 | # This can either return a parser object or a class, which 47 | # will be instantiated with default arguments. 48 | if self._default_parser is not None: 49 | return self._default_parser 50 | return etree.XMLParser( 51 | target=self, strip_cdata=False, recover=True, encoding=encoding) 52 | 53 | def parser_for(self, encoding): 54 | # Use the default parser. 
55 | parser = self.default_parser(encoding) 56 | 57 | if isinstance(parser, collections.Callable): 58 | # Instantiate the parser with default arguments 59 | parser = parser(target=self, strip_cdata=False, encoding=encoding) 60 | return parser 61 | 62 | def __init__(self, parser=None, empty_element_tags=None): 63 | # TODO: Issue a warning if parser is present but not a 64 | # callable, since that means there's no way to create new 65 | # parsers for different encodings. 66 | self._default_parser = parser 67 | if empty_element_tags is not None: 68 | self.empty_element_tags = set(empty_element_tags) 69 | self.soup = None 70 | self.nsmaps = [self.DEFAULT_NSMAPS] 71 | 72 | def _getNsTag(self, tag): 73 | # Split the namespace URL out of a fully-qualified lxml tag 74 | # name. Copied from lxml's src/lxml/sax.py. 75 | if tag[0] == '{': 76 | return tuple(tag[1:].split('}', 1)) 77 | else: 78 | return (None, tag) 79 | 80 | def prepare_markup(self, markup, user_specified_encoding=None, 81 | exclude_encodings=None, 82 | document_declared_encoding=None): 83 | """ 84 | :yield: A series of 4-tuples. 85 | (markup, encoding, declared encoding, 86 | has undergone character replacement) 87 | 88 | Each 4-tuple represents a strategy for parsing the document. 89 | """ 90 | if isinstance(markup, unicode): 91 | # We were given Unicode. Maybe lxml can parse Unicode on 92 | # this system? 93 | yield markup, None, document_declared_encoding, False 94 | 95 | if isinstance(markup, unicode): 96 | # No, apparently not. Convert the Unicode to UTF-8 and 97 | # tell lxml to parse it as UTF-8. 98 | yield (markup.encode("utf8"), "utf8", 99 | document_declared_encoding, False) 100 | 101 | # Instead of using UnicodeDammit to convert the bytestring to 102 | # Unicode using different encodings, use EncodingDetector to 103 | # iterate over the encodings, and tell lxml to try to parse 104 | # the document as each one in turn. 105 | is_html = not self.is_xml 106 | try_encodings = [user_specified_encoding, document_declared_encoding] 107 | detector = EncodingDetector( 108 | markup, try_encodings, is_html, exclude_encodings) 109 | for encoding in detector.encodings: 110 | yield (detector.markup, encoding, document_declared_encoding, False) 111 | 112 | def feed(self, markup): 113 | if isinstance(markup, bytes): 114 | markup = BytesIO(markup) 115 | elif isinstance(markup, unicode): 116 | markup = StringIO(markup) 117 | 118 | # Call feed() at least once, even if the markup is empty, 119 | # or the parser won't be initialized. 120 | data = markup.read(self.CHUNK_SIZE) 121 | try: 122 | self.parser = self.parser_for(self.soup.original_encoding) 123 | self.parser.feed(data) 124 | while len(data) != 0: 125 | # Now call feed() on the rest of the data, chunk by chunk. 126 | data = markup.read(self.CHUNK_SIZE) 127 | if len(data) != 0: 128 | self.parser.feed(data) 129 | self.parser.close() 130 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: 131 | raise ParserRejectedMarkup(str(e)) 132 | 133 | def close(self): 134 | self.nsmaps = [self.DEFAULT_NSMAPS] 135 | 136 | def start(self, name, attrs, nsmap={}): 137 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 138 | attrs = dict(attrs) 139 | nsprefix = None 140 | # Invert each namespace map as it comes in. 141 | if len(self.nsmaps) > 1: 142 | # There are no new namespaces for this tag, but 143 | # non-default namespaces are in play, so we need a 144 | # separate tag stack to know when they end. 
145 | self.nsmaps.append(None) 146 | elif len(nsmap) > 0: 147 | # A new namespace mapping has come into play. 148 | inverted_nsmap = dict((value, key) for key, value in nsmap.items()) 149 | self.nsmaps.append(inverted_nsmap) 150 | # Also treat the namespace mapping as a set of attributes on the 151 | # tag, so we can recreate it later. 152 | attrs = attrs.copy() 153 | for prefix, namespace in nsmap.items(): 154 | attribute = NamespacedAttribute( 155 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 156 | attrs[attribute] = namespace 157 | 158 | # Namespaces are in play. Find any attributes that came in 159 | # from lxml with namespaces attached to their names, and 160 | # turn then into NamespacedAttribute objects. 161 | new_attrs = {} 162 | for attr, value in attrs.items(): 163 | namespace, attr = self._getNsTag(attr) 164 | if namespace is None: 165 | new_attrs[attr] = value 166 | else: 167 | nsprefix = self._prefix_for_namespace(namespace) 168 | attr = NamespacedAttribute(nsprefix, attr, namespace) 169 | new_attrs[attr] = value 170 | attrs = new_attrs 171 | 172 | namespace, name = self._getNsTag(name) 173 | nsprefix = self._prefix_for_namespace(namespace) 174 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) 175 | 176 | def _prefix_for_namespace(self, namespace): 177 | """Find the currently active prefix for the given namespace.""" 178 | if namespace is None: 179 | return None 180 | for inverted_nsmap in reversed(self.nsmaps): 181 | if inverted_nsmap is not None and namespace in inverted_nsmap: 182 | return inverted_nsmap[namespace] 183 | return None 184 | 185 | def end(self, name): 186 | self.soup.endData() 187 | completed_tag = self.soup.tagStack[-1] 188 | namespace, name = self._getNsTag(name) 189 | nsprefix = None 190 | if namespace is not None: 191 | for inverted_nsmap in reversed(self.nsmaps): 192 | if inverted_nsmap is not None and namespace in inverted_nsmap: 193 | nsprefix = inverted_nsmap[namespace] 194 | break 195 | self.soup.handle_endtag(name, nsprefix) 196 | if len(self.nsmaps) > 1: 197 | # This tag, or one of its parents, introduced a namespace 198 | # mapping, so pop it off the stack. 199 | self.nsmaps.pop() 200 | 201 | def pi(self, target, data): 202 | self.soup.endData() 203 | self.soup.handle_data(target + ' ' + data) 204 | self.soup.endData(ProcessingInstruction) 205 | 206 | def data(self, content): 207 | self.soup.handle_data(content) 208 | 209 | def doctype(self, name, pubid, system): 210 | self.soup.endData() 211 | doctype = Doctype.for_name_and_ids(name, pubid, system) 212 | self.soup.object_was_parsed(doctype) 213 | 214 | def comment(self, content): 215 | "Handle comments as Comment objects." 
216 |         self.soup.endData()
217 |         self.soup.handle_data(content)
218 |         self.soup.endData(Comment)
219 |
220 |     def test_fragment_to_document(self, fragment):
221 |         """See `TreeBuilder`."""
222 |         return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
223 |
224 |
225 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
226 |
227 |     NAME = LXML
228 |     ALTERNATE_NAMES = ["lxml-html"]
229 |
230 |     features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
231 |     is_xml = False
232 |
233 |     def default_parser(self, encoding):
234 |         return etree.HTMLParser
235 |
236 |     def feed(self, markup):
237 |         encoding = self.soup.original_encoding
238 |         try:
239 |             self.parser = self.parser_for(encoding)
240 |             self.parser.feed(markup)
241 |             self.parser.close()
242 |         except (UnicodeDecodeError, LookupError, etree.ParserError), e:
243 |             raise ParserRejectedMarkup(str(e))
244 |
245 |
246 |     def test_fragment_to_document(self, fragment):
247 |         """See `TreeBuilder`."""
248 |         return u'<html><head></head><body>%s</body></html>' % fragment
249 | -------------------------------------------------------------------------------- /bs4/builder/_htmlparser.py: --------------------------------------------------------------------------------
1 | """Use the HTMLParser library to parse HTML files that aren't too bad."""
2 |
3 | __all__ = [
4 |     'HTMLParserTreeBuilder',
5 |     ]
6 |
7 | from HTMLParser import HTMLParser
8 |
9 | try:
10 |     from HTMLParser import HTMLParseError
11 | except ImportError, e:
12 |     # HTMLParseError is removed in Python 3.5. Since it can never be
13 |     # thrown in 3.5, we can just define our own class as a placeholder.
14 |     class HTMLParseError(Exception):
15 |         pass
16 |
17 | import sys
18 | import warnings
19 |
20 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
21 | # argument, which we'd like to set to False. Unfortunately,
22 | # http://bugs.python.org/issue13273 makes strict=True a better bet
23 | # before Python 3.2.3.
24 | #
25 | # At the end of this file, we monkeypatch HTMLParser so that
26 | # strict=True works well on Python 3.2.2.
27 | major, minor, release = sys.version_info[:3]
28 | CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
29 | CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
30 | CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
31 |
32 |
33 | from bs4.element import (
34 |     CData,
35 |     Comment,
36 |     Declaration,
37 |     Doctype,
38 |     ProcessingInstruction,
39 |     )
40 | from bs4.dammit import EntitySubstitution, UnicodeDammit
41 |
42 | from bs4.builder import (
43 |     HTML,
44 |     HTMLTreeBuilder,
45 |     STRICT,
46 |     )
47 |
48 |
49 | HTMLPARSER = 'html.parser'
50 |
51 | class BeautifulSoupHTMLParser(HTMLParser):
52 |     def handle_starttag(self, name, attrs):
53 |         # XXX namespace
54 |         attr_dict = {}
55 |         for key, value in attrs:
56 |             # Change None attribute values to the empty string
57 |             # for consistency with the other tree builders.
58 |             if value is None:
59 |                 value = ''
60 |             attr_dict[key] = value
61 |             attrvalue = '""'
62 |         self.soup.handle_starttag(name, None, None, attr_dict)
63 |
64 |     def handle_endtag(self, name):
65 |         self.soup.handle_endtag(name)
66 |
67 |     def handle_data(self, data):
68 |         self.soup.handle_data(data)
69 |
70 |     def handle_charref(self, name):
71 |         # XXX workaround for a bug in HTMLParser. Remove this once
72 |         # it's fixed in all supported versions.
73 |         # http://bugs.python.org/issue13633
74 |         if name.startswith('x'):
75 |             real_name = int(name.lstrip('x'), 16)
76 |         elif name.startswith('X'):
77 |             real_name = int(name.lstrip('X'), 16)
78 |         else:
79 |             real_name = int(name)
80 |
81 |         try:
82 |             data = unichr(real_name)
83 |         except (ValueError, OverflowError), e:
84 |             data = u"\N{REPLACEMENT CHARACTER}"
85 |
86 |         self.handle_data(data)
87 |
88 |     def handle_entityref(self, name):
89 |         character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
90 |         if character is not None:
91 |             data = character
92 |         else:
93 |             data = "&%s;" % name
94 |         self.handle_data(data)
95 |
96 |     def handle_comment(self, data):
97 |         self.soup.endData()
98 |         self.soup.handle_data(data)
99 |         self.soup.endData(Comment)
100 |
101 |     def handle_decl(self, data):
102 |         self.soup.endData()
103 |         if data.startswith("DOCTYPE "):
104 |             data = data[len("DOCTYPE "):]
105 |         elif data == 'DOCTYPE':
106 |             # i.e. "<!DOCTYPE>"
107 |             data = ''
108 |         self.soup.handle_data(data)
109 |         self.soup.endData(Doctype)
110 |
111 |     def unknown_decl(self, data):
112 |         if data.upper().startswith('CDATA['):
113 |             cls = CData
114 |             data = data[len('CDATA['):]
115 |         else:
116 |             cls = Declaration
117 |         self.soup.endData()
118 |         self.soup.handle_data(data)
119 |         self.soup.endData(cls)
120 |
121 |     def handle_pi(self, data):
122 |         self.soup.endData()
123 |         self.soup.handle_data(data)
124 |         self.soup.endData(ProcessingInstruction)
125 |
126 |
127 | class HTMLParserTreeBuilder(HTMLTreeBuilder):
128 |
129 |     is_xml = False
130 |     picklable = True
131 |     NAME = HTMLPARSER
132 |     features = [NAME, HTML, STRICT]
133 |
134 |     def __init__(self, *args, **kwargs):
135 |         if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
136 |             kwargs['strict'] = False
137 |         if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
138 |             kwargs['convert_charrefs'] = False
139 |         self.parser_args = (args, kwargs)
140 |
141 |     def prepare_markup(self, markup, user_specified_encoding=None,
142 |                        document_declared_encoding=None, exclude_encodings=None):
143 |         """
144 |         :return: A 4-tuple (markup, original encoding, encoding
145 |         declared within markup, whether any characters had to be
146 |         replaced with REPLACEMENT CHARACTER).
147 |         """
148 |         if isinstance(markup, unicode):
149 |             yield (markup, None, None, False)
150 |             return
151 |
152 |         try_encodings = [user_specified_encoding, document_declared_encoding]
153 |         dammit = UnicodeDammit(markup, try_encodings, is_html=True,
154 |                                exclude_encodings=exclude_encodings)
155 |         yield (dammit.markup, dammit.original_encoding,
156 |                dammit.declared_html_encoding,
157 |                dammit.contains_replacement_characters)
158 |
159 |     def feed(self, markup):
160 |         args, kwargs = self.parser_args
161 |         parser = BeautifulSoupHTMLParser(*args, **kwargs)
162 |         parser.soup = self.soup
163 |         try:
164 |             parser.feed(markup)
165 |         except HTMLParseError, e:
166 |             warnings.warn(RuntimeWarning(
167 |                 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
168 |             raise e
169 |
170 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
171 | # 3.2.3 code. This ensures they don't treat markup like
<a href="f"
as a 172 | # string. 173 | # 174 | # XXX This code can be removed once most Python 3 users are on 3.2.3. 175 | if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: 176 | import re 177 | attrfind_tolerant = re.compile( 178 | r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' 179 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') 180 | HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant 181 | 182 | locatestarttagend = re.compile(r""" 183 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name 184 | (?:\s+ # whitespace before attribute name 185 | (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name 186 | (?:\s*=\s* # value indicator 187 | (?:'[^']*' # LITA-enclosed value 188 | |\"[^\"]*\" # LIT-enclosed value 189 | |[^'\">\s]+ # bare value 190 | ) 191 | )? 192 | ) 193 | )* 194 | \s* # trailing whitespace 195 | """, re.VERBOSE) 196 | BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend 197 | 198 | from html.parser import tagfind, attrfind 199 | 200 | def parse_starttag(self, i): 201 | self.__starttag_text = None 202 | endpos = self.check_for_whole_start_tag(i) 203 | if endpos < 0: 204 | return endpos 205 | rawdata = self.rawdata 206 | self.__starttag_text = rawdata[i:endpos] 207 | 208 | # Now parse the data between i+1 and j into a tag and attrs 209 | attrs = [] 210 | match = tagfind.match(rawdata, i+1) 211 | assert match, 'unexpected call to parse_starttag()' 212 | k = match.end() 213 | self.lasttag = tag = rawdata[i+1:k].lower() 214 | while k < endpos: 215 | if self.strict: 216 | m = attrfind.match(rawdata, k) 217 | else: 218 | m = attrfind_tolerant.match(rawdata, k) 219 | if not m: 220 | break 221 | attrname, rest, attrvalue = m.group(1, 2, 3) 222 | if not rest: 223 | attrvalue = None 224 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 225 | attrvalue[:1] == '"' == attrvalue[-1:]: 226 | attrvalue = attrvalue[1:-1] 227 | if attrvalue: 228 | attrvalue = self.unescape(attrvalue) 229 | attrs.append((attrname.lower(), attrvalue)) 230 | k = m.end() 231 | 232 | end = rawdata[k:endpos].strip() 233 | if end not in (">", "/>"): 234 | lineno, offset = self.getpos() 235 | if "\n" in self.__starttag_text: 236 | lineno = lineno + self.__starttag_text.count("\n") 237 | offset = len(self.__starttag_text) \ 238 | - self.__starttag_text.rfind("\n") 239 | else: 240 | offset = offset + len(self.__starttag_text) 241 | if self.strict: 242 | self.error("junk characters in start tag: %r" 243 | % (rawdata[k:endpos][:20],)) 244 | self.handle_data(rawdata[i:endpos]) 245 | return endpos 246 | if end.endswith('/>'): 247 | # XHTML-style empty tag: 248 | self.handle_startendtag(tag, attrs) 249 | else: 250 | self.handle_starttag(tag, attrs) 251 | if tag in self.CDATA_CONTENT_ELEMENTS: 252 | self.set_cdata_mode(tag) 253 | return endpos 254 | 255 | def set_cdata_mode(self, elem): 256 | self.cdata_elem = elem.lower() 257 | self.interesting = re.compile(r'' % self.cdata_elem, re.I) 258 | 259 | BeautifulSoupHTMLParser.parse_starttag = parse_starttag 260 | BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode 261 | 262 | CONSTRUCTOR_TAKES_STRICT = True 263 | -------------------------------------------------------------------------------- /workflow/notify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | # 4 | # Copyright (c) 2015 deanishe@deanishe.net 5 | # 6 | # MIT Licence. 
See http://opensource.org/licenses/MIT 7 | # 8 | # Created on 2015-11-26 9 | # 10 | 11 | # TODO: Exclude this module from test and code coverage in py2.6 12 | 13 | """ 14 | Post notifications via the OS X Notification Center. This feature 15 | is only available on Mountain Lion (10.8) and later. It will 16 | silently fail on older systems. 17 | 18 | The main API is a single function, :func:`~workflow.notify.notify`. 19 | 20 | It works by copying a simple application to your workflow's data 21 | directory. It replaces the application's icon with your workflow's 22 | icon and then calls the application to post notifications. 23 | """ 24 | 25 | from __future__ import print_function, unicode_literals 26 | 27 | import os 28 | import plistlib 29 | import shutil 30 | import subprocess 31 | import sys 32 | import tarfile 33 | import tempfile 34 | import uuid 35 | 36 | import workflow 37 | 38 | 39 | _wf = None 40 | _log = None 41 | 42 | 43 | #: Available system sounds from System Preferences > Sound > Sound Effects 44 | SOUNDS = ( 45 | 'Basso', 46 | 'Blow', 47 | 'Bottle', 48 | 'Frog', 49 | 'Funk', 50 | 'Glass', 51 | 'Hero', 52 | 'Morse', 53 | 'Ping', 54 | 'Pop', 55 | 'Purr', 56 | 'Sosumi', 57 | 'Submarine', 58 | 'Tink', 59 | ) 60 | 61 | 62 | def wf(): 63 | """Return `Workflow` object for this module. 64 | 65 | Returns: 66 | workflow.Workflow: `Workflow` object for current workflow. 67 | """ 68 | global _wf 69 | if _wf is None: 70 | _wf = workflow.Workflow() 71 | return _wf 72 | 73 | 74 | def log(): 75 | """Return logger for this module. 76 | 77 | Returns: 78 | logging.Logger: Logger for this module. 79 | """ 80 | global _log 81 | if _log is None: 82 | _log = wf().logger 83 | return _log 84 | 85 | 86 | def notifier_program(): 87 | """Return path to notifier applet executable. 88 | 89 | Returns: 90 | unicode: Path to Notify.app `applet` executable. 91 | """ 92 | return wf().datafile('Notify.app/Contents/MacOS/applet') 93 | 94 | 95 | def notifier_icon_path(): 96 | """Return path to icon file in installed Notify.app. 97 | 98 | Returns: 99 | unicode: Path to `applet.icns` within the app bundle. 100 | """ 101 | return wf().datafile('Notify.app/Contents/Resources/applet.icns') 102 | 103 | 104 | def install_notifier(): 105 | """Extract `Notify.app` from the workflow to data directory. 106 | 107 | Changes the bundle ID of the installed app and gives it the 108 | workflow's icon. 109 | """ 110 | archive = os.path.join(os.path.dirname(__file__), 'Notify.tgz') 111 | destdir = wf().datadir 112 | app_path = os.path.join(destdir, 'Notify.app') 113 | n = notifier_program() 114 | log().debug("Installing Notify.app to %r ...", destdir) 115 | # z = zipfile.ZipFile(archive, 'r') 116 | # z.extractall(destdir) 117 | tgz = tarfile.open(archive, 'r:gz') 118 | tgz.extractall(destdir) 119 | assert os.path.exists(n), ( 120 | "Notify.app could not be installed in {0!r}.".format(destdir)) 121 | 122 | # Replace applet icon 123 | icon = notifier_icon_path() 124 | workflow_icon = wf().workflowfile('icon.png') 125 | if os.path.exists(icon): 126 | os.unlink(icon) 127 | 128 | png_to_icns(workflow_icon, icon) 129 | 130 | # Set file icon 131 | # PyObjC isn't available for 2.6, so this is 2.7 only. Actually, 132 | # none of this code will "work" on pre-10.8 systems. Let it run 133 | # until I figure out a better way of excluding this module 134 | # from coverage in py2.6. 
135 | if sys.version_info >= (2, 7): # pragma: no cover 136 | from AppKit import NSWorkspace, NSImage 137 | 138 | ws = NSWorkspace.sharedWorkspace() 139 | img = NSImage.alloc().init() 140 | img.initWithContentsOfFile_(icon) 141 | ws.setIcon_forFile_options_(img, app_path, 0) 142 | 143 | # Change bundle ID of installed app 144 | ip_path = os.path.join(app_path, 'Contents/Info.plist') 145 | bundle_id = '{0}.{1}'.format(wf().bundleid, uuid.uuid4().hex) 146 | data = plistlib.readPlist(ip_path) 147 | log().debug('Changing bundle ID to {0!r}'.format(bundle_id)) 148 | data['CFBundleIdentifier'] = bundle_id 149 | plistlib.writePlist(data, ip_path) 150 | 151 | 152 | def validate_sound(sound): 153 | """Coerce `sound` to valid sound name. 154 | 155 | Returns `None` for invalid sounds. Sound names can be found 156 | in `System Preferences > Sound > Sound Effects`. 157 | 158 | Args: 159 | sound (str): Name of system sound. 160 | 161 | Returns: 162 | str: Proper name of sound or `None`. 163 | """ 164 | if not sound: 165 | return None 166 | 167 | # Case-insensitive comparison of `sound` 168 | if sound.lower() in [s.lower() for s in SOUNDS]: 169 | # Title-case is correct for all system sounds as of OS X 10.11 170 | return sound.title() 171 | return None 172 | 173 | 174 | def notify(title='', text='', sound=None): 175 | """Post notification via Notify.app helper. 176 | 177 | Args: 178 | title (str, optional): Notification title. 179 | text (str, optional): Notification body text. 180 | sound (str, optional): Name of sound to play. 181 | 182 | Raises: 183 | ValueError: Raised if both `title` and `text` are empty. 184 | 185 | Returns: 186 | bool: `True` if notification was posted, else `False`. 187 | """ 188 | if title == text == '': 189 | raise ValueError('Empty notification') 190 | 191 | sound = validate_sound(sound) or '' 192 | 193 | n = notifier_program() 194 | 195 | if not os.path.exists(n): 196 | install_notifier() 197 | 198 | env = os.environ.copy() 199 | enc = 'utf-8' 200 | env['NOTIFY_TITLE'] = title.encode(enc) 201 | env['NOTIFY_MESSAGE'] = text.encode(enc) 202 | env['NOTIFY_SOUND'] = sound.encode(enc) 203 | cmd = [n] 204 | retcode = subprocess.call(cmd, env=env) 205 | if retcode == 0: 206 | return True 207 | 208 | log().error('Notify.app exited with status {0}.'.format(retcode)) 209 | return False 210 | 211 | 212 | def convert_image(inpath, outpath, size): 213 | """Convert an image file using `sips`. 214 | 215 | Args: 216 | inpath (str): Path of source file. 217 | outpath (str): Path to destination file. 218 | size (int): Width and height of destination image in pixels. 219 | 220 | Raises: 221 | RuntimeError: Raised if `sips` exits with non-zero status. 222 | """ 223 | cmd = [ 224 | b'sips', 225 | b'-z', b'{0}'.format(size), b'{0}'.format(size), 226 | inpath, 227 | b'--out', outpath] 228 | # log().debug(cmd) 229 | with open(os.devnull, 'w') as pipe: 230 | retcode = subprocess.call(cmd, stdout=pipe, stderr=subprocess.STDOUT) 231 | 232 | if retcode != 0: 233 | raise RuntimeError('sips exited with {0}'.format(retcode)) 234 | 235 | 236 | def png_to_icns(png_path, icns_path): 237 | """Convert PNG file to ICNS using `iconutil`. 238 | 239 | Create an iconset from the source PNG file. Generate PNG files 240 | in each size required by OS X, then call `iconutil` to turn 241 | them into a single ICNS file. 242 | 243 | Args: 244 | png_path (str): Path to source PNG file. 245 | icns_path (str): Path to destination ICNS file. 246 | 247 | Raises: 248 | RuntimeError: Raised if `iconutil` or `sips` fail. 
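        Example:
            A minimal usage sketch (not from the original module; the
            file paths here are hypothetical):

                png_to_icns('icon.png', '/tmp/icon.icns')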
249 | """ 250 | tempdir = tempfile.mkdtemp(prefix='aw-', dir=wf().datadir) 251 | 252 | try: 253 | iconset = os.path.join(tempdir, 'Icon.iconset') 254 | 255 | assert not os.path.exists(iconset), ( 256 | "Iconset path already exists : {0!r}".format(iconset)) 257 | os.makedirs(iconset) 258 | 259 | # Copy source icon to icon set and generate all the other 260 | # sizes needed 261 | configs = [] 262 | for i in (16, 32, 128, 256, 512): 263 | configs.append(('icon_{0}x{0}.png'.format(i), i)) 264 | configs.append((('icon_{0}x{0}@2x.png'.format(i), i*2))) 265 | 266 | shutil.copy(png_path, os.path.join(iconset, 'icon_256x256.png')) 267 | shutil.copy(png_path, os.path.join(iconset, 'icon_128x128@2x.png')) 268 | 269 | for name, size in configs: 270 | outpath = os.path.join(iconset, name) 271 | if os.path.exists(outpath): 272 | continue 273 | convert_image(png_path, outpath, size) 274 | 275 | cmd = [ 276 | b'iconutil', 277 | b'-c', b'icns', 278 | b'-o', icns_path, 279 | iconset] 280 | 281 | retcode = subprocess.call(cmd) 282 | if retcode != 0: 283 | raise RuntimeError("iconset exited with {0}".format(retcode)) 284 | 285 | assert os.path.exists(icns_path), ( 286 | "Generated ICNS file not found : {0!r}".format(icns_path)) 287 | finally: 288 | try: 289 | shutil.rmtree(tempdir) 290 | except OSError: # pragma: no cover 291 | pass 292 | 293 | 294 | # def notify_native(title='', text='', sound=''): 295 | # """Post notification via the native API (via pyobjc). 296 | 297 | # At least one of `title` or `text` must be specified. 298 | 299 | # This method will *always* show the Python launcher icon (i.e. the 300 | # rocket with the snakes on it). 301 | 302 | # Args: 303 | # title (str, optional): Notification title. 304 | # text (str, optional): Notification body text. 305 | # sound (str, optional): Name of sound to play. 306 | 307 | # """ 308 | 309 | # if title == text == '': 310 | # raise ValueError('Empty notification') 311 | 312 | # import Foundation 313 | 314 | # sound = sound or Foundation.NSUserNotificationDefaultSoundName 315 | 316 | # n = Foundation.NSUserNotification.alloc().init() 317 | # n.setTitle_(title) 318 | # n.setInformativeText_(text) 319 | # n.setSoundName_(sound) 320 | # nc = Foundation.NSUserNotificationCenter.defaultUserNotificationCenter() 321 | # nc.deliverNotification_(n) 322 | 323 | 324 | if __name__ == '__main__': # pragma: nocover 325 | # Simple command-line script to test module with 326 | # This won't work on 2.6, as `argparse` isn't available 327 | # by default. 
328 | import argparse 329 | 330 | from unicodedata import normalize 331 | 332 | def uni(s): 333 | """Coerce `s` to normalised Unicode.""" 334 | ustr = s.decode('utf-8') 335 | return normalize('NFD', ustr) 336 | 337 | p = argparse.ArgumentParser() 338 | p.add_argument('-p', '--png', help="PNG image to convert to ICNS.") 339 | p.add_argument('-l', '--list-sounds', help="Show available sounds.", 340 | action='store_true') 341 | p.add_argument('-t', '--title', 342 | help="Notification title.", type=uni, 343 | default='') 344 | p.add_argument('-s', '--sound', type=uni, 345 | help="Optional notification sound.", default='') 346 | p.add_argument('text', type=uni, 347 | help="Notification body text.", default='', nargs='?') 348 | o = p.parse_args() 349 | 350 | # List available sounds 351 | if o.list_sounds: 352 | for sound in SOUNDS: 353 | print(sound) 354 | sys.exit(0) 355 | 356 | # Convert PNG to ICNS 357 | if o.png: 358 | icns = os.path.join( 359 | os.path.dirname(o.png), 360 | b'{0}{1}'.format(os.path.splitext(os.path.basename(o.png))[0], 361 | '.icns')) 362 | 363 | print('Converting {0!r} to {1!r} ...'.format(o.png, icns), 364 | file=sys.stderr) 365 | 366 | assert not os.path.exists(icns), ( 367 | "Destination file already exists : {0}".format(icns)) 368 | 369 | png_to_icns(o.png, icns) 370 | sys.exit(0) 371 | 372 | # Post notification 373 | if o.title == o.text == '': 374 | print('ERROR: Empty notification.', file=sys.stderr) 375 | sys.exit(1) 376 | else: 377 | notify(o.title, o.text, o.sound) 378 | -------------------------------------------------------------------------------- /bs4/builder/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import itertools 3 | import sys 4 | from bs4.element import ( 5 | CharsetMetaAttributeValue, 6 | ContentMetaAttributeValue, 7 | whitespace_re 8 | ) 9 | 10 | __all__ = [ 11 | 'HTMLTreeBuilder', 12 | 'SAXTreeBuilder', 13 | 'TreeBuilder', 14 | 'TreeBuilderRegistry', 15 | ] 16 | 17 | # Some useful features for a TreeBuilder to have. 18 | FAST = 'fast' 19 | PERMISSIVE = 'permissive' 20 | STRICT = 'strict' 21 | XML = 'xml' 22 | HTML = 'html' 23 | HTML_5 = 'html5' 24 | 25 | 26 | class TreeBuilderRegistry(object): 27 | 28 | def __init__(self): 29 | self.builders_for_feature = defaultdict(list) 30 | self.builders = [] 31 | 32 | def register(self, treebuilder_class): 33 | """Register a treebuilder based on its advertised features.""" 34 | for feature in treebuilder_class.features: 35 | self.builders_for_feature[feature].insert(0, treebuilder_class) 36 | self.builders.insert(0, treebuilder_class) 37 | 38 | def lookup(self, *features): 39 | if len(self.builders) == 0: 40 | # There are no builders at all. 41 | return None 42 | 43 | if len(features) == 0: 44 | # They didn't ask for any features. Give them the most 45 | # recently registered builder. 46 | return self.builders[0] 47 | 48 | # Go down the list of features in order, and eliminate any builders 49 | # that don't match every feature. 50 | features = list(features) 51 | features.reverse() 52 | candidates = None 53 | candidate_set = None 54 | while len(features) > 0: 55 | feature = features.pop() 56 | we_have_the_feature = self.builders_for_feature.get(feature, []) 57 | if len(we_have_the_feature) > 0: 58 | if candidates is None: 59 | candidates = we_have_the_feature 60 | candidate_set = set(candidates) 61 | else: 62 | # Eliminate any candidates that don't have this feature. 
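                # (For example, builder_registry.lookup('html', 'fast')
                # keeps only builders advertising both features,
                # preferring the most recently registered one.)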
63 | candidate_set = candidate_set.intersection( 64 | set(we_have_the_feature)) 65 | 66 | # The only valid candidates are the ones in candidate_set. 67 | # Go through the original list of candidates and pick the first one 68 | # that's in candidate_set. 69 | if candidate_set is None: 70 | return None 71 | for candidate in candidates: 72 | if candidate in candidate_set: 73 | return candidate 74 | return None 75 | 76 | # The BeautifulSoup class will take feature lists from developers and use them 77 | # to look up builders in this registry. 78 | builder_registry = TreeBuilderRegistry() 79 | 80 | class TreeBuilder(object): 81 | """Turn a document into a Beautiful Soup object tree.""" 82 | 83 | NAME = "[Unknown tree builder]" 84 | ALTERNATE_NAMES = [] 85 | features = [] 86 | 87 | is_xml = False 88 | picklable = False 89 | preserve_whitespace_tags = set() 90 | empty_element_tags = None # A tag will be considered an empty-element 91 | # tag when and only when it has no contents. 92 | 93 | # A value for these tag/attribute combinations is a space- or 94 | # comma-separated list of CDATA, rather than a single CDATA. 95 | cdata_list_attributes = {} 96 | 97 | 98 | def __init__(self): 99 | self.soup = None 100 | 101 | def reset(self): 102 | pass 103 | 104 | def can_be_empty_element(self, tag_name): 105 | """Might a tag with this name be an empty-element tag? 106 | 107 | The final markup may or may not actually present this tag as 108 | self-closing. 109 | 110 | For instance: an HTMLBuilder does not consider a
<p> tag to be 111 | an empty-element tag (it's not in 112 | HTMLBuilder.empty_element_tags). This means an empty <p> tag 113 | will be presented as "<p></p>", not "<p/>".
". 114 | 115 | The default implementation has no opinion about which tags are 116 | empty-element tags, so a tag will be presented as an 117 | empty-element tag if and only if it has no contents. 118 | "" will become "", and "bar" will 119 | be left alone. 120 | """ 121 | if self.empty_element_tags is None: 122 | return True 123 | return tag_name in self.empty_element_tags 124 | 125 | def feed(self, markup): 126 | raise NotImplementedError() 127 | 128 | def prepare_markup(self, markup, user_specified_encoding=None, 129 | document_declared_encoding=None): 130 | return markup, None, None, False 131 | 132 | def test_fragment_to_document(self, fragment): 133 | """Wrap an HTML fragment to make it look like a document. 134 | 135 | Different parsers do this differently. For instance, lxml 136 | introduces an empty tag, and html5lib 137 | doesn't. Abstracting this away lets us write simple tests 138 | which run HTML fragments through the parser and compare the 139 | results against other HTML fragments. 140 | 141 | This method should not be used outside of tests. 142 | """ 143 | return fragment 144 | 145 | def set_up_substitutions(self, tag): 146 | return False 147 | 148 | def _replace_cdata_list_attribute_values(self, tag_name, attrs): 149 | """Replaces class="foo bar" with class=["foo", "bar"] 150 | 151 | Modifies its input in place. 152 | """ 153 | if not attrs: 154 | return attrs 155 | if self.cdata_list_attributes: 156 | universal = self.cdata_list_attributes.get('*', []) 157 | tag_specific = self.cdata_list_attributes.get( 158 | tag_name.lower(), None) 159 | for attr in attrs.keys(): 160 | if attr in universal or (tag_specific and attr in tag_specific): 161 | # We have a "class"-type attribute whose string 162 | # value is a whitespace-separated list of 163 | # values. Split it into a list. 164 | value = attrs[attr] 165 | if isinstance(value, basestring): 166 | values = whitespace_re.split(value) 167 | else: 168 | # html5lib sometimes calls setAttributes twice 169 | # for the same tag when rearranging the parse 170 | # tree. On the second call the attribute value 171 | # here is already a list. If this happens, 172 | # leave the value alone rather than trying to 173 | # split it again. 174 | values = value 175 | attrs[attr] = values 176 | return attrs 177 | 178 | class SAXTreeBuilder(TreeBuilder): 179 | """A Beautiful Soup treebuilder that listens for SAX events.""" 180 | 181 | def feed(self, markup): 182 | raise NotImplementedError() 183 | 184 | def close(self): 185 | pass 186 | 187 | def startElement(self, name, attrs): 188 | attrs = dict((key[1], value) for key, value in list(attrs.items())) 189 | #print "Start %s, %r" % (name, attrs) 190 | self.soup.handle_starttag(name, attrs) 191 | 192 | def endElement(self, name): 193 | #print "End %s" % name 194 | self.soup.handle_endtag(name) 195 | 196 | def startElementNS(self, nsTuple, nodeName, attrs): 197 | # Throw away (ns, nodeName) for now. 198 | self.startElement(nodeName, attrs) 199 | 200 | def endElementNS(self, nsTuple, nodeName): 201 | # Throw away (ns, nodeName) for now. 202 | self.endElement(nodeName) 203 | #handler.endElementNS((ns, node.nodeName), node.nodeName) 204 | 205 | def startPrefixMapping(self, prefix, nodeValue): 206 | # Ignore the prefix for now. 207 | pass 208 | 209 | def endPrefixMapping(self, prefix): 210 | # Ignore the prefix for now. 
211 | # handler.endPrefixMapping(prefix) 212 | pass 213 | 214 | def characters(self, content): 215 | self.soup.handle_data(content) 216 | 217 | def startDocument(self): 218 | pass 219 | 220 | def endDocument(self): 221 | pass 222 | 223 | 224 | class HTMLTreeBuilder(TreeBuilder): 225 | """This TreeBuilder knows facts about HTML. 226 | 227 | Such as which tags are empty-element tags. 228 | """ 229 | 230 | preserve_whitespace_tags = set(['pre', 'textarea']) 231 | empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 232 | 'spacer', 'link', 'frame', 'base']) 233 | 234 | # The HTML standard defines these attributes as containing a 235 | # space-separated list of values, not a single value. That is, 236 | # class="foo bar" means that the 'class' attribute has two values, 237 | # 'foo' and 'bar', not the single value 'foo bar'. When we 238 | # encounter one of these attributes, we will parse its value into 239 | # a list of values if possible. Upon output, the list will be 240 | # converted back into a string. 241 | cdata_list_attributes = { 242 | "*" : ['class', 'accesskey', 'dropzone'], 243 | "a" : ['rel', 'rev'], 244 | "link" : ['rel', 'rev'], 245 | "td" : ["headers"], 246 | "th" : ["headers"], 247 | "td" : ["headers"], 248 | "form" : ["accept-charset"], 249 | "object" : ["archive"], 250 | 251 | # These are HTML5 specific, as are *.accesskey and *.dropzone above. 252 | "area" : ["rel"], 253 | "icon" : ["sizes"], 254 | "iframe" : ["sandbox"], 255 | "output" : ["for"], 256 | } 257 | 258 | def set_up_substitutions(self, tag): 259 | # We are only interested in <meta> tags 260 | if tag.name != 'meta': 261 | return False 262 | 263 | http_equiv = tag.get('http-equiv') 264 | content = tag.get('content') 265 | charset = tag.get('charset') 266 | 267 | # We are interested in <meta> tags that say what encoding the 268 | # document was originally in. This means HTML 5-style <meta> 269 | # tags that provide the "charset" attribute. It also means 270 | # HTML 4-style <meta> tags that provide the "content" 271 | # attribute and have "http-equiv" set to "content-type". 272 | # 273 | # In both cases we will replace the value of the appropriate 274 | # attribute with a standin object that can take on any 275 | # encoding. 276 | meta_encoding = None 277 | if charset is not None: 278 | # HTML 5 style: 279 | # <meta charset="utf8"> 280 | meta_encoding = charset 281 | tag['charset'] = CharsetMetaAttributeValue(charset) 282 | 283 | elif (content is not None and http_equiv is not None 284 | and http_equiv.lower() == 'content-type'): 285 | # HTML 4 style: 286 | # <meta http-equiv="content-type" content="text/html; charset=utf8"> 287 | tag['content'] = ContentMetaAttributeValue(content) 288 | 289 | return (meta_encoding is not None) 290 | 291 | def register_treebuilders_from(module): 292 | """Copy TreeBuilders from the given module into this module.""" 293 | # I'm fairly sure this is not the best way to do this. 294 | this_module = sys.modules['bs4.builder'] 295 | for name in module.__all__: 296 | obj = getattr(module, name) 297 | 298 | if issubclass(obj, TreeBuilder): 299 | setattr(this_module, name, obj) 300 | this_module.__all__.append(name) 301 | # Register the builder while we're at it. 302 | this_module.builder_registry.register(obj) 303 | 304 | class ParserRejectedMarkup(Exception): 305 | pass 306 | 307 | # Builders are registered in reverse order of priority, so that custom 308 | # builder registrations will take precedence. In general, we want lxml 309 | # to take precedence over html5lib, because it's faster. And we only 310 | # want to use HTMLParser as a last resort. 311 | from . 
import _htmlparser 312 | register_treebuilders_from(_htmlparser) 313 | try: 314 | from . import _html5lib 315 | register_treebuilders_from(_html5lib) 316 | except ImportError: 317 | # They don't have html5lib installed. 318 | pass 319 | try: 320 | from . import _lxml 321 | register_treebuilders_from(_lxml) 322 | except ImportError: 323 | # They don't have lxml installed. 324 | pass 325 | -------------------------------------------------------------------------------- /workflow/update.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | # 4 | # Copyright (c) 2014 Fabio Niephaus , 5 | # Dean Jackson 6 | # 7 | # MIT Licence. See http://opensource.org/licenses/MIT 8 | # 9 | # Created on 2014-08-16 10 | # 11 | 12 | """ 13 | Self-updating from GitHub 14 | 15 | .. versionadded:: 1.9 16 | 17 | .. note:: 18 | 19 | This module is not intended to be used directly. Automatic updates 20 | are controlled by the ``update_settings`` :class:`dict` passed to 21 | :class:`~workflow.workflow.Workflow` objects. 22 | 23 | """ 24 | 25 | from __future__ import print_function, unicode_literals 26 | 27 | import os 28 | import tempfile 29 | import re 30 | import subprocess 31 | 32 | import workflow 33 | import web 34 | 35 | # __all__ = [] 36 | 37 | 38 | RELEASES_BASE = 'https://api.github.com/repos/{0}/releases' 39 | 40 | 41 | _wf = None 42 | 43 | 44 | def wf(): 45 | global _wf 46 | if _wf is None: 47 | _wf = workflow.Workflow() 48 | return _wf 49 | 50 | 51 | class Version(object): 52 | """Mostly semantic versioning 53 | 54 | The main difference to proper :ref:`semantic versioning ` 55 | is that this implementation doesn't require a minor or patch version. 56 | """ 57 | 58 | #: Match version and pre-release/build information in version strings 59 | match_version = re.compile(r'([0-9\.]+)(.+)?').match 60 | 61 | def __init__(self, vstr): 62 | self.vstr = vstr 63 | self.major = 0 64 | self.minor = 0 65 | self.patch = 0 66 | self.suffix = '' 67 | self.build = '' 68 | self._parse(vstr) 69 | 70 | def _parse(self, vstr): 71 | if vstr.startswith('v'): 72 | m = self.match_version(vstr[1:]) 73 | else: 74 | m = self.match_version(vstr) 75 | if not m: 76 | raise ValueError('Invalid version number: {0}'.format(vstr)) 77 | 78 | version, suffix = m.groups() 79 | parts = self._parse_dotted_string(version) 80 | self.major = parts.pop(0) 81 | if len(parts): 82 | self.minor = parts.pop(0) 83 | if len(parts): 84 | self.patch = parts.pop(0) 85 | if not len(parts) == 0: 86 | raise ValueError('Invalid version (too long) : {0}'.format(vstr)) 87 | 88 | if suffix: 89 | # Build info 90 | idx = suffix.find('+') 91 | if idx > -1: 92 | self.build = suffix[idx+1:] 93 | suffix = suffix[:idx] 94 | if suffix: 95 | if not suffix.startswith('-'): 96 | raise ValueError( 97 | 'Invalid suffix : `{0}`. 
Must start with `-`'.format( 98 | suffix)) 99 | self.suffix = suffix[1:] 100 | 101 | # wf().logger.debug('version str `{}` -> {}'.format(vstr, repr(self))) 102 | 103 | def _parse_dotted_string(self, s): 104 | """Parse string ``s`` into list of ints and strings""" 105 | parsed = [] 106 | parts = s.split('.') 107 | for p in parts: 108 | if p.isdigit(): 109 | p = int(p) 110 | parsed.append(p) 111 | return parsed 112 | 113 | @property 114 | def tuple(self): 115 | """Version number as a tuple of major, minor, patch, pre-release""" 116 | 117 | return (self.major, self.minor, self.patch, self.suffix) 118 | 119 | def __lt__(self, other): 120 | if not isinstance(other, Version): 121 | raise ValueError('Not a Version instance: {0!r}'.format(other)) 122 | t = self.tuple[:3] 123 | o = other.tuple[:3] 124 | if t < o: 125 | return True 126 | if t == o: # We need to compare suffixes 127 | if self.suffix and not other.suffix: 128 | return True 129 | if other.suffix and not self.suffix: 130 | return False 131 | return (self._parse_dotted_string(self.suffix) < 132 | self._parse_dotted_string(other.suffix)) 133 | # t > o 134 | return False 135 | 136 | def __eq__(self, other): 137 | if not isinstance(other, Version): 138 | raise ValueError('Not a Version instance: {0!r}'.format(other)) 139 | return self.tuple == other.tuple 140 | 141 | def __ne__(self, other): 142 | return not self.__eq__(other) 143 | 144 | def __gt__(self, other): 145 | if not isinstance(other, Version): 146 | raise ValueError('Not a Version instance: {0!r}'.format(other)) 147 | return other.__lt__(self) 148 | 149 | def __le__(self, other): 150 | if not isinstance(other, Version): 151 | raise ValueError('Not a Version instance: {0!r}'.format(other)) 152 | return not other.__lt__(self) 153 | 154 | def __ge__(self, other): 155 | return not self.__lt__(other) 156 | 157 | def __str__(self): 158 | vstr = '{0}.{1}.{2}'.format(self.major, self.minor, self.patch) 159 | if self.suffix: 160 | vstr += '-{0}'.format(self.suffix) 161 | if self.build: 162 | vstr += '+{0}'.format(self.build) 163 | return vstr 164 | 165 | def __repr__(self): 166 | return "Version('{0}')".format(str(self)) 167 | 168 | 169 | def download_workflow(url): 170 | """Download workflow at ``url`` to a local temporary file 171 | 172 | :param url: URL to .alfredworkflow file in GitHub repo 173 | :returns: path to downloaded file 174 | 175 | """ 176 | 177 | filename = url.split("/")[-1] 178 | 179 | if (not url.endswith('.alfredworkflow') or 180 | not filename.endswith('.alfredworkflow')): 181 | raise ValueError('Attachment `{0}` not a workflow'.format(filename)) 182 | 183 | local_path = os.path.join(tempfile.gettempdir(), filename) 184 | 185 | wf().logger.debug( 186 | 'Downloading updated workflow from `{0}` to `{1}` ...'.format( 187 | url, local_path)) 188 | 189 | response = web.get(url) 190 | 191 | with open(local_path, 'wb') as output: 192 | output.write(response.content) 193 | 194 | return local_path 195 | 196 | 197 | def build_api_url(slug): 198 | """Generate releases URL from GitHub slug 199 | 200 | :param slug: Repo name in form ``username/repo`` 201 | :returns: URL to the API endpoint for the repo's releases 202 | 203 | """ 204 | 205 | if len(slug.split('/')) != 2: 206 | raise ValueError('Invalid GitHub slug : {0}'.format(slug)) 207 | 208 | return RELEASES_BASE.format(slug) 209 | 210 | 211 | def get_valid_releases(github_slug, prereleases=False): 212 | """Return list of all valid releases 213 | 214 | :param github_slug: ``username/repo`` for workflow's GitHub repo 215 | :param 
prereleases: Whether to include pre-releases. 216 | :returns: list of dicts. Each :class:`dict` has the form 217 | ``{'version': '1.1', 'download_url': 'http://github.com/...', 218 | 'prerelease': False }`` 219 | 220 | 221 | A valid release is one that contains one ``.alfredworkflow`` file. 222 | 223 | If the GitHub version (i.e. tag) is of the form ``v1.1``, the leading 224 | ``v`` will be stripped. 225 | 226 | """ 227 | 228 | api_url = build_api_url(github_slug) 229 | releases = [] 230 | 231 | wf().logger.debug('Retrieving releases list from `{0}` ...'.format( 232 | api_url)) 233 | 234 | def retrieve_releases(): 235 | wf().logger.info( 236 | 'Retrieving releases for `{0}` ...'.format(github_slug)) 237 | return web.get(api_url).json() 238 | 239 | slug = github_slug.replace('/', '-') 240 | for release in wf().cached_data('gh-releases-{0}'.format(slug), 241 | retrieve_releases): 242 | version = release['tag_name'] 243 | download_urls = [] 244 | for asset in release.get('assets', []): 245 | url = asset.get('browser_download_url') 246 | if not url or not url.endswith('.alfredworkflow'): 247 | continue 248 | download_urls.append(url) 249 | 250 | # Validate release 251 | if release['prerelease'] and not prereleases: 252 | wf().logger.warning( 253 | 'Invalid release {0} : pre-release detected'.format(version)) 254 | continue 255 | if not download_urls: 256 | wf().logger.warning( 257 | 'Invalid release {0} : No workflow file'.format(version)) 258 | continue 259 | if len(download_urls) > 1: 260 | wf().logger.warning( 261 | 'Invalid release {0} : multiple workflow files'.format(version)) 262 | continue 263 | 264 | wf().logger.debug('Release `{0}` : {1}'.format(version, url)) 265 | releases.append({ 266 | 'version': version, 267 | 'download_url': download_urls[0], 268 | 'prerelease': release['prerelease'] 269 | }) 270 | 271 | return releases 272 | 273 | 274 | def check_update(github_slug, current_version, prereleases=False): 275 | """Check whether a newer release is available on GitHub 276 | 277 | :param github_slug: ``username/repo`` for workflow's GitHub repo 278 | :param current_version: the currently installed version of the 279 | workflow. :ref:`Semantic versioning ` is required. 280 | :param prereleases: Whether to include pre-releases. 281 | :type current_version: ``unicode`` 282 | :returns: ``True`` if an update is available, else ``False`` 283 | 284 | If an update is available, its version number and download URL will 285 | be cached. 
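    Example (a sketch, not part of the original docstring; the slug and
    version are hypothetical):

        if check_update('someuser/somerepo', '1.0.0'):
            install_update('someuser/somerepo', '1.0.0')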
286 | 287 | """ 288 | 289 | releases = get_valid_releases(github_slug, prereleases) 290 | 291 | wf().logger.info('{0} releases for {1}'.format(len(releases), 292 | github_slug)) 293 | 294 | if not len(releases): 295 | raise ValueError('No valid releases for {0}'.format(github_slug)) 296 | 297 | # GitHub returns releases newest-first 298 | latest_release = releases[0] 299 | 300 | # (latest_version, download_url) = get_latest_release(releases) 301 | vr = Version(latest_release['version']) 302 | vl = Version(current_version) 303 | wf().logger.debug('Latest : {0!r} Installed : {1!r}'.format(vr, vl)) 304 | if vr > vl: 305 | 306 | wf().cache_data('__workflow_update_status', { 307 | 'version': latest_release['version'], 308 | 'download_url': latest_release['download_url'], 309 | 'available': True 310 | }) 311 | 312 | return True 313 | 314 | wf().cache_data('__workflow_update_status', { 315 | 'available': False 316 | }) 317 | return False 318 | 319 | 320 | def install_update(github_slug, current_version): 321 | """If a newer release is available, download and install it 322 | 323 | :param github_slug: ``username/repo`` for workflow's GitHub repo 324 | :param current_version: the currently installed version of the 325 | workflow. :ref:`Semantic versioning ` is required. 326 | :type current_version: ``unicode`` 327 | 328 | If an update is available, it will be downloaded and installed. 329 | 330 | :returns: ``True`` if an update is installed, else ``False`` 331 | 332 | """ 333 | # TODO: `github_slug` and `current_version` are both unusued. 334 | 335 | update_data = wf().cached_data('__workflow_update_status', max_age=0) 336 | 337 | if not update_data or not update_data.get('available'): 338 | wf().logger.info('No update available') 339 | return False 340 | 341 | local_file = download_workflow(update_data['download_url']) 342 | 343 | wf().logger.info('Installing updated workflow ...') 344 | subprocess.call(['open', local_file]) 345 | 346 | update_data['available'] = False 347 | wf().cache_data('__workflow_update_status', update_data) 348 | return True 349 | 350 | 351 | if __name__ == '__main__': # pragma: nocover 352 | import sys 353 | 354 | def show_help(): 355 | print('Usage : update.py (check|install) github_slug version [--prereleases]') 356 | sys.exit(1) 357 | 358 | argv = sys.argv[:] 359 | prereleases = '--prereleases' in argv 360 | 361 | if prereleases: 362 | argv.remove('--prereleases') 363 | 364 | if len(argv) != 4: 365 | show_help() 366 | 367 | action, github_slug, version = argv[1:] 368 | 369 | if action not in ('check', 'install'): 370 | show_help() 371 | 372 | if action == 'check': 373 | check_update(github_slug, version, prereleases) 374 | elif action == 'install': 375 | install_update(github_slug, version) 376 | -------------------------------------------------------------------------------- /bs4/builder/_html5lib.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'HTML5TreeBuilder', 3 | ] 4 | 5 | from pdb import set_trace 6 | import warnings 7 | from bs4.builder import ( 8 | PERMISSIVE, 9 | HTML, 10 | HTML_5, 11 | HTMLTreeBuilder, 12 | ) 13 | from bs4.element import ( 14 | NamespacedAttribute, 15 | whitespace_re, 16 | ) 17 | import html5lib 18 | from html5lib.constants import namespaces 19 | from bs4.element import ( 20 | Comment, 21 | Doctype, 22 | NavigableString, 23 | Tag, 24 | ) 25 | 26 | class HTML5TreeBuilder(HTMLTreeBuilder): 27 | """Use html5lib to build a tree.""" 28 | 29 | NAME = "html5lib" 30 | 31 | features = [NAME, 
PERMISSIVE, HTML_5, HTML] 32 | 33 | def prepare_markup(self, markup, user_specified_encoding, 34 | document_declared_encoding=None, exclude_encodings=None): 35 | # Store the user-specified encoding for use later on. 36 | self.user_specified_encoding = user_specified_encoding 37 | 38 | # document_declared_encoding and exclude_encodings aren't used 39 | # ATM because the html5lib TreeBuilder doesn't use 40 | # UnicodeDammit. 41 | if exclude_encodings: 42 | warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") 43 | yield (markup, None, None, False) 44 | 45 | # These methods are defined by Beautiful Soup. 46 | def feed(self, markup): 47 | if self.soup.parse_only is not None: 48 | warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") 49 | parser = html5lib.HTMLParser(tree=self.create_treebuilder) 50 | doc = parser.parse(markup, encoding=self.user_specified_encoding) 51 | 52 | # Set the character encoding detected by the tokenizer. 53 | if isinstance(markup, unicode): 54 | # We need to special-case this because html5lib sets 55 | # charEncoding to UTF-8 if it gets Unicode input. 56 | doc.original_encoding = None 57 | else: 58 | doc.original_encoding = parser.tokenizer.stream.charEncoding[0] 59 | 60 | def create_treebuilder(self, namespaceHTMLElements): 61 | self.underlying_builder = TreeBuilderForHtml5lib( 62 | self.soup, namespaceHTMLElements) 63 | return self.underlying_builder 64 | 65 | def test_fragment_to_document(self, fragment): 66 | """See `TreeBuilder`.""" 67 | return u'%s' % fragment 68 | 69 | 70 | class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): 71 | 72 | def __init__(self, soup, namespaceHTMLElements): 73 | self.soup = soup 74 | super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) 75 | 76 | def documentClass(self): 77 | self.soup.reset() 78 | return Element(self.soup, self.soup, None) 79 | 80 | def insertDoctype(self, token): 81 | name = token["name"] 82 | publicId = token["publicId"] 83 | systemId = token["systemId"] 84 | 85 | doctype = Doctype.for_name_and_ids(name, publicId, systemId) 86 | self.soup.object_was_parsed(doctype) 87 | 88 | def elementClass(self, name, namespace): 89 | tag = self.soup.new_tag(name, namespace) 90 | return Element(tag, self.soup, namespace) 91 | 92 | def commentClass(self, data): 93 | return TextNode(Comment(data), self.soup) 94 | 95 | def fragmentClass(self): 96 | self.soup = BeautifulSoup("") 97 | self.soup.name = "[document_fragment]" 98 | return Element(self.soup, self.soup, None) 99 | 100 | def appendChild(self, node): 101 | # XXX This code is not covered by the BS4 tests. 102 | self.soup.append(node.element) 103 | 104 | def getDocument(self): 105 | return self.soup 106 | 107 | def getFragment(self): 108 | return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element 109 | 110 | class AttrList(object): 111 | def __init__(self, element): 112 | self.element = element 113 | self.attrs = dict(self.element.attrs) 114 | def __iter__(self): 115 | return list(self.attrs.items()).__iter__() 116 | def __setitem__(self, name, value): 117 | # If this attribute is a multi-valued attribute for this element, 118 | # turn its value into a list. 
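        # For example, class="foo bar" arrives as the string "foo bar"
        # and is stored as ["foo", "bar"], per
        # HTMLTreeBuilder.cdata_list_attributes.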
119 | list_attr = HTML5TreeBuilder.cdata_list_attributes 120 | if (name in list_attr['*'] 121 | or (self.element.name in list_attr 122 | and name in list_attr[self.element.name])): 123 | # A node that is being cloned may have already undergone 124 | # this procedure. 125 | if not isinstance(value, list): 126 | value = whitespace_re.split(value) 127 | self.element[name] = value 128 | def items(self): 129 | return list(self.attrs.items()) 130 | def keys(self): 131 | return list(self.attrs.keys()) 132 | def __len__(self): 133 | return len(self.attrs) 134 | def __getitem__(self, name): 135 | return self.attrs[name] 136 | def __contains__(self, name): 137 | return name in list(self.attrs.keys()) 138 | 139 | 140 | class Element(html5lib.treebuilders._base.Node): 141 | def __init__(self, element, soup, namespace): 142 | html5lib.treebuilders._base.Node.__init__(self, element.name) 143 | self.element = element 144 | self.soup = soup 145 | self.namespace = namespace 146 | 147 | def appendChild(self, node): 148 | string_child = child = None 149 | if isinstance(node, basestring): 150 | # Some other piece of code decided to pass in a string 151 | # instead of creating a TextElement object to contain the 152 | # string. 153 | string_child = child = node 154 | elif isinstance(node, Tag): 155 | # Some other piece of code decided to pass in a Tag 156 | # instead of creating an Element object to contain the 157 | # Tag. 158 | child = node 159 | elif node.element.__class__ == NavigableString: 160 | string_child = child = node.element 161 | else: 162 | child = node.element 163 | 164 | if not isinstance(child, basestring) and child.parent is not None: 165 | node.element.extract() 166 | 167 | if (string_child and self.element.contents 168 | and self.element.contents[-1].__class__ == NavigableString): 169 | # We are appending a string onto another string. 170 | # TODO This has O(n^2) performance, for input like 171 | # "aaa..." 172 | old_element = self.element.contents[-1] 173 | new_element = self.soup.new_string(old_element + string_child) 174 | old_element.replace_with(new_element) 175 | self.soup._most_recent_element = new_element 176 | else: 177 | if isinstance(node, basestring): 178 | # Create a brand new NavigableString from this string. 179 | child = self.soup.new_string(node) 180 | 181 | # Tell Beautiful Soup to act as if it parsed this element 182 | # immediately after the parent's last descendant. (Or 183 | # immediately after the parent, if it has no children.) 184 | if self.element.contents: 185 | most_recent_element = self.element._last_descendant(False) 186 | elif self.element.next_element is not None: 187 | # Something from further ahead in the parse tree is 188 | # being inserted into this earlier element. This is 189 | # very annoying because it means an expensive search 190 | # for the last element in the tree. 
191 | most_recent_element = self.soup._last_descendant() 192 | else: 193 | most_recent_element = self.element 194 | 195 | self.soup.object_was_parsed( 196 | child, parent=self.element, 197 | most_recent_element=most_recent_element) 198 | 199 | def getAttributes(self): 200 | return AttrList(self.element) 201 | 202 | def setAttributes(self, attributes): 203 | 204 | if attributes is not None and len(attributes) > 0: 205 | 206 | converted_attributes = [] 207 | for name, value in list(attributes.items()): 208 | if isinstance(name, tuple): 209 | new_name = NamespacedAttribute(*name) 210 | del attributes[name] 211 | attributes[new_name] = value 212 | 213 | self.soup.builder._replace_cdata_list_attribute_values( 214 | self.name, attributes) 215 | for name, value in attributes.items(): 216 | self.element[name] = value 217 | 218 | # The attributes may contain variables that need substitution. 219 | # Call set_up_substitutions manually. 220 | # 221 | # The Tag constructor called this method when the Tag was created, 222 | # but we just set/changed the attributes, so call it again. 223 | self.soup.builder.set_up_substitutions(self.element) 224 | attributes = property(getAttributes, setAttributes) 225 | 226 | def insertText(self, data, insertBefore=None): 227 | if insertBefore: 228 | text = TextNode(self.soup.new_string(data), self.soup) 229 | self.insertBefore(data, insertBefore) 230 | else: 231 | self.appendChild(data) 232 | 233 | def insertBefore(self, node, refNode): 234 | index = self.element.index(refNode.element) 235 | if (node.element.__class__ == NavigableString and self.element.contents 236 | and self.element.contents[index-1].__class__ == NavigableString): 237 | # (See comments in appendChild) 238 | old_node = self.element.contents[index-1] 239 | new_str = self.soup.new_string(old_node + node.element) 240 | old_node.replace_with(new_str) 241 | else: 242 | self.element.insert(index, node.element) 243 | node.parent = self 244 | 245 | def removeChild(self, node): 246 | node.element.extract() 247 | 248 | def reparentChildren(self, new_parent): 249 | """Move all of this tag's children into another tag.""" 250 | # print "MOVE", self.element.contents 251 | # print "FROM", self.element 252 | # print "TO", new_parent.element 253 | element = self.element 254 | new_parent_element = new_parent.element 255 | # Determine what this tag's next_element will be once all the children 256 | # are removed. 257 | final_next_element = element.next_sibling 258 | 259 | new_parents_last_descendant = new_parent_element._last_descendant(False, False) 260 | if len(new_parent_element.contents) > 0: 261 | # The new parent already contains children. We will be 262 | # appending this tag's children to the end. 263 | new_parents_last_child = new_parent_element.contents[-1] 264 | new_parents_last_descendant_next_element = new_parents_last_descendant.next_element 265 | else: 266 | # The new parent contains no children. 
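            # (In that case the moved children are linked directly
            # after the new parent element itself.)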
267 | new_parents_last_child = None 268 | new_parents_last_descendant_next_element = new_parent_element.next_element 269 | 270 | to_append = element.contents 271 | append_after = new_parent_element.contents 272 | if len(to_append) > 0: 273 | # Set the first child's previous_element and previous_sibling 274 | # to elements within the new parent 275 | first_child = to_append[0] 276 | if new_parents_last_descendant: 277 | first_child.previous_element = new_parents_last_descendant 278 | else: 279 | first_child.previous_element = new_parent_element 280 | first_child.previous_sibling = new_parents_last_child 281 | if new_parents_last_descendant: 282 | new_parents_last_descendant.next_element = first_child 283 | else: 284 | new_parent_element.next_element = first_child 285 | if new_parents_last_child: 286 | new_parents_last_child.next_sibling = first_child 287 | 288 | # Fix the last child's next_element and next_sibling 289 | last_child = to_append[-1] 290 | last_child.next_element = new_parents_last_descendant_next_element 291 | if new_parents_last_descendant_next_element: 292 | new_parents_last_descendant_next_element.previous_element = last_child 293 | last_child.next_sibling = None 294 | 295 | for child in to_append: 296 | child.parent = new_parent_element 297 | new_parent_element.contents.append(child) 298 | 299 | # Now that this element has no children, change its .next_element. 300 | element.contents = [] 301 | element.next_element = final_next_element 302 | 303 | # print "DONE WITH MOVE" 304 | # print "FROM", self.element 305 | # print "TO", new_parent_element 306 | 307 | def cloneNode(self): 308 | tag = self.soup.new_tag(self.element.name, self.namespace) 309 | node = Element(tag, self.soup, self.namespace) 310 | for key,value in self.attributes: 311 | node.attributes[key] = value 312 | return node 313 | 314 | def hasContent(self): 315 | return self.element.contents 316 | 317 | def getNameTuple(self): 318 | if self.namespace == None: 319 | return namespaces["html"], self.name 320 | else: 321 | return self.namespace, self.name 322 | 323 | nameTuple = property(getNameTuple) 324 | 325 | class TextNode(Element): 326 | def __init__(self, element, soup): 327 | html5lib.treebuilders._base.Node.__init__(self, None) 328 | self.element = element 329 | self.soup = soup 330 | 331 | def cloneNode(self): 332 | raise NotImplementedError 333 | -------------------------------------------------------------------------------- /bs4/__init__.py: -------------------------------------------------------------------------------- 1 | """Beautiful Soup 2 | Elixir and Tonic 3 | "The Screen-Scraper's Friend" 4 | http://www.crummy.com/software/BeautifulSoup/ 5 | 6 | Beautiful Soup uses a pluggable XML or HTML parser to parse a 7 | (possibly invalid) document into a tree representation. Beautiful Soup 8 | provides provides methods and Pythonic idioms that make it easy to 9 | navigate, search, and modify the parse tree. 10 | 11 | Beautiful Soup works with Python 2.6 and up. It works better if lxml 12 | and/or html5lib is installed. 
13 | 14 | For more than you ever wanted to know about Beautiful Soup, see the 15 | documentation: 16 | http://www.crummy.com/software/BeautifulSoup/bs4/doc/ 17 | """ 18 | 19 | __author__ = "Leonard Richardson (leonardr@segfault.org)" 20 | __version__ = "4.4.1" 21 | __copyright__ = "Copyright (c) 2004-2015 Leonard Richardson" 22 | __license__ = "MIT" 23 | 24 | __all__ = ['BeautifulSoup'] 25 | 26 | import os 27 | import re 28 | import warnings 29 | 30 | from .builder import builder_registry, ParserRejectedMarkup 31 | from .dammit import UnicodeDammit 32 | from .element import ( 33 | CData, 34 | Comment, 35 | DEFAULT_OUTPUT_ENCODING, 36 | Declaration, 37 | Doctype, 38 | NavigableString, 39 | PageElement, 40 | ProcessingInstruction, 41 | ResultSet, 42 | SoupStrainer, 43 | Tag, 44 | ) 45 | 46 | # The very first thing we do is give a useful error if someone is 47 | # running this code under Python 3 without converting it. 48 | 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' 49 | 50 | class BeautifulSoup(Tag): 51 | """ 52 | This class defines the basic interface called by the tree builders. 53 | 54 | These methods will be called by the parser: 55 | reset() 56 | feed(markup) 57 | 58 | The tree builder may call these methods from its feed() implementation: 59 | handle_starttag(name, attrs) # See note about return value 60 | handle_endtag(name) 61 | handle_data(data) # Appends to the current data node 62 | endData(containerClass=NavigableString) # Ends the current data node 63 | 64 | No matter how complicated the underlying parser is, you should be 65 | able to build a tree using 'start tag' events, 'end tag' events, 66 | 'data' events, and "done with data" events. 67 | 68 | If you encounter an empty-element tag (aka a self-closing tag, 69 | like HTML's
<br>
tag), call handle_starttag and then 70 | handle_endtag. 71 | """ 72 | ROOT_TAG_NAME = u'[document]' 73 | 74 | # If the end-user gives no indication which tree builder they 75 | # want, look for one with these features. 76 | DEFAULT_BUILDER_FEATURES = ['html', 'fast'] 77 | 78 | ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' 79 | 80 | NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" 81 | 82 | def __init__(self, markup="", features=None, builder=None, 83 | parse_only=None, from_encoding=None, exclude_encodings=None, 84 | **kwargs): 85 | """The Soup object is initialized as the 'root tag', and the 86 | provided markup (which can be a string or a file-like object) 87 | is fed into the underlying parser.""" 88 | 89 | if 'convertEntities' in kwargs: 90 | warnings.warn( 91 | "BS4 does not respect the convertEntities argument to the " 92 | "BeautifulSoup constructor. Entities are always converted " 93 | "to Unicode characters.") 94 | 95 | if 'markupMassage' in kwargs: 96 | del kwargs['markupMassage'] 97 | warnings.warn( 98 | "BS4 does not respect the markupMassage argument to the " 99 | "BeautifulSoup constructor. The tree builder is responsible " 100 | "for any necessary markup massage.") 101 | 102 | if 'smartQuotesTo' in kwargs: 103 | del kwargs['smartQuotesTo'] 104 | warnings.warn( 105 | "BS4 does not respect the smartQuotesTo argument to the " 106 | "BeautifulSoup constructor. Smart quotes are always converted " 107 | "to Unicode characters.") 108 | 109 | if 'selfClosingTags' in kwargs: 110 | del kwargs['selfClosingTags'] 111 | warnings.warn( 112 | "BS4 does not respect the selfClosingTags argument to the " 113 | "BeautifulSoup constructor. The tree builder is responsible " 114 | "for understanding self-closing tags.") 115 | 116 | if 'isHTML' in kwargs: 117 | del kwargs['isHTML'] 118 | warnings.warn( 119 | "BS4 does not respect the isHTML argument to the " 120 | "BeautifulSoup constructor. Suggest you use " 121 | "features='lxml' for HTML and features='lxml-xml' for " 122 | "XML.") 123 | 124 | def deprecated_argument(old_name, new_name): 125 | if old_name in kwargs: 126 | warnings.warn( 127 | 'The "%s" argument to the BeautifulSoup constructor ' 128 | 'has been renamed to "%s."' % (old_name, new_name)) 129 | value = kwargs[old_name] 130 | del kwargs[old_name] 131 | return value 132 | return None 133 | 134 | parse_only = parse_only or deprecated_argument( 135 | "parseOnlyThese", "parse_only") 136 | 137 | from_encoding = from_encoding or deprecated_argument( 138 | "fromEncoding", "from_encoding") 139 | 140 | if len(kwargs) > 0: 141 | arg = kwargs.keys().pop() 142 | raise TypeError( 143 | "__init__() got an unexpected keyword argument '%s'" % arg) 144 | 145 | if builder is None: 146 | original_features = features 147 | if isinstance(features, basestring): 148 | features = [features] 149 | if features is None or len(features) == 0: 150 | features = self.DEFAULT_BUILDER_FEATURES 151 | builder_class = builder_registry.lookup(*features) 152 | if builder_class is None: 153 | raise FeatureNotFound( 154 | "Couldn't find a tree builder with the features you " 155 | "requested: %s. 
Do you need to install a parser library?" 156 | % ",".join(features)) 157 | builder = builder_class() 158 | if not (original_features == builder.NAME or 159 | original_features in builder.ALTERNATE_NAMES): 160 | if builder.is_xml: 161 | markup_type = "XML" 162 | else: 163 | markup_type = "HTML" 164 | warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( 165 | parser=builder.NAME, 166 | markup_type=markup_type)) 167 | 168 | self.builder = builder 169 | self.is_xml = builder.is_xml 170 | self.builder.soup = self 171 | 172 | self.parse_only = parse_only 173 | 174 | if hasattr(markup, 'read'): # It's a file-type object. 175 | markup = markup.read() 176 | elif len(markup) <= 256: 177 | # Print out warnings for a couple beginner problems 178 | # involving passing non-markup to Beautiful Soup. 179 | # Beautiful Soup will still parse the input as markup, 180 | # just in case that's what the user really wants. 181 | if (isinstance(markup, unicode) 182 | and not os.path.supports_unicode_filenames): 183 | possible_filename = markup.encode("utf8") 184 | else: 185 | possible_filename = markup 186 | is_file = False 187 | try: 188 | is_file = os.path.exists(possible_filename) 189 | except Exception, e: 190 | # This is almost certainly a problem involving 191 | # characters not valid in filenames on this 192 | # system. Just let it go. 193 | pass 194 | if is_file: 195 | if isinstance(markup, unicode): 196 | markup = markup.encode("utf8") 197 | warnings.warn( 198 | '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) 199 | if markup[:5] == "http:" or markup[:6] == "https:": 200 | # TODO: This is ugly but I couldn't get it to work in 201 | # Python 3 otherwise. 202 | if ((isinstance(markup, bytes) and not b' ' in markup) 203 | or (isinstance(markup, unicode) and not u' ' in markup)): 204 | if isinstance(markup, unicode): 205 | markup = markup.encode("utf8") 206 | warnings.warn( 207 | '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) 208 | 209 | for (self.markup, self.original_encoding, self.declared_html_encoding, 210 | self.contains_replacement_characters) in ( 211 | self.builder.prepare_markup( 212 | markup, from_encoding, exclude_encodings=exclude_encodings)): 213 | self.reset() 214 | try: 215 | self._feed() 216 | break 217 | except ParserRejectedMarkup: 218 | pass 219 | 220 | # Clear out the markup and remove the builder's circular 221 | # reference to this object. 222 | self.markup = None 223 | self.builder.soup = None 224 | 225 | def __copy__(self): 226 | return type(self)(self.encode(), builder=self.builder) 227 | 228 | def __getstate__(self): 229 | # Frequently a tree builder can't be pickled. 230 | d = dict(self.__dict__) 231 | if 'builder' in d and not self.builder.picklable: 232 | del d['builder'] 233 | return d 234 | 235 | def _feed(self): 236 | # Convert the document to Unicode. 237 | self.builder.reset() 238 | 239 | self.builder.feed(self.markup) 240 | # Close out any unfinished strings and close all the open tags. 
241 | self.endData() 242 | while self.currentTag.name != self.ROOT_TAG_NAME: 243 | self.popTag() 244 | 245 | def reset(self): 246 | Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) 247 | self.hidden = 1 248 | self.builder.reset() 249 | self.current_data = [] 250 | self.currentTag = None 251 | self.tagStack = [] 252 | self.preserve_whitespace_tag_stack = [] 253 | self.pushTag(self) 254 | 255 | def new_tag(self, name, namespace=None, nsprefix=None, **attrs): 256 | """Create a new tag associated with this soup.""" 257 | return Tag(None, self.builder, name, namespace, nsprefix, attrs) 258 | 259 | def new_string(self, s, subclass=NavigableString): 260 | """Create a new NavigableString associated with this soup.""" 261 | return subclass(s) 262 | 263 | def insert_before(self, successor): 264 | raise NotImplementedError("BeautifulSoup objects don't support insert_before().") 265 | 266 | def insert_after(self, successor): 267 | raise NotImplementedError("BeautifulSoup objects don't support insert_after().") 268 | 269 | def popTag(self): 270 | tag = self.tagStack.pop() 271 | if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: 272 | self.preserve_whitespace_tag_stack.pop() 273 | #print "Pop", tag.name 274 | if self.tagStack: 275 | self.currentTag = self.tagStack[-1] 276 | return self.currentTag 277 | 278 | def pushTag(self, tag): 279 | #print "Push", tag.name 280 | if self.currentTag: 281 | self.currentTag.contents.append(tag) 282 | self.tagStack.append(tag) 283 | self.currentTag = self.tagStack[-1] 284 | if tag.name in self.builder.preserve_whitespace_tags: 285 | self.preserve_whitespace_tag_stack.append(tag) 286 | 287 | def endData(self, containerClass=NavigableString): 288 | if self.current_data: 289 | current_data = u''.join(self.current_data) 290 | # If whitespace is not preserved, and this string contains 291 | # nothing but ASCII spaces, replace it with a single space 292 | # or newline. 293 | if not self.preserve_whitespace_tag_stack: 294 | strippable = True 295 | for i in current_data: 296 | if i not in self.ASCII_SPACES: 297 | strippable = False 298 | break 299 | if strippable: 300 | if '\n' in current_data: 301 | current_data = '\n' 302 | else: 303 | current_data = ' ' 304 | 305 | # Reset the data collector. 306 | self.current_data = [] 307 | 308 | # Should we add this string to the tree at all? 309 | if self.parse_only and len(self.tagStack) <= 1 and \ 310 | (not self.parse_only.text or \ 311 | not self.parse_only.search(current_data)): 312 | return 313 | 314 | o = containerClass(current_data) 315 | self.object_was_parsed(o) 316 | 317 | def object_was_parsed(self, o, parent=None, most_recent_element=None): 318 | """Add an object to the parse tree.""" 319 | parent = parent or self.currentTag 320 | previous_element = most_recent_element or self._most_recent_element 321 | 322 | next_element = previous_sibling = next_sibling = None 323 | if isinstance(o, Tag): 324 | next_element = o.next_element 325 | next_sibling = o.next_sibling 326 | previous_sibling = o.previous_sibling 327 | if not previous_element: 328 | previous_element = o.previous_element 329 | 330 | o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) 331 | 332 | self._most_recent_element = o 333 | parent.contents.append(o) 334 | 335 | if parent.next_sibling: 336 | # This node is being inserted into an element that has 337 | # already been parsed. Deal with any dangling references. 
338 |                 index = parent.contents.index(o)
339 |                 if index == 0:
340 |                     previous_element = parent
341 |                     previous_sibling = None
342 |                 else:
343 |                     previous_element = previous_sibling = parent.contents[index-1]
344 |                 if index == len(parent.contents)-1:
345 |                     next_element = parent.next_sibling
346 |                     next_sibling = None
347 |                 else:
348 |                     next_element = next_sibling = parent.contents[index+1]
349 | 
350 |             o.previous_element = previous_element
351 |             if previous_element:
352 |                 previous_element.next_element = o
353 |             o.next_element = next_element
354 |             if next_element:
355 |                 next_element.previous_element = o
356 |             o.next_sibling = next_sibling
357 |             if next_sibling:
358 |                 next_sibling.previous_sibling = o
359 |             o.previous_sibling = previous_sibling
360 |             if previous_sibling:
361 |                 previous_sibling.next_sibling = o
362 | 
363 |     def _popToTag(self, name, nsprefix=None, inclusivePop=True):
364 |         """Pops the tag stack up to and including the most recent
365 |         instance of the given tag. If inclusivePop is false, pops the tag
366 |         stack up to but *not* including the most recent instance of
367 |         the given tag."""
368 |         #print "Popping to %s" % name
369 |         if name == self.ROOT_TAG_NAME:
370 |             # The BeautifulSoup object itself can never be popped.
371 |             return
372 | 
373 |         most_recently_popped = None
374 | 
375 |         stack_size = len(self.tagStack)
376 |         for i in range(stack_size - 1, 0, -1):
377 |             t = self.tagStack[i]
378 |             if (name == t.name and nsprefix == t.prefix):
379 |                 if inclusivePop:
380 |                     most_recently_popped = self.popTag()
381 |                 break
382 |             most_recently_popped = self.popTag()
383 | 
384 |         return most_recently_popped
385 | 
386 |     def handle_starttag(self, name, namespace, nsprefix, attrs):
387 |         """Push a start tag on to the stack.
388 | 
389 |         If this method returns None, the tag was rejected by the
390 |         SoupStrainer. You should proceed as if the tag had not occurred
391 |         in the document. For instance, if this was a self-closing tag,
392 |         don't call handle_endtag.
393 |         """
394 | 
395 |         # print "Start tag %s: %s" % (name, attrs)
396 |         self.endData()
397 | 
398 |         if (self.parse_only and len(self.tagStack) <= 1
399 |             and (self.parse_only.text
400 |                  or not self.parse_only.search_tag(name, attrs))):
401 |             return None
402 | 
403 |         tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
404 |                   self.currentTag, self._most_recent_element)
405 |         if tag is None:
406 |             return tag
407 |         if self._most_recent_element:
408 |             self._most_recent_element.next_element = tag
409 |         self._most_recent_element = tag
410 |         self.pushTag(tag)
411 |         return tag
412 | 
413 |     def handle_endtag(self, name, nsprefix=None):
414 |         #print "End tag: " + name
415 |         self.endData()
416 |         self._popToTag(name, nsprefix)
417 | 
418 |     def handle_data(self, data):
419 |         self.current_data.append(data)
420 | 
421 |     def decode(self, pretty_print=False,
422 |                eventual_encoding=DEFAULT_OUTPUT_ENCODING,
423 |                formatter="minimal"):
424 |         """Returns a string or Unicode representation of this document.
425 |         To get Unicode, pass None for encoding."""
426 | 
427 |         if self.is_xml:
428 |             # Print the XML declaration
429 |             encoding_part = ''
430 |             if eventual_encoding != None:
431 |                 encoding_part = ' encoding="%s"' % eventual_encoding
432 |             prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
433 |         else:
434 |             prefix = u''
435 |         if not pretty_print:
436 |             indent_level = None
437 |         else:
438 |             indent_level = 0
439 |         return prefix + super(BeautifulSoup, self).decode(
440 |             indent_level, eventual_encoding, formatter)
441 | 
442 | # Alias to make it easier to type import: 'from bs4 import _soup'
443 | _s = BeautifulSoup
444 | _soup = BeautifulSoup
445 | 
446 | class BeautifulStoneSoup(BeautifulSoup):
447 |     """Deprecated interface to an XML parser."""
448 | 
449 |     def __init__(self, *args, **kwargs):
450 |         kwargs['features'] = 'xml'
451 |         warnings.warn(
452 |             'The BeautifulStoneSoup class is deprecated. Instead of using '
453 |             'it, pass features="xml" into the BeautifulSoup constructor.')
454 |         super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
455 | 
456 | 
457 | class StopParsing(Exception):
458 |     pass
459 | 
460 | class FeatureNotFound(ValueError):
461 |     pass
462 | 
463 | 
464 | #By default, act as an HTML pretty-printer.
465 | if __name__ == '__main__':
466 |     import sys
467 |     soup = BeautifulSoup(sys.stdin)
468 |     print soup.prettify()
469 | 
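bs4/__init__.py ends here. For orientation, this is how the constructor above is typically driven from user code — a minimal sketch (the markup string is illustrative; `html.parser` is the stdlib builder, so no third-party parser is assumed):

```python
from bs4 import BeautifulSoup

# Naming a parser explicitly avoids NO_PARSER_SPECIFIED_WARNING and
# pins behaviour to one tree builder across machines.
soup = BeautifulSoup("<p>Hello</p>", "html.parser")
print(soup.p.string)  # Hello

# With no parser given, the builder registry is searched using
# DEFAULT_BUILDER_FEATURES (['html', 'fast']) and the warning is emitted.
soup = BeautifulSoup("<p>Hello</p>")
```

Passing the parser name positionally fills the `features` argument, which is the path that reaches `builder_registry.lookup()` in the constructor above.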

-------------------------------------------------------------------------------- /bs4/tests/test_soup.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Tests of Beautiful Soup as a whole."""
3 | 
4 | from pdb import set_trace
5 | import logging
6 | import unittest
7 | import sys
8 | import tempfile
9 | 
10 | from bs4 import (
11 |     BeautifulSoup,
12 |     BeautifulStoneSoup,
13 | )
14 | from bs4.element import (
15 |     CharsetMetaAttributeValue,
16 |     ContentMetaAttributeValue,
17 |     SoupStrainer,
18 |     NamespacedAttribute,
19 |     )
20 | import bs4.dammit
21 | from bs4.dammit import (
22 |     EntitySubstitution,
23 |     UnicodeDammit,
24 |     EncodingDetector,
25 | )
26 | from bs4.testing import (
27 |     SoupTest,
28 |     skipIf,
29 | )
30 | import warnings
31 | 
32 | try:
33 |     from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
34 |     LXML_PRESENT = True
35 | except ImportError, e:
36 |     LXML_PRESENT = False
37 | 
38 | PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
39 | PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
40 | 
41 | class TestConstructor(SoupTest):
42 | 
43 |     def test_short_unicode_input(self):
44 |         data = u"<html><h1>éé</h1></html>"
45 |         soup = self.soup(data)
46 |         self.assertEqual(u"éé", soup.h1.string)
47 | 
48 |     def test_embedded_null(self):
49 |         data = u"<html><h1>foo\0bar</h1></html>"
50 |         soup = self.soup(data)
51 |         self.assertEqual(u"foo\0bar", soup.h1.string)
52 | 

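The next test drives the `exclude_encodings` argument handled by `prepare_markup()` in the constructor above; here is the same call from user code — a hedged sketch (the input bytes and the Windows-1252 fallback mirror the test that follows):

```python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

utf8_data = u"Räksmörgås".encode("utf-8")

# Ruling out UTF-8 forces the encoding detector to pick another
# candidate; on this input it falls back to Windows-1252.
soup = BeautifulSoup(utf8_data, "html.parser", exclude_encodings=["utf-8"])
print(soup.original_encoding)  # windows-1252
```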
53 |     def test_exclude_encodings(self):
54 |         utf8_data = u"Räksmörgås".encode("utf-8")
55 |         soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
56 |         self.assertEqual("windows-1252", soup.original_encoding)
57 | 
58 | 
59 | class TestWarnings(SoupTest):
60 | 
61 |     def _assert_no_parser_specified(self, s, is_there=True):
62 |         v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
63 |         self.assertTrue(v)
64 | 
65 |     def test_warning_if_no_parser_specified(self):
66 |         with warnings.catch_warnings(record=True) as w:
67 |             soup = self.soup("<a><b></b></a>")
68 |         msg = str(w[0].message)
69 |         self._assert_no_parser_specified(msg)
70 | 
71 |     def test_warning_if_parser_specified_too_vague(self):
72 |         with warnings.catch_warnings(record=True) as w:
73 |             soup = self.soup("<a><b></b></a>", "html")
74 |         msg = str(w[0].message)
75 |         self._assert_no_parser_specified(msg)
76 | 
77 |     def test_no_warning_if_explicit_parser_specified(self):
78 |         with warnings.catch_warnings(record=True) as w:
79 |             soup = self.soup("<a><b></b></a>", "html.parser")
80 |         self.assertEqual([], w)
81 | 
82 |     def test_parseOnlyThese_renamed_to_parse_only(self):
83 |         with warnings.catch_warnings(record=True) as w:
84 |             soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
85 |         msg = str(w[0].message)
86 |         self.assertTrue("parseOnlyThese" in msg)
87 |         self.assertTrue("parse_only" in msg)
88 |         self.assertEqual(b"<b></b>", soup.encode())
89 | 
90 |     def test_fromEncoding_renamed_to_from_encoding(self):
91 |         with warnings.catch_warnings(record=True) as w:
92 |             utf8 = b"\xc3\xa9"
93 |             soup = self.soup(utf8, fromEncoding="utf8")
94 |         msg = str(w[0].message)
95 |         self.assertTrue("fromEncoding" in msg)
96 |         self.assertTrue("from_encoding" in msg)
97 |         self.assertEqual("utf8", soup.original_encoding)
98 | 
99 |     def test_unrecognized_keyword_argument(self):
100 |         self.assertRaises(
101 |             TypeError, self.soup, "<a>", no_such_argument=True)
102 | 
103 | class TestNonMarkupWarnings(SoupTest):  # was a second "TestWarnings", which shadowed the class above
104 | 
105 |     def test_disk_file_warning(self):
106 |         filehandle = tempfile.NamedTemporaryFile()
107 |         filename = filehandle.name
108 |         try:
109 |             with warnings.catch_warnings(record=True) as w:
110 |                 soup = self.soup(filename)
111 |             msg = str(w[0].message)
112 |             self.assertTrue("looks like a filename" in msg)
113 |         finally:
114 |             filehandle.close()
115 | 
116 |         # The file no longer exists, so Beautiful Soup will no longer issue the warning.
117 |         with warnings.catch_warnings(record=True) as w:
118 |             soup = self.soup(filename)
119 |         self.assertEqual(0, len(w))
120 | 
121 |     def test_url_warning(self):
122 |         with warnings.catch_warnings(record=True) as w:
123 |             soup = self.soup("http://www.crummy.com/")
124 |         msg = str(w[0].message)
125 |         self.assertTrue("looks like a URL" in msg)
126 | 
127 |         with warnings.catch_warnings(record=True) as w:
128 |             soup = self.soup("http://www.crummy.com/ is great")
129 |         self.assertEqual(0, len(w))
130 | 
131 | class TestSelectiveParsing(SoupTest):
132 | 
133 |     def test_parse_with_soupstrainer(self):
134 |         markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
135 |         strainer = SoupStrainer("b")
136 |         soup = self.soup(markup, parse_only=strainer)
137 |         self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
138 | 
139 | 
140 | class TestEntitySubstitution(unittest.TestCase):
141 |     """Standalone tests of the EntitySubstitution class."""
142 |     def setUp(self):
143 |         self.sub = EntitySubstitution
144 | 
145 |     def test_simple_html_substitution(self):
146 |         # Unicode characters corresponding to named HTML entities
147 |         # are substituted, and no others.
148 |         s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
149 |         self.assertEqual(self.sub.substitute_html(s),
150 |                          u"foo&forall;\N{SNOWMAN}&otilde;bar")
151 | 
152 |     def test_smart_quote_substitution(self):
153 |         # MS smart quotes are a common source of frustration, so we
154 |         # give them a special test.
155 |         quotes = b"\x91\x92foo\x93\x94"
156 |         dammit = UnicodeDammit(quotes)
157 |         self.assertEqual(self.sub.substitute_html(dammit.markup),
158 |                          "&lsquo;&rsquo;foo&ldquo;&rdquo;")
159 | 
160 |     def test_xml_conversion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
161 |         s = 'Welcome to "my bar"'
162 |         self.assertEqual(self.sub.substitute_xml(s, False), s)
163 | 
164 |     def test_xml_attribute_quoting_normally_uses_double_quotes(self):
165 |         self.assertEqual(self.sub.substitute_xml("Welcome", True),
166 |                          '"Welcome"')
167 |         self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
168 |                          '"Bob\'s Bar"')
169 | 
170 |     def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
171 |         s = 'Welcome to "my bar"'
172 |         self.assertEqual(self.sub.substitute_xml(s, True),
173 |                          "'Welcome to \"my bar\"'")
174 | 
175 |     def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
176 |         s = 'Welcome to "Bob\'s Bar"'
177 |         self.assertEqual(
178 |             self.sub.substitute_xml(s, True),
179 |             '"Welcome to &quot;Bob\'s Bar&quot;"')
180 | 
181 |     def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
182 |         quoted = 'Welcome to "Bob\'s Bar"'
183 |         self.assertEqual(self.sub.substitute_xml(quoted), quoted)
184 | 
185 |     def test_xml_quoting_handles_angle_brackets(self):
186 |         self.assertEqual(
187 |             self.sub.substitute_xml("foo<bar>"),
188 |             "foo&lt;bar&gt;")
189 | 
190 |     def test_xml_quoting_handles_ampersands(self):
191 |         self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
192 | 
193 |     def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
194 |         self.assertEqual(
195 |             self.sub.substitute_xml("&Aacute;T&T"),
196 |             "&amp;Aacute;T&amp;T")
197 | 
198 |     def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
199 |         self.assertEqual(
200 |             self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
201 |             "&Aacute;T&amp;T")
202 | 
203 |     def test_quotes_not_html_substituted(self):
204 |         """There's no need to do this except inside attribute values."""
205 |         text = 'Bob\'s "bar"'
206 |         self.assertEqual(self.sub.substitute_html(text), text)
207 | 
208 | 
209 | class TestEncodingConversion(SoupTest):
210 |     # Test Beautiful Soup's ability to decode and encode from various
211 |     # encodings.
212 | 
213 |     def setUp(self):
214 |         super(TestEncodingConversion, self).setUp()
215 |         self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
216 |         self.utf8_data = self.unicode_data.encode("utf-8")
217 |         # Just so you know what it looks like.
218 |         self.assertEqual(
219 |             self.utf8_data,
220 |             b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
221 | 
222 |     def test_ascii_in_unicode_out(self):
223 |         # ASCII input is converted to Unicode. The original_encoding
224 |         # attribute is set to 'utf-8', a superset of ASCII.
225 |         chardet = bs4.dammit.chardet_dammit
226 |         logging.disable(logging.WARNING)
227 |         try:
228 |             def noop(str):
229 |                 return None
230 |             # Disable chardet, which will realize that the ASCII is ASCII.
231 | bs4.dammit.chardet_dammit = noop 232 | ascii = b"a" 233 | soup_from_ascii = self.soup(ascii) 234 | unicode_output = soup_from_ascii.decode() 235 | self.assertTrue(isinstance(unicode_output, unicode)) 236 | self.assertEqual(unicode_output, self.document_for(ascii.decode())) 237 | self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") 238 | finally: 239 | logging.disable(logging.NOTSET) 240 | bs4.dammit.chardet_dammit = chardet 241 | 242 | def test_unicode_in_unicode_out(self): 243 | # Unicode input is left alone. The original_encoding attribute 244 | # is not set. 245 | soup_from_unicode = self.soup(self.unicode_data) 246 | self.assertEqual(soup_from_unicode.decode(), self.unicode_data) 247 | self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') 248 | self.assertEqual(soup_from_unicode.original_encoding, None) 249 | 250 | def test_utf8_in_unicode_out(self): 251 | # UTF-8 input is converted to Unicode. The original_encoding 252 | # attribute is set. 253 | soup_from_utf8 = self.soup(self.utf8_data) 254 | self.assertEqual(soup_from_utf8.decode(), self.unicode_data) 255 | self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') 256 | 257 | def test_utf8_out(self): 258 | # The internal data structures can be encoded as UTF-8. 259 | soup_from_unicode = self.soup(self.unicode_data) 260 | self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data) 261 | 262 | @skipIf( 263 | PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, 264 | "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") 265 | def test_attribute_name_containing_unicode_characters(self): 266 | markup = u'
' 267 | self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) 268 | 269 | class TestUnicodeDammit(unittest.TestCase): 270 | """Standalone tests of UnicodeDammit.""" 271 | 272 | def test_unicode_input(self): 273 | markup = u"I'm already Unicode! \N{SNOWMAN}" 274 | dammit = UnicodeDammit(markup) 275 | self.assertEqual(dammit.unicode_markup, markup) 276 | 277 | def test_smart_quotes_to_unicode(self): 278 | markup = b"\x91\x92\x93\x94" 279 | dammit = UnicodeDammit(markup) 280 | self.assertEqual( 281 | dammit.unicode_markup, u"\u2018\u2019\u201c\u201d") 282 | 283 | def test_smart_quotes_to_xml_entities(self): 284 | markup = b"\x91\x92\x93\x94" 285 | dammit = UnicodeDammit(markup, smart_quotes_to="xml") 286 | self.assertEqual( 287 | dammit.unicode_markup, "‘’“”") 288 | 289 | def test_smart_quotes_to_html_entities(self): 290 | markup = b"\x91\x92\x93\x94" 291 | dammit = UnicodeDammit(markup, smart_quotes_to="html") 292 | self.assertEqual( 293 | dammit.unicode_markup, "‘’“”") 294 | 295 | def test_smart_quotes_to_ascii(self): 296 | markup = b"\x91\x92\x93\x94" 297 | dammit = UnicodeDammit(markup, smart_quotes_to="ascii") 298 | self.assertEqual( 299 | dammit.unicode_markup, """''""""") 300 | 301 | def test_detect_utf8(self): 302 | utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83" 303 | dammit = UnicodeDammit(utf8) 304 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 305 | self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}') 306 | 307 | 308 | def test_convert_hebrew(self): 309 | hebrew = b"\xed\xe5\xec\xf9" 310 | dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) 311 | self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') 312 | self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') 313 | 314 | def test_dont_see_smart_quotes_where_there_are_none(self): 315 | utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" 316 | dammit = UnicodeDammit(utf_8) 317 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 318 | self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) 319 | 320 | def test_ignore_inappropriate_codecs(self): 321 | utf8_data = u"Räksmörgås".encode("utf-8") 322 | dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) 323 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 324 | 325 | def test_ignore_invalid_codecs(self): 326 | utf8_data = u"Räksmörgås".encode("utf-8") 327 | for bad_encoding in ['.utf8', '...', 'utF---16.!']: 328 | dammit = UnicodeDammit(utf8_data, [bad_encoding]) 329 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 330 | 331 | def test_exclude_encodings(self): 332 | # This is UTF-8. 333 | utf8_data = u"Räksmörgås".encode("utf-8") 334 | 335 | # But if we exclude UTF-8 from consideration, the guess is 336 | # Windows-1252. 337 | dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"]) 338 | self.assertEqual(dammit.original_encoding.lower(), 'windows-1252') 339 | 340 | # And if we exclude that, there is no valid guess at all. 
341 | dammit = UnicodeDammit( 342 | utf8_data, exclude_encodings=["utf-8", "windows-1252"]) 343 | self.assertEqual(dammit.original_encoding, None) 344 | 345 | def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self): 346 | detected = EncodingDetector( 347 | b'') 348 | encodings = list(detected.encodings) 349 | assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings 350 | 351 | def test_detect_html5_style_meta_tag(self): 352 | 353 | for data in ( 354 | b'', 355 | b"", 356 | b"", 357 | b""): 358 | dammit = UnicodeDammit(data, is_html=True) 359 | self.assertEqual( 360 | "euc-jp", dammit.original_encoding) 361 | 362 | def test_last_ditch_entity_replacement(self): 363 | # This is a UTF-8 document that contains bytestrings 364 | # completely incompatible with UTF-8 (ie. encoded with some other 365 | # encoding). 366 | # 367 | # Since there is no consistent encoding for the document, 368 | # Unicode, Dammit will eventually encode the document as UTF-8 369 | # and encode the incompatible characters as REPLACEMENT 370 | # CHARACTER. 371 | # 372 | # If chardet is installed, it will detect that the document 373 | # can be converted into ISO-8859-1 without errors. This happens 374 | # to be the wrong encoding, but it is a consistent encoding, so the 375 | # code we're testing here won't run. 376 | # 377 | # So we temporarily disable chardet if it's present. 378 | doc = b"""\357\273\277 379 | \330\250\330\252\330\261 380 | \310\322\321\220\312\321\355\344""" 381 | chardet = bs4.dammit.chardet_dammit 382 | logging.disable(logging.WARNING) 383 | try: 384 | def noop(str): 385 | return None 386 | bs4.dammit.chardet_dammit = noop 387 | dammit = UnicodeDammit(doc) 388 | self.assertEqual(True, dammit.contains_replacement_characters) 389 | self.assertTrue(u"\ufffd" in dammit.unicode_markup) 390 | 391 | soup = BeautifulSoup(doc, "html.parser") 392 | self.assertTrue(soup.contains_replacement_characters) 393 | finally: 394 | logging.disable(logging.NOTSET) 395 | bs4.dammit.chardet_dammit = chardet 396 | 397 | def test_byte_order_mark_removed(self): 398 | # A document written in UTF-16LE will have its byte order marker stripped. 399 | data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' 400 | dammit = UnicodeDammit(data) 401 | self.assertEqual(u"áé", dammit.unicode_markup) 402 | self.assertEqual("utf-16le", dammit.original_encoding) 403 | 404 | def test_detwingle(self): 405 | # Here's a UTF8 document. 406 | utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") 407 | 408 | # Here's a Windows-1252 document. 409 | windows_1252 = ( 410 | u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" 411 | u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") 412 | 413 | # Through some unholy alchemy, they've been stuck together. 414 | doc = utf8 + windows_1252 + utf8 415 | 416 | # The document can't be turned into UTF-8: 417 | self.assertRaises(UnicodeDecodeError, doc.decode, "utf8") 418 | 419 | # Unicode, Dammit thinks the whole document is Windows-1252, 420 | # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃" 421 | 422 | # But if we run it through fix_embedded_windows_1252, it's fixed: 423 | 424 | fixed = UnicodeDammit.detwingle(doc) 425 | self.assertEqual( 426 | u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) 427 | 428 | def test_detwingle_ignores_multibyte_characters(self): 429 | # Each of these characters has a UTF-8 representation ending 430 | # in \x93. \x93 is a smart quote if interpreted as 431 | # Windows-1252. 
But our code knows to skip over multibyte
432 |         # UTF-8 characters, so they'll survive the process unscathed.
433 |         for tricky_unicode_char in (
434 |             u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
435 |             u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
436 |             u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
437 |             ):
438 |             input = tricky_unicode_char.encode("utf8")
439 |             self.assertTrue(input.endswith(b'\x93'))
440 |             output = UnicodeDammit.detwingle(input)
441 |             self.assertEqual(output, input)
442 | 
443 | class TestNamespacedAttribute(SoupTest):
444 | 
445 |     def test_name_may_be_none(self):
446 |         a = NamespacedAttribute("xmlns", None)
447 |         self.assertEqual(a, "xmlns")
448 | 
449 |     def test_attribute_is_equivalent_to_colon_separated_string(self):
450 |         a = NamespacedAttribute("a", "b")
451 |         self.assertEqual("a:b", a)
452 | 
453 |     def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
454 |         a = NamespacedAttribute("a", "b", "c")
455 |         b = NamespacedAttribute("a", "b", "c")
456 |         self.assertEqual(a, b)
457 | 
458 |         # The actual namespace is not considered.
459 |         c = NamespacedAttribute("a", "b", None)
460 |         self.assertEqual(a, c)
461 | 
462 |         # But name and prefix are important.
463 |         d = NamespacedAttribute("a", "z", "c")
464 |         self.assertNotEqual(a, d)
465 | 
466 |         e = NamespacedAttribute("z", "b", "c")
467 |         self.assertNotEqual(a, e)
468 | 
469 | 
470 | class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
471 | 
472 |     def test_charset_meta_attribute_value(self):  # renamed: was a duplicate of the test below
473 |         value = CharsetMetaAttributeValue("euc-jp")
474 |         self.assertEqual("euc-jp", value)
475 |         self.assertEqual("euc-jp", value.original_value)
476 |         self.assertEqual("utf8", value.encode("utf8"))
477 | 
478 | 
479 |     def test_content_meta_attribute_value(self):
480 |         value = ContentMetaAttributeValue("text/html; charset=euc-jp")
481 |         self.assertEqual("text/html; charset=euc-jp", value)
482 |         self.assertEqual("text/html; charset=euc-jp", value.original_value)
483 |         self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
484 | 
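That is the end of test_soup.py. As a quick recap of the two helpers it exercises most, here is a hedged sketch of `UnicodeDammit` and `EntitySubstitution` in isolation (the byte string is illustrative; the detected encoding can vary with the installed chardet):

```python
from bs4.dammit import EntitySubstitution, UnicodeDammit

# Detect the encoding of raw bytes and decode MS smart quotes
# to their Unicode equivalents.
dammit = UnicodeDammit(b"\x91\x92foo\x93\x94")
print(dammit.original_encoding)  # e.g. windows-1252
print(dammit.unicode_markup)     # u'\u2018\u2019foo\u201c\u201d'

# substitute_html() replaces only characters that have named
# HTML entities; other text passes through untouched.
print(EntitySubstitution.substitute_html(u"AT&T"))  # AT&amp;T
```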
-------------------------------------------------------------------------------- /workflow/web.py: --------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | #
3 | # Copyright (c) 2014 Dean Jackson <deanishe@deanishe.net>
4 | #
5 | # MIT Licence. See http://opensource.org/licenses/MIT
6 | #
7 | # Created on 2014-02-15
8 | #
9 | 
10 | """
11 | A lightweight HTTP library with a requests-like interface.
12 | """
13 | 
14 | from __future__ import print_function
15 | 
16 | import codecs
17 | import json
18 | import mimetypes
19 | import os
20 | import random
21 | import re
22 | import socket
23 | import string
24 | import unicodedata
25 | import urllib
26 | import urllib2
27 | import urlparse
28 | import zlib
29 | 
30 | 
31 | USER_AGENT = u'Alfred-Workflow/1.17 (+http://www.deanishe.net/alfred-workflow)'
32 | 
33 | # Valid characters for multipart form data boundaries
34 | BOUNDARY_CHARS = string.digits + string.ascii_letters
35 | 
36 | # HTTP response codes
37 | RESPONSES = {
38 |     100: 'Continue',
39 |     101: 'Switching Protocols',
40 |     200: 'OK',
41 |     201: 'Created',
42 |     202: 'Accepted',
43 |     203: 'Non-Authoritative Information',
44 |     204: 'No Content',
45 |     205: 'Reset Content',
46 |     206: 'Partial Content',
47 |     300: 'Multiple Choices',
48 |     301: 'Moved Permanently',
49 |     302: 'Found',
50 |     303: 'See Other',
51 |     304: 'Not Modified',
52 |     305: 'Use Proxy',
53 |     307: 'Temporary Redirect',
54 |     400: 'Bad Request',
55 |     401: 'Unauthorized',
56 |     402: 'Payment Required',
57 |     403: 'Forbidden',
58 |     404: 'Not Found',
59 |     405: 'Method Not Allowed',
60 |     406: 'Not Acceptable',
61 |     407: 'Proxy Authentication Required',
62 |     408: 'Request Timeout',
63 |     409: 'Conflict',
64 |     410: 'Gone',
65 |     411: 'Length Required',
66 |     412: 'Precondition Failed',
67 |     413: 'Request Entity Too Large',
68 |     414: 'Request-URI Too Long',
69 |     415: 'Unsupported Media Type',
70 |     416: 'Requested Range Not Satisfiable',
71 |     417: 'Expectation Failed',
72 |     500: 'Internal Server Error',
73 |     501: 'Not Implemented',
74 |     502: 'Bad Gateway',
75 |     503: 'Service Unavailable',
76 |     504: 'Gateway Timeout',
77 |     505: 'HTTP Version Not Supported'
78 | }
79 | 
80 | 
81 | def str_dict(dic):
82 |     """Convert keys and values in ``dic`` into UTF-8-encoded :class:`str`
83 | 
84 |     :param dic: :class:`dict` of Unicode strings
85 |     :returns: :class:`dict`
86 | 
87 |     """
88 |     if isinstance(dic, CaseInsensitiveDictionary):
89 |         dic2 = CaseInsensitiveDictionary()
90 |     else:
91 |         dic2 = {}
92 |     for k, v in dic.items():
93 |         if isinstance(k, unicode):
94 |             k = k.encode('utf-8')
95 |         if isinstance(v, unicode):
96 |             v = v.encode('utf-8')
97 |         dic2[k] = v
98 |     return dic2
99 | 
100 | 
101 | class NoRedirectHandler(urllib2.HTTPRedirectHandler):
102 |     """Prevent redirections"""
103 | 
104 |     def redirect_request(self, *args):
105 |         return None
106 | 
107 | 
108 | # Adapted from https://gist.github.com/babakness/3901174
109 | class CaseInsensitiveDictionary(dict):
110 |     """
111 |     Dictionary that enables case insensitive searching while preserving
112 |     case sensitivity when keys are listed, i.e., via keys() or items() methods.
113 | 
114 |     Works by storing a lowercase version of the key as the new key and
115 |     stores the original key-value pair as the key's value
116 |     (values become dictionaries).
117 | 118 | """ 119 | 120 | def __init__(self, initval=None): 121 | 122 | if isinstance(initval, dict): 123 | for key, value in initval.iteritems(): 124 | self.__setitem__(key, value) 125 | 126 | elif isinstance(initval, list): 127 | for (key, value) in initval: 128 | self.__setitem__(key, value) 129 | 130 | def __contains__(self, key): 131 | return dict.__contains__(self, key.lower()) 132 | 133 | def __getitem__(self, key): 134 | return dict.__getitem__(self, key.lower())['val'] 135 | 136 | def __setitem__(self, key, value): 137 | return dict.__setitem__(self, key.lower(), {'key': key, 'val': value}) 138 | 139 | def get(self, key, default=None): 140 | try: 141 | v = dict.__getitem__(self, key.lower()) 142 | except KeyError: 143 | return default 144 | else: 145 | return v['val'] 146 | 147 | def update(self, other): 148 | for k, v in other.items(): 149 | self[k] = v 150 | 151 | def items(self): 152 | return [(v['key'], v['val']) for v in dict.itervalues(self)] 153 | 154 | def keys(self): 155 | return [v['key'] for v in dict.itervalues(self)] 156 | 157 | def values(self): 158 | return [v['val'] for v in dict.itervalues(self)] 159 | 160 | def iteritems(self): 161 | for v in dict.itervalues(self): 162 | yield v['key'], v['val'] 163 | 164 | def iterkeys(self): 165 | for v in dict.itervalues(self): 166 | yield v['key'] 167 | 168 | def itervalues(self): 169 | for v in dict.itervalues(self): 170 | yield v['val'] 171 | 172 | 173 | class Response(object): 174 | """ 175 | Returned by :func:`request` / :func:`get` / :func:`post` functions. 176 | 177 | A simplified version of the ``Response`` object in the ``requests`` library. 178 | 179 | >>> r = request('http://www.google.com') 180 | >>> r.status_code 181 | 200 182 | >>> r.encoding 183 | ISO-8859-1 184 | >>> r.content # bytes 185 | ... 186 | >>> r.text # unicode, decoded according to charset in HTTP header/meta tag 187 | u' ...' 188 | >>> r.json() # content parsed as JSON 189 | 190 | """ 191 | 192 | def __init__(self, request, stream=False): 193 | """Call `request` with :mod:`urllib2` and process results. 194 | 195 | :param request: :class:`urllib2.Request` instance 196 | :param stream: Whether to stream response or retrieve it all at once 197 | :type stream: ``bool`` 198 | 199 | """ 200 | 201 | self.request = request 202 | self._stream = stream 203 | self.url = None 204 | self.raw = None 205 | self._encoding = None 206 | self.error = None 207 | self.status_code = None 208 | self.reason = None 209 | self.headers = CaseInsensitiveDictionary() 210 | self._content = None 211 | self._content_loaded = False 212 | self._gzipped = False 213 | 214 | # Execute query 215 | try: 216 | self.raw = urllib2.urlopen(request) 217 | except urllib2.HTTPError as err: 218 | self.error = err 219 | try: 220 | self.url = err.geturl() 221 | # sometimes (e.g. when authentication fails) 222 | # urllib can't get a URL from an HTTPError 223 | # This behaviour changes across Python versions, 224 | # so no test cover (it isn't important). 
225 | except AttributeError: # pragma: no cover 226 | pass 227 | self.status_code = err.code 228 | else: 229 | self.status_code = self.raw.getcode() 230 | self.url = self.raw.geturl() 231 | self.reason = RESPONSES.get(self.status_code) 232 | 233 | # Parse additional info if request succeeded 234 | if not self.error: 235 | headers = self.raw.info() 236 | self.transfer_encoding = headers.getencoding() 237 | self.mimetype = headers.gettype() 238 | for key in headers.keys(): 239 | self.headers[key.lower()] = headers.get(key) 240 | 241 | # Is content gzipped? 242 | # Transfer-Encoding appears to not be used in the wild 243 | # (contrary to the HTTP standard), but no harm in testing 244 | # for it 245 | if ('gzip' in headers.get('content-encoding', '') or 246 | 'gzip' in headers.get('transfer-encoding', '')): 247 | self._gzipped = True 248 | 249 | @property 250 | def stream(self): 251 | return self._stream 252 | 253 | @stream.setter 254 | def stream(self, value): 255 | if self._content_loaded: 256 | raise RuntimeError("`content` has already been read from " 257 | "this Response.") 258 | 259 | self._stream = value 260 | 261 | def json(self): 262 | """Decode response contents as JSON. 263 | 264 | :returns: object decoded from JSON 265 | :rtype: :class:`list` / :class:`dict` 266 | 267 | """ 268 | 269 | return json.loads(self.content, self.encoding or 'utf-8') 270 | 271 | @property 272 | def encoding(self): 273 | """Text encoding of document or ``None`` 274 | 275 | :returns: :class:`str` or ``None`` 276 | 277 | """ 278 | 279 | if not self._encoding: 280 | self._encoding = self._get_encoding() 281 | 282 | return self._encoding 283 | 284 | @property 285 | def content(self): 286 | """Raw content of response (i.e. bytes) 287 | 288 | :returns: Body of HTTP response 289 | :rtype: :class:`str` 290 | 291 | """ 292 | 293 | if not self._content: 294 | 295 | # Decompress gzipped content 296 | if self._gzipped: 297 | decoder = zlib.decompressobj(16 + zlib.MAX_WBITS) 298 | self._content = decoder.decompress(self.raw.read()) 299 | 300 | else: 301 | self._content = self.raw.read() 302 | 303 | self._content_loaded = True 304 | 305 | return self._content 306 | 307 | @property 308 | def text(self): 309 | """Unicode-decoded content of response body. 310 | 311 | If no encoding can be determined from HTTP headers or the content 312 | itself, the encoded response body will be returned instead. 313 | 314 | :returns: Body of HTTP response 315 | :rtype: :class:`unicode` or :class:`str` 316 | 317 | """ 318 | 319 | if self.encoding: 320 | return unicodedata.normalize('NFC', unicode(self.content, 321 | self.encoding)) 322 | return self.content 323 | 324 | def iter_content(self, chunk_size=4096, decode_unicode=False): 325 | """Iterate over response data. 326 | 327 | .. 
versionadded:: 1.6 328 | 329 | :param chunk_size: Number of bytes to read into memory 330 | :type chunk_size: ``int`` 331 | :param decode_unicode: Decode to Unicode using detected encoding 332 | :type decode_unicode: ``Boolean`` 333 | :returns: iterator 334 | 335 | """ 336 | 337 | if not self.stream: 338 | raise RuntimeError("You cannot call `iter_content` on a " 339 | "Response unless you passed `stream=True`" 340 | " to `get()`/`post()`/`request()`.") 341 | 342 | if self._content_loaded: 343 | raise RuntimeError( 344 | "`content` has already been read from this Response.") 345 | 346 | def decode_stream(iterator, r): 347 | 348 | decoder = codecs.getincrementaldecoder(r.encoding)(errors='replace') 349 | 350 | for chunk in iterator: 351 | data = decoder.decode(chunk) 352 | if data: 353 | yield data 354 | 355 | data = decoder.decode(b'', final=True) 356 | if data: # pragma: no cover 357 | yield data 358 | 359 | def generate(): 360 | 361 | if self._gzipped: 362 | decoder = zlib.decompressobj(16 + zlib.MAX_WBITS) 363 | 364 | while True: 365 | chunk = self.raw.read(chunk_size) 366 | if not chunk: 367 | break 368 | 369 | if self._gzipped: 370 | chunk = decoder.decompress(chunk) 371 | 372 | yield chunk 373 | 374 | chunks = generate() 375 | 376 | if decode_unicode and self.encoding: 377 | chunks = decode_stream(chunks, self) 378 | 379 | return chunks 380 | 381 | def save_to_path(self, filepath): 382 | """Save retrieved data to file at ``filepath`` 383 | 384 | .. versionadded: 1.9.6 385 | 386 | :param filepath: Path to save retrieved data. 387 | 388 | """ 389 | 390 | filepath = os.path.abspath(filepath) 391 | dirname = os.path.dirname(filepath) 392 | if not os.path.exists(dirname): 393 | os.makedirs(dirname) 394 | 395 | self.stream = True 396 | 397 | with open(filepath, 'wb') as fileobj: 398 | for data in self.iter_content(): 399 | fileobj.write(data) 400 | 401 | def raise_for_status(self): 402 | """Raise stored error if one occurred. 403 | 404 | error will be instance of :class:`urllib2.HTTPError` 405 | """ 406 | 407 | if self.error is not None: 408 | raise self.error 409 | return 410 | 411 | def _get_encoding(self): 412 | """Get encoding from HTTP headers or content. 
413 | 414 | :returns: encoding or `None` 415 | :rtype: ``unicode`` or ``None`` 416 | 417 | """ 418 | 419 | headers = self.raw.info() 420 | encoding = None 421 | 422 | if headers.getparam('charset'): 423 | encoding = headers.getparam('charset') 424 | 425 | # HTTP Content-Type header 426 | for param in headers.getplist(): 427 | if param.startswith('charset='): 428 | encoding = param[8:] 429 | break 430 | 431 | if not self.stream: # Try sniffing response content 432 | # Encoding declared in document should override HTTP headers 433 | if self.mimetype == 'text/html': # sniff HTML headers 434 | m = re.search("""""", 435 | self.content) 436 | if m: 437 | encoding = m.group(1) 438 | print('sniffed HTML encoding=%r' % encoding) 439 | 440 | elif ((self.mimetype.startswith('application/') or 441 | self.mimetype.startswith('text/')) and 442 | 'xml' in self.mimetype): 443 | m = re.search("""]*\?>""", 444 | self.content) 445 | if m: 446 | encoding = m.group(1) 447 | 448 | # Format defaults 449 | if self.mimetype == 'application/json' and not encoding: 450 | # The default encoding for JSON 451 | encoding = 'utf-8' 452 | 453 | elif self.mimetype == 'application/xml' and not encoding: 454 | # The default for 'application/xml' 455 | encoding = 'utf-8' 456 | 457 | if encoding: 458 | encoding = encoding.lower() 459 | 460 | return encoding 461 | 462 | 463 | def request(method, url, params=None, data=None, headers=None, cookies=None, 464 | files=None, auth=None, timeout=60, allow_redirects=False, 465 | stream=False): 466 | """Initiate an HTTP(S) request. Returns :class:`Response` object. 467 | 468 | :param method: 'GET' or 'POST' 469 | :type method: ``unicode`` 470 | :param url: URL to open 471 | :type url: ``unicode`` 472 | :param params: mapping of URL parameters 473 | :type params: :class:`dict` 474 | :param data: mapping of form data ``{'field_name': 'value'}`` or 475 | :class:`str` 476 | :type data: :class:`dict` or :class:`str` 477 | :param headers: HTTP headers 478 | :type headers: :class:`dict` 479 | :param cookies: cookies to send to server 480 | :type cookies: :class:`dict` 481 | :param files: files to upload (see below). 482 | :type files: :class:`dict` 483 | :param auth: username, password 484 | :type auth: ``tuple`` 485 | :param timeout: connection timeout limit in seconds 486 | :type timeout: ``int`` 487 | :param allow_redirects: follow redirections 488 | :type allow_redirects: ``Boolean`` 489 | :param stream: Stream content instead of fetching it all at once. 490 | :type stream: ``bool`` 491 | :returns: :class:`Response` object 492 | 493 | 494 | The ``files`` argument is a dictionary:: 495 | 496 | {'fieldname' : { 'filename': 'blah.txt', 497 | 'content': '', 498 | 'mimetype': 'text/plain'} 499 | } 500 | 501 | * ``fieldname`` is the name of the field in the HTML form. 502 | * ``mimetype`` is optional. If not provided, :mod:`mimetypes` will 503 | be used to guess the mimetype, or ``application/octet-stream`` 504 | will be used. 
505 | 506 | """ 507 | 508 | # TODO: cookies 509 | socket.setdefaulttimeout(timeout) 510 | 511 | # Default handlers 512 | openers = [] 513 | 514 | if not allow_redirects: 515 | openers.append(NoRedirectHandler()) 516 | 517 | if auth is not None: # Add authorisation handler 518 | username, password = auth 519 | password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() 520 | password_manager.add_password(None, url, username, password) 521 | auth_manager = urllib2.HTTPBasicAuthHandler(password_manager) 522 | openers.append(auth_manager) 523 | 524 | # Install our custom chain of openers 525 | opener = urllib2.build_opener(*openers) 526 | urllib2.install_opener(opener) 527 | 528 | if not headers: 529 | headers = CaseInsensitiveDictionary() 530 | else: 531 | headers = CaseInsensitiveDictionary(headers) 532 | 533 | if 'user-agent' not in headers: 534 | headers['user-agent'] = USER_AGENT 535 | 536 | # Accept gzip-encoded content 537 | encodings = [s.strip() for s in 538 | headers.get('accept-encoding', '').split(',')] 539 | if 'gzip' not in encodings: 540 | encodings.append('gzip') 541 | 542 | headers['accept-encoding'] = ', '.join(encodings) 543 | 544 | # Force POST by providing an empty data string 545 | if method == 'POST' and not data: 546 | data = '' 547 | 548 | if files: 549 | if not data: 550 | data = {} 551 | new_headers, data = encode_multipart_formdata(data, files) 552 | headers.update(new_headers) 553 | elif data and isinstance(data, dict): 554 | data = urllib.urlencode(str_dict(data)) 555 | 556 | # Make sure everything is encoded text 557 | headers = str_dict(headers) 558 | 559 | if isinstance(url, unicode): 560 | url = url.encode('utf-8') 561 | 562 | if params: # GET args (POST args are handled in encode_multipart_formdata) 563 | 564 | scheme, netloc, path, query, fragment = urlparse.urlsplit(url) 565 | 566 | if query: # Combine query string and `params` 567 | url_params = urlparse.parse_qs(query) 568 | # `params` take precedence over URL query string 569 | url_params.update(params) 570 | params = url_params 571 | 572 | query = urllib.urlencode(str_dict(params), doseq=True) 573 | url = urlparse.urlunsplit((scheme, netloc, path, query, fragment)) 574 | 575 | req = urllib2.Request(url, data, headers) 576 | return Response(req, stream) 577 | 578 | 579 | def get(url, params=None, headers=None, cookies=None, auth=None, 580 | timeout=60, allow_redirects=True, stream=False): 581 | """Initiate a GET request. Arguments as for :func:`request`. 582 | 583 | :returns: :class:`Response` instance 584 | 585 | """ 586 | 587 | return request('GET', url, params, headers=headers, cookies=cookies, 588 | auth=auth, timeout=timeout, allow_redirects=allow_redirects, 589 | stream=stream) 590 | 591 | 592 | def post(url, params=None, data=None, headers=None, cookies=None, files=None, 593 | auth=None, timeout=60, allow_redirects=False, stream=False): 594 | """Initiate a POST request. Arguments as for :func:`request`. 595 | 596 | :returns: :class:`Response` instance 597 | 598 | """ 599 | return request('POST', url, params, data, headers, cookies, files, auth, 600 | timeout, allow_redirects, stream) 601 | 602 | 603 | def encode_multipart_formdata(fields, files): 604 | """Encode form data (``fields``) and ``files`` for POST request. 605 | 606 | :param fields: mapping of ``{name : value}`` pairs for normal form fields. 607 | :type fields: :class:`dict` 608 | :param files: dictionary of fieldnames/files elements for file data. 609 | See below for details. 
610 | :type files: :class:`dict` of :class:`dicts` 611 | :returns: ``(headers, body)`` ``headers`` is a :class:`dict` of HTTP headers 612 | :rtype: 2-tuple ``(dict, str)`` 613 | 614 | The ``files`` argument is a dictionary:: 615 | 616 | {'fieldname' : { 'filename': 'blah.txt', 617 | 'content': '', 618 | 'mimetype': 'text/plain'} 619 | } 620 | 621 | - ``fieldname`` is the name of the field in the HTML form. 622 | - ``mimetype`` is optional. If not provided, :mod:`mimetypes` will be used to guess the mimetype, or ``application/octet-stream`` will be used. 623 | 624 | """ 625 | 626 | def get_content_type(filename): 627 | """Return or guess mimetype of ``filename``. 628 | 629 | :param filename: filename of file 630 | :type filename: unicode/string 631 | :returns: mime-type, e.g. ``text/html`` 632 | :rtype: :class::class:`str` 633 | 634 | """ 635 | 636 | return mimetypes.guess_type(filename)[0] or 'application/octet-stream' 637 | 638 | boundary = '-----' + ''.join(random.choice(BOUNDARY_CHARS) 639 | for i in range(30)) 640 | CRLF = '\r\n' 641 | output = [] 642 | 643 | # Normal form fields 644 | for (name, value) in fields.items(): 645 | if isinstance(name, unicode): 646 | name = name.encode('utf-8') 647 | if isinstance(value, unicode): 648 | value = value.encode('utf-8') 649 | output.append('--' + boundary) 650 | output.append('Content-Disposition: form-data; name="%s"' % name) 651 | output.append('') 652 | output.append(value) 653 | 654 | # Files to upload 655 | for name, d in files.items(): 656 | filename = d[u'filename'] 657 | content = d[u'content'] 658 | if u'mimetype' in d: 659 | mimetype = d[u'mimetype'] 660 | else: 661 | mimetype = get_content_type(filename) 662 | if isinstance(name, unicode): 663 | name = name.encode('utf-8') 664 | if isinstance(filename, unicode): 665 | filename = filename.encode('utf-8') 666 | if isinstance(mimetype, unicode): 667 | mimetype = mimetype.encode('utf-8') 668 | output.append('--' + boundary) 669 | output.append('Content-Disposition: form-data; ' 670 | 'name="%s"; filename="%s"' % (name, filename)) 671 | output.append('Content-Type: %s' % mimetype) 672 | output.append('') 673 | output.append(content) 674 | 675 | output.append('--' + boundary + '--') 676 | output.append('') 677 | body = CRLF.join(output) 678 | headers = { 679 | 'Content-Type': 'multipart/form-data; boundary=%s' % boundary, 680 | 'Content-Length': str(len(body)), 681 | } 682 | return (headers, body) 683 | -------------------------------------------------------------------------------- /bs4/testing.py: -------------------------------------------------------------------------------- 1 | """Helper classes for tests.""" 2 | 3 | __license__ = "MIT" 4 | 5 | import pickle 6 | import copy 7 | import functools 8 | import unittest 9 | from unittest import TestCase 10 | from bs4 import BeautifulSoup 11 | from bs4.element import ( 12 | CharsetMetaAttributeValue, 13 | Comment, 14 | ContentMetaAttributeValue, 15 | Doctype, 16 | SoupStrainer, 17 | ) 18 | 19 | from bs4.builder import HTMLParserTreeBuilder 20 | default_builder = HTMLParserTreeBuilder 21 | 22 | 23 | class SoupTest(unittest.TestCase): 24 | 25 | @property 26 | def default_builder(self): 27 | return default_builder() 28 | 29 | def soup(self, markup, **kwargs): 30 | """Build a Beautiful Soup object from markup.""" 31 | builder = kwargs.pop('builder', self.default_builder) 32 | return BeautifulSoup(markup, builder=builder, **kwargs) 33 | 34 | def document_for(self, markup): 35 | """Turn an HTML fragment into a document. 
36 | 37 | The details depend on the builder. 38 | """ 39 | return self.default_builder.test_fragment_to_document(markup) 40 | 41 | def assertSoupEquals(self, to_parse, compare_parsed_to=None): 42 | builder = self.default_builder 43 | obj = BeautifulSoup(to_parse, builder=builder) 44 | if compare_parsed_to is None: 45 | compare_parsed_to = to_parse 46 | 47 | self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) 48 | 49 | def assertConnectedness(self, element): 50 | """Ensure that next_element and previous_element are properly 51 | set for all descendants of the given element. 52 | """ 53 | earlier = None 54 | for e in element.descendants: 55 | if earlier: 56 | self.assertEqual(e, earlier.next_element) 57 | self.assertEqual(earlier, e.previous_element) 58 | earlier = e 59 | 60 | class HTMLTreeBuilderSmokeTest(object): 61 | 62 | """A basic test of a treebuilder's competence. 63 | 64 | Any HTML treebuilder, present or future, should be able to pass 65 | these tests. With invalid markup, there's room for interpretation, 66 | and different parsers can handle it differently. But with the 67 | markup in these tests, there's not much room for interpretation. 68 | """ 69 | 70 | def test_pickle_and_unpickle_identity(self): 71 | # Pickling a tree, then unpickling it, yields a tree identical 72 | # to the original. 73 | tree = self.soup("foo") 74 | dumped = pickle.dumps(tree, 2) 75 | loaded = pickle.loads(dumped) 76 | self.assertEqual(loaded.__class__, BeautifulSoup) 77 | self.assertEqual(loaded.decode(), tree.decode()) 78 | 79 | def assertDoctypeHandled(self, doctype_fragment): 80 | """Assert that a given doctype string is handled correctly.""" 81 | doctype_str, soup = self._document_with_doctype(doctype_fragment) 82 | 83 | # Make sure a Doctype object was created. 84 | doctype = soup.contents[0] 85 | self.assertEqual(doctype.__class__, Doctype) 86 | self.assertEqual(doctype, doctype_fragment) 87 | self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) 88 | 89 | # Make sure that the doctype was correctly associated with the 90 | # parse tree and that the rest of the document parsed. 91 | self.assertEqual(soup.p.contents[0], 'foo') 92 | 93 | def _document_with_doctype(self, doctype_fragment): 94 | """Generate and parse a document with the given doctype.""" 95 | doctype = '' % doctype_fragment 96 | markup = doctype + '\n

foo

' 97 | soup = self.soup(markup) 98 | return doctype, soup 99 | 100 | def test_normal_doctypes(self): 101 | """Make sure normal, everyday HTML doctypes are handled correctly.""" 102 | self.assertDoctypeHandled("html") 103 | self.assertDoctypeHandled( 104 | 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') 105 | 106 | def test_empty_doctype(self): 107 | soup = self.soup("") 108 | doctype = soup.contents[0] 109 | self.assertEqual("", doctype.strip()) 110 | 111 | def test_public_doctype_with_url(self): 112 | doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' 113 | self.assertDoctypeHandled(doctype) 114 | 115 | def test_system_doctype(self): 116 | self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') 117 | 118 | def test_namespaced_system_doctype(self): 119 | # We can handle a namespaced doctype with a system ID. 120 | self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') 121 | 122 | def test_namespaced_public_doctype(self): 123 | # Test a namespaced doctype with a public id. 124 | self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') 125 | 126 | def test_real_xhtml_document(self): 127 | """A real XHTML document should come out more or less the same as it went in.""" 128 | markup = b""" 129 | 130 | 131 | Hello. 132 | Goodbye. 133 | """ 134 | soup = self.soup(markup) 135 | self.assertEqual( 136 | soup.encode("utf-8").replace(b"\n", b""), 137 | markup.replace(b"\n", b"")) 138 | 139 | def test_processing_instruction(self): 140 | markup = b"""""" 141 | soup = self.soup(markup) 142 | self.assertEqual(markup, soup.encode("utf8")) 143 | 144 | def test_deepcopy(self): 145 | """Make sure you can copy the tree builder. 146 | 147 | This is important because the builder is part of a 148 | BeautifulSoup object, and we want to be able to copy that. 149 | """ 150 | copy.deepcopy(self.default_builder) 151 | 152 | def test_p_tag_is_never_empty_element(self): 153 | """A

tag is never designated as an empty-element tag. 154 | 155 | Even if the markup shows it as an empty-element tag, it 156 | shouldn't be presented that way. 157 | """ 158 | soup = self.soup("

") 159 | self.assertFalse(soup.p.is_empty_element) 160 | self.assertEqual(str(soup.p), "

") 161 | 162 | def test_unclosed_tags_get_closed(self): 163 | """A tag that's not closed by the end of the document should be closed. 164 | 165 | This applies to all tags except empty-element tags. 166 | """ 167 | self.assertSoupEquals("

", "

") 168 | self.assertSoupEquals("", "") 169 | 170 | self.assertSoupEquals("
", "
") 171 | 172 | def test_br_is_always_empty_element_tag(self): 173 | """A
tag is designated as an empty-element tag. 174 | 175 | Some parsers treat

as one
tag, some parsers as 176 | two tags, but it should always be an empty-element tag. 177 | """ 178 | soup = self.soup("

") 179 | self.assertTrue(soup.br.is_empty_element) 180 | self.assertEqual(str(soup.br), "
") 181 | 182 | def test_nested_formatting_elements(self): 183 | self.assertSoupEquals("") 184 | 185 | def test_double_head(self): 186 | html = ''' 187 | 188 | 189 | Ordinary HEAD element test 190 | 191 | 194 | 195 | Hello, world! 196 | 197 | 198 | ''' 199 | soup = self.soup(html) 200 | self.assertEqual("text/javascript", soup.find('script')['type']) 201 | 202 | def test_comment(self): 203 | # Comments are represented as Comment objects. 204 | markup = "

foobaz

" 205 | self.assertSoupEquals(markup) 206 | 207 | soup = self.soup(markup) 208 | comment = soup.find(text="foobar") 209 | self.assertEqual(comment.__class__, Comment) 210 | 211 | # The comment is properly integrated into the tree. 212 | foo = soup.find(text="foo") 213 | self.assertEqual(comment, foo.next_element) 214 | baz = soup.find(text="baz") 215 | self.assertEqual(comment, baz.previous_element) 216 | 217 | def test_preserved_whitespace_in_pre_and_textarea(self): 218 | """Whitespace must be preserved in
 and ")
221 | 
222 |     def test_nested_inline_elements(self):
223 |         """Inline elements can be nested indefinitely."""
224 |         b_tag = "Inside a B tag"
225 |         self.assertSoupEquals(b_tag)
226 | 
227 |         nested_b_tag = "

A nested tag

" 228 | self.assertSoupEquals(nested_b_tag) 229 | 230 | double_nested_b_tag = "

A doubly nested tag

" 231 | self.assertSoupEquals(nested_b_tag) 232 | 233 | def test_nested_block_level_elements(self): 234 | """Block elements can be nested.""" 235 | soup = self.soup('

Foo

') 236 | blockquote = soup.blockquote 237 | self.assertEqual(blockquote.p.b.string, 'Foo') 238 | self.assertEqual(blockquote.b.string, 'Foo') 239 | 240 | def test_correctly_nested_tables(self): 241 | """One table can go inside another one.""" 242 | markup = ('' 243 | '' 244 | "') 248 | 249 | self.assertSoupEquals( 250 | markup, 251 | '
Here's another table:" 245 | '' 246 | '' 247 | '
foo
Here\'s another table:' 252 | '
foo
' 253 | '
') 254 | 255 | self.assertSoupEquals( 256 | "" 257 | "" 258 | "
Foo
Bar
Baz
") 259 | 260 | def test_deeply_nested_multivalued_attribute(self): 261 | # html5lib can set the attributes of the same tag many times 262 | # as it rearranges the tree. This has caused problems with 263 | # multivalued attributes. 264 | markup = '
' 265 | soup = self.soup(markup) 266 | self.assertEqual(["css"], soup.div.div['class']) 267 | 268 | def test_multivalued_attribute_on_html(self): 269 | # html5lib uses a different API to set the attributes ot the 270 | # tag. This has caused problems with multivalued 271 | # attributes. 272 | markup = '' 273 | soup = self.soup(markup) 274 | self.assertEqual(["a", "b"], soup.html['class']) 275 | 276 | def test_angle_brackets_in_attribute_values_are_escaped(self): 277 | self.assertSoupEquals('', '') 278 | 279 | def test_entities_in_attributes_converted_to_unicode(self): 280 | expect = u'

' 281 | self.assertSoupEquals('

', expect) 282 | self.assertSoupEquals('

', expect) 283 | self.assertSoupEquals('

', expect) 284 | self.assertSoupEquals('

', expect) 285 | 286 | def test_entities_in_text_converted_to_unicode(self): 287 | expect = u'

pi\N{LATIN SMALL LETTER N WITH TILDE}ata

' 288 | self.assertSoupEquals("

piñata

", expect) 289 | self.assertSoupEquals("

piñata

", expect) 290 | self.assertSoupEquals("

piñata

", expect) 291 | self.assertSoupEquals("

piñata

", expect) 292 | 293 | def test_quot_entity_converted_to_quotation_mark(self): 294 | self.assertSoupEquals("

I said "good day!"

", 295 | '

I said "good day!"

') 296 | 297 | def test_out_of_range_entity(self): 298 | expect = u"\N{REPLACEMENT CHARACTER}" 299 | self.assertSoupEquals("�", expect) 300 | self.assertSoupEquals("�", expect) 301 | self.assertSoupEquals("�", expect) 302 | 303 | def test_multipart_strings(self): 304 | "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." 305 | soup = self.soup("

\nfoo

") 306 | self.assertEqual("p", soup.h2.string.next_element.name) 307 | self.assertEqual("p", soup.p.name) 308 | self.assertConnectedness(soup) 309 | 310 | def test_head_tag_between_head_and_body(self): 311 | "Prevent recurrence of a bug in the html5lib treebuilder." 312 | content = """ 313 | 314 | foo 315 | 316 | """ 317 | soup = self.soup(content) 318 | self.assertNotEqual(None, soup.html.body) 319 | self.assertConnectedness(soup) 320 | 321 | def test_multiple_copies_of_a_tag(self): 322 | "Prevent recurrence of a bug in the html5lib treebuilder." 323 | content = """ 324 | 325 | 326 | 332 | 333 | 334 | """ 335 | soup = self.soup(content) 336 | self.assertConnectedness(soup.article) 337 | 338 | def test_basic_namespaces(self): 339 | """Parsers don't need to *understand* namespaces, but at the 340 | very least they should not choke on namespaces or lose 341 | data.""" 342 | 343 | markup = b'4' 344 | soup = self.soup(markup) 345 | self.assertEqual(markup, soup.encode()) 346 | html = soup.html 347 | self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) 348 | self.assertEqual( 349 | 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) 350 | self.assertEqual( 351 | 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) 352 | 353 | def test_multivalued_attribute_value_becomes_list(self): 354 | markup = b'' 355 | soup = self.soup(markup) 356 | self.assertEqual(['foo', 'bar'], soup.a['class']) 357 | 358 | # 359 | # Generally speaking, tests below this point are more tests of 360 | # Beautiful Soup than tests of the tree builders. But parsers are 361 | # weird, so we run these tests separately for every tree builder 362 | # to detect any differences between them. 363 | # 364 | 365 | def test_can_parse_unicode_document(self): 366 | # A seemingly innocuous document... but it's in Unicode! And 367 | # it contains characters that can't be represented in the 368 | # encoding found in the declaration! The horror! 369 | markup = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' 370 | soup = self.soup(markup) 371 | self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) 372 | 373 | def test_soupstrainer(self): 374 | """Parsers should be able to work with SoupStrainers.""" 375 | strainer = SoupStrainer("b") 376 | soup = self.soup("A bold statement", 377 | parse_only=strainer) 378 | self.assertEqual(soup.decode(), "bold") 379 | 380 | def test_single_quote_attribute_values_become_double_quotes(self): 381 | self.assertSoupEquals("", 382 | '') 383 | 384 | def test_attribute_values_with_nested_quotes_are_left_alone(self): 385 | text = """a""" 386 | self.assertSoupEquals(text) 387 | 388 | def test_attribute_values_with_double_nested_quotes_get_quoted(self): 389 | text = """a""" 390 | soup = self.soup(text) 391 | soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' 392 | self.assertSoupEquals( 393 | soup.foo.decode(), 394 | """a""") 395 | 396 | def test_ampersand_in_attribute_value_gets_escaped(self): 397 | self.assertSoupEquals('', 398 | '') 399 | 400 | self.assertSoupEquals( 401 | 'foo', 402 | 'foo') 403 | 404 | def test_escaped_ampersand_in_attribute_value_is_left_alone(self): 405 | self.assertSoupEquals('') 406 | 407 | def test_entities_in_strings_converted_during_parsing(self): 408 | # Both XML and HTML entities are converted to Unicode characters 409 | # during parsing. 410 | text = "

<<sacré bleu!>>

" 411 | expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" 412 | self.assertSoupEquals(text, expected) 413 | 414 | def test_smart_quotes_converted_on_the_way_in(self): 415 | # Microsoft smart quotes are converted to Unicode characters during 416 | # parsing. 417 | quote = b"

\x91Foo\x92

" 418 | soup = self.soup(quote) 419 | self.assertEqual( 420 | soup.p.string, 421 | u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") 422 | 423 | def test_non_breaking_spaces_converted_on_the_way_in(self): 424 | soup = self.soup("  ") 425 | self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) 426 | 427 | def test_entities_converted_on_the_way_out(self): 428 | text = "

<<sacré bleu!>>

" 429 | expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

".encode("utf-8") 430 | soup = self.soup(text) 431 | self.assertEqual(soup.p.encode("utf-8"), expected) 432 | 433 | def test_real_iso_latin_document(self): 434 | # Smoke test of interrelated functionality, using an 435 | # easy-to-understand document. 436 | 437 | # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. 438 | unicode_html = u'

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

439 | 
440 |         # That's because we're going to encode it into ISO-Latin-1, and use
441 |         # that to test.
442 |         iso_latin_html = unicode_html.encode("iso-8859-1")
443 | 
444 |         # Parse the ISO-Latin-1 HTML.
445 |         soup = self.soup(iso_latin_html)
446 |         # Encode it to UTF-8.
447 |         result = soup.encode("utf-8")
448 | 
449 |         # What do we expect the result to look like? Well, it would
450 |         # look like unicode_html, except that the META tag would say
451 |         # UTF-8 instead of ISO-Latin-1.
452 |         expected = unicode_html.replace("ISO-Latin-1", "utf-8")
453 | 
454 |         # And, of course, it would be in UTF-8, not Unicode.
455 |         expected = expected.encode("utf-8")
456 | 
457 |         # Ta-da!
458 |         self.assertEqual(result, expected)
459 | 
460 |     def test_real_shift_jis_document(self):
461 |         # Smoke test to make sure the parser can handle a document in
462 |         # Shift-JIS encoding, without choking.
463 |         shift_jis_html = (
464 |             b'<html><head></head><body><pre>'
465 |             b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
466 |             b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
467 |             b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
468 |             b'</pre></body></html>')
469 |         unicode_html = shift_jis_html.decode("shift-jis")
470 |         soup = self.soup(unicode_html)
471 | 
472 |         # Make sure the parse tree is correctly encoded to various
473 |         # encodings.
474 |         self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
475 |         self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
476 | 
477 |     def test_real_hebrew_document(self):
478 |         # A real-world test to make sure we can convert ISO-8859-8 (a
479 |         # Hebrew encoding) to UTF-8.
480 |         hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
481 |         soup = self.soup(
482 |             hebrew_document, from_encoding="iso8859-8")
483 |         self.assertEqual(soup.original_encoding, 'iso8859-8')
484 |         self.assertEqual(
485 |             soup.encode('utf-8'),
486 |             hebrew_document.decode("iso8859-8").encode("utf-8"))
487 | 
488 |     def test_meta_tag_reflects_current_encoding(self):
489 |         # Here's the <meta> tag saying that a document is
490 |         # encoded in Shift-JIS.
491 |         meta_tag = ('<meta content="text/html; charset=x-sjis" '
492 |                     'http-equiv="Content-type"/>')
493 | 
494 |         # Here's a document incorporating that meta tag.
495 |         shift_jis_html = (
496 |             '<html><head>\n%s\n'
497 |             '<meta http-equiv="Content-language" content="ja"/>'
498 |             '</head><body>Shift-JIS markup goes here.') % meta_tag
499 |         soup = self.soup(shift_jis_html)
500 | 
501 |         # Parse the document, and the charset is seemingly unaffected.
502 |         parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
503 |         content = parsed_meta['content']
504 |         self.assertEqual('text/html; charset=x-sjis', content)
505 | 
506 |         # But that value is actually a ContentMetaAttributeValue object.
507 |         self.assertTrue(isinstance(content, ContentMetaAttributeValue))
508 | 
509 |         # And it will take on a value that reflects its current
510 |         # encoding.
511 |         self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
512 | 
513 |         # For the rest of the story, see TestSubstitutions in
514 |         # test_tree.py.
515 | 
516 |     def test_html5_style_meta_tag_reflects_current_encoding(self):
517 |         # Here's the <meta> tag saying that a document is
518 |         # encoded in Shift-JIS.
519 |         meta_tag = ('<meta id="encoding" charset="x-sjis" />')
520 | 
521 |         # Here's a document incorporating that meta tag.
522 |         shift_jis_html = (
523 |             '<html><head>\n%s\n'
524 |             '<meta http-equiv="Content-language" content="ja"/>'
525 |             '</head><body>Shift-JIS markup goes here.') % meta_tag
526 |         soup = self.soup(shift_jis_html)
527 | 
528 |         # Parse the document, and the charset is seemingly unaffected.
529 |         parsed_meta = soup.find('meta', id="encoding")
530 |         charset = parsed_meta['charset']
531 |         self.assertEqual('x-sjis', charset)
532 | 
533 |         # But that value is actually a CharsetMetaAttributeValue object.
534 |         self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
535 | 
536 |         # And it will take on a value that reflects its current
537 |         # encoding.
538 |         self.assertEqual('utf8', charset.encode("utf8"))
539 | 
540 |     def test_tag_with_no_attributes_can_have_attributes_added(self):
541 |         data = self.soup("<a>text</a>")
542 |         data.a['foo'] = 'bar'
543 |         self.assertEqual('<a foo="bar">text</a>', data.a.decode())
544 | 
545 | class XMLTreeBuilderSmokeTest(object):
546 | 
547 |     def test_pickle_and_unpickle_identity(self):
548 |         # Pickling a tree, then unpickling it, yields a tree identical
549 |         # to the original.
550 |         tree = self.soup("<a><b>foo</a>")
551 |         dumped = pickle.dumps(tree, 2)
552 |         loaded = pickle.loads(dumped)
553 |         self.assertEqual(loaded.__class__, BeautifulSoup)
554 |         self.assertEqual(loaded.decode(), tree.decode())
555 | 
556 |     def test_docstring_generated(self):
557 |         soup = self.soup("<root/>")
558 |         self.assertEqual(
559 |             soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
560 | 
561 |     def test_xml_declaration(self):
562 |         markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
563 |         soup = self.soup(markup)
564 |         self.assertEqual(markup, soup.encode("utf8"))
565 | 
566 |     def test_real_xhtml_document(self):
567 |         """A real XHTML document should come out *exactly* the same as it went in."""
568 |         markup = b"""<?xml version="1.0" encoding="utf-8"?>
569 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
570 | <html xmlns="http://www.w3.org/1999/xhtml">
571 | <head><title>Hello.</title></head>
572 | <body>Goodbye.</body>
573 | </html>"""
574 |         soup = self.soup(markup)
575 |         self.assertEqual(
576 |             soup.encode("utf-8"), markup)
577 | 
578 |     def test_formatter_processes_script_tag_for_xml_documents(self):
579 |         doc = """
580 |   <script type="text/javascript">
581 |   </script>
582 | """
583 |         soup = BeautifulSoup(doc, "lxml-xml")
584 |         # lxml would have stripped this while parsing, but we can add
585 |         # it later.
586 |         soup.script.string = 'console.log("< < hey > > ");'
587 |         encoded = soup.encode()
588 |         self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
589 | 
590 |     def test_can_parse_unicode_document(self):
591 |         markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
592 |         soup = self.soup(markup)
593 |         self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
594 | 
595 |     def test_popping_namespaced_tag(self):
596 |         markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><data>d</data></rss>'
597 |         soup = self.soup(markup)
598 |         self.assertEqual(
599 |             unicode(soup.rss), markup)
600 | 
601 |     def test_docstring_includes_correct_encoding(self):
602 |         soup = self.soup("<root/>")
603 |         self.assertEqual(
604 |             soup.encode("latin1"),
605 |             b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
606 | 
607 |     def test_large_xml_document(self):
608 |         """A large XML document should come out the same as it went in."""
609 |         markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
610 |                   + b'0' * (2**12)
611 |                   + b'</root>')
612 |         soup = self.soup(markup)
613 |         self.assertEqual(soup.encode("utf-8"), markup)
614 | 
615 | 
616 |     def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
617 |         self.assertSoupEquals("<p>", "<p/>")
") 618 | self.assertSoupEquals("

foo

") 619 | 620 | def test_namespaces_are_preserved(self): 621 | markup = 'This tag is in the a namespaceThis tag is in the b namespace' 622 | soup = self.soup(markup) 623 | root = soup.root 624 | self.assertEqual("http://example.com/", root['xmlns:a']) 625 | self.assertEqual("http://example.net/", root['xmlns:b']) 626 | 627 | def test_closing_namespaced_tag(self): 628 | markup = '

20010504

' 629 | soup = self.soup(markup) 630 | self.assertEqual(unicode(soup.p), markup) 631 | 632 | def test_namespaced_attributes(self): 633 | markup = '' 634 | soup = self.soup(markup) 635 | self.assertEqual(unicode(soup.foo), markup) 636 | 637 | def test_namespaced_attributes_xml_namespace(self): 638 | markup = 'bar' 639 | soup = self.soup(markup) 640 | self.assertEqual(unicode(soup.foo), markup) 641 | 642 | class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): 643 | """Smoke test for a tree builder that supports HTML5.""" 644 | 645 | def test_real_xhtml_document(self): 646 | # Since XHTML is not HTML5, HTML5 parsers are not tested to handle 647 | # XHTML documents in any particular way. 648 | pass 649 | 650 | def test_html_tags_have_namespace(self): 651 | markup = "" 652 | soup = self.soup(markup) 653 | self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) 654 | 655 | def test_svg_tags_have_namespace(self): 656 | markup = '' 657 | soup = self.soup(markup) 658 | namespace = "http://www.w3.org/2000/svg" 659 | self.assertEqual(namespace, soup.svg.namespace) 660 | self.assertEqual(namespace, soup.circle.namespace) 661 | 662 | 663 | def test_mathml_tags_have_namespace(self): 664 | markup = '5' 665 | soup = self.soup(markup) 666 | namespace = 'http://www.w3.org/1998/Math/MathML' 667 | self.assertEqual(namespace, soup.math.namespace) 668 | self.assertEqual(namespace, soup.msqrt.namespace) 669 | 670 | def test_xml_declaration_becomes_comment(self): 671 | markup = '' 672 | soup = self.soup(markup) 673 | self.assertTrue(isinstance(soup.contents[0], Comment)) 674 | self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?') 675 | self.assertEqual("html", soup.contents[0].next_element.name) 676 | 677 | def skipIf(condition, reason): 678 | def nothing(test, *args, **kwargs): 679 | return None 680 | 681 | def decorator(test_item): 682 | if condition: 683 | return nothing 684 | else: 685 | return test_item 686 | 687 | return decorator 688 | --------------------------------------------------------------------------------