foobar
", "foobar
") 44 | self.assertSoupEquals( 45 | "foobar
", "foobar
") 46 | self.assertSoupEquals( 47 | "foobar
", "foobar
") 48 | 49 | # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this 50 | # test if an old version of lxml is installed. 51 | 52 | @skipIf( 53 | not LXML_PRESENT or LXML_VERSION < (2,3,5,0), 54 | "Skipping doctype test for old version of lxml to avoid segfault.") 55 | def test_empty_doctype(self): 56 | soup = self.soup("") 57 | doctype = soup.contents[0] 58 | self.assertEqual("", doctype.strip()) 59 | 60 | def test_beautifulstonesoup_is_xml_parser(self): 61 | # Make sure that the deprecated BSS class uses an xml builder 62 | # if one is installed. 63 | with warnings.catch_warnings(record=True) as w: 64 | soup = BeautifulStoneSoup("") 65 | self.assertEqual(u"", unicode(soup.b)) 66 | self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) 67 | 68 | @skipIf( 69 | not LXML_PRESENT, 70 | "lxml seems not to be present, not testing its XML tree builder.") 71 | class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): 72 | """See ``HTMLTreeBuilderSmokeTest``.""" 73 | 74 | @property 75 | def default_builder(self): 76 | return LXMLTreeBuilderForXML() 77 | -------------------------------------------------------------------------------- /bs4/tests/test_html5lib.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html5lib tree builder generates good trees.""" 2 | 3 | import warnings 4 | 5 | try: 6 | from bs4.builder import HTML5TreeBuilder 7 | HTML5LIB_PRESENT = True 8 | except ImportError, e: 9 | HTML5LIB_PRESENT = False 10 | from bs4.element import SoupStrainer 11 | from bs4.testing import ( 12 | HTML5TreeBuilderSmokeTest, 13 | SoupTest, 14 | skipIf, 15 | ) 16 | 17 | @skipIf( 18 | not HTML5LIB_PRESENT, 19 | "html5lib seems not to be present, not testing its tree builder.") 20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): 21 | """See ``HTML5TreeBuilderSmokeTest``.""" 22 | 23 | @property 24 | def default_builder(self): 25 | return 
HTML5TreeBuilder() 26 | 27 | def test_soupstrainer(self): 28 | # The html5lib tree builder does not support SoupStrainers. 29 | strainer = SoupStrainer("b") 30 | markup = "A bold statement.
" 31 | with warnings.catch_warnings(record=True) as w: 32 | soup = self.soup(markup, parse_only=strainer) 33 | self.assertEqual( 34 | soup.decode(), self.document_for(markup)) 35 | 36 | self.assertTrue( 37 | "the html5lib tree builder doesn't support parse_only" in 38 | str(w[0].message)) 39 | 40 | def test_correctly_nested_tables(self): 41 | """html5lib inserts tags where other parsers don't.""" 42 | markup = ('Here's another table:"
45 | '
| ')
48 |
49 | self.assertSoupEquals(
50 | markup,
51 | '
Here\'s another table:'
52 | '
|
| Foo |
| Bar |
| Baz |
foo
68 | 69 | ''' 70 | soup = self.soup(markup) 71 | # Verify that we can reach thetag; this means the tree is connected. 72 | self.assertEqual(b"
foo
", soup.p.encode()) 73 | 74 | def test_reparented_markup(self): 75 | markup = 'foo
\n' 76 | soup = self.soup(markup) 77 | self.assertEqual(u"foo
\n", soup.body.decode()) 78 | self.assertEqual(2, len(soup.find_all('p'))) 79 | 80 | 81 | def test_reparented_markup_ends_with_whitespace(self): 82 | markup = 'foo
\n\n' 83 | soup = self.soup(markup) 84 | self.assertEqual(u"foo
import os.path
import re

# Tone-mark lookup table. Row 0 lists the plain vowels; rows 1-4 hold the
# same vowels, in the same column order, carrying tone marks 1 to 4.
# ``decode_pinyin`` finds a vowel's column in row 0 and reads the marked
# form from the row of the requested tone.
PinyinToneMark = {
    0: u"aoeiuv\u00fc",
    1: u"\u0101\u014d\u0113\u012b\u016b\u01d6\u01d6",
    2: u"\u00e1\u00f3\u00e9\u00ed\u00fa\u01d8\u01d8",
    3: u"\u01ce\u01d2\u011b\u01d0\u01d4\u01da\u01da",
    4: u"\u00e0\u00f2\u00e8\u00ec\u00f9\u01dc\u01dc",
}


class Pinyin(object):

    """Translate Chinese hanzi to pinyin, inspired by flyerhzm's
    `chinese_pinyin`_ gem.

    usage
    -----
    ::

        >>> from xpinyin import Pinyin
        >>> p = Pinyin()
        >>> # default splitter is `-`
        >>> p.get_pinyin(u"上海")
        'shang-hai'
        >>> # show tone marks
        >>> p.get_pinyin(u"上海", show_tone_marks=True)
        'shàng-hǎi'
        >>> # remove splitter
        >>> p.get_pinyin(u"上海", '')
        'shanghai'
        >>> # set splitter as whitespace
        >>> p.get_pinyin(u"上海", ' ')
        'shang hai'
        >>> p.get_initial(u"上")
        'S'
        >>> p.get_initials(u"上海")
        'S-H'
        >>> p.get_initials(u"上海", u'')
        'SH'
        >>> p.get_initials(u"上海", u' ')
        'S H'

    Please pass in UTF-8 encoded Chinese characters.

    .. _chinese_pinyin: https://github.com/flyerhzm/chinese_pinyin
    """

    # Default location of the pronunciation table, next to this module.
    data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             'Mandarin.dat')

    def __init__(self, data_path=data_path):
        """Load the pronunciation table.

        :param data_path: path to a text file in which every line is
            ``KEY<TAB>READINGS``. ``KEY`` is looked up later as the
            upper-case hexadecimal code point of a character
            (``"%X" % ord(char)``).
        """
        self.dict = {}
        with open(data_path) as f:
            for line in f:
                k, v = line.split('\t')
                self.dict[k] = v

    @staticmethod
    def decode_pinyin(s):
        """Convert numbered pinyin (``"shang4"``) to tone-marked pinyin.

        Letters are accumulated into the current syllable; ``:`` turns a
        preceding ``u`` into ``\\u00fc``; a digit ``0``-``5`` closes the
        syllable and, for tones 1-4, places the tone mark on a vowel. Any
        other character just closes the current syllable and is dropped.

        :param s: numbered pinyin; it is lower-cased first
        :returns: the tone-marked string
        """
        s = s.lower()
        r = u""
        t = u""
        for c in s:
            if u"a" <= c <= u"z":
                t += c
            elif c == u":":
                # "u:" is the ASCII spelling of u-umlaut.
                assert t[-1] == u"u"
                t = t[:-1] + u"\u00fc"
            else:
                if u"0" <= c <= u"5":
                    # Tones 0 and 5 both map to 0, i.e. no mark.
                    tone = int(c) % 5
                    if tone != 0:
                        m = re.search(u"[aoeiuv\u00fc]+", t)
                        if m is None:
                            # No vowel seen yet: keep the digit as-is.
                            t += c
                        elif len(m.group(0)) == 1:
                            # Exactly one vowel: the mark goes on it.
                            column = PinyinToneMark[0].index(m.group(0))
                            t = (t[:m.start(0)]
                                 + PinyinToneMark[tone][column]
                                 + t[m.end(0):])
                        else:
                            # Several vowels: prefer "a", then "o", then
                            # "e"; for "ui"/"iu" mark the second vowel.
                            # ``num`` doubles as the column of the vowel
                            # that receives the mark.
                            for num, vowels in enumerate(
                                    (u"a", u"o", u"e", u"ui", u"iu")):
                                if vowels in t:
                                    t = t.replace(
                                        vowels[-1],
                                        PinyinToneMark[tone][num])
                                    break
                            else:
                                # NOTE(review): reconstructed as the
                                # loop's ``else`` (no candidate matched);
                                # confirm against the upstream source.
                                t += u"!"
                r += t
                t = u""
        r += t
        return r

    @staticmethod
    def convert_pinyin(word, convert):
        """Apply the requested case conversion to ``word``.

        :param convert: ``'capitalize'``, ``'lower'`` or ``'upper'``
        :returns: the converted word, or ``None`` for any other value of
            ``convert``
        """
        if convert == 'capitalize':
            return word.capitalize()
        if convert == 'lower':
            return word.lower()
        if convert == 'upper':
            return word.upper()

    def _join_runs(self, chars, splitter, render):
        """Render known characters and join the pieces with ``splitter``.

        Characters found in ``self.dict`` become one piece each, produced
        by ``render(entry)``. Consecutive characters that are *not* in the
        table are kept together as a single verbatim piece.
        """
        pieces = []
        in_unknown_run = False
        for char in chars:
            try:
                entry = self.dict["%X" % ord(char)]
            except KeyError:
                if in_unknown_run:
                    pieces[-1] += char
                else:
                    pieces.append(char)
                in_unknown_run = True
            else:
                pieces.append(render(entry))
                in_unknown_run = False
        return splitter.join(pieces)

    def get_pinyin(self, chars=u'你好', splitter=u'-',
                   show_tone_marks=False, convert='lower'):
        """Return the pinyin of ``chars``, one piece per character.

        :param chars: text to transliterate
        :param splitter: string placed between pieces
        :param show_tone_marks: if true, decode the first reading with
            :meth:`decode_pinyin`; otherwise drop the reading's final
            character (presumably the tone digit)
        :param convert: case conversion, see :meth:`convert_pinyin`
        :returns: the joined string; runs of characters missing from the
            table are passed through unchanged as one piece
        """
        def render(entry):
            reading = entry.split()[0].strip()
            if show_tone_marks:
                word = self.decode_pinyin(reading)
            else:
                word = reading[:-1]
            return self.convert_pinyin(word, convert)

        return self._join_runs(chars, splitter, render)

    def get_initial(self, char=u'你'):
        """Return the first letter of the first reading of ``char``.

        A character missing from the table is returned unchanged.
        """
        try:
            return self.dict["%X" % ord(char)].split(" ")[0][0]
        except KeyError:
            return char

    def get_initials(self, chars=u'你好', splitter=u'-'):
        """Return the initials of ``chars`` joined with ``splitter``.

        Runs of characters missing from the table are kept together as one
        piece, exactly as :meth:`get_pinyin` does. (The previous
        implementation never left its "new piece" state, so every unknown
        character became a piece of its own.)
        """
        return self._join_runs(
            chars, splitter, lambda entry: entry.split(" ")[0][0])
False 18 | 19 | try: 20 | from bs4.builder import ( 21 | LXMLTreeBuilderForXML, 22 | LXMLTreeBuilder, 23 | ) 24 | LXML_PRESENT = True 25 | except ImportError: 26 | LXML_PRESENT = False 27 | 28 | 29 | class BuiltInRegistryTest(unittest.TestCase): 30 | """Test the built-in registry with the default builders registered.""" 31 | 32 | def test_combination(self): 33 | if LXML_PRESENT: 34 | self.assertEqual(registry.lookup('fast', 'html'), 35 | LXMLTreeBuilder) 36 | 37 | if LXML_PRESENT: 38 | self.assertEqual(registry.lookup('permissive', 'xml'), 39 | LXMLTreeBuilderForXML) 40 | self.assertEqual(registry.lookup('strict', 'html'), 41 | HTMLParserTreeBuilder) 42 | if HTML5LIB_PRESENT: 43 | self.assertEqual(registry.lookup('html5lib', 'html'), 44 | HTML5TreeBuilder) 45 | 46 | def test_lookup_by_markup_type(self): 47 | if LXML_PRESENT: 48 | self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) 49 | self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) 50 | else: 51 | self.assertEqual(registry.lookup('xml'), None) 52 | if HTML5LIB_PRESENT: 53 | self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) 54 | else: 55 | self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) 56 | 57 | def test_named_library(self): 58 | if LXML_PRESENT: 59 | self.assertEqual(registry.lookup('lxml', 'xml'), 60 | LXMLTreeBuilderForXML) 61 | self.assertEqual(registry.lookup('lxml', 'html'), 62 | LXMLTreeBuilder) 63 | if HTML5LIB_PRESENT: 64 | self.assertEqual(registry.lookup('html5lib'), 65 | HTML5TreeBuilder) 66 | 67 | self.assertEqual(registry.lookup('html.parser'), 68 | HTMLParserTreeBuilder) 69 | 70 | def test_beautifulsoup_constructor_does_lookup(self): 71 | 72 | with warnings.catch_warnings(record=True) as w: 73 | # This will create a warning about not explicitly 74 | # specifying a parser, but we'll ignore it. 75 | 76 | # You can pass in a string. 77 | BeautifulSoup("", features="html") 78 | # Or a list of strings. 
79 | BeautifulSoup("", features=["html", "fast"]) 80 | 81 | # You'll get an exception if BS can't find an appropriate 82 | # builder. 83 | self.assertRaises(ValueError, BeautifulSoup, 84 | "", features="no-such-feature") 85 | 86 | class RegistryTest(unittest.TestCase): 87 | """Test the TreeBuilderRegistry class in general.""" 88 | 89 | def setUp(self): 90 | self.registry = TreeBuilderRegistry() 91 | 92 | def builder_for_features(self, *feature_list): 93 | cls = type('Builder_' + '_'.join(feature_list), 94 | (object,), {'features' : feature_list}) 95 | 96 | self.registry.register(cls) 97 | return cls 98 | 99 | def test_register_with_no_features(self): 100 | builder = self.builder_for_features() 101 | 102 | # Since the builder advertises no features, you can't find it 103 | # by looking up features. 104 | self.assertEqual(self.registry.lookup('foo'), None) 105 | 106 | # But you can find it by doing a lookup with no features, if 107 | # this happens to be the only registered builder. 108 | self.assertEqual(self.registry.lookup(), builder) 109 | 110 | def test_register_with_features_makes_lookup_succeed(self): 111 | builder = self.builder_for_features('foo', 'bar') 112 | self.assertEqual(self.registry.lookup('foo'), builder) 113 | self.assertEqual(self.registry.lookup('bar'), builder) 114 | 115 | def test_lookup_fails_when_no_builder_implements_feature(self): 116 | builder = self.builder_for_features('foo', 'bar') 117 | self.assertEqual(self.registry.lookup('baz'), None) 118 | 119 | def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): 120 | builder1 = self.builder_for_features('foo') 121 | builder2 = self.builder_for_features('bar') 122 | self.assertEqual(self.registry.lookup(), builder2) 123 | 124 | def test_lookup_fails_when_no_tree_builders_registered(self): 125 | self.assertEqual(self.registry.lookup(), None) 126 | 127 | def test_lookup_gets_most_recent_builder_supporting_all_features(self): 128 | has_one = self.builder_for_features('foo') 
def diagnose(data):
    """Diagnostic suite for isolating common problems.

    Reports which of the basic parsers are registered, then parses the
    markup with each available one and prints the result.

    :param data: the markup to diagnose. May be an object with a ``read``
        method, the path of an existing file, or the markup itself. A
        string starting with ``http:`` or ``https:`` is reported as a URL
        and nothing is parsed.
    """
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    # Build the list of usable parsers instead of removing entries from the
    # list being iterated: removing during iteration skips the entry that
    # follows each removed one, so it would never be checked.
    basic_parsers = []
    for name in ["html.parser", "html5lib", "lxml"]:
        if any(name in builder.features
               for builder in builder_registry.builders):
            basic_parsers.append(name)
        else:
            print(
                "I noticed that %s is not installed. Installing it may help." %
                name)

    if 'lxml' in basic_parsers:
        # Also exercise lxml's XML mode.
        basic_parsers.append(["lxml", "xml"])
        try:
            from lxml import etree
            print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
        except ImportError:
            print(
                "lxml is not installed or couldn't be imported.")

    if 'html5lib' in basic_parsers:
        try:
            import html5lib
            print("Found html5lib version %s" % html5lib.__version__)
        except ImportError:
            print(
                "html5lib is not installed or couldn't be imported.")

    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print('"%s" looks like a filename. Reading data from the file.' % data)
        # Close the file deterministically instead of leaking the handle.
        with open(data) as markup_file:
            data = markup_file.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    print("")

    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        success = False
        try:
            soup = BeautifulSoup(data, parser)
            success = True
        except Exception:
            # Best-effort diagnostics: report the failure and move on to
            # the next parser.
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"


def rword(length=5):
    """Generate a random word-like string.

    Letters alternate consonant, vowel, consonant, ... starting with a
    consonant.

    :param length: number of letters in the word
    """
    return ''.join(
        random.choice(_consonants if i % 2 == 0 else _vowels)
        for i in range(length))


def rsentence(length=4):
    """Generate a random sentence-like string.

    :param length: number of words; each word has 4 to 9 letters
    """
    return " ".join(rword(random.randint(4,9)) for i in range(length))


def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    Each of the ``num_elements`` steps draws a number from 0 to 3 and emits
    an opening tag (0), a short run of random text (1), a closing tag (2)
    or nothing (3). Tags are opened and closed independently of each
    other, which is what makes the document invalid.

    :param num_elements: number of generation steps
    :returns: the elements joined by newlines inside ``<html>...</html>``
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0,3)
        if choice == 0:
            # New tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            elements.append(rsentence(random.randint(1,4)))
        elif choice == 2:
            # Close a tag. The "</" prefix was missing, so this branch
            # emitted text such as "p>" instead of a closing tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
    return "<html>" + "\n".join(elements) + "</html>"
def profile(num_elements=100000, parser="lxml"):
    """Profile Beautiful Soup parsing a randomly generated document.

    Generates a document with ``rdoc``, runs
    ``bs4.BeautifulSoup(data, parser)`` under ``cProfile``, and prints up
    to 50 entries, sorted by cumulative time, whose name matches
    ``_html5lib`` or ``bs4``.

    :param num_elements: passed to ``rdoc`` to size the document
    :param parser: parser name handed to ``BeautifulSoup``
    """
    data = rdoc(num_elements)
    namespace = dict(bs4=bs4, data=data, parser=parser)

    # The temporary file only carries the raw profile from cProfile to
    # pstats; the ``with`` block closes (and thereby removes) it once the
    # report is printed instead of leaving the handle open.
    with tempfile.NamedTemporaryFile() as filehandle:
        filename = filehandle.name
        cProfile.runctx(
            'bs4.BeautifulSoup(data, parser)', namespace, namespace, filename)

        stats = pstats.Stats(filename)
        stats.sort_stats("cumulative")
        stats.print_stats('_html5lib|bs4', 50)
"""Check if a process with PID ``pid`` exists 64 | 65 | :param pid: PID to check 66 | :type pid: ``int`` 67 | :returns: ``True`` if process exists, else ``False`` 68 | :rtype: ``Boolean`` 69 | """ 70 | 71 | try: 72 | os.kill(pid, 0) 73 | except OSError: # not running 74 | return False 75 | return True 76 | 77 | 78 | def is_running(name): 79 | """ 80 | Test whether task is running under ``name`` 81 | 82 | :param name: name of task 83 | :type name: ``unicode`` 84 | :returns: ``True`` if task with name ``name`` is running, else ``False`` 85 | :rtype: ``Boolean`` 86 | 87 | """ 88 | pidfile = _pid_file(name) 89 | if not os.path.exists(pidfile): 90 | return False 91 | 92 | with open(pidfile, 'rb') as file_obj: 93 | pid = int(file_obj.read().strip()) 94 | 95 | if _process_exists(pid): 96 | return True 97 | 98 | elif os.path.exists(pidfile): 99 | os.unlink(pidfile) 100 | 101 | return False 102 | 103 | 104 | def _background(stdin='/dev/null', stdout='/dev/null', 105 | stderr='/dev/null'): # pragma: no cover 106 | """Fork the current process into a background daemon. 107 | 108 | :param stdin: where to read input 109 | :type stdin: filepath 110 | :param stdout: where to write stdout output 111 | :type stdout: filepath 112 | :param stderr: where to write stderr output 113 | :type stderr: filepath 114 | 115 | """ 116 | 117 | # Do first fork. 118 | try: 119 | pid = os.fork() 120 | if pid > 0: 121 | sys.exit(0) # Exit first parent. 122 | except OSError as e: 123 | wf().logger.critical("fork #1 failed: ({0:d}) {1}".format( 124 | e.errno, e.strerror)) 125 | sys.exit(1) 126 | # Decouple from parent environment. 127 | os.chdir(wf().workflowdir) 128 | os.umask(0) 129 | os.setsid() 130 | # Do second fork. 131 | try: 132 | pid = os.fork() 133 | if pid > 0: 134 | sys.exit(0) # Exit second parent. 135 | except OSError as e: 136 | wf().logger.critical("fork #2 failed: ({0:d}) {1}".format( 137 | e.errno, e.strerror)) 138 | sys.exit(1) 139 | # Now I am a daemon! 
140 | # Redirect standard file descriptors. 141 | si = file(stdin, 'r', 0) 142 | so = file(stdout, 'a+', 0) 143 | se = file(stderr, 'a+', 0) 144 | if hasattr(sys.stdin, 'fileno'): 145 | os.dup2(si.fileno(), sys.stdin.fileno()) 146 | if hasattr(sys.stdout, 'fileno'): 147 | os.dup2(so.fileno(), sys.stdout.fileno()) 148 | if hasattr(sys.stderr, 'fileno'): 149 | os.dup2(se.fileno(), sys.stderr.fileno()) 150 | 151 | 152 | def run_in_background(name, args, **kwargs): 153 | """Pickle arguments to cache file, then call this script again via 154 | :func:`subprocess.call`. 155 | 156 | :param name: name of task 157 | :type name: ``unicode`` 158 | :param args: arguments passed as first argument to :func:`subprocess.call` 159 | :param \**kwargs: keyword arguments to :func:`subprocess.call` 160 | :returns: exit code of sub-process 161 | :rtype: ``int`` 162 | 163 | When you call this function, it caches its arguments and then calls 164 | ``background.py`` in a subprocess. The Python subprocess will load the 165 | cached arguments, fork into the background, and then run the command you 166 | specified. 167 | 168 | This function will return as soon as the ``background.py`` subprocess has 169 | forked, returning the exit code of *that* process (i.e. not of the command 170 | you're trying to run). 171 | 172 | If that process fails, an error will be written to the log file. 173 | 174 | If a process is already running under the same name, this function will 175 | return immediately and will not run the specified command. 
176 | 177 | """ 178 | 179 | if is_running(name): 180 | wf().logger.info('Task `{0}` is already running'.format(name)) 181 | return 182 | 183 | argcache = _arg_cache(name) 184 | 185 | # Cache arguments 186 | with open(argcache, 'wb') as file_obj: 187 | pickle.dump({'args': args, 'kwargs': kwargs}, file_obj) 188 | wf().logger.debug('Command arguments cached to `{0}`'.format(argcache)) 189 | 190 | # Call this script 191 | cmd = ['/usr/bin/python', __file__, name] 192 | wf().logger.debug('Calling {0!r} ...'.format(cmd)) 193 | retcode = subprocess.call(cmd) 194 | if retcode: # pragma: no cover 195 | wf().logger.error('Failed to call task in background') 196 | else: 197 | wf().logger.debug('Executing task `{0}` in background...'.format(name)) 198 | return retcode 199 | 200 | 201 | def main(wf): # pragma: no cover 202 | """ 203 | Load cached arguments, fork into background, then call 204 | :meth:`subprocess.call` with cached arguments 205 | 206 | """ 207 | 208 | name = wf.args[0] 209 | argcache = _arg_cache(name) 210 | if not os.path.exists(argcache): 211 | wf.logger.critical('No arg cache found : {0!r}'.format(argcache)) 212 | return 1 213 | 214 | # Load cached arguments 215 | with open(argcache, 'rb') as file_obj: 216 | data = pickle.load(file_obj) 217 | 218 | # Cached arguments 219 | args = data['args'] 220 | kwargs = data['kwargs'] 221 | 222 | # Delete argument cache file 223 | os.unlink(argcache) 224 | 225 | pidfile = _pid_file(name) 226 | 227 | # Fork to background 228 | _background() 229 | 230 | # Write PID to file 231 | with open(pidfile, 'wb') as file_obj: 232 | file_obj.write('{0}'.format(os.getpid())) 233 | 234 | # Run the command 235 | try: 236 | wf.logger.debug('Task `{0}` running'.format(name)) 237 | wf.logger.debug('cmd : {0!r}'.format(args)) 238 | 239 | retcode = subprocess.call(args, **kwargs) 240 | 241 | if retcode: 242 | wf.logger.error('Command failed with [{0}] : {1!r}'.format( 243 | retcode, args)) 244 | 245 | finally: 246 | if 
os.path.exists(pidfile): 247 | os.unlink(pidfile) 248 | wf.logger.debug('Task `{0}` finished'.format(name)) 249 | 250 | 251 | if __name__ == '__main__': # pragma: no cover 252 | wf().run(main) 253 | -------------------------------------------------------------------------------- /info.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 |tag to be 111 | an empty-element tag (it's not in 112 | HTMLBuilder.empty_element_tags). This means an empty
tag 113 | will be presented as "
", not "". 114 | 115 | The default implementation has no opinion about which tags are 116 | empty-element tags, so a tag will be presented as an 117 | empty-element tag if and only if it has no contents. 118 | "foo
' 97 | soup = self.soup(markup) 98 | return doctype, soup 99 | 100 | def test_normal_doctypes(self): 101 | """Make sure normal, everyday HTML doctypes are handled correctly.""" 102 | self.assertDoctypeHandled("html") 103 | self.assertDoctypeHandled( 104 | 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') 105 | 106 | def test_empty_doctype(self): 107 | soup = self.soup("") 108 | doctype = soup.contents[0] 109 | self.assertEqual("", doctype.strip()) 110 | 111 | def test_public_doctype_with_url(self): 112 | doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' 113 | self.assertDoctypeHandled(doctype) 114 | 115 | def test_system_doctype(self): 116 | self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') 117 | 118 | def test_namespaced_system_doctype(self): 119 | # We can handle a namespaced doctype with a system ID. 120 | self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') 121 | 122 | def test_namespaced_public_doctype(self): 123 | # Test a namespaced doctype with a public id. 124 | self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') 125 | 126 | def test_real_xhtml_document(self): 127 | """A real XHTML document should come out more or less the same as it went in.""" 128 | markup = b""" 129 | 130 | 131 |tag is never designated as an empty-element tag. 154 | 155 | Even if the markup shows it as an empty-element tag, it 156 | shouldn't be presented that way. 157 | """ 158 | soup = self.soup("
") 159 | self.assertFalse(soup.p.is_empty_element) 160 | self.assertEqual(str(soup.p), "") 161 | 162 | def test_unclosed_tags_get_closed(self): 163 | """A tag that's not closed by the end of the document should be closed. 164 | 165 | This applies to all tags except empty-element tags. 166 | """ 167 | self.assertSoupEquals("", "
") 168 | self.assertSoupEquals("", "") 169 | 170 | self.assertSoupEquals("foobaz
" 205 | self.assertSoupEquals(markup) 206 | 207 | soup = self.soup(markup) 208 | comment = soup.find(text="foobar") 209 | self.assertEqual(comment.__class__, Comment) 210 | 211 | # The comment is properly integrated into the tree. 212 | foo = soup.find(text="foo") 213 | self.assertEqual(comment, foo.next_element) 214 | baz = soup.find(text="baz") 215 | self.assertEqual(comment, baz.previous_element) 216 | 217 | def test_preserved_whitespace_in_pre_and_textarea(self): 218 | """Whitespace must be preserved inand