├── .gitmodules ├── MANIFEST.in ├── .gitignore ├── tox.ini ├── README.md ├── rust-glue ├── Cargo.toml ├── examples │ └── time_parse_stdin.rs ├── Cargo.lock └── lib.rs ├── Makefile ├── COPYRIGHT ├── setup.py ├── tests ├── api.py └── tree_construction.py ├── benchmarks ├── results-cpython2 ├── results-cpython3 ├── results-pypy └── run.py ├── LICENSE-MIT ├── setuptools_ext.py ├── html5ever ├── _build_ffi.py ├── elementtree.py └── __init__.py └── LICENSE-APACHE /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tests/html5lib-tests"] 2 | path = tests/html5lib-tests 3 | url = https://github.com/html5lib/html5lib-tests 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include rust-glue/Cargo.toml 2 | include rust-glue/Cargo.lock 3 | include rust-glue/lib.rs 4 | include setuptools_ext.py 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | *.egg-info 3 | *.egg 4 | *.pyc 5 | __pycache__/ 6 | .tox/ 7 | html5ever/_ffi.py 8 | target/ 9 | .cache/ 10 | .eggs/ 11 | profile 12 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py26, py27, py33, py34, pypy 3 | 4 | [testenv] 5 | deps = pytest 6 | commands = py.test 7 | passenv = *RUST* *CARGO* HOME 8 | 9 | [pytest] 10 | testpaths = tests 11 | norecursedirs = * 12 | python_files = *.py 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | html5ever-python 2 | ================ 3 | 4 | Python bindings for 
[html5ever](https://github.com/servo/html5ever), 5 | using [CFFI](http://cffi.readthedocs.org/en/latest/). 6 | 7 | [![No Maintenance Intended](http://unmaintained.tech/badge.svg)](http://unmaintained.tech/) 8 | -------------------------------------------------------------------------------- /rust-glue/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "html5ever-capi" 3 | version = "0.1.0" 4 | authors = ["Simon Sapin "] 5 | 6 | [lib] 7 | name = "html5ever_capi" 8 | path = "lib.rs" 9 | crate-type = ["dylib"] 10 | test = false 11 | 12 | [dependencies] 13 | html5ever = "0.2.4" 14 | string_cache = "0.1.12" 15 | tendril = "0.1.5" 16 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | VENV = ~/.virtualenvs/html5ever 2 | 3 | test: 4 | @${VENV}/bin/python setup.py -q develop 5 | @${VENV}/bin/py.test -q 6 | 7 | bench: 8 | @${VENV}/bin/python benchmarks/run.py | tee benchmarks/results-cpython3 9 | @${VENV}-py2/bin/python benchmarks/run.py | tee benchmarks/results-cpython2 10 | @${VENV}-pypy/bin/python benchmarks/run.py | tee benchmarks/results-pypy 11 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | This project is copyright 2015, The html5ever-python Project Developers 2 | (as shown by `git shortlog -se`). 3 | 4 | Licensed under the Apache License, Version 2.0 or the MIT license , at your option. Files in the project 7 | may not be copied, modified, or distributed except according to those terms. 
def test_parser_gc():
    """A Parser that is no longer referenced must let go of its document."""
    finalized = []

    class Canary(object):
        def __del__(self):
            finalized.append(True)

    parser = Parser()
    # The canary is referenced only through the parser's document, so it
    # is finalized only once that document has been released.
    parser._document.record_del = Canary()
    assert not finalized

    del parser
    gc.collect()
    assert finalized
html5lib to lxml: 0.446 MiB/s 17 | 18 | -------------------------------------------------------------------------------- /benchmarks/results-cpython3: -------------------------------------------------------------------------------- 1 | Python 3.4.3 (default, Sep 7 2015, 15:40:35) [GCC 5.2.0] 2 | rustc rustc 1.5.0-nightly (f0666b45d 2015-09-24) 3 | html5ever 0.2.5 4 | lxml (3, 4, 4, 0) 5 | libxml (2, 9, 2) 6 | htmllib 0.9999999 7 | 8 | HTML source SHA1: 6a2963fa1f15fe99884acdad8a895af4df8e8c01 9 | Best time of 3, parsing 5,598,887 bytes of HTML: 10 | 11 | html5ever to Rust RcDom: 16.801 MiB/s 12 | lxml.html: 25.107 MiB/s 13 | html5ever-python: 2.957 MiB/s 14 | html5ever-python to ElementTree: 2.839 MiB/s 15 | html5lib to ElementTree: 0.695 MiB/s 16 | html5lib to lxml: 0.462 MiB/s 17 | 18 | -------------------------------------------------------------------------------- /benchmarks/results-pypy: -------------------------------------------------------------------------------- 1 | Python 2.7.10 (f3ad1e1e1d62, Sep 07 2015, 21:46:51) [PyPy 2.6.1 with GCC 5.2.0] 2 | rustc rustc 1.5.0-nightly (f0666b45d 2015-09-24) 3 | html5ever 0.2.5 4 | lxml (3, 4, 4, 0) 5 | libxml (2, 9, 2) 6 | htmllib 0.9999999 7 | 8 | HTML source SHA1: 6a2963fa1f15fe99884acdad8a895af4df8e8c01 9 | Best time of 3, parsing 5,598,887 bytes of HTML: 10 | 11 | html5ever to Rust RcDom: 16.822 MiB/s 12 | lxml.html: 24.181 MiB/s 13 | html5ever-python: 8.832 MiB/s 14 | html5ever-python to ElementTree: 7.825 MiB/s 15 | html5lib to ElementTree: 1.888 MiB/s 16 | html5lib to lxml: 0.249 MiB/s 17 | 18 | -------------------------------------------------------------------------------- /rust-glue/examples/time_parse_stdin.rs: -------------------------------------------------------------------------------- 1 | #![feature(duration_span)] 2 | extern crate html5ever; 3 | extern crate tendril; 4 | 5 | use html5ever::{parse, one_input}; 6 | use html5ever::rcdom::RcDom; 7 | use tendril::StrTendril; 8 | use std::io::{stdin, 
Read}; 9 | use std::time::Duration; 10 | 11 | fn main() { 12 | let mut data = Vec::new(); 13 | stdin().read_to_end(&mut data).unwrap(); 14 | let d = (0..3).map(|_| Duration::span(|| { 15 | let data = StrTendril::from_slice(&String::from_utf8_lossy(&data)); 16 | let _dom: RcDom = parse(one_input(data), Default::default()); 17 | })).min().unwrap(); 18 | println!("{}.{:09}", d.as_secs(), d.subsec_nanos()); 19 | } 20 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 The html5ever Project Developers 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 
import os.path
import shutil
import subprocess
import sys

try:
    from distutils import log
except ImportError:
    # distutils is no longer part of the standard library as of
    # Python 3.12 (PEP 632). A stdlib logger provides the same ``info()``
    # call that is used below.
    import logging
    log = logging.getLogger(__name__)

try:
    basestring
except NameError:
    # Python 3.x
    basestring = str


# File name suffix of dynamic libraries on the current platform.
if sys.platform == 'win32':
    DYNAMIC_LIB_SUFFIX = '.dll'
elif sys.platform == 'darwin':
    DYNAMIC_LIB_SUFFIX = '.dylib'
else:
    DYNAMIC_LIB_SUFFIX = '.so'


def rust_crates(dist, attr, value):
    """Build Rust crates with Cargo and copy their dynamic libraries.

    Handler for the ``rust_crates`` keyword of ``setup()``.

    :param dist: The distribution being set up. Its ``is_pure`` attribute
        is replaced so that it reports a non-pure distribution.
    :param attr: Name of the keyword; must be ``'rust_crates'``.
    :param value: Iterable of ``(crate_dir, destination_dir)`` pairs.
        ``crate_dir`` contains the ``Cargo.toml`` to build and
        ``destination_dir`` receives the built dynamic libraries.
    :raises ValueError: If ``value`` is a bare string rather than an
        iterable of pairs.
    :raises AssertionError: If ``attr`` is not ``'rust_crates'`` or if a
        build produced no dynamic library.
    :raises subprocess.CalledProcessError: If ``cargo build`` fails.
    """
    assert attr == 'rust_crates'
    if isinstance(value, basestring):
        # A string cannot be split into a (crate, destination) pair in any
        # meaningful way. Fail with a clear message rather than with an
        # obscure unpacking error (or, for a two-character string, a
        # silent mis-split into two one-letter paths).
        raise ValueError(
            'rust_crates must be a list of (crate_dir, destination_dir) '
            'pairs, got the string %r' % (value,))

    release = True
    profile = 'release' if release else 'debug'

    for crate, destination in value:
        args = ['cargo', 'build', '--manifest-path', os.path.join(crate, 'Cargo.toml')]
        if release:
            args.append('--release')
        log.info(' '.join(args))
        subprocess.check_call(args)

        target = os.path.join(crate, 'target', profile)
        libs = [name for name in os.listdir(target) if name.endswith(DYNAMIC_LIB_SUFFIX)]
        if not libs:
            # Raised explicitly (instead of a bare ``assert``) so that the
            # check is not stripped when Python runs with -O.
            raise AssertionError(
                'no *%s file found in %s' % (DYNAMIC_LIB_SUFFIX, target))
        for lib in libs:
            shutil.copy(os.path.join(target, lib), os.path.join(destination, lib))

    # Tell bdist_wheel to include the CPU architecture in the wheel file name.
    # FIXME: Can we do that but *not* include the Python version/implementation?
    dist.is_pure = lambda: False
Parser; 11 | 12 | typedef struct { 13 | uint8_t* ptr; 14 | uintptr_t len; 15 | } BytesSlice; 16 | 17 | typedef BytesSlice Utf8Slice; 18 | 19 | Callbacks* declare_callbacks( 20 | Node* (*clone_node_ref)(ParserUserData*, Node*), 21 | int (*destroy_node_ref)(ParserUserData*, Node*), 22 | int (*same_node)(ParserUserData*, Node*, Node*), 23 | int (*parse_error)(ParserUserData*, Utf8Slice), 24 | 25 | Node* (*create_element)(ParserUserData*, Utf8Slice, Utf8Slice), 26 | Node* (*get_template_contents)(ParserUserData*, Node*), 27 | int (*add_attribute_if_missing)(ParserUserData*, Node*, Utf8Slice, Utf8Slice, Utf8Slice), 28 | Node* (*create_comment)(ParserUserData*, Utf8Slice), 29 | int (*append_doctype_to_document)(ParserUserData*, uintptr_t, Utf8Slice, Utf8Slice, Utf8Slice), 30 | 31 | int (*append_node)(ParserUserData*, Node*, Node*), 32 | int (*append_text)(ParserUserData*, Node*, Utf8Slice), 33 | int (*insert_node_before_sibling)(ParserUserData*, Node*, Node*), 34 | int (*insert_text_before_sibling)(ParserUserData*, Node*, Utf8Slice), 35 | int (*reparent_children)(ParserUserData*, Node*, Node*), 36 | int (*remove_from_parent)(ParserUserData*, Node*) 37 | ); 38 | 39 | Parser* new_parser(Callbacks*, ParserUserData*, Node*); 40 | int destroy_parser(Parser*); 41 | int feed_parser(Parser*, BytesSlice); 42 | int end_parser(Parser*); 43 | 44 | ''') 45 | 46 | if __name__ == '__main__': 47 | ffi.compile() 48 | -------------------------------------------------------------------------------- /benchmarks/run.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import html5ever.elementtree 3 | import html5lib 4 | import lxml.html 5 | import os.path 6 | import re 7 | import subprocess 8 | import sys 9 | import timeit 10 | try: 11 | from urllib.request import urlopen # Python 3.x 12 | except ImportError: 13 | from urllib import urlopen 14 | 15 | 16 | def run(url, quick=False): 17 | html = urlopen(url).read() 18 | bytes = len(html) 
def html5ever_version(root):
    """Return the html5ever version recorded in ``rust-glue/Cargo.lock``.

    :param root: Directory that contains the ``rust-glue`` directory.
    :returns: The version as text, e.g. ``'0.2.5'``.
    :raises ValueError: If the lock file has no ``html5ever <version>``
        entry.
    """
    lock_path = os.path.join(root, 'rust-glue', 'Cargo.lock')
    with open(lock_path, 'rb') as fd:
        # Raw bytes literal: ``\d`` is not a valid escape sequence in a
        # plain bytes literal. ``br`` is accepted by Python 2 and 3 alike.
        match = re.search(br'html5ever ([\d.]+)', fd.read())
    if match is None:
        raise ValueError('no html5ever version found in %s' % lock_path)
    return match.group(1).decode('utf8')
def qname(namespace_url, local_name):
    """Return the tag name for an element: ``{namespace_url}local_name``.

    When ``namespace_url`` is empty or ``None`` the local name is returned
    unchanged.
    """
    if not namespace_url:
        return local_name
    return '{%s}%s' % (namespace_url, local_name)