├── .gitignore
├── .gitmodules
├── .travis.yml
├── Cargo.toml
├── README.rst
├── glue.h
├── glue.pxd
├── glue.rs
├── htmlpyever.pyx
├── pytest.ini
├── setup.py
└── tests
├── conftest.py
└── xfail.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | Cargo.lock
3 | build
4 | target
5 | htmlpyever.egg-info
6 | dist
7 | .cache
8 | .eggs
9 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "tests/data"]
2 | path = tests/data
3 | url = https://github.com/html5lib/html5lib-tests
4 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | install:
3 | - pip install cython lxml
4 | - pip install -e .
5 | script: ./setup.py test
6 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "html5ever-glue"
3 | version = "0.1.0"
4 | authors = ["Theodore Dubois "]
5 |
6 | [lib]
7 | name = "html5ever_glue"
8 | path = "glue.rs"
9 | crate-type = ["staticlib"]
10 | test = false
11 |
12 | [dependencies]
13 | libc = "*"
14 | html5ever = "^0.13"
15 | html5ever-atoms = "^0.2"
16 | tendril = "^0.2.2"
17 | string_cache = "^0.4"
18 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | htmlpyever
2 | ==========
3 |
4 | htmlpyever is a very single-minded binding to html5ever. You can:
5 |
6 | * Feed the parser:
7 |
8 | .. code-block:: python
9 |
10 | parser.feed(b'hOI wURLD!')
11 |
12 | * Get a callback when the parser encounters a closing script tag:
13 |
14 | .. code-block:: python
15 |
16 | def script_callback(script):
17 | # handle script
18 | parser = htmlpyever.Parser(script_callback)
19 |
20 | # or
21 |
22 | class MyParser(htmlpyever.Parser):
23 | def run_script(self, script):
24 | # handle script
25 | parser = MyParser()
26 |
27 | * Obtain the result as an LXML ``Element`` or ``ElementTree``:
28 |
29 | .. code-block:: python
30 |
31 | from lxml import etree
32 | etree.tostring(parser.root)
33 | # >>> '
hOI! wURLD!'
34 | etree.tostring(parser.roottree)
35 | # >>> 'hOI! wURLD!'
36 | # not sure why the doctype doesn't show up in the serialized ElementTree
37 |
38 | That's it.
39 |
--------------------------------------------------------------------------------
/glue.h:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | typedef struct _BytesSlice { /* pointer + length view of a byte run; glue.rs builds it from a Rust slice */
4 | size_t len; /* number of bytes at ptr */
5 | const char *ptr; /* when passed to a callback, only valid for the duration of that call */
6 | } h5eBytes;
7 | typedef h5eBytes h5eUnicode; /* same layout; glue.rs fills it from a Rust str, so the bytes are UTF-8 */
8 |
9 | typedef void *node_t;
10 | typedef struct _Parser h5eParser;
11 |
12 | typedef struct { /* callback table given to new_parser; `data` is the pointer passed to new_parser. glue.rs treats a negative int or a NULL node_t result as failure */
13 | node_t (*clone_node_ref)(void *data, node_t node); /* optional (may be NULL): the same pointer is then reused */
14 | int (*destroy_node_ref)(void *data, node_t node); /* optional (may be NULL) */
15 | int (*same_node)(void *data, node_t node1, node_t node2); /* optional (may be NULL): pointers are then compared; nonzero means same node */
16 | int (*parse_error)(void *data, h5eUnicode error); /* optional (may be NULL) */
17 | int (*run_script)(void *data, node_t script); /* optional (may be NULL); called for each script node feed_parser gets back from the tokenizer */
18 | node_t (*create_element)(void *data, h5eUnicode ns, h5eUnicode name);
19 | node_t (*get_template_contents)(void *data, node_t node);
20 | int (*add_attribute_if_missing)(void *data, node_t node, h5eUnicode ns, h5eUnicode name, h5eUnicode value);
21 | node_t (*create_comment)(void *data, h5eUnicode text);
22 | int (*append_doctype_to_document)(void *data, h5eUnicode name, h5eUnicode public_id, h5eUnicode system_id);
23 | int (*append_node)(void *data, node_t parent, node_t child);
24 | int (*append_text)(void *data, node_t node, h5eUnicode text);
25 | int (*insert_node_before_sibling)(void *data, node_t sibling, node_t node); /* 0 = inserted; positive = not inserted (glue.rs hands the node back to html5ever) */
26 | int (*insert_text_before_sibling)(void *data, node_t sibling, h5eUnicode text); /* 0 = inserted; positive = not inserted */
27 | int (*reparent_children)(void *data, node_t node, node_t new_parent);
28 | int (*remove_from_parent)(void *data, node_t node);
29 | } h5eCallbacks;
30 |
31 | h5eParser* new_parser(h5eCallbacks *, void *data, node_t document, const char *frag_ctx_name, int scripting_enabled); /* frag_ctx_name may be NULL (no fragment context); returns NULL on failure */
32 | int destroy_parser(h5eParser *); /* returns 0, or -1 on failure */
33 | int feed_parser(h5eParser *, h5eBytes); /* returns 0, or -1 on failure (including a callback reporting an error) */
34 | int end_parser(h5eParser *); /* returns 0, or -1 on failure */
35 |
--------------------------------------------------------------------------------
/glue.pxd:
--------------------------------------------------------------------------------
1 | cdef extern from "glue.h":
2 | ctypedef struct h5eBytes:
3 | size_t len
4 | const char *ptr
5 | ctypedef h5eBytes h5eUnicode
6 |
7 | ctypedef struct h5eQualName:
8 | h5eUnicode ns
9 | h5eUnicode local
10 |
11 | ctypedef void *node_t
12 | ctypedef struct h5eParser:
13 | pass
14 |
15 | ctypedef struct h5eCallbacks:
16 | node_t (*clone_node_ref)(void *data, node_t node)
17 | int (*destroy_node_ref)(void *data, node_t node)
18 | int (*same_node)(void *data, node_t node1, node_t node2)
19 | int (*parse_error)(void *data, h5eUnicode error)
20 | int (*run_script)(void *data, node_t script)
21 | node_t (*create_element)(void *data, h5eUnicode ns, h5eUnicode name)
22 | node_t (*get_template_contents)(void *data, node_t node)
23 | int (*add_attribute_if_missing)(void *data, node_t node, h5eUnicode ns, h5eUnicode name, h5eUnicode value)
24 | node_t (*create_comment)(void *data, h5eUnicode text)
25 | int (*append_doctype_to_document)(void *data, h5eUnicode name, h5eUnicode public_id, h5eUnicode system_id)
26 | int (*append_node)(void *data, node_t parent, node_t child)
27 | int (*append_text)(void *data, node_t node, h5eUnicode text)
28 | int (*insert_node_before_sibling)(void *data, node_t sibling, node_t node)
29 | int (*insert_text_before_sibling)(void *data, node_t sibling, h5eUnicode text)
30 | int (*reparent_children)(void *data, node_t node, node_t new_parent)
31 | int (*remove_from_parent)(void *data, node_t node)
32 |
33 | h5eParser* new_parser(h5eCallbacks *, void *data, node_t document, const char *frag_ctx_name, int scripting_enabled)
34 | int destroy_parser(h5eParser *)
35 | # feed_parser and end_parser both drive the parser; if any of the callbacks threw an exception they return -1
36 | int feed_parser(h5eParser *, h5eBytes) except? -1
37 | int end_parser(h5eParser *) except? -1
38 |
--------------------------------------------------------------------------------
/glue.rs:
--------------------------------------------------------------------------------
1 | extern crate libc;
2 | extern crate html5ever;
3 | #[macro_use] extern crate html5ever_atoms;
4 | extern crate string_cache;
5 | extern crate tendril;
6 |
7 | use html5ever::tokenizer::{Tokenizer, TokenizerOpts, Attribute, TokenizerResult};
8 | use html5ever::tokenizer::buffer_queue::BufferQueue;
9 | use html5ever::tree_builder::{TreeBuilder, TreeBuilderOpts, TreeSink, QuirksMode, NodeOrText};
10 | use html5ever::QualName;
11 | use std::borrow::Cow;
12 | use std::slice;
13 | use std::mem;
14 | use std::ffi::CStr;
15 | use libc::{c_void, c_int, c_char, size_t};
16 | use std::panic::{catch_unwind, UnwindSafe};
17 | use tendril::StrTendril;
18 | use string_cache::atom::Atom;
19 |
20 | /// When given as a function parameter, only valid for the duration of the call.
21 | #[repr(C)]
22 | #[derive(Copy, Clone, Debug)]
23 | pub struct CBytes {
24 | len: size_t,
25 | ptr: *const u8,
26 | }
27 |
28 | impl CBytes {
29 | fn from_slice(slice: &[u8]) -> CBytes {
30 | CBytes {
31 | len: slice.len(),
32 | ptr: slice.as_ptr(),
33 | }
34 | }
35 |
36 | unsafe fn as_slice(&self) -> &[u8] {
37 | slice::from_raw_parts(self.ptr, self.len)
38 | }
39 | }
40 |
41 | /// When given as a function parameter, only valid for the duration of the call.
42 | #[repr(C)]
43 | #[derive(Copy, Clone, Debug)]
44 | pub struct CUnicode(CBytes);
45 |
46 | impl CUnicode {
47 | fn from_str(s: &str) -> CUnicode {
48 | CUnicode(CBytes::from_slice(s.as_bytes()))
49 | }
50 | }
51 |
52 | pub type OpaqueParserUserData = c_void;
53 | pub type OpaqueNode = c_void;
54 |
55 | struct NodeHandle {
56 | ptr: *const OpaqueNode,
57 | parser_user_data: *const OpaqueParserUserData,
58 | callbacks: &'static Callbacks,
59 | qualified_name: Option,
60 | }
61 |
62 | macro_rules! call {
63 | ($self_: expr, $callback: ident ( $( $arg: expr ),* )) => {
64 | ($self_.callbacks.$callback)($self_.parser_user_data, $( $arg ),* )
65 | };
66 | }
67 |
68 | macro_rules! call_if_some {
69 | ($self_: expr, $opt_callback: ident ( $( $arg: expr ),* )) => {
70 | call_if_some!($self_, $opt_callback( $( $arg ),* ) else 0)
71 | };
72 | ($self_: expr, $opt_callback: ident ( $( $arg: expr ),* ) else $default: expr) => {
73 | if let Some(callback) = $self_.callbacks.$opt_callback {
74 | callback($self_.parser_user_data, $( $arg ),* )
75 | } else {
76 | $default
77 | }
78 | };
79 | }
80 |
81 | impl Clone for NodeHandle {
82 | fn clone(&self) -> NodeHandle {
83 | NodeHandle {
84 | ptr: check_pointer(call_if_some!(self, clone_node_ref(self.ptr) else self.ptr)),
85 | parser_user_data: self.parser_user_data,
86 | callbacks: self.callbacks,
87 | qualified_name: self.qualified_name.clone(),
88 | }
89 | }
90 | }
91 |
92 | impl Drop for NodeHandle {
93 | fn drop(&mut self) {
94 | check_int(call_if_some!(self, destroy_node_ref(self.ptr)));
95 | }
96 | }
97 |
98 | struct CallbackTreeSink {
99 | parser_user_data: *const c_void,
100 | callbacks: &'static Callbacks,
101 | document: NodeHandle,
102 | quirks_mode: QuirksMode,
103 | }
104 |
105 | pub struct Parser {
106 | tokenizer: Tokenizer>
107 | }
108 |
109 | struct ParserMutPtr(*mut Parser);
110 |
111 | // FIXME: These make catch_unwind happy, but they are total lies as far as I know.
112 | impl UnwindSafe for CBytes {}
113 | impl UnwindSafe for ParserMutPtr {}
114 | impl UnwindSafe for Parser {}
115 |
116 | impl CallbackTreeSink {
117 | fn new_handle(&self, ptr: *const OpaqueNode) -> NodeHandle {
118 | NodeHandle {
119 | ptr: ptr,
120 | parser_user_data: self.parser_user_data,
121 | callbacks: self.callbacks,
122 | qualified_name: None,
123 | }
124 | }
125 |
126 | fn add_attributes_if_missing(&self, element: *const OpaqueNode, attributes: Vec) {
127 | for attribute in attributes {
128 | check_int(call!(self, add_attribute_if_missing(
129 | element,
130 | CUnicode::from_str(&attribute.name.ns),
131 | CUnicode::from_str(&attribute.name.local),
132 | CUnicode::from_str(&attribute.value))));
133 | }
134 | }
135 | }
136 |
137 | impl TreeSink for CallbackTreeSink {
138 | type Handle = NodeHandle;
139 | type Output = Self;
140 |
141 | fn finish(self) -> Self {
142 | self
143 | }
144 |
145 | fn parse_error(&mut self, msg: Cow<'static, str>) {
146 | check_int(call_if_some!(self, parse_error(CUnicode::from_str(&msg))));
147 | }
148 |
149 | fn get_document(&mut self) -> NodeHandle {
150 | self.document.clone()
151 | }
152 |
153 | fn get_template_contents(&mut self, target: NodeHandle) -> NodeHandle {
154 | self.new_handle(check_pointer(call!(self, get_template_contents(target.ptr))))
155 | }
156 |
157 | fn set_quirks_mode(&mut self, mode: QuirksMode) {
158 | self.quirks_mode = mode
159 | }
160 |
161 | fn same_node(&self, x: NodeHandle, y: NodeHandle) -> bool {
162 | check_int(call_if_some!(self, same_node(x.ptr, y.ptr) else (x.ptr == y.ptr) as c_int)) != 0
163 | }
164 |
165 | fn elem_name(&self, target: NodeHandle) -> QualName {
166 | target.qualified_name.as_ref().unwrap().clone()
167 | }
168 |
169 | fn create_element(&mut self, name: QualName, attrs: Vec) -> NodeHandle {
170 | let element = check_pointer(call!(self, create_element(
171 | CUnicode::from_str(&name.ns), CUnicode::from_str(&name.local))));
172 | self.add_attributes_if_missing(element, attrs);
173 | let mut handle = self.new_handle(element);
174 | handle.qualified_name = Some(name);
175 | handle
176 | }
177 |
178 | fn create_comment(&mut self, text: StrTendril) -> NodeHandle {
179 | self.new_handle(check_pointer(call!(
180 | self, create_comment(CUnicode::from_str(&text)))))
181 | }
182 |
183 | fn append(&mut self, parent: NodeHandle, child: NodeOrText) {
184 | check_int(match child {
185 | NodeOrText::AppendNode(node) => {
186 | call!(self, append_node(parent.ptr, node.ptr))
187 | }
188 | NodeOrText::AppendText(ref text) => {
189 | call!(self, append_text(parent.ptr, CUnicode::from_str(text)))
190 | }
191 | });
192 | }
193 |
194 | fn append_before_sibling(&mut self, sibling: NodeHandle, child: NodeOrText)
195 | -> Result<(), NodeOrText> {
196 | let result = check_int(match child {
197 | NodeOrText::AppendNode(ref node) => {
198 | call!(self, insert_node_before_sibling(sibling.ptr, node.ptr))
199 | }
200 | NodeOrText::AppendText(ref text) => {
201 | call!(self, insert_text_before_sibling(sibling.ptr, CUnicode::from_str(text)))
202 | }
203 | });
204 | if result == 0 {
205 | Ok(())
206 | } else {
207 | Err(child)
208 | }
209 | }
210 |
211 | fn append_doctype_to_document(&mut self,
212 | name: StrTendril,
213 | public_id: StrTendril,
214 | system_id: StrTendril) {
215 | check_int(call!(self, append_doctype_to_document(
216 | CUnicode::from_str(&name),
217 | CUnicode::from_str(&public_id),
218 | CUnicode::from_str(&system_id))));
219 | }
220 |
221 | fn add_attrs_if_missing(&mut self, target: NodeHandle, attrs: Vec) {
222 | self.add_attributes_if_missing(target.ptr, attrs)
223 | }
224 |
225 | fn remove_from_parent(&mut self, target: NodeHandle) {
226 | check_int(call!(self, remove_from_parent(target.ptr)));
227 | }
228 |
229 | fn reparent_children(&mut self, node: NodeHandle, new_parent: NodeHandle) {
230 | check_int(call!(self, reparent_children(node.ptr, new_parent.ptr)));
231 | }
232 |
233 | fn mark_script_already_started(&mut self, _target: NodeHandle) {}
234 | }
235 |
236 | macro_rules! declare_with_callbacks { // generates the `Callbacks` struct and `declare_callbacks()` from one field list
237 |     ($( $( #[$attr:meta] )* callback $name: ident: $ty: ty )+) => {
238 |         #[repr(C)] // C callers fill this in directly as `h5eCallbacks` (glue.h), so field order and layout must be the C ones
239 |         pub struct Callbacks {
240 |             $( $( #[$attr] )* $name: $ty, )+
241 |         }
242 |
243 |         /// Return a heap-allocated struct that lives forever,
244 |         /// containing the given function pointers.
245 |         ///
246 |         /// This leaks memory, but you normally only need one of these per program.
247 |         #[no_mangle]
248 |         pub unsafe extern "C" fn declare_callbacks($( $name: $ty ),+)
249 |                                                    -> Option<&'static Callbacks> {
250 |             catch_unwind_opt(move || {
251 |                 &*Box::into_raw(Box::new(Callbacks {
252 |                     $( $name: $name, )+
253 |                 }))
254 |             })
255 |         }
256 |     }
257 | }
258 |
259 | declare_with_callbacks! {
260 | /// Create and return a new reference to the given node.
261 | /// The returned pointer may be the same as the given one.
262 | /// If this callback is not provided, the same pointer is always used
263 | callback clone_node_ref: Option *const OpaqueNode>
265 |
266 | /// Destroy a reference to the given node.
267 | /// When all references are gone, the node itself can be destroyed.
268 | /// If this callback is not provided, references are leaked.
269 | callback destroy_node_ref: Option c_int>
271 |
272 | /// Return a positive value if the two given references are for the same node,
273 | /// zero for different nodes, and a negative value for an unexpected error.
274 | /// If this callback is not provided, pointer equality is used.
275 | callback same_node: Option c_int>
277 |
278 | /// Log an author conformance error.
279 | /// The pointer is guaranteed to point to the given size of well-formed UTF-8 bytes.
280 | /// The pointer can not be used after the end of this call.
281 | /// If this callback is not provided, author conformance errors are ignored.
282 | callback parse_error: Option c_int>
284 |
285 | /// Run a script.
286 | callback run_script: Option c_int>
288 |
289 | /// Create an element node with the given namespace URL and local name.
290 | ///
291 | /// If the element is a `template` element in the HTML namespace,
292 | /// an associated document fragment node should be created for the template contents.
293 | callback create_element: extern "C" fn(*const OpaqueParserUserData,
294 | CUnicode, CUnicode) -> *const OpaqueNode
295 |
296 | /// Return a reference to the document fragment node for the template contents.
297 | ///
298 | /// This is only ever called for `template` elements in the HTML namespace.
299 | callback get_template_contents: extern "C" fn(*const OpaqueParserUserData,
300 | *const OpaqueNode) -> *const OpaqueNode
301 |
302 | /// Add the attribute (given as namespace URL, local name, and value)
303 | /// to the given element node if the element doesn’t already have
304 | /// an attribute with that name in that namespace.
305 | callback add_attribute_if_missing: extern "C" fn(*const OpaqueParserUserData,
306 | *const OpaqueNode, CUnicode, CUnicode, CUnicode) -> c_int
307 |
308 | /// Create a comment node.
309 | callback create_comment: extern "C" fn(*const OpaqueParserUserData,
310 | CUnicode) -> *const OpaqueNode
311 |
312 | /// Create a doctype node and append it to the document.
313 | callback append_doctype_to_document: extern "C" fn(*const OpaqueParserUserData,
314 | CUnicode, CUnicode, CUnicode) -> c_int
315 |
316 | callback append_node: extern "C" fn(*const OpaqueParserUserData,
317 | *const OpaqueNode, *const OpaqueNode) -> c_int
318 |
319 | callback append_text: extern "C" fn(*const OpaqueParserUserData,
320 | *const OpaqueNode, CUnicode) -> c_int
321 |
322 | /// If `sibling` has a parent, insert the given node just before it and return zero.
323 | /// Otherwise, do nothing and return a positive value.
324 | callback insert_node_before_sibling: extern "C" fn(*const OpaqueParserUserData,
325 | *const OpaqueNode, *const OpaqueNode) -> c_int
326 |
327 | /// If `sibling` has a parent, insert the given text just before it and return zero.
328 | /// Otherwise, do nothing and return a positive value.
329 | callback insert_text_before_sibling: extern "C" fn(*const OpaqueParserUserData,
330 | *const OpaqueNode, CUnicode) -> c_int
331 |
332 | callback reparent_children: extern "C" fn(*const OpaqueParserUserData,
333 | *const OpaqueNode, *const OpaqueNode) -> c_int
334 |
335 | callback remove_from_parent: extern "C" fn(*const OpaqueParserUserData,
336 | *const OpaqueNode) -> c_int
337 | }
338 |
339 | #[no_mangle]
340 | pub unsafe extern "C" fn new_parser(callbacks: &'static Callbacks,
341 | data: *const OpaqueParserUserData,
342 | document: *const OpaqueNode,
343 | frag_ctx_name: *const c_char,
344 | scripting_enabled: c_int)
345 | -> Option> {
346 | // MUCH CODE
347 | // VERY BAD
348 | // CLEANUP PLS
349 | catch_unwind_opt(move || {
350 | let context_qualname = if frag_ctx_name.is_null() {
351 | None
352 | } else {
353 | Some(QualName {
354 | ns: ns!(html),
355 | local: Atom::from(CStr::from_ptr(frag_ctx_name).to_str().unwrap()),
356 | })
357 | };
358 | let mut sink = CallbackTreeSink {
359 | parser_user_data: data,
360 | callbacks: callbacks,
361 | document: NodeHandle {
362 | ptr: document,
363 | parser_user_data: data,
364 | callbacks: callbacks,
365 | qualified_name: None,
366 | },
367 | quirks_mode: QuirksMode::NoQuirks,
368 | };
369 | let tree_builder_options = TreeBuilderOpts {
370 | scripting_enabled: scripting_enabled != 0,
371 | ..Default::default()
372 | };
373 | let (tree_builder, initial_state) = match context_qualname {
374 | None => (TreeBuilder::new(sink, tree_builder_options), None),
375 | Some(qualname) => {
376 | let element = sink.create_element(qualname, Vec::new());
377 | let tree_builder = TreeBuilder::new_for_fragment(sink, element, None, tree_builder_options);
378 | let state = tree_builder.tokenizer_state_for_context_elem();
379 | (tree_builder, Some(state))
380 | },
381 | };
382 | let tokenizer_opts = TokenizerOpts {
383 | initial_state: initial_state,
384 | ..Default::default()
385 | };
386 | Box::new(Parser {
387 | tokenizer: Tokenizer::new(tree_builder, tokenizer_opts),
388 | })
389 | })
390 | }
391 |
392 | #[no_mangle]
393 | pub unsafe extern "C" fn feed_parser(parser: &mut Parser, chunk: CBytes) -> c_int { // feed one chunk; returns 0, or -1 if anything below panicked
394 | let parser = ParserMutPtr(parser); // wrapper type declared UnwindSafe above, so the closure is accepted by catch_unwind
395 | catch_unwind_int(move || {
396 | let parser = &mut *parser.0;
397 | // FIXME: Support UTF-8 byte sequences split across chunk boundary
398 | // FIXME: Go through the data once here instead of twice.
399 | let string = String::from_utf8_lossy(chunk.as_slice()); // invalid UTF-8 is replaced with U+FFFD rather than rejected
400 | let mut buffers = BufferQueue::new();
401 | buffers.push_back((&*string).into());
402 | while let TokenizerResult::Script(node) = parser.tokenizer.feed(&mut buffers) { // keep feeding for as long as feed() hands back a script node
403 | let sink = parser.tokenizer.sink().sink();
404 | check_int(call_if_some!(sink, run_script(node.ptr))); // run_script is optional; a negative result panics and becomes the -1 return
405 | }
406 | })
407 | }
408 |
409 | #[no_mangle]
410 | pub unsafe extern "C" fn end_parser(parser: &mut Parser) -> c_int {
411 |     let raw = ParserMutPtr(parser); // UnwindSafe wrapper, so the closure below is accepted by catch_unwind
412 |     catch_unwind_int(move || {
413 |         // Tell the tokenizer there is no more input; the call yields 0, or -1 if this panics.
414 |         (*raw.0).tokenizer.end();
415 |     })
416 | }
417 |
418 | #[no_mangle]
419 | pub extern "C" fn destroy_parser(parser: Box) -> c_int {
420 | catch_unwind_int(move || {
421 | mem::drop(parser)
422 | })
423 | }
424 |
425 | #[no_mangle]
426 | pub extern "C" fn destroy_qualified_name(name: Box) -> c_int {
427 | catch_unwind_int(|| {
428 | mem::drop(name)
429 | })
430 | }
431 |
432 | fn catch_unwind_opt R + UnwindSafe + 'static>(f: F) -> Option {
433 | catch_unwind(f).ok()
434 | }
435 |
436 | fn catch_unwind_int(f: F) -> c_int {
437 | match catch_unwind(f) {
438 | Ok(()) => 0,
439 | Err(_) => -1,
440 | }
441 | }
442 |
443 | fn check_int(value: c_int) -> c_int {
444 | assert!(value >= 0, "Python exception");
445 | value
446 | }
447 |
448 | fn check_pointer(ptr: *const T) -> *const T {
449 | assert!(!ptr.is_null(), "Python exception");
450 | ptr
451 | }
452 |
--------------------------------------------------------------------------------
/htmlpyever.pyx:
--------------------------------------------------------------------------------
1 | from libc.string cimport strcmp
2 | from libc.stdio cimport printf
3 |
4 | cimport etreepublic as cetree
5 | cdef object etree
6 | from lxml import etree
7 | cetree.import_lxml__etree()
8 | cimport tree
9 | cimport xmlparser
10 |
11 | from glue cimport h5eParser, h5eUnicode, h5eBytes, h5eCallbacks, node_t
12 | cimport glue
13 |
14 | # it's scary that it's 2017 and I still need to spend so much time just doing string conversion
15 |
16 | cdef bytes bytes_h5e(h5eUnicode h5eutf): # copy an html5ever string slice into a new Python bytes object
17 | cdef bytes utf8 = h5eutf.ptr[:h5eutf.len] # sliced by explicit length, so exactly h5eutf.len bytes are copied
18 | cdef unsigned char ch
19 | for ch in utf8:
20 | if not tree.xmlIsChar_ch(ch): # checked one byte at a time; NOTE(review): presumably filters bytes libxml2 cannot store - confirm
21 | raise ValueError('html5ever gave invalid xml character')
22 | return utf8
23 |
24 | # ok phew we're done with that
25 |
26 | cdef cetree._Document documentFactory(tree.xmlDoc *c_doc):
27 | cdef cetree._Document doc
28 | if c_doc._private is not NULL:
29 | return c_doc._private
30 | doc = cetree.makeElement('fuck', None, None, None, None, None, None)._doc
31 | tree.xmlFreeDoc(doc._c_doc)
32 | doc._c_doc = c_doc
33 | c_doc._private = doc
34 | return doc
35 |
36 | cdef class Parser:
37 | cdef tree.xmlDoc *doc
38 | cdef cetree._Document lxml_doc
39 | cdef h5eParser *parser
40 |
41 | cdef tree.xmlNs *html_ns
42 | cdef tree.xmlNs *math_ns
43 | cdef tree.xmlNs *svg_ns
44 | cdef tree.xmlNs *xlink_ns
45 | cdef tree.xmlNs *xml_ns
46 | cdef tree.xmlNs *xmlns_ns
47 |
48 | cdef readonly dict template_contents
49 | cdef public object script_callback
50 |
51 | def __cinit__(self):
52 | self.doc = NULL
53 | self.parser = NULL
54 |
55 | def __init__(self, object script_callback=None, cetree._Element fragment_context=None, scripting=True):
56 | cdef cetree._Element fuck
57 | cdef const char *ctx_name
58 |
59 | self.doc = tree.xmlNewDoc(NULL)
60 | self.doc.encoding = tree.xmlStrdup( 'UTF-8')
61 | self.doc.dict = xmlparser.xmlDictCreate()
62 | self.lxml_doc = documentFactory(self.doc)
63 |
64 | if fragment_context is not None and (
65 | fragment_context._c_node.ns is NULL or
66 | strcmp( fragment_context._c_node.ns.href, "http://www.w3.org/1999/xhtml") == 0
67 | ):
68 | ctx_name = fragment_context._c_node.name
69 | else:
70 | ctx_name = NULL
71 |
72 | self.parser = glue.new_parser(&callbacks, self, self.doc, ctx_name, bool(scripting))
73 |
74 | self.script_callback = script_callback
75 | self.template_contents = {}
76 |
77 | def __dealloc__(self):
78 | if self.parser is not NULL:
79 | glue.destroy_parser(self.parser)
80 |
81 | def feed(self, bytes data):
82 | self.check_initted()
83 | if glue.feed_parser(self.parser, glue.h5eBytes(len(data), data)) == -1:
84 | raise ValueError('html5ever failed for some unknown reason')
85 |
86 | def end(self):  # tell html5ever that there is no more input
87 |     self.check_initted()  # same guard as feed(): never hand an unset parser to the Rust side
88 |     if glue.end_parser(self.parser) == -1: raise ValueError('html5ever failed for some unknown reason')
89 | property root:
90 | def __get__(self):
91 | self.check_initted()
92 | if tree.xmlDocGetRootElement(self.doc) is NULL:
93 | raise ValueError('root element does not exist')
94 | return cetree.elementFactory(self.lxml_doc, tree.xmlDocGetRootElement(self.doc))
95 | property roottree:
96 | def __get__(self):
97 | self.check_initted()
98 | return cetree.elementTreeFactory(self.root)
99 |
100 | cdef int check_initted(self) except -1:  # raise ValueError unless both the document and the parser are set
101 |     if self.doc == NULL or self.parser == NULL:  # __cinit__ leaves both NULL; new_parser can also hand back NULL
102 |         raise ValueError('__init__ was never called')
103 |     return 0
104 |
105 | # RUN DA SCRIPTS YAAAH
106 |
107 | cdef int run_script_cb(self, node_t script_) except -1:
108 | cdef tree.xmlNode *script = script_
109 | self.run_script(cetree.elementFactory(self.lxml_doc, script))
110 |
111 | def run_script(self, script):
112 | if self.script_callback is not None:
113 | self.script_callback(script)
114 |
115 | # DA CALLBACKS WOOHOO
116 |
117 | cdef node_t create_element_cb(self, h5eUnicode ns, h5eUnicode name) except NULL:
118 |     # html5ever callback: create an element called `name` (no owning document yet) and give it namespace `ns`.
119 |     cdef tree.xmlNode *element
120 |     element = tree.xmlNewDocNode(NULL, NULL, tree._xcstr(bytes_h5e(name)), NULL)
121 |     # Check for allocation failure before recalibrate_namespace() dereferences the node.
122 |     if element is NULL: raise MemoryError
123 |     self.recalibrate_namespace(element, ns)
124 |
125 |     return element
126 |
127 | cdef node_t get_template_contents_cb(self, node_t element_) except NULL:
128 | cdef tree.xmlNode *element = element_
129 | cdef cetree._Element contents
130 | cdef cetree._Element etree_element
131 |
132 | template = cetree.elementFactory(documentFactory(element.doc), element)
133 | if template not in self.template_contents:
134 | contents = etree.Element('fuck')
135 | tree.xmlNodeSetName(contents._c_node, "template contents")
136 | contents._doc._c_doc._private = contents._doc
137 | self.template_contents[template] = contents
138 |
139 | return ( self.template_contents[template])._c_node
140 |
141 | cdef int add_attribute_if_missing_cb(self, node_t element_, h5eUnicode ns, h5eUnicode name, h5eUnicode value) except -1:
142 |     cdef tree.xmlNode *element = element_
143 |     cdef tree.xmlAttr *attr
144 |     # Keep the bytes in named locals: tree._xcstr() presumably returns a pointer into the bytes buffer (see lxml's tree.pxd), which must stay alive while libxml2 reads it.
145 |     cdef bytes name_bytes = bytes_h5e(name)
146 |     cdef bytes value_bytes = bytes_h5e(value)
147 |     if not tree.xmlHasProp(element, tree._xcstr(name_bytes)):
148 |         attr = tree.xmlSetNsProp(element, NULL, tree._xcstr(name_bytes), tree._xcstr(value_bytes))
149 |         if attr is NULL: raise MemoryError
150 |         self.recalibrate_namespace( attr, ns)
151 |     return 0
152 |
153 | cdef node_t create_comment_cb(self, h5eUnicode data) except NULL:
154 | cdef tree.xmlNode *comment = tree.xmlNewDocComment(self.doc, tree._xcstr(bytes_h5e(data)))
155 | return comment
156 |
157 | cdef int append_doctype_to_document_cb(self, h5eUnicode name, h5eUnicode public_id, h5eUnicode system_id) except -1:
158 | cdef tree.xmlDtd *doctype
159 | doctype = tree.xmlCreateIntSubset(self.doc,
160 | tree._xcstr(bytes_h5e(name)),
161 | tree._xcstr(bytes_h5e(public_id)),
162 | tree._xcstr(bytes_h5e(system_id)))
163 | tree.xmlAddChild( self.doc, doctype)
164 | return 0
165 |
166 | cdef int append_node_cb(self, node_t parent_, node_t child_) except -1:
167 | cdef tree.xmlNode *parent = parent_
168 | cdef tree.xmlNode *child = child_
169 | tree.xmlAddChild(parent, child)
170 | return 0
171 |
172 | cdef int append_text_cb(self, node_t parent, h5eUnicode text) except -1:
173 | cdef tree.xmlNode *child = tree.xmlNewDocText(self.doc, tree._xcstr(bytes_h5e(text)))
174 | return self.append_node_cb(parent, child)
175 |
176 | # These callbacks are only triggered when text or a tag not on the
177 | # whitelist is found in a table. The text or tag is then inserted before
178 | # the table.
179 | #
180 | cdef int insert_node_before_sibling_cb(self, node_t sibling_, node_t new_sibling_) except -1:
181 | cdef tree.xmlNode *sibling = sibling_
182 | cdef tree.xmlNode *new_sibling = new_sibling_
183 | if sibling.parent is NULL:
184 | return 1
185 | tree.xmlAddPrevSibling(sibling, new_sibling)
186 | return 0
187 |
188 | #
189 | cdef int insert_text_before_sibling_cb(self, node_t sibling_, h5eUnicode text) except -1:
190 | cdef tree.xmlNode *text_node = tree.xmlNewDocText(self.doc, tree._xcstr(bytes_h5e(text)))
191 | return self.insert_node_before_sibling_cb(sibling_, text_node)
192 |
193 | # This is only called when dealing with end tags that don't match start tags
194 | # e.g.
195 | cdef int reparent_children_cb(self, node_t parent_, node_t new_parent_) except -1:
196 | cdef tree.xmlNode *parent = parent_
197 | cdef tree.xmlNode *new_parent = new_parent_
198 | cdef tree.xmlNode *node
199 |
200 | while parent.children is not NULL:
201 | node = parent.children
202 | tree.xmlUnlinkNode(node)
203 | tree.xmlAddChild(new_parent, node)
204 | return 0
205 |
206 | # rare case, triggered by
207 | cdef int remove_from_parent_cb(self, node_t node_) except -1:
208 | cdef tree.xmlNode *node = node_
209 | tree.xmlUnlinkNode(node)
210 |
211 | cdef int recalibrate_namespace(self, tree.xmlNode *node, h5eUnicode ns) except -1:
212 | cdef bytes ns_url = bytes_h5e(ns)
213 | cdef tree.const_xmlChar *ns_prefix = ''
214 | cdef tree.xmlNs *xmlns = NULL
215 | cdef tree.xmlNode *element
216 |
217 | if ns_url == b'http://www.w3.org/1999/xhtml':
218 | xmlns = self.html_ns
219 | ns_prefix = NULL
220 | elif ns_url == b'http://www.w3.org/1998/Math/MathML':
221 | xmlns = self.math_ns
222 | ns_prefix = 'math'
223 | elif ns_url == b'http://www.w3.org/2000/svg':
224 | xmlns = self.svg_ns
225 | ns_prefix = 'svg'
226 | elif ns_url == b'http://www.w3.org/1999/xlink':
227 | xmlns = self.xlink_ns
228 | ns_prefix = 'xlink'
229 | elif ns_url == b'http://www.w3.org/XML/1998/namespace':
230 | xmlns = self.xml_ns
231 | ns_prefix = 'xml'
232 | elif ns_url == b'http://www.w3.org/2000/xmlns/':
233 | xmlns = self.xmlns_ns
234 | ns_prefix = 'xmlns'
235 | elif ns_url == b'':
236 | xmlns = NULL
237 | ns_prefix = NULL
238 | else:
239 | raise AssertionError(ns_url)
240 |
241 | if xmlns is NULL and ns_url != b'':
242 | element = node
243 | if node.type == tree.XML_ATTRIBUTE_NODE:
244 | element = node.parent
245 | xmlns = tree.xmlNewNs(element, tree._xcstr(ns_url), ns_prefix)
246 | if ns_url == b'http://www.w3.org/1999/xhtml':
247 | self.html_ns = xmlns
248 | elif ns_url == b'http://www.w3.org/1998/Math/MathML':
249 | self.math_ns = xmlns
250 | elif ns_url == b'http://www.w3.org/2000/svg':
251 | self.svg_ns = xmlns
252 | elif ns_url == b'http://www.w3.org/1999/xlink':
253 | self.xlink_ns = xmlns
254 | elif ns_url == b'http://www.w3.org/XML/1998/namespace':
255 | self.xml_ns = xmlns
256 | elif ns_url == b'http://www.w3.org/2000/xmlns/':
257 | self.xmlns_ns = xmlns
258 |
259 | tree.xmlSetNs(node, xmlns)
260 | return 0
261 |
262 | cdef h5eCallbacks callbacks = h5eCallbacks(  # callback table: each named slot is bound to NULL or to the matching Parser.*_cb function
263 | clone_node_ref= NULL,  # NULL: no Parser callback bound; NOTE(review): how the glue treats a NULL slot is not visible here - confirm
264 | destroy_node_ref= NULL,  # NULL: no Parser callback bound
265 | same_node= NULL,  # NULL: no Parser callback bound
266 | parse_error= NULL,  # NULL: no Parser callback bound
267 | run_script= Parser.run_script_cb,
268 | create_element= Parser.create_element_cb,
269 | get_template_contents= Parser.get_template_contents_cb,
270 | add_attribute_if_missing= Parser.add_attribute_if_missing_cb,
271 | create_comment= Parser.create_comment_cb,
272 | append_doctype_to_document= Parser.append_doctype_to_document_cb,
273 | append_node= Parser.append_node_cb,
274 | append_text= Parser.append_text_cb,
275 | insert_node_before_sibling= Parser.insert_node_before_sibling_cb,
276 | insert_text_before_sibling= Parser.insert_text_before_sibling_cb,
277 | reparent_children= Parser.reparent_children_cb,
278 | remove_from_parent= Parser.remove_from_parent_cb,
279 | )
280 |
281 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | addopts = -p no:doctest --tb=short
3 | testpaths = tests
4 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import subprocess
3 | from setuptools import setup, Extension
4 | from setuptools.command.build_ext import build_ext as setuptools_build_ext
5 | from Cython.Build.Cythonize import cythonize
6 | import lxml
7 |
# Build profile. 'release' adds --release to the cargo invocation; the value is
# also used to form the target/<MODE> paths given to the Extension.
MODE = 'release'


class build_ext(setuptools_build_ext):
    """build_ext command that runs ``cargo build`` before each extension build."""

    def build_extension(self, ext):
        """Run cargo, then hand ``ext`` to the stock setuptools implementation."""
        cargo_command = ['cargo', 'build']
        if MODE == 'release':
            cargo_command.append('--release')
        # check_call raises CalledProcessError if cargo exits non-zero, which
        # stops the build before the C extension is compiled.
        subprocess.check_call(cargo_command)
        setuptools_build_ext.build_extension(self, ext)
15 |
# Header search path: the libxml2 headers plus the directories lxml reports.
libxml2_include = ['/usr/include/libxml2']
includes = libxml2_include + lxml.get_include()

# The single extension module: htmlpyever.pyx linked against the cargo-built
# static glue library and libxml2.
glue_extension = Extension(
    name='htmlpyever',
    sources=['htmlpyever.pyx'],
    libraries=['html5ever_glue', 'xml2'],
    library_dirs=['target/{}'.format(MODE)],
    include_dirs=includes,
    depends=['target/{}/libhtml5ever_glue.a'.format(MODE)],
)

setup(
    name='htmlpyever',

    ext_modules=cythonize([glue_extension], include_path=includes),

    setup_requires=['cython'],
    install_requires=['lxml'],
    cmdclass={'build_ext': build_ext},
)
33 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import itertools
4 | import operator
5 |
6 | import pytest
7 | from lxml import etree
8 | import htmlpyever
9 | import fucklxml
10 |
def pytest_collect_file(path, parent):
    """Collect ``.dat`` files whose parent directory is named ``tree-construction``.

    Returns a TreeConstructionFile for a matching path; any other path yields
    None.
    """
    parent_dir_name = os.path.basename(path.dirname)
    if parent_dir_name != 'tree-construction' or path.ext != '.dat':
        return None
    return TreeConstructionFile(path, parent)
15 |
# Names of the tests that are expected to fail, one "<file>.dat:<line>" entry
# per line of xfail.txt. The file is looked up next to this conftest so that
# loading does not depend on the current working directory, and only the line
# terminator is stripped so a last entry without a trailing newline stays intact.
with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xfail.txt')) as xfail:
    xfail_list = [entry.rstrip('\n') for entry in xfail]
19 |
class TreeConstructionFile(pytest.File):
    """A tree-construction ``.dat`` file that yields one test item per record."""

    def collect(self):
        """Split the file into records and yield a TreeConstructionTest for each.

        A line starting with ``#`` opens a section; its name (with ``-`` turned
        into ``_``) becomes a keyword argument of the test item and the lines
        that follow are accumulated as that section's bytes. A blank line ends
        the record once at least three sections exist, unless it falls inside
        a quoted string of the ``document`` section.
        """
        file_name = os.path.basename(self.fspath)
        with open(self.fspath, 'rb') as dat:
            sections = {}
            # Crude tracking of whether we are inside a multi-line quoted
            # string in #document; good enough for the test data.
            inside_quote = False
            # The extra b'\n' flushes the final record of the file.
            for lineno, line in enumerate(itertools.chain(dat, [b'\n']), 1):
                if line == b'\n' and len(sections) >= 3 and not inside_quote:
                    yield TreeConstructionTest(file_name, lineno, self, **sections)
                    sections = {}
                    continue

                if line.startswith(b'#'):
                    heading = line[1:-1].replace(b'-', b'_').decode()
                    sections.setdefault(heading, b'')
                    if heading == 'document':
                        inside_quote = False
                    continue

                if heading == 'document':
                    if inside_quote or line[1:].lstrip().startswith(b'"'):
                        # Every quote character flips the state, so only the
                        # parity of the count matters.
                        if line.count(b'"') % 2 == 1:
                            inside_quote = not inside_quote
                sections[heading] += line
42 |
# Register the prefix to use for each foreign namespace handled by parse_name.
for _prefix, _uri in (
    ('math', 'http://www.w3.org/1998/Math/MathML'),
    ('svg', 'http://www.w3.org/2000/svg'),
    ('xlink', 'http://www.w3.org/1999/xlink'),
    ('xml', 'http://www.w3.org/XML/1998/namespace'),
    ('xmlns', 'http://www.w3.org/2000/xmlns/'),
):
    etree.register_namespace(_prefix, _uri)
del _prefix, _uri

# Namespace assigned to names that carry no foreign-namespace prefix.
HTML_NS = 'http://www.w3.org/1999/xhtml'
50 |
def parse_name(name):
    """Split a name from the expected-tree dump into its parts.

    ``name`` is bytes. When it starts with ``math ``, ``svg ``, ``xlink ``,
    ``xml `` or ``xmlns `` the matching namespace is selected and the second
    whitespace-separated token is used as the local name; otherwise the name is
    returned unchanged in the HTML namespace with a prefix of None.

    Returns ``(local_name, prefix, namespace)`` where ``local_name`` is bytes
    and ``prefix``/``namespace`` are str (``prefix`` may be None).
    """
    foreign_namespaces = (
        (b'math ', 'math', 'http://www.w3.org/1998/Math/MathML'),
        (b'svg ', 'svg', 'http://www.w3.org/2000/svg'),
        (b'xlink ', 'xlink', 'http://www.w3.org/1999/xlink'),
        (b'xml ', 'xml', 'http://www.w3.org/XML/1998/namespace'),
        (b'xmlns ', 'xmlns', 'http://www.w3.org/2000/xmlns/'),
    )
    for marker, prefix, namespace in foreign_namespaces:
        if name.startswith(marker):
            break
    else:
        prefix, namespace = None, HTML_NS

    if namespace != HTML_NS:
        name = name.split()[1]

    return name, prefix, namespace
74 |
def etreeify_name(name, attribute=False):
    """Convert a dump name into the ``{namespace}local`` form used by etree.

    For an element (``attribute=False``) the result is a tuple of the qualified
    name (bytes) and a one-entry nsmap ``{prefix: namespace}``. For an
    attribute the result is the name alone: the bare local name when it is in
    the HTML namespace, the qualified name otherwise.
    """
    local_name, prefix, namespace = parse_name(name)
    if attribute and namespace == HTML_NS:
        return local_name

    qualified = b'{' + namespace.encode() + b'}' + local_name
    if attribute:
        return qualified
    return qualified, {prefix: namespace}
82 |
def etreeify(raw_name):
    """Create an lxml element for a name taken from the expected-tree dump.

    The normal path builds the element directly. If that raises ValueError, a
    placeholder element is created and its name is set through
    ``fucklxml.set_name`` instead.
    """
    try:
        qualified_name, nsmap = etreeify_name(raw_name)
        element = etree.Element(qualified_name, nsmap=nsmap)
    except ValueError:
        element = etree.Element('fuck')
        local_name, prefix, namespace = parse_name(raw_name)
        fucklxml.set_name(element, local_name, prefix, namespace)
    return element
92 |
class TreeConstructionTest(pytest.Item):
    """One record of a tree-construction ``.dat`` file, run as a pytest item.

    The expected tree from the ``document`` section is rebuilt with lxml and
    its serialization is compared with what ``htmlpyever.Parser`` produces for
    the ``data`` section.
    """

    def __init__(self, filename, index, parent, data=None, errors=None, document=None, document_fragment=None, script_off=False, **kwargs):
        """Store the raw sections of one record.

        ``filename`` and ``index`` form the item name ``<filename>:<index>``,
        which is what entries of ``xfail_list`` are matched against. The
        section arguments are bytes as accumulated by the collector; sections
        not named in the signature are accepted through ``kwargs`` and ignored.
        """
        super().__init__(f'{filename}:{index}', parent)
        if data != b'':
            # Drop the newline that terminates the data section.
            assert data.endswith(b'\n')
            data = data[:-1]
        self.data = data
        self.errors = errors
        self.document = document
        # The collector stores b'' for a heading that has no body, so an
        # empty value means the script-off heading was present.
        self.script_on = script_off != b''
        if document_fragment is not None:
            assert document_fragment.endswith(b'\n')
            self.fragment_context = etreeify(document_fragment[:-1])
        else:
            self.fragment_context = None

    def runtest(self):
        """Run the comparison, reporting listed tests as xfail when they fail."""
        if self.name in xfail_list:
            try:
                self._runtest()
            except Exception:
                # Only ordinary failures become xfail; KeyboardInterrupt and
                # pytest's own outcome exceptions keep propagating.
                pytest.xfail()
        else:
            self._runtest()

    def _runtest(self):
        """Build the expected tree, parse ``self.data`` and compare the two."""
        print(self.name)
        print('data', self.data)

        top_level = []  # to deal with top-level comments
        stack = []  # chain of open nodes; stack[-1] receives new children

        def append(node):
            if len(stack):
                stack[-1].append(node)
            else:
                top_level.append(node)
            stack.append(node)

        if self.fragment_context is not None:
            append(self.fragment_context)

        doctype_name = None
        doctype_public_id = doctype_system_id = ''

        template_contents = {}

        document = self.document
        assert document.startswith(b'| ') and document.endswith(b'\n')
        for line in document[2:-1].split(b'\n| '):

            # Two spaces of indentation per nesting level.
            line_depth = (len(line) - len(line.lstrip())) // 2
            if self.fragment_context is not None:
                line_depth += 1
            line = line.lstrip()

            while line_depth < len(stack):
                stack.pop()

            if line == b'content':
                # template contents
                contents = etree.Element('template-contents')
                template_contents[stack[-1]] = contents
                stack.append(contents)

            elif line.startswith(b'<!-- '):
                # comment: strip the 5-byte opener and the 4-byte closer
                content = line[5:-4].decode('utf-8')
                comment = etree.Comment()
                comment.text = content
                append(comment)

            elif line.startswith(b'<!DOCTYPE '):
                # doctype: strip the 10-byte opener and the closing '>'
                content = line[10:-1]
                doctype_name, _, content = content.partition(b' "')
                if content:
                    doctype_public_id, _, content = content.partition(b'" "')
                    doctype_system_id, _, _ = content.rpartition(b'"')
                    doctype_public_id = doctype_public_id.decode()
                    doctype_system_id = doctype_system_id.decode()
                doctype_name = doctype_name.decode()

            elif line.startswith(b'<') and line.endswith(b'>'):
                # element
                name = line[1:-1]
                elem = etreeify(name)
                append(elem)

            elif line.startswith(b'"') and line.endswith(b'"'):
                # text: goes into .text of an empty node, otherwise into the
                # .tail of the node's last child
                text = line[1:-1].decode('utf-8')
                top = stack[-1]
                if len(top) == 0:
                    if top.text is None:
                        top.text = text
                    else:
                        top.text += text
                else:
                    if top[-1].tail is None:
                        top[-1].tail = text
                    else:
                        top[-1].tail += text

            else:
                # attribute of the node on top of the stack
                assert b'=' in line
                name, _, value = line.partition(b'=')
                assert value.startswith(b'"') and value.endswith(b'"')
                value = value[1:-1]
                try:
                    stack[-1].set(etreeify_name(name, attribute=True), value)
                except ValueError:
                    # Fall back for names the regular API rejects; target the
                    # same node as the regular path above.
                    name, prefix, namespace = parse_name(name)
                    fucklxml.set_attribute(stack[-1], name, value, prefix, namespace)

        # The first top-level node with a string tag becomes the root; nodes
        # seen before it are attached in front of it, later ones after it.
        pre_root = []
        root = None
        for node in top_level:
            if root is None:
                if isinstance(node.tag, str):
                    root = node
                    for earlier in pre_root:
                        root.addprevious(earlier)
                else:
                    pre_root.append(node)
            else:
                # NOTE(review): each call inserts directly after root, so
                # several trailing nodes presumably end up in reverse order;
                # the comparison below only serializes the root - confirm.
                root.addnext(node)

        document = etree.ElementTree(root)
        if self.fragment_context is None:
            assert document.getroot().tag == '{' + HTML_NS + '}html'
            document.docinfo.public_id = doctype_public_id
            document.docinfo.system_url = doctype_system_id

        parser = htmlpyever.Parser(fragment_context=self.fragment_context, scripting=self.script_on)
        parser.feed(self.data)
        parser.end()
        if self.fragment_context is not None:
            # NOTE(review): the expected tree was also built under
            # self.fragment_context, so in fragment mode both sides of the
            # comparison below look like the same element - confirm.
            self.fragment_context.extend(parser.root)
            root = self.fragment_context
        else:
            root = parser.root
        try:
            assert etree.tostring(root) == etree.tostring(document.getroot())
            if parser.roottree.docinfo.internalDTD is not None:
                assert doctype_name == parser.roottree.docinfo.internalDTD.name
                assert doctype_public_id == parser.roottree.docinfo.public_id
                assert doctype_system_id == parser.roottree.docinfo.system_url
            else:
                assert doctype_name is None
                assert doctype_public_id == ''
                assert doctype_system_id == ''
        except BaseException:
            # Show both serializations to make the mismatch easy to read,
            # then let the original failure propagate.
            print(etree.tostring(root))
            print(etree.tostring(document.getroot()))
            raise

    def repr_failure(self, excinfo):
        """Render a failure as a short traceback cut down to this file."""
        traceback = excinfo.traceback
        ntraceback = traceback.cut(path=__file__)
        excinfo.traceback = ntraceback.filter()

        return excinfo.getrepr(funcargs=True,
                               showlocals=False,
                               style="short", tbfilter=False)
256 |
--------------------------------------------------------------------------------
/tests/xfail.txt:
--------------------------------------------------------------------------------
1 | isindex.dat:26
2 | menuitem-element.dat:51
3 | menuitem-element.dat:66
4 | menuitem-element.dat:81
5 | tests19.dat:512
6 | tests19.dat:524
7 | tests19.dat:1080
8 | tests20.dat:459
9 | tests20.dat:487
10 | tests20.dat:501
11 | tests23.dat:134
12 | tests26.dat:285
13 | webkit01.dat:194
14 |
--------------------------------------------------------------------------------