├── .gitignore ├── .gitmodules ├── .travis.yml ├── Cargo.toml ├── README.rst ├── glue.h ├── glue.pxd ├── glue.rs ├── htmlpyever.pyx ├── pytest.ini ├── setup.py └── tests ├── conftest.py └── xfail.txt /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | build 4 | target 5 | htmlpyever.egg-info 6 | dist 7 | .cache 8 | .eggs 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tests/data"] 2 | path = tests/data 3 | url = https://github.com/html5lib/html5lib-tests 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | install: 3 | - pip install cython lxml 4 | - pip install -e . 5 | script: ./setup.py test 6 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "html5ever-glue" 3 | version = "0.1.0" 4 | authors = ["Theodore Dubois "] 5 | 6 | [lib] 7 | name = "html5ever_glue" 8 | path = "glue.rs" 9 | crate-type = ["staticlib"] 10 | test = false 11 | 12 | [dependencies] 13 | libc = "*" 14 | html5ever = "^0.13" 15 | html5ever-atoms = "^0.2" 16 | tendril = "^0.2.2" 17 | string_cache = "^0.4" 18 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | htmlpyever 2 | ========== 3 | 4 | htmlpyever is a very single-minded binding to html5ever. You can: 5 | 6 | * Feed the parser: 7 | 8 | .. code-block:: python 9 | 10 | parser.feed(b'hOI wURLD!') 11 | 12 | * Get a callback when the parser encounters a closing script tag: 13 | 14 | .. 
code-block:: python 15 | 16 | def script_callback(script): 17 | # handle script 18 | parser = htmlpyever.Parser(script_callback) 19 | 20 | # or 21 | 22 | class MyParser(htmlpyever.Parser): 23 | def run_script(self, script) 24 | # handle script 25 | parser = MyParser() 26 | 27 | * Obtain the result as an LXML ``Element`` or ``ElementTree``: 28 | 29 | .. code-block:: python 30 | 31 | from lxml import etree 32 | etree.tostring(parser.root) 33 | # >>> 'hOI! wURLD!' 34 | etree.tostring(parser.root) 35 | # >>> 'hOI! wURLD!' 36 | # not sure why the doctype doesn't show up in the serialized ElementTree 37 | 38 | That's it. 39 | -------------------------------------------------------------------------------- /glue.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | typedef struct _BytesSlice { 4 | size_t len; 5 | const char *ptr; 6 | } h5eBytes; 7 | typedef h5eBytes h5eUnicode; 8 | 9 | typedef void *node_t; 10 | typedef struct _Parser h5eParser; 11 | 12 | typedef struct { 13 | node_t (*clone_node_ref)(void *data, node_t node); 14 | int (*destroy_node_ref)(void *data, node_t node); 15 | int (*same_node)(void *data, node_t node1, node_t node2); 16 | int (*parse_error)(void *data, h5eUnicode error); 17 | int (*run_script)(void *data, node_t script); 18 | node_t (*create_element)(void *data, h5eUnicode ns, h5eUnicode name); 19 | node_t (*get_template_contents)(void *data, node_t node); 20 | int (*add_attribute_if_missing)(void *data, node_t node, h5eUnicode ns, h5eUnicode name, h5eUnicode value); 21 | node_t (*create_comment)(void *data, h5eUnicode text); 22 | int (*append_doctype_to_document)(void *data, h5eUnicode name, h5eUnicode public_id, h5eUnicode system_id); 23 | int (*append_node)(void *data, node_t parent, node_t child); 24 | int (*append_text)(void *data, node_t node, h5eUnicode text); 25 | int (*insert_node_before_sibling)(void *data, node_t sibling, node_t node); 26 | int (*insert_text_before_sibling)(void *data, 
node_t sibling, h5eUnicode text); 27 | int (*reparent_children)(void *data, node_t node, node_t new_parent); 28 | int (*remove_from_parent)(void *data, node_t node); 29 | } h5eCallbacks; 30 | 31 | h5eParser* new_parser(h5eCallbacks *, void *data, node_t document, const char *frag_ctx_name, int scripting_enabled); 32 | int destroy_parser(h5eParser *); 33 | int feed_parser(h5eParser *, h5eBytes); 34 | int end_parser(h5eParser *); 35 | -------------------------------------------------------------------------------- /glue.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from "glue.h": 2 | ctypedef struct h5eBytes: 3 | size_t len 4 | const char *ptr 5 | ctypedef h5eBytes h5eUnicode 6 | 7 | ctypedef struct h5eQualName: 8 | h5eUnicode ns 9 | h5eUnicode local 10 | 11 | ctypedef void *node_t 12 | ctypedef struct h5eParser: 13 | pass 14 | 15 | ctypedef struct h5eCallbacks: 16 | node_t (*clone_node_ref)(void *data, node_t node) 17 | int (*destroy_node_ref)(void *data, node_t node) 18 | int (*same_node)(void *data, node_t node1, node_t node2) 19 | int (*parse_error)(void *data, h5eUnicode error) 20 | int (*run_script)(void *data, node_t script) 21 | node_t (*create_element)(void *data, h5eUnicode ns, h5eUnicode name) 22 | node_t (*get_template_contents)(void *data, node_t node) 23 | int (*add_attribute_if_missing)(void *data, node_t node, h5eUnicode ns, h5eUnicode name, h5eUnicode value) 24 | node_t (*create_comment)(void *data, h5eUnicode text) 25 | int (*append_doctype_to_document)(void *data, h5eUnicode name, h5eUnicode public_id, h5eUnicode system_id) 26 | int (*append_node)(void *data, node_t parent, node_t child) 27 | int (*append_text)(void *data, node_t node, h5eUnicode text) 28 | int (*insert_node_before_sibling)(void *data, node_t sibling, node_t node) 29 | int (*insert_text_before_sibling)(void *data, node_t sibling, h5eUnicode text) 30 | int (*reparent_children)(void *data, node_t node, node_t new_parent) 31 | 
int (*remove_from_parent)(void *data, node_t node) 32 | 33 | h5eParser* new_parser(h5eCallbacks *, void *data, node_t document, const char *frag_ctx_name, int scripting_enabled) 34 | int destroy_parser(h5eParser *) 35 | # if any of the callbacks threw an exception then this will return -1 36 | int feed_parser(h5eParser *, h5eBytes) except? -1 37 | int end_parser(h5eParser *) 38 | -------------------------------------------------------------------------------- /glue.rs: -------------------------------------------------------------------------------- 1 | extern crate libc; 2 | extern crate html5ever; 3 | #[macro_use] extern crate html5ever_atoms; 4 | extern crate string_cache; 5 | extern crate tendril; 6 | 7 | use html5ever::tokenizer::{Tokenizer, TokenizerOpts, Attribute, TokenizerResult}; 8 | use html5ever::tokenizer::buffer_queue::BufferQueue; 9 | use html5ever::tree_builder::{TreeBuilder, TreeBuilderOpts, TreeSink, QuirksMode, NodeOrText}; 10 | use html5ever::QualName; 11 | use std::borrow::Cow; 12 | use std::slice; 13 | use std::mem; 14 | use std::ffi::CStr; 15 | use libc::{c_void, c_int, c_char, size_t}; 16 | use std::panic::{catch_unwind, UnwindSafe}; 17 | use tendril::StrTendril; 18 | use string_cache::atom::Atom; 19 | 20 | /// When given as a function parameter, only valid for the duration of the call. 21 | #[repr(C)] 22 | #[derive(Copy, Clone, Debug)] 23 | pub struct CBytes { 24 | len: size_t, 25 | ptr: *const u8, 26 | } 27 | 28 | impl CBytes { 29 | fn from_slice(slice: &[u8]) -> CBytes { 30 | CBytes { 31 | len: slice.len(), 32 | ptr: slice.as_ptr(), 33 | } 34 | } 35 | 36 | unsafe fn as_slice(&self) -> &[u8] { 37 | slice::from_raw_parts(self.ptr, self.len) 38 | } 39 | } 40 | 41 | /// When given as a function parameter, only valid for the duration of the call. 
42 | #[repr(C)] 43 | #[derive(Copy, Clone, Debug)] 44 | pub struct CUnicode(CBytes); 45 | 46 | impl CUnicode { 47 | fn from_str(s: &str) -> CUnicode { 48 | CUnicode(CBytes::from_slice(s.as_bytes())) 49 | } 50 | } 51 | 52 | pub type OpaqueParserUserData = c_void; 53 | pub type OpaqueNode = c_void; 54 | 55 | struct NodeHandle { 56 | ptr: *const OpaqueNode, 57 | parser_user_data: *const OpaqueParserUserData, 58 | callbacks: &'static Callbacks, 59 | qualified_name: Option, 60 | } 61 | 62 | macro_rules! call { 63 | ($self_: expr, $callback: ident ( $( $arg: expr ),* )) => { 64 | ($self_.callbacks.$callback)($self_.parser_user_data, $( $arg ),* ) 65 | }; 66 | } 67 | 68 | macro_rules! call_if_some { 69 | ($self_: expr, $opt_callback: ident ( $( $arg: expr ),* )) => { 70 | call_if_some!($self_, $opt_callback( $( $arg ),* ) else 0) 71 | }; 72 | ($self_: expr, $opt_callback: ident ( $( $arg: expr ),* ) else $default: expr) => { 73 | if let Some(callback) = $self_.callbacks.$opt_callback { 74 | callback($self_.parser_user_data, $( $arg ),* ) 75 | } else { 76 | $default 77 | } 78 | }; 79 | } 80 | 81 | impl Clone for NodeHandle { 82 | fn clone(&self) -> NodeHandle { 83 | NodeHandle { 84 | ptr: check_pointer(call_if_some!(self, clone_node_ref(self.ptr) else self.ptr)), 85 | parser_user_data: self.parser_user_data, 86 | callbacks: self.callbacks, 87 | qualified_name: self.qualified_name.clone(), 88 | } 89 | } 90 | } 91 | 92 | impl Drop for NodeHandle { 93 | fn drop(&mut self) { 94 | check_int(call_if_some!(self, destroy_node_ref(self.ptr))); 95 | } 96 | } 97 | 98 | struct CallbackTreeSink { 99 | parser_user_data: *const c_void, 100 | callbacks: &'static Callbacks, 101 | document: NodeHandle, 102 | quirks_mode: QuirksMode, 103 | } 104 | 105 | pub struct Parser { 106 | tokenizer: Tokenizer> 107 | } 108 | 109 | struct ParserMutPtr(*mut Parser); 110 | 111 | // FIXME: These make catch_unwind happy, but they are total lies as far as I know. 
112 | impl UnwindSafe for CBytes {} 113 | impl UnwindSafe for ParserMutPtr {} 114 | impl UnwindSafe for Parser {} 115 | 116 | impl CallbackTreeSink { 117 | fn new_handle(&self, ptr: *const OpaqueNode) -> NodeHandle { 118 | NodeHandle { 119 | ptr: ptr, 120 | parser_user_data: self.parser_user_data, 121 | callbacks: self.callbacks, 122 | qualified_name: None, 123 | } 124 | } 125 | 126 | fn add_attributes_if_missing(&self, element: *const OpaqueNode, attributes: Vec) { 127 | for attribute in attributes { 128 | check_int(call!(self, add_attribute_if_missing( 129 | element, 130 | CUnicode::from_str(&attribute.name.ns), 131 | CUnicode::from_str(&attribute.name.local), 132 | CUnicode::from_str(&attribute.value)))); 133 | } 134 | } 135 | } 136 | 137 | impl TreeSink for CallbackTreeSink { 138 | type Handle = NodeHandle; 139 | type Output = Self; 140 | 141 | fn finish(self) -> Self { 142 | self 143 | } 144 | 145 | fn parse_error(&mut self, msg: Cow<'static, str>) { 146 | check_int(call_if_some!(self, parse_error(CUnicode::from_str(&msg)))); 147 | } 148 | 149 | fn get_document(&mut self) -> NodeHandle { 150 | self.document.clone() 151 | } 152 | 153 | fn get_template_contents(&mut self, target: NodeHandle) -> NodeHandle { 154 | self.new_handle(check_pointer(call!(self, get_template_contents(target.ptr)))) 155 | } 156 | 157 | fn set_quirks_mode(&mut self, mode: QuirksMode) { 158 | self.quirks_mode = mode 159 | } 160 | 161 | fn same_node(&self, x: NodeHandle, y: NodeHandle) -> bool { 162 | check_int(call_if_some!(self, same_node(x.ptr, y.ptr) else (x.ptr == y.ptr) as c_int)) != 0 163 | } 164 | 165 | fn elem_name(&self, target: NodeHandle) -> QualName { 166 | target.qualified_name.as_ref().unwrap().clone() 167 | } 168 | 169 | fn create_element(&mut self, name: QualName, attrs: Vec) -> NodeHandle { 170 | let element = check_pointer(call!(self, create_element( 171 | CUnicode::from_str(&name.ns), CUnicode::from_str(&name.local)))); 172 | self.add_attributes_if_missing(element, 
attrs); 173 | let mut handle = self.new_handle(element); 174 | handle.qualified_name = Some(name); 175 | handle 176 | } 177 | 178 | fn create_comment(&mut self, text: StrTendril) -> NodeHandle { 179 | self.new_handle(check_pointer(call!( 180 | self, create_comment(CUnicode::from_str(&text))))) 181 | } 182 | 183 | fn append(&mut self, parent: NodeHandle, child: NodeOrText) { 184 | check_int(match child { 185 | NodeOrText::AppendNode(node) => { 186 | call!(self, append_node(parent.ptr, node.ptr)) 187 | } 188 | NodeOrText::AppendText(ref text) => { 189 | call!(self, append_text(parent.ptr, CUnicode::from_str(text))) 190 | } 191 | }); 192 | } 193 | 194 | fn append_before_sibling(&mut self, sibling: NodeHandle, child: NodeOrText) 195 | -> Result<(), NodeOrText> { 196 | let result = check_int(match child { 197 | NodeOrText::AppendNode(ref node) => { 198 | call!(self, insert_node_before_sibling(sibling.ptr, node.ptr)) 199 | } 200 | NodeOrText::AppendText(ref text) => { 201 | call!(self, insert_text_before_sibling(sibling.ptr, CUnicode::from_str(text))) 202 | } 203 | }); 204 | if result == 0 { 205 | Ok(()) 206 | } else { 207 | Err(child) 208 | } 209 | } 210 | 211 | fn append_doctype_to_document(&mut self, 212 | name: StrTendril, 213 | public_id: StrTendril, 214 | system_id: StrTendril) { 215 | check_int(call!(self, append_doctype_to_document( 216 | CUnicode::from_str(&name), 217 | CUnicode::from_str(&public_id), 218 | CUnicode::from_str(&system_id)))); 219 | } 220 | 221 | fn add_attrs_if_missing(&mut self, target: NodeHandle, attrs: Vec) { 222 | self.add_attributes_if_missing(target.ptr, attrs) 223 | } 224 | 225 | fn remove_from_parent(&mut self, target: NodeHandle) { 226 | check_int(call!(self, remove_from_parent(target.ptr))); 227 | } 228 | 229 | fn reparent_children(&mut self, node: NodeHandle, new_parent: NodeHandle) { 230 | check_int(call!(self, reparent_children(node.ptr, new_parent.ptr))); 231 | } 232 | 233 | fn mark_script_already_started(&mut self, _target: 
NodeHandle) {} 234 | } 235 | 236 | macro_rules! declare_with_callbacks { 237 | ($( $( #[$attr:meta] )* callback $name: ident: $ty: ty )+) => { 238 | pub struct Callbacks { 239 | $( $( #[$attr] )* $name: $ty, )+ 240 | } 241 | 242 | /// Return a heap-allocated stuct that lives forever, 243 | /// containing the given function pointers. 244 | /// 245 | /// This leaks memory, but you normally only need one of these per program. 246 | #[no_mangle] 247 | pub unsafe extern "C" fn declare_callbacks($( $name: $ty ),+) 248 | -> Option<&'static Callbacks> { 249 | catch_unwind_opt(move || { 250 | &*Box::into_raw(Box::new(Callbacks { 251 | $( $name: $name, )+ 252 | })) 253 | }) 254 | } 255 | 256 | } 257 | } 258 | 259 | declare_with_callbacks! { 260 | /// Create and return a new reference to the given node. 261 | /// The returned pointer may be the same as the given one. 262 | /// If this callback is not provided, the same pointer is always used 263 | callback clone_node_ref: Option *const OpaqueNode> 265 | 266 | /// Destroy a new reference to the given node. 267 | /// When all references are gone, the node itself can be destroyed. 268 | /// If this callback is not provided, references are leaked. 269 | callback destroy_node_ref: Option c_int> 271 | 272 | /// Return a position value if the two given references are for the same node, 273 | /// zero for different nodes, and a negative value of an unexpected error. 274 | /// If this callback is not provided, pointer equality is used. 275 | callback same_node: Option c_int> 277 | 278 | /// Log an author conformance error. 279 | /// The pointer is guaranteed to point to the given size of well-formed UTF-8 bytes. 280 | /// The pointer can not be used after the end of this call. 281 | /// If this callback is not provided, author conformance errors are ignored. 282 | callback parse_error: Option c_int> 284 | 285 | /// Run a script. 
286 | callback run_script: Option c_int> 288 | 289 | /// Create an element node with the given namespace URL and local name. 290 | /// 291 | /// If the element in `template` element in the HTML namespace, 292 | /// an associated document fragment node should be created for the template contents. 293 | callback create_element: extern "C" fn(*const OpaqueParserUserData, 294 | CUnicode, CUnicode) -> *const OpaqueNode 295 | 296 | /// Return a reference to the document fragment node for the template contents. 297 | /// 298 | /// This is only ever called for `template` elements in the HTML namespace. 299 | callback get_template_contents: extern "C" fn(*const OpaqueParserUserData, 300 | *const OpaqueNode) -> *const OpaqueNode 301 | 302 | /// Add the attribute (given as namespace URL, local name, and value) 303 | /// to the given element node if the element doesn’t already have 304 | /// an attribute with that name in that namespace. 305 | callback add_attribute_if_missing: extern "C" fn(*const OpaqueParserUserData, 306 | *const OpaqueNode, CUnicode, CUnicode, CUnicode) -> c_int 307 | 308 | /// Create a comment node. 309 | callback create_comment: extern "C" fn(*const OpaqueParserUserData, 310 | CUnicode) -> *const OpaqueNode 311 | 312 | /// Create a doctype node and append it to the document. 313 | callback append_doctype_to_document: extern "C" fn(*const OpaqueParserUserData, 314 | CUnicode, CUnicode, CUnicode) -> c_int 315 | 316 | callback append_node: extern "C" fn(*const OpaqueParserUserData, 317 | *const OpaqueNode, *const OpaqueNode) -> c_int 318 | 319 | callback append_text: extern "C" fn(*const OpaqueParserUserData, 320 | *const OpaqueNode, CUnicode) -> c_int 321 | 322 | /// If `sibling` has a parent, insert the given node just before it and return 1. 323 | /// Otherwise, do nothing and return zero. 
324 | callback insert_node_before_sibling: extern "C" fn(*const OpaqueParserUserData, 325 | *const OpaqueNode, *const OpaqueNode) -> c_int 326 | 327 | /// If `sibling` has a parent, insert the given text just before it and return 1. 328 | /// Otherwise, do nothing and return zero. 329 | callback insert_text_before_sibling: extern "C" fn(*const OpaqueParserUserData, 330 | *const OpaqueNode, CUnicode) -> c_int 331 | 332 | callback reparent_children: extern "C" fn(*const OpaqueParserUserData, 333 | *const OpaqueNode, *const OpaqueNode) -> c_int 334 | 335 | callback remove_from_parent: extern "C" fn(*const OpaqueParserUserData, 336 | *const OpaqueNode) -> c_int 337 | } 338 | 339 | #[no_mangle] 340 | pub unsafe extern "C" fn new_parser(callbacks: &'static Callbacks, 341 | data: *const OpaqueParserUserData, 342 | document: *const OpaqueNode, 343 | frag_ctx_name: *const c_char, 344 | scripting_enabled: c_int) 345 | -> Option> { 346 | // MUCH CODE 347 | // VERY BAD 348 | // CLEANUP PLS 349 | catch_unwind_opt(move || { 350 | let context_qualname = if frag_ctx_name.is_null() { 351 | None 352 | } else { 353 | Some(QualName { 354 | ns: ns!(html), 355 | local: Atom::from(CStr::from_ptr(frag_ctx_name).to_str().unwrap()), 356 | }) 357 | }; 358 | let mut sink = CallbackTreeSink { 359 | parser_user_data: data, 360 | callbacks: callbacks, 361 | document: NodeHandle { 362 | ptr: document, 363 | parser_user_data: data, 364 | callbacks: callbacks, 365 | qualified_name: None, 366 | }, 367 | quirks_mode: QuirksMode::NoQuirks, 368 | }; 369 | let tree_builder_options = TreeBuilderOpts { 370 | scripting_enabled: scripting_enabled != 0, 371 | ..Default::default() 372 | }; 373 | let (tree_builder, initial_state) = match context_qualname { 374 | None => (TreeBuilder::new(sink, tree_builder_options), None), 375 | Some(qualname) => { 376 | let element = sink.create_element(qualname, Vec::new()); 377 | let tree_builder = TreeBuilder::new_for_fragment(sink, element, None, tree_builder_options); 
378 | let state = tree_builder.tokenizer_state_for_context_elem(); 379 | (tree_builder, Some(state)) 380 | }, 381 | }; 382 | let tokenizer_opts = TokenizerOpts { 383 | initial_state: initial_state, 384 | ..Default::default() 385 | }; 386 | Box::new(Parser { 387 | tokenizer: Tokenizer::new(tree_builder, tokenizer_opts), 388 | }) 389 | }) 390 | } 391 | 392 | #[no_mangle] 393 | pub unsafe extern "C" fn feed_parser(parser: &mut Parser, chunk: CBytes) -> c_int { 394 | let parser = ParserMutPtr(parser); 395 | catch_unwind_int(move || { 396 | let parser = &mut *parser.0; 397 | // FIXME: Support UTF-8 byte sequences split across chunk boundary 398 | // FIXME: Go through the data once here instead of twice. 399 | let string = String::from_utf8_lossy(chunk.as_slice()); 400 | let mut buffers = BufferQueue::new(); 401 | buffers.push_back((&*string).into()); 402 | while let TokenizerResult::Script(node) = parser.tokenizer.feed(&mut buffers) { 403 | let sink = parser.tokenizer.sink().sink(); 404 | check_int(call_if_some!(sink, run_script(node.ptr))); 405 | } 406 | }) 407 | } 408 | 409 | #[no_mangle] 410 | pub unsafe extern "C" fn end_parser(parser: &mut Parser) -> c_int { 411 | let parser = ParserMutPtr(parser); 412 | catch_unwind_int(move || { 413 | let parser = &mut *parser.0; 414 | parser.tokenizer.end(); 415 | }) 416 | } 417 | 418 | #[no_mangle] 419 | pub extern "C" fn destroy_parser(parser: Box) -> c_int { 420 | catch_unwind_int(move || { 421 | mem::drop(parser) 422 | }) 423 | } 424 | 425 | #[no_mangle] 426 | pub extern "C" fn destroy_qualified_name(name: Box) -> c_int { 427 | catch_unwind_int(|| { 428 | mem::drop(name) 429 | }) 430 | } 431 | 432 | fn catch_unwind_opt R + UnwindSafe + 'static>(f: F) -> Option { 433 | catch_unwind(f).ok() 434 | } 435 | 436 | fn catch_unwind_int(f: F) -> c_int { 437 | match catch_unwind(f) { 438 | Ok(()) => 0, 439 | Err(_) => -1, 440 | } 441 | } 442 | 443 | fn check_int(value: c_int) -> c_int { 444 | assert!(value >= 0, "Python exception"); 
445 | value 446 | } 447 | 448 | fn check_pointer(ptr: *const T) -> *const T { 449 | assert!(!ptr.is_null(), "Python exception"); 450 | ptr 451 | } 452 | -------------------------------------------------------------------------------- /htmlpyever.pyx: -------------------------------------------------------------------------------- 1 | from libc.string cimport strcmp 2 | from libc.stdio cimport printf 3 | 4 | cimport etreepublic as cetree 5 | cdef object etree 6 | from lxml import etree 7 | cetree.import_lxml__etree() 8 | cimport tree 9 | cimport xmlparser 10 | 11 | from glue cimport h5eParser, h5eUnicode, h5eBytes, h5eCallbacks, node_t 12 | cimport glue 13 | 14 | # it's scary that it's 2017 and I still need to spend so much time just doing string conversion 15 | 16 | cdef bytes bytes_h5e(h5eUnicode h5eutf): 17 | cdef bytes utf8 = h5eutf.ptr[:h5eutf.len] 18 | cdef unsigned char ch 19 | for ch in utf8: 20 | if not tree.xmlIsChar_ch(ch): 21 | raise ValueError('html5ever gave invalid xml character') 22 | return utf8 23 | 24 | # ok phew we're done with that 25 | 26 | cdef cetree._Document documentFactory(tree.xmlDoc *c_doc): 27 | cdef cetree._Document doc 28 | if c_doc._private is not NULL: 29 | return c_doc._private 30 | doc = cetree.makeElement('fuck', None, None, None, None, None, None)._doc 31 | tree.xmlFreeDoc(doc._c_doc) 32 | doc._c_doc = c_doc 33 | c_doc._private = doc 34 | return doc 35 | 36 | cdef class Parser: 37 | cdef tree.xmlDoc *doc 38 | cdef cetree._Document lxml_doc 39 | cdef h5eParser *parser 40 | 41 | cdef tree.xmlNs *html_ns 42 | cdef tree.xmlNs *math_ns 43 | cdef tree.xmlNs *svg_ns 44 | cdef tree.xmlNs *xlink_ns 45 | cdef tree.xmlNs *xml_ns 46 | cdef tree.xmlNs *xmlns_ns 47 | 48 | cdef readonly dict template_contents 49 | cdef public object script_callback 50 | 51 | def __cinit__(self): 52 | self.doc = NULL 53 | self.parser = NULL 54 | 55 | def __init__(self, object script_callback=None, cetree._Element fragment_context=None, scripting=True): 56 | 
cdef cetree._Element fuck 57 | cdef const char *ctx_name 58 | 59 | self.doc = tree.xmlNewDoc(NULL) 60 | self.doc.encoding = tree.xmlStrdup( 'UTF-8') 61 | self.doc.dict = xmlparser.xmlDictCreate() 62 | self.lxml_doc = documentFactory(self.doc) 63 | 64 | if fragment_context is not None and ( 65 | fragment_context._c_node.ns is NULL or 66 | strcmp( fragment_context._c_node.ns.href, "http://www.w3.org/1999/xhtml") == 0 67 | ): 68 | ctx_name = fragment_context._c_node.name 69 | else: 70 | ctx_name = NULL 71 | 72 | self.parser = glue.new_parser(&callbacks, self, self.doc, ctx_name, bool(scripting)) 73 | 74 | self.script_callback = script_callback 75 | self.template_contents = {} 76 | 77 | def __dealloc__(self): 78 | if self.parser is not NULL: 79 | glue.destroy_parser(self.parser) 80 | 81 | def feed(self, bytes data): 82 | self.check_initted() 83 | if glue.feed_parser(self.parser, glue.h5eBytes(len(data), data)) == -1: 84 | raise ValueError('html5ever failed for some unknown reason') 85 | 86 | def end(self): 87 | glue.end_parser(self.parser) 88 | 89 | property root: 90 | def __get__(self): 91 | self.check_initted() 92 | if tree.xmlDocGetRootElement(self.doc) is NULL: 93 | raise ValueError('root element does not exist') 94 | return cetree.elementFactory(self.lxml_doc, tree.xmlDocGetRootElement(self.doc)) 95 | property roottree: 96 | def __get__(self): 97 | self.check_initted() 98 | return cetree.elementTreeFactory(self.root) 99 | 100 | cdef int check_initted(self) except -1: 101 | if self.doc == NULL: 102 | raise ValueError('__init__ was never called') 103 | return 0 104 | 105 | # RUN DA SCRIPTS YAAAH 106 | 107 | cdef int run_script_cb(self, node_t script_) except -1: 108 | cdef tree.xmlNode *script = script_ 109 | self.run_script(cetree.elementFactory(self.lxml_doc, script)) 110 | 111 | def run_script(self, script): 112 | if self.script_callback is not None: 113 | self.script_callback(script) 114 | 115 | # DA CALLBACKS WOOHOO 116 | 117 | cdef node_t 
create_element_cb(self, h5eUnicode ns, h5eUnicode name) except NULL: 118 | cdef tree.xmlNode *element 119 | cdef cetree._Element etree_element 120 | cdef cetree._Element template 121 | element = tree.xmlNewDocNode(NULL, NULL, tree._xcstr(bytes_h5e(name)), NULL) 122 | self.recalibrate_namespace(element, ns) 123 | if element is NULL: raise MemoryError 124 | 125 | return element 126 | 127 | cdef node_t get_template_contents_cb(self, node_t element_) except NULL: 128 | cdef tree.xmlNode *element = element_ 129 | cdef cetree._Element contents 130 | cdef cetree._Element etree_element 131 | 132 | template = cetree.elementFactory(documentFactory(element.doc), element) 133 | if template not in self.template_contents: 134 | contents = etree.Element('fuck') 135 | tree.xmlNodeSetName(contents._c_node, "template contents") 136 | contents._doc._c_doc._private = contents._doc 137 | self.template_contents[template] = contents 138 | 139 | return ( self.template_contents[template])._c_node 140 | 141 | cdef int add_attribute_if_missing_cb(self, node_t element_, h5eUnicode ns, h5eUnicode name, h5eUnicode value) except -1: 142 | cdef tree.xmlNode *element = element_ 143 | cdef tree.xmlAttr *attr 144 | cdef tree.const_xmlChar *c_name = tree._xcstr(bytes_h5e(name)) 145 | cdef tree.const_xmlChar *c_value = tree._xcstr(bytes_h5e(value)) 146 | if not tree.xmlHasProp(element, c_name): 147 | attr = tree.xmlSetNsProp(element, NULL, 148 | tree._xcstr(bytes_h5e(name)), 149 | tree._xcstr(bytes_h5e(value))) 150 | self.recalibrate_namespace( attr, ns) 151 | return 0 152 | 153 | cdef node_t create_comment_cb(self, h5eUnicode data) except NULL: 154 | cdef tree.xmlNode *comment = tree.xmlNewDocComment(self.doc, tree._xcstr(bytes_h5e(data))) 155 | return comment 156 | 157 | cdef int append_doctype_to_document_cb(self, h5eUnicode name, h5eUnicode public_id, h5eUnicode system_id) except -1: 158 | cdef tree.xmlDtd *doctype 159 | doctype = tree.xmlCreateIntSubset(self.doc, 160 | 
tree._xcstr(bytes_h5e(name)), 161 | tree._xcstr(bytes_h5e(public_id)), 162 | tree._xcstr(bytes_h5e(system_id))) 163 | tree.xmlAddChild( self.doc, doctype) 164 | return 0 165 | 166 | cdef int append_node_cb(self, node_t parent_, node_t child_) except -1: 167 | cdef tree.xmlNode *parent = parent_ 168 | cdef tree.xmlNode *child = child_ 169 | tree.xmlAddChild(parent, child) 170 | return 0 171 | 172 | cdef int append_text_cb(self, node_t parent, h5eUnicode text) except -1: 173 | cdef tree.xmlNode *child = tree.xmlNewDocText(self.doc, tree._xcstr(bytes_h5e(text))) 174 | return self.append_node_cb(parent, child) 175 | 176 | # These callbacks are only triggered when text or a tag not on the 177 | # whitelist is found in a table. The text or tag is then inserted before 178 | # the table. 179 | #
cdef int insert_node_before_sibling_cb(self, node_t sibling_, node_t new_sibling_) except -1:
    # Insert `new_sibling_` immediately before `sibling_` in the libxml2 tree.
    #
    # Returns 0 when the node was inserted, 1 when `sibling_` has no parent
    # (nothing is inserted in that case), and -1 with a Python exception set
    # on failure. The Rust glue (`append_before_sibling`) treats 0 as success
    # and any other non-negative value as "not inserted".
    #
    # node_t is `void *`, so explicit casts are required to get xmlNode
    # pointers back.
    cdef tree.xmlNode *sibling = <tree.xmlNode *> sibling_
    cdef tree.xmlNode *new_sibling = <tree.xmlNode *> new_sibling_

    if sibling.parent is NULL:
        return 1

    # xmlAddPrevSibling returns NULL on error; do not report that as success.
    # NOTE: when `new_sibling` is a text node adjacent to existing text,
    # libxml2 may merge it and free `new_sibling`, so it must not be used
    # after this call.
    if tree.xmlAddPrevSibling(sibling, new_sibling) is NULL:
        raise ValueError('libxml2 failed to insert node before sibling')
    return 0

# foof
# NOTE(review): the example markup on the line above appears truncated;
# presumably it showed stray text ("foof") inside a <table> -- confirm
# against the upstream file.
cdef int insert_text_before_sibling_cb(self, node_t sibling_, h5eUnicode text) except -1:
    # Create a text node holding `text` and insert it before `sibling_`.
    #
    # Returns whatever insert_node_before_sibling_cb returns: 0 when the text
    # was inserted, 1 when `sibling_` has no parent (nothing inserted), and
    # -1 with a Python exception set on failure. bytes_h5e() raises
    # ValueError if the text contains a character libxml2 does not accept.
    cdef tree.xmlNode *text_node
    cdef int status

    text_node = tree.xmlNewDocText(self.doc, tree._xcstr(bytes_h5e(text)))
    if text_node is NULL:
        # Same convention as create_element_cb: allocation failure.
        raise MemoryError

    status = self.insert_node_before_sibling_cb(sibling_, text_node)
    if status != 0:
        # The node was never linked into the tree, so nothing else owns it;
        # free it here instead of leaking it.
        tree.xmlFreeNode(text_node)
    return status

# This is only called when dealing with end tags that don't match start tags
# e.g.
# NOTE(review): the example markup that followed "e.g." is missing here --
# confirm against the upstream file.

195 | cdef int reparent_children_cb(self, node_t parent_, node_t new_parent_) except -1: 196 | cdef tree.xmlNode *parent = parent_ 197 | cdef tree.xmlNode *new_parent = new_parent_ 198 | cdef tree.xmlNode *node 199 | 200 | while parent.children is not NULL: 201 | node = parent.children 202 | tree.xmlUnlinkNode(node) 203 | tree.xmlAddChild(new_parent, node) 204 | return 0 205 | 206 | # rare case, triggered by 207 | cdef int remove_from_parent_cb(self, node_t node_) except -1: 208 | cdef tree.xmlNode *node = node_ 209 | tree.xmlUnlinkNode(node) 210 | 211 | cdef int recalibrate_namespace(self, tree.xmlNode *node, h5eUnicode ns) except -1: 212 | cdef bytes ns_url = bytes_h5e(ns) 213 | cdef tree.const_xmlChar *ns_prefix = '' 214 | cdef tree.xmlNs *xmlns = NULL 215 | cdef tree.xmlNode *element 216 | 217 | if ns_url == b'http://www.w3.org/1999/xhtml': 218 | xmlns = self.html_ns 219 | ns_prefix = NULL 220 | elif ns_url == b'http://www.w3.org/1998/Math/MathML': 221 | xmlns = self.math_ns 222 | ns_prefix = 'math' 223 | elif ns_url == b'http://www.w3.org/2000/svg': 224 | xmlns = self.svg_ns 225 | ns_prefix = 'svg' 226 | elif ns_url == b'http://www.w3.org/1999/xlink': 227 | xmlns = self.xlink_ns 228 | ns_prefix = 'xlink' 229 | elif ns_url == b'http://www.w3.org/XML/1998/namespace': 230 | xmlns = self.xml_ns 231 | ns_prefix = 'xml' 232 | elif ns_url == b'http://www.w3.org/2000/xmlns/': 233 | xmlns = self.xmlns_ns 234 | ns_prefix = 'xmlns' 235 | elif ns_url == b'': 236 | xmlns = NULL 237 | ns_prefix = NULL 238 | else: 239 | raise AssertionError(ns_url) 240 | 241 | if xmlns is NULL and ns_url != b'': 242 | element = node 243 | if node.type == tree.XML_ATTRIBUTE_NODE: 244 | element = node.parent 245 | xmlns = tree.xmlNewNs(element, tree._xcstr(ns_url), ns_prefix) 246 | if ns_url == b'http://www.w3.org/1999/xhtml': 247 | self.html_ns = xmlns 248 | elif ns_url == b'http://www.w3.org/1998/Math/MathML': 249 | self.math_ns = xmlns 250 | elif ns_url == b'http://www.w3.org/2000/svg': 251 
| self.svg_ns = xmlns 252 | elif ns_url == b'http://www.w3.org/1999/xlink': 253 | self.xlink_ns = xmlns 254 | elif ns_url == b'http://www.w3.org/XML/1998/namespace': 255 | self.xml_ns = xmlns 256 | elif ns_url == b'http://www.w3.org/2000/xmlns/': 257 | self.xmlns_ns = xmlns 258 | 259 | tree.xmlSetNs(node, xmlns) 260 | return 0 261 | 262 | cdef h5eCallbacks callbacks = h5eCallbacks( 263 | clone_node_ref= NULL, 264 | destroy_node_ref= NULL, 265 | same_node= NULL, 266 | parse_error= NULL, 267 | run_script= Parser.run_script_cb, 268 | create_element= Parser.create_element_cb, 269 | get_template_contents= Parser.get_template_contents_cb, 270 | add_attribute_if_missing= Parser.add_attribute_if_missing_cb, 271 | create_comment= Parser.create_comment_cb, 272 | append_doctype_to_document= Parser.append_doctype_to_document_cb, 273 | append_node= Parser.append_node_cb, 274 | append_text= Parser.append_text_cb, 275 | insert_node_before_sibling= Parser.insert_node_before_sibling_cb, 276 | insert_text_before_sibling= Parser.insert_text_before_sibling_cb, 277 | reparent_children= Parser.reparent_children_cb, 278 | remove_from_parent= Parser.remove_from_parent_cb, 279 | ) 280 | 281 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = -p no:doctest --tb=short 3 | testpaths = tests 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import subprocess 3 | from setuptools import setup, Extension 4 | from setuptools.command.build_ext import build_ext as setuptools_build_ext 5 | from Cython.Build.Cythonize import cythonize 6 | import lxml 7 | 8 | MODE = 'release' 9 | 10 | class build_ext(setuptools_build_ext): 11 | def build_extension(self, ext): 12 | subprocess.check_call(['cargo', 
import os
import re
import itertools
import operator

import pytest
from lxml import etree
import htmlpyever
import fucklxml


def pytest_collect_file(path, parent):
    """Collect ``*.dat`` files that live in a ``tree-construction`` directory.

    Returns a TreeConstructionFile for matching paths and None otherwise.
    """
    dir_name = os.path.basename(path.dirname)
    if dir_name == 'tree-construction' and path.ext == '.dat':
        return TreeConstructionFile(path, parent)


with open('tests/xfail.txt') as xfail:
    # One "<file>:<line>" test id per line. Strip the terminator explicitly
    # instead of chopping the last character, so a final line without a
    # trailing newline keeps its last digit, and skip blank lines.
    xfail_list = [test_id for test_id in map(str.strip, xfail) if test_id]


class TreeConstructionFile(pytest.File):
    """A tree-construction ``.dat`` file holding many test cases."""

    def collect(self):
        """Yield one TreeConstructionTest per test case in the file.

        Lines starting with ``#`` open a new section; the lines that follow
        are accumulated as that section's bytes. A blank line ends the test
        once at least three sections have been seen. Each test is named after
        the 1-based number of the line that terminated it, which is the id
        used in tests/xfail.txt.
        """
        with open(self.fspath, 'rb') as dat:
            testdata = {}
            # Blank lines may occur inside a quoted text node of the
            # #document section; in_quote tracks whether we are inside one so
            # that such a line is not mistaken for the end of the test.
            in_quote = False
            # The extra b'\n' flushes the last test when the file does not
            # end with a blank line.
            for i, line in enumerate(itertools.chain(dat, [b'\n']), 1):
                if line == b'\n' and len(testdata) >= 3 and not in_quote:
                    yield TreeConstructionTest(os.path.basename(self.fspath), i, self, **testdata)
                    testdata = {}
                elif line.startswith(b'#'):
                    # "#document-fragment\n" -> "document_fragment", usable
                    # as a keyword argument name.
                    heading = line[1:-1].replace(b'-', b'_').decode()
                    testdata.setdefault(heading, b'')
                    if heading == 'document':
                        in_quote = False
                else:
                    if heading == 'document':
                        if in_quote or line[1:].lstrip().startswith(b'"'):
                            # An odd number of quotes on the line flips the state.
                            if line.count(b'"') % 2:
                                in_quote = not in_quote
                    testdata[heading] += line


etree.register_namespace('math', 'http://www.w3.org/1998/Math/MathML')
etree.register_namespace('svg', 'http://www.w3.org/2000/svg')
etree.register_namespace('xlink', 'http://www.w3.org/1999/xlink')
etree.register_namespace('xml', 'http://www.w3.org/XML/1998/namespace')
etree.register_namespace('xmlns', 'http://www.w3.org/2000/xmlns/')
HTML_NS = 'http://www.w3.org/1999/xhtml'

# Prefixes the test format puts in front of foreign element/attribute names
# ("svg path", "xlink href"), with the namespace URI each one stands for.
_FOREIGN_NAMESPACES = (
    ('math', 'http://www.w3.org/1998/Math/MathML'),
    ('svg', 'http://www.w3.org/2000/svg'),
    ('xlink', 'http://www.w3.org/1999/xlink'),
    ('xml', 'http://www.w3.org/XML/1998/namespace'),
    ('xmlns', 'http://www.w3.org/2000/xmlns/'),
)


def parse_name(name):
    """Split a test-format name into ``(local_name, prefix, namespace)``.

    ``name`` is bytes. A name that starts with a known prefix followed by a
    space belongs to that namespace and the prefix is removed; anything else
    is an HTML name and is returned unchanged with a ``None`` prefix.

    Everything after the prefix is kept as the local name (surrounding
    whitespace stripped), so a bare prefix such as ``b'svg '`` yields an empty
    local name instead of raising IndexError.
    """
    for prefix, namespace in _FOREIGN_NAMESPACES:
        marker = prefix.encode() + b' '
        if name.startswith(marker):
            return name[len(marker):].strip(), prefix, namespace
    return name, None, HTML_NS


def etreeify_name(name, attribute=False):
    """Convert a test-format name into what lxml expects.

    For an attribute, returns the bare name (HTML namespace) or the
    ``{namespace}name`` form. For an element, returns a tuple of the
    ``{namespace}name`` form and an nsmap ``{prefix: namespace}``.
    """
    name, prefix, namespace = parse_name(name)
    if attribute and namespace == HTML_NS:
        # Attributes written without a prefix carry no namespace.
        return name
    qualified = b'{' + namespace.encode() + b'}' + name
    if attribute:
        return qualified
    return qualified, {prefix: namespace}


def etreeify(raw_name):
    """Create an lxml element for a test-format element name.

    If lxml rejects the name with ValueError, a placeholder element is created
    and renamed through ``fucklxml.set_name`` instead.
    """
    try:
        name, nsmap = etreeify_name(raw_name)
        return etree.Element(name, nsmap=nsmap)
    except ValueError:
        elem = etree.Element('fuck')
        name, prefix, namespace = parse_name(raw_name)
        fucklxml.set_name(elem, name, prefix, namespace)
        return elem
class TreeConstructionTest(pytest.Item):
    """A single tree-construction test case.

    The expected tree is rebuilt with lxml from the ``#document`` section and
    compared with what ``htmlpyever.Parser`` produces for ``#data``.
    """

    def __init__(self, filename, index, parent, data=None, errors=None,
                 document=None, document_fragment=None, script_off=False,
                 **kwargs):
        """Store the sections of one test case.

        Each section argument is the bytes collected under the matching ``#``
        heading. Sections not named in the signature arrive in ``kwargs`` and
        are ignored. The item is named ``"<filename>:<index>"``.
        """
        super().__init__(f'{filename}:{index}', parent)
        if data != b'':
            # Drop the newline that terminates the #data section.
            assert data.endswith(b'\n')
            data = data[:-1]
        self.data = data
        self.errors = errors
        self.document = document
        # A "#script-off" heading has no body, so it arrives as b''; the
        # default False means the heading was absent and scripting stays on.
        self.script_on = script_off != b''
        if document_fragment is not None:
            assert document_fragment.endswith(b'\n')
            self.fragment_context = etreeify(document_fragment[:-1])
        else:
            self.fragment_context = None

    def runtest(self):
        """Run the test, reporting an expected failure for ids in xfail_list."""
        if self.name not in xfail_list:
            self._runtest()
            return
        try:
            self._runtest()
        except Exception:
            # Only ordinary failures are expected; KeyboardInterrupt and
            # pytest's own control-flow exceptions must propagate.
            pytest.xfail()

    def _runtest(self):
        """Build the expected tree, parse ``self.data`` and compare the two."""
        print(self.name)
        print('data', self.data)

        top_level = []  # to deal with top-level comments
        stack = []

        def append(elem):
            # Attach to the innermost open node, or record as a top-level
            # node, then make it the innermost open node.
            if len(stack):
                stack[-1].append(elem)
            else:
                top_level.append(elem)
            stack.append(elem)

        if self.fragment_context is not None:
            append(self.fragment_context)

        doctype_name = None
        doctype_public_id = doctype_system_id = ''

        # NOTE(review): filled in below but never read afterwards, so
        # template contents do not take part in the comparison.
        template_contents = {}

        document = self.document
        assert document.startswith(b'| ') and document.endswith(b'\n')
        for line in document[2:-1].split(b'\n| '):

            # Two spaces of indentation per nesting level.
            line_depth = (len(line) - len(line.lstrip())) // 2
            if self.fragment_context is not None:
                # Everything hangs below the fragment context element.
                line_depth += 1
            line = line.lstrip()

            # Close every node deeper than this line.
            while line_depth < len(stack):
                stack.pop()

            if line == b'content':
                # template contents
                contents = etree.Element('template-contents')
                template_contents[stack[-1]] = contents
                stack.append(contents)

            elif line.startswith(b'<!-- ') and line.endswith(b' -->'):
                # comment: strip the 5-byte opener and the 4-byte closer
                content = line[5:-4].decode('utf-8')
                comment = etree.Comment()
                comment.text = content
                append(comment)

            elif line.startswith(b'<!DOCTYPE ') and line.endswith(b'>'):
                # doctype: <!DOCTYPE name> or <!DOCTYPE name "public" "system">
                content = line[10:-1]
                doctype_name, _, content = content.partition(b' "')
                if content:
                    doctype_public_id, _, content = content.partition(b'" "')
                    doctype_system_id, _, _ = content.rpartition(b'"')
                    doctype_public_id = doctype_public_id.decode()
                    doctype_system_id = doctype_system_id.decode()
                doctype_name = doctype_name.decode()

            elif line.startswith(b'<') and line.endswith(b'>'):
                # element
                name = line[1:-1]
                elem = etreeify(name)
                append(elem)

            elif line.startswith(b'"') and line.endswith(b'"'):
                # text: goes into .text of a childless node, otherwise into
                # the tail of the last child
                text = line[1:-1].decode('utf-8')
                top = stack[-1]
                if len(top) == 0:
                    if top.text is None:
                        top.text = text
                    else:
                        top.text += text
                else:
                    if top[-1].tail is None:
                        top[-1].tail = text
                    else:
                        top[-1].tail += text

            else:
                # attribute: name="value"
                assert b'=' in line
                name, _, value = line.partition(b'=')
                assert value.startswith(b'"') and value.endswith(b'"')
                value = value[1:-1]
                owner = stack[-1]
                try:
                    owner.set(etreeify_name(name, attribute=True), value)
                except ValueError:
                    # lxml rejected the name; set it on the same node through
                    # the helper module instead.
                    name, prefix, namespace = parse_name(name)
                    fucklxml.set_attribute(owner, name, value, prefix, namespace)

        # The first top-level node with a string tag is the root element;
        # other top-level nodes become its siblings.
        root = None
        leading = []
        trailing = []
        for node in top_level:
            if root is not None:
                trailing.append(node)
            elif isinstance(node.tag, str):
                root = node
            else:
                leading.append(node)
        if root is not None:
            for sibling in leading:
                root.addprevious(sibling)
            # addnext inserts directly after root, so add the trailing nodes
            # last-to-first to keep them in document order.
            for sibling in reversed(trailing):
                root.addnext(sibling)

        document = etree.ElementTree(root)
        if self.fragment_context is None:
            assert document.getroot().tag == '{' + HTML_NS + '}html'
            document.docinfo.public_id = doctype_public_id
            document.docinfo.system_url = doctype_system_id

        parser = htmlpyever.Parser(fragment_context=self.fragment_context, scripting=self.script_on)
        parser.feed(self.data)
        parser.end()
        if self.fragment_context is not None:
            # NOTE(review): self.fragment_context is also the root of the
            # expected tree built above, so in the fragment case `root` and
            # `document.getroot()` look like the same element and the
            # comparison below may be vacuous -- confirm and, if so, build the
            # expected tree under a copy of the context element.
            self.fragment_context.extend(parser.root)
            root = self.fragment_context
        else:
            root = parser.root
        try:
            assert etree.tostring(root) == etree.tostring(document.getroot())
            if parser.roottree.docinfo.internalDTD is not None:
                assert doctype_name == parser.roottree.docinfo.internalDTD.name
                assert doctype_public_id == parser.roottree.docinfo.public_id
                assert doctype_system_id == parser.roottree.docinfo.system_url
            else:
                assert doctype_name is None
                assert doctype_public_id == ''
                assert doctype_system_id == ''
        except Exception:
            # Show both serializations to make the mismatch easy to spot.
            print(etree.tostring(root))
            print(etree.tostring(document.getroot()))
            raise

    def repr_failure(self, excinfo):
        """Render a failure with the traceback cut down to this file."""
        traceback = excinfo.traceback
        ntraceback = traceback.cut(path=__file__)
        excinfo.traceback = ntraceback.filter()

        return excinfo.getrepr(funcargs=True,
                               showlocals=False,
                               style="short", tbfilter=False)