├── .gitattributes ├── .gitignore ├── .npmignore ├── LICENSE ├── README.md ├── bindings ├── Cargo.toml └── src │ ├── lib.rs │ ├── mem.rs │ └── option.rs ├── docs ├── .nojekyll ├── assets │ ├── highlight.css │ ├── icons.css │ ├── icons.png │ ├── icons@2x.png │ ├── main.js │ ├── search.js │ ├── style.css │ ├── widgets.png │ └── widgets@2x.png ├── classes │ ├── Attributes.html │ ├── ChildrenCollection.html │ ├── Collection.html │ ├── CollectionIter.html │ ├── Comment.html │ ├── Dom.html │ ├── GlobalNodeCollection.html │ ├── Node.html │ ├── RawTag.html │ └── Tag.html ├── enums │ └── HTMLVersion.html ├── index.html ├── interfaces │ └── ParserOptions.html └── modules.html ├── node └── lib │ ├── bindings.ts │ └── index.ts ├── package-lock.json ├── package.json ├── test ├── dom.js ├── example.js ├── parse.js ├── runner.js └── test.html ├── tsconfig.json └── typedoc.json /.gitattributes: -------------------------------------------------------------------------------- 1 | test/* linguist-vendored 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bindings/target 2 | bindings/Cargo.lock 3 | node/dist 4 | node/test 5 | node_modules 6 | __test__ 7 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | bindings 2 | node_modules 3 | docs 4 | typedoc.json 5 | __test__ 6 | test 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Timo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without 
limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tljs 2 | A [high performance](#benchmark) HTML5 parser for JavaScript. 3 | 4 | This library wraps the Rust crate [tl](https://github.com/y21/tl) and exposes its interface to JavaScript. 5 | 6 | ## When To Use 7 | This library can *very quickly* parse *very large* HTML documents. However, this library is not suitable for every use case. 8 | In particular, if you find yourself having to do lots of operations on the nodes, this may not be for you, due to the overhead of calling into WebAssembly. 9 | So, use this library if: 10 | 11 | - Most of the time is likely spent parsing documents. 12 | - Not a lot of operations are done on the nodes. 13 | - You need to parse *large* documents (tens to hundreds of megabytes) 14 | 15 | In any case, you should benchmark this library for your specific use case, and see if you benefit from the fast parsing speeds, or if the WebAssembly overhead is a bottleneck. 
16 | 17 | ## How To Use 18 | ```js 19 | const tljs = require('@y21/tljs'); 20 | const dom = await tljs.parse(` 21 | <!DOCTYPE html> 22 | <div>
23 | <p id="greeting">Hello World</p>
24 | <img id="img" src="image.png" /> 25 | </div>
26 | `); 27 | 28 | console.log(dom.getElementById('img').asTag().attributes().get('src')); // image.png 29 | console.log(dom.getElementById('greeting').asTag().innerText()); // Hello World 30 | console.log(dom.querySelector('p#greeting').asTag().innerText()); // Hello World 31 | console.log(dom.version() === tljs.HTMLVersion.HTML5); // true 32 | ``` 33 | 34 | ## Spec compliance 35 | This parser does **not** fully follow the HTML standard, however it is expected to be able to parse most "sane" HTML. 36 | This greatly impacts performance and should be taken into consideration when comparing performance of different HTML parsers. 37 | Not being bound to a spec enables a *lot* more optimization opportunities. 38 | 39 | ## Using this library in the browser 40 | It's possible to use this library in very "restricted" JavaScript environments (for example no access to the file system or network). By default, this library assumes it's running under Node.js and attempts to load the `.wasm` binary needed to call into Rust code using `require('fs').readFile`. 41 | 42 | If you want to use this library in the browser or other environments, you need to override the default WebAssembly loading mechanism. This depends on your setup, but one way to achieve this would be to host the `.wasm` binary elsewhere (maybe serve it from your webserver) and use `fetch()` to get the binary. 43 | ```js 44 | const tljs = require('@y21/tljs'); 45 | 46 | // override the wasm loading function 47 | tljs.setInitializerCallback(() => { 48 | return fetch('/tl.wasm').then(x => x.arrayBuffer()); // assuming `/tl.wasm` serves the binary. 49 | }); 50 | 51 | tljs.parse('

<p>Hello world</p>

'); 52 | ``` 53 | It doesn't matter *how* you obtain the WebAssembly binary, but you'll need to return an `ArrayBuffer` from the initializer callback (can also be a promise resolving to an `ArrayBuffer`). 54 | 55 | ## Benchmark 56 | ``` 57 | tl : 0.863912 ms/file ± 0.528114 58 | htmlparser2 : 2.02348 ms/file ± 3.05865 59 | html5parser : 2.20736 ms/file ± 2.66850 60 | htmlparser2-dom : 2.70631 ms/file ± 3.40642 61 | html-dom-parser : 2.72998 ms/file ± 3.56091 62 | neutron-html5parser: 2.74419 ms/file ± 1.52848 63 | node-html-parser : 2.89545 ms/file ± 1.80618 64 | libxmljs : 4.20240 ms/file ± 2.99146 65 | zeed-dom : 4.82065 ms/file ± 2.86533 66 | htmljs-parser : 5.97658 ms/file ± 6.65908 67 | parse5 : 6.85238 ms/file ± 7.75122 68 | arijs-stream : 18.7410 ms/file ± 18.6447 69 | arijs-tree : 20.6841 ms/file ± 19.4813 70 | htmlparser : 21.8427 ms/file ± 154.758 71 | html-parser : 27.3543 ms/file ± 20.7064 72 | saxes : 58.8234 ms/file ± 167.164 73 | html5 : 109.685 ms/file ± 146.399 74 | ``` 75 | Benchmarked against real world data using [AndreasMadsen/htmlparser-benchmark](https://github.com/AndreasMadsen/htmlparser-benchmark). 
76 | 77 | *Note: This benchmark only measures raw HTML parsing, not DOM interaction.* 78 | -------------------------------------------------------------------------------- /bindings/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "bindings" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | 7 | [dependencies] 8 | tl = { version = "0.7.5", features = ["simd"] } 9 | 10 | [lib] 11 | crate-type = ["cdylib"] 12 | -------------------------------------------------------------------------------- /bindings/src/lib.rs: -------------------------------------------------------------------------------- 1 | use std::{ffi::CString, mem::ManuallyDrop}; 2 | 3 | use mem::ExternalString; 4 | use option::FFIOption; 5 | use tl::NodeHandle; 6 | 7 | mod mem; 8 | mod option; 9 | 10 | type Dom = tl::VDom<'static>; 11 | 12 | #[no_mangle] 13 | pub unsafe extern "C" fn tl_parse(ptr: *const u8, len: usize, opts: u8) -> *mut Dom { 14 | let options = tl::ParserOptions::from_raw_checked(opts).unwrap(); 15 | 16 | let slice = std::slice::from_raw_parts(ptr, len); 17 | let input = std::str::from_utf8_unchecked(slice); 18 | let dom = tl::parse(input, options).expect("WASM strings cannot exceed u32::MAX"); 19 | 20 | Box::into_raw(Box::new(dom)) 21 | } 22 | 23 | #[no_mangle] 24 | pub unsafe extern "C" fn tl_dom_nodes_count(ptr: *mut Dom) -> usize { 25 | (*ptr).nodes().len() 26 | } 27 | 28 | #[no_mangle] 29 | pub unsafe extern "C" fn tl_dom_version(ptr: *mut Dom) -> tl::HTMLVersion { 30 | (*ptr) 31 | .version() 32 | .unwrap_or(tl::HTMLVersion::TransitionalHTML401) 33 | } 34 | 35 | #[no_mangle] 36 | pub unsafe extern "C" fn tl_dom_get_element_by_id( 37 | dom_ptr: *mut Dom, 38 | str_ptr: *mut u8, 39 | str_len: usize, 40 | ) -> *mut FFIOption { 41 | let id = ExternalString::new(str_ptr, str_len); 42 | let element = (*dom_ptr).get_element_by_id(id.as_str()); 43 | Box::into_raw(Box::new(element.into())) 44 | } 45 | 46 | // todo: optimise 47 
| #[no_mangle] 48 | pub unsafe extern "C" fn tl_dom_get_elements_by_class_name( 49 | dom_ptr: *mut Dom, 50 | str_ptr: *mut u8, 51 | str_len: usize, 52 | ) -> *mut [usize; 3] { 53 | let class_name = ExternalString::new(str_ptr, str_len); 54 | let mut elements = ManuallyDrop::new( 55 | (*dom_ptr) 56 | .get_elements_by_class_name(class_name.as_str()) 57 | .collect::>(), 58 | ); 59 | 60 | let (ptr, len, cap) = ( 61 | elements.as_mut_ptr() as usize, 62 | elements.len() as usize, 63 | elements.capacity() as usize, 64 | ); 65 | 66 | Box::into_raw(Box::new([ptr, len, cap])) 67 | } 68 | 69 | #[no_mangle] 70 | pub unsafe extern "C" fn tl_node_inner_text( 71 | dom_ptr: *mut Dom, 72 | id: tl::NodeHandle, 73 | ) -> *mut [usize; 2] { 74 | let dom = &*dom_ptr; 75 | let parser = dom.parser(); 76 | 77 | let node = match id.get(parser) { 78 | Some(node) => node, 79 | None => return std::ptr::null_mut() 80 | }; 81 | 82 | let inner_text = node.inner_text(parser); 83 | ExternalString::from_str_cloned(&inner_text).into_leaked_raw_parts() 84 | } 85 | 86 | #[no_mangle] 87 | pub unsafe extern "C" fn tl_node_inner_html( 88 | dom_ptr: *mut Dom, 89 | id: tl::NodeHandle, 90 | ) -> *mut [usize; 2] { 91 | let dom = &*dom_ptr; 92 | let parser = dom.parser(); 93 | 94 | let node = match id.get(parser) { 95 | Some(node) => node, 96 | None => return std::ptr::null_mut() 97 | }; 98 | 99 | let inner_html = node.inner_html(parser); 100 | ExternalString::from_str_cloned(&inner_html).into_leaked_raw_parts() 101 | } 102 | 103 | #[no_mangle] 104 | pub unsafe extern "C" fn tl_node_is_tag(dom_ptr: *mut Dom, id: tl::NodeHandle) -> bool { 105 | let node = (*dom_ptr).parser().resolve_node_id(id.get_inner()).unwrap(); 106 | node.as_tag().is_some() 107 | } 108 | 109 | #[no_mangle] 110 | pub unsafe extern "C" fn tl_node_is_raw(dom_ptr: *mut Dom, id: tl::NodeHandle) -> bool { 111 | let node = (*dom_ptr).parser().resolve_node_id(id.get_inner()).unwrap(); 112 | node.as_raw().is_some() 113 | } 114 | 115 | #[no_mangle] 
116 | pub unsafe extern "C" fn tl_node_is_comment(dom_ptr: *mut Dom, id: tl::NodeHandle) -> bool { 117 | let node = (*dom_ptr).parser().resolve_node_id(id.get_inner()).unwrap(); 118 | node.as_comment().is_some() 119 | } 120 | 121 | #[no_mangle] 122 | pub unsafe extern "C" fn tl_node_tag_name( 123 | dom_ptr: *mut Dom, 124 | id: tl::NodeHandle, 125 | ) -> *mut [usize; 2] { 126 | let dom = &*dom_ptr; 127 | let parser = dom.parser(); 128 | 129 | let node = match id.get(parser) { 130 | Some(node) => node, 131 | None => return std::ptr::null_mut() 132 | }; 133 | 134 | let name = node.as_tag().unwrap().name().as_utf8_str(); 135 | ExternalString::from_str_cloned(&name).into_leaked_raw_parts() 136 | } 137 | 138 | #[no_mangle] 139 | pub unsafe extern "C" fn tl_node_tag_attributes_count( 140 | dom_ptr: *mut Dom, 141 | id: tl::NodeHandle, 142 | ) -> usize { 143 | let node = (*dom_ptr) 144 | .parser() 145 | .resolve_node_id(id.get_inner()) 146 | .unwrap() 147 | .as_tag() 148 | .unwrap(); 149 | 150 | node.attributes().len() 151 | } 152 | 153 | #[no_mangle] 154 | pub unsafe extern "C" fn tl_node_tag_attributes_insert( 155 | dom_ptr: *mut Dom, 156 | id: tl::NodeHandle, 157 | key_ptr: *mut u8, 158 | key_len: usize, 159 | value_ptr: *mut u8, 160 | value_len: usize 161 | ) { 162 | let dom = &mut *dom_ptr; 163 | let parser = dom.parser_mut(); 164 | let key = { 165 | let key = ExternalString::new(key_ptr, key_len).to_string(); 166 | tl::Bytes::try_from(key).unwrap() 167 | }; 168 | let value = { 169 | let value = ExternalString::new(value_ptr, value_len).to_string(); 170 | tl::Bytes::try_from(value).unwrap() 171 | }; 172 | 173 | let attributes = id 174 | .get_mut(parser) 175 | .unwrap() 176 | .as_tag_mut() 177 | .unwrap() 178 | .attributes_mut(); 179 | 180 | attributes.insert(key, Some(value)); 181 | } 182 | 183 | #[no_mangle] 184 | pub unsafe extern "C" fn tl_node_tag_attributes_remove( 185 | dom_ptr: *mut Dom, 186 | id: tl::NodeHandle, 187 | key_ptr: *mut u8, 188 | key_len: usize 189 
| ) { 190 | let dom = &mut *dom_ptr; 191 | let parser = dom.parser_mut(); 192 | let key = ExternalString::new(key_ptr, key_len); 193 | 194 | let attributes = id 195 | .get_mut(parser) 196 | .unwrap() 197 | .as_tag_mut() 198 | .unwrap() 199 | .attributes_mut(); 200 | 201 | // Currently (as of 6/5) the lifetime requirement is unnecessarily too strict 202 | // and requires a &'a str ('a being the input string lifetime, here 'static) 203 | // for now we just transmute the lifetime, which is sound to do 204 | // TODO: fix upstream Attributes::remove lifetime requirement 205 | let attr_key = std::mem::transmute::<&str, &'static str>(key.as_str()); 206 | 207 | attributes.remove(attr_key); 208 | } 209 | 210 | #[no_mangle] 211 | pub unsafe extern "C" fn tl_node_tag_attributes_get( 212 | dom_ptr: *mut Dom, 213 | id: tl::NodeHandle, 214 | str_ptr: *mut u8, 215 | str_len: usize, 216 | ) -> *mut FFIOption<*mut [usize; 2]> { 217 | let tag = (*dom_ptr) 218 | .parser() 219 | .resolve_node_id(id.get_inner()) 220 | .unwrap() 221 | .as_tag() 222 | .unwrap(); 223 | 224 | let attributes = tag.attributes(); 225 | let name = ExternalString::new(str_ptr, str_len); 226 | let value = attributes.get(name.as_str()).flatten().map(|x| x.as_utf8_str()); 227 | 228 | let value = value.map(|x| ExternalString::from_str_cloned(&x).into_leaked_raw_parts()); 229 | Box::into_raw(Box::new(value.into())) 230 | } 231 | 232 | #[no_mangle] 233 | pub unsafe extern "C" fn tl_dom_inner_html(dom_ptr: *mut Dom) -> *mut [usize; 2] { 234 | let inner_html = (*dom_ptr).inner_html(); 235 | ExternalString::from_str_cloned(&inner_html).into_leaked_raw_parts() 236 | } 237 | 238 | #[no_mangle] 239 | pub unsafe extern "C" fn tl_dom_subnodes(dom_ptr: *mut Dom) -> *mut [usize; 2] { 240 | let nodes = (*dom_ptr).nodes(); 241 | let len = nodes.len(); 242 | let ptr = nodes.as_ptr(); 243 | Box::into_raw(Box::new([ptr as usize, len])) 244 | } 245 | 246 | #[no_mangle] 247 | pub unsafe extern "C" fn tl_dom_children(dom_ptr: *mut 
Dom) -> *mut [usize; 2] { 248 | let nodes = (*dom_ptr).children(); 249 | let len = nodes.len(); 250 | let ptr = nodes.as_ptr(); 251 | Box::into_raw(Box::new([ptr as usize, len])) 252 | } 253 | 254 | #[no_mangle] 255 | pub unsafe extern "C" fn tl_dom_children_index( 256 | slice: *const tl::NodeHandle, 257 | len: usize, 258 | at: usize, 259 | ) -> tl::NodeHandle { 260 | let slice = std::slice::from_raw_parts(slice, len); 261 | let node = slice[at].get_inner(); 262 | tl::NodeHandle::new(node) 263 | } 264 | 265 | #[no_mangle] 266 | pub unsafe extern "C" fn tl_dom_query_selector_single( 267 | dom_ptr: *mut Dom, 268 | selector_ptr: *mut u8, 269 | selector_len: usize, 270 | ) -> *mut FFIOption { 271 | let selector = ExternalString::new(selector_ptr, selector_len); 272 | let node: FFIOption<_> = (*dom_ptr) 273 | .query_selector(selector.as_str()) 274 | .and_then(|mut selector| selector.next()) 275 | .into(); 276 | 277 | Box::into_raw(Box::new(node)) 278 | } 279 | 280 | #[no_mangle] 281 | pub unsafe extern "C" fn tl_dom_query_selector_all( 282 | dom_ptr: *mut Dom, 283 | selector_ptr: *mut u8, 284 | selector_len: usize, 285 | ) -> *mut [usize; 3] { 286 | let selector = ExternalString::new(selector_ptr, selector_len); 287 | let handles = (*dom_ptr) 288 | .query_selector(selector.as_str()) 289 | .map(|selector| selector.collect::>()) 290 | .map(ManuallyDrop::new); 291 | 292 | if let Some(mut handles) = handles { 293 | Box::into_raw(Box::new([ 294 | handles.as_mut_ptr() as usize, 295 | handles.len() as usize, 296 | handles.capacity() as usize, 297 | ])) 298 | } else { 299 | std::ptr::null_mut() 300 | } 301 | } 302 | 303 | macro_rules! define_generic_destructors { 304 | ($(($name:ident => $type:ty)),+) => { 305 | $( 306 | #[no_mangle] 307 | pub unsafe extern "C" fn $name(ptr: *mut $type) { 308 | drop(Box::from_raw(ptr)); 309 | } 310 | )+ 311 | }; 312 | } 313 | 314 | define_generic_destructors! 
{ 315 | (drop_collection_vtable => *mut [usize; 2]), 316 | (drop_node_handle_option => *mut FFIOption), 317 | (drop_string_option => *mut FFIOption<*mut [usize; 2]>), 318 | (drop_dom => Dom) 319 | } 320 | 321 | #[no_mangle] 322 | pub unsafe extern "C" fn drop_c_string(ptr: *mut i8) { 323 | drop(CString::from_raw(ptr)); 324 | } 325 | 326 | #[no_mangle] 327 | pub unsafe extern "C" fn drop_collection(ptr: *mut [usize; 3]) { 328 | let parts = Box::from_raw(ptr); 329 | let vptr = parts[0] as *mut NodeHandle; 330 | let len = parts[1]; 331 | let cap = parts[2]; 332 | drop(Vec::from_raw_parts(vptr, len, cap)); 333 | } 334 | -------------------------------------------------------------------------------- /bindings/src/mem.rs: -------------------------------------------------------------------------------- 1 | use std::alloc; 2 | 3 | #[no_mangle] 4 | pub unsafe extern "C" fn alloc(len: usize) -> *mut u8 { 5 | alloc::alloc(alloc::Layout::from_size_align(len, 1).unwrap()) 6 | } 7 | 8 | #[no_mangle] 9 | pub unsafe extern "C" fn dealloc(ptr: *mut u8, len: usize) { 10 | alloc::dealloc(ptr, alloc::Layout::from_size_align(len, 1).unwrap()) 11 | } 12 | 13 | pub struct ExternalString(*mut u8, usize); 14 | impl ExternalString { 15 | pub unsafe fn new(ptr: *mut u8, len: usize) -> Self { 16 | ExternalString(ptr, len) 17 | } 18 | pub fn to_string(&self) -> String { 19 | self.as_str().to_owned() 20 | } 21 | pub fn from_str_cloned(s: &str) -> Self { 22 | let len = s.len(); 23 | let ptr = unsafe { alloc(len) }; 24 | assert!(!ptr.is_null()); 25 | unsafe { 26 | std::ptr::copy_nonoverlapping(s.as_ptr(), ptr, len); 27 | ExternalString::new(ptr, len) 28 | } 29 | } 30 | pub fn into_leaked_raw_parts(self) -> *mut [usize; 2] { 31 | let (ptr, len) = (self.0, self.1); 32 | let parts = Box::into_raw(Box::new([ptr as usize, len])); 33 | std::mem::forget(self); 34 | parts 35 | } 36 | pub fn as_str(&self) -> &str { 37 | unsafe { 38 | let slice = std::slice::from_raw_parts(self.0, self.1); 39 | 
std::str::from_utf8_unchecked(slice) 40 | } 41 | } 42 | } 43 | impl Drop for ExternalString { 44 | fn drop(&mut self) { 45 | // The external string was allocated using mem::alloc(len) 46 | unsafe { dealloc(self.0 as *mut _, self.1) } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /bindings/src/option.rs: -------------------------------------------------------------------------------- 1 | use Option as StdOption; 2 | 3 | /// Same as std's Option but repr(C) 4 | #[repr(C)] 5 | pub enum FFIOption { 6 | Some(T), 7 | None, 8 | } 9 | 10 | impl From> for FFIOption { 11 | fn from(opt: StdOption) -> Self { 12 | opt.map(FFIOption::Some).unwrap_or(FFIOption::None) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- 1 | TypeDoc added this file to prevent GitHub Pages from using Jekyll. You can turn off this behavior by setting the `githubPages` option to false. 
-------------------------------------------------------------------------------- /docs/assets/highlight.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --light-hl-0: #0000FF; 3 | --dark-hl-0: #569CD6; 4 | --light-hl-1: #000000; 5 | --dark-hl-1: #D4D4D4; 6 | --light-hl-2: #0070C1; 7 | --dark-hl-2: #4FC1FF; 8 | --light-hl-3: #795E26; 9 | --dark-hl-3: #DCDCAA; 10 | --light-hl-4: #A31515; 11 | --dark-hl-4: #CE9178; 12 | --light-hl-5: #AF00DB; 13 | --dark-hl-5: #C586C0; 14 | --light-hl-6: #001080; 15 | --dark-hl-6: #9CDCFE; 16 | --light-hl-7: #008000; 17 | --dark-hl-7: #6A9955; 18 | --light-hl-8: #000000; 19 | --dark-hl-8: #C8C8C8; 20 | --light-hl-9: #098658; 21 | --dark-hl-9: #B5CEA8; 22 | --light-code-background: #F5F5F5; 23 | --dark-code-background: #1E1E1E; 24 | } 25 | 26 | @media (prefers-color-scheme: light) { :root { 27 | --hl-0: var(--light-hl-0); 28 | --hl-1: var(--light-hl-1); 29 | --hl-2: var(--light-hl-2); 30 | --hl-3: var(--light-hl-3); 31 | --hl-4: var(--light-hl-4); 32 | --hl-5: var(--light-hl-5); 33 | --hl-6: var(--light-hl-6); 34 | --hl-7: var(--light-hl-7); 35 | --hl-8: var(--light-hl-8); 36 | --hl-9: var(--light-hl-9); 37 | --code-background: var(--light-code-background); 38 | } } 39 | 40 | @media (prefers-color-scheme: dark) { :root { 41 | --hl-0: var(--dark-hl-0); 42 | --hl-1: var(--dark-hl-1); 43 | --hl-2: var(--dark-hl-2); 44 | --hl-3: var(--dark-hl-3); 45 | --hl-4: var(--dark-hl-4); 46 | --hl-5: var(--dark-hl-5); 47 | --hl-6: var(--dark-hl-6); 48 | --hl-7: var(--dark-hl-7); 49 | --hl-8: var(--dark-hl-8); 50 | --hl-9: var(--dark-hl-9); 51 | --code-background: var(--dark-code-background); 52 | } } 53 | 54 | body.light { 55 | --hl-0: var(--light-hl-0); 56 | --hl-1: var(--light-hl-1); 57 | --hl-2: var(--light-hl-2); 58 | --hl-3: var(--light-hl-3); 59 | --hl-4: var(--light-hl-4); 60 | --hl-5: var(--light-hl-5); 61 | --hl-6: var(--light-hl-6); 62 | --hl-7: var(--light-hl-7); 63 | --hl-8: 
var(--light-hl-8); 64 | --hl-9: var(--light-hl-9); 65 | --code-background: var(--light-code-background); 66 | } 67 | 68 | body.dark { 69 | --hl-0: var(--dark-hl-0); 70 | --hl-1: var(--dark-hl-1); 71 | --hl-2: var(--dark-hl-2); 72 | --hl-3: var(--dark-hl-3); 73 | --hl-4: var(--dark-hl-4); 74 | --hl-5: var(--dark-hl-5); 75 | --hl-6: var(--dark-hl-6); 76 | --hl-7: var(--dark-hl-7); 77 | --hl-8: var(--dark-hl-8); 78 | --hl-9: var(--dark-hl-9); 79 | --code-background: var(--dark-code-background); 80 | } 81 | 82 | .hl-0 { color: var(--hl-0); } 83 | .hl-1 { color: var(--hl-1); } 84 | .hl-2 { color: var(--hl-2); } 85 | .hl-3 { color: var(--hl-3); } 86 | .hl-4 { color: var(--hl-4); } 87 | .hl-5 { color: var(--hl-5); } 88 | .hl-6 { color: var(--hl-6); } 89 | .hl-7 { color: var(--hl-7); } 90 | .hl-8 { color: var(--hl-8); } 91 | .hl-9 { color: var(--hl-9); } 92 | pre, code { background: var(--code-background); } 93 | -------------------------------------------------------------------------------- /docs/assets/icons.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y21/tljs/c0719060725218ee4a9e3704b5f374711e857acd/docs/assets/icons.png -------------------------------------------------------------------------------- /docs/assets/icons@2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y21/tljs/c0719060725218ee4a9e3704b5f374711e857acd/docs/assets/icons@2x.png -------------------------------------------------------------------------------- /docs/assets/widgets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y21/tljs/c0719060725218ee4a9e3704b5f374711e857acd/docs/assets/widgets.png -------------------------------------------------------------------------------- /docs/assets/widgets@2x.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/y21/tljs/c0719060725218ee4a9e3704b5f374711e857acd/docs/assets/widgets@2x.png -------------------------------------------------------------------------------- /docs/classes/Attributes.html: -------------------------------------------------------------------------------- 1 | Attributes | tljs
Options
All
  • Public
  • Public/Protected
  • All
Menu

Class Attributes

2 |

HTML Tag Attributes

3 |

Hierarchy

  • Attributes

Index

Constructors

Properties

Methods

Constructors

Properties

dom: Dom
nodeId: number

Methods

  • count(): number
  • 4 |

    Returns the number of attributes

    5 |

    Returns number

  • get(key: string): null | string
  • 6 |

    Looks up an attribute by key

    7 |

    Parameters

    • key: string

    Returns null | string

  • insert(key: string, value: string): void
  • 8 |

    Inserts a key-value pair into this attributes storage

    9 |

    Parameters

    • key: string
    • value: string

    Returns void

  • remove(key: string): void
  • 10 |

    Removes a key-value pair

    11 |

    Parameters

    • key: string

    Returns void

Legend

  • Constructor
  • Method
  • Inherited constructor
  • Inherited method
  • Private property
  • Private method
  • Property
  • Protected property

Settings

Theme

Generated using TypeDoc

-------------------------------------------------------------------------------- /docs/classes/ChildrenCollection.html: -------------------------------------------------------------------------------- 1 | ChildrenCollection | tljs
Options
All
  • Public
  • Public/Protected
  • All
Menu

Class ChildrenCollection

2 |

A collection of subnodes of a particular node, or the DOM.

3 |

Hierarchy

Index

Constructors

Properties

dom: Dom
len: number
ptr: number

Methods

  • [iterator](): Iterator<Node, any, undefined>
  • at(index: number): null | Node
  • length(): number

Legend

  • Constructor
  • Method
  • Inherited constructor
  • Inherited method
  • Private property
  • Private method
  • Property
  • Protected property

Settings

Theme

Generated using TypeDoc

-------------------------------------------------------------------------------- /docs/classes/Collection.html: -------------------------------------------------------------------------------- 1 | Collection | tljs
Options
All
  • Public
  • Public/Protected
  • All
Menu

Class Collection<T> Abstract

2 |

A base class for collections

3 |

Type Parameters

  • T

Hierarchy

Index

Constructors

  • new Collection<T>(dom: Dom, ptr: number, len: number): Collection<T>

Properties

dom: Dom
len: number
ptr: number

Methods

  • [iterator](): Iterator<T, any, undefined>
  • at(index: number): null | T
  • Parameters

    • index: number

    Returns null | T

  • length(): number
  • 4 |

    Returns the number of elements in this collection

    5 |

    Returns number

  • toArray(): T[]
  • 6 |

    Copies all elements of this external collection to an array.

    7 |

    Returns T[]

Legend

  • Constructor
  • Method
  • Inherited constructor
  • Inherited method
  • Private property
  • Private method
  • Property
  • Protected property

Settings

Theme

Generated using TypeDoc

-------------------------------------------------------------------------------- /docs/classes/CollectionIter.html: -------------------------------------------------------------------------------- 1 | CollectionIter | tljs
Options
All
  • Public
  • Public/Protected
  • All
Menu

Class CollectionIter<T>

2 |

An iterator over elements in an external collection

3 |

Type Parameters

  • T

Hierarchy

  • CollectionIter

Implements

  • Iterator<T>

Index

Constructors

Properties

Methods

Constructors

Properties

collection: Collection<T>
index: number = 0

Methods

  • next(): IteratorResult<T, any>
  • Returns IteratorResult<T, any>

Legend

  • Constructor
  • Method
  • Inherited constructor
  • Inherited method
  • Private property
  • Private method
  • Property
  • Protected property

Settings

Theme

Generated using TypeDoc

-------------------------------------------------------------------------------- /docs/classes/Comment.html: -------------------------------------------------------------------------------- 1 | Comment | tljs
Options
All
  • Public
  • Public/Protected
  • All
Menu

Class Comment

Hierarchy

Index

Constructors

Properties

dom: Dom
id: number

Methods

  • 4 |

    Attempts to downcast this node handle to a raw HTML node (text).

    5 |

    Returns null | RawTag

  • asTag(): null | Tag
  • 6 |

    Attempts to downcast this node handle to a concrete HTML tag. 7 | Some operations are only valid on HTML tags.

    8 |

    Returns null | Tag

  • innerHTML(): string
  • innerText(): string

Legend

  • Constructor
  • Method
  • Inherited constructor
  • Inherited method
  • Private property
  • Private method
  • Property
  • Protected property

Settings

Theme

Generated using TypeDoc

-------------------------------------------------------------------------------- /docs/classes/GlobalNodeCollection.html: -------------------------------------------------------------------------------- 1 | GlobalNodeCollection | tljs
Options
All
  • Public
  • Public/Protected
  • All
Menu

Class GlobalNodeCollection

2 |

A collection of global nodes

3 |

Hierarchy

Index

Constructors

Properties

dom: Dom
len: number
ptr: number

Methods

  • [iterator](): Iterator<Node, any, undefined>
  • at(index: number): null | Node
  • length(): number

Legend

  • Constructor
  • Method
  • Inherited constructor
  • Inherited method
  • Private property
  • Private method
  • Property
  • Protected property

Settings

Theme

Generated using TypeDoc

-------------------------------------------------------------------------------- /docs/classes/Node.html: -------------------------------------------------------------------------------- 1 | Node | tljs
Options
All
  • Public
  • Public/Protected
  • All
Menu

Class Node

2 |

A handle to a node in the DOM tree.

3 |

Hierarchy

Index

Constructors

  • new Node(dom: Dom, id: number): Node

Properties

dom: Dom
id: number

Methods

  • 4 |

    Attempts to downcast this node handle to an HTML comment (<!-- -->).

    5 |

    Returns null | Comment

  • 6 |

    Attempts to downcast this node handle to a raw HTML node (text).

    7 |

    Returns null | RawTag

  • asTag(): null | Tag
  • 8 |

    Attempts to downcast this node handle to a concrete HTML tag. 9 | Some operations are only valid on HTML tags.

    10 |

    Returns null | Tag

  • downcastable(kind: DowncastTarget): boolean
  • Parameters

    • kind: DowncastTarget

    Returns boolean

  • innerHTML(): string
  • 11 |

    Returns the inner HTML of this node.

    12 |

    Returns string

  • innerText(): string
  • 13 |

    Returns the inner text of this node.

    14 |

    Returns string

Legend

  • Constructor
  • Method
  • Inherited constructor
  • Inherited method
  • Private property
  • Private method
  • Property
  • Protected property

Settings

Theme

Generated using TypeDoc

-------------------------------------------------------------------------------- /docs/classes/RawTag.html: -------------------------------------------------------------------------------- 1 | RawTag | tljs
Options
All
  • Public
  • Public/Protected
  • All
Menu

Class RawTag

Hierarchy

Index

Constructors

Properties

dom: Dom
id: number

Methods

  • 4 |

    Attempts to downcast this node handle to a raw HTML node (text).

    5 |

    Returns null | RawTag

  • asTag(): null | Tag
  • 6 |

    Attempts to downcast this node handle to a concrete HTML tag. 7 | Some operations are only valid on HTML tags.

    8 |

    Returns null | Tag

  • innerHTML(): string
  • innerText(): string

Legend

  • Constructor
  • Method
  • Inherited constructor
  • Inherited method
  • Private property
  • Private method
  • Property
  • Protected property

Settings

Theme

Generated using TypeDoc

-------------------------------------------------------------------------------- /docs/enums/HTMLVersion.html: -------------------------------------------------------------------------------- 1 | HTMLVersion | tljs
Options
All
  • Public
  • Public/Protected
  • All
Menu

Enumeration HTMLVersion

2 |

The version of this HTML document

3 |

Index

Enumeration Members

FRAMESET_HTML401: 3
HTML5: 0
STRICT_HTML401: 1
TRANSITIONAL_HTML401: 2

Legend

  • Constructor
  • Method
  • Inherited constructor
  • Inherited method
  • Private property
  • Private method
  • Property
  • Protected property

Settings

Theme

Generated using TypeDoc

-------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | tljs
Options
All
  • Public
  • Public/Protected
  • All
Menu

tljs

2 | 3 |

tljs

4 |
5 |

A high performance HTML5 parser for JavaScript.

6 |

This library wraps the Rust crate tl and exposes its interface to JavaScript.

7 | 8 | 9 |

When To Use

10 |
11 |

This library can very quickly parse very large HTML documents. However, this library is not suitable for every use case. 12 | In particular, if you find yourself having to do lots of operations on the nodes, this may not be for you, due to the overhead of calling into WebAssembly. 13 | So, use this library if:

14 |
    15 |
  • Most of the time is likely spent parsing documents.
  • 16 |
  • Not a lot of operations are done on the nodes.
  • 17 |
  • You need to parse large documents (tens to hundreds of megabytes)
  • 18 |
19 |

In any case, you should benchmark this library for your specific use case, and see if you benefit from the fast parsing speeds, or if the WebAssembly overhead is a bottleneck.

20 | 21 | 22 |

How To Use

23 |
24 |
const tljs = require('@y21/tljs');
const dom = await tljs.parse(`
<!DOCTYPE html>
<div>
<p id="greeting">Hello World</p>
<img id="img" src="image.png" />
</div>
`);

console.log(dom.getElementById('img').asTag().attributes().get('src')); // image.png
console.log(dom.getElementById('greeting').asTag().innerText()); // Hello World
console.log(dom.querySelector('p#greeting').asTag().innerText()); // Hello World
console.log(dom.version() === tljs.HTMLVersion.HTML5); // true 25 |
26 | 27 | 28 |

Spec compliance

29 |
30 |

This parser does not fully follow the HTML standard; however, it is expected to be able to parse most "sane" HTML. 31 | Deviating from the standard has a large effect on performance and should be taken into consideration when comparing the performance of different HTML parsers. 32 | Not being bound to a spec enables a lot more optimization opportunities.

33 | 34 | 35 |

Using this library in the browser

36 |
37 |

It's possible to use this library in very "restricted" JavaScript environments (for example, ones with no access to the file system or network). By default, this library assumes it's running under Node.js and uses require('fs').readFile to load the .wasm binary that is needed to call into Rust code.

38 |

If you want to use this library in the browser or other environments, you need to override the default WebAssembly loading mechanism. How you do this depends on your setup, but one way is to host the .wasm binary elsewhere (for example, serve it from your web server) and use fetch() to get the binary.

39 |
const tljs = require('@y21/tljs');

// override the wasm loading function
tljs.setInitializerCallback(() => {
return fetch('/tl.wasm').then(x => x.arrayBuffer()); // assuming `/tl.wasm` serves the binary.
});

tljs.parse('<p>Hello world</p>'); 40 |
41 |

It doesn't matter how you obtain the WebAssembly binary, but the initializer callback must return an ArrayBuffer (or a promise that resolves to an ArrayBuffer).

42 | 43 | 44 |

Benchmark

45 |
46 |
tl                 : 0.863912 ms/file ± 0.528114
htmlparser2 : 2.02348 ms/file ± 3.05865
html5parser : 2.20736 ms/file ± 2.66850
htmlparser2-dom : 2.70631 ms/file ± 3.40642
html-dom-parser : 2.72998 ms/file ± 3.56091
neutron-html5parser: 2.74419 ms/file ± 1.52848
node-html-parser : 2.89545 ms/file ± 1.80618
libxmljs : 4.20240 ms/file ± 2.99146
zeed-dom : 4.82065 ms/file ± 2.86533
htmljs-parser : 5.97658 ms/file ± 6.65908
parse5 : 6.85238 ms/file ± 7.75122
arijs-stream : 18.7410 ms/file ± 18.6447
arijs-tree : 20.6841 ms/file ± 19.4813
htmlparser : 21.8427 ms/file ± 154.758
html-parser : 27.3543 ms/file ± 20.7064
saxes : 58.8234 ms/file ± 167.164
html5 : 109.685 ms/file ± 146.399 47 |
48 |

Benchmarked against real world data using AndreasMadsen/htmlparser-benchmark.

49 |

Note: This benchmark only measures raw HTML parsing, not DOM interaction.

50 |

Legend

  • Constructor
  • Method
  • Inherited constructor
  • Inherited method
  • Private property
  • Private method
  • Property
  • Protected property

Settings

Theme

Generated using TypeDoc

-------------------------------------------------------------------------------- /docs/interfaces/ParserOptions.html: -------------------------------------------------------------------------------- 1 | ParserOptions | tljs
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface ParserOptions

2 |

Options to use for the HTML parser. 3 | The default options are optimized for raw parsing speed.

4 |

Hierarchy

  • ParserOptions

Index

Properties

trackClasses?: boolean
5 |

Enables tracking of HTML Tag class names.

6 |

The parser will cache tags during parsing on the fly. 7 | Enabling this makes getElementsByClassName() lookups ~O(1), 8 | at the cost of a lot of hashing. 9 | Default: false

10 |
trackIds?: boolean
11 |

Enables tracking of HTML Tag IDs.

12 |

The parser will cache tags during parsing on the fly. 13 | Enabling this makes getElementById() lookups ~O(1). 14 | Default: false

15 |

Legend

  • Constructor
  • Method
  • Inherited constructor
  • Inherited method
  • Private property
  • Private method
  • Property
  • Protected property

Settings

Theme

Generated using TypeDoc

-------------------------------------------------------------------------------- /node/lib/bindings.ts: -------------------------------------------------------------------------------- 1 | export interface WasmExports { 2 | memory: WebAssembly.Memory; 3 | alloc: (size: number) => number; 4 | dealloc: (ptr: number, size: number) => void; 5 | tl_parse: (ptr: number, len: number, opts: number) => number; 6 | tl_dom_version: (ptr: number) => number; 7 | tl_dom_nodes_count: (ptr: number) => number; 8 | tl_dom_get_element_by_id: (ptr: number, len: number, id: number) => number; 9 | tl_dom_get_elements_by_class_name: (ptr: number, class_name: number, class_name_len: number) => number; 10 | tl_dom_query_selector_single: (ptr: number, selector: number, selector_len: number) => number; 11 | tl_dom_query_selector_all: (ptr: number, selector: number, selector_len: number) => number; 12 | tl_node_inner_text: (ptr: number, id: number) => number; 13 | tl_node_inner_html: (ptr: number, id: number) => number; 14 | tl_node_is_tag: (ptr: number, id: number) => boolean; 15 | tl_node_is_raw: (ptr: number, id: number) => boolean; 16 | tl_node_is_comment: (ptr: number, id: number) => boolean; 17 | tl_node_tag_name: (ptr: number, id: number) => number; 18 | tl_node_tag_attributes_count: (ptr: number, id: number) => number; 19 | tl_node_tag_attributes_get: (ptr: number, id: number, str: number, str_len: number) => number; 20 | tl_node_tag_attributes_insert: (ptr: number, id: number, kptr: number, klen: number, vptr: number, vlen: number) => void; 21 | tl_node_tag_attributes_remove: (ptr: number, id: number, kptr: number, klen: number) => void; 22 | tl_dom_subnodes: (ptr: number) => number; 23 | tl_dom_children: (ptr: number) => number; 24 | tl_dom_children_index: (slice_ptr: number, slice_len: number, at: number) => number; 25 | tl_dom_inner_html: (ptr: number) => number; 26 | drop_collection_vtable: (ptr: number) => void; 27 | drop_collection: (ptr: number) => void; 28 | drop_c_string: 
(ptr: number) => void; 29 | drop_node_handle_option: (ptr: number) => void; 30 | drop_string_option: (ptr: number) => void; 31 | drop_dom: (ptr: number) => void; 32 | } 33 | -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@y21/tljs", 3 | "version": "0.3.1", 4 | "lockfileVersion": 2, 5 | "requires": true, 6 | "packages": { 7 | "": { 8 | "name": "@y21/tljs", 9 | "version": "0.3.1", 10 | "license": "ISC", 11 | "devDependencies": { 12 | "@types/node": "^16.11.38", 13 | "typedoc": "^0.22.9" 14 | } 15 | }, 16 | "node_modules/@types/node": { 17 | "version": "16.11.38", 18 | "resolved": "https://registry.npmjs.org/@types/node/-/node-16.11.38.tgz", 19 | "integrity": "sha512-hjO/0K140An3GWDw2HJfq7gko3wWeznbjXgg+rzPdVzhe198hp4x2i1dgveAOEiFKd8sOilAxzoSJiVv5P/CUg==", 20 | "dev": true 21 | }, 22 | "node_modules/balanced-match": { 23 | "version": "1.0.2", 24 | "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", 25 | "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", 26 | "dev": true 27 | }, 28 | "node_modules/brace-expansion": { 29 | "version": "2.0.1", 30 | "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", 31 | "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", 32 | "dev": true, 33 | "dependencies": { 34 | "balanced-match": "^1.0.0" 35 | } 36 | }, 37 | "node_modules/fs.realpath": { 38 | "version": "1.0.0", 39 | "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", 40 | "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", 41 | "dev": true 42 | }, 43 | "node_modules/glob": { 44 | "version": "8.0.3", 45 | "resolved": 
"https://registry.npmjs.org/glob/-/glob-8.0.3.tgz", 46 | "integrity": "sha512-ull455NHSHI/Y1FqGaaYFaLGkNMMJbavMrEGFXG/PGrg6y7sutWHUHrz6gy6WEBH6akM1M414dWKCNs+IhKdiQ==", 47 | "dev": true, 48 | "dependencies": { 49 | "fs.realpath": "^1.0.0", 50 | "inflight": "^1.0.4", 51 | "inherits": "2", 52 | "minimatch": "^5.0.1", 53 | "once": "^1.3.0" 54 | }, 55 | "engines": { 56 | "node": ">=12" 57 | }, 58 | "funding": { 59 | "url": "https://github.com/sponsors/isaacs" 60 | } 61 | }, 62 | "node_modules/inflight": { 63 | "version": "1.0.6", 64 | "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", 65 | "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==", 66 | "dev": true, 67 | "dependencies": { 68 | "once": "^1.3.0", 69 | "wrappy": "1" 70 | } 71 | }, 72 | "node_modules/inherits": { 73 | "version": "2.0.4", 74 | "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", 75 | "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", 76 | "dev": true 77 | }, 78 | "node_modules/jsonc-parser": { 79 | "version": "3.0.0", 80 | "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.0.0.tgz", 81 | "integrity": "sha512-fQzRfAbIBnR0IQvftw9FJveWiHp72Fg20giDrHz6TdfB12UH/uue0D3hm57UB5KgAVuniLMCaS8P1IMj9NR7cA==", 82 | "dev": true 83 | }, 84 | "node_modules/lunr": { 85 | "version": "2.3.9", 86 | "resolved": "https://registry.npmjs.org/lunr/-/lunr-2.3.9.tgz", 87 | "integrity": "sha512-zTU3DaZaF3Rt9rhN3uBMGQD3dD2/vFQqnvZCDv4dl5iOzq2IZQqTxu90r4E5J+nP70J3ilqVCrbho2eWaeW8Ow==", 88 | "dev": true 89 | }, 90 | "node_modules/marked": { 91 | "version": "4.0.16", 92 | "resolved": "https://registry.npmjs.org/marked/-/marked-4.0.16.tgz", 93 | "integrity": "sha512-wahonIQ5Jnyatt2fn8KqF/nIqZM8mh3oRu2+l5EANGMhu6RFjiSG52QNE2eWzFMI94HqYSgN184NurgNG6CztA==", 94 | "dev": true, 95 | "bin": { 96 | "marked": "bin/marked.js" 97 | }, 98 | "engines": { 99 | 
"node": ">= 12" 100 | } 101 | }, 102 | "node_modules/minimatch": { 103 | "version": "5.1.0", 104 | "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.0.tgz", 105 | "integrity": "sha512-9TPBGGak4nHfGZsPBohm9AWg6NoT7QTCehS3BIJABslyZbzxfV78QM2Y6+i741OPZIafFAaiiEMh5OyIrJPgtg==", 106 | "dev": true, 107 | "dependencies": { 108 | "brace-expansion": "^2.0.1" 109 | }, 110 | "engines": { 111 | "node": ">=10" 112 | } 113 | }, 114 | "node_modules/once": { 115 | "version": "1.4.0", 116 | "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", 117 | "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", 118 | "dev": true, 119 | "dependencies": { 120 | "wrappy": "1" 121 | } 122 | }, 123 | "node_modules/shiki": { 124 | "version": "0.10.1", 125 | "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.10.1.tgz", 126 | "integrity": "sha512-VsY7QJVzU51j5o1+DguUd+6vmCmZ5v/6gYu4vyYAhzjuNQU6P/vmSy4uQaOhvje031qQMiW0d2BwgMH52vqMng==", 127 | "dev": true, 128 | "dependencies": { 129 | "jsonc-parser": "^3.0.0", 130 | "vscode-oniguruma": "^1.6.1", 131 | "vscode-textmate": "5.2.0" 132 | } 133 | }, 134 | "node_modules/typedoc": { 135 | "version": "0.22.17", 136 | "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.22.17.tgz", 137 | "integrity": "sha512-h6+uXHVVCPDaANzjwzdsj9aePBjZiBTpiMpBBeyh1zcN2odVsDCNajz8zyKnixF93HJeGpl34j/70yoEE5BfNg==", 138 | "dev": true, 139 | "dependencies": { 140 | "glob": "^8.0.3", 141 | "lunr": "^2.3.9", 142 | "marked": "^4.0.16", 143 | "minimatch": "^5.1.0", 144 | "shiki": "^0.10.1" 145 | }, 146 | "bin": { 147 | "typedoc": "bin/typedoc" 148 | }, 149 | "engines": { 150 | "node": ">= 12.10.0" 151 | }, 152 | "peerDependencies": { 153 | "typescript": "4.0.x || 4.1.x || 4.2.x || 4.3.x || 4.4.x || 4.5.x || 4.6.x || 4.7.x" 154 | } 155 | }, 156 | "node_modules/typescript": { 157 | "version": "4.7.3", 158 | "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.7.3.tgz", 159 | 
"integrity": "sha512-WOkT3XYvrpXx4vMMqlD+8R8R37fZkjyLGlxavMc4iB8lrl8L0DeTcHbYgw/v0N/z9wAFsgBhcsF0ruoySS22mA==", 160 | "dev": true, 161 | "peer": true, 162 | "bin": { 163 | "tsc": "bin/tsc", 164 | "tsserver": "bin/tsserver" 165 | }, 166 | "engines": { 167 | "node": ">=4.2.0" 168 | } 169 | }, 170 | "node_modules/vscode-oniguruma": { 171 | "version": "1.6.2", 172 | "resolved": "https://registry.npmjs.org/vscode-oniguruma/-/vscode-oniguruma-1.6.2.tgz", 173 | "integrity": "sha512-KH8+KKov5eS/9WhofZR8M8dMHWN2gTxjMsG4jd04YhpbPR91fUj7rYQ2/XjeHCJWbg7X++ApRIU9NUwM2vTvLA==", 174 | "dev": true 175 | }, 176 | "node_modules/vscode-textmate": { 177 | "version": "5.2.0", 178 | "resolved": "https://registry.npmjs.org/vscode-textmate/-/vscode-textmate-5.2.0.tgz", 179 | "integrity": "sha512-Uw5ooOQxRASHgu6C7GVvUxisKXfSgW4oFlO+aa+PAkgmH89O3CXxEEzNRNtHSqtXFTl0nAC1uYj0GMSH27uwtQ==", 180 | "dev": true 181 | }, 182 | "node_modules/wrappy": { 183 | "version": "1.0.2", 184 | "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", 185 | "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=", 186 | "dev": true 187 | } 188 | }, 189 | "dependencies": { 190 | "@types/node": { 191 | "version": "16.11.38", 192 | "resolved": "https://registry.npmjs.org/@types/node/-/node-16.11.38.tgz", 193 | "integrity": "sha512-hjO/0K140An3GWDw2HJfq7gko3wWeznbjXgg+rzPdVzhe198hp4x2i1dgveAOEiFKd8sOilAxzoSJiVv5P/CUg==", 194 | "dev": true 195 | }, 196 | "balanced-match": { 197 | "version": "1.0.2", 198 | "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", 199 | "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", 200 | "dev": true 201 | }, 202 | "brace-expansion": { 203 | "version": "2.0.1", 204 | "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", 205 | "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", 206 | "dev": true, 207 | 
"requires": { 208 | "balanced-match": "^1.0.0" 209 | } 210 | }, 211 | "fs.realpath": { 212 | "version": "1.0.0", 213 | "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", 214 | "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", 215 | "dev": true 216 | }, 217 | "glob": { 218 | "version": "8.0.3", 219 | "resolved": "https://registry.npmjs.org/glob/-/glob-8.0.3.tgz", 220 | "integrity": "sha512-ull455NHSHI/Y1FqGaaYFaLGkNMMJbavMrEGFXG/PGrg6y7sutWHUHrz6gy6WEBH6akM1M414dWKCNs+IhKdiQ==", 221 | "dev": true, 222 | "requires": { 223 | "fs.realpath": "^1.0.0", 224 | "inflight": "^1.0.4", 225 | "inherits": "2", 226 | "minimatch": "^5.0.1", 227 | "once": "^1.3.0" 228 | } 229 | }, 230 | "inflight": { 231 | "version": "1.0.6", 232 | "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", 233 | "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==", 234 | "dev": true, 235 | "requires": { 236 | "once": "^1.3.0", 237 | "wrappy": "1" 238 | } 239 | }, 240 | "inherits": { 241 | "version": "2.0.4", 242 | "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", 243 | "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", 244 | "dev": true 245 | }, 246 | "jsonc-parser": { 247 | "version": "3.0.0", 248 | "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.0.0.tgz", 249 | "integrity": "sha512-fQzRfAbIBnR0IQvftw9FJveWiHp72Fg20giDrHz6TdfB12UH/uue0D3hm57UB5KgAVuniLMCaS8P1IMj9NR7cA==", 250 | "dev": true 251 | }, 252 | "lunr": { 253 | "version": "2.3.9", 254 | "resolved": "https://registry.npmjs.org/lunr/-/lunr-2.3.9.tgz", 255 | "integrity": "sha512-zTU3DaZaF3Rt9rhN3uBMGQD3dD2/vFQqnvZCDv4dl5iOzq2IZQqTxu90r4E5J+nP70J3ilqVCrbho2eWaeW8Ow==", 256 | "dev": true 257 | }, 258 | "marked": { 259 | "version": "4.0.16", 260 | "resolved": 
"https://registry.npmjs.org/marked/-/marked-4.0.16.tgz", 261 | "integrity": "sha512-wahonIQ5Jnyatt2fn8KqF/nIqZM8mh3oRu2+l5EANGMhu6RFjiSG52QNE2eWzFMI94HqYSgN184NurgNG6CztA==", 262 | "dev": true 263 | }, 264 | "minimatch": { 265 | "version": "5.1.0", 266 | "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.0.tgz", 267 | "integrity": "sha512-9TPBGGak4nHfGZsPBohm9AWg6NoT7QTCehS3BIJABslyZbzxfV78QM2Y6+i741OPZIafFAaiiEMh5OyIrJPgtg==", 268 | "dev": true, 269 | "requires": { 270 | "brace-expansion": "^2.0.1" 271 | } 272 | }, 273 | "once": { 274 | "version": "1.4.0", 275 | "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", 276 | "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", 277 | "dev": true, 278 | "requires": { 279 | "wrappy": "1" 280 | } 281 | }, 282 | "shiki": { 283 | "version": "0.10.1", 284 | "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.10.1.tgz", 285 | "integrity": "sha512-VsY7QJVzU51j5o1+DguUd+6vmCmZ5v/6gYu4vyYAhzjuNQU6P/vmSy4uQaOhvje031qQMiW0d2BwgMH52vqMng==", 286 | "dev": true, 287 | "requires": { 288 | "jsonc-parser": "^3.0.0", 289 | "vscode-oniguruma": "^1.6.1", 290 | "vscode-textmate": "5.2.0" 291 | } 292 | }, 293 | "typedoc": { 294 | "version": "0.22.17", 295 | "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.22.17.tgz", 296 | "integrity": "sha512-h6+uXHVVCPDaANzjwzdsj9aePBjZiBTpiMpBBeyh1zcN2odVsDCNajz8zyKnixF93HJeGpl34j/70yoEE5BfNg==", 297 | "dev": true, 298 | "requires": { 299 | "glob": "^8.0.3", 300 | "lunr": "^2.3.9", 301 | "marked": "^4.0.16", 302 | "minimatch": "^5.1.0", 303 | "shiki": "^0.10.1" 304 | } 305 | }, 306 | "typescript": { 307 | "version": "4.7.3", 308 | "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.7.3.tgz", 309 | "integrity": "sha512-WOkT3XYvrpXx4vMMqlD+8R8R37fZkjyLGlxavMc4iB8lrl8L0DeTcHbYgw/v0N/z9wAFsgBhcsF0ruoySS22mA==", 310 | "dev": true, 311 | "peer": true 312 | }, 313 | "vscode-oniguruma": { 314 | 
"version": "1.6.2", 315 | "resolved": "https://registry.npmjs.org/vscode-oniguruma/-/vscode-oniguruma-1.6.2.tgz", 316 | "integrity": "sha512-KH8+KKov5eS/9WhofZR8M8dMHWN2gTxjMsG4jd04YhpbPR91fUj7rYQ2/XjeHCJWbg7X++ApRIU9NUwM2vTvLA==", 317 | "dev": true 318 | }, 319 | "vscode-textmate": { 320 | "version": "5.2.0", 321 | "resolved": "https://registry.npmjs.org/vscode-textmate/-/vscode-textmate-5.2.0.tgz", 322 | "integrity": "sha512-Uw5ooOQxRASHgu6C7GVvUxisKXfSgW4oFlO+aa+PAkgmH89O3CXxEEzNRNtHSqtXFTl0nAC1uYj0GMSH27uwtQ==", 323 | "dev": true 324 | }, 325 | "wrappy": { 326 | "version": "1.0.2", 327 | "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", 328 | "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=", 329 | "dev": true 330 | } 331 | } 332 | } 333 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@y21/tljs", 3 | "version": "0.4.0", 4 | "description": "A very fast HTML5 parser", 5 | "main": "node/dist/index.js", 6 | "scripts": { 7 | "test": "node test/runner.js", 8 | "build-rust": "cd bindings && cargo b --release --target wasm32-unknown-unknown && cp target/wasm32-unknown-unknown/release/bindings.wasm ../node/dist/", 9 | "build-ts": "tsc", 10 | "build": "npm run build-rust && npm run build-ts" 11 | }, 12 | "keywords": [], 13 | "author": "", 14 | "license": "ISC", 15 | "devDependencies": { 16 | "@types/node": "^16.11.38", 17 | "typedoc": "^0.22.9" 18 | }, 19 | "types": "node/dist/index.d.ts" 20 | } 21 | -------------------------------------------------------------------------------- /test/dom.js: -------------------------------------------------------------------------------- 1 | const tljs = require('../'); 2 | const assert = require('assert'); 3 | 4 | module.exports = async function(html) { 5 | const dom = await tljs.parse(html); 6 | 7 | const children = dom.children(); 8 | assert(children.length() > 0); 9 | 
assert(children.at(0) !== null); 10 | 11 | const element = dom.getElementById('mw-content-text'); 12 | assert(element !== null); 13 | assert(element.innerText().length > 0); 14 | 15 | const query = dom.querySelector('div#mw-content-text'); 16 | assert(query.innerText() === element.innerText()); 17 | 18 | assert(dom.version() === tljs.HTMLVersion.HTML5); 19 | }; 20 | -------------------------------------------------------------------------------- /test/example.js: -------------------------------------------------------------------------------- 1 | const tljs = require('../'); 2 | const assert = require('assert'); 3 | 4 | module.exports = async function() { 5 | const dom = await tljs.parse(` 6 | 7 |
8 |

Hello World

9 | 10 |
11 | `); 12 | 13 | assert(dom.getElementById('img').asTag().attributes().get('src') === 'image.png'); 14 | assert(dom.getElementById('greeting').asTag().innerText() === 'Hello World'); 15 | assert(dom.querySelector('p#greeting').asTag().innerText() === 'Hello World'); 16 | assert(dom.version() === tljs.HTMLVersion.HTML5); 17 | } 18 | -------------------------------------------------------------------------------- /test/parse.js: -------------------------------------------------------------------------------- 1 | const tljs = require('../'); 2 | 3 | module.exports = async function(html) { 4 | await tljs.parse(html); 5 | }; 6 | -------------------------------------------------------------------------------- /test/runner.js: -------------------------------------------------------------------------------- 1 | const tljs = require('../'); 2 | const fs = require('fs'); 3 | const exclude = ['runner.js', 'setup.js']; 4 | const files = fs.readdirSync(__dirname).filter(x => !exclude.includes(x) && x.endsWith('.js')); 5 | const input = fs.readFileSync(`${__dirname}/test.html`, 'utf8'); 6 | tljs.initializeWasmSync(); 7 | 8 | (async () => { 9 | for (const file of files) { 10 | const mod = require(`./${file}`); 11 | console.time(file); 12 | try { 13 | await mod(input); 14 | } catch(e) { 15 | console.log(`[${file}] Failed: ${e.stack}`); 16 | } finally { 17 | console.timeEnd(file); 18 | } 19 | } 20 | })(); 21 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es2021", 4 | "module": "commonjs", 5 | "outDir": "./node/dist", 6 | "esModuleInterop": true, 7 | "forceConsistentCasingInFileNames": true, 8 | "strict": true, 9 | "skipLibCheck": true, 10 | "declaration": true 11 | }, 12 | "include": [ 13 | "node/lib/*.ts" 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- 
/typedoc.json: -------------------------------------------------------------------------------- 1 | { 2 | "entryPoints": ["node/lib"], 3 | "out": "docs/", 4 | "readme": "README.md", 5 | "name": "tljs", 6 | "tsconfig": "./tsconfig.json" 7 | } --------------------------------------------------------------------------------