├── html5ever ├── fuzz │ ├── .gitignore │ ├── Cargo.toml │ └── fuzz_targets │ │ └── fuzz_document_parse.rs ├── data │ └── bench │ │ ├── tiny-fragment.html │ │ ├── small-fragment.html │ │ ├── strong.html │ │ ├── lipsum-zh.html │ │ └── medium-fragment.html ├── Cargo.toml ├── src │ ├── lib.rs │ ├── macros.rs │ ├── util │ │ └── str.rs │ ├── tree_builder │ │ ├── types.rs │ │ ├── tag_sets.rs │ │ └── data.rs │ ├── tokenizer │ │ ├── states.rs │ │ └── interface.rs │ └── driver.rs ├── LICENSE-MIT ├── examples │ ├── noop-tokenize.rs │ ├── capi │ │ └── tokenize.c │ ├── tokenize.rs │ ├── noop-tree-builder.rs │ └── print-tree-actions.rs └── benches │ └── html5ever.rs ├── xml5ever ├── fuzz │ ├── .gitignore │ ├── Cargo.toml │ └── fuzz_targets │ │ └── fuzz_document_parse.rs ├── examples │ ├── example.xml │ ├── simple_xml_tokenizer.rs │ ├── xml_tokenizer.rs │ └── README.md ├── src │ ├── macros.rs │ ├── tree_builder │ │ └── types.rs │ ├── lib.rs │ ├── tokenizer │ │ ├── qname.rs │ │ ├── interface.rs │ │ └── states.rs │ ├── driver.rs │ └── serialize │ │ └── mod.rs ├── Cargo.toml ├── data │ └── bench │ │ └── strong.xml ├── LICENSE-MIT ├── benches │ └── xml5ever.rs └── README.md ├── rustfmt.toml ├── .gitignore ├── .gitmodules ├── rcdom ├── data │ └── test │ │ └── ignore ├── README.md ├── tests │ ├── html-driver.rs │ ├── util │ │ ├── find_tests.rs │ │ └── runner.rs │ ├── foreach_html5lib_test │ │ └── mod.rs │ ├── xml-driver.rs │ ├── html-tree-sink.rs │ └── xml-tree-builder.rs ├── Cargo.toml ├── LICENSE-MIT ├── examples │ ├── hello_xml.rs │ ├── xml_tree_printer.rs │ ├── html2html.rs │ └── print-rcdom.rs └── custom-html5lib-tokenizer-tests │ └── regression.test ├── COPYRIGHT ├── markup5ever ├── Cargo.toml ├── LICENSE-MIT ├── lib.rs ├── util │ └── smallcharset.rs └── serialize.rs ├── .github ├── dependabot.yml └── workflows │ └── main.yml ├── web_atoms ├── Cargo.toml ├── LICENSE-MIT ├── lib.rs └── build.rs ├── AUTHORS ├── tendril ├── Cargo.toml ├── LICENSE-MIT ├── src │ ├── lib.rs │ ├── 
util.rs │ ├── buf32.rs │ └── utf8_decode.rs ├── benches │ ├── futf.rs │ └── tendril.rs ├── README.md └── examples │ └── fuzz.rs ├── LICENSE-MIT ├── Cargo.toml ├── RELEASING.MD └── README.md /html5ever/fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | corpus 3 | artifacts 4 | -------------------------------------------------------------------------------- /xml5ever/fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | corpus 3 | artifacts 4 | -------------------------------------------------------------------------------- /html5ever/data/bench/tiny-fragment.html: -------------------------------------------------------------------------------- 1 |

Hello, world!

2 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | match_block_trailing_comma = true 2 | reorder_imports = true 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /data/bench/uncommitted 2 | target 3 | .idea 4 | .vscode 5 | Cargo.lock 6 | *.racertmp 7 | -------------------------------------------------------------------------------- /xml5ever/examples/example.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | BobbyTables 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "html5lib-tests"] 2 | path = rcdom/html5lib-tests 3 | url = https://github.com/html5lib/html5lib-tests 4 | [submodule "xml5lib-tests"] 5 | path = rcdom/xml5lib-tests 6 | url = https://github.com/Ygg01/xml5lib-tests 7 | -------------------------------------------------------------------------------- /rcdom/data/test/ignore: -------------------------------------------------------------------------------- 1 | # Skipped tests for selectedcontent element 2 | # We do not implement the full DOM behavior of selectedcontent 3 | # See: https://github.com/html5lib/html5lib-tests/issues/180 4 | tb: webkit02.dat-44 5 | tb: webkit02.dat-45 6 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | This project is Copyright 2014-2017 , The html5ever Project Developers (given in 2 | the file AUTHORS). 3 | 4 | Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 5 | http://www.apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or 6 | http://opensource.org/licenses/MIT>, at your option.
All files in the project 7 | carrying such notice may not be copied, modified, or distributed except 8 | according to those terms. 9 | -------------------------------------------------------------------------------- /rcdom/README.md: -------------------------------------------------------------------------------- 1 | # markup5ever_rcdom 2 | 3 | This crate is built for the express purpose of writing automated tests for the `html5ever` 4 | and `xml5ever` crates. It is not intended to be a production-quality DOM implementation, 5 | and has not been fuzzed or tested against arbitrary, malicious, or nontrivial inputs. No maintenance 6 | or support for any such issues will be provided. If you use this DOM implementation in a production, 7 | user-facing system, you do so at your own risk. 8 | -------------------------------------------------------------------------------- /html5ever/fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | 2 | [package] 3 | name = "html5ever-fuzz" 4 | version = "0.0.0" 5 | authors = ["David Korczynski "] 6 | publish = false 7 | edition = "2018" 8 | 9 | [package.metadata] 10 | cargo-fuzz = true 11 | 12 | [dependencies] 13 | libfuzzer-sys = "0.4.0" 14 | 15 | [dependencies.html5ever] 16 | path = ".." 
17 | 18 | [dependencies.markup5ever_rcdom] 19 | path = "../../rcdom/" 20 | 21 | # Prevent this from interfering with workspaces 22 | [workspace] 23 | members = ["."] 24 | 25 | [[bin]] 26 | name = "fuzz_document_parse" 27 | path = "fuzz_targets/fuzz_document_parse.rs" 28 | -------------------------------------------------------------------------------- /xml5ever/fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | 2 | [package] 3 | name = "xml5ever-fuzz" 4 | version = "0.0.0" 5 | authors = ["David Korczynski "] 6 | publish = false 7 | edition = "2018" 8 | 9 | [package.metadata] 10 | cargo-fuzz = true 11 | 12 | [dependencies] 13 | libfuzzer-sys = "0.4.0" 14 | 15 | [dependencies.xml5ever] 16 | path = ".." 17 | 18 | [dependencies.markup5ever_rcdom] 19 | path = "../../rcdom/" 20 | 21 | # Prevent this from interfering with workspaces 22 | [workspace] 23 | members = ["."] 24 | 25 | [[bin]] 26 | name = "fuzz_document_parse" 27 | path = "fuzz_targets/fuzz_document_parse.rs" 28 | -------------------------------------------------------------------------------- /markup5ever/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "markup5ever" 3 | description = "Common code for xml5ever and html5ever" 4 | documentation = "https://docs.rs/markup5ever" 5 | categories = [ "parser-implementations", "web-programming" ] 6 | version.workspace = true 7 | license.workspace = true 8 | authors.workspace = true 9 | repository.workspace = true 10 | edition.workspace = true 11 | rust-version.workspace = true 12 | 13 | [lib] 14 | path = "lib.rs" 15 | 16 | [features] 17 | serde = ["web_atoms/serde"] 18 | 19 | [dependencies] 20 | web_atoms = { workspace = true } 21 | tendril = { workspace = true } 22 | log = { workspace = true } 23 | -------------------------------------------------------------------------------- /.github/dependabot.yml: 
-------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "cargo" 9 | directory: "/" 10 | schedule: 11 | interval: "weekly" 12 | 13 | - package-ecosystem: gitsubmodule 14 | directory: "/" 15 | schedule: 16 | interval: weekly 17 | open-pull-requests-limit: 10 18 | -------------------------------------------------------------------------------- /xml5ever/src/macros.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014-2017 The html5ever Project Developers. See the 2 | // COPYRIGHT file at the top-level directory of this distribution. 3 | // 4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 5 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 6 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 7 | // option. This file may not be copied, modified, or distributed 8 | // except according to those terms. 9 | 10 | macro_rules! 
time { 11 | ($e:expr) => {{ 12 | let t0 = ::std::time::Instant::now(); 13 | let result = $e; 14 | let dt = t0.elapsed().as_nanos() as u64; 15 | (result, dt) 16 | }}; 17 | } 18 | pub(crate) use time; 19 | -------------------------------------------------------------------------------- /web_atoms/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "web_atoms" 3 | version = "0.2.0" 4 | authors = [ "The html5ever Project Developers" ] 5 | license = "MIT OR Apache-2.0" 6 | repository = "https://github.com/servo/html5ever" 7 | description = "Atoms for xml5ever and html5ever" 8 | documentation = "https://docs.rs/web_atoms" 9 | build = "build.rs" 10 | categories = [ "web-programming" ] 11 | edition = "2021" 12 | rust-version.workspace = true 13 | 14 | [lib] 15 | path = "lib.rs" 16 | 17 | [features] 18 | serde = ["string_cache/serde_support"] 19 | 20 | [dependencies] 21 | string_cache = { workspace = true } 22 | phf = { workspace = true } 23 | 24 | [build-dependencies] 25 | string_cache_codegen = { workspace = true } 26 | phf_codegen = { workspace = true } 27 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | This software was written by the following people: 2 | 3 | Adam Roben 4 | Akos Kiss 5 | Wojciech "Zarazek" Wiśniewski 6 | Chris Paris 7 | Clark Gaebel 8 | Daniel Fath 9 | Huon Wilson 10 | glennw 11 | Josh Matthews 12 | György Andrasek 13 | Keegan McAllister 14 | Eunchong Yu 15 | Manish Goregaokar 16 | Chris Morgan 17 | Mátyás Mustoha 18 | Patrick Walton 19 | Renato Zannon 20 | Simon Sapin 21 | -------------------------------------------------------------------------------- /html5ever/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "html5ever" 3 | description = "High-performance browser-grade HTML5 parser" 4 | 
documentation = "https://docs.rs/html5ever" 5 | categories = [ "parser-implementations", "web-programming" ] 6 | keywords = ["html", "html5", "parser", "parsing"] 7 | readme = "../README.md" 8 | version.workspace = true 9 | license.workspace = true 10 | authors.workspace = true 11 | repository.workspace = true 12 | edition.workspace = true 13 | rust-version.workspace = true 14 | 15 | [features] 16 | trace_tokenizer = [] 17 | serde = ["markup5ever/serde"] 18 | 19 | [dependencies] 20 | markup5ever = { workspace = true } 21 | log = { workspace = true } 22 | 23 | [dev-dependencies] 24 | criterion = { workspace = true } 25 | typed-arena = { workspace = true } 26 | 27 | [[bench]] 28 | name = "html5ever" 29 | harness = false 30 | -------------------------------------------------------------------------------- /html5ever/data/bench/small-fragment.html: -------------------------------------------------------------------------------- 1 |

In July 1992, the X/Open committee XoJIG was looking for a better encoding. Dave Prosser of Unix System Laboratories 2 | submitted a proposal for one that had faster implementation 3 | characteristics and introduced the improvement that 7-bit ASCII 4 | characters would only represent themselves; all multibyte 5 | sequences would include only bytes where the high bit was set. This 6 | original proposal, FSS-UTF (File System Safe UCS Transformation Format), 7 | was similar in concept to UTF-8, but lacked the crucial property of self-synchronization. 8 | -------------------------------------------------------------------------------- /xml5ever/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "xml5ever" 3 | authors = ["The xml5ever project developers"] 4 | description = "Push based streaming parser for XML." 5 | documentation = "https://docs.rs/xml5ever" 6 | homepage = "https://github.com/servo/html5ever/blob/main/xml5ever/README.md" 7 | readme = "README.md" 8 | keywords = ["xml", "xml5", "parser", "parsing"] 9 | exclude = ["xml5lib-tests/*"] 10 | categories = ["parser-implementations", "web-programming"] 11 | version.workspace = true 12 | license.workspace = true 13 | repository.workspace = true 14 | edition.workspace = true 15 | rust-version.workspace = true 16 | 17 | [features] 18 | trace_tokenizer = [] 19 | serde = ["markup5ever/serde"] 20 | 21 | [dependencies] 22 | markup5ever = { workspace = true } 23 | log = { workspace = true } 24 | 25 | [dev-dependencies] 26 | criterion = { workspace = true } 27 | 28 | [[bench]] 29 | name = "xml5ever" 30 | harness = false 31 | -------------------------------------------------------------------------------- /tendril/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tendril" 3 | version = "0.4.3" 4 | description = "Compact buffer/string type for zero-copy parsing" 5 | authors = [ 6 | "Keegan 
McAllister ", 7 | "Simon Sapin ", 8 | "Chris Morgan " 9 | ] 10 | license.workspace = true 11 | repository.workspace = true 12 | rust-version.workspace = true 13 | edition.workspace = true 14 | readme = "README.md" 15 | 16 | [dependencies] 17 | encoding = { workspace = true, optional = true} 18 | encoding_rs = { workspace = true, optional = true} 19 | new_debug_unreachable = { workspace = true } 20 | utf-8 = { workspace = true } 21 | 22 | [dev-dependencies] 23 | rand = { workspace = true } 24 | criterion = { workspace = true } 25 | tendril = { workspace = true } 26 | 27 | [[bench]] 28 | name = "futf" 29 | harness = false 30 | 31 | [[bench]] 32 | name = "tendril" 33 | harness = false 34 | 35 | 36 | -------------------------------------------------------------------------------- /html5ever/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014-2017 The html5ever Project Developers. See the 2 | // COPYRIGHT file at the top-level directory of this distribution. 3 | // 4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 5 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 6 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 7 | // option. This file may not be copied, modified, or distributed 8 | // except according to those terms.
9 | 10 | #![crate_name = "html5ever"] 11 | #![crate_type = "dylib"] 12 | #![cfg_attr(test, deny(warnings))] 13 | #![allow(unused_parens)] 14 | #![warn(unreachable_pub)] 15 | 16 | pub use driver::{parse_document, parse_fragment, ParseOpts, Parser}; 17 | pub use markup5ever::*; 18 | 19 | pub use serialize::serialize; 20 | 21 | mod util { 22 | pub(crate) mod str; 23 | } 24 | 25 | pub(crate) mod macros; 26 | 27 | pub mod driver; 28 | pub mod serialize; 29 | pub mod tokenizer; 30 | pub mod tree_builder; 31 | -------------------------------------------------------------------------------- /html5ever/data/bench/strong.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xml5ever/data/bench/strong.xml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rcdom/tests/html-driver.rs: -------------------------------------------------------------------------------- 1 | use html5ever::driver; 2 | use html5ever::serialize; 3 | use html5ever::tendril::TendrilSink; 4 | use markup5ever_rcdom::{RcDom, SerializableHandle}; 5 | 6 | #[test] 7 | fn from_utf8() { 8 | let dom = driver::parse_document(RcDom::default(), Default::default()) 9 | .from_utf8() 10 | .one("Test".as_bytes()); 11 | let mut serialized = Vec::new(); 12 | let document: SerializableHandle = dom.document.clone().into(); 13 | serialize::serialize(&mut serialized, &document, Default::default()).unwrap(); 14 | assert_eq!( 15 | String::from_utf8(serialized).unwrap().replace(' ', ""), 16 | "<html><head><title>Test" 17 | ); 18 | } 19 | 20 | #[test] 21 | fn many_templates() { 22 | let mut body = String::new(); 23 | for _ in 1..10000 { 24 | body.push_str("