├── afl-fuzz ├── .gitignore ├── in │ ├── 1.xml │ ├── 2.xml │ ├── 3.xml │ ├── 4.xml │ ├── 5.xml │ └── 6.xml ├── Cargo.toml ├── README.md └── src │ └── main.rs ├── .gitignore ├── fuzz ├── .gitignore ├── README.md ├── fuzz_targets │ └── fuzz_xml.rs └── Cargo.toml ├── tests └── integration │ ├── main.rs │ ├── api.rs │ ├── text.rs │ ├── comments.rs │ ├── cdata.rs │ ├── document.rs │ ├── token.rs │ ├── pi.rs │ ├── doctype.rs │ └── elements.rs ├── Cargo.toml ├── examples └── parse.rs ├── README.tpl ├── .github └── workflows │ └── ci.yml ├── LICENSE-MIT ├── src ├── strspan.rs ├── xmlchar.rs ├── error.rs ├── stream.rs └── lib.rs ├── README.md ├── CHANGELOG.md └── LICENSE-APACHE /afl-fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | /out 2 | -------------------------------------------------------------------------------- /afl-fuzz/in/1.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /afl-fuzz/in/2.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /afl-fuzz/in/3.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | .idea 4 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | corpus 3 | artifacts 4 | -------------------------------------------------------------------------------- /afl-fuzz/in/4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
-------------------------------------------------------------------------------- /afl-fuzz/in/5.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /afl-fuzz/in/6.xml: -------------------------------------------------------------------------------- 1 | 3 | ]> 4 | 5 | -------------------------------------------------------------------------------- /fuzz/README.md: -------------------------------------------------------------------------------- 1 | ## Prepare 2 | 3 | ``` 4 | cargo install cargo-fuzz 5 | ``` 6 | 7 | ## Run 8 | 9 | ``` 10 | cd .. 11 | cargo +nightly fuzz run fuzz_xml 12 | ``` 13 | -------------------------------------------------------------------------------- /afl-fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "afl-fuzz" 3 | version = "0.1.0" 4 | authors = ["RazrFalcon "] 5 | 6 | [dependencies] 7 | afl = "0.5" 8 | xmlparser = { path = ".." 
} 9 | -------------------------------------------------------------------------------- /afl-fuzz/README.md: -------------------------------------------------------------------------------- 1 | ## Prepare 2 | 3 | ``` 4 | cargo install afl 5 | ``` 6 | 7 | ## Run 8 | 9 | ``` 10 | cargo afl build 11 | cargo afl fuzz -i in -o out target/debug/afl-fuzz 12 | ``` 13 | -------------------------------------------------------------------------------- /tests/integration/main.rs: -------------------------------------------------------------------------------- 1 | extern crate xmlparser as xml; 2 | 3 | #[macro_use] 4 | mod token; 5 | 6 | mod api; 7 | mod cdata; 8 | mod comments; 9 | mod doctype; 10 | mod document; 11 | mod elements; 12 | mod pi; 13 | mod text; 14 | -------------------------------------------------------------------------------- /afl-fuzz/src/main.rs: -------------------------------------------------------------------------------- 1 | extern crate afl; 2 | extern crate xmlparser; 3 | 4 | use std::str; 5 | 6 | use afl::fuzz; 7 | 8 | fn main() { 9 | fuzz!(|data: &[u8]| { 10 | if let Ok(text) = str::from_utf8(data) { 11 | for _ in xmlparser::Tokenizer::from(text) {} 12 | } 13 | }); 14 | } 15 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/fuzz_xml.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | #[macro_use] extern crate libfuzzer_sys; 4 | extern crate xmlparser; 5 | 6 | use std::str; 7 | 8 | fuzz_target!(|data: &[u8]| { 9 | if let Ok(text) = str::from_utf8(data) { 10 | let mut n = 0; 11 | for _ in xmlparser::Tokenizer::from(text) { 12 | n += 1; 13 | 14 | if n == 1000 { 15 | panic!("endless loop"); 16 | } 17 | } 18 | } 19 | }); 20 | -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "xmlparser-fuzz" 3 | 
version = "0.0.1" 4 | authors = ["Automatically generated"] 5 | publish = false 6 | 7 | [package.metadata] 8 | cargo-fuzz = true 9 | 10 | [dependencies.xmlparser] 11 | path = ".." 12 | 13 | [dependencies.libfuzzer-sys] 14 | git = "https://github.com/rust-fuzz/libfuzzer-sys.git" 15 | 16 | # Prevent this from interfering with workspaces 17 | [workspace] 18 | members = ["."] 19 | 20 | [[bin]] 21 | name = "fuzz_xml" 22 | path = "fuzz_targets/fuzz_xml.rs" 23 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "xmlparser" 3 | version = "0.13.6" 4 | authors = ["Yevhenii Reizner "] 5 | edition = "2018" 6 | description = "Pull-based, zero-allocation XML parser." 7 | documentation = "https://docs.rs/xmlparser" 8 | readme = "README.md" 9 | homepage = "https://github.com/RazrFalcon/xmlparser" 10 | repository = "https://github.com/RazrFalcon/xmlparser" 11 | license = "MIT OR Apache-2.0" 12 | keywords = ["parser", "tokenizer", "xml"] 13 | categories = ["parser-implementations"] 14 | 15 | [features] 16 | default = ["std"] 17 | std = [] 18 | -------------------------------------------------------------------------------- /examples/parse.rs: -------------------------------------------------------------------------------- 1 | extern crate xmlparser as xml; 2 | 3 | use std::env; 4 | use std::fs; 5 | use std::io::Read; 6 | 7 | fn main() { 8 | let args = env::args().collect::>(); 9 | if args.len() != 2 { 10 | println!("Usage: parse file.xml"); 11 | return; 12 | } 13 | 14 | let text = load_file(&args[1]); 15 | 16 | if let Err(e) = parse(&text) { 17 | println!("Error: {}.", e); 18 | } 19 | } 20 | 21 | fn parse(text: &str) -> Result<(), xml::Error> { 22 | for token in xml::Tokenizer::from(text) { 23 | println!("{:?}", token?); 24 | } 25 | 26 | Ok(()) 27 | } 28 | 29 | fn load_file(path: &str) -> String { 30 | let mut file = 
fs::File::open(path).unwrap(); 31 | let mut text = String::new(); 32 | file.read_to_string(&mut text).unwrap(); 33 | text 34 | } 35 | -------------------------------------------------------------------------------- /tests/integration/api.rs: -------------------------------------------------------------------------------- 1 | extern crate xmlparser; 2 | 3 | use xmlparser::*; 4 | 5 | #[test] 6 | fn text_pos_1() { 7 | let mut s = Stream::from("text"); 8 | s.advance(2); 9 | assert_eq!(s.gen_text_pos(), TextPos::new(1, 3)); 10 | } 11 | 12 | #[test] 13 | fn text_pos_2() { 14 | let mut s = Stream::from("text\ntext"); 15 | s.advance(6); 16 | assert_eq!(s.gen_text_pos(), TextPos::new(2, 2)); 17 | } 18 | 19 | #[test] 20 | fn text_pos_3() { 21 | let mut s = Stream::from("текст\nтекст"); 22 | s.advance(15); 23 | assert_eq!(s.gen_text_pos(), TextPos::new(2, 3)); 24 | } 25 | 26 | #[test] 27 | fn token_size() { 28 | assert!(::std::mem::size_of::() <= 196); 29 | } 30 | 31 | #[test] 32 | fn span_size() { 33 | assert!(::std::mem::size_of::() <= 48); 34 | } 35 | 36 | #[test] 37 | fn err_size_1() { 38 | assert!(::std::mem::size_of::() <= 64); 39 | } 40 | 41 | #[test] 42 | fn err_size_2() { 43 | assert!(::std::mem::size_of::() <= 64); 44 | } 45 | -------------------------------------------------------------------------------- /README.tpl: -------------------------------------------------------------------------------- 1 | ## {{crate}} 2 | [![Build Status](https://travis-ci.org/RazrFalcon/{{crate}}.svg?branch=master)](https://travis-ci.org/RazrFalcon/{{crate}}) 3 | [![Crates.io](https://img.shields.io/crates/v/{{crate}}.svg)](https://crates.io/crates/{{crate}}) 4 | [![Documentation](https://docs.rs/{{crate}}/badge.svg)](https://docs.rs/{{crate}}) 5 | [![Rust 1.31+](https://img.shields.io/badge/rust-1.31+-orange.svg)](https://www.rust-lang.org) 6 | 7 | {{readme}} 8 | 9 | ### License 10 | 11 | Licensed under either of 12 | 13 | - Apache License, Version 2.0 14 | 
([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 15 | - MIT license 16 | ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 17 | 18 | at your option. 19 | 20 | ### Contribution 21 | 22 | Unless you explicitly state otherwise, any contribution intentionally submitted 23 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall be 24 | dual licensed as above, without any additional terms or conditions. 25 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: {} 5 | push: 6 | branches: 7 | - master 8 | schedule: 9 | - cron: '43 20 * * 3' 10 | 11 | concurrency: 12 | group: $-$ 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | msrv: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | - uses: dtolnay/rust-toolchain@1.31 21 | - run: cargo build --lib 22 | 23 | test: 24 | runs-on: ubuntu-latest 25 | steps: 26 | - uses: actions/checkout@v4 27 | - uses: dtolnay/rust-toolchain@stable 28 | - run: cargo test --all-targets 29 | - run: cargo test --doc 30 | 31 | clippy: 32 | runs-on: ubuntu-latest 33 | steps: 34 | - uses: actions/checkout@v4 35 | - uses: dtolnay/rust-toolchain@stable 36 | with: 37 | components: clippy 38 | - run: cargo clippy --all-features --all-targets -- -D warnings 39 | 40 | rustfmt: 41 | runs-on: ubuntu-latest 42 | steps: 43 | - uses: actions/checkout@v4 44 | - uses: dtolnay/rust-toolchain@stable 45 | with: 46 | components: rustfmt 47 | - run: cargo fmt --check --all 48 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 Reizner Evgeniy 4 | 5 | Permission is hereby granted, free of charge, to any person 
obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/integration/text.rs: -------------------------------------------------------------------------------- 1 | use crate::token::*; 2 | 3 | test!( 4 | text_01, 5 | "

text

text

欄

\r\n\t

]]>

", 70 | Token::ElementStart("", "p", 0..2), 71 | Token::ElementEnd(ElementEnd::Open, 2..3), 72 | Token::Error( 73 | "invalid character data at 1:4 cause ']]>' is not allowed inside a character data" 74 | .to_string() 75 | ) 76 | ); 77 | 78 | test!( 79 | text_err_02, 80 | "

\u{0c}

", 81 | Token::ElementStart("", "p", 0..2), 82 | Token::ElementEnd(ElementEnd::Open, 2..3), 83 | Token::Error( 84 | "invalid character data at 1:4 cause a non-XML character '\\u{c}' found at 1:4".to_string() 85 | ) 86 | ); 87 | -------------------------------------------------------------------------------- /tests/integration/comments.rs: -------------------------------------------------------------------------------- 1 | use crate::token::*; 2 | 3 | test!( 4 | comment_01, 5 | "", 6 | Token::Comment("comment", 0..14) 7 | ); 8 | test!(comment_02, "", Token::Comment("", 0..13)); 9 | test!(comment_03, "", Token::Comment("", Token::Comment("", Token::Comment("<", Token::Comment("<", Token::Comment("-->", Token::Comment("<>", 0..9)); 15 | test!(comment_09, "", Token::Comment("<", 0..8)); 16 | test!(comment_10, "", Token::Comment("", Token::Comment("", 0..7)); 18 | 19 | macro_rules! test_err { 20 | ($name:ident, $text:expr) => { 21 | #[test] 22 | fn $name() { 23 | let mut p = xml::Tokenizer::from($text); 24 | assert!(p.next().unwrap().is_err()); 25 | } 26 | }; 27 | } 28 | 29 | test_err!(comment_err_01, ""); 30 | test_err!(comment_err_02, ""); 33 | test_err!(comment_err_05, ""); 35 | test_err!(comment_err_07, ""); 43 | test_err!(comment_err_15, ""); 48 | test_err!(comment_err_20, ""); 55 | test_err!(comment_err_27, ""); 56 | test_err!(comment_err_28, ""); 57 | test_err!(comment_err_29, ""); 61 | test_err!(comment_err_33, ""); 62 | test_err!(comment_err_34, ""); 63 | test_err!(comment_err_35, ""); 64 | -------------------------------------------------------------------------------- /tests/integration/cdata.rs: -------------------------------------------------------------------------------- 1 | extern crate xmlparser as xml; 2 | 3 | use crate::token::*; 4 | 5 | test!( 6 | cdata_01, 7 | "

text]]>

]]>

\n \t

", 92 | Token::ElementStart("", "p", 0..2), 93 | Token::ElementEnd(ElementEnd::Open, 2..3), 94 | Token::Error( 95 | "invalid CDATA at 1:4 cause a non-XML character '\\u{1}' found at 1:13".to_string() 96 | ) 97 | ); 98 | -------------------------------------------------------------------------------- /src/strspan.rs: -------------------------------------------------------------------------------- 1 | use core::fmt; 2 | use core::ops::{Deref, Range}; 3 | 4 | /// A string slice. 5 | /// 6 | /// Like `&str`, but also contains the position in the input XML 7 | /// from which it was parsed. 8 | #[must_use] 9 | #[derive(Clone, Copy, PartialEq, Eq, Hash)] 10 | pub struct StrSpan<'a> { 11 | text: &'a str, 12 | start: usize, 13 | } 14 | 15 | impl<'a> From<&'a str> for StrSpan<'a> { 16 | #[inline] 17 | fn from(text: &'a str) -> Self { 18 | StrSpan { text, start: 0 } 19 | } 20 | } 21 | 22 | impl PartialEq for StrSpan<'_> { 23 | fn eq(&self, other: &str) -> bool { 24 | self.text == other 25 | } 26 | } 27 | 28 | impl PartialEq<&str> for StrSpan<'_> { 29 | fn eq(&self, other: &&str) -> bool { 30 | self.text == *other 31 | } 32 | } 33 | 34 | impl PartialEq> for str { 35 | fn eq(&self, other: &StrSpan<'_>) -> bool { 36 | self == other.text 37 | } 38 | } 39 | 40 | impl PartialEq> for &str { 41 | fn eq(&self, other: &StrSpan<'_>) -> bool { 42 | *self == other.text 43 | } 44 | } 45 | 46 | impl<'a> StrSpan<'a> { 47 | /// Constructs a new `StrSpan` from substring. 48 | #[inline] 49 | pub(crate) fn from_substr(text: &str, start: usize, end: usize) -> StrSpan { 50 | debug_assert!(start <= end); 51 | StrSpan { 52 | text: &text[start..end], 53 | start, 54 | } 55 | } 56 | 57 | /// Returns `true` if self is empty. 58 | pub fn is_empty(&self) -> bool { 59 | self.text.is_empty() 60 | } 61 | 62 | /// Returns the start position of the span. 63 | #[inline] 64 | pub fn start(&self) -> usize { 65 | self.start 66 | } 67 | 68 | /// Returns the end position of the span. 
69 | #[inline] 70 | pub fn end(&self) -> usize { 71 | self.start + self.text.len() 72 | } 73 | 74 | /// Returns the range of the span. 75 | #[inline] 76 | pub fn range(&self) -> Range { 77 | self.start..self.end() 78 | } 79 | 80 | /// Returns the span as a string slice. 81 | #[inline] 82 | pub fn as_str(&self) -> &'a str { 83 | self.text 84 | } 85 | 86 | /// Returns an underlying string region as `StrSpan`. 87 | #[inline] 88 | pub(crate) fn slice_region(&self, start: usize, end: usize) -> StrSpan<'a> { 89 | StrSpan::from_substr(self.text, start, end) 90 | } 91 | } 92 | 93 | impl fmt::Debug for StrSpan<'_> { 94 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 95 | write!( 96 | f, 97 | "StrSpan({:?} {}..{})", 98 | self.as_str(), 99 | self.start(), 100 | self.end() 101 | ) 102 | } 103 | } 104 | 105 | impl fmt::Display for StrSpan<'_> { 106 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 107 | write!(f, "{}", self.as_str()) 108 | } 109 | } 110 | 111 | impl Deref for StrSpan<'_> { 112 | type Target = str; 113 | 114 | fn deref(&self) -> &Self::Target { 115 | self.text 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /tests/integration/document.rs: -------------------------------------------------------------------------------- 1 | use std::str; 2 | 3 | use crate::token::*; 4 | 5 | test!(document_01, "",); 6 | 7 | test!(document_02, " ",); 8 | 9 | test!(document_03, " \n\t\r ",); 10 | 11 | // BOM 12 | test!( 13 | document_05, 14 | str::from_utf8(b"\xEF\xBB\xBF").unwrap(), 15 | Token::ElementStart("", "a", 3..5), 16 | Token::ElementEnd(ElementEnd::Empty, 5..7) 17 | ); 18 | 19 | test!( 20 | document_06, 21 | str::from_utf8(b"\xEF\xBB\xBF").unwrap(), 22 | Token::Declaration("1.0", None, None, 3..24) 23 | ); 24 | 25 | test!( 26 | document_07, 27 | "\n\n\ 28 | ", 29 | Token::Declaration("1.0", Some("utf-8"), None, 0..38), 30 | Token::Comment(" comment ", 39..55), 31 | Token::EmptyDtd( 32 | "svg", 33 | 
Some(ExternalId::Public( 34 | "-//W3C//DTD SVG 1.1//EN", 35 | "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd" 36 | )), 37 | 56..154 38 | ) 39 | ); 40 | 41 | test!( 42 | document_08, 43 | "\n\ 44 | ", 45 | Token::PI("xml-stylesheet", None, 0..18), 46 | Token::EmptyDtd( 47 | "svg", 48 | Some(ExternalId::Public( 49 | "-//W3C//DTD SVG 1.1//EN", 50 | "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd" 51 | )), 52 | 19..117 53 | ) 54 | ); 55 | 56 | test!( 57 | document_09, 58 | "\n\n\ 59 | ", 60 | Token::Declaration("1.0", Some("utf-8"), None, 0..38), 61 | Token::PI("xml-stylesheet", None, 39..57), 62 | Token::EmptyDtd( 63 | "svg", 64 | Some(ExternalId::Public( 65 | "-//W3C//DTD SVG 1.1//EN", 66 | "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd" 67 | )), 68 | 58..156 69 | ) 70 | ); 71 | 72 | test!( 73 | document_err_01, 74 | "", 75 | Token::Error("unknown token at 1:1".to_string()) 76 | ); 77 | 78 | test!( 79 | document_err_02, 80 | " &www---------Ӥ+----------w-----www_", 81 | Token::Error("unknown token at 1:2".to_string()) 82 | ); 83 | 84 | test!( 85 | document_err_03, 86 | "q", 87 | Token::Error("unknown token at 1:1".to_string()) 88 | ); 89 | 90 | test!( 91 | document_err_04, 92 | "", 93 | Token::Error("unknown token at 1:1".to_string()) 94 | ); 95 | 96 | test!( 97 | document_err_05, 98 | "", 99 | Token::EmptyDtd("greeting1", None, 0..20), 100 | Token::Error("unknown token at 1:21".to_string()) 101 | ); 102 | 103 | test!( 104 | document_err_06, 105 | " ", 106 | Token::Error("unknown token at 1:1".to_string()) 107 | ); 108 | 109 | #[test] 110 | fn parse_fragment_1() { 111 | let s = "

text

278 | /// ------ - text 279 | /// ``` 280 | /// 281 | /// The token span is equal to the `text`. 282 | Text { text: StrSpan<'a> }, 283 | 284 | /// CDATA token. 285 | /// 286 | /// ```text 287 | ///

288 | /// ---- - text 289 | /// ---------------- - span 290 | /// ``` 291 | Cdata { 292 | text: StrSpan<'a>, 293 | span: StrSpan<'a>, 294 | }, 295 | } 296 | 297 | impl<'a> Token<'a> { 298 | /// Returns the [`StrSpan`] encompassing all of the token. 299 | pub fn span(&self) -> StrSpan<'a> { 300 | let span = match self { 301 | Token::Declaration { span, .. } => span, 302 | Token::ProcessingInstruction { span, .. } => span, 303 | Token::Comment { span, .. } => span, 304 | Token::DtdStart { span, .. } => span, 305 | Token::EmptyDtd { span, .. } => span, 306 | Token::EntityDeclaration { span, .. } => span, 307 | Token::DtdEnd { span, .. } => span, 308 | Token::ElementStart { span, .. } => span, 309 | Token::Attribute { span, .. } => span, 310 | Token::ElementEnd { span, .. } => span, 311 | Token::Text { text, .. } => text, 312 | Token::Cdata { span, .. } => span, 313 | }; 314 | *span 315 | } 316 | } 317 | 318 | /// `ElementEnd` token. 319 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] 320 | pub enum ElementEnd<'a> { 321 | /// Indicates `>` 322 | Open, 323 | /// Indicates `` 324 | Close(StrSpan<'a>, StrSpan<'a>), 325 | /// Indicates `/>` 326 | Empty, 327 | } 328 | 329 | /// Representation of the [ExternalID](https://www.w3.org/TR/xml/#NT-ExternalID) value. 330 | #[allow(missing_docs)] 331 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] 332 | pub enum ExternalId<'a> { 333 | System(StrSpan<'a>), 334 | Public(StrSpan<'a>, StrSpan<'a>), 335 | } 336 | 337 | /// Representation of the [EntityDef](https://www.w3.org/TR/xml/#NT-EntityDef) value. 
338 | #[allow(missing_docs)] 339 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] 340 | pub enum EntityDefinition<'a> { 341 | EntityValue(StrSpan<'a>), 342 | ExternalId(ExternalId<'a>), 343 | } 344 | 345 | type Result = core::result::Result; 346 | type StreamResult = core::result::Result; 347 | 348 | #[derive(Clone, Copy, PartialEq, Debug)] 349 | enum State { 350 | Declaration, 351 | AfterDeclaration, 352 | Dtd, 353 | AfterDtd, 354 | Elements, 355 | Attributes, 356 | AfterElements, 357 | End, 358 | } 359 | 360 | /// Tokenizer for the XML structure. 361 | #[derive(Clone)] 362 | pub struct Tokenizer<'a> { 363 | stream: Stream<'a>, 364 | state: State, 365 | depth: usize, 366 | fragment_parsing: bool, 367 | } 368 | 369 | impl core::fmt::Debug for Tokenizer<'_> { 370 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 371 | write!(f, "Tokenizer {{ ... }}") 372 | } 373 | } 374 | 375 | impl<'a> From<&'a str> for Tokenizer<'a> { 376 | #[inline] 377 | fn from(text: &'a str) -> Self { 378 | let mut stream = Stream::from(text); 379 | 380 | // Skip UTF-8 BOM. 381 | if stream.starts_with(&[0xEF, 0xBB, 0xBF]) { 382 | stream.advance(3); 383 | } 384 | 385 | Tokenizer { 386 | stream, 387 | state: State::Declaration, 388 | depth: 0, 389 | fragment_parsing: false, 390 | } 391 | } 392 | } 393 | 394 | macro_rules! map_err_at { 395 | ($fun:expr, $stream:expr, $err:ident) => {{ 396 | let start = $stream.pos(); 397 | $fun.map_err(|e| Error::$err(e, $stream.gen_text_pos_from(start))) 398 | }}; 399 | } 400 | 401 | impl<'a> Tokenizer<'a> { 402 | /// Enables document fragment parsing. 403 | /// 404 | /// By default, `xmlparser` will check for DTD, root element, etc. 405 | /// But if we have to parse an XML fragment, it will lead to an error. 406 | /// This method switches the parser to the root element content parsing mode, 407 | /// so it will treat any data as a content of the root element. 
408 | pub fn from_fragment(full_text: &'a str, fragment: core::ops::Range) -> Self { 409 | Tokenizer { 410 | stream: Stream::from_substr(full_text, fragment), 411 | state: State::Elements, 412 | depth: 0, 413 | fragment_parsing: true, 414 | } 415 | } 416 | 417 | fn parse_next_impl(&mut self) -> Option>> { 418 | let s = &mut self.stream; 419 | 420 | if s.at_end() { 421 | return None; 422 | } 423 | 424 | let start = s.pos(); 425 | 426 | match self.state { 427 | State::Declaration => { 428 | self.state = State::AfterDeclaration; 429 | if s.starts_with(b" { 436 | if s.starts_with(b" self.state = State::Dtd, 440 | Ok(Token::EmptyDtd { .. }) => self.state = State::AfterDtd, 441 | _ => {} 442 | } 443 | 444 | Some(t) 445 | } else if s.starts_with(b"' 728 | fn parse_comment_impl(s: &mut Stream<'a>) -> StreamResult> { 729 | let start = s.pos(); 730 | s.advance(4); 731 | let text = s.consume_chars(|s, c| !(c == '-' && s.starts_with(b"-->")))?; 732 | s.skip_string(b"-->")?; 733 | 734 | if text.as_str().contains("--") { 735 | return Err(StreamError::InvalidCommentData); 736 | } 737 | 738 | if text.as_str().ends_with('-') { 739 | return Err(StreamError::InvalidCommentEnd); 740 | } 741 | 742 | let span = s.slice_back(start); 743 | 744 | Ok(Token::Comment { text, span }) 745 | } 746 | 747 | fn parse_pi(s: &mut Stream<'a>) -> Result> { 748 | map_err_at!(Self::parse_pi_impl(s), s, InvalidPI) 749 | } 750 | 751 | // PI ::= '' Char*)))? '?>' 752 | // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) 753 | fn parse_pi_impl(s: &mut Stream<'a>) -> StreamResult> { 754 | let start = s.pos(); 755 | s.advance(2); 756 | let target = s.consume_name()?; 757 | s.skip_spaces(); 758 | let content = s.consume_chars(|s, c| !(c == '?' 
&& s.starts_with(b"?>")))?; 759 | let content = if !content.is_empty() { 760 | Some(content) 761 | } else { 762 | None 763 | }; 764 | 765 | s.skip_string(b"?>")?; 766 | 767 | let span = s.slice_back(start); 768 | 769 | Ok(Token::ProcessingInstruction { 770 | target, 771 | content, 772 | span, 773 | }) 774 | } 775 | 776 | fn parse_doctype(s: &mut Stream<'a>) -> Result> { 777 | map_err_at!(Self::parse_doctype_impl(s), s, InvalidDoctype) 778 | } 779 | 780 | // doctypedecl ::= '' 781 | fn parse_doctype_impl(s: &mut Stream<'a>) -> StreamResult> { 782 | let start = s.pos(); 783 | s.advance(9); 784 | 785 | s.consume_spaces()?; 786 | let name = s.consume_name()?; 787 | s.skip_spaces(); 788 | 789 | let external_id = Self::parse_external_id(s)?; 790 | s.skip_spaces(); 791 | 792 | let c = s.curr_byte()?; 793 | if c != b'[' && c != b'>' { 794 | static EXPECTED: &[u8] = b"[>"; 795 | return Err(StreamError::InvalidCharMultiple( 796 | c, 797 | EXPECTED, 798 | s.gen_text_pos(), 799 | )); 800 | } 801 | 802 | s.advance(1); 803 | 804 | let span = s.slice_back(start); 805 | if c == b'[' { 806 | Ok(Token::DtdStart { 807 | name, 808 | external_id, 809 | span, 810 | }) 811 | } else { 812 | Ok(Token::EmptyDtd { 813 | name, 814 | external_id, 815 | span, 816 | }) 817 | } 818 | } 819 | 820 | // ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral 821 | fn parse_external_id(s: &mut Stream<'a>) -> StreamResult>> { 822 | let v = if s.starts_with(b"SYSTEM") || s.starts_with(b"PUBLIC") { 823 | let start = s.pos(); 824 | s.advance(6); 825 | let id = s.slice_back(start); 826 | 827 | s.consume_spaces()?; 828 | let quote = s.consume_quote()?; 829 | let literal1 = s.consume_bytes(|_, c| c != quote); 830 | s.consume_byte(quote)?; 831 | 832 | let v = if id.as_str() == "SYSTEM" { 833 | ExternalId::System(literal1) 834 | } else { 835 | s.consume_spaces()?; 836 | let quote = s.consume_quote()?; 837 | let literal2 = s.consume_bytes(|_, c| c != quote); 838 | 
s.consume_byte(quote)?; 839 | 840 | ExternalId::Public(literal1, literal2) 841 | }; 842 | 843 | Some(v) 844 | } else { 845 | None 846 | }; 847 | 848 | Ok(v) 849 | } 850 | 851 | fn parse_entity_decl(s: &mut Stream<'a>) -> Result> { 852 | map_err_at!(Self::parse_entity_decl_impl(s), s, InvalidEntity) 853 | } 854 | 855 | // EntityDecl ::= GEDecl | PEDecl 856 | // GEDecl ::= '' 857 | // PEDecl ::= '' 858 | fn parse_entity_decl_impl(s: &mut Stream<'a>) -> StreamResult> { 859 | let start = s.pos(); 860 | s.advance(8); 861 | 862 | s.consume_spaces()?; 863 | 864 | let is_ge = if s.try_consume_byte(b'%') { 865 | s.consume_spaces()?; 866 | false 867 | } else { 868 | true 869 | }; 870 | 871 | let name = s.consume_name()?; 872 | s.consume_spaces()?; 873 | let definition = Self::parse_entity_def(s, is_ge)?; 874 | s.skip_spaces(); 875 | s.consume_byte(b'>')?; 876 | 877 | let span = s.slice_back(start); 878 | 879 | Ok(Token::EntityDeclaration { 880 | name, 881 | definition, 882 | span, 883 | }) 884 | } 885 | 886 | // EntityDef ::= EntityValue | (ExternalID NDataDecl?) 887 | // PEDef ::= EntityValue | ExternalID 888 | // EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] 889 | // | PEReference | Reference)* "'" 890 | // ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral 891 | // NDataDecl ::= S 'NDATA' S Name 892 | fn parse_entity_def(s: &mut Stream<'a>, is_ge: bool) -> StreamResult> { 893 | let c = s.curr_byte()?; 894 | match c { 895 | b'"' | b'\'' => { 896 | let quote = s.consume_quote()?; 897 | let value = s.consume_bytes(|_, c| c != quote); 898 | s.consume_byte(quote)?; 899 | 900 | Ok(EntityDefinition::EntityValue(value)) 901 | } 902 | b'S' | b'P' => { 903 | if let Some(id) = Self::parse_external_id(s)? 
{ 904 | if is_ge { 905 | s.skip_spaces(); 906 | if s.starts_with(b"NDATA") { 907 | s.advance(5); 908 | s.consume_spaces()?; 909 | s.skip_name()?; 910 | // TODO: NDataDecl is not supported 911 | } 912 | } 913 | 914 | Ok(EntityDefinition::ExternalId(id)) 915 | } else { 916 | Err(StreamError::InvalidExternalID) 917 | } 918 | } 919 | _ => { 920 | static EXPECTED: &[u8] = b"\"'SP"; 921 | let pos = s.gen_text_pos(); 922 | Err(StreamError::InvalidCharMultiple(c, EXPECTED, pos)) 923 | } 924 | } 925 | } 926 | 927 | fn consume_decl(s: &mut Stream) -> StreamResult<()> { 928 | s.skip_bytes(|_, c| c != b'>'); 929 | s.consume_byte(b'>')?; 930 | Ok(()) 931 | } 932 | 933 | fn parse_cdata(s: &mut Stream<'a>) -> Result> { 934 | map_err_at!(Self::parse_cdata_impl(s), s, InvalidCdata) 935 | } 936 | 937 | // CDSect ::= CDStart CData CDEnd 938 | // CDStart ::= '' Char*)) 940 | // CDEnd ::= ']]>' 941 | fn parse_cdata_impl(s: &mut Stream<'a>) -> StreamResult> { 942 | let start = s.pos(); 943 | s.advance(9); 944 | let text = s.consume_chars(|s, c| !(c == ']' && s.starts_with(b"]]>")))?; 945 | s.skip_string(b"]]>")?; 946 | let span = s.slice_back(start); 947 | Ok(Token::Cdata { text, span }) 948 | } 949 | 950 | fn parse_element_start(s: &mut Stream<'a>) -> Result> { 951 | map_err_at!(Self::parse_element_start_impl(s), s, InvalidElement) 952 | } 953 | 954 | // '<' Name (S Attribute)* S? 
'>' 955 | fn parse_element_start_impl(s: &mut Stream<'a>) -> StreamResult> { 956 | let start = s.pos(); 957 | s.advance(1); 958 | let (prefix, local) = s.consume_qname()?; 959 | let span = s.slice_back(start); 960 | 961 | Ok(Token::ElementStart { 962 | prefix, 963 | local, 964 | span, 965 | }) 966 | } 967 | 968 | fn parse_close_element(s: &mut Stream<'a>) -> Result> { 969 | map_err_at!(Self::parse_close_element_impl(s), s, InvalidElement) 970 | } 971 | 972 | // '' 973 | fn parse_close_element_impl(s: &mut Stream<'a>) -> StreamResult> { 974 | let start = s.pos(); 975 | s.advance(2); 976 | 977 | let (prefix, tag_name) = s.consume_qname()?; 978 | s.skip_spaces(); 979 | s.consume_byte(b'>')?; 980 | 981 | let span = s.slice_back(start); 982 | 983 | Ok(Token::ElementEnd { 984 | end: ElementEnd::Close(prefix, tag_name), 985 | span, 986 | }) 987 | } 988 | 989 | // Name Eq AttValue 990 | fn parse_attribute(s: &mut Stream<'a>) -> StreamResult> { 991 | let attr_start = s.pos(); 992 | let has_space = s.starts_with_space(); 993 | s.skip_spaces(); 994 | 995 | if let Ok(c) = s.curr_byte() { 996 | let start = s.pos(); 997 | 998 | match c { 999 | b'/' => { 1000 | s.advance(1); 1001 | s.consume_byte(b'>')?; 1002 | let span = s.slice_back(start); 1003 | return Ok(Token::ElementEnd { 1004 | end: ElementEnd::Empty, 1005 | span, 1006 | }); 1007 | } 1008 | b'>' => { 1009 | s.advance(1); 1010 | let span = s.slice_back(start); 1011 | return Ok(Token::ElementEnd { 1012 | end: ElementEnd::Open, 1013 | span, 1014 | }); 1015 | } 1016 | _ => {} 1017 | } 1018 | } 1019 | 1020 | if !has_space { 1021 | if !s.at_end() { 1022 | return Err(StreamError::InvalidSpace( 1023 | s.curr_byte_unchecked(), 1024 | s.gen_text_pos_from(attr_start), 1025 | )); 1026 | } else { 1027 | return Err(StreamError::UnexpectedEndOfStream); 1028 | } 1029 | } 1030 | 1031 | let start = s.pos(); 1032 | 1033 | let (prefix, local) = s.consume_qname()?; 1034 | s.consume_eq()?; 1035 | let quote = s.consume_quote()?; 1036 | let 
quote_c = quote as char; 1037 | // The attribute value must not contain the < character. 1038 | let value = s.consume_chars(|_, c| c != quote_c && c != '<')?; 1039 | s.consume_byte(quote)?; 1040 | let span = s.slice_back(start); 1041 | 1042 | Ok(Token::Attribute { 1043 | prefix, 1044 | local, 1045 | value, 1046 | span, 1047 | }) 1048 | } 1049 | 1050 | fn parse_text(s: &mut Stream<'a>) -> Result> { 1051 | map_err_at!(Self::parse_text_impl(s), s, InvalidCharData) 1052 | } 1053 | 1054 | fn parse_text_impl(s: &mut Stream<'a>) -> StreamResult> { 1055 | let text = s.consume_chars(|_, c| c != '<')?; 1056 | 1057 | // According to the spec, `]]>` must not appear inside a Text node. 1058 | // https://www.w3.org/TR/xml/#syntax 1059 | // 1060 | // Search for `>` first, since it's a bit faster than looking for `]]>`. 1061 | if text.as_str().contains('>') && text.as_str().contains("]]>") { 1062 | return Err(StreamError::InvalidCharacterData); 1063 | } 1064 | 1065 | Ok(Token::Text { text }) 1066 | } 1067 | 1068 | /// Returns a copy of the tokenizer's stream. 1069 | pub fn stream(&self) -> Stream<'a> { 1070 | self.stream 1071 | } 1072 | } 1073 | 1074 | impl<'a> Iterator for Tokenizer<'a> { 1075 | type Item = Result>; 1076 | 1077 | #[inline] 1078 | fn next(&mut self) -> Option { 1079 | let mut t = None; 1080 | while !self.stream.at_end() && self.state != State::End && t.is_none() { 1081 | t = self.parse_next_impl(); 1082 | } 1083 | 1084 | if let Some(Err(_)) = t { 1085 | self.stream.jump_to_end(); 1086 | self.state = State::End; 1087 | } 1088 | 1089 | t 1090 | } 1091 | } 1092 | --------------------------------------------------------------------------------