├── afl-fuzz ├── .gitignore ├── in │ ├── 1.xml │ ├── 2.xml │ ├── 3.xml │ ├── 4.xml │ ├── 5.xml │ └── 6.xml ├── Cargo.toml ├── README.md └── src │ └── main.rs ├── .gitignore ├── fuzz ├── .gitignore ├── README.md ├── fuzz_targets │ └── fuzz_xml.rs └── Cargo.toml ├── tests └── integration │ ├── main.rs │ ├── api.rs │ ├── text.rs │ ├── comments.rs │ ├── cdata.rs │ ├── document.rs │ ├── token.rs │ ├── pi.rs │ ├── doctype.rs │ └── elements.rs ├── Cargo.toml ├── examples └── parse.rs ├── README.tpl ├── .github └── workflows │ └── ci.yml ├── LICENSE-MIT ├── src ├── strspan.rs ├── xmlchar.rs ├── error.rs ├── stream.rs └── lib.rs ├── README.md ├── CHANGELOG.md └── LICENSE-APACHE /afl-fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | /out 2 | -------------------------------------------------------------------------------- /afl-fuzz/in/1.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /afl-fuzz/in/2.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /afl-fuzz/in/3.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | .idea 4 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | corpus 3 | artifacts 4 | -------------------------------------------------------------------------------- /afl-fuzz/in/4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
-------------------------------------------------------------------------------- /afl-fuzz/in/5.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /afl-fuzz/in/6.xml: -------------------------------------------------------------------------------- 1 | 3 | ]> 4 | 5 | -------------------------------------------------------------------------------- /fuzz/README.md: -------------------------------------------------------------------------------- 1 | ## Prepare 2 | 3 | ``` 4 | cargo install cargo-fuzz 5 | ``` 6 | 7 | ## Run 8 | 9 | ``` 10 | cd .. 11 | cargo +nightly fuzz run fuzz_xml 12 | ``` 13 | -------------------------------------------------------------------------------- /afl-fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "afl-fuzz" 3 | version = "0.1.0" 4 | authors = ["RazrFalcon "] 5 | 6 | [dependencies] 7 | afl = "0.5" 8 | xmlparser = { path = ".." 
} 9 | -------------------------------------------------------------------------------- /afl-fuzz/README.md: -------------------------------------------------------------------------------- 1 | ## Prepare 2 | 3 | ``` 4 | cargo install afl 5 | ``` 6 | 7 | ## Run 8 | 9 | ``` 10 | cargo afl build 11 | cargo afl fuzz -i in -o out target/debug/afl-fuzz 12 | ``` 13 | -------------------------------------------------------------------------------- /tests/integration/main.rs: -------------------------------------------------------------------------------- 1 | extern crate xmlparser as xml; 2 | 3 | #[macro_use] 4 | mod token; 5 | 6 | mod api; 7 | mod cdata; 8 | mod comments; 9 | mod doctype; 10 | mod document; 11 | mod elements; 12 | mod pi; 13 | mod text; 14 | -------------------------------------------------------------------------------- /afl-fuzz/src/main.rs: -------------------------------------------------------------------------------- 1 | extern crate afl; 2 | extern crate xmlparser; 3 | 4 | use std::str; 5 | 6 | use afl::fuzz; 7 | 8 | fn main() { 9 | fuzz!(|data: &[u8]| { 10 | if let Ok(text) = str::from_utf8(data) { 11 | for _ in xmlparser::Tokenizer::from(text) {} 12 | } 13 | }); 14 | } 15 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/fuzz_xml.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | #[macro_use] extern crate libfuzzer_sys; 4 | extern crate xmlparser; 5 | 6 | use std::str; 7 | 8 | fuzz_target!(|data: &[u8]| { 9 | if let Ok(text) = str::from_utf8(data) { 10 | let mut n = 0; 11 | for _ in xmlparser::Tokenizer::from(text) { 12 | n += 1; 13 | 14 | if n == 1000 { 15 | panic!("endless loop"); 16 | } 17 | } 18 | } 19 | }); 20 | -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "xmlparser-fuzz" 3 | 
version = "0.0.1" 4 | authors = ["Automatically generated"] 5 | publish = false 6 | 7 | [package.metadata] 8 | cargo-fuzz = true 9 | 10 | [dependencies.xmlparser] 11 | path = ".." 12 | 13 | [dependencies.libfuzzer-sys] 14 | git = "https://github.com/rust-fuzz/libfuzzer-sys.git" 15 | 16 | # Prevent this from interfering with workspaces 17 | [workspace] 18 | members = ["."] 19 | 20 | [[bin]] 21 | name = "fuzz_xml" 22 | path = "fuzz_targets/fuzz_xml.rs" 23 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "xmlparser" 3 | version = "0.13.6" 4 | authors = ["Yevhenii Reizner "] 5 | edition = "2018" 6 | description = "Pull-based, zero-allocation XML parser." 7 | documentation = "https://docs.rs/xmlparser" 8 | readme = "README.md" 9 | homepage = "https://github.com/RazrFalcon/xmlparser" 10 | repository = "https://github.com/RazrFalcon/xmlparser" 11 | license = "MIT OR Apache-2.0" 12 | keywords = ["parser", "tokenizer", "xml"] 13 | categories = ["parser-implementations"] 14 | 15 | [features] 16 | default = ["std"] 17 | std = [] 18 | -------------------------------------------------------------------------------- /examples/parse.rs: -------------------------------------------------------------------------------- 1 | extern crate xmlparser as xml; 2 | 3 | use std::env; 4 | use std::fs; 5 | use std::io::Read; 6 | 7 | fn main() { 8 | let args = env::args().collect::>(); 9 | if args.len() != 2 { 10 | println!("Usage: parse file.xml"); 11 | return; 12 | } 13 | 14 | let text = load_file(&args[1]); 15 | 16 | if let Err(e) = parse(&text) { 17 | println!("Error: {}.", e); 18 | } 19 | } 20 | 21 | fn parse(text: &str) -> Result<(), xml::Error> { 22 | for token in xml::Tokenizer::from(text) { 23 | println!("{:?}", token?); 24 | } 25 | 26 | Ok(()) 27 | } 28 | 29 | fn load_file(path: &str) -> String { 30 | let mut file = 
fs::File::open(path).unwrap(); 31 | let mut text = String::new(); 32 | file.read_to_string(&mut text).unwrap(); 33 | text 34 | } 35 | -------------------------------------------------------------------------------- /tests/integration/api.rs: -------------------------------------------------------------------------------- 1 | extern crate xmlparser; 2 | 3 | use xmlparser::*; 4 | 5 | #[test] 6 | fn text_pos_1() { 7 | let mut s = Stream::from("text"); 8 | s.advance(2); 9 | assert_eq!(s.gen_text_pos(), TextPos::new(1, 3)); 10 | } 11 | 12 | #[test] 13 | fn text_pos_2() { 14 | let mut s = Stream::from("text\ntext"); 15 | s.advance(6); 16 | assert_eq!(s.gen_text_pos(), TextPos::new(2, 2)); 17 | } 18 | 19 | #[test] 20 | fn text_pos_3() { 21 | let mut s = Stream::from("текст\nтекст"); 22 | s.advance(15); 23 | assert_eq!(s.gen_text_pos(), TextPos::new(2, 3)); 24 | } 25 | 26 | #[test] 27 | fn token_size() { 28 | assert!(::std::mem::size_of::() <= 196); 29 | } 30 | 31 | #[test] 32 | fn span_size() { 33 | assert!(::std::mem::size_of::() <= 48); 34 | } 35 | 36 | #[test] 37 | fn err_size_1() { 38 | assert!(::std::mem::size_of::() <= 64); 39 | } 40 | 41 | #[test] 42 | fn err_size_2() { 43 | assert!(::std::mem::size_of::() <= 64); 44 | } 45 | -------------------------------------------------------------------------------- /README.tpl: -------------------------------------------------------------------------------- 1 | ## {{crate}} 2 | [![Build Status](https://travis-ci.org/RazrFalcon/{{crate}}.svg?branch=master)](https://travis-ci.org/RazrFalcon/{{crate}}) 3 | [![Crates.io](https://img.shields.io/crates/v/{{crate}}.svg)](https://crates.io/crates/{{crate}}) 4 | [![Documentation](https://docs.rs/{{crate}}/badge.svg)](https://docs.rs/{{crate}}) 5 | [![Rust 1.31+](https://img.shields.io/badge/rust-1.31+-orange.svg)](https://www.rust-lang.org) 6 | 7 | {{readme}} 8 | 9 | ### License 10 | 11 | Licensed under either of 12 | 13 | - Apache License, Version 2.0 14 | 
([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 15 | - MIT license 16 | ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 17 | 18 | at your option. 19 | 20 | ### Contribution 21 | 22 | Unless you explicitly state otherwise, any contribution intentionally submitted 23 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall be 24 | dual licensed as above, without any additional terms or conditions. 25 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: {} 5 | push: 6 | branches: 7 | - master 8 | schedule: 9 | - cron: '43 20 * * 3' 10 | 11 | concurrency: 12 | group: $-$ 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | msrv: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | - uses: dtolnay/rust-toolchain@1.31 21 | - run: cargo build --lib 22 | 23 | test: 24 | runs-on: ubuntu-latest 25 | steps: 26 | - uses: actions/checkout@v4 27 | - uses: dtolnay/rust-toolchain@stable 28 | - run: cargo test --all-targets 29 | - run: cargo test --doc 30 | 31 | clippy: 32 | runs-on: ubuntu-latest 33 | steps: 34 | - uses: actions/checkout@v4 35 | - uses: dtolnay/rust-toolchain@stable 36 | with: 37 | components: clippy 38 | - run: cargo clippy --all-features --all-targets -- -D warnings 39 | 40 | rustfmt: 41 | runs-on: ubuntu-latest 42 | steps: 43 | - uses: actions/checkout@v4 44 | - uses: dtolnay/rust-toolchain@stable 45 | with: 46 | components: rustfmt 47 | - run: cargo fmt --check --all 48 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 Reizner Evgeniy 4 | 5 | Permission is hereby granted, free of charge, to any person 
obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/integration/text.rs: -------------------------------------------------------------------------------- 1 | use crate::token::*; 2 | 3 | test!( 4 | text_01, 5 | "

text

", 6 | Token::ElementStart("", "p", 0..2), 7 | Token::ElementEnd(ElementEnd::Open, 2..3), 8 | Token::Text("text", 3..7), 9 | Token::ElementEnd(ElementEnd::Close("", "p"), 7..11) 10 | ); 11 | 12 | test!( 13 | text_02, 14 | "

text

", 15 | Token::ElementStart("", "p", 0..2), 16 | Token::ElementEnd(ElementEnd::Open, 2..3), 17 | Token::Text(" text ", 3..9), 18 | Token::ElementEnd(ElementEnd::Close("", "p"), 9..13) 19 | ); 20 | 21 | // 欄 is EF A4 9D. And EF can be mistreated for UTF-8 BOM. 22 | test!( 23 | text_03, 24 | "

", 25 | Token::ElementStart("", "p", 0..2), 26 | Token::ElementEnd(ElementEnd::Open, 2..3), 27 | Token::Text("欄", 3..6), 28 | Token::ElementEnd(ElementEnd::Close("", "p"), 6..10) 29 | ); 30 | 31 | test!( 32 | text_04, 33 | "

", 34 | Token::ElementStart("", "p", 0..2), 35 | Token::ElementEnd(ElementEnd::Open, 2..3), 36 | Token::Text(" ", 3..4), 37 | Token::ElementEnd(ElementEnd::Close("", "p"), 4..8) 38 | ); 39 | 40 | test!( 41 | text_05, 42 | "

\r\n\t

", 43 | Token::ElementStart("", "p", 0..2), 44 | Token::ElementEnd(ElementEnd::Open, 2..3), 45 | Token::Text(" \r\n\t ", 3..8), 46 | Token::ElementEnd(ElementEnd::Close("", "p"), 8..12) 47 | ); 48 | 49 | test!( 50 | text_06, 51 | "

", 52 | Token::ElementStart("", "p", 0..2), 53 | Token::ElementEnd(ElementEnd::Open, 2..3), 54 | Token::Text(" ", 3..9), 55 | Token::ElementEnd(ElementEnd::Close("", "p"), 9..13) 56 | ); 57 | 58 | test!( 59 | text_07, 60 | "

]>

", 61 | Token::ElementStart("", "p", 0..2), 62 | Token::ElementEnd(ElementEnd::Open, 2..3), 63 | Token::Text("]>", 3..5), 64 | Token::ElementEnd(ElementEnd::Close("", "p"), 5..9) 65 | ); 66 | 67 | test!( 68 | text_err_01, 69 | "

]]>

", 70 | Token::ElementStart("", "p", 0..2), 71 | Token::ElementEnd(ElementEnd::Open, 2..3), 72 | Token::Error( 73 | "invalid character data at 1:4 cause ']]>' is not allowed inside a character data" 74 | .to_string() 75 | ) 76 | ); 77 | 78 | test!( 79 | text_err_02, 80 | "

\u{0c}

", 81 | Token::ElementStart("", "p", 0..2), 82 | Token::ElementEnd(ElementEnd::Open, 2..3), 83 | Token::Error( 84 | "invalid character data at 1:4 cause a non-XML character '\\u{c}' found at 1:4".to_string() 85 | ) 86 | ); 87 | -------------------------------------------------------------------------------- /tests/integration/comments.rs: -------------------------------------------------------------------------------- 1 | use crate::token::*; 2 | 3 | test!( 4 | comment_01, 5 | "", 6 | Token::Comment("comment", 0..14) 7 | ); 8 | test!(comment_02, "", Token::Comment("", 0..13)); 9 | test!(comment_03, "", Token::Comment("", Token::Comment("", Token::Comment("<", Token::Comment("<", Token::Comment("-->", Token::Comment("<>", 0..9)); 15 | test!(comment_09, "", Token::Comment("<", 0..8)); 16 | test!(comment_10, "", Token::Comment("", Token::Comment("", 0..7)); 18 | 19 | macro_rules! test_err { 20 | ($name:ident, $text:expr) => { 21 | #[test] 22 | fn $name() { 23 | let mut p = xml::Tokenizer::from($text); 24 | assert!(p.next().unwrap().is_err()); 25 | } 26 | }; 27 | } 28 | 29 | test_err!(comment_err_01, ""); 30 | test_err!(comment_err_02, ""); 33 | test_err!(comment_err_05, ""); 35 | test_err!(comment_err_07, ""); 43 | test_err!(comment_err_15, ""); 48 | test_err!(comment_err_20, ""); 55 | test_err!(comment_err_27, ""); 56 | test_err!(comment_err_28, ""); 57 | test_err!(comment_err_29, ""); 61 | test_err!(comment_err_33, ""); 62 | test_err!(comment_err_34, ""); 63 | test_err!(comment_err_35, ""); 64 | -------------------------------------------------------------------------------- /tests/integration/cdata.rs: -------------------------------------------------------------------------------- 1 | extern crate xmlparser as xml; 2 | 3 | use crate::token::*; 4 | 5 | test!( 6 | cdata_01, 7 | "

", 8 | Token::ElementStart("", "p", 0..2), 9 | Token::ElementEnd(ElementEnd::Open, 2..3), 10 | Token::Cdata("content", 3..22), 11 | Token::ElementEnd(ElementEnd::Close("", "p"), 22..26) 12 | ); 13 | 14 | test!( 15 | cdata_02, 16 | "

", 17 | Token::ElementStart("", "p", 0..2), 18 | Token::ElementEnd(ElementEnd::Open, 2..3), 19 | Token::Cdata("&ing", 3..22), 20 | Token::ElementEnd(ElementEnd::Close("", "p"), 22..26) 21 | ); 22 | 23 | test!( 24 | cdata_03, 25 | "

", 26 | Token::ElementStart("", "p", 0..2), 27 | Token::ElementEnd(ElementEnd::Open, 2..3), 28 | Token::Cdata("&ing ]", 3..24), 29 | Token::ElementEnd(ElementEnd::Close("", "p"), 24..28) 30 | ); 31 | 32 | test!( 33 | cdata_04, 34 | "

", 35 | Token::ElementStart("", "p", 0..2), 36 | Token::ElementEnd(ElementEnd::Open, 2..3), 37 | Token::Cdata("&ing]] ", 3..25), 38 | Token::ElementEnd(ElementEnd::Close("", "p"), 25..29) 39 | ); 40 | 41 | test!( 42 | cdata_05, 43 | "

text]]>

", 44 | Token::ElementStart("", "p", 0..2), 45 | Token::ElementEnd(ElementEnd::Open, 2..3), 46 | Token::Cdata("text", 3..38), 47 | Token::ElementEnd(ElementEnd::Close("", "p"), 38..42) 48 | ); 49 | 50 | test!( 51 | cdata_06, 52 | "

]]>

", 53 | Token::ElementStart("", "p", 0..2), 54 | Token::ElementEnd(ElementEnd::Open, 2..3), 55 | Token::Cdata("", 3..66), 56 | Token::ElementEnd(ElementEnd::Close("", "p"), 66..70) 57 | ); 58 | 59 | test!( 60 | cdata_07, 61 | "

", 62 | Token::ElementStart("", "p", 0..2), 63 | Token::ElementEnd(ElementEnd::Open, 2..3), 64 | Token::Cdata("1", 3..16), 65 | Token::Cdata("2", 16..29), 66 | Token::ElementEnd(ElementEnd::Close("", "p"), 29..33) 67 | ); 68 | 69 | test!( 70 | cdata_08, 71 | "

\n \t

", 72 | Token::ElementStart("", "p", 0..2), 73 | Token::ElementEnd(ElementEnd::Open, 2..3), 74 | Token::Text(" \n ", 3..6), 75 | Token::Cdata("data", 6..22), 76 | Token::Text(" \t ", 22..25), 77 | Token::ElementEnd(ElementEnd::Close("", "p"), 25..29) 78 | ); 79 | 80 | test!( 81 | cdata_09, 82 | "

", 83 | Token::ElementStart("", "p", 0..2), 84 | Token::ElementEnd(ElementEnd::Open, 2..3), 85 | Token::Cdata("bracket ]after", 3..29), 86 | Token::ElementEnd(ElementEnd::Close("", "p"), 29..33) 87 | ); 88 | 89 | test!( 90 | cdata_err_01, 91 | "

", 92 | Token::ElementStart("", "p", 0..2), 93 | Token::ElementEnd(ElementEnd::Open, 2..3), 94 | Token::Error( 95 | "invalid CDATA at 1:4 cause a non-XML character '\\u{1}' found at 1:13".to_string() 96 | ) 97 | ); 98 | -------------------------------------------------------------------------------- /src/strspan.rs: -------------------------------------------------------------------------------- 1 | use core::fmt; 2 | use core::ops::{Deref, Range}; 3 | 4 | /// A string slice. 5 | /// 6 | /// Like `&str`, but also contains the position in the input XML 7 | /// from which it was parsed. 8 | #[must_use] 9 | #[derive(Clone, Copy, PartialEq, Eq, Hash)] 10 | pub struct StrSpan<'a> { 11 | text: &'a str, 12 | start: usize, 13 | } 14 | 15 | impl<'a> From<&'a str> for StrSpan<'a> { 16 | #[inline] 17 | fn from(text: &'a str) -> Self { 18 | StrSpan { text, start: 0 } 19 | } 20 | } 21 | 22 | impl PartialEq for StrSpan<'_> { 23 | fn eq(&self, other: &str) -> bool { 24 | self.text == other 25 | } 26 | } 27 | 28 | impl PartialEq<&str> for StrSpan<'_> { 29 | fn eq(&self, other: &&str) -> bool { 30 | self.text == *other 31 | } 32 | } 33 | 34 | impl PartialEq> for str { 35 | fn eq(&self, other: &StrSpan<'_>) -> bool { 36 | self == other.text 37 | } 38 | } 39 | 40 | impl PartialEq> for &str { 41 | fn eq(&self, other: &StrSpan<'_>) -> bool { 42 | *self == other.text 43 | } 44 | } 45 | 46 | impl<'a> StrSpan<'a> { 47 | /// Constructs a new `StrSpan` from substring. 48 | #[inline] 49 | pub(crate) fn from_substr(text: &str, start: usize, end: usize) -> StrSpan { 50 | debug_assert!(start <= end); 51 | StrSpan { 52 | text: &text[start..end], 53 | start, 54 | } 55 | } 56 | 57 | /// Returns `true` is self is empty. 58 | pub fn is_empty(&self) -> bool { 59 | self.text.is_empty() 60 | } 61 | 62 | /// Returns the start position of the span. 63 | #[inline] 64 | pub fn start(&self) -> usize { 65 | self.start 66 | } 67 | 68 | /// Returns the end position of the span. 
69 | #[inline] 70 | pub fn end(&self) -> usize { 71 | self.start + self.text.len() 72 | } 73 | 74 | /// Returns the range of the span. 75 | #[inline] 76 | pub fn range(&self) -> Range { 77 | self.start..self.end() 78 | } 79 | 80 | /// Returns the span as a string slice. 81 | #[inline] 82 | pub fn as_str(&self) -> &'a str { 83 | self.text 84 | } 85 | 86 | /// Returns an underlying string region as `StrSpan`. 87 | #[inline] 88 | pub(crate) fn slice_region(&self, start: usize, end: usize) -> StrSpan<'a> { 89 | StrSpan::from_substr(self.text, start, end) 90 | } 91 | } 92 | 93 | impl fmt::Debug for StrSpan<'_> { 94 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 95 | write!( 96 | f, 97 | "StrSpan({:?} {}..{})", 98 | self.as_str(), 99 | self.start(), 100 | self.end() 101 | ) 102 | } 103 | } 104 | 105 | impl fmt::Display for StrSpan<'_> { 106 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 107 | write!(f, "{}", self.as_str()) 108 | } 109 | } 110 | 111 | impl Deref for StrSpan<'_> { 112 | type Target = str; 113 | 114 | fn deref(&self) -> &Self::Target { 115 | self.text 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /tests/integration/document.rs: -------------------------------------------------------------------------------- 1 | use std::str; 2 | 3 | use crate::token::*; 4 | 5 | test!(document_01, "",); 6 | 7 | test!(document_02, " ",); 8 | 9 | test!(document_03, " \n\t\r ",); 10 | 11 | // BOM 12 | test!( 13 | document_05, 14 | str::from_utf8(b"\xEF\xBB\xBF").unwrap(), 15 | Token::ElementStart("", "a", 3..5), 16 | Token::ElementEnd(ElementEnd::Empty, 5..7) 17 | ); 18 | 19 | test!( 20 | document_06, 21 | str::from_utf8(b"\xEF\xBB\xBF").unwrap(), 22 | Token::Declaration("1.0", None, None, 3..24) 23 | ); 24 | 25 | test!( 26 | document_07, 27 | "\n\n\ 28 | ", 29 | Token::Declaration("1.0", Some("utf-8"), None, 0..38), 30 | Token::Comment(" comment ", 39..55), 31 | Token::EmptyDtd( 32 | "svg", 33 | 
Some(ExternalId::Public( 34 | "-//W3C//DTD SVG 1.1//EN", 35 | "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd" 36 | )), 37 | 56..154 38 | ) 39 | ); 40 | 41 | test!( 42 | document_08, 43 | "\n\ 44 | ", 45 | Token::PI("xml-stylesheet", None, 0..18), 46 | Token::EmptyDtd( 47 | "svg", 48 | Some(ExternalId::Public( 49 | "-//W3C//DTD SVG 1.1//EN", 50 | "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd" 51 | )), 52 | 19..117 53 | ) 54 | ); 55 | 56 | test!( 57 | document_09, 58 | "\n\n\ 59 | ", 60 | Token::Declaration("1.0", Some("utf-8"), None, 0..38), 61 | Token::PI("xml-stylesheet", None, 39..57), 62 | Token::EmptyDtd( 63 | "svg", 64 | Some(ExternalId::Public( 65 | "-//W3C//DTD SVG 1.1//EN", 66 | "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd" 67 | )), 68 | 58..156 69 | ) 70 | ); 71 | 72 | test!( 73 | document_err_01, 74 | "", 75 | Token::Error("unknown token at 1:1".to_string()) 76 | ); 77 | 78 | test!( 79 | document_err_02, 80 | " &www---------Ӥ+----------w-----www_", 81 | Token::Error("unknown token at 1:2".to_string()) 82 | ); 83 | 84 | test!( 85 | document_err_03, 86 | "q", 87 | Token::Error("unknown token at 1:1".to_string()) 88 | ); 89 | 90 | test!( 91 | document_err_04, 92 | "", 93 | Token::Error("unknown token at 1:1".to_string()) 94 | ); 95 | 96 | test!( 97 | document_err_05, 98 | "", 99 | Token::EmptyDtd("greeting1", None, 0..20), 100 | Token::Error("unknown token at 1:21".to_string()) 101 | ); 102 | 103 | test!( 104 | document_err_06, 105 | " ", 106 | Token::Error("unknown token at 1:1".to_string()) 107 | ); 108 | 109 | #[test] 110 | fn parse_fragment_1() { 111 | let s = "

"; 112 | let mut p = xml::Tokenizer::from_fragment(s, 0..s.len()); 113 | 114 | match p.next().unwrap().unwrap() { 115 | xml::Token::ElementStart { local, .. } => assert_eq!(local.as_str(), "p"), 116 | _ => panic!(), 117 | } 118 | 119 | match p.next().unwrap().unwrap() { 120 | xml::Token::ElementEnd { .. } => {} 121 | _ => panic!(), 122 | } 123 | 124 | match p.next().unwrap().unwrap() { 125 | xml::Token::ElementStart { local, .. } => assert_eq!(local.as_str(), "p"), 126 | _ => panic!(), 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # xmlparser 2 | 3 | [github](https://github.com/RazrFalcon/xmlparser) 4 | [crates.io](https://crates.io/crates/xmlparser) 5 | [docs.rs](https://docs.rs/xmlparser) 6 | [build status](https://github.com/RazrFalcon/xmlparser/actions?query=branch%3Amaster) 7 | 8 | *xmlparser* is a low-level, pull-based, zero-allocation 9 | [XML 1.0](https://www.w3.org/TR/xml/) parser. 10 | 11 |
12 | 13 | ## Example 14 | 15 | ```rust 16 | for token in xmlparser::Tokenizer::from("<tagname name='value'/>") { 17 | println!("{:?}", token); 18 | } 19 | ``` 20 | 21 | 
22 | 23 | ## Why a new library? 24 | 25 | This library is basically a low-level XML tokenizer that preserves the 26 | positions of the tokens and is not intended to be used directly. 27 | 28 | If you are looking for a higher level solution, check out 29 | [roxmltree](https://github.com/RazrFalcon/roxmltree). 30 | 31 |
32 | 33 | ## Benefits 34 | 35 | - All tokens contain `StrSpan` structs which represent the position of the 36 | substring in the original document. 37 | - Good error reporting. All errors contain the position (line:column) 38 | where they occurred. 39 | - No heap allocations. 40 | - No dependencies. 41 | - Tiny. ~1400 LOC and ~30KiB in the release build according to 42 | `cargo-bloat`. 43 | - Supports `no_std` builds. To use without the standard library, disable the 44 | default features. 45 | 46 | 
47 | 48 | ## Limitations 49 | 50 | - Currently, only ENTITY objects are parsed from the DOCTYPE. All others are 51 | ignored. 52 | - No tree structure validation. So an XML like 53 | `` or a string without root element will be 54 | parsed without errors. You should check for this manually. On the other 55 | hand `
` will lead to an error. 56 | - Duplicate attributes are not an error. So XML like `` 57 | will be parsed without errors. You should check for this manually. 58 | - UTF-8 only. 59 | 60 | 
61 | 62 | ## Safety 63 | 64 | - The library must not panic. Any panic is considered a critical bug and 65 | should be reported. 66 | - The library forbids unsafe code. 67 | 68 |
69 | 70 | ## License 71 | 72 | Licensed under either of 73 | 74 | - Apache License, Version 2.0 ([LICENSE-APACHE] or 75 | http://www.apache.org/licenses/LICENSE-2.0) 76 | - MIT license ([LICENSE-MIT] or http://opensource.org/licenses/MIT) 77 | 78 | at your option. 79 | 80 |
81 | 82 | ### Contribution 83 | 84 | Unless you explicitly state otherwise, any contribution intentionally submitted 85 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall be 86 | dual licensed as above, without any additional terms or conditions. 87 | 88 | [LICENSE-APACHE]: https://github.com/RazrFalcon/xmlparser/blob/master/LICENSE-APACHE 89 | [LICENSE-MIT]: https://github.com/RazrFalcon/xmlparser/blob/master/LICENSE-MIT 90 | -------------------------------------------------------------------------------- /src/xmlchar.rs: -------------------------------------------------------------------------------- 1 | /// Extension methods for XML-subset only operations. 2 | pub trait XmlCharExt { 3 | /// Checks if the value is within the 4 | /// [NameStartChar](https://www.w3.org/TR/xml/#NT-NameStartChar) range. 5 | fn is_xml_name_start(&self) -> bool; 6 | 7 | /// Checks if the value is within the 8 | /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range. 9 | fn is_xml_name(&self) -> bool; 10 | 11 | /// Checks if the value is within the 12 | /// [Char](https://www.w3.org/TR/xml/#NT-Char) range. 13 | fn is_xml_char(&self) -> bool; 14 | } 15 | 16 | impl XmlCharExt for char { 17 | #[inline] 18 | #[allow(clippy::match_like_matches_macro)] 19 | fn is_xml_name_start(&self) -> bool { 20 | // Check for ASCII first. 21 | if *self as u32 <= 128 { 22 | return matches!(*self as u8, b'A'...b'Z' | b'a'...b'z' | b':' | b'_'); 23 | } 24 | 25 | match *self as u32 { 26 | 0x0000C0...0x0000D6 27 | | 0x0000D8...0x0000F6 28 | | 0x0000F8...0x0002FF 29 | | 0x000370...0x00037D 30 | | 0x00037F...0x001FFF 31 | | 0x00200C...0x00200D 32 | | 0x002070...0x00218F 33 | | 0x002C00...0x002FEF 34 | | 0x003001...0x00D7FF 35 | | 0x00F900...0x00FDCF 36 | | 0x00FDF0...0x00FFFD 37 | | 0x010000...0x0EFFFF => true, 38 | _ => false, 39 | } 40 | } 41 | 42 | #[inline] 43 | #[allow(clippy::match_like_matches_macro)] 44 | fn is_xml_name(&self) -> bool { 45 | // Check for ASCII first. 
46 | if *self as u32 <= 128 { 47 | return (*self as u8).is_xml_name(); 48 | } 49 | 50 | match *self as u32 { 51 | 0x0000B7 52 | | 0x0000C0...0x0000D6 53 | | 0x0000D8...0x0000F6 54 | | 0x0000F8...0x0002FF 55 | | 0x000300...0x00036F 56 | | 0x000370...0x00037D 57 | | 0x00037F...0x001FFF 58 | | 0x00200C...0x00200D 59 | | 0x00203F...0x002040 60 | | 0x002070...0x00218F 61 | | 0x002C00...0x002FEF 62 | | 0x003001...0x00D7FF 63 | | 0x00F900...0x00FDCF 64 | | 0x00FDF0...0x00FFFD 65 | | 0x010000...0x0EFFFF => true, 66 | _ => false, 67 | } 68 | } 69 | 70 | #[inline] 71 | fn is_xml_char(&self) -> bool { 72 | // Does not check for surrogate code points U+D800-U+DFFF, 73 | // since that check was performed by Rust when the `&str` was constructed. 74 | if (*self as u32) < 0x20 { 75 | return (*self as u8).is_xml_space(); 76 | } 77 | !matches!(*self as u32, 0xFFFF | 0xFFFE) 78 | } 79 | } 80 | 81 | /// Extension methods for XML-subset only operations. 82 | pub trait XmlByteExt { 83 | /// Checks if byte is a digit. 84 | /// 85 | /// `[0-9]` 86 | fn is_xml_digit(&self) -> bool; 87 | 88 | /// Checks if byte is a hex digit. 89 | /// 90 | /// `[0-9A-Fa-f]` 91 | fn is_xml_hex_digit(&self) -> bool; 92 | 93 | /// Checks if byte is a space. 94 | /// 95 | /// `[ \r\n\t]` 96 | fn is_xml_space(&self) -> bool; 97 | 98 | /// Checks if byte is an ASCII letter. 99 | /// 100 | /// `[A-Za-z]` 101 | fn is_xml_letter(&self) -> bool; 102 | 103 | /// Checks if byte is within the ASCII 104 | /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range. 
105 | fn is_xml_name(&self) -> bool; 106 | } 107 | 108 | impl XmlByteExt for u8 { 109 | #[inline] 110 | fn is_xml_digit(&self) -> bool { 111 | matches!(*self, b'0'...b'9') 112 | } 113 | 114 | #[inline] 115 | fn is_xml_hex_digit(&self) -> bool { 116 | matches!(*self, b'0'...b'9' | b'A'...b'F' | b'a'...b'f') 117 | } 118 | 119 | #[inline] 120 | fn is_xml_space(&self) -> bool { 121 | matches!(*self, b' ' | b'\t' | b'\n' | b'\r') 122 | } 123 | 124 | #[inline] 125 | fn is_xml_letter(&self) -> bool { 126 | matches!(*self, b'A'...b'Z' | b'a'...b'z') 127 | } 128 | 129 | #[inline] 130 | fn is_xml_name(&self) -> bool { 131 | matches!(*self, b'A'...b'Z' | b'a'...b'z'| b'0'...b'9'| b':' | b'_' | b'-' | b'.') 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /tests/integration/token.rs: -------------------------------------------------------------------------------- 1 | type Range = ::std::ops::Range; 2 | 3 | #[derive(PartialEq, Debug)] 4 | pub enum Token<'a> { 5 | Declaration(&'a str, Option<&'a str>, Option, Range), 6 | PI(&'a str, Option<&'a str>, Range), 7 | Comment(&'a str, Range), 8 | DtdStart(&'a str, Option>, Range), 9 | EmptyDtd(&'a str, Option>, Range), 10 | EntityDecl(&'a str, EntityDefinition<'a>, Range), 11 | DtdEnd(Range), 12 | ElementStart(&'a str, &'a str, Range), 13 | Attribute(&'a str, &'a str, &'a str, Range), 14 | ElementEnd(ElementEnd<'a>, Range), 15 | Text(&'a str, Range), 16 | Cdata(&'a str, Range), 17 | Error(String), 18 | } 19 | 20 | #[derive(PartialEq, Debug)] 21 | pub enum ElementEnd<'a> { 22 | Open, 23 | Close(&'a str, &'a str), 24 | Empty, 25 | } 26 | 27 | #[derive(PartialEq, Debug)] 28 | pub enum ExternalId<'a> { 29 | System(&'a str), 30 | Public(&'a str, &'a str), 31 | } 32 | 33 | #[derive(PartialEq, Debug)] 34 | pub enum EntityDefinition<'a> { 35 | EntityValue(&'a str), 36 | ExternalId(ExternalId<'a>), 37 | } 38 | 39 | #[macro_export] 40 | macro_rules! 
test { 41 | ($name:ident, $text:expr, $($token:expr),*) => ( 42 | #[test] 43 | fn $name() { 44 | let mut p = xml::Tokenizer::from($text); 45 | $( 46 | let t = p.next().unwrap(); 47 | assert_eq!(to_test_token(t), $token); 48 | )* 49 | assert!(p.next().is_none()); 50 | } 51 | ) 52 | } 53 | 54 | #[inline(never)] 55 | pub fn to_test_token(token: Result) -> Token { 56 | match token { 57 | Ok(xml::Token::Declaration { 58 | version, 59 | encoding, 60 | standalone, 61 | span, 62 | }) => Token::Declaration( 63 | version.as_str(), 64 | encoding.map(|v| v.as_str()), 65 | standalone, 66 | span.range(), 67 | ), 68 | Ok(xml::Token::ProcessingInstruction { 69 | target, 70 | content, 71 | span, 72 | }) => Token::PI(target.as_str(), content.map(|v| v.as_str()), span.range()), 73 | Ok(xml::Token::Comment { text, span }) => Token::Comment(text.as_str(), span.range()), 74 | Ok(xml::Token::DtdStart { 75 | name, 76 | external_id, 77 | span, 78 | }) => Token::DtdStart( 79 | name.as_str(), 80 | external_id.map(|v| to_test_external_id(v)), 81 | span.range(), 82 | ), 83 | Ok(xml::Token::EmptyDtd { 84 | name, 85 | external_id, 86 | span, 87 | }) => Token::EmptyDtd( 88 | name.as_str(), 89 | external_id.map(|v| to_test_external_id(v)), 90 | span.range(), 91 | ), 92 | Ok(xml::Token::EntityDeclaration { 93 | name, 94 | definition, 95 | span, 96 | }) => Token::EntityDecl( 97 | name.as_str(), 98 | match definition { 99 | xml::EntityDefinition::EntityValue(name) => { 100 | EntityDefinition::EntityValue(name.as_str()) 101 | } 102 | xml::EntityDefinition::ExternalId(id) => { 103 | EntityDefinition::ExternalId(to_test_external_id(id)) 104 | } 105 | }, 106 | span.range(), 107 | ), 108 | Ok(xml::Token::DtdEnd { span }) => Token::DtdEnd(span.range()), 109 | Ok(xml::Token::ElementStart { 110 | prefix, 111 | local, 112 | span, 113 | }) => Token::ElementStart(prefix.as_str(), local.as_str(), span.range()), 114 | Ok(xml::Token::Attribute { 115 | prefix, 116 | local, 117 | value, 118 | span, 119 | }) => 
Token::Attribute( 120 | prefix.as_str(), 121 | local.as_str(), 122 | value.as_str(), 123 | span.range(), 124 | ), 125 | Ok(xml::Token::ElementEnd { end, span }) => Token::ElementEnd( 126 | match end { 127 | xml::ElementEnd::Open => ElementEnd::Open, 128 | xml::ElementEnd::Close(prefix, local) => { 129 | ElementEnd::Close(prefix.as_str(), local.as_str()) 130 | } 131 | xml::ElementEnd::Empty => ElementEnd::Empty, 132 | }, 133 | span.range(), 134 | ), 135 | Ok(xml::Token::Text { text }) => Token::Text(text.as_str(), text.range()), 136 | Ok(xml::Token::Cdata { text, span }) => Token::Cdata(text.as_str(), span.range()), 137 | Err(ref e) => Token::Error(e.to_string()), 138 | } 139 | } 140 | 141 | fn to_test_external_id(id: xml::ExternalId) -> ExternalId { 142 | match id { 143 | xml::ExternalId::System(name) => ExternalId::System(name.as_str()), 144 | xml::ExternalId::Public(name, value) => ExternalId::Public(name.as_str(), value.as_str()), 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /tests/integration/pi.rs: -------------------------------------------------------------------------------- 1 | use crate::token::*; 2 | 3 | test!(pi_01, "", Token::PI("xslt", Some("ma"), 0..11)); 4 | 5 | test!( 6 | pi_02, 7 | "", 8 | Token::PI("xslt", Some("m"), 0..13) 9 | ); 10 | 11 | test!(pi_03, "", Token::PI("xslt", None, 0..8)); 12 | 13 | test!(pi_04, "", Token::PI("xslt", None, 0..9)); 14 | 15 | test!( 16 | pi_05, 17 | "", 18 | Token::PI("xml-stylesheet", None, 0..18) 19 | ); 20 | 21 | test!( 22 | pi_err_01, 23 | "", 24 | Token::Error("invalid processing instruction at 1:1 cause invalid name token".to_string()) 25 | ); 26 | 27 | test!( 28 | declaration_01, 29 | "", 30 | Token::Declaration("1.0", None, None, 0..21) 31 | ); 32 | 33 | test!( 34 | declaration_02, 35 | "", 36 | Token::Declaration("1.0", None, None, 0..21) 37 | ); 38 | 39 | test!( 40 | declaration_03, 41 | "", 42 | Token::Declaration("1.0", Some("UTF-8"), None, 0..38) 
43 | ); 44 | 45 | test!( 46 | declaration_04, 47 | "", 48 | Token::Declaration("1.0", Some("UTF-8"), None, 0..38) 49 | ); 50 | 51 | test!( 52 | declaration_05, 53 | "", 54 | Token::Declaration("1.0", Some("utf-8"), None, 0..38) 55 | ); 56 | 57 | test!( 58 | declaration_06, 59 | "", 60 | Token::Declaration("1.0", Some("EUC-JP"), None, 0..39) 61 | ); 62 | 63 | test!( 64 | declaration_07, 65 | "", 66 | Token::Declaration("1.0", Some("UTF-8"), Some(true), 0..55) 67 | ); 68 | 69 | test!( 70 | declaration_08, 71 | "", 72 | Token::Declaration("1.0", Some("UTF-8"), Some(false), 0..54) 73 | ); 74 | 75 | test!( 76 | declaration_09, 77 | "", 78 | Token::Declaration("1.0", None, Some(false), 0..37) 79 | ); 80 | 81 | test!( 82 | declaration_10, 83 | "", 84 | Token::Declaration("1.0", None, Some(false), 0..38) 85 | ); 86 | 87 | // Declaration with an invalid order 88 | test!( 89 | declaration_err_01, 90 | "", 91 | Token::Error("invalid XML declaration at 1:1 cause expected 'version' at 1:7".to_string()) 92 | ); 93 | 94 | test!( 95 | declaration_err_02, 96 | "", 97 | Token::Error("invalid XML declaration at 1:1 cause expected '\'' not '*' at 1:31".to_string()) 98 | ); 99 | 100 | test!( 101 | declaration_err_03, 102 | "", 103 | Token::Error("invalid XML declaration at 1:1 cause expected '1.' 
at 1:16".to_string()) 104 | ); 105 | 106 | test!( 107 | declaration_err_04, 108 | "", 109 | Token::Error("invalid XML declaration at 1:1 cause expected 'yes', 'no' at 1:33".to_string()) 110 | ); 111 | 112 | test!( 113 | declaration_err_05, 114 | "", 115 | Token::Error("invalid XML declaration at 1:1 cause expected '?>' at 1:21".to_string()) 116 | ); 117 | 118 | test!( 119 | declaration_err_06, 120 | "", 121 | Token::Error("invalid XML declaration at 1:1 cause expected '?>' at 1:55".to_string()) 122 | ); 123 | 124 | test!( 125 | declaration_err_07, 126 | "\u{000a}' at 3:7".to_string()) 128 | ); 129 | 130 | test!( 131 | declaration_err_08, 132 | "", 133 | Token::Error("invalid XML declaration at 1:1 cause expected 'version' at 2:2".to_string()) 134 | ); 135 | 136 | test!( 137 | declaration_err_09, 138 | "", 139 | Token::Error("invalid XML declaration at 1:1 cause expected 'version' at 2:2".to_string()) 140 | ); 141 | 142 | // XML declaration allowed only at the start of the document. 143 | test!( 144 | declaration_err_10, 145 | " ", 146 | Token::Error("unknown token at 1:2".to_string()) 147 | ); 148 | 149 | // XML declaration allowed only at the start of the document. 150 | test!( 151 | declaration_err_11, 152 | "", 153 | Token::Comment(" comment ", 0..16), 154 | Token::Error("unknown token at 1:17".to_string()) 155 | ); 156 | 157 | // Duplicate. 
158 | test!( 159 | declaration_err_12, 160 | "", 161 | Token::Declaration("1.0", None, None, 0..21), 162 | Token::Error("unknown token at 1:22".to_string()) 163 | ); 164 | 165 | test!( 166 | declaration_err_13, 167 | "", 168 | Token::Error( 169 | "invalid processing instruction at 1:1 cause a non-XML character '\\u{1}' found at 1:10" 170 | .to_string() 171 | ) 172 | ); 173 | 174 | test!( 175 | declaration_err_14, 176 | "", 177 | Token::Error("invalid XML declaration at 1:1 cause expected space not 'e' at 1:20".to_string()) 178 | ); 179 | 180 | test!( 181 | declaration_err_15, 182 | "", 183 | Token::Error("invalid XML declaration at 1:1 cause expected space not 's' at 1:37".to_string()) 184 | ); 185 | 186 | test!( 187 | declaration_err_16, 188 | "' at 1:20".to_string()) 190 | ); 191 | -------------------------------------------------------------------------------- /tests/integration/doctype.rs: -------------------------------------------------------------------------------- 1 | use crate::token::*; 2 | 3 | test!( 4 | dtd_01, 5 | "", 6 | Token::EmptyDtd("greeting", Some(ExternalId::System("hello.dtd")), 0..38) 7 | ); 8 | 9 | test!( 10 | dtd_02, 11 | "", 12 | Token::EmptyDtd( 13 | "greeting", 14 | Some(ExternalId::Public("hello.dtd", "goodbye.dtd")), 15 | 0..52 16 | ) 17 | ); 18 | 19 | test!( 20 | dtd_03, 21 | "", 22 | Token::EmptyDtd("greeting", Some(ExternalId::System("hello.dtd")), 0..38) 23 | ); 24 | 25 | test!( 26 | dtd_04, 27 | "", 28 | Token::EmptyDtd("greeting", None, 0..19) 29 | ); 30 | 31 | test!( 32 | dtd_05, 33 | "", 34 | Token::DtdStart("greeting", None, 0..20), 35 | Token::DtdEnd(20..22) 36 | ); 37 | 38 | test!( 39 | dtd_06, 40 | "
", 41 | Token::EmptyDtd("greeting", None, 0..19), 42 | Token::ElementStart("", "a", 19..21), 43 | Token::ElementEnd(ElementEnd::Empty, 21..23) 44 | ); 45 | 46 | test!( 47 | dtd_07, 48 | "", 49 | Token::DtdStart("greeting", None, 0..20), 50 | Token::DtdEnd(20..23) 51 | ); 52 | 53 | test!( 54 | dtd_08, 55 | "", 56 | Token::DtdStart("greeting", None, 0..20), 57 | Token::DtdEnd(21..24) 58 | ); 59 | 60 | test!( 61 | dtd_entity_01, 62 | " 64 | ]>", 65 | Token::DtdStart("svg", None, 0..15), 66 | Token::EntityDecl( 67 | "ns_extend", 68 | EntityDefinition::EntityValue("http://ns.adobe.com/Extensibility/1.0/"), 69 | 20..80, 70 | ), 71 | Token::DtdEnd(81..83) 72 | ); 73 | 74 | test!( 75 | dtd_entity_02, 76 | " 79 | ]>", 80 | Token::DtdStart("svg", None, 0..15), 81 | Token::EntityDecl( 82 | "Pub-Status", 83 | EntityDefinition::EntityValue("This is a pre-release of the\nspecification."), 84 | 20..86, 85 | ), 86 | Token::DtdEnd(87..89) 87 | ); 88 | 89 | test!( 90 | dtd_entity_03, 91 | " 93 | ]>", 94 | Token::DtdStart("svg", None, 0..15), 95 | Token::EntityDecl( 96 | "open-hatch", 97 | EntityDefinition::ExternalId(ExternalId::System( 98 | "http://www.textuality.com/boilerplate/OpenHatch.xml" 99 | )), 100 | 20..101, 101 | ), 102 | Token::DtdEnd(102..104) 103 | ); 104 | 105 | test!( 106 | dtd_entity_04, 107 | " 111 | ]>", 112 | Token::DtdStart("svg", None, 0..15), 113 | Token::EntityDecl( 114 | "open-hatch", 115 | EntityDefinition::ExternalId(ExternalId::Public( 116 | "-//Textuality//TEXT Standard open-hatch boilerplate//EN", 117 | "http://www.textuality.com/boilerplate/OpenHatch.xml" 118 | )), 119 | 20..185, 120 | ), 121 | Token::DtdEnd(186..188) 122 | ); 123 | 124 | // TODO: NDATA will be ignored 125 | test!( 126 | dtd_entity_05, 127 | " 129 | ]>", 130 | Token::DtdStart("svg", None, 0..15), 131 | Token::EntityDecl( 132 | "hatch-pic", 133 | EntityDefinition::ExternalId(ExternalId::System("../grafix/OpenHatch.gif")), 134 | 20..83, 135 | ), 136 | Token::DtdEnd(84..86) 137 | ); 138 | 
139 | // TODO: unsupported data will be ignored 140 | test!( 141 | dtd_entity_06, 142 | " 144 | 145 | 146 | 147 | ]>", 148 | Token::DtdStart("svg", None, 0..15), 149 | Token::EntityDecl( 150 | "ns_extend", 151 | EntityDefinition::EntityValue("http://ns.adobe.com/Extensibility/1.0/"), 152 | 44..104 153 | ), 154 | Token::DtdEnd(203..205) 155 | ); 156 | 157 | // We do not support !ELEMENT DTD token and it will be skipped. 158 | // Previously, we were calling `Tokenizer::next` after the skip, 159 | // which is recursive and could cause a stack overflow when there are too many sequential 160 | // unsupported tokens. 161 | // This tests checks that the current code do not crash with stack overflow. 162 | #[test] 163 | fn dtd_entity_07() { 164 | let mut text = "\n"); 167 | } 168 | text.push_str("]>\n"); 169 | 170 | let mut p = xml::Tokenizer::from(text.as_str()); 171 | assert_eq!( 172 | to_test_token(p.next().unwrap()), 173 | Token::DtdStart("svg", None, 0..15) 174 | ); 175 | assert_eq!( 176 | to_test_token(p.next().unwrap()), 177 | Token::DtdEnd(10016..10018) 178 | ); 179 | } 180 | 181 | test!( 182 | dtd_err_01, 183 | "\u{000a}<", 184 | Token::Error("invalid DTD at 1:1 cause expected space not 'E' at 1:10".to_string()) 185 | ); 186 | 187 | test!( 188 | dtd_err_02, 189 | "' not '!' at 1:16".to_string()) 217 | ); 218 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use core::fmt; 2 | use core::str; 3 | #[cfg(feature = "std")] 4 | use std::error; 5 | 6 | /// An XML parser errors. 
7 | #[allow(missing_docs)] 8 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] 9 | pub enum Error { 10 | InvalidDeclaration(StreamError, TextPos), 11 | InvalidComment(StreamError, TextPos), 12 | InvalidPI(StreamError, TextPos), 13 | InvalidDoctype(StreamError, TextPos), 14 | InvalidEntity(StreamError, TextPos), 15 | InvalidElement(StreamError, TextPos), 16 | InvalidAttribute(StreamError, TextPos), 17 | InvalidCdata(StreamError, TextPos), 18 | InvalidCharData(StreamError, TextPos), 19 | UnknownToken(TextPos), 20 | } 21 | 22 | impl Error { 23 | /// Returns the error position. 24 | pub fn pos(&self) -> TextPos { 25 | match *self { 26 | Error::InvalidDeclaration(_, pos) => pos, 27 | Error::InvalidComment(_, pos) => pos, 28 | Error::InvalidPI(_, pos) => pos, 29 | Error::InvalidDoctype(_, pos) => pos, 30 | Error::InvalidEntity(_, pos) => pos, 31 | Error::InvalidElement(_, pos) => pos, 32 | Error::InvalidAttribute(_, pos) => pos, 33 | Error::InvalidCdata(_, pos) => pos, 34 | Error::InvalidCharData(_, pos) => pos, 35 | Error::UnknownToken(pos) => pos, 36 | } 37 | } 38 | } 39 | 40 | impl fmt::Display for Error { 41 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 42 | match *self { 43 | Error::InvalidDeclaration(ref cause, pos) => { 44 | write!(f, "invalid XML declaration at {} cause {}", pos, cause) 45 | } 46 | Error::InvalidComment(ref cause, pos) => { 47 | write!(f, "invalid comment at {} cause {}", pos, cause) 48 | } 49 | Error::InvalidPI(ref cause, pos) => { 50 | write!( 51 | f, 52 | "invalid processing instruction at {} cause {}", 53 | pos, cause 54 | ) 55 | } 56 | Error::InvalidDoctype(ref cause, pos) => { 57 | write!(f, "invalid DTD at {} cause {}", pos, cause) 58 | } 59 | Error::InvalidEntity(ref cause, pos) => { 60 | write!(f, "invalid DTD entity at {} cause {}", pos, cause) 61 | } 62 | Error::InvalidElement(ref cause, pos) => { 63 | write!(f, "invalid element at {} cause {}", pos, cause) 64 | } 65 | Error::InvalidAttribute(ref cause, pos) => { 66 | 
write!(f, "invalid attribute at {} cause {}", pos, cause) 67 | } 68 | Error::InvalidCdata(ref cause, pos) => { 69 | write!(f, "invalid CDATA at {} cause {}", pos, cause) 70 | } 71 | Error::InvalidCharData(ref cause, pos) => { 72 | write!(f, "invalid character data at {} cause {}", pos, cause) 73 | } 74 | Error::UnknownToken(pos) => { 75 | write!(f, "unknown token at {}", pos) 76 | } 77 | } 78 | } 79 | } 80 | 81 | #[cfg(feature = "std")] 82 | impl error::Error for Error { 83 | fn description(&self) -> &str { 84 | "an XML parsing error" 85 | } 86 | } 87 | 88 | /// Stream parser errors. 89 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] 90 | pub enum StreamError { 91 | /// The stream ended earlier than we expected. 92 | /// 93 | /// Should only appear on invalid input data. 94 | /// Errors in valid XML should be reported by the variants below. 95 | UnexpectedEndOfStream, 96 | 97 | /// An invalid name. 98 | InvalidName, 99 | 100 | /// A non-XML character has occurred. 101 | /// 102 | /// Valid characters are: <https://www.w3.org/TR/xml/#char32> 103 | NonXmlChar(char, TextPos), 104 | 105 | /// An invalid/unexpected character. 106 | /// 107 | /// The first byte is the actual one, the second is the expected one. 108 | /// 109 | /// We are using a single value to reduce the struct size. 110 | InvalidChar(u8, u8, TextPos), 111 | 112 | /// An invalid/unexpected character. 113 | /// 114 | /// Just like `InvalidChar`, but specifies multiple expected characters. 115 | InvalidCharMultiple(u8, &'static [u8], TextPos), 116 | 117 | /// An unexpected character instead of `"` or `'`. 118 | InvalidQuote(u8, TextPos), 119 | 120 | /// An unexpected character instead of an XML space. 121 | /// 122 | /// Includes: `' ' \n \r \t `. 123 | InvalidSpace(u8, TextPos), 124 | 125 | /// An unexpected string. 126 | /// 127 | /// Contains the string that was expected. 128 | InvalidString(&'static str, TextPos), 129 | 130 | /// An invalid reference. 131 | InvalidReference, 132 | 133 | /// An invalid ExternalID in the DTD.
134 | InvalidExternalID, 135 | 136 | /// A comment cannot contain `--`. 137 | InvalidCommentData, 138 | 139 | /// A comment cannot end with `-`. 140 | InvalidCommentEnd, 141 | 142 | /// A Character Data node contains invalid data. 143 | /// 144 | /// Currently, only `]]>` is not allowed. 145 | InvalidCharacterData, 146 | } 147 | 148 | impl fmt::Display for StreamError { 149 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 150 | match *self { 151 | StreamError::UnexpectedEndOfStream => { 152 | write!(f, "unexpected end of stream") 153 | } 154 | StreamError::InvalidName => { 155 | write!(f, "invalid name token") 156 | } 157 | StreamError::NonXmlChar(c, pos) => { 158 | write!(f, "a non-XML character {:?} found at {}", c, pos) 159 | } 160 | StreamError::InvalidChar(actual, expected, pos) => { 161 | write!( 162 | f, 163 | "expected '{}' not '{}' at {}", 164 | expected as char, actual as char, pos 165 | ) 166 | } 167 | StreamError::InvalidCharMultiple(actual, expected, pos) => { 168 | let mut expected_iter = expected.iter().peekable(); 169 | 170 | write!(f, "expected ")?; 171 | while let Some(&c) = expected_iter.next() { 172 | write!(f, "'{}'", c as char)?; 173 | if expected_iter.peek().is_some() { 174 | write!(f, ", ")?; 175 | } 176 | } 177 | write!(f, " not '{}' at {}", actual as char, pos) 178 | } 179 | StreamError::InvalidQuote(c, pos) => { 180 | write!(f, "expected quote mark not '{}' at {}", c as char, pos) 181 | } 182 | StreamError::InvalidSpace(c, pos) => { 183 | write!(f, "expected space not '{}' at {}", c as char, pos) 184 | } 185 | StreamError::InvalidString(expected, pos) => { 186 | write!(f, "expected '{}' at {}", expected, pos) 187 | } 188 | StreamError::InvalidReference => { 189 | write!(f, "invalid reference") 190 | } 191 | StreamError::InvalidExternalID => { 192 | write!(f, "invalid ExternalID") 193 | } 194 | StreamError::InvalidCommentData => { 195 | write!(f, "'--' is not allowed in comments") 196 | } 197 | StreamError::InvalidCommentEnd => {
198 | write!(f, "comment cannot end with '-'") 199 | } 200 | StreamError::InvalidCharacterData => { 201 | write!(f, "']]>' is not allowed inside a character data") 202 | } 203 | } 204 | } 205 | } 206 | 207 | #[cfg(feature = "std")] 208 | impl error::Error for StreamError { 209 | fn description(&self) -> &str { 210 | "an XML stream parsing error" 211 | } 212 | } 213 | 214 | /// Position in text. 215 | /// 216 | /// Position indicates a row/line and a column in the original text. Starting from 1:1. 217 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] 218 | #[allow(missing_docs)] 219 | pub struct TextPos { 220 | pub row: u32, 221 | pub col: u32, 222 | } 223 | 224 | impl TextPos { 225 | /// Constructs a new `TextPos`. 226 | /// 227 | /// Should not be invoked manually, but rather via `Stream::gen_text_pos`. 228 | pub fn new(row: u32, col: u32) -> TextPos { 229 | TextPos { row, col } 230 | } 231 | } 232 | 233 | impl fmt::Display for TextPos { 234 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 235 | write!(f, "{}:{}", self.row, self.col) 236 | } 237 | } 238 | -------------------------------------------------------------------------------- /tests/integration/elements.rs: -------------------------------------------------------------------------------- 1 | use crate::token::*; 2 | 3 | test!( 4 | element_01, 5 | "", 6 | Token::ElementStart("", "a", 0..2), 7 | Token::ElementEnd(ElementEnd::Empty, 2..4) 8 | ); 9 | 10 | test!( 11 | element_02, 12 | "", 13 | Token::ElementStart("", "a", 0..2), 14 | Token::ElementEnd(ElementEnd::Open, 2..3), 15 | Token::ElementEnd(ElementEnd::Close("", "a"), 3..7) 16 | ); 17 | 18 | test!( 19 | element_03, 20 | " \t \n ", 21 | Token::ElementStart("", "a", 5..7), 22 | Token::ElementEnd(ElementEnd::Empty, 7..9) 23 | ); 24 | 25 | test!( 26 | element_04, 27 | " \t \n ", 28 | Token::ElementStart("", "b", 5..7), 29 | Token::ElementEnd(ElementEnd::Open, 7..8), 30 | Token::ElementStart("", "a", 8..10), 31 | 
Token::ElementEnd(ElementEnd::Empty, 10..12), 32 | Token::ElementEnd(ElementEnd::Close("", "b"), 12..16) 33 | ); 34 | 35 | test!( 36 | element_06, 37 | "<俄语 լեզու=\"ռուսերեն\">данные", 38 | Token::ElementStart("", "俄语", 0..7), 39 | Token::Attribute("", "լեզու", "ռուսերեն", 8..37), 40 | Token::ElementEnd(ElementEnd::Open, 37..38), 41 | Token::Text("данные", 38..50), 42 | Token::ElementEnd(ElementEnd::Close("", "俄语"), 50..59) 43 | ); 44 | 45 | test!( 46 | element_07, 47 | "", 48 | Token::ElementStart("svg", "circle", 0..11), 49 | Token::ElementEnd(ElementEnd::Open, 11..12), 50 | Token::ElementEnd(ElementEnd::Close("svg", "circle"), 12..25) 51 | ); 52 | 53 | test!( 54 | element_08, 55 | "<:circle/>", 56 | Token::ElementStart("", "circle", 0..8), 57 | Token::ElementEnd(ElementEnd::Empty, 8..10) 58 | ); 59 | 60 | test!( 61 | element_err_01, 62 | "<>", 63 | Token::Error("invalid element at 1:1 cause invalid name token".to_string()) 64 | ); 65 | 66 | test!( 67 | element_err_02, 68 | "", 113 | Token::ElementStart("", "a", 0..2), 114 | Token::ElementEnd(ElementEnd::Open, 2..3), 115 | Token::ElementEnd(ElementEnd::Close("", "a"), 3..7), 116 | Token::Error("unknown token at 1:8".to_string()) 117 | ); 118 | 119 | test!( 120 | element_err_10, 121 | "", 122 | Token::ElementStart("", "a", 0..2), 123 | Token::ElementEnd(ElementEnd::Empty, 2..4), 124 | Token::Error("unknown token at 1:5".to_string()) 125 | ); 126 | 127 | test!( 128 | element_err_11, 129 | "
", 130 | Token::ElementStart("", "a", 0..2), 131 | Token::ElementEnd(ElementEnd::Open, 2..3), 132 | Token::Error("invalid element at 1:4 cause expected '>' not '/' at 1:8".to_string()) 133 | ); 134 | 135 | test!( 136 | element_err_12, 137 | "", 138 | Token::Error("invalid element at 1:1 cause invalid name token".to_string()) 139 | ); 140 | 141 | test!( 142 | element_err_13, 143 | "\ 144 | 145 | 146 | ", 147 | Token::ElementStart("", "root", 0..5), 148 | Token::ElementEnd(ElementEnd::Open, 5..6), 149 | Token::Text("\n", 6..7), 150 | Token::ElementEnd(ElementEnd::Close("", "root"), 7..14), 151 | Token::Error("unknown token at 3:1".to_string()) 152 | ); 153 | 154 | test!( 155 | element_err_14, 156 | "<-svg/>", 157 | Token::Error("invalid element at 1:1 cause invalid name token".to_string()) 158 | ); 159 | 160 | test!( 161 | element_err_15, 162 | "", 163 | Token::Error("invalid element at 1:1 cause invalid name token".to_string()) 164 | ); 165 | 166 | test!( 167 | element_err_16, 168 | "", 169 | Token::Error("invalid element at 1:1 cause invalid name token".to_string()) 170 | ); 171 | 172 | test!( 173 | element_err_17, 174 | "", 175 | Token::Error("invalid element at 1:1 cause invalid name token".to_string()) 176 | ); 177 | 178 | test!( 179 | element_err_18, 180 | "<::svg/>", 181 | Token::Error("invalid element at 1:1 cause invalid name token".to_string()) 182 | ); 183 | 184 | test!( 185 | element_err_19, 186 | "<", 187 | Token::ElementStart("", "a", 0..2), 188 | Token::ElementEnd(ElementEnd::Open, 2..3), 189 | Token::Error("unknown token at 1:4".to_string()) 190 | ); 191 | 192 | test!( 193 | attribute_01, 194 | "", 195 | Token::ElementStart("", "a", 0..2), 196 | Token::Attribute("", "ax", "test", 3..12), 197 | Token::ElementEnd(ElementEnd::Empty, 12..14) 198 | ); 199 | 200 | test!( 201 | attribute_02, 202 | "", 203 | Token::ElementStart("", "a", 0..2), 204 | Token::Attribute("", "ax", "test", 3..12), 205 | Token::ElementEnd(ElementEnd::Empty, 12..14) 206 | ); 207 | 
208 | test!( 209 | attribute_03, 210 | "", 211 | Token::ElementStart("", "a", 0..2), 212 | Token::Attribute("", "b", "test1", 3..12), 213 | Token::Attribute("", "c", "test2", 13..22), 214 | Token::ElementEnd(ElementEnd::Empty, 22..24) 215 | ); 216 | 217 | test!( 218 | attribute_04, 219 | "", 220 | Token::ElementStart("", "a", 0..2), 221 | Token::Attribute("", "b", "\"test1\"", 3..14), 222 | Token::Attribute("", "c", "'test2'", 15..26), 223 | Token::ElementEnd(ElementEnd::Empty, 26..28) 224 | ); 225 | 226 | test!( 227 | attribute_05, 228 | "", 229 | Token::ElementStart("", "c", 0..2), 230 | Token::Attribute("", "a", "test1' c='test2", 3..22), 231 | Token::Attribute("", "b", "test1\" c=\"test2", 23..42), 232 | Token::ElementEnd(ElementEnd::Empty, 42..44) 233 | ); 234 | 235 | test!( 236 | attribute_06, 237 | "", 238 | Token::ElementStart("", "c", 0..2), 239 | Token::Attribute("", "a", "test1", 5..21), 240 | Token::ElementEnd(ElementEnd::Empty, 26..28) 241 | ); 242 | 243 | test!( 244 | attribute_07, 245 | "", 246 | Token::ElementStart("", "c", 0..2), 247 | Token::Attribute("q", "a", "b", 3..10), 248 | Token::ElementEnd(ElementEnd::Empty, 10..12) 249 | ); 250 | 251 | test!( 252 | attribute_err_01, 253 | "", 254 | Token::ElementStart("", "c", 0..2), 255 | Token::Error("invalid attribute at 1:3 cause expected quote mark not 't' at 1:7".to_string()) 256 | ); 257 | 258 | test!( 259 | attribute_err_02, 260 | "", 261 | Token::ElementStart("", "c", 0..2), 262 | Token::Error("invalid attribute at 1:3 cause expected \'=\' not \'>\' at 1:5".to_string()) 263 | ); 264 | 265 | test!( 266 | attribute_err_03, 267 | "", 268 | Token::ElementStart("", "c", 0..2), 269 | Token::Error("invalid attribute at 1:3 cause expected '=' not '/' at 1:5".to_string()) 270 | ); 271 | 272 | test!( 273 | attribute_err_04, 274 | "", 275 | Token::ElementStart("", "c", 0..2), 276 | Token::Attribute("", "a", "b", 3..8), 277 | Token::Error("invalid attribute at 1:9 cause expected '=' not '/' at 
1:11".to_string()) 278 | ); 279 | 280 | test!( 281 | attribute_err_05, 282 | "", 283 | Token::ElementStart("", "c", 0..2), 284 | Token::Error("invalid attribute at 1:3 cause expected ''' not '<' at 1:7".to_string()) 285 | ); 286 | 287 | test!( 288 | attribute_err_06, 289 | "", 290 | Token::ElementStart("", "c", 0..2), 291 | Token::Error( 292 | "invalid attribute at 1:3 cause a non-XML character '\\u{1}' found at 1:7".to_string() 293 | ) 294 | ); 295 | 296 | test!( 297 | attribute_err_07, 298 | "", 299 | Token::ElementStart("", "c", 0..2), 300 | Token::Attribute("", "a", "v", 3..8), 301 | Token::Error("invalid attribute at 1:9 cause expected space not 'b' at 1:9".to_string()) 302 | ); 303 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](http://keepachangelog.com/) 5 | and this project adheres to [Semantic Versioning](http://semver.org/). 6 | 7 | ## [Unreleased] 8 | 9 | ## [0.13.6] - 2023-09-30 10 | ### Added 11 | - `Token::span`, `Tokenizer::stream` and allow cloning of `Tokenizer`. 12 | Thanks to [@krtab](https://github.com/krtab). 13 | 14 | ### Changed 15 | - Optimize `is_xml_char` function. Makes parsing 5-10% faster. 16 | Thanks to [@Simon-Martens](https://github.com/Simon-Martens). 17 | 18 | ## [0.13.5] - 2022-10-18 19 | ### Fixed 20 | - Do not use recursive calls during parsing. Could lead to stack overflow on some input. 21 | - Revert _Do not expand predefined references in `Stream::consume_reference`._ 22 | - Tests on Rust 1.61. Thanks to [@krtab](https://github.com/krtab). 23 | 24 | ## [0.13.4] - 2021-06-24 25 | ### Fixed 26 | - Do not expand predefined references in `Stream::consume_reference`. 27 | Thanks to [@Jesse-Bakker](https://github.com/Jesse-Bakker).
28 | 29 | ## [0.13.3] - 2020-09-02 30 | ### Changed 31 | - Documentation fixes by [@kneasle](https://github.com/kneasle). 32 | 33 | ### Fixed 34 | - `DtdEnd` token parsing when `]` and `>` are separated by whitespace. 35 | 36 | ## [0.13.2] - 2020-06-15 37 | ### Fixed 38 | - Allow processing instruction before DTD. 39 | 40 | ## [0.13.1] - 2020-03-12 41 | ### Fixed 42 | - Allow comments before DTD. 43 | 44 | ## [0.13.0] - 2020-01-07 45 | ### Changed 46 | - Moved to Rust 2018. 47 | - Completely new `Error` enum. 48 | - New error messages. 49 | - 10-20% faster parsing. 50 | - Use `Tokenizer::from_fragment` instead of `Tokenizer::enable_fragment_mode`. 51 | 52 | ### Removed 53 | - `TokenType`. 54 | 55 | ## [0.12.0] - 2019-12-21 56 | ### Changed 57 | - `]]>` is no longer allowed inside a Text node. 58 | - Only [XML characters](https://www.w3.org/TR/xml/#char32) are allowed now. 59 | Otherwise, `StreamError::NonXmlChar` will occur. 60 | - Disallow `-` at the end of a comment. `<!--->` is an error now. 61 | - A missing space between attributes is an error now. 62 | - `StreamError::InvalidQuote` and `StreamError::InvalidSpace` signatures changed. 63 | 64 | ## [0.11.0] - 2019-11-18 65 | ### Added 66 | - `no_std` support thanks to [hugwijst](https://github.com/hugwijst). 67 | 68 | ### Changed 69 | - `StreamError::InvalidString` doesn't store an actual string now. 70 | 71 | ## [0.10.0] - 2019-09-14 72 | ### Changed 73 | - 10-15% faster parsing. 74 | - Merge `ByteStream` and `Stream`. 75 | - `StreamError::InvalidChar` signature changed. 76 | - `StreamError::InvalidChar` was split into `InvalidChar` and `InvalidCharMultiple`. 77 | 78 | ### Fixed 79 | - Check for [NameStartChar](https://www.w3.org/TR/xml/#NT-NameStartChar) 80 | during qualified name parsing. 81 | 82 | E.g. `<-p>` is an invalid tag name from now on. 83 | - Qualified name with multiple `:` is an error now. 84 | - `]>` is a valid text/`CharData` now. Previously it was parsed as `DoctypeEnd`.
85 | 86 | ### Removed 87 | - `StreamError::InvalidAttributeValue`. `StreamError::InvalidChar` will be emitted instead. 88 | 89 | ## [0.9.0] - 2019-02-27 90 | ### Added 91 | - `span` field to all `Token` variants, which contains a whole token span in bytes. 92 | - `Stream::try_consume_byte`. 93 | 94 | ### Changed 95 | - All `Token` variants are structs now and not tuples. 96 | - `StrSpan` contains an actual string span and not only a region now. 97 | 98 | So we can use a non-panicking and zero-cost `StrSpan::as_str` instead 99 | of `StrSpan::to_str`, which was performing slicing each time. 100 | - Split `Stream` into `ByteStream` and `Stream`. 101 | - `Stream::skip_spaces` will parse only ASCII whitespace now. 102 | - Rename `StrSpan::to_str` into `StrSpan::as_str`. 103 | - Rename `Reference::EntityRef` into `Reference::Entity`. 104 | - Rename `Reference::CharRef` into `Reference::Char`. 105 | - `StrSpan::from_substr` and `StrSpan::slice_region` are private now. 106 | 107 | ### Removed 108 | - `Token::Whitespaces`. Will be parsed as `Token::Text`. 109 | - `Stream::curr_char`. 110 | - `Stream::is_curr_byte_eq`. 111 | - `Stream::consume_either`. 112 | - `Stream::skip_ascii_spaces`. Use `Stream::skip_spaces` instead. 113 | - `StrSpan::trim`. 114 | - `StrSpan::len`. 115 | - `StrSpan::full_len`. 116 | - `StrSpan::as_bytes`. 117 | 118 | ### Fixed 119 | - Declaration attributes with mixed quotes parsing. 120 | 121 | ## [0.8.1] - 2019-01-02 122 | ### Changed 123 | - Changed the crate category in the Cargo.toml. 124 | 125 | ## [0.8.0] - 2018-12-13 126 | ### Added 127 | - `Error::pos()`. 128 | 129 | ### Changed 130 | - Rename `Stream::gen_error_pos` into `Stream::gen_text_pos`. 131 | - Rename `Stream::gen_error_pos_from` into `Stream::gen_text_pos_from`. 132 | - `Stream::gen_text_pos` speed up. 133 | 134 | ### Fixed 135 | - `TextPos` is Unicode-aware now. 136 | - XML declaration parsing when file has a BOM.
137 | 138 | ## [0.7.0] - 2018-10-29 139 | ### Changed 140 | - `<` inside an attribute value is an error now. 141 | - `Token::Declaration` represents *standalone* as `bool` now. 142 | - XML declaration must be defined only once now. 143 | - XML declaration must start at 0 position. 144 | - DTD must be defined only once now. 145 | 146 | ## [0.6.1] - 2018-10-08 147 | ### Added 148 | - `Stream::curr_byte_unchecked`. 149 | 150 | ### Fixed 151 | - UTF-8 BOM processing. 152 | 153 | ## [0.6.0] - 2018-08-31 154 | ### Changed 155 | - `Reference::EntityRef` contains `&str` and not `StrSpan` now. 156 | - Rename `Stream::try_consume_char_reference` into `try_consume_reference`. 157 | And it will return `Reference` and not `char` now. 158 | - Rename `Tokenizer::set_fragment_mode` into `enable_fragment_mode`. 159 | - Rename `ErrorPos` into `TextPos`. 160 | 161 | ### Fixed 162 | - `TextPos` calculation via `Stream::gen_error_pos`. 163 | 164 | ### Removed 165 | - `TextUnescape` and `XmlSpace` because useless. 166 | 167 | ## [0.5.0] - 2018-06-14 168 | ### Added 169 | - `StreamError::InvalidChar`. 170 | - `StreamError::InvalidSpace`. 171 | - `StreamError::InvalidString`. 172 | 173 | ### Changed 174 | - `Stream::consume_reference` will return only `InvalidReference` error from now. 175 | - `Error::InvalidTokenWithCause` merged into `Error::InvalidToken`. 176 | - `Stream::gen_error_pos_from` does not require `mut self` from now. 177 | - `StreamError::InvalidChar` requires `Vec` and not `String` from now. 178 | - `ErrorPos` uses `u32` and not `usize` from now. 179 | 180 | ### Removed 181 | - `failure` dependency. 182 | - `log` dependency. 183 | 184 | ## [0.4.1] - 2018-05-23 185 | ### Added 186 | - An ability to parse an XML fragment. 187 | 188 | ## [0.4.0] - 2018-04-21 189 | ### Changed 190 | - Relicense from MIT to MIT/Apache-2.0. 191 | 192 | ### Removed 193 | - `FromSpan` trait. 194 | - `from_str` and `from_span` methods are removed. Use the `From` trait instead. 
195 | 196 | ## [0.3.0] - 2018-04-10 197 | ### Changed 198 | - Use `failure` instead of `error-chain`. 199 | - Minimum Rust version is 1.18. 200 | - New error messages. 201 | - `TokenType` is properly public now. 202 | 203 | ### Removed 204 | - `ChainedError` 205 | 206 | ## [0.2.0] - 2018-03-11 207 | ### Added 208 | - Qualified name parsing. 209 | 210 | ### Changed 211 | - **Breaking**. `Token::ElementStart` and `Token::Attribute` contains prefix 212 | and local part of the qualified name now. 213 | 214 | ## [0.1.2] - 2018-02-12 215 | ### Added 216 | - `Stream::skip_ascii_spaces`. 217 | - Small performance optimizations. 218 | 219 | ## [0.1.1] - 2018-01-17 220 | ### Changed 221 | - `log` 0.3 -> 0.4 222 | 223 | [Unreleased]: https://github.com/RazrFalcon/xmlparser/compare/v0.13.6...HEAD 224 | [0.13.6]: https://github.com/RazrFalcon/xmlparser/compare/v0.13.5...v0.13.6 225 | [0.13.5]: https://github.com/RazrFalcon/xmlparser/compare/v0.13.4...v0.13.5 226 | [0.13.4]: https://github.com/RazrFalcon/xmlparser/compare/v0.13.3...v0.13.4 227 | [0.13.3]: https://github.com/RazrFalcon/xmlparser/compare/v0.13.2...v0.13.3 228 | [0.13.2]: https://github.com/RazrFalcon/xmlparser/compare/v0.13.1...v0.13.2 229 | [0.13.1]: https://github.com/RazrFalcon/xmlparser/compare/v0.13.0...v0.13.1 230 | [0.13.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.12.0...v0.13.0 231 | [0.12.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.11.0...v0.12.0 232 | [0.11.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.10.0...v0.11.0 233 | [0.10.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.9.0...v0.10.0 234 | [0.9.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.8.1...v0.9.0 235 | [0.8.1]: https://github.com/RazrFalcon/xmlparser/compare/v0.8.0...v0.8.1 236 | [0.8.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.7.0...v0.8.0 237 | [0.7.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.6.1...v0.7.0 238 | [0.6.1]: 
https://github.com/RazrFalcon/xmlparser/compare/v0.6.0...v0.6.1 239 | [0.6.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.5.0...v0.6.0 240 | [0.5.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.4.1...v0.5.0 241 | [0.4.1]: https://github.com/RazrFalcon/xmlparser/compare/v0.4.0...v0.4.1 242 | [0.4.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.3.0...v0.4.0 243 | [0.3.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.2.0...v0.3.0 244 | [0.2.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.1.2...v0.2.0 245 | [0.1.2]: https://github.com/RazrFalcon/xmlparser/compare/v0.1.1...v0.1.2 246 | [0.1.1]: https://github.com/RazrFalcon/xmlparser/compare/v0.1.0...v0.1.1 247 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /src/stream.rs: -------------------------------------------------------------------------------- 1 | use core::char; 2 | use core::cmp; 3 | use core::ops::Range; 4 | use core::str; 5 | 6 | use crate::{StrSpan, StreamError, TextPos, XmlByteExt, XmlCharExt}; 7 | 8 | type Result = ::core::result::Result; 9 | 10 | /// Representation of the [Reference](https://www.w3.org/TR/xml/#NT-Reference) value. 11 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] 12 | pub enum Reference<'a> { 13 | /// An entity reference; holds the entity name without the surrounding `&` and `;`. 14 | /// 15 | /// 16 | Entity(&'a str), 17 | 18 | /// A character reference, or a predefined entity (`quot`, `amp`, `apos`, `lt`, `gt`) resolved to its character. 19 | /// 20 | /// 21 | Char(char), 22 | } 23 | 24 | /// A streaming XML parsing interface. 
25 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] 26 | pub struct Stream<'a> { 27 | pos: usize, 28 | end: usize, 29 | span: StrSpan<'a>, 30 | } 31 | 32 | impl<'a> From<&'a str> for Stream<'a> { 33 | #[inline] 34 | fn from(text: &'a str) -> Self { 35 | Stream { 36 | pos: 0, 37 | end: text.len(), 38 | span: text.into(), 39 | } 40 | } 41 | } 42 | 43 | impl<'a> From> for Stream<'a> { 44 | #[inline] 45 | fn from(span: StrSpan<'a>) -> Self { 46 | Stream { 47 | pos: 0, 48 | end: span.as_str().len(), 49 | span, 50 | } 51 | } 52 | } 53 | 54 | impl<'a> Stream<'a> { 55 | /// Creates a new stream from a specified `text` substring; `fragment` is the byte range of `text` to parse. 56 | #[inline] 57 | pub fn from_substr(text: &'a str, fragment: Range) -> Self { 58 | Stream { 59 | pos: fragment.start, 60 | end: fragment.end, 61 | span: text.into(), 62 | } 63 | } 64 | 65 | /// Returns the underlying string span. 66 | #[inline] 67 | pub fn span(&self) -> StrSpan<'a> { 68 | self.span 69 | } 70 | 71 | /// Returns the current position. 72 | #[inline] 73 | pub fn pos(&self) -> usize { 74 | self.pos 75 | } 76 | 77 | /// Sets current position equal to the end. 78 | /// 79 | /// Used to indicate end of parsing on error. 80 | #[inline] 81 | pub fn jump_to_end(&mut self) { 82 | self.pos = self.end; 83 | } 84 | 85 | /// Checks if the stream has reached the end. 86 | /// 87 | /// Any [`pos()`] value equal to or larger than the stream end position indicates stream end. 88 | /// 89 | /// Accessing stream after reaching end via safe methods will produce 90 | /// an `UnexpectedEndOfStream` error. 91 | /// 92 | /// Accessing stream after reaching end via *_unchecked methods will produce 93 | /// a Rust's bound checking error. 94 | /// 95 | /// [`pos()`]: #method.pos 96 | #[inline] 97 | pub fn at_end(&self) -> bool { 98 | self.pos >= self.end 99 | } 100 | 101 | /// Returns the byte at the current stream position. 
102 | /// 103 | /// # Errors 104 | /// 105 | /// - `UnexpectedEndOfStream` 106 | #[inline] 107 | pub fn curr_byte(&self) -> Result { 108 | if self.at_end() { 109 | return Err(StreamError::UnexpectedEndOfStream); 110 | } 111 | 112 | Ok(self.curr_byte_unchecked()) 113 | } 114 | 115 | /// Returns the byte at the current stream position. 116 | /// 117 | /// # Panics 118 | /// 119 | /// - if the current position is at or after the end of the data 120 | #[inline] 121 | pub fn curr_byte_unchecked(&self) -> u8 { 122 | self.span.as_bytes()[self.pos] 123 | } 124 | 125 | /// Returns the byte following the current stream position. 126 | /// 127 | /// # Errors 128 | /// 129 | /// - `UnexpectedEndOfStream` 130 | #[inline] 131 | pub fn next_byte(&self) -> Result { 132 | if self.pos + 1 >= self.end { 133 | return Err(StreamError::UnexpectedEndOfStream); 134 | } 135 | 136 | Ok(self.span.as_bytes()[self.pos + 1]) 137 | } 138 | 139 | /// Advances by `n` bytes. 140 | /// 141 | /// # Examples 142 | /// 143 | /// ```rust,should_panic 144 | /// use xmlparser::Stream; 145 | /// 146 | /// let mut s = Stream::from("text"); 147 | /// s.advance(2); // ok 148 | /// s.advance(20); // will cause a panic via debug_assert!(). 149 | /// ``` 150 | #[inline] 151 | pub fn advance(&mut self, n: usize) { 152 | debug_assert!(self.pos + n <= self.end); 153 | self.pos += n; 154 | } 155 | 156 | /// Checks that the stream starts with a selected text. 157 | /// 158 | /// We are using `&[u8]` instead of `&str` for performance reasons. 
159 | /// 160 | /// # Examples 161 | /// 162 | /// ``` 163 | /// use xmlparser::Stream; 164 | /// 165 | /// let mut s = Stream::from("Some text."); 166 | /// s.advance(5); 167 | /// assert_eq!(s.starts_with(b"text"), true); 168 | /// assert_eq!(s.starts_with(b"long"), false); 169 | /// ``` 170 | #[inline] 171 | pub fn starts_with(&self, text: &[u8]) -> bool { 172 | self.span.as_bytes()[self.pos..self.end].starts_with(text) 173 | } 174 | 175 | /// Consumes the current byte if it's equal to the provided byte. 176 | /// 177 | /// # Errors 178 | /// 179 | /// - `InvalidChar` 180 | /// - `UnexpectedEndOfStream` 181 | /// 182 | /// # Examples 183 | /// 184 | /// ``` 185 | /// use xmlparser::Stream; 186 | /// 187 | /// let mut s = Stream::from("Some text."); 188 | /// assert!(s.consume_byte(b'S').is_ok()); 189 | /// assert!(s.consume_byte(b'o').is_ok()); 190 | /// assert!(s.consume_byte(b'm').is_ok()); 191 | /// assert!(s.consume_byte(b'q').is_err()); 192 | /// ``` 193 | pub fn consume_byte(&mut self, c: u8) -> Result<()> { 194 | let curr = self.curr_byte()?; 195 | if curr != c { 196 | return Err(StreamError::InvalidChar(curr, c, self.gen_text_pos())); 197 | } 198 | 199 | self.advance(1); 200 | Ok(()) 201 | } 202 | 203 | /// Tries to consume the current byte if it's equal to the provided byte. 204 | /// 205 | /// Unlike `consume_byte()`, it never returns an error; it returns `true` only when the byte was consumed. 206 | pub fn try_consume_byte(&mut self, c: u8) -> bool { 207 | match self.curr_byte() { 208 | Ok(b) if b == c => { 209 | self.advance(1); 210 | true 211 | } 212 | _ => false, 213 | } 214 | } 215 | 216 | /// Skips the selected string if the stream starts with it. 217 | /// 218 | /// # Errors 219 | /// 220 | /// - `InvalidString` 221 | pub fn skip_string(&mut self, text: &'static [u8]) -> Result<()> { 222 | if !self.starts_with(text) { 223 | let pos = self.gen_text_pos(); 224 | 225 | // Assume that all input `text` are valid UTF-8 strings, so unwrap is safe. 
226 | let expected = str::from_utf8(text).unwrap(); 227 | 228 | return Err(StreamError::InvalidString(expected, pos)); 229 | } 230 | 231 | self.advance(text.len()); 232 | Ok(()) 233 | } 234 | 235 | /// Consumes bytes by the predicate and returns them. 236 | /// 237 | /// The result can be empty. 238 | #[inline] 239 | pub fn consume_bytes(&mut self, f: F) -> StrSpan<'a> 240 | where 241 | F: Fn(&Stream, u8) -> bool, 242 | { 243 | let start = self.pos; 244 | self.skip_bytes(f); 245 | self.slice_back(start) 246 | } 247 | 248 | /// Skips bytes while the predicate returns `true`. 249 | pub fn skip_bytes(&mut self, f: F) 250 | where 251 | F: Fn(&Stream, u8) -> bool, 252 | { 253 | while !self.at_end() && f(self, self.curr_byte_unchecked()) { 254 | self.advance(1); 255 | } 256 | } 257 | 258 | /// Consumes chars by the predicate and returns them. 259 | /// 260 | /// The result can be empty. 261 | #[inline] 262 | pub fn consume_chars(&mut self, f: F) -> Result> 263 | where 264 | F: Fn(&Stream, char) -> bool, 265 | { 266 | let start = self.pos; 267 | self.skip_chars(f)?; 268 | Ok(self.slice_back(start)) 269 | } 270 | 271 | /// Skips chars while the predicate returns `true`; fails with `NonXmlChar` on a char that is not a valid XML char. 272 | #[inline] 273 | pub fn skip_chars(&mut self, f: F) -> Result<()> 274 | where 275 | F: Fn(&Stream, char) -> bool, 276 | { 277 | for c in self.chars() { 278 | if !c.is_xml_char() { 279 | return Err(StreamError::NonXmlChar(c, self.gen_text_pos())); 280 | } else if f(self, c) { 281 | self.advance(c.len_utf8()); 282 | } else { 283 | break; 284 | } 285 | } 286 | 287 | Ok(()) 288 | } 289 | /// Iterates over the chars from the current position up to the stream end. 290 | #[inline] 291 | pub(crate) fn chars(&self) -> str::Chars<'a> { 292 | self.span.as_str()[self.pos..self.end].chars() 293 | } 294 | 295 | /// Slices data from `pos` to the current position. 296 | #[inline] 297 | pub fn slice_back(&self, pos: usize) -> StrSpan<'a> { 298 | self.span.slice_region(pos, self.pos) 299 | } 300 | 301 | /// Slices data from the current position to the end. 
302 | #[inline] 303 | pub fn slice_tail(&self) -> StrSpan<'a> { 304 | self.span.slice_region(self.pos, self.end) 305 | } 306 | 307 | /// Skips whitespaces. 308 | /// 309 | /// Accepted values: `' ' \n \r \t`. 310 | #[inline] 311 | pub fn skip_spaces(&mut self) { 312 | while !self.at_end() && self.curr_byte_unchecked().is_xml_space() { 313 | self.advance(1); 314 | } 315 | } 316 | 317 | /// Checks if the stream starts with a space. 318 | #[inline] 319 | pub fn starts_with_space(&self) -> bool { 320 | !self.at_end() && self.curr_byte_unchecked().is_xml_space() 321 | } 322 | 323 | /// Consumes whitespaces. 324 | /// 325 | /// Like [`skip_spaces()`], but checks that first char is actually a space. 326 | /// 327 | /// [`skip_spaces()`]: #method.skip_spaces 328 | /// 329 | /// # Errors 330 | /// 331 | /// - `InvalidSpace`, or `UnexpectedEndOfStream` if the stream is already at the end 332 | pub fn consume_spaces(&mut self) -> Result<()> { 333 | if self.at_end() { 334 | return Err(StreamError::UnexpectedEndOfStream); 335 | } 336 | 337 | if !self.starts_with_space() { 338 | return Err(StreamError::InvalidSpace( 339 | self.curr_byte_unchecked(), 340 | self.gen_text_pos(), 341 | )); 342 | } 343 | 344 | self.skip_spaces(); 345 | Ok(()) 346 | } 347 | 348 | /// Consumes an XML reference if there is one. 349 | /// 350 | /// On failure returns `None` and leaves the position unchanged. 351 | pub fn try_consume_reference(&mut self) -> Option> { 352 | let start = self.pos(); 353 | 354 | // Consume reference on a substream. 355 | let mut s = *self; 356 | match s.consume_reference() { 357 | Ok(r) => { 358 | // If the current data is a reference then advance the current stream 359 | // by number of bytes read by substream. 360 | self.advance(s.pos() - start); 361 | Some(r) 362 | } 363 | Err(_) => None, 364 | } 365 | } 366 | 367 | /// Consumes an XML reference. 
368 | /// 369 | /// Consumes according to: 370 | /// 371 | /// # Errors 372 | /// 373 | /// - `InvalidReference` 374 | pub fn consume_reference(&mut self) -> Result> { 375 | self._consume_reference() 376 | .map_err(|_| StreamError::InvalidReference) 377 | } 378 | // Parses `&name;`, `&#NNN;` or `&#xHHH;`; the stream may be left partially advanced on failure. 379 | #[inline(never)] 380 | fn _consume_reference(&mut self) -> Result> { 381 | if !self.try_consume_byte(b'&') { 382 | return Err(StreamError::InvalidReference); 383 | } 384 | 385 | let reference = if self.try_consume_byte(b'#') { 386 | let (value, radix) = if self.try_consume_byte(b'x') { 387 | let value = self.consume_bytes(|_, c| c.is_xml_hex_digit()).as_str(); 388 | (value, 16) 389 | } else { 390 | let value = self.consume_bytes(|_, c| c.is_xml_digit()).as_str(); 391 | (value, 10) 392 | }; 393 | 394 | let n = u32::from_str_radix(value, radix).map_err(|_| StreamError::InvalidReference)?; 395 | // A number that is not a valid code point is replaced by U+FFFD before the XML-char check. 396 | let c = char::from_u32(n).unwrap_or('\u{FFFD}'); 397 | if !c.is_xml_char() { 398 | return Err(StreamError::InvalidReference); 399 | } 400 | 401 | Reference::Char(c) 402 | } else { 403 | let name = self.consume_name()?; 404 | match name.as_str() { 405 | "quot" => Reference::Char('"'), 406 | "amp" => Reference::Char('&'), 407 | "apos" => Reference::Char('\''), 408 | "lt" => Reference::Char('<'), 409 | "gt" => Reference::Char('>'), 410 | _ => Reference::Entity(name.as_str()), 411 | } 412 | }; 413 | 414 | self.consume_byte(b';')?; 415 | 416 | Ok(reference) 417 | } 418 | 419 | /// Consumes an XML name and returns it. 
420 | /// 421 | /// Consumes according to: 422 | /// 423 | /// # Errors 424 | /// 425 | /// - `InvalidName` - if name is empty or starts with an invalid char 426 | /// - `UnexpectedEndOfStream` 427 | pub fn consume_name(&mut self) -> Result> { 428 | let start = self.pos(); 429 | self.skip_name()?; 430 | 431 | let name = self.slice_back(start); 432 | if name.is_empty() { 433 | return Err(StreamError::InvalidName); 434 | } 435 | 436 | Ok(name) 437 | } 438 | 439 | /// Skips an XML name. 
440 | /// 441 | /// The same as `consume_name()`, but does not return a consumed name and does not reject an empty stream. 442 | /// 443 | /// # Errors 444 | /// 445 | /// - `InvalidName` - if name starts with an invalid char 446 | pub fn skip_name(&mut self) -> Result<()> { 447 | let mut iter = self.chars(); 448 | if let Some(c) = iter.next() { 449 | if c.is_xml_name_start() { 450 | self.advance(c.len_utf8()); 451 | } else { 452 | return Err(StreamError::InvalidName); 453 | } 454 | } 455 | 456 | for c in iter { 457 | if c.is_xml_name() { 458 | self.advance(c.len_utf8()); 459 | } else { 460 | break; 461 | } 462 | } 463 | 464 | Ok(()) 465 | } 466 | 467 | /// Consumes a qualified XML name and returns it as a `(prefix, local)` pair; the prefix is empty when there is no `:`. 468 | /// 469 | /// Consumes according to: 470 | /// 471 | /// # Errors 472 | /// 473 | /// - `InvalidName` - if name is empty, starts with an invalid char or contains more than one `:` 474 | #[inline(never)] 475 | pub fn consume_qname(&mut self) -> Result<(StrSpan<'a>, StrSpan<'a>)> { 476 | let start = self.pos(); 477 | 478 | let mut splitter = None; 479 | 480 | while !self.at_end() { 481 | // Check for ASCII first for performance reasons. 482 | let b = self.curr_byte_unchecked(); 483 | if b < 128 { 484 | if b == b':' { 485 | if splitter.is_none() { 486 | splitter = Some(self.pos()); 487 | self.advance(1); 488 | } else { 489 | // Multiple `:` is an error. 490 | return Err(StreamError::InvalidName); 491 | } 492 | } else if b.is_xml_name() { 493 | self.advance(1); 494 | } else { 495 | break; 496 | } 497 | } else { 498 | // Fallback to Unicode code point. 
499 | match self.chars().nth(0) { 500 | Some(c) if c.is_xml_name() => { 501 | self.advance(c.len_utf8()); 502 | } 503 | _ => break, 504 | } 505 | } 506 | } 507 | 508 | let (prefix, local) = if let Some(splitter) = splitter { 509 | let prefix = self.span().slice_region(start, splitter); 510 | let local = self.slice_back(splitter + 1); 511 | (prefix, local) 512 | } else { 513 | let local = self.slice_back(start); 514 | ("".into(), local) 515 | }; 516 | 517 | // Prefix must start with a `NameStartChar` (an empty prefix is not rejected here). 518 | if let Some(c) = prefix.as_str().chars().nth(0) { 519 | if !c.is_xml_name_start() { 520 | return Err(StreamError::InvalidName); 521 | } 522 | } 523 | 524 | // Local name must start with a `NameStartChar`. 525 | if let Some(c) = local.as_str().chars().nth(0) { 526 | if !c.is_xml_name_start() { 527 | return Err(StreamError::InvalidName); 528 | } 529 | } else { 530 | // If empty - error. 531 | return Err(StreamError::InvalidName); 532 | } 533 | 534 | Ok((prefix, local)) 535 | } 536 | 537 | /// Consumes `=`, together with any whitespace around it. 538 | /// 539 | /// Consumes according to: 540 | /// 541 | /// # Errors 542 | /// 543 | /// - `InvalidChar` 544 | /// - `UnexpectedEndOfStream` 545 | pub fn consume_eq(&mut self) -> Result<()> { 546 | self.skip_spaces(); 547 | self.consume_byte(b'=')?; 548 | self.skip_spaces(); 549 | 550 | Ok(()) 551 | } 552 | 553 | /// Consumes a quote. 554 | /// 555 | /// Consumes `'` or `"` and returns it. 556 | /// 557 | /// # Errors 558 | /// 559 | /// - `InvalidQuote` 560 | /// - `UnexpectedEndOfStream` 561 | pub fn consume_quote(&mut self) -> Result { 562 | let c = self.curr_byte()?; 563 | if c == b'\'' || c == b'"' { 564 | self.advance(1); 565 | Ok(c) 566 | } else { 567 | Err(StreamError::InvalidQuote(c, self.gen_text_pos())) 568 | } 569 | } 570 | 571 | /// Calculates a current absolute position. 572 | /// 573 | /// This operation is very expensive. Use only for errors. 
574 | #[inline(never)] 575 | pub fn gen_text_pos(&self) -> TextPos { 576 | let text = self.span.as_str(); 577 | let end = self.pos; 578 | 579 | let row = Self::calc_curr_row(text, end); 580 | let col = Self::calc_curr_col(text, end); 581 | TextPos::new(row, col) 582 | } 583 | 584 | /// Calculates an absolute position at `pos`; `pos` is clamped to the text length. 585 | /// 586 | /// This operation is very expensive. Use only for errors. 587 | /// 588 | /// # Examples 589 | /// 590 | /// ``` 591 | /// let s = xmlparser::Stream::from("text"); 592 | /// 593 | /// assert_eq!(s.gen_text_pos_from(2), xmlparser::TextPos::new(1, 3)); 594 | /// assert_eq!(s.gen_text_pos_from(9999), xmlparser::TextPos::new(1, 5)); 595 | /// ``` 596 | #[inline(never)] 597 | pub fn gen_text_pos_from(&self, pos: usize) -> TextPos { 598 | let mut s = *self; 599 | s.pos = cmp::min(pos, s.span.as_str().len()); 600 | s.gen_text_pos() 601 | } 602 | // 1-based row: one plus the number of `\n` bytes before `end`. 603 | fn calc_curr_row(text: &str, end: usize) -> u32 { 604 | let mut row = 1; 605 | for c in &text.as_bytes()[..end] { 606 | if *c == b'\n' { 607 | row += 1; 608 | } 609 | } 610 | 611 | row 612 | } 613 | // 1-based column, counted in chars back to the previous `\n` (or the text start). 614 | fn calc_curr_col(text: &str, end: usize) -> u32 { 615 | let mut col = 1; 616 | for c in text[..end].chars().rev() { 617 | if c == '\n' { 618 | break; 619 | } else { 620 | col += 1; 621 | } 622 | } 623 | 624 | col 625 | } 626 | } 627 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! [github](https://github.com/RazrFalcon/xmlparser) 2 | //! [crates.io](https://crates.io/crates/xmlparser) 3 | //! [docs.rs](https://docs.rs/xmlparser) 4 | //! 5 | //! *xmlparser* is a low-level, pull-based, zero-allocation 6 | //! [XML 1.0](https://www.w3.org/TR/xml/) parser. 7 | //! 8 | //!
9 | //! 10 | //! ## Example 11 | //! 12 | //! ```rust 13 | //! for token in xmlparser::Tokenizer::from("") { 14 | //! println!("{:?}", token); 15 | //! } 16 | //! ``` 17 | //! 18 | //!
19 | //! 20 | //! ## Why a new library? 21 | //! 22 | //! This library is basically a low-level XML tokenizer that preserves the 23 | //! positions of the tokens and is not intended to be used directly. 24 | //! 25 | //! If you are looking for a higher level solution, check out 26 | //! [roxmltree](https://github.com/RazrFalcon/roxmltree). 27 | //! 28 | //!
29 | //! 30 | //! ## Benefits 31 | //! 32 | //! - All tokens contain `StrSpan` structs which represent the position of the 33 | //! substring in the original document. 34 | //! - Good error processing. All error types contain the position (line:column) 35 | //! where it occurred. 36 | //! - No heap allocations. 37 | //! - No dependencies. 38 | //! - Tiny. ~1400 LOC and ~30KiB in the release build according to 39 | //! `cargo-bloat`. 40 | //! - Supports `no_std` builds. To use without the standard library, disable the 41 | //! default features. 42 | //! 43 | //!
44 | //! 45 | //! ## Limitations 46 | //! 47 | //! - Currently, only ENTITY objects are parsed from the DOCTYPE. All others are 48 | //! ignored. 49 | //! - No tree structure validation. So an XML like 50 | //! `<root><child></root></child>` or a string without root element will be 51 | //! parsed without errors. You should check for this manually. On the other 52 | //! hand `<a/><a/>` will lead to an error. 53 | //! - Duplicated attributes are not an error. So XML like `<item a="v1" a="v2"/>` 54 | //! will be parsed without errors. You should check for this manually. 55 | //! - UTF-8 only. 56 | //! 57 | //!
58 | //! 59 | //! ## Safety 60 | //! 61 | //! - The library must not panic. Any panic is considered a critical bug and 62 | //! should be reported. 63 | //! - The library forbids unsafe code. 64 | //! 65 | //!
66 | //! 67 | //! ## License 68 | //! 69 | //! Licensed under either of 70 | //! 71 | //! - Apache License, Version 2.0 ([LICENSE-APACHE] or 72 | //! http://www.apache.org/licenses/LICENSE-2.0) 73 | //! - MIT license ([LICENSE-MIT] or http://opensource.org/licenses/MIT) 74 | //! 75 | //! at your option. 76 | //! 77 | //!
78 | //! 79 | //! ### Contribution 80 | //! 81 | //! Unless you explicitly state otherwise, any contribution intentionally submitted 82 | //! for inclusion in the work by you, as defined in the Apache-2.0 license, shall be 83 | //! dual licensed as above, without any additional terms or conditions. 84 | //! 85 | //! [LICENSE-APACHE]: https://github.com/RazrFalcon/xmlparser/blob/master/LICENSE-APACHE 86 | //! [LICENSE-MIT]: https://github.com/RazrFalcon/xmlparser/blob/master/LICENSE-MIT 87 | 88 | #![no_std] 89 | #![forbid(unsafe_code)] 90 | #![warn(missing_docs)] 91 | #![allow(ellipsis_inclusive_range_patterns)] 92 | 93 | #[cfg(feature = "std")] 94 | #[macro_use] 95 | extern crate std; 96 | 97 | macro_rules! matches { 98 | ($expression:expr, $($pattern:tt)+) => { 99 | match $expression { 100 | $($pattern)+ => true, 101 | _ => false 102 | } 103 | } 104 | } 105 | 106 | mod error; 107 | mod stream; 108 | mod strspan; 109 | mod xmlchar; 110 | 111 | pub use crate::error::*; 112 | pub use crate::stream::*; 113 | pub use crate::strspan::*; 114 | pub use crate::xmlchar::*; 115 | 116 | /// An XML token. 117 | #[allow(missing_docs)] 118 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] 119 | pub enum Token<'a> { 120 | /// Declaration token. 121 | /// 122 | /// ```text 123 | /// 124 | /// --- - version 125 | /// ----- - encoding? 126 | /// --- - standalone? 127 | /// ------------------------------------------------------- - span 128 | /// ``` 129 | Declaration { 130 | version: StrSpan<'a>, 131 | encoding: Option>, 132 | standalone: Option, 133 | span: StrSpan<'a>, 134 | }, 135 | 136 | /// Processing instruction token. 137 | /// 138 | /// ```text 139 | /// 140 | /// ------ - target 141 | /// ------- - content? 142 | /// ------------------ - span 143 | /// ``` 144 | ProcessingInstruction { 145 | target: StrSpan<'a>, 146 | content: Option>, 147 | span: StrSpan<'a>, 148 | }, 149 | 150 | /// Comment token. 
151 | /// 152 | /// ```text 153 | /// 154 | /// ------ - text 155 | /// ------------- - span 156 | /// ``` 157 | Comment { 158 | text: StrSpan<'a>, 159 | span: StrSpan<'a>, 160 | }, 161 | 162 | /// DOCTYPE start token. 163 | /// 164 | /// ```text 165 | /// , 172 | external_id: Option>, 173 | span: StrSpan<'a>, 174 | }, 175 | 176 | /// Empty DOCTYPE token. 177 | /// 178 | /// ```text 179 | /// 180 | /// -------- - name 181 | /// ------------------ - external_id? 182 | /// -------------------------------------- - span 183 | /// ``` 184 | EmptyDtd { 185 | name: StrSpan<'a>, 186 | external_id: Option>, 187 | span: StrSpan<'a>, 188 | }, 189 | 190 | /// ENTITY token. 191 | /// 192 | /// Can appear only inside the DTD. 193 | /// 194 | /// ```text 195 | /// 196 | /// --------- - name 197 | /// --------------- - definition 198 | /// ------------------------------------- - span 199 | /// ``` 200 | EntityDeclaration { 201 | name: StrSpan<'a>, 202 | definition: EntityDefinition<'a>, 203 | span: StrSpan<'a>, 204 | }, 205 | 206 | /// DOCTYPE end token. 207 | /// 208 | /// ```text 209 | /// 212 | /// -- - span 213 | /// ``` 214 | DtdEnd { span: StrSpan<'a> }, 215 | 216 | /// Element start token. 217 | /// 218 | /// ```text 219 | /// 220 | /// -- - prefix 221 | /// ---- - local 222 | /// -------- - span 223 | /// ``` 224 | ElementStart { 225 | prefix: StrSpan<'a>, 226 | local: StrSpan<'a>, 227 | span: StrSpan<'a>, 228 | }, 229 | 230 | /// Attribute token. 231 | /// 232 | /// ```text 233 | /// 234 | /// -- - prefix 235 | /// ---- - local 236 | /// ----- - value 237 | /// --------------- - span 238 | /// ``` 239 | Attribute { 240 | prefix: StrSpan<'a>, 241 | local: StrSpan<'a>, 242 | value: StrSpan<'a>, 243 | span: StrSpan<'a>, 244 | }, 245 | 246 | /// Element end token. 
247 | /// 248 | /// ```text 249 | /// text 250 | /// - ElementEnd::Open 251 | /// - - span 252 | /// ``` 253 | /// 254 | /// ```text 255 | /// text 256 | /// -- ---- - ElementEnd::Close(prefix, local) 257 | /// ---------- - span 258 | /// ``` 259 | /// 260 | /// ```text 261 | /// 262 | /// - ElementEnd::Empty 263 | /// -- - span 264 | /// ``` 265 | ElementEnd { 266 | end: ElementEnd<'a>, 267 | span: StrSpan<'a>, 268 | }, 269 | 270 | /// Text token. 271 | /// 272 | /// Contains text between elements including whitespaces. 273 | /// Basically everything between `>` and `<`. 274 | /// Except `]]>`, which is not allowed and will lead to an error. 275 | /// 276 | /// ```text 277 | ///

text

278 | /// ------ - text 279 | /// ``` 280 | /// 281 | /// The token span is equal to the `text`. 282 | Text { text: StrSpan<'a> }, 283 | 284 | /// CDATA token. 285 | /// 286 | /// ```text 287 | ///

288 | /// ---- - text 289 | /// ---------------- - span 290 | /// ``` 291 | Cdata { 292 | text: StrSpan<'a>, 293 | span: StrSpan<'a>, 294 | }, 295 | } 296 | 297 | impl<'a> Token<'a> { 298 | /// Returns the [`StrSpan`] encompassing all of the token. 299 | pub fn span(&self) -> StrSpan<'a> { 300 | let span = match self { 301 | Token::Declaration { span, .. } => span, 302 | Token::ProcessingInstruction { span, .. } => span, 303 | Token::Comment { span, .. } => span, 304 | Token::DtdStart { span, .. } => span, 305 | Token::EmptyDtd { span, .. } => span, 306 | Token::EntityDeclaration { span, .. } => span, 307 | Token::DtdEnd { span, .. } => span, 308 | Token::ElementStart { span, .. } => span, 309 | Token::Attribute { span, .. } => span, 310 | Token::ElementEnd { span, .. } => span, 311 | Token::Text { text, .. } => text, 312 | Token::Cdata { span, .. } => span, 313 | }; 314 | *span 315 | } 316 | } 317 | 318 | /// `ElementEnd` token. 319 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] 320 | pub enum ElementEnd<'a> { 321 | /// Indicates `>` 322 | Open, 323 | /// Indicates `` 324 | Close(StrSpan<'a>, StrSpan<'a>), 325 | /// Indicates `/>` 326 | Empty, 327 | } 328 | 329 | /// Representation of the [ExternalID](https://www.w3.org/TR/xml/#NT-ExternalID) value. 330 | #[allow(missing_docs)] 331 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] 332 | pub enum ExternalId<'a> { 333 | System(StrSpan<'a>), 334 | Public(StrSpan<'a>, StrSpan<'a>), 335 | } 336 | 337 | /// Representation of the [EntityDef](https://www.w3.org/TR/xml/#NT-EntityDef) value. 
338 | #[allow(missing_docs)] 339 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] 340 | pub enum EntityDefinition<'a> { 341 | EntityValue(StrSpan<'a>), 342 | ExternalId(ExternalId<'a>), 343 | } 344 | 345 | type Result = core::result::Result; 346 | type StreamResult = core::result::Result; 347 | 348 | #[derive(Clone, Copy, PartialEq, Debug)] 349 | enum State { 350 | Declaration, 351 | AfterDeclaration, 352 | Dtd, 353 | AfterDtd, 354 | Elements, 355 | Attributes, 356 | AfterElements, 357 | End, 358 | } 359 | 360 | /// Tokenizer for the XML structure. 361 | #[derive(Clone)] 362 | pub struct Tokenizer<'a> { 363 | stream: Stream<'a>, 364 | state: State, 365 | depth: usize, 366 | fragment_parsing: bool, 367 | } 368 | 369 | impl core::fmt::Debug for Tokenizer<'_> { 370 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 371 | write!(f, "Tokenizer {{ ... }}") 372 | } 373 | } 374 | 375 | impl<'a> From<&'a str> for Tokenizer<'a> { 376 | #[inline] 377 | fn from(text: &'a str) -> Self { 378 | let mut stream = Stream::from(text); 379 | 380 | // Skip UTF-8 BOM. 381 | if stream.starts_with(&[0xEF, 0xBB, 0xBF]) { 382 | stream.advance(3); 383 | } 384 | 385 | Tokenizer { 386 | stream, 387 | state: State::Declaration, 388 | depth: 0, 389 | fragment_parsing: false, 390 | } 391 | } 392 | } 393 | 394 | macro_rules! map_err_at { 395 | ($fun:expr, $stream:expr, $err:ident) => {{ 396 | let start = $stream.pos(); 397 | $fun.map_err(|e| Error::$err(e, $stream.gen_text_pos_from(start))) 398 | }}; 399 | } 400 | 401 | impl<'a> Tokenizer<'a> { 402 | /// Enables document fragment parsing. 403 | /// 404 | /// By default, `xmlparser` will check for DTD, root element, etc. 405 | /// But if we have to parse an XML fragment, it will lead to an error. 406 | /// This method switches the parser to the root element content parsing mode, 407 | /// so it will treat any data as a content of the root element. 
408 | pub fn from_fragment(full_text: &'a str, fragment: core::ops::Range) -> Self { 409 | Tokenizer { 410 | stream: Stream::from_substr(full_text, fragment), 411 | state: State::Elements, 412 | depth: 0, 413 | fragment_parsing: true, 414 | } 415 | } 416 | 417 | fn parse_next_impl(&mut self) -> Option>> { 418 | let s = &mut self.stream; 419 | 420 | if s.at_end() { 421 | return None; 422 | } 423 | 424 | let start = s.pos(); 425 | 426 | match self.state { 427 | State::Declaration => { 428 | self.state = State::AfterDeclaration; 429 | if s.starts_with(b" { 436 | if s.starts_with(b" self.state = State::Dtd, 440 | Ok(Token::EmptyDtd { .. }) => self.state = State::AfterDtd, 441 | _ => {} 442 | } 443 | 444 | Some(t) 445 | } else if s.starts_with(b"' 728 | fn parse_comment_impl(s: &mut Stream<'a>) -> StreamResult> { 729 | let start = s.pos(); 730 | s.advance(4); 731 | let text = s.consume_chars(|s, c| !(c == '-' && s.starts_with(b"-->")))?; 732 | s.skip_string(b"-->")?; 733 | 734 | if text.as_str().contains("--") { 735 | return Err(StreamError::InvalidCommentData); 736 | } 737 | 738 | if text.as_str().ends_with('-') { 739 | return Err(StreamError::InvalidCommentEnd); 740 | } 741 | 742 | let span = s.slice_back(start); 743 | 744 | Ok(Token::Comment { text, span }) 745 | } 746 | 747 | fn parse_pi(s: &mut Stream<'a>) -> Result> { 748 | map_err_at!(Self::parse_pi_impl(s), s, InvalidPI) 749 | } 750 | 751 | // PI ::= '' Char*)))? '?>' 752 | // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) 753 | fn parse_pi_impl(s: &mut Stream<'a>) -> StreamResult> { 754 | let start = s.pos(); 755 | s.advance(2); 756 | let target = s.consume_name()?; 757 | s.skip_spaces(); 758 | let content = s.consume_chars(|s, c| !(c == '?' 
&& s.starts_with(b"?>")))?; 759 | let content = if !content.is_empty() { 760 | Some(content) 761 | } else { 762 | None 763 | }; 764 | 765 | s.skip_string(b"?>")?; 766 | 767 | let span = s.slice_back(start); 768 | 769 | Ok(Token::ProcessingInstruction { 770 | target, 771 | content, 772 | span, 773 | }) 774 | } 775 | 776 | fn parse_doctype(s: &mut Stream<'a>) -> Result> { 777 | map_err_at!(Self::parse_doctype_impl(s), s, InvalidDoctype) 778 | } 779 | 780 | // doctypedecl ::= '' 781 | fn parse_doctype_impl(s: &mut Stream<'a>) -> StreamResult> { 782 | let start = s.pos(); 783 | s.advance(9); 784 | 785 | s.consume_spaces()?; 786 | let name = s.consume_name()?; 787 | s.skip_spaces(); 788 | 789 | let external_id = Self::parse_external_id(s)?; 790 | s.skip_spaces(); 791 | 792 | let c = s.curr_byte()?; 793 | if c != b'[' && c != b'>' { 794 | static EXPECTED: &[u8] = b"[>"; 795 | return Err(StreamError::InvalidCharMultiple( 796 | c, 797 | EXPECTED, 798 | s.gen_text_pos(), 799 | )); 800 | } 801 | 802 | s.advance(1); 803 | 804 | let span = s.slice_back(start); 805 | if c == b'[' { 806 | Ok(Token::DtdStart { 807 | name, 808 | external_id, 809 | span, 810 | }) 811 | } else { 812 | Ok(Token::EmptyDtd { 813 | name, 814 | external_id, 815 | span, 816 | }) 817 | } 818 | } 819 | 820 | // ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral 821 | fn parse_external_id(s: &mut Stream<'a>) -> StreamResult>> { 822 | let v = if s.starts_with(b"SYSTEM") || s.starts_with(b"PUBLIC") { 823 | let start = s.pos(); 824 | s.advance(6); 825 | let id = s.slice_back(start); 826 | 827 | s.consume_spaces()?; 828 | let quote = s.consume_quote()?; 829 | let literal1 = s.consume_bytes(|_, c| c != quote); 830 | s.consume_byte(quote)?; 831 | 832 | let v = if id.as_str() == "SYSTEM" { 833 | ExternalId::System(literal1) 834 | } else { 835 | s.consume_spaces()?; 836 | let quote = s.consume_quote()?; 837 | let literal2 = s.consume_bytes(|_, c| c != quote); 838 | 
s.consume_byte(quote)?; 839 | 840 | ExternalId::Public(literal1, literal2) 841 | }; 842 | 843 | Some(v) 844 | } else { 845 | None 846 | }; 847 | 848 | Ok(v) 849 | } 850 | 851 | fn parse_entity_decl(s: &mut Stream<'a>) -> Result> { 852 | map_err_at!(Self::parse_entity_decl_impl(s), s, InvalidEntity) 853 | } 854 | 855 | // EntityDecl ::= GEDecl | PEDecl 856 | // GEDecl ::= '' 857 | // PEDecl ::= '' 858 | fn parse_entity_decl_impl(s: &mut Stream<'a>) -> StreamResult> { 859 | let start = s.pos(); 860 | s.advance(8); 861 | 862 | s.consume_spaces()?; 863 | 864 | let is_ge = if s.try_consume_byte(b'%') { 865 | s.consume_spaces()?; 866 | false 867 | } else { 868 | true 869 | }; 870 | 871 | let name = s.consume_name()?; 872 | s.consume_spaces()?; 873 | let definition = Self::parse_entity_def(s, is_ge)?; 874 | s.skip_spaces(); 875 | s.consume_byte(b'>')?; 876 | 877 | let span = s.slice_back(start); 878 | 879 | Ok(Token::EntityDeclaration { 880 | name, 881 | definition, 882 | span, 883 | }) 884 | } 885 | 886 | // EntityDef ::= EntityValue | (ExternalID NDataDecl?) 887 | // PEDef ::= EntityValue | ExternalID 888 | // EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] 889 | // | PEReference | Reference)* "'" 890 | // ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral 891 | // NDataDecl ::= S 'NDATA' S Name 892 | fn parse_entity_def(s: &mut Stream<'a>, is_ge: bool) -> StreamResult> { 893 | let c = s.curr_byte()?; 894 | match c { 895 | b'"' | b'\'' => { 896 | let quote = s.consume_quote()?; 897 | let value = s.consume_bytes(|_, c| c != quote); 898 | s.consume_byte(quote)?; 899 | 900 | Ok(EntityDefinition::EntityValue(value)) 901 | } 902 | b'S' | b'P' => { 903 | if let Some(id) = Self::parse_external_id(s)? 
{ 904 | if is_ge { 905 | s.skip_spaces(); 906 | if s.starts_with(b"NDATA") { 907 | s.advance(5); 908 | s.consume_spaces()?; 909 | s.skip_name()?; 910 | // TODO: NDataDecl is not supported 911 | } 912 | } 913 | 914 | Ok(EntityDefinition::ExternalId(id)) 915 | } else { 916 | Err(StreamError::InvalidExternalID) 917 | } 918 | } 919 | _ => { 920 | static EXPECTED: &[u8] = b"\"'SP"; 921 | let pos = s.gen_text_pos(); 922 | Err(StreamError::InvalidCharMultiple(c, EXPECTED, pos)) 923 | } 924 | } 925 | } 926 | 927 | fn consume_decl(s: &mut Stream) -> StreamResult<()> { 928 | s.skip_bytes(|_, c| c != b'>'); 929 | s.consume_byte(b'>')?; 930 | Ok(()) 931 | } 932 | 933 | fn parse_cdata(s: &mut Stream<'a>) -> Result> { 934 | map_err_at!(Self::parse_cdata_impl(s), s, InvalidCdata) 935 | } 936 | 937 | // CDSect ::= CDStart CData CDEnd 938 | // CDStart ::= '' Char*)) 940 | // CDEnd ::= ']]>' 941 | fn parse_cdata_impl(s: &mut Stream<'a>) -> StreamResult> { 942 | let start = s.pos(); 943 | s.advance(9); 944 | let text = s.consume_chars(|s, c| !(c == ']' && s.starts_with(b"]]>")))?; 945 | s.skip_string(b"]]>")?; 946 | let span = s.slice_back(start); 947 | Ok(Token::Cdata { text, span }) 948 | } 949 | 950 | fn parse_element_start(s: &mut Stream<'a>) -> Result> { 951 | map_err_at!(Self::parse_element_start_impl(s), s, InvalidElement) 952 | } 953 | 954 | // '<' Name (S Attribute)* S? 
'>' 955 | fn parse_element_start_impl(s: &mut Stream<'a>) -> StreamResult> { 956 | let start = s.pos(); 957 | s.advance(1); 958 | let (prefix, local) = s.consume_qname()?; 959 | let span = s.slice_back(start); 960 | 961 | Ok(Token::ElementStart { 962 | prefix, 963 | local, 964 | span, 965 | }) 966 | } 967 | 968 | fn parse_close_element(s: &mut Stream<'a>) -> Result> { 969 | map_err_at!(Self::parse_close_element_impl(s), s, InvalidElement) 970 | } 971 | 972 | // '' 973 | fn parse_close_element_impl(s: &mut Stream<'a>) -> StreamResult> { 974 | let start = s.pos(); 975 | s.advance(2); 976 | 977 | let (prefix, tag_name) = s.consume_qname()?; 978 | s.skip_spaces(); 979 | s.consume_byte(b'>')?; 980 | 981 | let span = s.slice_back(start); 982 | 983 | Ok(Token::ElementEnd { 984 | end: ElementEnd::Close(prefix, tag_name), 985 | span, 986 | }) 987 | } 988 | 989 | // Name Eq AttValue 990 | fn parse_attribute(s: &mut Stream<'a>) -> StreamResult> { 991 | let attr_start = s.pos(); 992 | let has_space = s.starts_with_space(); 993 | s.skip_spaces(); 994 | 995 | if let Ok(c) = s.curr_byte() { 996 | let start = s.pos(); 997 | 998 | match c { 999 | b'/' => { 1000 | s.advance(1); 1001 | s.consume_byte(b'>')?; 1002 | let span = s.slice_back(start); 1003 | return Ok(Token::ElementEnd { 1004 | end: ElementEnd::Empty, 1005 | span, 1006 | }); 1007 | } 1008 | b'>' => { 1009 | s.advance(1); 1010 | let span = s.slice_back(start); 1011 | return Ok(Token::ElementEnd { 1012 | end: ElementEnd::Open, 1013 | span, 1014 | }); 1015 | } 1016 | _ => {} 1017 | } 1018 | } 1019 | 1020 | if !has_space { 1021 | if !s.at_end() { 1022 | return Err(StreamError::InvalidSpace( 1023 | s.curr_byte_unchecked(), 1024 | s.gen_text_pos_from(attr_start), 1025 | )); 1026 | } else { 1027 | return Err(StreamError::UnexpectedEndOfStream); 1028 | } 1029 | } 1030 | 1031 | let start = s.pos(); 1032 | 1033 | let (prefix, local) = s.consume_qname()?; 1034 | s.consume_eq()?; 1035 | let quote = s.consume_quote()?; 1036 | let 
quote_c = quote as char; 1037 | // The attribute value must not contain the < character. 1038 | let value = s.consume_chars(|_, c| c != quote_c && c != '<')?; 1039 | s.consume_byte(quote)?; 1040 | let span = s.slice_back(start); 1041 | 1042 | Ok(Token::Attribute { 1043 | prefix, 1044 | local, 1045 | value, 1046 | span, 1047 | }) 1048 | } 1049 | 1050 | fn parse_text(s: &mut Stream<'a>) -> Result> { 1051 | map_err_at!(Self::parse_text_impl(s), s, InvalidCharData) 1052 | } 1053 | 1054 | fn parse_text_impl(s: &mut Stream<'a>) -> StreamResult> { 1055 | let text = s.consume_chars(|_, c| c != '<')?; 1056 | 1057 | // According to the spec, `]]>` must not appear inside a Text node. 1058 | // https://www.w3.org/TR/xml/#syntax 1059 | // 1060 | // Search for `>` first, since it's a bit faster than looking for `]]>`. 1061 | if text.as_str().contains('>') && text.as_str().contains("]]>") { 1062 | return Err(StreamError::InvalidCharacterData); 1063 | } 1064 | 1065 | Ok(Token::Text { text }) 1066 | } 1067 | 1068 | /// Returns a copy of the tokenizer's stream. 1069 | pub fn stream(&self) -> Stream<'a> { 1070 | self.stream 1071 | } 1072 | } 1073 | 1074 | impl<'a> Iterator for Tokenizer<'a> { 1075 | type Item = Result>; 1076 | 1077 | #[inline] 1078 | fn next(&mut self) -> Option { 1079 | let mut t = None; 1080 | while !self.stream.at_end() && self.state != State::End && t.is_none() { 1081 | t = self.parse_next_impl(); 1082 | } 1083 | 1084 | if let Some(Err(_)) = t { 1085 | self.stream.jump_to_end(); 1086 | self.state = State::End; 1087 | } 1088 | 1089 | t 1090 | } 1091 | } 1092 | --------------------------------------------------------------------------------