├── afl-fuzz
├── .gitignore
├── in
│ ├── 1.xml
│ ├── 2.xml
│ ├── 3.xml
│ ├── 4.xml
│ ├── 5.xml
│ └── 6.xml
├── Cargo.toml
├── README.md
└── src
│ └── main.rs
├── .gitignore
├── fuzz
├── .gitignore
├── README.md
├── fuzz_targets
│ └── fuzz_xml.rs
└── Cargo.toml
├── tests
└── integration
│ ├── main.rs
│ ├── api.rs
│ ├── text.rs
│ ├── comments.rs
│ ├── cdata.rs
│ ├── document.rs
│ ├── token.rs
│ ├── pi.rs
│ ├── doctype.rs
│ └── elements.rs
├── Cargo.toml
├── examples
└── parse.rs
├── README.tpl
├── .github
└── workflows
│ └── ci.yml
├── LICENSE-MIT
├── src
├── strspan.rs
├── xmlchar.rs
├── error.rs
├── stream.rs
└── lib.rs
├── README.md
├── CHANGELOG.md
└── LICENSE-APACHE
/afl-fuzz/.gitignore:
--------------------------------------------------------------------------------
1 | /out
2 |
--------------------------------------------------------------------------------
/afl-fuzz/in/1.xml:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/afl-fuzz/in/2.xml:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/afl-fuzz/in/3.xml:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | Cargo.lock
3 | .idea
4 |
--------------------------------------------------------------------------------
/fuzz/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | corpus
3 | artifacts
4 |
--------------------------------------------------------------------------------
/afl-fuzz/in/4.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/afl-fuzz/in/5.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/afl-fuzz/in/6.xml:
--------------------------------------------------------------------------------
1 |
3 | ]>
4 |
5 |
--------------------------------------------------------------------------------
/fuzz/README.md:
--------------------------------------------------------------------------------
1 | ## Prepare
2 |
3 | ```
4 | cargo install cargo-fuzz
5 | ```
6 |
7 | ## Run
8 |
9 | ```
10 | cd ..
11 | cargo +nightly fuzz run fuzz_xml
12 | ```
13 |
--------------------------------------------------------------------------------
/afl-fuzz/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "afl-fuzz"
3 | version = "0.1.0"
4 | authors = ["RazrFalcon "]
5 |
6 | [dependencies]
7 | afl = "0.5"
8 | xmlparser = { path = ".." }
9 |
--------------------------------------------------------------------------------
/afl-fuzz/README.md:
--------------------------------------------------------------------------------
1 | ## Prepare
2 |
3 | ```
4 | cargo install afl
5 | ```
6 |
7 | ## Run
8 |
9 | ```
10 | cargo afl build
11 | cargo afl fuzz -i in -o out target/debug/afl-fuzz
12 | ```
13 |
--------------------------------------------------------------------------------
/tests/integration/main.rs:
--------------------------------------------------------------------------------
1 | extern crate xmlparser as xml;
2 |
3 | #[macro_use]
4 | mod token;
5 |
6 | mod api;
7 | mod cdata;
8 | mod comments;
9 | mod doctype;
10 | mod document;
11 | mod elements;
12 | mod pi;
13 | mod text;
14 |
--------------------------------------------------------------------------------
/afl-fuzz/src/main.rs:
--------------------------------------------------------------------------------
1 | extern crate afl;
2 | extern crate xmlparser;
3 |
4 | use std::str;
5 |
6 | use afl::fuzz;
7 |
8 | fn main() {
9 | fuzz!(|data: &[u8]| {
10 | if let Ok(text) = str::from_utf8(data) {
11 | for _ in xmlparser::Tokenizer::from(text) {}
12 | }
13 | });
14 | }
15 |
--------------------------------------------------------------------------------
/fuzz/fuzz_targets/fuzz_xml.rs:
--------------------------------------------------------------------------------
1 | #![no_main]
2 |
3 | #[macro_use] extern crate libfuzzer_sys;
4 | extern crate xmlparser;
5 |
6 | use std::str;
7 |
8 | fuzz_target!(|data: &[u8]| {
9 | if let Ok(text) = str::from_utf8(data) {
10 | let mut n = 0;
11 | for _ in xmlparser::Tokenizer::from(text) {
12 | n += 1;
13 |
14 | if n == 1000 {
15 | panic!("endless loop");
16 | }
17 | }
18 | }
19 | });
20 |
--------------------------------------------------------------------------------
/fuzz/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "xmlparser-fuzz"
3 | version = "0.0.1"
4 | authors = ["Automatically generated"]
5 | publish = false
6 |
7 | [package.metadata]
8 | cargo-fuzz = true
9 |
10 | [dependencies.xmlparser]
11 | path = ".."
12 |
13 | [dependencies.libfuzzer-sys]
14 | git = "https://github.com/rust-fuzz/libfuzzer-sys.git"
15 |
16 | # Prevent this from interfering with workspaces
17 | [workspace]
18 | members = ["."]
19 |
20 | [[bin]]
21 | name = "fuzz_xml"
22 | path = "fuzz_targets/fuzz_xml.rs"
23 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "xmlparser"
3 | version = "0.13.6"
4 | authors = ["Yevhenii Reizner "]
5 | edition = "2018"
6 | description = "Pull-based, zero-allocation XML parser."
7 | documentation = "https://docs.rs/xmlparser"
8 | readme = "README.md"
9 | homepage = "https://github.com/RazrFalcon/xmlparser"
10 | repository = "https://github.com/RazrFalcon/xmlparser"
11 | license = "MIT OR Apache-2.0"
12 | keywords = ["parser", "tokenizer", "xml"]
13 | categories = ["parser-implementations"]
14 |
15 | [features]
16 | default = ["std"]
17 | std = []
18 |
--------------------------------------------------------------------------------
/examples/parse.rs:
--------------------------------------------------------------------------------
1 | extern crate xmlparser as xml;
2 |
3 | use std::env;
4 | use std::fs;
5 | use std::io::Read;
6 |
7 | fn main() {
8 | let args = env::args().collect::>();
9 | if args.len() != 2 {
10 | println!("Usage: parse file.xml");
11 | return;
12 | }
13 |
14 | let text = load_file(&args[1]);
15 |
16 | if let Err(e) = parse(&text) {
17 | println!("Error: {}.", e);
18 | }
19 | }
20 |
21 | fn parse(text: &str) -> Result<(), xml::Error> {
22 | for token in xml::Tokenizer::from(text) {
23 | println!("{:?}", token?);
24 | }
25 |
26 | Ok(())
27 | }
28 |
/// Reads the whole file at `path` into a `String`.
///
/// Panics if the file cannot be opened or is not valid UTF-8.
fn load_file(path: &str) -> String {
    let mut contents = String::new();
    fs::File::open(path)
        .unwrap()
        .read_to_string(&mut contents)
        .unwrap();
    contents
}
35 |
--------------------------------------------------------------------------------
/tests/integration/api.rs:
--------------------------------------------------------------------------------
1 | extern crate xmlparser;
2 |
3 | use xmlparser::*;
4 |
/// After advancing 2 bytes into `"text"` the position is `TextPos::new(1, 3)`.
#[test]
fn text_pos_1() {
    let mut stream = Stream::from("text");
    stream.advance(2);
    let expected = TextPos::new(1, 3);
    assert_eq!(stream.gen_text_pos(), expected);
}
11 |
/// After advancing 6 bytes (past the newline) the position is `TextPos::new(2, 2)`.
#[test]
fn text_pos_2() {
    let mut stream = Stream::from("text\ntext");
    stream.advance(6);
    let expected = TextPos::new(2, 2);
    assert_eq!(stream.gen_text_pos(), expected);
}
18 |
/// Same check with two-byte Cyrillic characters: 15 bytes in, the position
/// is `TextPos::new(2, 3)`.
#[test]
fn text_pos_3() {
    let mut stream = Stream::from("текст\nтекст");
    stream.advance(15);
    let expected = TextPos::new(2, 3);
    assert_eq!(stream.gen_text_pos(), expected);
}
25 |
/// Guards against `Token` growing beyond 196 bytes.
#[test]
fn token_size() {
    assert!(::std::mem::size_of::<Token>() <= 196);
}
30 |
/// Guards against `StrSpan` growing beyond 48 bytes.
#[test]
fn span_size() {
    assert!(::std::mem::size_of::<StrSpan>() <= 48);
}
35 |
/// Guards against `Error` growing beyond 64 bytes.
#[test]
fn err_size_1() {
    assert!(::std::mem::size_of::<Error>() <= 64);
}
40 |
/// Guards against `StreamError` growing beyond 64 bytes.
#[test]
fn err_size_2() {
    assert!(::std::mem::size_of::<StreamError>() <= 64);
}
45 |
--------------------------------------------------------------------------------
/README.tpl:
--------------------------------------------------------------------------------
1 | ## {{crate}}
2 | [Build Status](https://travis-ci.org/RazrFalcon/{{crate}})
3 | [Crates.io](https://crates.io/crates/{{crate}})
4 | [Documentation](https://docs.rs/{{crate}})
5 | [Rust](https://www.rust-lang.org)
6 |
7 | {{readme}}
8 |
9 | ### License
10 |
11 | Licensed under either of
12 |
13 | - Apache License, Version 2.0
14 | ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
15 | - MIT license
16 | ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
17 |
18 | at your option.
19 |
20 | ### Contribution
21 |
22 | Unless you explicitly state otherwise, any contribution intentionally submitted
23 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall be
24 | dual licensed as above, without any additional terms or conditions.
25 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | pull_request: {}
5 | push:
6 | branches:
7 | - master
8 | schedule:
9 | - cron: '43 20 * * 3'
10 |
11 | concurrency:
12 | group: ${{ github.workflow }}-${{ github.ref }}
13 | cancel-in-progress: true
14 |
15 | jobs:
16 | msrv:
17 | runs-on: ubuntu-latest
18 | steps:
19 | - uses: actions/checkout@v4
20 | - uses: dtolnay/rust-toolchain@1.31
21 | - run: cargo build --lib
22 |
23 | test:
24 | runs-on: ubuntu-latest
25 | steps:
26 | - uses: actions/checkout@v4
27 | - uses: dtolnay/rust-toolchain@stable
28 | - run: cargo test --all-targets
29 | - run: cargo test --doc
30 |
31 | clippy:
32 | runs-on: ubuntu-latest
33 | steps:
34 | - uses: actions/checkout@v4
35 | - uses: dtolnay/rust-toolchain@stable
36 | with:
37 | components: clippy
38 | - run: cargo clippy --all-features --all-targets -- -D warnings
39 |
40 | rustfmt:
41 | runs-on: ubuntu-latest
42 | steps:
43 | - uses: actions/checkout@v4
44 | - uses: dtolnay/rust-toolchain@stable
45 | with:
46 | components: rustfmt
47 | - run: cargo fmt --check --all
48 |
--------------------------------------------------------------------------------
/LICENSE-MIT:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2018 Reizner Evgeniy
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tests/integration/text.rs:
--------------------------------------------------------------------------------
1 | use crate::token::*;
2 |
3 | test!(
4 | text_01,
5 | "text
",
6 | Token::ElementStart("", "p", 0..2),
7 | Token::ElementEnd(ElementEnd::Open, 2..3),
8 | Token::Text("text", 3..7),
9 | Token::ElementEnd(ElementEnd::Close("", "p"), 7..11)
10 | );
11 |
12 | test!(
13 | text_02,
14 | " text
",
15 | Token::ElementStart("", "p", 0..2),
16 | Token::ElementEnd(ElementEnd::Open, 2..3),
17 | Token::Text(" text ", 3..9),
18 | Token::ElementEnd(ElementEnd::Close("", "p"), 9..13)
19 | );
20 |
21 | // 欄 is EF A4 9D. And EF can be mistreated for UTF-8 BOM.
22 | test!(
23 | text_03,
24 | "欄
",
25 | Token::ElementStart("", "p", 0..2),
26 | Token::ElementEnd(ElementEnd::Open, 2..3),
27 | Token::Text("欄", 3..6),
28 | Token::ElementEnd(ElementEnd::Close("", "p"), 6..10)
29 | );
30 |
31 | test!(
32 | text_04,
33 | "
",
34 | Token::ElementStart("", "p", 0..2),
35 | Token::ElementEnd(ElementEnd::Open, 2..3),
36 | Token::Text(" ", 3..4),
37 | Token::ElementEnd(ElementEnd::Close("", "p"), 4..8)
38 | );
39 |
40 | test!(
41 | text_05,
42 | " \r\n\t
",
43 | Token::ElementStart("", "p", 0..2),
44 | Token::ElementEnd(ElementEnd::Open, 2..3),
45 | Token::Text(" \r\n\t ", 3..8),
46 | Token::ElementEnd(ElementEnd::Close("", "p"), 8..12)
47 | );
48 |
49 | test!(
50 | text_06,
51 | "
",
52 | Token::ElementStart("", "p", 0..2),
53 | Token::ElementEnd(ElementEnd::Open, 2..3),
54 | Token::Text(" ", 3..9),
55 | Token::ElementEnd(ElementEnd::Close("", "p"), 9..13)
56 | );
57 |
58 | test!(
59 | text_07,
60 | "]>
",
61 | Token::ElementStart("", "p", 0..2),
62 | Token::ElementEnd(ElementEnd::Open, 2..3),
63 | Token::Text("]>", 3..5),
64 | Token::ElementEnd(ElementEnd::Close("", "p"), 5..9)
65 | );
66 |
67 | test!(
68 | text_err_01,
69 | "]]>
",
70 | Token::ElementStart("", "p", 0..2),
71 | Token::ElementEnd(ElementEnd::Open, 2..3),
72 | Token::Error(
73 | "invalid character data at 1:4 cause ']]>' is not allowed inside a character data"
74 | .to_string()
75 | )
76 | );
77 |
78 | test!(
79 | text_err_02,
80 | "\u{0c}
",
81 | Token::ElementStart("", "p", 0..2),
82 | Token::ElementEnd(ElementEnd::Open, 2..3),
83 | Token::Error(
84 | "invalid character data at 1:4 cause a non-XML character '\\u{c}' found at 1:4".to_string()
85 | )
86 | );
87 |
--------------------------------------------------------------------------------
/tests/integration/comments.rs:
--------------------------------------------------------------------------------
1 | use crate::token::*;
2 |
3 | test!(
4 | comment_01,
5 | "",
6 | Token::Comment("comment", 0..14)
7 | );
8 | test!(comment_02, "", Token::Comment("", 0..13));
9 | test!(comment_03, "", Token::Comment("", Token::Comment("", Token::Comment("<", Token::Comment("<", Token::Comment("-->", Token::Comment("<>", 0..9));
15 | test!(comment_09, "", Token::Comment("<", 0..8));
16 | test!(comment_10, "", Token::Comment("", Token::Comment("", 0..7));
18 |
/// Generates a `#[test]` named `$name` asserting that the first token
/// produced for `$text` is an error.
macro_rules! test_err {
    ($name:ident, $text:expr) => {
        #[test]
        fn $name() {
            let first = xml::Tokenizer::from($text).next();
            assert!(first.unwrap().is_err());
        }
    };
}
28 |
29 | test_err!(comment_err_01, "");
30 | test_err!(comment_err_02, "");
33 | test_err!(comment_err_05, "");
35 | test_err!(comment_err_07, "");
43 | test_err!(comment_err_15, "");
48 | test_err!(comment_err_20, "");
55 | test_err!(comment_err_27, "");
56 | test_err!(comment_err_28, "");
57 | test_err!(comment_err_29, "");
61 | test_err!(comment_err_33, "");
62 | test_err!(comment_err_34, "");
63 | test_err!(comment_err_35, "");
64 |
--------------------------------------------------------------------------------
/tests/integration/cdata.rs:
--------------------------------------------------------------------------------
1 | extern crate xmlparser as xml;
2 |
3 | use crate::token::*;
4 |
5 | test!(
6 | cdata_01,
7 | "",
8 | Token::ElementStart("", "p", 0..2),
9 | Token::ElementEnd(ElementEnd::Open, 2..3),
10 | Token::Cdata("content", 3..22),
11 | Token::ElementEnd(ElementEnd::Close("", "p"), 22..26)
12 | );
13 |
14 | test!(
15 | cdata_02,
16 | "",
17 | Token::ElementStart("", "p", 0..2),
18 | Token::ElementEnd(ElementEnd::Open, 2..3),
19 | Token::Cdata("&ing", 3..22),
20 | Token::ElementEnd(ElementEnd::Close("", "p"), 22..26)
21 | );
22 |
23 | test!(
24 | cdata_03,
25 | "",
26 | Token::ElementStart("", "p", 0..2),
27 | Token::ElementEnd(ElementEnd::Open, 2..3),
28 | Token::Cdata("&ing ]", 3..24),
29 | Token::ElementEnd(ElementEnd::Close("", "p"), 24..28)
30 | );
31 |
32 | test!(
33 | cdata_04,
34 | "",
35 | Token::ElementStart("", "p", 0..2),
36 | Token::ElementEnd(ElementEnd::Open, 2..3),
37 | Token::Cdata("&ing]] ", 3..25),
38 | Token::ElementEnd(ElementEnd::Close("", "p"), 25..29)
39 | );
40 |
41 | test!(
42 | cdata_05,
43 | "text]]>
",
44 | Token::ElementStart("", "p", 0..2),
45 | Token::ElementEnd(ElementEnd::Open, 2..3),
46 | Token::Cdata("text", 3..38),
47 | Token::ElementEnd(ElementEnd::Close("", "p"), 38..42)
48 | );
49 |
50 | test!(
51 | cdata_06,
52 | "]]>
",
53 | Token::ElementStart("", "p", 0..2),
54 | Token::ElementEnd(ElementEnd::Open, 2..3),
55 | Token::Cdata("", 3..66),
56 | Token::ElementEnd(ElementEnd::Close("", "p"), 66..70)
57 | );
58 |
59 | test!(
60 | cdata_07,
61 | "",
62 | Token::ElementStart("", "p", 0..2),
63 | Token::ElementEnd(ElementEnd::Open, 2..3),
64 | Token::Cdata("1", 3..16),
65 | Token::Cdata("2", 16..29),
66 | Token::ElementEnd(ElementEnd::Close("", "p"), 29..33)
67 | );
68 |
69 | test!(
70 | cdata_08,
71 | " \n \t
",
72 | Token::ElementStart("", "p", 0..2),
73 | Token::ElementEnd(ElementEnd::Open, 2..3),
74 | Token::Text(" \n ", 3..6),
75 | Token::Cdata("data", 6..22),
76 | Token::Text(" \t ", 22..25),
77 | Token::ElementEnd(ElementEnd::Close("", "p"), 25..29)
78 | );
79 |
80 | test!(
81 | cdata_09,
82 | "",
83 | Token::ElementStart("", "p", 0..2),
84 | Token::ElementEnd(ElementEnd::Open, 2..3),
85 | Token::Cdata("bracket ]after", 3..29),
86 | Token::ElementEnd(ElementEnd::Close("", "p"), 29..33)
87 | );
88 |
89 | test!(
90 | cdata_err_01,
91 | "",
92 | Token::ElementStart("", "p", 0..2),
93 | Token::ElementEnd(ElementEnd::Open, 2..3),
94 | Token::Error(
95 | "invalid CDATA at 1:4 cause a non-XML character '\\u{1}' found at 1:13".to_string()
96 | )
97 | );
98 |
--------------------------------------------------------------------------------
/src/strspan.rs:
--------------------------------------------------------------------------------
1 | use core::fmt;
2 | use core::ops::{Deref, Range};
3 |
/// A string slice.
///
/// Like `&str`, but also contains the position in the input XML
/// from which it was parsed.
#[must_use]
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
pub struct StrSpan<'a> {
    /// The borrowed text covered by this span.
    text: &'a str,
    /// Byte offset at which `text` starts.
    start: usize,
}

impl<'a> From<&'a str> for StrSpan<'a> {
    /// Wraps a whole string; the resulting span starts at offset 0.
    #[inline]
    fn from(text: &'a str) -> Self {
        StrSpan { text, start: 0 }
    }
}

// Comparisons against plain strings look at the text only;
// the start offset is ignored.

impl PartialEq<str> for StrSpan<'_> {
    fn eq(&self, other: &str) -> bool {
        self.text == other
    }
}

impl PartialEq<&str> for StrSpan<'_> {
    fn eq(&self, other: &&str) -> bool {
        self.text == *other
    }
}

impl PartialEq<StrSpan<'_>> for str {
    fn eq(&self, other: &StrSpan<'_>) -> bool {
        self == other.text
    }
}

impl PartialEq<StrSpan<'_>> for &str {
    fn eq(&self, other: &StrSpan<'_>) -> bool {
        *self == other.text
    }
}

impl<'a> StrSpan<'a> {
    /// Constructs a new `StrSpan` from a substring.
    ///
    /// `start` and `end` are byte offsets into `text`; `start` is also
    /// recorded as the position of the new span.
    ///
    /// Slicing panics if the range is out of bounds or does not fall on
    /// character boundaries.
    #[inline]
    pub(crate) fn from_substr(text: &str, start: usize, end: usize) -> StrSpan<'_> {
        debug_assert!(start <= end);
        StrSpan {
            text: &text[start..end],
            start,
        }
    }

    /// Returns `true` if self is empty.
    pub fn is_empty(&self) -> bool {
        self.text.is_empty()
    }

    /// Returns the start position of the span.
    #[inline]
    pub fn start(&self) -> usize {
        self.start
    }

    /// Returns the end position of the span.
    #[inline]
    pub fn end(&self) -> usize {
        self.start + self.text.len()
    }

    /// Returns the range of the span.
    #[inline]
    pub fn range(&self) -> Range<usize> {
        self.start..self.end()
    }

    /// Returns the span as a string slice.
    #[inline]
    pub fn as_str(&self) -> &'a str {
        self.text
    }

    /// Returns an underlying string region as `StrSpan`.
    ///
    /// `start` and `end` index into this span's own text, and the returned
    /// span records `start` as given: this span's own start offset is not
    /// added to it.
    #[inline]
    pub(crate) fn slice_region(&self, start: usize, end: usize) -> StrSpan<'a> {
        StrSpan::from_substr(self.text, start, end)
    }
}

impl fmt::Debug for StrSpan<'_> {
    /// Formats as `StrSpan("text" start..end)`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "StrSpan({:?} {}..{})",
            self.as_str(),
            self.start(),
            self.end()
        )
    }
}

impl fmt::Display for StrSpan<'_> {
    /// Writes the text only, without any position information.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}", self.as_str())
    }
}

impl Deref for StrSpan<'_> {
    type Target = str;

    fn deref(&self) -> &Self::Target {
        self.text
    }
}
118 |
--------------------------------------------------------------------------------
/tests/integration/document.rs:
--------------------------------------------------------------------------------
1 | use std::str;
2 |
3 | use crate::token::*;
4 |
5 | test!(document_01, "",);
6 |
7 | test!(document_02, " ",);
8 |
9 | test!(document_03, " \n\t\r ",);
10 |
11 | // BOM
12 | test!(
13 | document_05,
14 | str::from_utf8(b"\xEF\xBB\xBF").unwrap(),
15 | Token::ElementStart("", "a", 3..5),
16 | Token::ElementEnd(ElementEnd::Empty, 5..7)
17 | );
18 |
19 | test!(
20 | document_06,
21 | str::from_utf8(b"\xEF\xBB\xBF").unwrap(),
22 | Token::Declaration("1.0", None, None, 3..24)
23 | );
24 |
25 | test!(
26 | document_07,
27 | "\n\n\
28 | ",
29 | Token::Declaration("1.0", Some("utf-8"), None, 0..38),
30 | Token::Comment(" comment ", 39..55),
31 | Token::EmptyDtd(
32 | "svg",
33 | Some(ExternalId::Public(
34 | "-//W3C//DTD SVG 1.1//EN",
35 | "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"
36 | )),
37 | 56..154
38 | )
39 | );
40 |
41 | test!(
42 | document_08,
43 | "\n\
44 | ",
45 | Token::PI("xml-stylesheet", None, 0..18),
46 | Token::EmptyDtd(
47 | "svg",
48 | Some(ExternalId::Public(
49 | "-//W3C//DTD SVG 1.1//EN",
50 | "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"
51 | )),
52 | 19..117
53 | )
54 | );
55 |
56 | test!(
57 | document_09,
58 | "\n\n\
59 | ",
60 | Token::Declaration("1.0", Some("utf-8"), None, 0..38),
61 | Token::PI("xml-stylesheet", None, 39..57),
62 | Token::EmptyDtd(
63 | "svg",
64 | Some(ExternalId::Public(
65 | "-//W3C//DTD SVG 1.1//EN",
66 | "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"
67 | )),
68 | 58..156
69 | )
70 | );
71 |
72 | test!(
73 | document_err_01,
74 | "",
75 | Token::Error("unknown token at 1:1".to_string())
76 | );
77 |
78 | test!(
79 | document_err_02,
80 | " &www---------Ӥ+----------w-----www_",
81 | Token::Error("unknown token at 1:2".to_string())
82 | );
83 |
84 | test!(
85 | document_err_03,
86 | "q",
87 | Token::Error("unknown token at 1:1".to_string())
88 | );
89 |
90 | test!(
91 | document_err_04,
92 | "",
93 | Token::Error("unknown token at 1:1".to_string())
94 | );
95 |
96 | test!(
97 | document_err_05,
98 | "",
99 | Token::EmptyDtd("greeting1", None, 0..20),
100 | Token::Error("unknown token at 1:21".to_string())
101 | );
102 |
103 | test!(
104 | document_err_06,
105 | " ",
106 | Token::Error("unknown token at 1:1".to_string())
107 | );
108 |
/// `Tokenizer::from_fragment` accepts two sibling elements in a row: the
/// test expects start, end, then a second start token.
#[test]
fn parse_fragment_1() {
    let s = "<p/><p/>";
    let mut p = xml::Tokenizer::from_fragment(s, 0..s.len());

    match p.next().unwrap().unwrap() {
        xml::Token::ElementStart { local, .. } => assert_eq!(local.as_str(), "p"),
        _ => panic!(),
    }

    match p.next().unwrap().unwrap() {
        xml::Token::ElementEnd { .. } => {}
        _ => panic!(),
    }

    match p.next().unwrap().unwrap() {
        xml::Token::ElementStart { local, .. } => assert_eq!(local.as_str(), "p"),
        _ => panic!(),
    }
}
129 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # xmlparser
2 |
3 | [GitHub](https://github.com/RazrFalcon/xmlparser)
4 | [Crates.io](https://crates.io/crates/xmlparser)
5 | [Documentation](https://docs.rs/xmlparser)
6 | [CI](https://github.com/RazrFalcon/xmlparser/actions?query=branch%3Amaster)
7 |
8 | *xmlparser* is a low-level, pull-based, zero-allocation
9 | [XML 1.0](https://www.w3.org/TR/xml/) parser.
10 |
11 |
12 |
13 | ## Example
14 |
15 | ```rust
16 | for token in xmlparser::Tokenizer::from("<tagname name='value'/>") {
17 | println!("{:?}", token);
18 | }
19 | ```
20 |
21 |
22 |
23 | ## Why a new library?
24 |
25 | This library is basically a low-level XML tokenizer that preserves the
26 | positions of the tokens and is not intended to be used directly.
27 |
28 | If you are looking for a higher level solution, check out
29 | [roxmltree](https://github.com/RazrFalcon/roxmltree).
30 |
31 |
32 |
33 | ## Benefits
34 |
35 | - All tokens contain `StrSpan` structs which represent the position of the
36 | substring in the original document.
37 | - Good error processing. All error types contain the position (line:column)
38 | where it occurred.
39 | - No heap allocations.
40 | - No dependencies.
41 | - Tiny. ~1400 LOC and ~30KiB in the release build according to
42 | `cargo-bloat`.
43 | - Supports `no_std` builds. To use without the standard library, disable the
44 | default features.
45 |
46 |
47 |
48 | ## Limitations
49 |
50 | - Currently, only ENTITY objects are parsed from the DOCTYPE. All others are
51 | ignored.
52 | - No tree structure validation. So an XML like
53 | `<root><child></root></child>` or a string without root element will be
54 | parsed without errors. You should check for this manually. On the other
55 | hand `<a/><a/>` will lead to an error.
56 | - Duplicate attributes are not an error. So XML like `<item a="v1" a="v2"/>`
57 | will be parsed without errors. You should check for this manually.
58 | - UTF-8 only.
59 |
60 |
61 |
62 | ## Safety
63 |
64 | - The library must not panic. Any panic is considered a critical bug and
65 | should be reported.
66 | - The library forbids unsafe code.
67 |
68 |
69 |
70 | ## License
71 |
72 | Licensed under either of
73 |
74 | - Apache License, Version 2.0 ([LICENSE-APACHE] or
75 | http://www.apache.org/licenses/LICENSE-2.0)
76 | - MIT license ([LICENSE-MIT] or http://opensource.org/licenses/MIT)
77 |
78 | at your option.
79 |
80 |
81 |
82 | ### Contribution
83 |
84 | Unless you explicitly state otherwise, any contribution intentionally submitted
85 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall be
86 | dual licensed as above, without any additional terms or conditions.
87 |
88 | [LICENSE-APACHE]: https://github.com/RazrFalcon/xmlparser/blob/master/LICENSE-APACHE
89 | [LICENSE-MIT]: https://github.com/RazrFalcon/xmlparser/blob/master/LICENSE-MIT
90 |
--------------------------------------------------------------------------------
/src/xmlchar.rs:
--------------------------------------------------------------------------------
/// Extension methods for XML-subset only operations.
pub trait XmlCharExt {
    /// Checks if the value is within the
    /// [NameStartChar](https://www.w3.org/TR/xml/#NT-NameStartChar) range.
    fn is_xml_name_start(&self) -> bool;

    /// Checks if the value is within the
    /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range.
    fn is_xml_name(&self) -> bool;

    /// Checks if the value is within the
    /// [Char](https://www.w3.org/TR/xml/#NT-Char) range.
    fn is_xml_char(&self) -> bool;
}

impl XmlCharExt for char {
    #[inline]
    fn is_xml_name_start(&self) -> bool {
        // Check for ASCII first.
        if self.is_ascii() {
            return matches!(*self as u8, b'A'..=b'Z' | b'a'..=b'z' | b':' | b'_');
        }

        matches!(
            *self as u32,
            0x0000C0..=0x0000D6
                | 0x0000D8..=0x0000F6
                | 0x0000F8..=0x0002FF
                | 0x000370..=0x00037D
                | 0x00037F..=0x001FFF
                | 0x00200C..=0x00200D
                | 0x002070..=0x00218F
                | 0x002C00..=0x002FEF
                | 0x003001..=0x00D7FF
                | 0x00F900..=0x00FDCF
                | 0x00FDF0..=0x00FFFD
                | 0x010000..=0x0EFFFF
        )
    }

    #[inline]
    fn is_xml_name(&self) -> bool {
        // Check for ASCII first.
        if self.is_ascii() {
            return (*self as u8).is_xml_name();
        }

        // Same ranges as `is_xml_name_start`, plus U+00B7, U+0300-U+036F
        // and U+203F-U+2040.
        matches!(
            *self as u32,
            0x0000B7
                | 0x0000C0..=0x0000D6
                | 0x0000D8..=0x0000F6
                | 0x0000F8..=0x0002FF
                | 0x000300..=0x00036F
                | 0x000370..=0x00037D
                | 0x00037F..=0x001FFF
                | 0x00200C..=0x00200D
                | 0x00203F..=0x002040
                | 0x002070..=0x00218F
                | 0x002C00..=0x002FEF
                | 0x003001..=0x00D7FF
                | 0x00F900..=0x00FDCF
                | 0x00FDF0..=0x00FFFD
                | 0x010000..=0x0EFFFF
        )
    }

    #[inline]
    fn is_xml_char(&self) -> bool {
        // Does not check for surrogate code points U+D800-U+DFFF,
        // since that check was performed by Rust when the `&str` was constructed.
        if (*self as u32) < 0x20 {
            // Below U+0020 only tab, line feed and carriage return are allowed.
            return (*self as u8).is_xml_space();
        }
        !matches!(*self as u32, 0xFFFF | 0xFFFE)
    }
}

/// Extension methods for XML-subset only operations.
pub trait XmlByteExt {
    /// Checks if byte is a digit.
    ///
    /// `[0-9]`
    fn is_xml_digit(&self) -> bool;

    /// Checks if byte is a hex digit.
    ///
    /// `[0-9A-Fa-f]`
    fn is_xml_hex_digit(&self) -> bool;

    /// Checks if byte is a space.
    ///
    /// `[ \r\n\t]`
    fn is_xml_space(&self) -> bool;

    /// Checks if byte is an ASCII letter.
    ///
    /// `[A-Za-z]`
    fn is_xml_letter(&self) -> bool;

    /// Checks if byte is within the ASCII subset of the
    /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range.
    ///
    /// `[A-Za-z0-9:_.-]`
    fn is_xml_name(&self) -> bool;
}

impl XmlByteExt for u8 {
    #[inline]
    fn is_xml_digit(&self) -> bool {
        matches!(*self, b'0'..=b'9')
    }

    #[inline]
    fn is_xml_hex_digit(&self) -> bool {
        matches!(*self, b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f')
    }

    #[inline]
    fn is_xml_space(&self) -> bool {
        matches!(*self, b' ' | b'\t' | b'\n' | b'\r')
    }

    #[inline]
    fn is_xml_letter(&self) -> bool {
        matches!(*self, b'A'..=b'Z' | b'a'..=b'z')
    }

    #[inline]
    fn is_xml_name(&self) -> bool {
        matches!(
            *self,
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b':' | b'_' | b'-' | b'.'
        )
    }
}
134 |
--------------------------------------------------------------------------------
/tests/integration/token.rs:
--------------------------------------------------------------------------------
/// Byte range of a token in the input text.
type Range = ::std::ops::Range<usize>;

/// Simplified, string-based mirror of `xml::Token` that keeps test
/// expectations short. `Error` carries the error rendered as a string.
#[derive(PartialEq, Debug)]
pub enum Token<'a> {
    /// Version, encoding, standalone flag, span.
    Declaration(&'a str, Option<&'a str>, Option<bool>, Range),
    /// Target, content, span.
    PI(&'a str, Option<&'a str>, Range),
    /// Text, span.
    Comment(&'a str, Range),
    /// Name, external id, span.
    DtdStart(&'a str, Option<ExternalId<'a>>, Range),
    /// Name, external id, span.
    EmptyDtd(&'a str, Option<ExternalId<'a>>, Range),
    /// Name, definition, span.
    EntityDecl(&'a str, EntityDefinition<'a>, Range),
    DtdEnd(Range),
    /// Prefix, local name, span.
    ElementStart(&'a str, &'a str, Range),
    /// Prefix, local name, value, span.
    Attribute(&'a str, &'a str, &'a str, Range),
    ElementEnd(ElementEnd<'a>, Range),
    Text(&'a str, Range),
    Cdata(&'a str, Range),
    Error(String),
}

/// Test-side mirror of the element end kinds.
#[derive(PartialEq, Debug)]
pub enum ElementEnd<'a> {
    Open,
    Close(&'a str, &'a str),
    Empty,
}

/// Test-side mirror of a DOCTYPE/ENTITY external identifier.
#[derive(PartialEq, Debug)]
pub enum ExternalId<'a> {
    System(&'a str),
    Public(&'a str, &'a str),
}

/// Test-side mirror of an entity definition.
#[derive(PartialEq, Debug)]
pub enum EntityDefinition<'a> {
    EntityValue(&'a str),
    ExternalId(ExternalId<'a>),
}
38 |
/// Generates a `#[test]` named `$name` that tokenizes `$text`, compares each
/// produced token (converted with `to_test_token`) against the listed
/// expectations in order, and then asserts that the tokenizer is exhausted.
#[macro_export]
macro_rules! test {
    ($name:ident, $text:expr, $($token:expr),*) => {
        #[test]
        fn $name() {
            let mut tokenizer = xml::Tokenizer::from($text);
            $(
                let actual = to_test_token(tokenizer.next().unwrap());
                assert_eq!(actual, $token);
            )*
            assert!(tokenizer.next().is_none());
        }
    };
}
53 |
54 | #[inline(never)]
55 | pub fn to_test_token(token: Result) -> Token {
56 | match token {
57 | Ok(xml::Token::Declaration {
58 | version,
59 | encoding,
60 | standalone,
61 | span,
62 | }) => Token::Declaration(
63 | version.as_str(),
64 | encoding.map(|v| v.as_str()),
65 | standalone,
66 | span.range(),
67 | ),
68 | Ok(xml::Token::ProcessingInstruction {
69 | target,
70 | content,
71 | span,
72 | }) => Token::PI(target.as_str(), content.map(|v| v.as_str()), span.range()),
73 | Ok(xml::Token::Comment { text, span }) => Token::Comment(text.as_str(), span.range()),
74 | Ok(xml::Token::DtdStart {
75 | name,
76 | external_id,
77 | span,
78 | }) => Token::DtdStart(
79 | name.as_str(),
80 | external_id.map(|v| to_test_external_id(v)),
81 | span.range(),
82 | ),
83 | Ok(xml::Token::EmptyDtd {
84 | name,
85 | external_id,
86 | span,
87 | }) => Token::EmptyDtd(
88 | name.as_str(),
89 | external_id.map(|v| to_test_external_id(v)),
90 | span.range(),
91 | ),
92 | Ok(xml::Token::EntityDeclaration {
93 | name,
94 | definition,
95 | span,
96 | }) => Token::EntityDecl(
97 | name.as_str(),
98 | match definition {
99 | xml::EntityDefinition::EntityValue(name) => {
100 | EntityDefinition::EntityValue(name.as_str())
101 | }
102 | xml::EntityDefinition::ExternalId(id) => {
103 | EntityDefinition::ExternalId(to_test_external_id(id))
104 | }
105 | },
106 | span.range(),
107 | ),
108 | Ok(xml::Token::DtdEnd { span }) => Token::DtdEnd(span.range()),
109 | Ok(xml::Token::ElementStart {
110 | prefix,
111 | local,
112 | span,
113 | }) => Token::ElementStart(prefix.as_str(), local.as_str(), span.range()),
114 | Ok(xml::Token::Attribute {
115 | prefix,
116 | local,
117 | value,
118 | span,
119 | }) => Token::Attribute(
120 | prefix.as_str(),
121 | local.as_str(),
122 | value.as_str(),
123 | span.range(),
124 | ),
125 | Ok(xml::Token::ElementEnd { end, span }) => Token::ElementEnd(
126 | match end {
127 | xml::ElementEnd::Open => ElementEnd::Open,
128 | xml::ElementEnd::Close(prefix, local) => {
129 | ElementEnd::Close(prefix.as_str(), local.as_str())
130 | }
131 | xml::ElementEnd::Empty => ElementEnd::Empty,
132 | },
133 | span.range(),
134 | ),
135 | Ok(xml::Token::Text { text }) => Token::Text(text.as_str(), text.range()),
136 | Ok(xml::Token::Cdata { text, span }) => Token::Cdata(text.as_str(), span.range()),
137 | Err(ref e) => Token::Error(e.to_string()),
138 | }
139 | }
140 |
141 | fn to_test_external_id(id: xml::ExternalId) -> ExternalId {
142 | match id {
143 | xml::ExternalId::System(name) => ExternalId::System(name.as_str()),
144 | xml::ExternalId::Public(name, value) => ExternalId::Public(name.as_str(), value.as_str()),
145 | }
146 | }
147 |
--------------------------------------------------------------------------------
/tests/integration/pi.rs:
--------------------------------------------------------------------------------
1 | use crate::token::*;
2 |
3 | test!(pi_01, "", Token::PI("xslt", Some("ma"), 0..11));
4 |
5 | test!(
6 | pi_02,
7 | "",
8 | Token::PI("xslt", Some("m"), 0..13)
9 | );
10 |
11 | test!(pi_03, "", Token::PI("xslt", None, 0..8));
12 |
13 | test!(pi_04, "", Token::PI("xslt", None, 0..9));
14 |
15 | test!(
16 | pi_05,
17 | "",
18 | Token::PI("xml-stylesheet", None, 0..18)
19 | );
20 |
21 | test!(
22 | pi_err_01,
23 | "?xml \t\n m?>",
24 | Token::Error("invalid processing instruction at 1:1 cause invalid name token".to_string())
25 | );
26 |
27 | test!(
28 | declaration_01,
29 | "",
30 | Token::Declaration("1.0", None, None, 0..21)
31 | );
32 |
33 | test!(
34 | declaration_02,
35 | "",
36 | Token::Declaration("1.0", None, None, 0..21)
37 | );
38 |
39 | test!(
40 | declaration_03,
41 | "",
42 | Token::Declaration("1.0", Some("UTF-8"), None, 0..38)
43 | );
44 |
45 | test!(
46 | declaration_04,
47 | "",
48 | Token::Declaration("1.0", Some("UTF-8"), None, 0..38)
49 | );
50 |
51 | test!(
52 | declaration_05,
53 | "",
54 | Token::Declaration("1.0", Some("utf-8"), None, 0..38)
55 | );
56 |
57 | test!(
58 | declaration_06,
59 | "",
60 | Token::Declaration("1.0", Some("EUC-JP"), None, 0..39)
61 | );
62 |
63 | test!(
64 | declaration_07,
65 | "",
66 | Token::Declaration("1.0", Some("UTF-8"), Some(true), 0..55)
67 | );
68 |
69 | test!(
70 | declaration_08,
71 | "",
72 | Token::Declaration("1.0", Some("UTF-8"), Some(false), 0..54)
73 | );
74 |
75 | test!(
76 | declaration_09,
77 | "",
78 | Token::Declaration("1.0", None, Some(false), 0..37)
79 | );
80 |
81 | test!(
82 | declaration_10,
83 | "",
84 | Token::Declaration("1.0", None, Some(false), 0..38)
85 | );
86 |
87 | // Declaration with an invalid order
88 | test!(
89 | declaration_err_01,
90 | "",
91 | Token::Error("invalid XML declaration at 1:1 cause expected 'version' at 1:7".to_string())
92 | );
93 |
94 | test!(
95 | declaration_err_02,
96 | "",
97 | Token::Error("invalid XML declaration at 1:1 cause expected '\'' not '*' at 1:31".to_string())
98 | );
99 |
100 | test!(
101 | declaration_err_03,
102 | "",
103 | Token::Error("invalid XML declaration at 1:1 cause expected '1.' at 1:16".to_string())
104 | );
105 |
106 | test!(
107 | declaration_err_04,
108 | "",
109 | Token::Error("invalid XML declaration at 1:1 cause expected 'yes', 'no' at 1:33".to_string())
110 | );
111 |
112 | test!(
113 | declaration_err_05,
114 | "",
115 | Token::Error("invalid XML declaration at 1:1 cause expected '?>' at 1:21".to_string())
116 | );
117 |
118 | test!(
119 | declaration_err_06,
120 | "",
121 | Token::Error("invalid XML declaration at 1:1 cause expected '?>' at 1:55".to_string())
122 | );
123 |
124 | test!(
125 | declaration_err_07,
126 | "\u{000a}' at 3:7".to_string())
128 | );
129 |
130 | test!(
131 | declaration_err_08,
132 | "",
133 | Token::Error("invalid XML declaration at 1:1 cause expected 'version' at 2:2".to_string())
134 | );
135 |
136 | test!(
137 | declaration_err_09,
138 | "",
139 | Token::Error("invalid XML declaration at 1:1 cause expected 'version' at 2:2".to_string())
140 | );
141 |
142 | // XML declaration allowed only at the start of the document.
143 | test!(
144 | declaration_err_10,
145 | " ",
146 | Token::Error("unknown token at 1:2".to_string())
147 | );
148 |
149 | // XML declaration allowed only at the start of the document.
150 | test!(
151 | declaration_err_11,
152 | "",
153 | Token::Comment(" comment ", 0..16),
154 | Token::Error("unknown token at 1:17".to_string())
155 | );
156 |
157 | // Duplicate.
158 | test!(
159 | declaration_err_12,
160 | "",
161 | Token::Declaration("1.0", None, None, 0..21),
162 | Token::Error("unknown token at 1:22".to_string())
163 | );
164 |
165 | test!(
166 | declaration_err_13,
167 | "",
168 | Token::Error(
169 | "invalid processing instruction at 1:1 cause a non-XML character '\\u{1}' found at 1:10"
170 | .to_string()
171 | )
172 | );
173 |
174 | test!(
175 | declaration_err_14,
176 | "",
177 | Token::Error("invalid XML declaration at 1:1 cause expected space not 'e' at 1:20".to_string())
178 | );
179 |
180 | test!(
181 | declaration_err_15,
182 | "",
183 | Token::Error("invalid XML declaration at 1:1 cause expected space not 's' at 1:37".to_string())
184 | );
185 |
186 | test!(
187 | declaration_err_16,
188 | "' at 1:20".to_string())
190 | );
191 |
--------------------------------------------------------------------------------
/tests/integration/doctype.rs:
--------------------------------------------------------------------------------
1 | use crate::token::*;
2 |
3 | test!(
4 | dtd_01,
5 | "",
6 | Token::EmptyDtd("greeting", Some(ExternalId::System("hello.dtd")), 0..38)
7 | );
8 |
9 | test!(
10 | dtd_02,
11 | "",
12 | Token::EmptyDtd(
13 | "greeting",
14 | Some(ExternalId::Public("hello.dtd", "goodbye.dtd")),
15 | 0..52
16 | )
17 | );
18 |
19 | test!(
20 | dtd_03,
21 | "",
22 | Token::EmptyDtd("greeting", Some(ExternalId::System("hello.dtd")), 0..38)
23 | );
24 |
25 | test!(
26 | dtd_04,
27 | "",
28 | Token::EmptyDtd("greeting", None, 0..19)
29 | );
30 |
31 | test!(
32 | dtd_05,
33 | "",
34 | Token::DtdStart("greeting", None, 0..20),
35 | Token::DtdEnd(20..22)
36 | );
37 |
38 | test!(
39 | dtd_06,
40 | "",
41 | Token::EmptyDtd("greeting", None, 0..19),
42 | Token::ElementStart("", "a", 19..21),
43 | Token::ElementEnd(ElementEnd::Empty, 21..23)
44 | );
45 |
46 | test!(
47 | dtd_07,
48 | "",
49 | Token::DtdStart("greeting", None, 0..20),
50 | Token::DtdEnd(20..23)
51 | );
52 |
53 | test!(
54 | dtd_08,
55 | "",
56 | Token::DtdStart("greeting", None, 0..20),
57 | Token::DtdEnd(21..24)
58 | );
59 |
60 | test!(
61 | dtd_entity_01,
62 | "
64 | ]>",
65 | Token::DtdStart("svg", None, 0..15),
66 | Token::EntityDecl(
67 | "ns_extend",
68 | EntityDefinition::EntityValue("http://ns.adobe.com/Extensibility/1.0/"),
69 | 20..80,
70 | ),
71 | Token::DtdEnd(81..83)
72 | );
73 |
74 | test!(
75 | dtd_entity_02,
76 | "
79 | ]>",
80 | Token::DtdStart("svg", None, 0..15),
81 | Token::EntityDecl(
82 | "Pub-Status",
83 | EntityDefinition::EntityValue("This is a pre-release of the\nspecification."),
84 | 20..86,
85 | ),
86 | Token::DtdEnd(87..89)
87 | );
88 |
89 | test!(
90 | dtd_entity_03,
91 | "
93 | ]>",
94 | Token::DtdStart("svg", None, 0..15),
95 | Token::EntityDecl(
96 | "open-hatch",
97 | EntityDefinition::ExternalId(ExternalId::System(
98 | "http://www.textuality.com/boilerplate/OpenHatch.xml"
99 | )),
100 | 20..101,
101 | ),
102 | Token::DtdEnd(102..104)
103 | );
104 |
105 | test!(
106 | dtd_entity_04,
107 | "
111 | ]>",
112 | Token::DtdStart("svg", None, 0..15),
113 | Token::EntityDecl(
114 | "open-hatch",
115 | EntityDefinition::ExternalId(ExternalId::Public(
116 | "-//Textuality//TEXT Standard open-hatch boilerplate//EN",
117 | "http://www.textuality.com/boilerplate/OpenHatch.xml"
118 | )),
119 | 20..185,
120 | ),
121 | Token::DtdEnd(186..188)
122 | );
123 |
124 | // TODO: NDATA will be ignored
125 | test!(
126 | dtd_entity_05,
127 | "
129 | ]>",
130 | Token::DtdStart("svg", None, 0..15),
131 | Token::EntityDecl(
132 | "hatch-pic",
133 | EntityDefinition::ExternalId(ExternalId::System("../grafix/OpenHatch.gif")),
134 | 20..83,
135 | ),
136 | Token::DtdEnd(84..86)
137 | );
138 |
139 | // TODO: unsupported data will be ignored
140 | test!(
141 | dtd_entity_06,
142 | "
144 |
145 |
146 |
147 | ]>",
148 | Token::DtdStart("svg", None, 0..15),
149 | Token::EntityDecl(
150 | "ns_extend",
151 | EntityDefinition::EntityValue("http://ns.adobe.com/Extensibility/1.0/"),
152 | 44..104
153 | ),
154 | Token::DtdEnd(203..205)
155 | );
156 |
157 | // We do not support !ELEMENT DTD token and it will be skipped.
158 | // Previously, we were calling `Tokenizer::next` after the skip,
159 | // which is recursive and could cause a stack overflow when there are too many sequential
160 | // unsupported tokens.
161 | // This test checks that the current code does not crash with a stack overflow.
162 | #[test]
163 | fn dtd_entity_07() {
164 | let mut text = "\n");
167 | }
168 | text.push_str("]>\n");
169 |
170 | let mut p = xml::Tokenizer::from(text.as_str());
171 | assert_eq!(
172 | to_test_token(p.next().unwrap()),
173 | Token::DtdStart("svg", None, 0..15)
174 | );
175 | assert_eq!(
176 | to_test_token(p.next().unwrap()),
177 | Token::DtdEnd(10016..10018)
178 | );
179 | }
180 |
181 | test!(
182 | dtd_err_01,
183 | "\u{000a}<",
184 | Token::Error("invalid DTD at 1:1 cause expected space not 'E' at 1:10".to_string())
185 | );
186 |
187 | test!(
188 | dtd_err_02,
189 | "' not '!' at 1:16".to_string())
217 | );
218 |
--------------------------------------------------------------------------------
/src/error.rs:
--------------------------------------------------------------------------------
1 | use core::fmt;
2 | use core::str;
3 | #[cfg(feature = "std")]
4 | use std::error;
5 |
6 | /// An XML parser errors.
7 | #[allow(missing_docs)]
8 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
9 | pub enum Error {
10 | InvalidDeclaration(StreamError, TextPos),
11 | InvalidComment(StreamError, TextPos),
12 | InvalidPI(StreamError, TextPos),
13 | InvalidDoctype(StreamError, TextPos),
14 | InvalidEntity(StreamError, TextPos),
15 | InvalidElement(StreamError, TextPos),
16 | InvalidAttribute(StreamError, TextPos),
17 | InvalidCdata(StreamError, TextPos),
18 | InvalidCharData(StreamError, TextPos),
19 | UnknownToken(TextPos),
20 | }
21 |
22 | impl Error {
23 | /// Returns the error position.
24 | pub fn pos(&self) -> TextPos {
25 | match *self {
26 | Error::InvalidDeclaration(_, pos) => pos,
27 | Error::InvalidComment(_, pos) => pos,
28 | Error::InvalidPI(_, pos) => pos,
29 | Error::InvalidDoctype(_, pos) => pos,
30 | Error::InvalidEntity(_, pos) => pos,
31 | Error::InvalidElement(_, pos) => pos,
32 | Error::InvalidAttribute(_, pos) => pos,
33 | Error::InvalidCdata(_, pos) => pos,
34 | Error::InvalidCharData(_, pos) => pos,
35 | Error::UnknownToken(pos) => pos,
36 | }
37 | }
38 | }
39 |
40 | impl fmt::Display for Error {
41 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
42 | match *self {
43 | Error::InvalidDeclaration(ref cause, pos) => {
44 | write!(f, "invalid XML declaration at {} cause {}", pos, cause)
45 | }
46 | Error::InvalidComment(ref cause, pos) => {
47 | write!(f, "invalid comment at {} cause {}", pos, cause)
48 | }
49 | Error::InvalidPI(ref cause, pos) => {
50 | write!(
51 | f,
52 | "invalid processing instruction at {} cause {}",
53 | pos, cause
54 | )
55 | }
56 | Error::InvalidDoctype(ref cause, pos) => {
57 | write!(f, "invalid DTD at {} cause {}", pos, cause)
58 | }
59 | Error::InvalidEntity(ref cause, pos) => {
60 | write!(f, "invalid DTD entity at {} cause {}", pos, cause)
61 | }
62 | Error::InvalidElement(ref cause, pos) => {
63 | write!(f, "invalid element at {} cause {}", pos, cause)
64 | }
65 | Error::InvalidAttribute(ref cause, pos) => {
66 | write!(f, "invalid attribute at {} cause {}", pos, cause)
67 | }
68 | Error::InvalidCdata(ref cause, pos) => {
69 | write!(f, "invalid CDATA at {} cause {}", pos, cause)
70 | }
71 | Error::InvalidCharData(ref cause, pos) => {
72 | write!(f, "invalid character data at {} cause {}", pos, cause)
73 | }
74 | Error::UnknownToken(pos) => {
75 | write!(f, "unknown token at {}", pos)
76 | }
77 | }
78 | }
79 | }
80 |
/// `std::error::Error` integration; only compiled when the `std` feature is on.
#[cfg(feature = "std")]
impl error::Error for Error {
    /// Returns a fixed, generic description; details come from `Display`.
    fn description(&self) -> &str {
        let text = "an XML parsing error";
        text
    }
}
87 |
88 | /// A stream parser errors.
89 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
90 | pub enum StreamError {
91 | /// The steam ended earlier than we expected.
92 | ///
93 | /// Should only appear on invalid input data.
94 | /// Errors in a valid XML should be handled by errors below.
95 | UnexpectedEndOfStream,
96 |
97 | /// An invalid name.
98 | InvalidName,
99 |
100 | /// A non-XML character has occurred.
101 | ///
102 | /// Valid characters are:
103 | NonXmlChar(char, TextPos),
104 |
105 | /// An invalid/unexpected character.
106 | ///
107 | /// The first byte is an actual one, the second one is expected.
108 | ///
109 | /// We are using a single value to reduce the struct size.
110 | InvalidChar(u8, u8, TextPos),
111 |
112 | /// An invalid/unexpected character.
113 | ///
114 | /// Just like `InvalidChar`, but specifies multiple expected characters.
115 | InvalidCharMultiple(u8, &'static [u8], TextPos),
116 |
117 | /// An unexpected character instead of `"` or `'`.
118 | InvalidQuote(u8, TextPos),
119 |
120 | /// An unexpected character instead of an XML space.
121 | ///
122 | /// Includes: `' ' \n \r \t
`.
123 | InvalidSpace(u8, TextPos),
124 |
125 | /// An unexpected string.
126 | ///
127 | /// Contains what string was expected.
128 | InvalidString(&'static str, TextPos),
129 |
130 | /// An invalid reference.
131 | InvalidReference,
132 |
133 | /// An invalid ExternalID in the DTD.
134 | InvalidExternalID,
135 |
136 | /// Comment cannot contain `--`.
137 | InvalidCommentData,
138 |
139 | /// Comment cannot end with `-`.
140 | InvalidCommentEnd,
141 |
142 | /// A Character Data node contains an invalid data.
143 | ///
144 | /// Currently, only `]]>` is not allowed.
145 | InvalidCharacterData,
146 | }
147 |
148 | impl fmt::Display for StreamError {
149 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
150 | match *self {
151 | StreamError::UnexpectedEndOfStream => {
152 | write!(f, "unexpected end of stream")
153 | }
154 | StreamError::InvalidName => {
155 | write!(f, "invalid name token")
156 | }
157 | StreamError::NonXmlChar(c, pos) => {
158 | write!(f, "a non-XML character {:?} found at {}", c, pos)
159 | }
160 | StreamError::InvalidChar(actual, expected, pos) => {
161 | write!(
162 | f,
163 | "expected '{}' not '{}' at {}",
164 | expected as char, actual as char, pos
165 | )
166 | }
167 | StreamError::InvalidCharMultiple(actual, expected, pos) => {
168 | let mut expected_iter = expected.iter().peekable();
169 |
170 | write!(f, "expected ")?;
171 | while let Some(&c) = expected_iter.next() {
172 | write!(f, "'{}'", c as char)?;
173 | if expected_iter.peek().is_some() {
174 | write!(f, ", ")?;
175 | }
176 | }
177 | write!(f, " not '{}' at {}", actual as char, pos)
178 | }
179 | StreamError::InvalidQuote(c, pos) => {
180 | write!(f, "expected quote mark not '{}' at {}", c as char, pos)
181 | }
182 | StreamError::InvalidSpace(c, pos) => {
183 | write!(f, "expected space not '{}' at {}", c as char, pos)
184 | }
185 | StreamError::InvalidString(expected, pos) => {
186 | write!(f, "expected '{}' at {}", expected, pos)
187 | }
188 | StreamError::InvalidReference => {
189 | write!(f, "invalid reference")
190 | }
191 | StreamError::InvalidExternalID => {
192 | write!(f, "invalid ExternalID")
193 | }
194 | StreamError::InvalidCommentData => {
195 | write!(f, "'--' is not allowed in comments")
196 | }
197 | StreamError::InvalidCommentEnd => {
198 | write!(f, "comment cannot end with '-'")
199 | }
200 | StreamError::InvalidCharacterData => {
201 | write!(f, "']]>' is not allowed inside a character data")
202 | }
203 | }
204 | }
205 | }
206 |
/// `std::error::Error` integration; only compiled when the `std` feature is on.
#[cfg(feature = "std")]
impl error::Error for StreamError {
    /// Returns a fixed, generic description; details come from `Display`.
    fn description(&self) -> &str {
        let text = "an XML stream parsing error";
        text
    }
}
213 |
/// A location inside the original text.
///
/// Stores the row (line) and the column, both counted from 1, so the very
/// first position is `1:1`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[allow(missing_docs)]
pub struct TextPos {
    pub row: u32,
    pub col: u32,
}

impl TextPos {
    /// Builds a `TextPos` from a row and a column.
    ///
    /// Normally obtained through `Stream::gen_text_pos` rather than created by hand.
    pub fn new(row: u32, col: u32) -> TextPos {
        Self { row, col }
    }
}

/// Formats the position as `row:col`.
impl fmt::Display for TextPos {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let TextPos { row, col } = *self;
        write!(f, "{}:{}", row, col)
    }
}
238 |
--------------------------------------------------------------------------------
/tests/integration/elements.rs:
--------------------------------------------------------------------------------
1 | use crate::token::*;
2 |
3 | test!(
4 | element_01,
5 | "",
6 | Token::ElementStart("", "a", 0..2),
7 | Token::ElementEnd(ElementEnd::Empty, 2..4)
8 | );
9 |
10 | test!(
11 | element_02,
12 | "",
13 | Token::ElementStart("", "a", 0..2),
14 | Token::ElementEnd(ElementEnd::Open, 2..3),
15 | Token::ElementEnd(ElementEnd::Close("", "a"), 3..7)
16 | );
17 |
18 | test!(
19 | element_03,
20 | " \t \n ",
21 | Token::ElementStart("", "a", 5..7),
22 | Token::ElementEnd(ElementEnd::Empty, 7..9)
23 | );
24 |
25 | test!(
26 | element_04,
27 | " \t \n ",
28 | Token::ElementStart("", "b", 5..7),
29 | Token::ElementEnd(ElementEnd::Open, 7..8),
30 | Token::ElementStart("", "a", 8..10),
31 | Token::ElementEnd(ElementEnd::Empty, 10..12),
32 | Token::ElementEnd(ElementEnd::Close("", "b"), 12..16)
33 | );
34 |
35 | test!(
36 | element_06,
37 | "<俄语 լեզու=\"ռուսերեն\">данные俄语>",
38 | Token::ElementStart("", "俄语", 0..7),
39 | Token::Attribute("", "լեզու", "ռուսերեն", 8..37),
40 | Token::ElementEnd(ElementEnd::Open, 37..38),
41 | Token::Text("данные", 38..50),
42 | Token::ElementEnd(ElementEnd::Close("", "俄语"), 50..59)
43 | );
44 |
45 | test!(
46 | element_07,
47 | "",
48 | Token::ElementStart("svg", "circle", 0..11),
49 | Token::ElementEnd(ElementEnd::Open, 11..12),
50 | Token::ElementEnd(ElementEnd::Close("svg", "circle"), 12..25)
51 | );
52 |
53 | test!(
54 | element_08,
55 | "<:circle/>",
56 | Token::ElementStart("", "circle", 0..8),
57 | Token::ElementEnd(ElementEnd::Empty, 8..10)
58 | );
59 |
60 | test!(
61 | element_err_01,
62 | "<>",
63 | Token::Error("invalid element at 1:1 cause invalid name token".to_string())
64 | );
65 |
66 | test!(
67 | element_err_02,
68 | "",
69 | Token::Error("invalid element at 1:1 cause invalid name token".to_string())
70 | );
71 |
72 | test!(
73 | element_err_03,
74 | "",
113 | Token::ElementStart("", "a", 0..2),
114 | Token::ElementEnd(ElementEnd::Open, 2..3),
115 | Token::ElementEnd(ElementEnd::Close("", "a"), 3..7),
116 | Token::Error("unknown token at 1:8".to_string())
117 | );
118 |
119 | test!(
120 | element_err_10,
121 | "",
122 | Token::ElementStart("", "a", 0..2),
123 | Token::ElementEnd(ElementEnd::Empty, 2..4),
124 | Token::Error("unknown token at 1:5".to_string())
125 | );
126 |
127 | test!(
128 | element_err_11,
129 | "",
130 | Token::ElementStart("", "a", 0..2),
131 | Token::ElementEnd(ElementEnd::Open, 2..3),
132 | Token::Error("invalid element at 1:4 cause expected '>' not '/' at 1:8".to_string())
133 | );
134 |
135 | test!(
136 | element_err_12,
137 | "",
138 | Token::Error("invalid element at 1:1 cause invalid name token".to_string())
139 | );
140 |
141 | test!(
142 | element_err_13,
143 | "\
144 |
145 |
146 | ",
147 | Token::ElementStart("", "root", 0..5),
148 | Token::ElementEnd(ElementEnd::Open, 5..6),
149 | Token::Text("\n", 6..7),
150 | Token::ElementEnd(ElementEnd::Close("", "root"), 7..14),
151 | Token::Error("unknown token at 3:1".to_string())
152 | );
153 |
154 | test!(
155 | element_err_14,
156 | "<-svg/>",
157 | Token::Error("invalid element at 1:1 cause invalid name token".to_string())
158 | );
159 |
160 | test!(
161 | element_err_15,
162 | "",
163 | Token::Error("invalid element at 1:1 cause invalid name token".to_string())
164 | );
165 |
166 | test!(
167 | element_err_16,
168 | "",
169 | Token::Error("invalid element at 1:1 cause invalid name token".to_string())
170 | );
171 |
172 | test!(
173 | element_err_17,
174 | "",
175 | Token::Error("invalid element at 1:1 cause invalid name token".to_string())
176 | );
177 |
178 | test!(
179 | element_err_18,
180 | "<::svg/>",
181 | Token::Error("invalid element at 1:1 cause invalid name token".to_string())
182 | );
183 |
184 | test!(
185 | element_err_19,
186 | "<",
187 | Token::ElementStart("", "a", 0..2),
188 | Token::ElementEnd(ElementEnd::Open, 2..3),
189 | Token::Error("unknown token at 1:4".to_string())
190 | );
191 |
192 | test!(
193 | attribute_01,
194 | "",
195 | Token::ElementStart("", "a", 0..2),
196 | Token::Attribute("", "ax", "test", 3..12),
197 | Token::ElementEnd(ElementEnd::Empty, 12..14)
198 | );
199 |
200 | test!(
201 | attribute_02,
202 | "",
203 | Token::ElementStart("", "a", 0..2),
204 | Token::Attribute("", "ax", "test", 3..12),
205 | Token::ElementEnd(ElementEnd::Empty, 12..14)
206 | );
207 |
208 | test!(
209 | attribute_03,
210 | "",
211 | Token::ElementStart("", "a", 0..2),
212 | Token::Attribute("", "b", "test1", 3..12),
213 | Token::Attribute("", "c", "test2", 13..22),
214 | Token::ElementEnd(ElementEnd::Empty, 22..24)
215 | );
216 |
217 | test!(
218 | attribute_04,
219 | "",
220 | Token::ElementStart("", "a", 0..2),
221 | Token::Attribute("", "b", "\"test1\"", 3..14),
222 | Token::Attribute("", "c", "'test2'", 15..26),
223 | Token::ElementEnd(ElementEnd::Empty, 26..28)
224 | );
225 |
226 | test!(
227 | attribute_05,
228 | "",
229 | Token::ElementStart("", "c", 0..2),
230 | Token::Attribute("", "a", "test1' c='test2", 3..22),
231 | Token::Attribute("", "b", "test1\" c=\"test2", 23..42),
232 | Token::ElementEnd(ElementEnd::Empty, 42..44)
233 | );
234 |
235 | test!(
236 | attribute_06,
237 | "",
238 | Token::ElementStart("", "c", 0..2),
239 | Token::Attribute("", "a", "test1", 5..21),
240 | Token::ElementEnd(ElementEnd::Empty, 26..28)
241 | );
242 |
243 | test!(
244 | attribute_07,
245 | "",
246 | Token::ElementStart("", "c", 0..2),
247 | Token::Attribute("q", "a", "b", 3..10),
248 | Token::ElementEnd(ElementEnd::Empty, 10..12)
249 | );
250 |
251 | test!(
252 | attribute_err_01,
253 | "",
254 | Token::ElementStart("", "c", 0..2),
255 | Token::Error("invalid attribute at 1:3 cause expected quote mark not 't' at 1:7".to_string())
256 | );
257 |
258 | test!(
259 | attribute_err_02,
260 | "",
261 | Token::ElementStart("", "c", 0..2),
262 | Token::Error("invalid attribute at 1:3 cause expected \'=\' not \'>\' at 1:5".to_string())
263 | );
264 |
265 | test!(
266 | attribute_err_03,
267 | "",
268 | Token::ElementStart("", "c", 0..2),
269 | Token::Error("invalid attribute at 1:3 cause expected '=' not '/' at 1:5".to_string())
270 | );
271 |
272 | test!(
273 | attribute_err_04,
274 | "",
275 | Token::ElementStart("", "c", 0..2),
276 | Token::Attribute("", "a", "b", 3..8),
277 | Token::Error("invalid attribute at 1:9 cause expected '=' not '/' at 1:11".to_string())
278 | );
279 |
280 | test!(
281 | attribute_err_05,
282 | "",
283 | Token::ElementStart("", "c", 0..2),
284 | Token::Error("invalid attribute at 1:3 cause expected ''' not '<' at 1:7".to_string())
285 | );
286 |
287 | test!(
288 | attribute_err_06,
289 | "",
290 | Token::ElementStart("", "c", 0..2),
291 | Token::Error(
292 | "invalid attribute at 1:3 cause a non-XML character '\\u{1}' found at 1:7".to_string()
293 | )
294 | );
295 |
296 | test!(
297 | attribute_err_07,
298 | "",
299 | Token::ElementStart("", "c", 0..2),
300 | Token::Attribute("", "a", "v", 3..8),
301 | Token::Error("invalid attribute at 1:9 cause expected space not 'b' at 1:9".to_string())
302 | );
303 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Change Log
2 | All notable changes to this project will be documented in this file.
3 |
4 | The format is based on [Keep a Changelog](http://keepachangelog.com/)
5 | and this project adheres to [Semantic Versioning](http://semver.org/).
6 |
7 | ## [Unreleased]
8 |
9 | ## [0.13.6] - 2023-09-30
10 | ### Added
11 | - `Token::span`, `Tokenizer::stream` and allow cloning of `Tokenizer`.
12 | Thanks to [@krtab](https://github.com/krtab).
13 |
14 | ### Changed
15 | - Optimize `is_xml_char` function. Makes parsing 5-10% faster.
16 | Thanks to [@Simon-Martens](https://github.com/Simon-Martens).
17 |
18 | ## [0.13.5] - 2022-10-18
19 | ### Fixed
20 | - Do not use recursive calls during parsing. Could lead to stack overflow on some input.
21 | - Revert _Do not expand predefined references in `Stream::consume_reference`._
22 | - Tests on Rust 1.61. Thanks to [@krtab](https://github.com/krtab).
23 |
24 | ## [0.13.4] - 2021-06-24
25 | ### Fixed
26 | - Do not expand predefined references in `Stream::consume_reference`.
27 | Thanks to [@Jesse-Bakker](https://github.com/Jesse-Bakker).
28 |
29 | ## [0.13.3] - 2020-09-02
30 | ### Changed
31 | - Documentation fixes by [@kneasle](https://github.com/kneasle).
32 |
33 | ### Fixed
34 | - `DtdEnd` token parsing when `]` and `>` are separated by a whitespace.
35 |
36 | ## [0.13.2] - 2020-06-15
37 | ### Fixed
38 | - Allow processing instruction before DTD.
39 |
40 | ## [0.13.1] - 2020-03-12
41 | ### Fixed
42 | - Allow comments before DTD.
43 |
44 | ## [0.13.0] - 2020-01-07
45 | ### Changed
46 | - Moved to Rust 2018.
47 | - Completely new `Error` enum.
48 | - New error messages.
49 | - 10-20% faster parsing.
50 | - Use `Tokenizer::from_fragment` instead of `Tokenizer::enable_fragment_mode`.
51 |
52 | ### Removed
53 | - `TokenType`.
54 |
55 | ## [0.12.0] - 2019-12-21
56 | ### Changed
57 | - `]]>` is no longer allowed inside a Text node.
58 | - Only [XML characters](https://www.w3.org/TR/xml/#char32) are allowed now.
59 | Otherwise, `StreamError::NonXmlChar` will occur.
60 | - Disallow `-` at the end of a comment. `<!--a--->` is an error now.
61 | - A missing space between attributes is an error now.
62 | - `StreamError::InvalidQuote` and `StreamError::InvalidSpace` signature changed.
63 |
64 | ## [0.11.0] - 2019-11-18
65 | ### Added
66 | - `no_std` support thanks to [hugwijst](https://github.com/hugwijst).
67 |
68 | ### Changed
69 | - `StreamError::InvalidString` doesn't store an actual string now.
70 |
71 | ## [0.10.0] - 2019-09-14
72 | ### Changed
73 | - 10-15% faster parsing.
74 | - Merge `ByteStream` and `Stream`.
75 | - `StreamError::InvalidChar` signature changed.
76 | - `StreamError::InvalidChar` was split into `InvalidChar` and `InvalidCharMultiple`.
77 |
78 | ### Fixed
79 | - Check for [NameStartChar](https://www.w3.org/TR/xml/#NT-NameStartChar)
80 | during qualified name parsing.
81 |
82 | E.g. `<-p>` is an invalid tag name from now on.
83 | - Qualified name with multiple `:` is an error now.
84 | - `]>` is a valid text/`CharData` now. Previously it was parsed as `DoctypeEnd`.
85 |
86 | ### Removed
87 | - `StreamError::InvalidAttributeValue`. `StreamError::InvalidChar` will be emitted instead.
88 |
89 | ## [0.9.0] - 2019-02-27
90 | ### Added
91 | - `span` field to all `Token` variants, which contains a whole token span in bytes.
92 | - `Stream::try_consume_byte`.
93 |
94 | ### Changed
95 | - All `Token` variants are structs now and not tuples.
96 | - `StrSpan` contains an actual string span and not only a region now.
97 |
98 | So we can use a non-panic and zero-cost `StrSpan::as_str` instead
99 | of `StrSpan::to_str`, that was performing slicing each time.
100 | - Split `Stream` into `ByteStream` and `Stream`.
101 | - `Stream::skip_spaces` will parse only ASCII whitespace now.
102 | - Rename `StrSpan::to_str` into `StrSpan::as_str`.
103 | - Rename `Reference::EntityRef` into `Reference::Entity`.
104 | - Rename `Reference::CharRef` into `Reference::Char`.
105 | - `StrSpan::from_substr` and `StrSpan::slice_region` are private now.
106 |
107 | ### Removed
108 | - `Token::Whitespaces`. Will be parsed as `Token::Text`.
109 | - `Stream::curr_char`.
110 | - `Stream::is_curr_byte_eq`.
111 | - `Stream::consume_either`.
112 | - `Stream::skip_ascii_spaces`. Use `Stream::skip_spaces` instead.
113 | - `StrSpan::trim`.
114 | - `StrSpan::len`.
115 | - `StrSpan::full_len`.
116 | - `StrSpan::as_bytes`.
117 |
118 | ### Fixed
119 | - Declaration attributes with mixed quotes parsing.
120 |
121 | ## [0.8.1] - 2019-01-02
122 | ### Changed
123 | - Changed the crate category in the Cargo.toml
124 |
125 | ## [0.8.0] - 2018-12-13
126 | ### Added
127 | - `Error::pos()`.
128 |
129 | ### Changed
130 | - Rename `Stream::gen_error_pos` into `Stream::gen_text_pos`.
131 | - Rename `Stream::gen_error_pos_from` into `Stream::gen_text_pos_from`.
132 | - `Stream::gen_text_pos` speed up.
133 |
134 | ### Fixed
135 | - `TextPos` is Unicode aware now.
136 | - XML declaration parsing when file has a BOM.
137 |
138 | ## [0.7.0] - 2018-10-29
139 | ### Changed
140 | - `<` inside an attribute value is an error now.
141 | - `Token::Declaration` represents *standalone* as `bool` now.
142 | - XML declaration must be defined only once now.
143 | - XML declaration must start at 0 position.
144 | - DTD must be defined only once now.
145 |
146 | ## [0.6.1] - 2018-10-08
147 | ### Added
148 | - `Stream::curr_byte_unchecked`.
149 |
150 | ### Fixed
151 | - UTF-8 BOM processing.
152 |
153 | ## [0.6.0] - 2018-08-31
154 | ### Changed
155 | - `Reference::EntityRef` contains `&str` and not `StrSpan` now.
156 | - Rename `Stream::try_consume_char_reference` into `try_consume_reference`.
157 | And it will return `Reference` and not `char` now.
158 | - Rename `Tokenizer::set_fragment_mode` into `enable_fragment_mode`.
159 | - Rename `ErrorPos` into `TextPos`.
160 |
161 | ### Fixed
162 | - `TextPos` calculation via `Stream::gen_error_pos`.
163 |
164 | ### Removed
165 | - `TextUnescape` and `XmlSpace`, because they were useless.
166 |
167 | ## [0.5.0] - 2018-06-14
168 | ### Added
169 | - `StreamError::InvalidChar`.
170 | - `StreamError::InvalidSpace`.
171 | - `StreamError::InvalidString`.
172 |
173 | ### Changed
174 | - `Stream::consume_reference` will return only `InvalidReference` error from now.
175 | - `Error::InvalidTokenWithCause` merged into `Error::InvalidToken`.
176 | - `Stream::gen_error_pos_from` does not require `mut self` from now.
177 | - `StreamError::InvalidChar` requires `Vec` and not `String` from now.
178 | - `ErrorPos` uses `u32` and not `usize` from now.
179 |
180 | ### Removed
181 | - `failure` dependency.
182 | - `log` dependency.
183 |
184 | ## [0.4.1] - 2018-05-23
185 | ### Added
186 | - An ability to parse an XML fragment.
187 |
188 | ## [0.4.0] - 2018-04-21
189 | ### Changed
190 | - Relicense from MIT to MIT/Apache-2.0.
191 |
192 | ### Removed
193 | - `FromSpan` trait.
194 | - `from_str` and `from_span` methods are removed. Use the `From` trait instead.
195 |
196 | ## [0.3.0] - 2018-04-10
197 | ### Changed
198 | - Use `failure` instead of `error-chain`.
199 | - Minimum Rust version is 1.18.
200 | - New error messages.
201 | - `TokenType` is properly public now.
202 |
203 | ### Removed
204 | - `ChainedError`
205 |
206 | ## [0.2.0] - 2018-03-11
207 | ### Added
208 | - Qualified name parsing.
209 |
210 | ### Changed
211 | - **Breaking**. `Token::ElementStart` and `Token::Attribute` contain the prefix
212 | and the local part of the qualified name now.
213 |
214 | ## [0.1.2] - 2018-02-12
215 | ### Added
216 | - `Stream::skip_ascii_spaces`.
217 | - Small performance optimizations.
218 |
219 | ## [0.1.1] - 2018-01-17
220 | ### Changed
221 | - `log` 0.3 -> 0.4
222 |
223 | [Unreleased]: https://github.com/RazrFalcon/xmlparser/compare/v0.13.6...HEAD
224 | [0.13.6]: https://github.com/RazrFalcon/xmlparser/compare/v0.13.5...v0.13.6
225 | [0.13.5]: https://github.com/RazrFalcon/xmlparser/compare/v0.13.4...v0.13.5
226 | [0.13.4]: https://github.com/RazrFalcon/xmlparser/compare/v0.13.3...v0.13.4
227 | [0.13.3]: https://github.com/RazrFalcon/xmlparser/compare/v0.13.2...v0.13.3
228 | [0.13.2]: https://github.com/RazrFalcon/xmlparser/compare/v0.13.1...v0.13.2
229 | [0.13.1]: https://github.com/RazrFalcon/xmlparser/compare/v0.13.0...v0.13.1
230 | [0.13.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.12.0...v0.13.0
231 | [0.12.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.11.0...v0.12.0
232 | [0.11.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.10.0...v0.11.0
233 | [0.10.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.9.0...v0.10.0
234 | [0.9.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.8.1...v0.9.0
235 | [0.8.1]: https://github.com/RazrFalcon/xmlparser/compare/v0.8.0...v0.8.1
236 | [0.8.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.7.0...v0.8.0
237 | [0.7.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.6.1...v0.7.0
238 | [0.6.1]: https://github.com/RazrFalcon/xmlparser/compare/v0.6.0...v0.6.1
239 | [0.6.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.5.0...v0.6.0
240 | [0.5.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.4.1...v0.5.0
241 | [0.4.1]: https://github.com/RazrFalcon/xmlparser/compare/v0.4.0...v0.4.1
242 | [0.4.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.3.0...v0.4.0
243 | [0.3.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.2.0...v0.3.0
244 | [0.2.0]: https://github.com/RazrFalcon/xmlparser/compare/v0.1.2...v0.2.0
245 | [0.1.2]: https://github.com/RazrFalcon/xmlparser/compare/v0.1.1...v0.1.2
246 | [0.1.1]: https://github.com/RazrFalcon/xmlparser/compare/v0.1.0...v0.1.1
247 |
--------------------------------------------------------------------------------
/LICENSE-APACHE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/src/stream.rs:
--------------------------------------------------------------------------------
1 | use core::char;
2 | use core::cmp;
3 | use core::ops::Range;
4 | use core::str;
5 |
6 | use crate::{StrSpan, StreamError, TextPos, XmlByteExt, XmlCharExt};
7 |
8 | type Result<T> = ::core::result::Result<T, StreamError>; // Shorthand: every fallible `Stream` method fails with `StreamError`.
9 |
10 | /// Representation of the [Reference](https://www.w3.org/TR/xml/#NT-Reference) value.
11 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
12 | pub enum Reference<'a> {
13 | /// An entity reference other than the five predefined ones; holds the entity name.
14 | ///
15 | /// <https://www.w3.org/TR/xml/#NT-EntityRef>
16 | Entity(&'a str),
17 |
18 | /// A character reference, or a predefined entity (`quot`, `amp`, `apos`, `lt`, `gt`), resolved to its char.
19 | ///
20 | /// <https://www.w3.org/TR/xml/#NT-CharRef>
21 | Char(char),
22 | }
23 |
24 | /// A streaming XML parsing interface: a byte cursor (`pos`..`end`) over a `StrSpan`.
25 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
26 | pub struct Stream<'a> {
27 | pos: usize, // Current byte offset into `span`.
28 | end: usize, // Byte offset at which the stream is considered finished (see `at_end`).
29 | span: StrSpan<'a>, // The text being read.
30 | }
31 |
32 | impl<'a> From<&'a str> for Stream<'a> {
33 | #[inline]
34 | fn from(text: &'a str) -> Self { // Builds a stream that covers the whole of `text`.
35 | Stream {
36 | pos: 0, // Start reading at the first byte.
37 | end: text.len(), // Stop at the end of the text.
38 | span: text.into(),
39 | }
40 | }
41 | }
42 |
43 | impl<'a> From<StrSpan<'a>> for Stream<'a> {
44 | #[inline]
45 | fn from(span: StrSpan<'a>) -> Self { // Builds a stream that covers the whole of `span`.
46 | Stream {
47 | pos: 0, // Start reading at the first byte of the span's string.
48 | end: span.as_str().len(), // Stop at the end of the span's string.
49 | span,
50 | }
51 | }
52 | }
53 |
54 | impl<'a> Stream<'a> {
55 | /// Creates a new stream over `text` that reads only the byte range `fragment` (the range is not validated here).
56 | #[inline]
57 | pub fn from_substr(text: &'a str, fragment: Range<usize>) -> Self {
58 | Stream {
59 | pos: fragment.start,
60 | end: fragment.end,
61 | span: text.into(),
62 | }
63 | }
64 |
65 | /// Returns the underlying string span.
66 | #[inline]
67 | pub fn span(&self) -> StrSpan<'a> {
68 | self.span
69 | }
70 |
71 | /// Returns the current position as a byte offset into the underlying text.
72 | #[inline]
73 | pub fn pos(&self) -> usize {
74 | self.pos
75 | }
76 |
77 | /// Sets the current position equal to the end, so that `at_end()` returns `true`.
78 | ///
79 | /// Used to indicate the end of parsing on error.
80 | #[inline]
81 | pub fn jump_to_end(&mut self) {
82 | self.pos = self.end;
83 | }
84 |
85 | /// Checks if the stream has reached the end.
86 | ///
87 | /// Any [`pos()`] value equal to or larger than the stream end indicates stream end.
88 | ///
89 | /// Accessing the stream after reaching the end via safe methods will produce
90 | /// an `UnexpectedEndOfStream` error.
91 | ///
92 | /// Accessing the stream after reaching the end via `*_unchecked` methods can trigger
93 | /// a Rust bounds-checking panic.
94 | ///
95 | /// [`pos()`]: #method.pos
96 | #[inline]
97 | pub fn at_end(&self) -> bool {
98 | self.pos >= self.end
99 | }
100 |
101 | /// Returns the byte at the current stream position, without advancing.
102 | ///
103 | /// # Errors
104 | ///
105 | /// - `UnexpectedEndOfStream` - if the stream is at the end
106 | #[inline]
107 | pub fn curr_byte(&self) -> Result<u8> {
108 | if self.at_end() {
109 | return Err(StreamError::UnexpectedEndOfStream);
110 | }
111 |
112 | Ok(self.curr_byte_unchecked())
113 | }
114 |
115 | /// Returns the byte at the current stream position without an end-of-stream check.
116 | ///
117 | /// # Panics
118 | ///
119 | /// - if the current position is at or after the end of the data
120 | #[inline]
121 | pub fn curr_byte_unchecked(&self) -> u8 {
122 | self.span.as_bytes()[self.pos]
123 | }
124 |
125 | /// Returns the byte that follows the current stream position, without advancing.
126 | ///
127 | /// # Errors
128 | ///
129 | /// - `UnexpectedEndOfStream` - if there is no byte after the current one
130 | #[inline]
131 | pub fn next_byte(&self) -> Result<u8> {
132 | if self.pos + 1 >= self.end {
133 | return Err(StreamError::UnexpectedEndOfStream);
134 | }
135 |
136 | Ok(self.span.as_bytes()[self.pos + 1])
137 | }
138 |
139 | /// Advances by `n` bytes. The bounds are checked only by a `debug_assert!`.
140 | ///
141 | /// # Examples
142 | ///
143 | /// ```rust,should_panic
144 | /// use xmlparser::Stream;
145 | ///
146 | /// let mut s = Stream::from("text");
147 | /// s.advance(2); // ok
148 | /// s.advance(20); // will cause a panic via debug_assert!().
149 | /// ```
150 | #[inline]
151 | pub fn advance(&mut self, n: usize) {
152 | debug_assert!(self.pos + n <= self.end);
153 | self.pos += n;
154 | }
155 |
156 | /// Checks whether the remaining data (current position to the end) starts with `text`.
157 | ///
158 | /// We are using `&[u8]` instead of `&str` for performance reasons.
159 | ///
160 | /// # Examples
161 | ///
162 | /// ```
163 | /// use xmlparser::Stream;
164 | ///
165 | /// let mut s = Stream::from("Some text.");
166 | /// s.advance(5);
167 | /// assert_eq!(s.starts_with(b"text"), true);
168 | /// assert_eq!(s.starts_with(b"long"), false);
169 | /// ```
170 | #[inline]
171 | pub fn starts_with(&self, text: &[u8]) -> bool {
172 | self.span.as_bytes()[self.pos..self.end].starts_with(text)
173 | }
174 |
175 | /// Consumes the current byte if it's equal to the provided byte.
176 | ///
177 | /// # Errors
178 | ///
179 | /// - `InvalidChar` - if the current byte differs from `c` (the stream is not advanced)
180 | /// - `UnexpectedEndOfStream` - if the stream is at the end
181 | ///
182 | /// # Examples
183 | ///
184 | /// ```
185 | /// use xmlparser::Stream;
186 | ///
187 | /// let mut s = Stream::from("Some text.");
188 | /// assert!(s.consume_byte(b'S').is_ok());
189 | /// assert!(s.consume_byte(b'o').is_ok());
190 | /// assert!(s.consume_byte(b'm').is_ok());
191 | /// assert!(s.consume_byte(b'q').is_err());
192 | /// ```
193 | pub fn consume_byte(&mut self, c: u8) -> Result<()> {
194 | let curr = self.curr_byte()?;
195 | if curr != c {
196 | return Err(StreamError::InvalidChar(curr, c, self.gen_text_pos())); // Carries the actual byte, the expected byte and the text position.
197 | }
198 |
199 | self.advance(1);
200 | Ok(())
201 | }
202 |
203 | /// Tries to consume the current byte if it's equal to the provided byte.
204 | ///
205 | /// Unlike `consume_byte()`, a mismatch or the end of the stream yields `false` instead of an error.
206 | pub fn try_consume_byte(&mut self, c: u8) -> bool {
207 | match self.curr_byte() {
208 | Ok(b) if b == c => {
209 | self.advance(1);
210 | true
211 | }
212 | _ => false, // Different byte or end of stream: the position is left unchanged.
213 | }
214 | }
215 |
216 | /// Skips `text` if the remaining data starts with it.
217 | ///
218 | /// # Errors
219 | ///
220 | /// - `InvalidString` - if the remaining data does not start with `text` (the stream is not advanced)
221 | pub fn skip_string(&mut self, text: &'static [u8]) -> Result<()> {
222 | if !self.starts_with(text) {
223 | let pos = self.gen_text_pos();
224 |
225 | // Assume that all input `text` values are valid UTF-8 strings, so unwrap is safe.
226 | let expected = str::from_utf8(text).unwrap();
227 |
228 | return Err(StreamError::InvalidString(expected, pos));
229 | }
230 |
231 | self.advance(text.len());
232 | Ok(())
233 | }
234 |
235 | /// Consumes bytes while the predicate `f` returns `true` and returns them as a span.
236 | ///
237 | /// The result can be empty.
238 | #[inline]
239 | pub fn consume_bytes<F>(&mut self, f: F) -> StrSpan<'a>
240 | where
241 | F: Fn(&Stream, u8) -> bool,
242 | {
243 | let start = self.pos;
244 | self.skip_bytes(f);
245 | self.slice_back(start)
246 | }
247 |
248 | /// Skips bytes while the predicate `f` returns `true`; stops at the end of the stream.
249 | pub fn skip_bytes<F>(&mut self, f: F)
250 | where
251 | F: Fn(&Stream, u8) -> bool,
252 | {
253 | while !self.at_end() && f(self, self.curr_byte_unchecked()) {
254 | self.advance(1);
255 | }
256 | }
257 |
258 | /// Consumes chars while the predicate `f` returns `true` and returns them as a span.
259 | ///
260 | /// The result can be empty. Errors from `skip_chars()` are propagated.
261 | #[inline]
262 | pub fn consume_chars<F>(&mut self, f: F) -> Result<StrSpan<'a>>
263 | where
264 | F: Fn(&Stream, char) -> bool,
265 | {
266 | let start = self.pos;
267 | self.skip_chars(f)?;
268 | Ok(self.slice_back(start))
269 | }
270 |
271 | /// Skips chars while the predicate `f` returns `true`; fails with `NonXmlChar` on a char that is not a valid XML char.
272 | #[inline]
273 | pub fn skip_chars<F>(&mut self, f: F) -> Result<()>
274 | where
275 | F: Fn(&Stream, char) -> bool,
276 | {
277 | for c in self.chars() {
278 | if !c.is_xml_char() {
279 | return Err(StreamError::NonXmlChar(c, self.gen_text_pos()));
280 | } else if f(self, c) {
281 | self.advance(c.len_utf8()); // Advance by the char's encoded length so `pos` stays on a char boundary.
282 | } else {
283 | break;
284 | }
285 | }
286 |
287 | Ok(())
288 | }
289 |
290 | #[inline]
291 | pub(crate) fn chars(&self) -> str::Chars<'a> { // Iterates over the chars from the current position to the stream end.
292 | self.span.as_str()[self.pos..self.end].chars()
293 | }
294 |
295 | /// Returns the span of data from the byte offset `pos` up to the current position.
296 | #[inline]
297 | pub fn slice_back(&self, pos: usize) -> StrSpan<'a> {
298 | self.span.slice_region(pos, self.pos)
299 | }
300 |
301 | /// Returns the span of data from the current position up to the stream end.
302 | #[inline]
303 | pub fn slice_tail(&self) -> StrSpan<'a> {
304 | self.span.slice_region(self.pos, self.end)
305 | }
306 |
307 | /// Skips whitespace bytes until a non-space byte or the end of the stream.
308 | ///
309 | /// Accepted values: `' ' \n \r \t`.
310 | #[inline]
311 | pub fn skip_spaces(&mut self) {
312 | while !self.at_end() && self.curr_byte_unchecked().is_xml_space() {
313 | self.advance(1);
314 | }
315 | }
316 |
317 | /// Checks if the stream starts with a space; returns `false` at the end of the stream.
318 | #[inline]
319 | pub fn starts_with_space(&self) -> bool {
320 | !self.at_end() && self.curr_byte_unchecked().is_xml_space()
321 | }
322 |
323 | /// Consumes whitespaces.
324 | ///
325 | /// Like [`skip_spaces()`], but checks that the first char is actually a space.
326 | ///
327 | /// [`skip_spaces()`]: #method.skip_spaces
328 | ///
329 | /// # Errors
330 | ///
331 | /// - `InvalidSpace` if the current byte is not a space; `UnexpectedEndOfStream` if the stream is already at the end
332 | pub fn consume_spaces(&mut self) -> Result<()> {
333 | if self.at_end() {
334 | return Err(StreamError::UnexpectedEndOfStream);
335 | }
336 |
337 | if !self.starts_with_space() {
338 | return Err(StreamError::InvalidSpace(
339 | self.curr_byte_unchecked(), // Safe to index: the end-of-stream case returned above.
340 | self.gen_text_pos(),
341 | ));
342 | }
343 |
344 | self.skip_spaces();
345 | Ok(())
346 | }
347 |
348 | /// Consumes an XML reference (entity or character) if there is one.
349 | ///
350 | /// On error, returns `None` and leaves the position at the original value.
351 | pub fn try_consume_reference(&mut self) -> Option<Reference<'a>> {
352 | let start = self.pos();
353 |
354 | // Consume reference on a copy of the stream, so a failure cannot move `self`.
355 | let mut s = *self;
356 | match s.consume_reference() {
357 | Ok(r) => {
358 | // If the current data is a reference then advance the current stream
359 | // by the number of bytes read by the copy.
360 | self.advance(s.pos() - start);
361 | Some(r)
362 | }
363 | Err(_) => None,
364 | }
365 | }
366 |
367 | /// Consumes an XML reference.
368 | ///
369 | /// Consumes according to: <https://www.w3.org/TR/xml/#NT-Reference>
370 | ///
371 | /// # Errors
372 | ///
373 | /// - `InvalidReference` - every underlying failure is mapped to this error
374 | pub fn consume_reference(&mut self) -> Result<Reference<'a>> {
375 | self._consume_reference()
376 | .map_err(|_| StreamError::InvalidReference)
377 | }
378 |
379 | #[inline(never)]
380 | fn _consume_reference(&mut self) -> Result<Reference<'a>> { // Parses `&...;`; the public wrapper collapses all errors into `InvalidReference`.
381 | if !self.try_consume_byte(b'&') {
382 | return Err(StreamError::InvalidReference);
383 | }
384 |
385 | let reference = if self.try_consume_byte(b'#') { // `&#` starts a character reference.
386 | let (value, radix) = if self.try_consume_byte(b'x') { // `&#x` selects hexadecimal digits.
387 | let value = self.consume_bytes(|_, c| c.is_xml_hex_digit()).as_str();
388 | (value, 16)
389 | } else {
390 | let value = self.consume_bytes(|_, c| c.is_xml_digit()).as_str();
391 | (value, 10)
392 | };
393 |
394 | let n = u32::from_str_radix(value, radix).map_err(|_| StreamError::InvalidReference)?; // Fails on empty or overflowing digits.
395 |
396 | let c = char::from_u32(n).unwrap_or('\u{FFFD}'); // A value that is not a valid `char` becomes U+FFFD.
397 | if !c.is_xml_char() {
398 | return Err(StreamError::InvalidReference);
399 | }
400 |
401 | Reference::Char(c)
402 | } else {
403 | let name = self.consume_name()?;
404 | match name.as_str() { // The five predefined entities resolve directly to their char.
405 | "quot" => Reference::Char('"'),
406 | "amp" => Reference::Char('&'),
407 | "apos" => Reference::Char('\''),
408 | "lt" => Reference::Char('<'),
409 | "gt" => Reference::Char('>'),
410 | _ => Reference::Entity(name.as_str()),
411 | }
412 | };
413 |
414 | self.consume_byte(b';')?; // Every reference must be terminated by `;`.
415 |
416 | Ok(reference)
417 | }
418 |
419 | /// Consumes an XML name and returns it.
420 | ///
421 | /// Consumes according to: <https://www.w3.org/TR/xml/#NT-Name>
422 | ///
423 | /// # Errors
424 | ///
425 | /// - `InvalidName` - if name is empty or starts with an invalid char
426 | /// - `UnexpectedEndOfStream`
427 | pub fn consume_name(&mut self) -> Result<StrSpan<'a>> {
428 | let start = self.pos();
429 | self.skip_name()?;
430 |
431 | let name = self.slice_back(start);
432 | if name.is_empty() { // `skip_name()` accepts an exhausted stream, so emptiness is checked here.
433 | return Err(StreamError::InvalidName);
434 | }
435 |
436 | Ok(name)
437 | }
438 |
439 | /// Skips an XML name.
440 | ///
441 | /// Like `consume_name()`, but does not return the name and does not reject an exhausted stream.
442 | ///
443 | /// # Errors
444 | ///
445 | /// - `InvalidName` - if the first char is not a valid name start char
446 | pub fn skip_name(&mut self) -> Result<()> {
447 | let mut iter = self.chars();
448 | if let Some(c) = iter.next() { // No char at all (end of stream) falls through to `Ok(())`.
449 | if c.is_xml_name_start() {
450 | self.advance(c.len_utf8());
451 | } else {
452 | return Err(StreamError::InvalidName);
453 | }
454 | }
455 |
456 | for c in iter { // The remaining chars only need to be name chars.
457 | if c.is_xml_name() {
458 | self.advance(c.len_utf8());
459 | } else {
460 | break;
461 | }
462 | }
463 |
464 | Ok(())
465 | }
466 |
467 | /// Consumes a qualified XML name and returns it as a `(prefix, local)` pair; the prefix is empty when there is no `:`.
468 | ///
469 | /// Consumes according to: <https://www.w3.org/TR/xml-names/#ns-qualnames>
470 | ///
471 | /// # Errors
472 | ///
473 | /// - `InvalidName` - if the local name is empty, a part starts with an invalid char, or there is more than one `:`
474 | #[inline(never)]
475 | pub fn consume_qname(&mut self) -> Result<(StrSpan<'a>, StrSpan<'a>)> {
476 | let start = self.pos();
477 |
478 | let mut splitter = None; // Byte offset of the `:` separator, once seen.
479 |
480 | while !self.at_end() {
481 | // Check for ASCII first for performance reasons.
482 | let b = self.curr_byte_unchecked();
483 | if b < 128 {
484 | if b == b':' {
485 | if splitter.is_none() {
486 | splitter = Some(self.pos());
487 | self.advance(1);
488 | } else {
489 | // Multiple `:` is an error.
490 | return Err(StreamError::InvalidName);
491 | }
492 | } else if b.is_xml_name() {
493 | self.advance(1);
494 | } else {
495 | break;
496 | }
497 | } else {
498 | // Fallback to Unicode code point.
499 | match self.chars().next() {
500 | Some(c) if c.is_xml_name() => {
501 | self.advance(c.len_utf8());
502 | }
503 | _ => break,
504 | }
505 | }
506 | }
507 |
508 | let (prefix, local) = if let Some(splitter) = splitter {
509 | let prefix = self.span().slice_region(start, splitter);
510 | let local = self.slice_back(splitter + 1); // Skip the `:` itself.
511 | (prefix, local)
512 | } else {
513 | let local = self.slice_back(start);
514 | ("".into(), local)
515 | };
516 |
517 | // Prefix must start with a `NameStartChar`.
518 | if let Some(c) = prefix.as_str().chars().next() {
519 | if !c.is_xml_name_start() {
520 | return Err(StreamError::InvalidName);
521 | }
522 | }
523 |
524 | // Local name must start with a `NameStartChar`.
525 | if let Some(c) = local.as_str().chars().next() {
526 | if !c.is_xml_name_start() {
527 | return Err(StreamError::InvalidName);
528 | }
529 | } else {
530 | // If empty - error.
531 | return Err(StreamError::InvalidName);
532 | }
533 |
534 | Ok((prefix, local))
535 | }
536 |
537 | /// Consumes `=`, together with any whitespace around it.
538 | ///
539 | /// Consumes according to: <https://www.w3.org/TR/xml/#NT-Eq>
540 | ///
541 | /// # Errors
542 | ///
543 | /// - `InvalidChar`
544 | /// - `UnexpectedEndOfStream`
545 | pub fn consume_eq(&mut self) -> Result<()> {
546 | self.skip_spaces();
547 | self.consume_byte(b'=')?;
548 | self.skip_spaces();
549 |
550 | Ok(())
551 | }
552 |
553 | /// Consumes a quote.
554 | ///
555 | /// Consumes `'` or `"` and returns it as a byte.
556 | ///
557 | /// # Errors
558 | ///
559 | /// - `InvalidQuote` - if the current byte is not a quote (the stream is not advanced)
560 | /// - `UnexpectedEndOfStream`
561 | pub fn consume_quote(&mut self) -> Result<u8> {
562 | let c = self.curr_byte()?;
563 | if c == b'\'' || c == b'"' {
564 | self.advance(1);
565 | Ok(c)
566 | } else {
567 | Err(StreamError::InvalidQuote(c, self.gen_text_pos()))
568 | }
569 | }
570 |
571 | /// Calculates the current absolute position as a 1-based row and column.
572 | ///
573 | /// This operation is very expensive. Use only for errors.
574 | #[inline(never)]
575 | pub fn gen_text_pos(&self) -> TextPos {
576 | let text = self.span.as_str();
577 | let end = self.pos;
578 |
579 | let row = Self::calc_curr_row(text, end); // Scans the text from its start up to `end`.
580 | let col = Self::calc_curr_col(text, end); // Scans backwards from `end` to the previous newline.
581 | TextPos::new(row, col)
582 | }
583 |
584 | /// Calculates an absolute position at the byte offset `pos`, clamped to the text length.
585 | ///
586 | /// This operation is very expensive. Use only for errors.
587 | ///
588 | /// # Examples
589 | ///
590 | /// ```
591 | /// let s = xmlparser::Stream::from("text");
592 | ///
593 | /// assert_eq!(s.gen_text_pos_from(2), xmlparser::TextPos::new(1, 3));
594 | /// assert_eq!(s.gen_text_pos_from(9999), xmlparser::TextPos::new(1, 5));
595 | /// ```
596 | #[inline(never)]
597 | pub fn gen_text_pos_from(&self, pos: usize) -> TextPos {
598 | let mut s = *self; // Work on a copy so the caller's position is untouched.
599 | s.pos = cmp::min(pos, s.span.as_str().len());
600 | s.gen_text_pos()
601 | }
602 |
603 | fn calc_curr_row(text: &str, end: usize) -> u32 { // 1-based row: one plus the number of `\n` bytes before `end`.
604 | let mut row = 1;
605 | for c in &text.as_bytes()[..end] {
606 | if *c == b'\n' {
607 | row += 1;
608 | }
609 | }
610 |
611 | row
612 | }
613 |
614 | fn calc_curr_col(text: &str, end: usize) -> u32 { // 1-based column, counted in chars rather than bytes.
615 | let mut col = 1;
616 | for c in text[..end].chars().rev() { // Walk backwards from `end` until the previous newline.
617 | if c == '\n' {
618 | break;
619 | } else {
620 | col += 1;
621 | }
622 | }
623 |
624 | col
625 | }
626 | }
627 |
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! [GitHub repository](https://github.com/RazrFalcon/xmlparser)
2 | //! [crates.io](https://crates.io/crates/xmlparser)
3 | //! [docs.rs](https://docs.rs/xmlparser)
4 | //!
5 | //! *xmlparser* is a low-level, pull-based, zero-allocation
6 | //! [XML 1.0](https://www.w3.org/TR/xml/) parser.
7 | //!
8 | //!
9 | //!
10 | //! ## Example
11 | //!
12 | //! ```rust
13 | //! for token in xmlparser::Tokenizer::from("<tagname name='value'/>") {
14 | //! println!("{:?}", token);
15 | //! }
16 | //! ```
17 | //!
18 | //!
19 | //!
20 | //! ## Why a new library?
21 | //!
22 | //! This library is basically a low-level XML tokenizer that preserves the
23 | //! positions of the tokens and is not intended to be used directly.
24 | //!
25 | //! If you are looking for a higher level solution, check out
26 | //! [roxmltree](https://github.com/RazrFalcon/roxmltree).
27 | //!
28 | //!
29 | //!
30 | //! ## Benefits
31 | //!
32 | //! - All tokens contain `StrSpan` structs which represent the position of the
33 | //! substring in the original document.
34 | //! - Good error processing. All error types contain the position (line:column)
35 | //! where it occurred.
36 | //! - No heap allocations.
37 | //! - No dependencies.
38 | //! - Tiny. ~1400 LOC and ~30KiB in the release build according to
39 | //! `cargo-bloat`.
40 | //! - Supports `no_std` builds. To use without the standard library, disable the
41 | //! default features.
42 | //!
43 | //!
44 | //!
45 | //! ## Limitations
46 | //!
47 | //! - Currently, only ENTITY objects are parsed from the DOCTYPE. All others are
48 | //! ignored.
49 | //! - No tree structure validation. So an XML like
50 | //!   `<root><child></root></child>` or a string without root element will be
51 | //!   parsed without errors. You should check for this manually. On the other
52 | //!   hand `<a/><a/>` will lead to an error.
53 | //! - Duplicated attributes are not an error. So XML like `<item a="v1" a="v2"/>`
54 | //!   will be parsed without errors. You should check for this manually.
55 | //! - UTF-8 only.
56 | //!
57 | //!
58 | //!
59 | //! ## Safety
60 | //!
61 | //! - The library must not panic. Any panic is considered a critical bug and
62 | //! should be reported.
63 | //! - The library forbids unsafe code.
64 | //!
65 | //!
66 | //!
67 | //! ## License
68 | //!
69 | //! Licensed under either of
70 | //!
71 | //! - Apache License, Version 2.0 ([LICENSE-APACHE] or
72 | //! http://www.apache.org/licenses/LICENSE-2.0)
73 | //! - MIT license ([LICENSE-MIT] or http://opensource.org/licenses/MIT)
74 | //!
75 | //! at your option.
76 | //!
77 | //!
78 | //!
79 | //! ### Contribution
80 | //!
81 | //! Unless you explicitly state otherwise, any contribution intentionally submitted
82 | //! for inclusion in the work by you, as defined in the Apache-2.0 license, shall be
83 | //! dual licensed as above, without any additional terms or conditions.
84 | //!
85 | //! [LICENSE-APACHE]: https://github.com/RazrFalcon/xmlparser/blob/master/LICENSE-APACHE
86 | //! [LICENSE-MIT]: https://github.com/RazrFalcon/xmlparser/blob/master/LICENSE-MIT
87 |
88 | #![no_std]
89 | #![forbid(unsafe_code)]
90 | #![warn(missing_docs)]
91 | #![allow(ellipsis_inclusive_range_patterns)]
92 |
93 | #[cfg(feature = "std")]
94 | #[macro_use]
95 | extern crate std;
96 |
97 | // Pattern-test helper: expands to `true` when the expression matches the given
98 | // pattern and to `false` otherwise. Everything after the first comma is pasted in
99 | // as the match arm, so `|` alternatives and `if` guards work.
100 | macro_rules! matches {
101 |     ($value:expr, $($pat:tt)+) => {
102 |         match $value { $($pat)+ => true, _ => false }
103 |     };
104 | }
105 |
106 | mod error;
107 | mod stream;
108 | mod strspan;
109 | mod xmlchar;
110 |
111 | pub use crate::error::*;
112 | pub use crate::stream::*;
113 | pub use crate::strspan::*;
114 | pub use crate::xmlchar::*;
115 |
116 | /// An XML token.
117 | #[allow(missing_docs)]
118 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
119 | pub enum Token<'a> {
120 | /// Declaration token.
121 | ///
122 | /// ```text
123 | ///
124 | /// --- - version
125 | /// ----- - encoding?
126 | /// --- - standalone?
127 | /// ------------------------------------------------------- - span
128 | /// ```
129 | Declaration {
130 | version: StrSpan<'a>,
131 | encoding: Option>,
132 | standalone: Option,
133 | span: StrSpan<'a>,
134 | },
135 |
136 | /// Processing instruction token.
137 | ///
138 | /// ```text
139 | ///
140 | /// ------ - target
141 | /// ------- - content?
142 | /// ------------------ - span
143 | /// ```
144 | ProcessingInstruction {
145 | target: StrSpan<'a>,
146 | content: Option>,
147 | span: StrSpan<'a>,
148 | },
149 |
150 | /// Comment token.
151 | ///
152 | /// ```text
153 | ///
154 | /// ------ - text
155 | /// ------------- - span
156 | /// ```
157 | Comment {
158 | text: StrSpan<'a>,
159 | span: StrSpan<'a>,
160 | },
161 |
162 | /// DOCTYPE start token.
163 | ///
164 | /// ```text
165 | /// ,
172 | external_id: Option>,
173 | span: StrSpan<'a>,
174 | },
175 |
176 | /// Empty DOCTYPE token.
177 | ///
178 | /// ```text
179 | ///
180 | /// -------- - name
181 | /// ------------------ - external_id?
182 | /// -------------------------------------- - span
183 | /// ```
184 | EmptyDtd {
185 | name: StrSpan<'a>,
186 | external_id: Option>,
187 | span: StrSpan<'a>,
188 | },
189 |
190 | /// ENTITY token.
191 | ///
192 | /// Can appear only inside the DTD.
193 | ///
194 | /// ```text
195 | ///
196 | /// --------- - name
197 | /// --------------- - definition
198 | /// ------------------------------------- - span
199 | /// ```
200 | EntityDeclaration {
201 | name: StrSpan<'a>,
202 | definition: EntityDefinition<'a>,
203 | span: StrSpan<'a>,
204 | },
205 |
206 | /// DOCTYPE end token.
207 | ///
208 | /// ```text
209 | ///
212 | /// -- - span
213 | /// ```
214 | DtdEnd { span: StrSpan<'a> },
215 |
216 | /// Element start token.
217 | ///
218 | /// ```text
219 | ///
220 | /// -- - prefix
221 | /// ---- - local
222 | /// -------- - span
223 | /// ```
224 | ElementStart {
225 | prefix: StrSpan<'a>,
226 | local: StrSpan<'a>,
227 | span: StrSpan<'a>,
228 | },
229 |
230 | /// Attribute token.
231 | ///
232 | /// ```text
233 | ///
234 | /// -- - prefix
235 | /// ---- - local
236 | /// ----- - value
237 | /// --------------- - span
238 | /// ```
239 | Attribute {
240 | prefix: StrSpan<'a>,
241 | local: StrSpan<'a>,
242 | value: StrSpan<'a>,
243 | span: StrSpan<'a>,
244 | },
245 |
246 | /// Element end token.
247 | ///
248 | /// ```text
249 | /// text
250 | /// - ElementEnd::Open
251 | /// - - span
252 | /// ```
253 | ///
254 | /// ```text
255 | /// text
256 | /// -- ---- - ElementEnd::Close(prefix, local)
257 | /// ---------- - span
258 | /// ```
259 | ///
260 | /// ```text
261 | ///
262 | /// - ElementEnd::Empty
263 | /// -- - span
264 | /// ```
265 | ElementEnd {
266 | end: ElementEnd<'a>,
267 | span: StrSpan<'a>,
268 | },
269 |
270 | /// Text token.
271 | ///
272 | /// Contains text between elements including whitespaces.
273 | /// Basically everything between `>` and `<`.
274 | /// Except `]]>`, which is not allowed and will lead to an error.
275 | ///
276 | /// ```text
277 | /// text
278 | /// ------ - text
279 | /// ```
280 | ///
281 | /// The token span is equal to the `text`.
282 | Text { text: StrSpan<'a> },
283 |
284 | /// CDATA token.
285 | ///
286 | /// ```text
287 | ///
288 | /// ---- - text
289 | /// ---------------- - span
290 | /// ```
291 | Cdata {
292 | text: StrSpan<'a>,
293 | span: StrSpan<'a>,
294 | },
295 | }
296 |
297 | impl<'a> Token<'a> {
298 |     /// Returns the [`StrSpan`] that covers the whole token.
299 |     pub fn span(&self) -> StrSpan<'a> {
300 |         match *self {
301 |             // `Text` has no `span` field; its `text` already covers the token.
302 |             Token::Text { text, .. } => text,
303 |             Token::Declaration { span, .. }
304 |             | Token::ProcessingInstruction { span, .. }
305 |             | Token::Comment { span, .. }
306 |             | Token::DtdStart { span, .. }
307 |             | Token::EmptyDtd { span, .. }
308 |             | Token::EntityDeclaration { span, .. }
309 |             | Token::DtdEnd { span, .. }
310 |             | Token::ElementStart { span, .. }
311 |             | Token::Attribute { span, .. }
312 |             | Token::ElementEnd { span, .. }
313 |             | Token::Cdata { span, .. } => span,
314 |         }
315 |     }
316 | }
317 |
318 | /// The `end` value of a [`Token::ElementEnd`]: which kind of tag ending was parsed.
319 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
320 | pub enum ElementEnd<'a> {
321 | /// Indicates `>`
322 | Open,
323 | /// Indicates `</ns:name>`; holds the prefix and the local name, in that order.
324 | Close(StrSpan<'a>, StrSpan<'a>),
325 | /// Indicates `/>`
326 | Empty,
327 | }
328 |
329 | /// Representation of the [ExternalID](https://www.w3.org/TR/xml/#NT-ExternalID) value.
330 | #[allow(missing_docs)]
331 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
332 | pub enum ExternalId<'a> {
333 | System(StrSpan<'a>), // `SYSTEM` form: a single literal
334 | Public(StrSpan<'a>, StrSpan<'a>), // `PUBLIC` form: two literals; presumably public ID, then system literal — confirm
335 | }
336 |
337 | /// Representation of the [EntityDef](https://www.w3.org/TR/xml/#NT-EntityDef) value.
338 | #[allow(missing_docs)]
339 | #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
340 | pub enum EntityDefinition<'a> {
341 | EntityValue(StrSpan<'a>), // the entity is defined by a value span
342 | ExternalId(ExternalId<'a>), // the entity is defined by an external ID
343 | }
344 |
345 | type Result = core::result::Result;
346 | type StreamResult = core::result::Result;
347 |
348 | #[derive(Clone, Copy, PartialEq, Debug)]
349 | enum State { // Tokenizer state; `parse_next_impl` matches on it to pick what to parse next.
350 | Declaration, // initial state set by `From<&str>`
351 | AfterDeclaration, // set by `parse_next_impl` as soon as it runs in `Declaration`
352 | Dtd, // NOTE(review): presumably entered after a `Token::DtdStart` — confirm
353 | AfterDtd, // entered after a `Token::EmptyDtd`
354 | Elements, // initial state set by `from_fragment`
355 | Attributes,
356 | AfterElements,
357 | End,
358 | }
359 |
360 | /// Tokenizer for the XML structure; built with `From<&str>` or `from_fragment`.
361 | #[derive(Clone)]
362 | pub struct Tokenizer<'a> {
363 | stream: Stream<'a>, // the input stream that tokens are read from
364 | state: State, // selects the branch taken by `parse_next_impl`
365 | depth: usize, // starts at 0; NOTE(review): presumably the element nesting depth — confirm
366 | fragment_parsing: bool, // set to `true` by `from_fragment`, `false` by `From<&str>`
367 | }
368 |
369 | impl core::fmt::Debug for Tokenizer<'_> {
370 |     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
371 |         f.write_str("Tokenizer { ... }") // fixed placeholder; no fields are printed
372 |     }
373 | }
374 |
375 | impl<'a> From<&'a str> for Tokenizer<'a> {
376 |     #[inline]
377 |     fn from(text: &'a str) -> Self {
378 |         // Byte sequence of the UTF-8 byte order mark; skipped when the text starts with it.
379 |         const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
380 |         let mut input = Stream::from(text);
381 |         if input.starts_with(UTF8_BOM) {
382 |             input.advance(UTF8_BOM.len());
383 |         }
384 |
385 |         Self {
386 |             stream: input,
387 |             state: State::Declaration,
388 |             depth: 0,
389 |             fragment_parsing: false,
390 |         }
391 |     }
392 | }
393 |
394 | macro_rules! map_err_at {
395 |     ($call:expr, $input:expr, $variant:ident) => {{
396 |         let origin = $input.pos(); // position before `$call` runs; reported on failure
397 |         $call.map_err(|cause| Error::$variant(cause, $input.gen_text_pos_from(origin)))
398 |     }};
399 | }
400 |
401 | impl<'a> Tokenizer<'a> {
402 | /// Creates a tokenizer for a document fragment.
403 | ///
404 | /// By default, `xmlparser` will check for DTD, root element, etc., which makes a bare
405 | /// fragment fail. This constructor starts in the root element content parsing mode, so
406 | /// the `fragment` range of `full_text` is treated as content of the root element.
407 | pub fn from_fragment(full_text: &'a str, fragment: core::ops::Range) -> Self {
408 |     let stream = Stream::from_substr(full_text, fragment);
409 |     Self {
410 |         stream,
411 |         state: State::Elements,
412 |         depth: 0,
413 |         fragment_parsing: true,
414 |     }
415 | }
416 |
417 | fn parse_next_impl(&mut self) -> Option>> {
418 | let s = &mut self.stream;
419 |
420 | if s.at_end() {
421 | return None;
422 | }
423 |
424 | let start = s.pos();
425 |
426 | match self.state {
427 | State::Declaration => {
428 | self.state = State::AfterDeclaration;
429 | if s.starts_with(b" {
436 | if s.starts_with(b" self.state = State::Dtd,
440 | Ok(Token::EmptyDtd { .. }) => self.state = State::AfterDtd,
441 | _ => {}
442 | }
443 |
444 | Some(t)
445 | } else if s.starts_with(b"'
728 | fn parse_comment_impl(s: &mut Stream<'a>) -> StreamResult> {
729 |     let begin = s.pos();
730 |     s.advance(4); // presumably the `<!--` opener — the caller's check is not visible here
731 |     let body = s.consume_chars(|s, c| c != '-' || !s.starts_with(b"-->"))?;
732 |     s.skip_string(b"-->")?;
733 |
734 |     // `--` inside the comment and a body ending in `-` are both rejected.
735 |     let content = body.as_str();
736 |     if content.contains("--") {
737 |         Err(StreamError::InvalidCommentData)
738 |     } else if content.ends_with('-') {
739 |         Err(StreamError::InvalidCommentEnd)
740 |     } else {
741 |         let span = s.slice_back(begin);
742 |         Ok(Token::Comment { text: body, span })
743 |     }
744 | }
746 |
747 | fn parse_pi(s: &mut Stream<'a>) -> Result> {
748 | map_err_at!(Self::parse_pi_impl(s), s, InvalidPI) // a stream error becomes `Error::InvalidPI` with the position where parsing began
749 | }
750 |
751 | // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
752 | // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
753 | fn parse_pi_impl(s: &mut Stream<'a>) -> StreamResult> {
754 |     let begin = s.pos();
755 |     s.advance(2); // presumably the `<?` opener — the caller's check is not visible here
756 |     let target = s.consume_name()?;
757 |     s.skip_spaces();
758 |
759 |     // Everything up to the closing `?>` is the content; empty content is reported as `None`.
760 |     let raw = s.consume_chars(|s, c| c != '?' || !s.starts_with(b"?>"))?;
761 |     let content = if raw.is_empty() { None } else { Some(raw) };
762 |     s.skip_string(b"?>")?;
763 |
764 |     Ok(Token::ProcessingInstruction {
765 |         target,
766 |         content,
767 |         span: s.slice_back(begin),
768 |     })
769 | }
775 |
776 | fn parse_doctype(s: &mut Stream<'a>) -> Result> {
777 | map_err_at!(Self::parse_doctype_impl(s), s, InvalidDoctype) // a stream error becomes `Error::InvalidDoctype` with the position where parsing began
778 | }
779 |
780 | // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
781 | fn parse_doctype_impl(s: &mut Stream<'a>) -> StreamResult> {
782 |     static EXPECTED: &[u8] = b"[>";
783 |
784 |     let begin = s.pos();
785 |     s.advance(9); // presumably the `<!DOCTYPE` keyword — the caller's check is not visible here
786 |
787 |     s.consume_spaces()?;
788 |     let name = s.consume_name()?;
789 |     s.skip_spaces();
790 |
791 |     let external_id = Self::parse_external_id(s)?;
792 |     s.skip_spaces();
793 |
794 |     // Only `[` (yields a `DtdStart` token) or `>` (yields an `EmptyDtd` token) may follow.
795 |     let delimiter = s.curr_byte()?;
796 |     let has_subset = delimiter == b'[';
797 |     if !has_subset && delimiter != b'>' {
798 |         let pos = s.gen_text_pos();
799 |         return Err(StreamError::InvalidCharMultiple(delimiter, EXPECTED, pos));
800 |     }
801 |     s.advance(1);
802 |
803 |     // The span runs from the start of the declaration through the delimiter.
804 |     let span = s.slice_back(begin);
805 |     Ok(if has_subset {
806 |         Token::DtdStart { name, external_id, span }
807 |     } else {
808 |         Token::EmptyDtd { name, external_id, span }
809 |     })
810 | }
819 |
820 | // ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
821 | fn parse_external_id(s: &mut Stream<'a>) -> StreamResult