├── redirect-xml-rs
├── README.md
├── src
│ └── lib.rs
└── Cargo.toml
├── tests
├── cases
│ ├── quote.xml
│ ├── xmlnsquote.xml
│ └── autosar.xml
├── xmlts20130923.zip
├── documents
│ ├── sample_6.xml
│ ├── sample_5.xml
│ ├── sample_5_short.txt
│ ├── sample_6_full.txt
│ ├── sample_7.xml
│ ├── sample_3.xml
│ ├── sample_4.xml
│ ├── sample_4_short.txt
│ ├── sample_3_short.txt
│ ├── sample_7_full.txt
│ ├── sample_2.xml
│ ├── sample_4_full.txt
│ ├── sample_3_full.txt
│ ├── sample_2_short.txt
│ ├── sample_1.xml
│ ├── sample_1_short.txt
│ ├── sample_2_full.txt
│ ├── sample_8_coalesce_all.txt
│ ├── sample_8_coalesce_cwscdch.txt
│ ├── sample_1_full.txt
│ ├── sample_8.xml
│ ├── sample_8_c.txt
│ ├── sample_8_coalesce_wscdch.txt
│ ├── sample_8_wscdch.txt
│ ├── sample_8_full.txt
│ └── sample_8_wsch.txt
├── rmt-ns11.fail.txt
├── tests.xml
├── sun-valid.fail.txt
├── errata3e.fail.txt
├── ibm_oasis_valid.fail.txt
├── ibm_valid.fail.txt
├── xml11.fail.txt
├── errata2e.fail.txt
├── errata4e.fail.txt
├── rmt-ns10.fail.txt
├── sun-not-wf.fail.txt
├── streaming.rs
├── oasis.fail.txt
├── xmltest.fail.txt
└── xmlconf.rs
├── .github
├── FUNDING.yml
└── workflows
│ └── main.yml
├── .gitignore
├── README.md
├── .rustfmt.toml
├── Cargo.toml
├── src
├── lib.rs
├── macros.rs
├── reader
│ ├── parser
│ │ ├── inside_comment.rs
│ │ ├── inside_cdata.rs
│ │ ├── inside_closing_tag_name.rs
│ │ ├── inside_reference.rs
│ │ ├── inside_processing_instruction.rs
│ │ ├── inside_opening_tag.rs
│ │ ├── outside_tag.rs
│ │ └── inside_declaration.rs
│ ├── indexset.rs
│ └── events.rs
├── attribute.rs
├── writer.rs
├── escape.rs
├── common.rs
├── reader.rs
├── writer
│ ├── config.rs
│ └── events.rs
└── name.rs
├── LICENSE
├── examples
├── rewrite.rs
├── print_events.rs
└── xml-analyze.rs
├── Changelog.md
└── benches
└── bench.rs
/redirect-xml-rs/README.md:
--------------------------------------------------------------------------------
1 | ../README.md
--------------------------------------------------------------------------------
/tests/cases/quote.xml:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: ["kornelski"]
2 | liberapay: ["kornel"]
3 |
--------------------------------------------------------------------------------
/tests/cases/xmlnsquote.xml:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/tests/xmlts20130923.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netvl/xml-rs/HEAD/tests/xmlts20130923.zip
--------------------------------------------------------------------------------
/tests/documents/sample_6.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Hello
5 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.swo
3 | /doc
4 | *~
5 | target/
6 | Cargo.lock
7 | .idea/
8 | *.iml
9 | /tests/xmlconf/
10 | .DS_Store
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | The `xml-rs` project has a new home
2 | ===================================
3 |
4 | The current repository is:
5 |
6 | https://github.com/kornelski/xml-rs
7 |
8 |
--------------------------------------------------------------------------------
/tests/documents/sample_5.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | test ©≂̸
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.rustfmt.toml:
--------------------------------------------------------------------------------
1 | # rustfmt is too aggressive and introduces too many inconsistencies and questionable choices to be applied unconditionally.
2 | # Please do not use it.
3 | disable_all_formatting = true
4 |
--------------------------------------------------------------------------------
/tests/rmt-ns11.fail.txt:
--------------------------------------------------------------------------------
1 | rmt-ns11-003 003.xml 1.1 style prefix unbinding ; 10:16 Cannot undefine prefix 'a'
2 | rmt-ns11-004 004.xml 1.1 style prefix unbinding and rebinding ; 11:16 Cannot undefine prefix 'a'
3 |
--------------------------------------------------------------------------------
/tests/documents/sample_5_short.txt:
--------------------------------------------------------------------------------
1 | StartDocument(1.0, utf-8)
2 | Doctype("")
3 | StartElement(p)
4 | StartElement(a)
5 | Characters("test ©≂̸")
6 | EndElement(a)
7 | EndElement(p)
8 | EndDocument
9 |
--------------------------------------------------------------------------------
/tests/documents/sample_6_full.txt:
--------------------------------------------------------------------------------
1 | StartDocument(1.0, UTF-8)
2 | Whitespace("\n")
3 | ProcessingInstruction(xml-stylesheet="href=\"doc.xsl\"")
4 | Whitespace("\n\n")
5 | StartElement(doc)
6 | Characters("Hello")
7 | EndElement(doc)
8 | EndDocument
9 |
--------------------------------------------------------------------------------
/tests/tests.xml:
--------------------------------------------------------------------------------
1 |
2 | issue 152
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/tests/sun-valid.fail.txt:
--------------------------------------------------------------------------------
1 | not-sa03 not-sa03.xml A non-standalone document is valid if declared as such.; 19:20 Unexpected entity: internal
2 | v-pe00 pe00.xml Tests construction of internal entity replacement text, using an example in the XML specification. ; 2:12 Unexpected entity: book
3 |
--------------------------------------------------------------------------------
/tests/documents/sample_7.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 |
5 | why">
6 | ]>
7 |
8 | omg why
9 | &ersand;
10 | &rsq;
11 | &lb;
12 | &omg;
13 |
14 |
--------------------------------------------------------------------------------
/tests/errata3e.fail.txt:
--------------------------------------------------------------------------------
1 | rmt-e3e-12 E12.xml Default values for attributes may not contain references to external entities.
2 | rmt-e3e-13 E13.xml Even internal parameter entity references are enough to make undeclared entities into mere validity errors rather than well-formedness errors. ; 7:11 Unexpected entity: ent2
3 |
--------------------------------------------------------------------------------
/tests/documents/sample_3.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | test
5 | kkss" = ddd' >
6 | ddddd!e3-->
5 | test
6 | kkss" = ddd' >
7 | ddddd!e3-->
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
--------------------------------------------------------------------------------
/examples/rewrite.rs:
--------------------------------------------------------------------------------
1 | //! See <https://lib.rs/crates/svg-hush> for a real-world example.
2 |
3 | use std::fs::File;
4 | use std::io::BufReader;
5 | use std::path::Path;
6 | use xml::EmitterConfig;
7 | use xml::reader::{ParserConfig, Result};
8 |
9 | fn main() -> Result<(), Box> {
10 | let arg = std::env::args_os().nth(1);
11 | let file_path = Path::new(arg.as_deref().unwrap_or("tests/documents/sample_1.xml".as_ref()));
12 | let file = BufReader::new(File::open(file_path)
13 | .map_err(|e| format!("Can't open {}: {e}", file_path.display()))?);
14 |
15 | let mut reader = ParserConfig::default()
16 | .ignore_root_level_whitespace(true)
17 | .ignore_comments(false)
18 | .cdata_to_characters(true)
19 | .coalesce_characters(true)
20 | .create_reader(file);
21 |
22 | let stdout = std::io::stdout().lock();
23 |
24 | let mut writer = EmitterConfig::default()
25 | .create_writer(stdout);
26 |
27 | loop {
28 | let reader_event = reader.next()?;
29 |
30 | match reader_event {
31 | xml::reader::XmlEvent::EndDocument => break,
32 | xml::reader::XmlEvent::StartElement { name, mut attributes, namespace } => {
33 | let event = xml::writer::XmlEvent::StartElement {
34 | name: name.borrow(),
35 | namespace: namespace.borrow(),
36 | attributes: attributes.iter_mut().map(|attr| {
37 | attr.value = alternating_caps(&attr.value);
38 | attr.borrow()
39 | }).collect(),
40 | };
41 | writer.write(event)?;
42 | },
43 | xml::reader::XmlEvent::Characters(text) => {
44 | let text = alternating_caps(&text);
45 | let event = xml::writer::XmlEvent::Characters(&text);
46 | writer.write(event)?;
47 | },
48 | xml::reader::XmlEvent::Comment(text) => {
49 | let text = alternating_caps(&text);
50 | let event = xml::writer::XmlEvent::Comment(&text);
51 | writer.write(event)?;
52 | },
53 | other => {
54 | if let Some(writer_event) = other.as_writer_event() {
55 | writer.write(writer_event)?;
56 | }
57 | },
58 | }
59 | }
60 | Ok(())
61 | }
62 |
63 | fn alternating_caps(text: &str) -> String {
64 | text.chars().enumerate()
65 | .map(|(i, ch)| if i&1==0 { ch.to_ascii_uppercase() } else { ch.to_ascii_lowercase() })
66 | .collect()
67 | }
68 |
--------------------------------------------------------------------------------
/tests/sun-not-wf.fail.txt:
--------------------------------------------------------------------------------
1 | attlist01 attlist01.xml SGML's NUTOKEN is not allowed.
2 | attlist02 attlist02.xml SGML's NUTOKENS attribute type is not allowed.
3 | attlist03 attlist03.xml Comma doesn't separate enumerations, unlike in SGML.
4 | attlist04 attlist04.xml SGML's NUMBER attribute type is not allowed.
5 | attlist05 attlist05.xml SGML's NUMBERS attribute type is not allowed.
6 | attlist06 attlist06.xml SGML's NAME attribute type is not allowed.
7 | attlist07 attlist07.xml SGML's NAMES attribute type is not allowed.
8 | attlist08 attlist08.xml SGML's #CURRENT is not allowed.
9 | attlist09 attlist09.xml SGML's #CONREF is not allowed.
10 | cond01 cond01.xml Only INCLUDE and IGNORE are conditional section keywords
11 | cond02 cond02.xml Must have keyword in conditional sections
12 | content01 content01.xml No whitespace before "?" in content model
13 | content02 content02.xml No whitespace before "*" in content model
14 | content03 content03.xml No whitespace before "+" in content model
15 | decl01 decl01.xml External entities may not have standalone decls.
16 | nwf-dtd00 dtd00.xml Comma mandatory in content model
17 | nwf-dtd01 dtd01.xml Can't mix comma and vertical bar in content models
18 | dtd04 dtd04.xml PUBLIC literal must be quoted
19 | dtd05 dtd05.xml SYSTEM identifier must be quoted
20 | dtd07 dtd07.xml Text declarations (which optionally begin any external entity) are required to have "encoding=...".
21 | encoding07 encoding07.xml Text declarations (which optionally begin any external entity) are required to have "encoding=...".
22 | pi pi.xml No space between PI target name and data
23 | pubid01 pubid01.xml Illegal entity ref in public ID
24 | pubid02 pubid02.xml Illegal characters in public ID
25 | pubid03 pubid03.xml Illegal characters in public ID
26 | pubid04 pubid04.xml Illegal characters in public ID
27 | pubid05 pubid05.xml SGML-ism: public ID without system ID
28 | sgml04 sgml04.xml ATTLIST declarations apply to only one element, unlike SGML
29 | sgml05 sgml05.xml ELEMENT declarations apply to only one element, unlike SGML
30 | sgml06 sgml06.xml ATTLIST declarations are never global, unlike in SGML
31 | sgml07 sgml07.xml SGML Tag minimization specifications are not allowed
32 | sgml08 sgml08.xml SGML Tag minimization specifications are not allowed
33 | sgml09 sgml09.xml SGML Content model exception specifications are not allowed
34 | sgml10 sgml10.xml SGML Content model exception specifications are not allowed
35 | sgml11 sgml11.xml CDATA is not a valid content model spec
36 | sgml12 sgml12.xml RCDATA is not a valid content model spec
37 | sgml13 sgml13.xml SGML Unordered content models not allowed
38 |
--------------------------------------------------------------------------------
/examples/print_events.rs:
--------------------------------------------------------------------------------
1 | use std::fs::File;
2 | use std::io::BufReader;
3 | use xml::common::Position;
4 | use xml::reader::{ParserConfig, XmlEvent};
5 |
6 | fn main() {
7 | let file_path = std::env::args_os().nth(1).expect("Please specify a path to an XML file");
8 | let file = File::open(file_path).unwrap();
9 |
10 | let mut reader = ParserConfig::default()
11 | .ignore_root_level_whitespace(false)
12 | .create_reader(BufReader::new(file));
13 |
14 | loop {
15 | match reader.next() {
16 | Ok(e) => {
17 | print!("{}\t", reader.position());
18 |
19 | match e {
20 | XmlEvent::StartDocument { version, encoding, .. } => {
21 | println!("StartDocument({version}, {encoding})");
22 | },
23 | XmlEvent::EndDocument => {
24 | println!("EndDocument");
25 | break;
26 | },
27 | XmlEvent::ProcessingInstruction { name, data } => {
28 | println!("ProcessingInstruction({name}={:?})", data.as_deref().unwrap_or_default());
29 | },
30 | XmlEvent::StartElement { name, attributes, .. } => {
31 | if attributes.is_empty() {
32 | println!("StartElement({name})");
33 | } else {
34 | let attrs: Vec<_> = attributes
35 | .iter()
36 | .map(|a| format!("{}={:?}", &a.name, a.value))
37 | .collect();
38 | println!("StartElement({name} [{}])", attrs.join(", "));
39 | }
40 | },
41 | XmlEvent::EndElement { name } => {
42 | println!("EndElement({name})");
43 | },
44 | XmlEvent::Comment(data) => {
45 | println!(r#"Comment("{}")"#, data.escape_debug());
46 | },
47 | XmlEvent::CData(data) => println!(r#"CData("{}")"#, data.escape_debug()),
48 | XmlEvent::Characters(data) => {
49 | println!(r#"Characters("{}")"#, data.escape_debug());
50 | },
51 | XmlEvent::Whitespace(data) => {
52 | println!(r#"Whitespace("{}")"#, data.escape_debug());
53 | },
54 | XmlEvent::Doctype { syntax } => {
55 | println!(r#"Doctype("{}")"#, syntax.escape_debug());
56 | },
57 | }
58 | },
59 | Err(e) => {
60 | eprintln!("Error at {}: {e}", reader.position());
61 | break;
62 | },
63 | }
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/src/attribute.rs:
--------------------------------------------------------------------------------
1 | //! Contains XML attributes manipulation types and functions.
2 |
3 | use std::fmt;
4 |
5 | use crate::escape::{AttributeEscapes, Escaped};
6 | use crate::name::{Name, OwnedName};
7 |
8 | /// A borrowed version of an XML attribute.
9 | ///
10 | /// Consists of a borrowed qualified name and a borrowed string value.
11 | #[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
12 | pub struct Attribute<'a> {
13 | /// Attribute name.
14 | pub name: Name<'a>,
15 |
16 | /// Attribute value.
17 | pub value: &'a str,
18 | }
19 |
20 | impl fmt::Display for Attribute<'_> {
21 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
22 | write!(f, "{}=\"{}\"", self.name, Escaped::::new(self.value))
23 | }
24 | }
25 |
26 | impl<'a> Attribute<'a> {
27 | /// Creates an owned attribute out of this borrowed one.
28 | #[inline]
29 | #[must_use]
30 | pub fn to_owned(&self) -> OwnedAttribute {
31 | OwnedAttribute {
32 | name: self.name.into(),
33 | value: self.value.into(),
34 | }
35 | }
36 |
37 | /// Creates a borrowed attribute using the provided borrowed name and a borrowed string value.
38 | #[inline]
39 | #[must_use]
40 | pub const fn new(name: Name<'a>, value: &'a str) -> Self {
41 | Attribute { name, value }
42 | }
43 | }
44 |
45 | /// An owned version of an XML attribute.
46 | ///
47 | /// Consists of an owned qualified name and an owned string value.
48 | #[derive(Clone, Eq, PartialEq, Hash, Debug)]
49 | pub struct OwnedAttribute {
50 | /// Attribute name.
51 | pub name: OwnedName,
52 |
53 | /// Attribute value.
54 | pub value: String,
55 | }
56 |
57 | impl OwnedAttribute {
58 | /// Returns a borrowed `Attribute` out of this owned one.
59 | #[must_use]
60 | #[inline]
61 | pub fn borrow(&self) -> Attribute<'_> {
62 | Attribute {
63 | name: self.name.borrow(),
64 | value: &self.value,
65 | }
66 | }
67 |
68 | /// Creates a new owned attribute using the provided owned name and an owned string value.
69 | #[inline]
70 | pub fn new>(name: OwnedName, value: S) -> Self {
71 | Self { name, value: value.into() }
72 | }
73 | }
74 |
75 | impl fmt::Display for OwnedAttribute {
76 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
77 | write!(f, "{}=\"{}\"", self.name, Escaped::::new(&self.value))
78 | }
79 | }
80 |
81 | #[cfg(test)]
82 | mod tests {
83 | use super::Attribute;
84 |
85 | use crate::name::Name;
86 |
87 | #[test]
88 | fn attribute_display() {
89 | let attr = Attribute::new(
90 | Name::qualified("attribute", "urn:namespace", Some("n")),
91 | "its value with > & \" ' < weird symbols",
92 | );
93 |
94 | assert_eq!(
95 | &*attr.to_string(),
96 | "{urn:namespace}n:attribute=\"its value with &gt; &amp; &quot; &apos; &lt; weird symbols\""
97 | );
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/examples/xml-analyze.rs:
--------------------------------------------------------------------------------
1 | #![forbid(unsafe_code)]
2 |
3 | use std::collections::HashSet;
4 | use std::fs::File;
5 | use std::io::{self, BufReader, Read};
6 | use std::{cmp, env};
7 |
8 | use xml::ParserConfig;
9 | use xml::reader::XmlEvent;
10 |
11 | fn main() -> Result<(), Box> {
12 | let mut file;
13 | let mut stdin;
14 | let source: &mut dyn Read = if let Some(file_name) = env::args().nth(1) {
15 | file = File::open(file_name).map_err(|e| format!("Cannot open input file: {e}"))?;
16 | &mut file
17 | } else {
18 | stdin = io::stdin();
19 | &mut stdin
20 | };
21 |
22 | let reader = ParserConfig::new()
23 | .whitespace_to_characters(true)
24 | .ignore_comments(false)
25 | .create_reader(BufReader::new(source));
26 |
27 | let mut processing_instructions = 0;
28 | let mut elements = 0;
29 | let mut character_blocks = 0;
30 | let mut cdata_blocks = 0;
31 | let mut characters = 0;
32 | let mut comment_blocks = 0;
33 | let mut comment_characters = 0;
34 | let mut namespaces = HashSet::new();
35 | let mut depth = 0;
36 | let mut max_depth = 0;
37 |
38 | for e in reader {
39 | let e = e.map_err(|e| format!("Error parsing XML document: {e}"))?;
40 | match e {
41 | XmlEvent::StartDocument { version, encoding, standalone } => {
42 | println!(
43 | "XML document version {}, encoded in {}, {}standalone",
44 | version, encoding, if standalone.unwrap_or(false) { "" } else { "not " }
45 | );
46 | },
47 | XmlEvent::Doctype { syntax } => {
48 | println!("The Doctype is: {syntax}");
49 | },
50 | XmlEvent::EndDocument => println!("Document finished"),
51 | XmlEvent::ProcessingInstruction { .. } => processing_instructions += 1,
52 | XmlEvent::Whitespace(_) => {}, // can't happen due to configuration
53 | XmlEvent::Characters(s) => {
54 | character_blocks += 1;
55 | characters += s.len();
56 | },
57 | XmlEvent::CData(s) => {
58 | cdata_blocks += 1;
59 | characters += s.len();
60 | },
61 | XmlEvent::Comment(s) => {
62 | comment_blocks += 1;
63 | comment_characters += s.len();
64 | },
65 | XmlEvent::StartElement { namespace, .. } => {
66 | depth += 1;
67 | max_depth = cmp::max(max_depth, depth);
68 | elements += 1;
69 | namespaces.extend(namespace.0.into_values());
70 | },
71 | XmlEvent::EndElement { .. } => {
72 | depth -= 1;
73 | },
74 | }
75 | }
76 |
77 | namespaces.remove(xml::namespace::NS_EMPTY_URI);
78 | namespaces.remove(xml::namespace::NS_XMLNS_URI);
79 | namespaces.remove(xml::namespace::NS_XML_URI);
80 |
81 | println!("Elements: {elements}, maximum depth: {max_depth}");
82 | println!("Namespaces (excluding built-in): {}", namespaces.len());
83 | println!("Characters: {characters}, characters blocks: {character_blocks}, CDATA blocks: {cdata_blocks}");
84 | println!("Comment blocks: {comment_blocks}, comment characters: {comment_characters}");
85 | println!("Processing instructions (excluding built-in): {processing_instructions}");
86 |
87 | Ok(())
88 | }
89 |
--------------------------------------------------------------------------------
/src/reader/parser/inside_reference.rs:
--------------------------------------------------------------------------------
1 | use super::{PullParser, Result, State};
2 | use crate::common::{is_name_char, is_name_start_char, is_whitespace_char};
3 | use crate::reader::error::SyntaxError;
4 | use crate::reader::lexer::Token;
5 | use std::char;
6 |
7 | impl PullParser {
8 | pub fn inside_reference(&mut self, t: Token) -> Option {
9 | match t {
10 | Token::Character(c) if !self.data.ref_data.is_empty() && is_name_char(c) ||
11 | self.data.ref_data.is_empty() && (is_name_start_char(c) || c == '#') => {
12 | self.data.ref_data.push(c);
13 | None
14 | },
15 |
16 | Token::ReferenceEnd => {
17 | let name = self.data.take_ref_data();
18 | if name.is_empty() {
19 | return Some(self.error(SyntaxError::EmptyEntity));
20 | }
21 |
22 | let c = match &*name {
23 | "lt" => Some('<'),
24 | "gt" => Some('>'),
25 | "amp" => Some('&'),
26 | "apos" => Some('\''),
27 | "quot" => Some('"'),
28 | _ if name.starts_with('#') => match self.numeric_reference_from_str(&name[1..]) {
29 | Ok(c) => Some(c),
30 | Err(e) => return Some(self.error(e)),
31 | },
32 | _ => None,
33 | };
34 | if let Some(c) = c {
35 | self.buf.push(c);
36 | } else if let Some(v) = self.config.extra_entities.get(&name) {
37 | self.buf.push_str(v);
38 | } else if let Some(v) = self.entities.get(&name) {
39 | if self.state_after_reference == State::OutsideTag {
40 | // an entity can expand to *elements*, so outside of a tag it needs a full reparse
41 | if let Err(e) = self.lexer.reparse(v) {
42 | return Some(Err(e));
43 | }
44 | } else {
45 | // however, inside attributes it's not allowed to affect attribute quoting,
46 | // so it can't be fed to the lexer
47 | self.buf.push_str(v);
48 | }
49 | } else {
50 | return Some(self.error(SyntaxError::UnexpectedEntity(name.into())));
51 | }
52 | let prev_st = self.state_after_reference;
53 | if prev_st == State::OutsideTag && !is_whitespace_char(self.buf.chars().last().unwrap_or('\0')) {
54 | self.inside_whitespace = false;
55 | }
56 | self.into_state_continue(prev_st)
57 | },
58 |
59 | _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
60 | }
61 | }
62 |
63 | pub(crate) fn numeric_reference_from_str(&self, num_str: &str) -> std::result::Result {
64 | let val = if let Some(hex) = num_str.strip_prefix('x') {
65 | u32::from_str_radix(hex, 16).map_err(move |_| SyntaxError::InvalidNumericEntity(num_str.into()))?
66 | } else {
67 | num_str.parse::().map_err(move |_| SyntaxError::InvalidNumericEntity(num_str.into()))?
68 | };
69 | match char::from_u32(val) {
70 | Some(c) if self.is_valid_xml_char(c) => Ok(c),
71 | Some(_) if self.config.replace_unknown_entity_references => Ok('\u{fffd}'),
72 | None if self.config.replace_unknown_entity_references => Ok('\u{fffd}'),
73 | _ => Err(SyntaxError::InvalidCharacterEntity(val)),
74 | }
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/reader/indexset.rs:
--------------------------------------------------------------------------------
1 | use crate::attribute::OwnedAttribute;
2 | use crate::name::OwnedName;
3 |
4 | use std::collections::hash_map::RandomState;
5 | use std::collections::HashSet;
6 | use std::hash::{BuildHasher, Hash, Hasher};
7 |
8 | /// An ordered set
9 | pub struct AttributesSet {
10 | vec: Vec,
11 | /// Uses a no-op hasher, because these u64s are hashes already
12 | may_contain: HashSet,
13 | /// This is the real hasher for the `OwnedName`
14 | hasher: RandomState,
15 | }
16 |
17 | /// Use linear search and don't allocate `HashSet` if there are few attributes,
18 | /// because allocation costs more than a few comparisons.
19 | const HASH_THRESHOLD: usize = 8;
20 |
21 | impl AttributesSet {
22 | pub fn new() -> Self {
23 | Self {
24 | vec: Vec::new(),
25 | hasher: RandomState::new(),
26 | may_contain: HashSet::default(),
27 | }
28 | }
29 |
30 | fn hash(&self, val: &OwnedName) -> u64 {
31 | let mut h = self.hasher.build_hasher();
32 | val.hash(&mut h);
33 | h.finish()
34 | }
35 |
36 | pub fn len(&self) -> usize {
37 | self.vec.len()
38 | }
39 |
40 | pub fn contains(&self, name: &OwnedName) -> bool {
41 | // fall back to linear search only on duplicate or hash collision
42 | (self.vec.len() < HASH_THRESHOLD || self.may_contain.contains(&self.hash(name))) &&
43 | self.vec.iter().any(move |a| &a.name == name)
44 | }
45 |
46 | pub fn push(&mut self, attr: OwnedAttribute) {
47 | if self.vec.len() >= HASH_THRESHOLD {
48 | if self.vec.len() == HASH_THRESHOLD {
49 | self.may_contain.reserve(HASH_THRESHOLD * 2);
50 | for attr in &self.vec {
51 | self.may_contain.insert(self.hash(&attr.name));
52 | }
53 | }
54 | self.may_contain.insert(self.hash(&attr.name));
55 | }
56 | self.vec.push(attr);
57 | }
58 |
59 | pub fn into_vec(self) -> Vec {
60 | self.vec
61 | }
62 | }
63 |
64 | #[test]
65 | fn indexset() {
66 | let mut s = AttributesSet::new();
67 | let not_here = OwnedName {
68 | local_name: "attr1000".into(),
69 | namespace: Some("test".into()),
70 | prefix: None,
71 | };
72 |
73 | // this test will take a lot of time if the `contains()` is linear, and the loop is quadratic
74 | for i in 0..50000 {
75 | let name = OwnedName {
76 | local_name: format!("attr{i}"), namespace: None, prefix: None,
77 | };
78 | assert!(!s.contains(&name));
79 |
80 | s.push(OwnedAttribute { name, value: String::new() });
81 | assert!(!s.contains(&not_here));
82 | }
83 |
84 | assert!(s.contains(&OwnedName {
85 | local_name: "attr1234".into(), namespace: None, prefix: None,
86 | }));
87 | assert!(s.contains(&OwnedName {
88 | local_name: "attr0".into(), namespace: None, prefix: None,
89 | }));
90 | assert!(s.contains(&OwnedName {
91 | local_name: "attr49999".into(), namespace: None, prefix: None,
92 | }));
93 | }
94 |
95 | /// Hasher that does nothing except pass a u64 through
96 | struct U64Hasher(u64);
97 |
98 | impl Hasher for U64Hasher {
99 | fn finish(&self) -> u64 { self.0 }
100 | fn write(&mut self, slice: &[u8]) {
101 | for &v in slice { self.0 ^= u64::from(v) } // unused in practice
102 | }
103 | fn write_u64(&mut self, i: u64) {
104 | self.0 ^= i;
105 | }
106 | }
107 |
108 | #[derive(Default)]
109 | struct U64HasherBuilder;
110 |
111 | impl BuildHasher for U64HasherBuilder {
112 | type Hasher = U64Hasher;
113 | fn build_hasher(&self) -> U64Hasher { U64Hasher(0) }
114 | }
115 |
--------------------------------------------------------------------------------
/tests/documents/sample_8_c.txt:
--------------------------------------------------------------------------------
1 | StartDocument(1.0, UTF-8)
2 | StartElement(el)
3 | Whitespace("\n")
4 | CData("")
5 | Whitespace("\n")
6 | CData("")
7 | Whitespace("\n\n")
8 | Whitespace("\n")
9 | Whitespace("\n")
10 | CData("")
11 | Whitespace("\n")
12 | StartElement(br)
13 | EndElement(br)
14 | Whitespace("\n")
15 | StartElement(s)
16 | EndElement(s)
17 | Whitespace("\n")
18 | StartElement(s)
19 | EndElement(s)
20 | Whitespace("\n")
21 | StartElement(s)
22 | CData("")
23 | EndElement(s)
24 | Whitespace("\n\n\n")
25 | CData("")
26 | Whitespace("\n")
27 | Whitespace("\n\n")
28 | CData("")
29 | CData("")
30 | Whitespace("\n")
31 | CData("")
32 | CData("")
33 | Whitespace("\n")
34 | CData("")
35 | CData("")
36 | Whitespace("\n")
37 | CData("")
38 | StartElement(br)
39 | EndElement(br)
40 | CData("")
41 | Whitespace("\n")
42 | CData("")
43 | StartElement(s)
44 | EndElement(s)
45 | CData("")
46 | Whitespace("\n")
47 | CData("")
48 | StartElement(s)
49 | EndElement(s)
50 | CData("")
51 | Whitespace("\n")
52 | CData("")
53 | StartElement(s)
54 | EndElement(s)
55 | CData("")
56 | Whitespace("\n\n")
57 | CData("")
58 | Whitespace("\n")
59 | CData("")
60 | Whitespace("\n\n")
61 | Whitespace("\n")
62 | Whitespace("\n")
63 | CData("")
64 | Whitespace("\n")
65 | StartElement(br)
66 | EndElement(br)
67 | Whitespace("\n")
68 | StartElement(s)
69 | EndElement(s)
70 | Whitespace("\n")
71 | StartElement(s)
72 | EndElement(s)
73 | Whitespace("\n")
74 | StartElement(s)
75 | CData("")
76 | EndElement(s)
77 | Whitespace("\n\n\n")
78 | CData("")
79 | Whitespace("\n")
80 | Whitespace("\n\n")
81 | CData("")
82 | CData("")
83 | Whitespace("\n")
84 | CData("")
85 | CData("")
86 | Whitespace("\n")
87 | CData("")
88 | CData("")
89 | Whitespace("\n")
90 | CData("")
91 | StartElement(br)
92 | EndElement(br)
93 | CData("")
94 | Whitespace("\n")
95 | CData("")
96 | StartElement(s)
97 | EndElement(s)
98 | CData("")
99 | Whitespace("\n")
100 | CData("")
101 | StartElement(s)
102 | EndElement(s)
103 | CData("")
104 | Whitespace("\n")
105 | CData("")
106 | StartElement(s)
107 | EndElement(s)
108 | CData("")
109 | Whitespace("\n\n\n")
110 | Whitespace("\n\n")
111 | CData("")
112 | Whitespace("\n")
113 | CData("")
114 | Whitespace("\n\n")
115 | Whitespace("\n")
116 | Whitespace("\n")
117 | CData("")
118 | Whitespace("\n")
119 | StartElement(br)
120 | EndElement(br)
121 | Whitespace("\n")
122 | StartElement(s)
123 | EndElement(s)
124 | Whitespace("\n")
125 | StartElement(s)
126 | EndElement(s)
127 | Whitespace("\n")
128 | StartElement(s)
129 | CData("")
130 | EndElement(s)
131 | Whitespace("\n\n\n")
132 | CData("")
133 | Whitespace("\n")
134 | Whitespace("\n\n")
135 | CData("")
136 | CData("")
137 | Whitespace("\n")
138 | CData("")
139 | CData("")
140 | Whitespace("\n")
141 | CData("")
142 | CData("")
143 | Whitespace("\n")
144 | CData("")
145 | StartElement(br)
146 | EndElement(br)
147 | CData("")
148 | Whitespace("\n")
149 | CData("")
150 | StartElement(s)
151 | EndElement(s)
152 | CData("")
153 | Whitespace("\n")
154 | CData("")
155 | StartElement(s)
156 | EndElement(s)
157 | CData("")
158 | Whitespace("\n")
159 | CData("")
160 | StartElement(s)
161 | EndElement(s)
162 | CData("")
163 | Whitespace("\n\n")
164 | CData("")
165 | Whitespace("\n\n")
166 | CData("")
167 | CData("")
168 | StartElement(br)
169 | EndElement(br)
170 | StartElement(s)
171 | EndElement(s)
172 | StartElement(s)
173 | EndElement(s)
174 | StartElement(s)
175 | CData("")
176 | EndElement(s)
177 | CData("")
178 | CData("")
179 | CData("")
180 | CData("")
181 | CData("")
182 | CData("")
183 | CData("")
184 | CData("")
185 | StartElement(br)
186 | EndElement(br)
187 | CData("")
188 | CData("")
189 | StartElement(s)
190 | EndElement(s)
191 | CData("")
192 | CData("")
193 | StartElement(s)
194 | EndElement(s)
195 | CData("")
196 | CData("")
197 | StartElement(s)
198 | EndElement(s)
199 | CData("")
200 | Whitespace("\n")
201 | EndElement(el)
202 | EndDocument
203 |
--------------------------------------------------------------------------------
/tests/documents/sample_8_coalesce_wscdch.txt:
--------------------------------------------------------------------------------
1 | StartDocument(1.0, UTF-8)
2 | StartElement(el)
3 | Characters("\n")
4 | Comment("ws")
5 | Characters("\n\n\n")
6 | Comment("ws")
7 | Comment("ws")
8 | Characters("\n")
9 | Comment("ws")
10 | Comment("ws")
11 | Characters("\n")
12 | Comment("ws")
13 | Comment("ws")
14 | Characters("\n")
15 | Comment("ws")
16 | StartElement(br)
17 | EndElement(br)
18 | Comment("ws")
19 | Characters("\n")
20 | Comment("ws")
21 | StartElement(s)
22 | EndElement(s)
23 | Comment("ws")
24 | Characters("\n")
25 | Comment("ws")
26 | StartElement(s)
27 | EndElement(s)
28 | Comment("ws")
29 | Characters("\n")
30 | Comment("ws")
31 | StartElement(s)
32 | EndElement(s)
33 | Comment("ws")
34 | Characters("\n\n\n")
35 | Comment("ws")
36 | Characters("\n")
37 | Comment("ws")
38 | Characters("\n\n\n\n")
39 | Comment("ws")
40 | Characters("\n")
41 | StartElement(br)
42 | EndElement(br)
43 | Characters("\n")
44 | StartElement(s)
45 | EndElement(s)
46 | Characters("\n")
47 | StartElement(s)
48 | EndElement(s)
49 | Characters("\n")
50 | StartElement(s)
51 | Comment("ws")
52 | EndElement(s)
53 | Characters("\n\n")
54 | Comment("ws")
55 | Characters("\n\n\n")
56 | Comment("ws")
57 | Comment("ws")
58 | Characters("\n")
59 | Comment("ws")
60 | Comment("ws")
61 | Characters("\n")
62 | Comment("ws")
63 | Comment("ws")
64 | Characters("\n")
65 | Comment("ws")
66 | StartElement(br)
67 | EndElement(br)
68 | Comment("ws")
69 | Characters("\n")
70 | Comment("ws")
71 | StartElement(s)
72 | EndElement(s)
73 | Comment("ws")
74 | Characters("\n")
75 | Comment("ws")
76 | StartElement(s)
77 | EndElement(s)
78 | Comment("ws")
79 | Characters("\n")
80 | Comment("ws")
81 | StartElement(s)
82 | EndElement(s)
83 | Comment("ws")
84 | Characters("\n\n\n")
85 | Comment("ws")
86 | Characters("\n")
87 | Comment("ws")
88 | Characters("\n\n\n\n")
89 | Comment("ws")
90 | Characters("\n")
91 | StartElement(br)
92 | EndElement(br)
93 | Characters("\n")
94 | StartElement(s)
95 | EndElement(s)
96 | Characters("\n")
97 | StartElement(s)
98 | EndElement(s)
99 | Characters("\n")
100 | StartElement(s)
101 | Comment("ws")
102 | EndElement(s)
103 | Characters("\n\n\n")
104 | Comment("noWS")
105 | Characters("\n\n")
106 | Comment("ws")
107 | Characters("\n\n\n")
108 | Comment("ws")
109 | Comment("ws")
110 | Characters("\n")
111 | Comment("ws")
112 | Comment("ws")
113 | Characters("\n")
114 | Comment("ws")
115 | Comment("ws")
116 | Characters("\n")
117 | Comment("ws")
118 | StartElement(br)
119 | EndElement(br)
120 | Comment("ws")
121 | Characters("\n")
122 | Comment("ws")
123 | StartElement(s)
124 | EndElement(s)
125 | Comment("ws")
126 | Characters("\n")
127 | Comment("ws")
128 | StartElement(s)
129 | EndElement(s)
130 | Comment("ws")
131 | Characters("\n")
132 | Comment("ws")
133 | StartElement(s)
134 | EndElement(s)
135 | Comment("ws")
136 | Characters("\n\n\n")
137 | Comment("ws")
138 | Characters("\n")
139 | Comment("ws")
140 | Characters("\n\n\n\n")
141 | Comment("ws")
142 | Characters("\n")
143 | StartElement(br)
144 | EndElement(br)
145 | Characters("\n")
146 | StartElement(s)
147 | EndElement(s)
148 | Characters("\n")
149 | StartElement(s)
150 | EndElement(s)
151 | Characters("\n")
152 | StartElement(s)
153 | Comment("ws")
154 | EndElement(s)
155 | Characters("\n\n")
156 | Comment("ws")
157 | Characters("\n\n")
158 | Comment("ws")
159 | Comment("ws")
160 | Comment("ws")
161 | Comment("ws")
162 | Comment("ws")
163 | Comment("ws")
164 | Comment("ws")
165 | StartElement(br)
166 | EndElement(br)
167 | Comment("ws")
168 | Comment("ws")
169 | StartElement(s)
170 | EndElement(s)
171 | Comment("ws")
172 | Comment("ws")
173 | StartElement(s)
174 | EndElement(s)
175 | Comment("ws")
176 | Comment("ws")
177 | StartElement(s)
178 | EndElement(s)
179 | Comment("ws")
180 | Comment("ws")
181 | Comment("ws")
182 | Comment("ws")
183 | StartElement(br)
184 | EndElement(br)
185 | StartElement(s)
186 | EndElement(s)
187 | StartElement(s)
188 | EndElement(s)
189 | StartElement(s)
190 | Comment("ws")
191 | EndElement(s)
192 | Characters("\n")
193 | EndElement(el)
194 | EndDocument
195 |
--------------------------------------------------------------------------------
/tests/documents/sample_8_wscdch.txt:
--------------------------------------------------------------------------------
1 | StartDocument(1.0, UTF-8)
2 | StartElement(el)
3 | Characters("\n")
4 | Comment("ws")
5 | Characters("\n")
6 | Characters("\n\n")
7 | Comment("ws")
8 | Comment("ws")
9 | Characters("\n")
10 | Comment("ws")
11 | Comment("ws")
12 | Characters("\n")
13 | Comment("ws")
14 | Comment("ws")
15 | Characters("\n")
16 | Comment("ws")
17 | StartElement(br)
18 | EndElement(br)
19 | Comment("ws")
20 | Characters("\n")
21 | Comment("ws")
22 | StartElement(s)
23 | EndElement(s)
24 | Comment("ws")
25 | Characters("\n")
26 | Comment("ws")
27 | StartElement(s)
28 | EndElement(s)
29 | Comment("ws")
30 | Characters("\n")
31 | Comment("ws")
32 | StartElement(s)
33 | EndElement(s)
34 | Comment("ws")
35 | Characters("\n\n\n")
36 | Comment("ws")
37 | Characters("\n")
38 | Comment("ws")
39 | Characters("\n\n")
40 | Characters("\n")
41 | Characters("\n")
42 | Comment("ws")
43 | Characters("\n")
44 | StartElement(br)
45 | EndElement(br)
46 | Characters("\n")
47 | StartElement(s)
48 | EndElement(s)
49 | Characters("\n")
50 | StartElement(s)
51 | EndElement(s)
52 | Characters("\n")
53 | StartElement(s)
54 | Comment("ws")
55 | EndElement(s)
56 | Characters("\n\n")
57 | Comment("ws")
58 | Characters("\n")
59 | Characters("\n\n")
60 | Comment("ws")
61 | Comment("ws")
62 | Characters("\n")
63 | Comment("ws")
64 | Comment("ws")
65 | Characters("\n")
66 | Comment("ws")
67 | Comment("ws")
68 | Characters("\n")
69 | Comment("ws")
70 | StartElement(br)
71 | EndElement(br)
72 | Comment("ws")
73 | Characters("\n")
74 | Comment("ws")
75 | StartElement(s)
76 | EndElement(s)
77 | Comment("ws")
78 | Characters("\n")
79 | Comment("ws")
80 | StartElement(s)
81 | EndElement(s)
82 | Comment("ws")
83 | Characters("\n")
84 | Comment("ws")
85 | StartElement(s)
86 | EndElement(s)
87 | Comment("ws")
88 | Characters("\n\n\n")
89 | Comment("ws")
90 | Characters("\n")
91 | Comment("ws")
92 | Characters("\n\n")
93 | Characters("\n")
94 | Characters("\n")
95 | Comment("ws")
96 | Characters("\n")
97 | StartElement(br)
98 | EndElement(br)
99 | Characters("\n")
100 | StartElement(s)
101 | EndElement(s)
102 | Characters("\n")
103 | StartElement(s)
104 | EndElement(s)
105 | Characters("\n")
106 | StartElement(s)
107 | Comment("ws")
108 | EndElement(s)
109 | Characters("\n\n\n")
110 | Comment("noWS")
111 | Characters("\n\n")
112 | Comment("ws")
113 | Characters("\n")
114 | Characters("\n\n")
115 | Comment("ws")
116 | Comment("ws")
117 | Characters("\n")
118 | Comment("ws")
119 | Comment("ws")
120 | Characters("\n")
121 | Comment("ws")
122 | Comment("ws")
123 | Characters("\n")
124 | Comment("ws")
125 | StartElement(br)
126 | EndElement(br)
127 | Comment("ws")
128 | Characters("\n")
129 | Comment("ws")
130 | StartElement(s)
131 | EndElement(s)
132 | Comment("ws")
133 | Characters("\n")
134 | Comment("ws")
135 | StartElement(s)
136 | EndElement(s)
137 | Comment("ws")
138 | Characters("\n")
139 | Comment("ws")
140 | StartElement(s)
141 | EndElement(s)
142 | Comment("ws")
143 | Characters("\n\n\n")
144 | Comment("ws")
145 | Characters("\n")
146 | Comment("ws")
147 | Characters("\n\n")
148 | Characters("\n")
149 | Characters("\n")
150 | Comment("ws")
151 | Characters("\n")
152 | StartElement(br)
153 | EndElement(br)
154 | Characters("\n")
155 | StartElement(s)
156 | EndElement(s)
157 | Characters("\n")
158 | StartElement(s)
159 | EndElement(s)
160 | Characters("\n")
161 | StartElement(s)
162 | Comment("ws")
163 | EndElement(s)
164 | Characters("\n\n")
165 | Comment("ws")
166 | Characters("\n\n")
167 | Comment("ws")
168 | Comment("ws")
169 | Comment("ws")
170 | Comment("ws")
171 | Comment("ws")
172 | Comment("ws")
173 | Comment("ws")
174 | StartElement(br)
175 | EndElement(br)
176 | Comment("ws")
177 | Comment("ws")
178 | StartElement(s)
179 | EndElement(s)
180 | Comment("ws")
181 | Comment("ws")
182 | StartElement(s)
183 | EndElement(s)
184 | Comment("ws")
185 | Comment("ws")
186 | StartElement(s)
187 | EndElement(s)
188 | Comment("ws")
189 | Comment("ws")
190 | Comment("ws")
191 | Comment("ws")
192 | StartElement(br)
193 | EndElement(br)
194 | StartElement(s)
195 | EndElement(s)
196 | StartElement(s)
197 | EndElement(s)
198 | StartElement(s)
199 | Comment("ws")
200 | EndElement(s)
201 | Characters("\n")
202 | EndElement(el)
203 | EndDocument
204 |
--------------------------------------------------------------------------------
/src/writer.rs:
--------------------------------------------------------------------------------
1 | //! Contains a high-level interface for an event-based XML emitter.
2 | //!
3 | //! The most important type in this module is `EventWriter` which allows writing an XML document
4 | //! to some output stream.
5 |
6 | pub use self::config::EmitterConfig;
7 | pub use self::emitter::EmitterError as Error;
8 | pub use self::emitter::Result;
9 | pub use self::events::XmlEvent;
10 |
11 | use self::emitter::Emitter;
12 |
13 | use std::io::prelude::*;
14 |
15 | mod config;
16 | mod emitter;
17 | pub mod events;
18 |
19 | /// A wrapper around a `std::io::Write` instance which emits an XML document according to the
20 | /// events written to it.
21 | pub struct EventWriter { // NOTE(review): no `<W>` is declared here although `sink: W` uses it; the generic list looks stripped by extraction. Confirm.
22 | sink: W, // output destination; passed by mutable reference to the emitter for every event
23 | emitter: Emitter, // created from the `EmitterConfig` given to `new_with_config`
24 | }
25 |
26 | impl EventWriter { // NOTE(review): the methods use `W` but no generic list is declared; it looks stripped by extraction. Confirm.
27 | /// Creates a new `EventWriter` out of an `std::io::Write` instance using the default
28 | /// configuration.
29 | #[inline]
30 | pub fn new(sink: W) -> Self {
31 | Self::new_with_config(sink, EmitterConfig::new())
32 | }
33 |
34 | /// Creates a new `EventWriter` out of an `std::io::Write` instance using the provided
35 | /// configuration.
36 | #[inline]
37 | pub fn new_with_config(sink: W, config: EmitterConfig) -> Self {
38 | Self {
39 | sink,
40 | emitter: Emitter::new(config),
41 | }
42 | }
43 |
44 | /// Writes the next piece of the XML document according to the provided event.
45 | ///
46 | /// Note that the output data may not exactly correspond to the written event because
47 | /// of various configuration options. For example, `XmlEvent::EndElement` may
48 | /// correspond to a separate closing element or it may cause writing an empty element.
49 | /// Another example is that `XmlEvent::CData` may be represented as characters in
50 | /// the output stream.
51 | pub fn write<'a, E>(&mut self, event: E) -> Result<()> where E: Into> {
52 | match event.into() { // NOTE(review): the `Into>` bound on the line above looks truncated by extraction (presumably an `Into` of the writer `XmlEvent`). Confirm.
53 | XmlEvent::StartDocument { version, encoding, standalone } =>
54 | self.emitter.emit_start_document(&mut self.sink, version, encoding.unwrap_or("UTF-8"), standalone), // encoding falls back to UTF-8 when the event carries none
55 | XmlEvent::ProcessingInstruction { name, data } =>
56 | self.emitter.emit_processing_instruction(&mut self.sink, name, data),
57 | XmlEvent::StartElement { name, attributes, namespace } => {
58 | self.emitter.namespace_stack_mut().push_empty().checked_target().extend(namespace.as_ref()); // push a new namespace scope and fill it from the event before emitting the tag
59 | self.emitter.emit_start_element(&mut self.sink, name, &attributes)
60 | },
61 | XmlEvent::EndElement { name } => {
62 | let r = self.emitter.emit_end_element(&mut self.sink, name);
63 | self.emitter.namespace_stack_mut().try_pop(); // pop the scope after emitting; the emit result is still what gets returned
64 | r
65 | },
66 | XmlEvent::Comment(content) => self.emitter.emit_comment(&mut self.sink, content),
67 | XmlEvent::CData(content) => self.emitter.emit_cdata(&mut self.sink, content),
68 | XmlEvent::Characters(content) => self.emitter.emit_characters(&mut self.sink, content),
69 | XmlEvent::RawCharacters(content) => self.emitter.emit_raw_characters(&mut self.sink, content),
70 | XmlEvent::Doctype(content) => self.emitter.emit_raw_characters(&mut self.sink, content), // doctype content goes through the same emitter call as RawCharacters
71 | }
72 | }
73 |
74 | /// Returns a mutable reference to the underlying writer (the sink passed at construction).
75 | ///
76 | /// Note that having a reference to the underlying sink makes it very easy to emit invalid XML
77 | /// documents. Use this method with care. Valid use cases for this method include accessing
78 | /// methods like `Write::flush`, which do not emit new data but rather change the state
79 | /// of the stream itself.
80 | pub fn inner_mut(&mut self) -> &mut W {
81 | &mut self.sink
82 | }
83 |
84 | /// Returns an immutable reference to the underlying writer (the sink passed at construction).
85 | pub fn inner_ref(&self) -> &W {
86 | &self.sink
87 | }
88 |
89 | /// Unwraps this `EventWriter`, returning the underlying writer.
90 | ///
91 | /// Note that this is a destructive operation: unwrapping a writer and then wrapping
92 | /// it again with `EventWriter::new()` will create a fresh writer whose state will be
93 | /// blank; for example, accumulated namespaces will be reset.
94 | pub fn into_inner(self) -> W {
95 | self.sink
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
/tests/streaming.rs:
--------------------------------------------------------------------------------
1 | #![forbid(unsafe_code)]
2 |
3 | use std::io::{Cursor, Write};
4 |
5 | use xml::EventReader;
6 | use xml::reader::{ParserConfig, XmlEvent};
7 |
8 | macro_rules! assert_match { // test helper: panics unless the expression matches the pattern, optionally with a guard
9 | ($actual:expr, $expected:pat) => { // plain pattern form
10 | match $actual { // NOTE(review): `$actual` is expanded again inside the panic arguments, so a failing `it.next()` is evaluated a second time and the message shows that later value
11 | $expected => {}, // NOTE(review): the panic text below opens `(left:` and never closes the parenthesis
12 | _ => panic!("assertion failed: `(left matches right)` \
13 | (left: `{:?}`, right: `{}`", $actual, stringify!($expected))
14 | }
15 | };
16 | ($actual:expr, $expected:pat if $guard:expr) => { // pattern-plus-guard form; same structure as the arm above
17 | match $actual {
18 | $expected if $guard => {},
19 | _ => panic!("assertion failed: `(left matches right)` \
20 | (left: `{:?}`, right: `{} if {}`",
21 | $actual, stringify!($expected), stringify!($guard))
22 | }
23 | };
24 | }
25 |
26 | fn write_and_reset_position(c: &mut Cursor, data: &[u8]) where Cursor: Write { // NOTE(review): `Cursor` has no type argument here; a generic parameter looks stripped by extraction. Confirm.
27 | let p = c.position(); // remember the current cursor position
28 | c.write_all(data).unwrap(); // write the new bytes starting at that position
29 | c.set_position(p); // rewind so the next read starts at the freshly written bytes
30 | }
31 |
32 | #[test]
33 | fn reading_streamed_content() {
34 | let buf = Cursor::new(b"".to_vec()); // NOTE(review): the assertions below expect a `root` start tag from this buffer, so the literal (and the other empty or bare-text literals in this test) presumably held XML markup that extraction stripped. Confirm.
35 | let reader = EventReader::new(buf);
36 |
37 | let mut it = reader.into_iter();
38 |
39 | assert_match!(it.next(), Some(Ok(XmlEvent::StartDocument { .. })));
40 | assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "root");
41 |
42 | write_and_reset_position(it.source_mut(), b"content"); // feed more bytes into the cursor the reader is pulling from
43 | assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-1");
44 | assert_match!(it.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content");
45 | assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-1");
46 |
47 | write_and_reset_position(it.source_mut(), b"");
48 | assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-2");
49 | assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-2");
50 |
51 | write_and_reset_position(it.source_mut(), b"");
52 | assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-3");
53 | assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-3");
54 | // Disabled: streaming bare text this way does not seem to work because of how tag parsing is done.
55 | // write_and_reset_position(it.source_mut(), b"some text");
56 | // assert_match!(it.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "some text");
57 |
58 | write_and_reset_position(it.source_mut(), b"");
59 | assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "root");
60 | assert_match!(it.next(), Some(Ok(XmlEvent::EndDocument)));
61 | assert_match!(it.next(), None);
62 | }
63 |
64 | #[test]
65 | fn reading_streamed_content2() {
66 | let buf = Cursor::new(b"".to_vec()); // NOTE(review): as the assertions expect a `root` start tag, this literal presumably held XML markup that extraction stripped. Confirm.
67 | let mut config = ParserConfig::new();
68 | config.ignore_end_of_stream = true; // this test keeps reading after input runs out, unlike the test above which uses the default config
69 | let readerb = EventReader::new_with_config(buf, config);
70 |
71 | let mut reader = readerb.into_iter();
72 |
73 | assert_match!(reader.next(), Some(Ok(XmlEvent::StartDocument { .. })));
74 | assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "root");
75 |
76 | write_and_reset_position(reader.source_mut(), b"content");
77 | assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-1");
78 | assert_match!(reader.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content");
79 | assert_match!(reader.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-1");
80 |
81 | write_and_reset_position(reader.source_mut(), b"content");
82 |
83 | assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-2");
84 | assert_match!(reader.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content");
85 | assert_match!(reader.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-2");
86 | assert_match!(reader.next(), Some(Err(_))); // no more input yet: the iterator yields an error item here and the test then continues after writing more bytes
87 | write_and_reset_position(reader.source_mut(), b"");
88 | assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-3"); // NOTE(review): the next listing line jumps from 89 to 92 and holds an unterminated byte literal; lines 89-91 look fused by extraction. Confirm against upstream.
89 | write_and_reset_position(reader.source_mut(), b" {
92 | panic!("At this point, parser must not detect something.");
93 | },
94 | Some(Err(_)) => {},
95 | }
96 | write_and_reset_position(reader.source_mut(), b" />");
97 | assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-4");
98 | }
99 |
--------------------------------------------------------------------------------
/src/escape.rs:
--------------------------------------------------------------------------------
1 | //! Contains functions for performing XML special characters escaping.
2 |
3 | use std::borrow::Cow;
4 | use std::fmt::{Display, Formatter, Result};
5 | use std::marker::PhantomData;
6 |
7 | pub(crate) trait Escapes { // per-byte escape table; implementors decide which bytes get a replacement
8 | fn escape(c: u8) -> Option<&'static str>;
9 |
10 | fn byte_needs_escaping(c: u8) -> bool { // default: true exactly when `escape` returns `Some` for the byte
11 | Self::escape(c).is_some()
12 | }
13 |
14 | fn str_needs_escaping(s: &str) -> bool { // default: true when any byte of `s` has a replacement
15 | s.bytes().any(|c| Self::escape(c).is_some())
16 | }
17 | }
18 |
19 | pub(crate) struct Escaped<'a, E: Escapes> { // borrowed string whose `Display` output is its escaped form according to table `E`
20 | _escape_phantom: PhantomData, // NOTE(review): presumably `PhantomData<E>`; the type argument looks stripped by extraction. Confirm.
21 | to_escape: &'a str,
22 | }
23 |
24 | impl<'a, E: Escapes> Escaped<'a, E> {
25 | pub const fn new(s: &'a str) -> Self { // only stores the borrow; the escaping itself happens in the `Display` impl
26 | Escaped {
27 | _escape_phantom: PhantomData,
28 | to_escape: s,
29 | }
30 | }
31 | }
32 |
33 | impl Display for Escaped<'_, E> { // NOTE(review): `E` is used without being declared; the `impl<...>` list looks stripped by extraction. Confirm.
34 | fn fmt(&self, f: &mut Formatter<'_>) -> Result {
35 | let mut total_remaining = self.to_escape;
36 |
37 | // find the next occurrence of a byte that needs escaping
38 | while let Some(n) = total_remaining.bytes().position(E::byte_needs_escaping) {
39 | let (start, remaining) = total_remaining.split_at(n);
40 |
41 | f.write_str(start)?; // text before the byte is written through unchanged
42 |
43 | // unwrap is safe: `position` returned `n`, so `remaining` starts with that byte and is not empty
44 | let next_byte = remaining.bytes().next().unwrap();
45 | let replacement = E::escape(next_byte).unwrap_or("unexpected token"); // fallback is reachable only if an implementor overrides `byte_needs_escaping` inconsistently with `escape`
46 | f.write_str(replacement)?;
47 |
48 | total_remaining = &remaining[1..]; // drop the escaped byte; it has to be a single-byte (ASCII) character for this slice to land on a char boundary
49 | }
50 |
51 | f.write_str(total_remaining) // tail in which no byte needs escaping
52 | }
53 | }
54 |
55 | fn escape_str(s: &str) -> Cow<'_, str> { // NOTE(review): the body uses `E` but no generic parameter is declared; it looks stripped by extraction. Confirm.
56 | if E::str_needs_escaping(s) {
57 | Cow::Owned(Escaped::::new(s).to_string()) // allocate only when some byte actually needs escaping
58 | } else {
59 | Cow::Borrowed(s) // nothing to escape: return the input without allocating
60 | }
61 | }
62 |
63 | macro_rules! escapes { // declares a unit struct and implements `Escapes` for it from a list of `byte => replacement` pairs
64 | {
65 | $name: ident,
66 | $($k: expr => $v: expr),* $(,)? // a trailing comma after the last pair is accepted
67 | } => {
68 | pub(crate) struct $name;
69 |
70 | impl Escapes for $name {
71 | fn escape(c: u8) -> Option<&'static str> {
72 | match c {
73 | $( $k => Some($v),)*
74 | _ => None // every byte not listed has no replacement
75 | }
76 | }
77 | }
78 | };
79 | }
80 |
81 | escapes!( // NOTE(review): every replacement below equals its key and the newline entries are literal line breaks, so the entity text (presumably `&lt;`, `&gt;`, `&quot;` and so on) looks decoded by extraction. Confirm against upstream.
82 | AttributeEscapes,
83 | b'<' => "<",
84 | b'>' => ">",
85 | b'"' => """,
86 | b'\'' => "'",
87 | b'&' => "&",
88 | b'\n' => "
",
89 | b'\r' => "
",
90 | );
91 |
92 | escapes!( // NOTE(review): every replacement below equals its key, so the entity text (presumably `&lt;`, `&gt;`, `&amp;`) looks decoded by extraction. Confirm against upstream.
93 | PcDataEscapes,
94 | b'<' => "<",
95 | b'>' => ">",
96 | b'&' => "&",
97 | );
98 |
99 | /// Performs escaping of common XML characters inside an attribute value.
100 | ///
101 | /// This function replaces several important markup characters with their
102 | /// entity equivalents:
103 | ///
104 | /// * `<` → `&lt;`
105 | /// * `>` → `&gt;`
106 | /// * `"` → `&quot;`
107 | /// * `'` → `&apos;`
108 | /// * `&` → `&amp;`
109 | ///
110 | /// The following characters are escaped so that attributes are printed on
111 | /// a single line:
112 | /// * `\n` → `&#xA;`
113 | /// * `\r` → `&#xD;`
114 | ///
115 | /// The resulting string is safe to use inside XML attribute values or in PCDATA sections.
116 | ///
117 | /// Does not perform allocations if the given string does not contain escapable characters.
118 | #[inline]
119 | #[must_use]
120 | pub fn escape_str_attribute(s: &str) -> Cow<'_, str> {
121 | escape_str::(s) // NOTE(review): the turbofish argument looks stripped by extraction; presumably it names the `AttributeEscapes` table. Confirm.
122 | }
123 |
124 | /// Performs escaping of common XML characters inside PCDATA.
125 | ///
126 | /// This function replaces several important markup characters with their
127 | /// entity equivalents:
128 | ///
129 | /// * `<` → `&lt;` (and likewise `>` → `&gt;`)
130 | /// * `&` → `&amp;`
131 | ///
132 | /// The resulting string is safe to use inside PCDATA sections but NOT inside attribute values.
133 | ///
134 | /// Does not perform allocations if the given string does not contain escapable characters.
135 | #[inline]
136 | #[must_use]
137 | pub fn escape_str_pcdata(s: &str) -> Cow<'_, str> {
138 | escape_str::(s) // NOTE(review): the turbofish argument looks stripped by extraction; presumably it names the `PcDataEscapes` table. Confirm.
139 | }
140 |
141 | #[cfg(test)]
142 | mod tests { // NOTE(review): the expected strings in these assertions equal their inputs and one literal is split across listing lines, so the entity text looks decoded by extraction. Confirm against upstream.
143 | use super::{escape_str_attribute, escape_str_pcdata};
144 |
145 | #[test]
146 | fn test_escape_str_attribute() {
147 | assert_eq!(escape_str_attribute("<>'\"&\n\r"), "<>'"&
");
148 | assert_eq!(escape_str_attribute("no_escapes"), "no_escapes");
149 | }
150 |
151 | #[test]
152 | fn test_escape_str_pcdata() {
153 | assert_eq!(escape_str_pcdata("<>&"), "<>&");
154 | assert_eq!(escape_str_pcdata("no_escapes"), "no_escapes");
155 | }
156 |
157 | #[test]
158 | fn test_escape_multibyte_code_points() {
159 | assert_eq!(escape_str_attribute("☃<"), "☃<");
160 | assert_eq!(escape_str_pcdata("☃<"), "☃<");
161 | }
162 | }
163 |
--------------------------------------------------------------------------------
/tests/documents/sample_8_full.txt:
--------------------------------------------------------------------------------
1 | StartDocument(1.0, UTF-8)
2 | StartElement(el)
3 | Whitespace("\n")
4 | Comment("ws")
5 | CData("")
6 | Whitespace("\n")
7 | CData("")
8 | Whitespace("\n\n")
9 | Comment("ws")
10 | Comment("ws")
11 | Whitespace("\n")
12 | Comment("ws")
13 | Comment("ws")
14 | Whitespace("\n")
15 | Comment("ws")
16 | CData("")
17 | Comment("ws")
18 | Whitespace("\n")
19 | Comment("ws")
20 | StartElement(br)
21 | EndElement(br)
22 | Comment("ws")
23 | Whitespace("\n")
24 | Comment("ws")
25 | StartElement(s)
26 | EndElement(s)
27 | Comment("ws")
28 | Whitespace("\n")
29 | Comment("ws")
30 | StartElement(s)
31 | EndElement(s)
32 | Comment("ws")
33 | Whitespace("\n")
34 | Comment("ws")
35 | StartElement(s)
36 | CData("")
37 | EndElement(s)
38 | Comment("ws")
39 | Whitespace("\n\n\n")
40 | CData("")
41 | Comment("ws")
42 | Whitespace("\n")
43 | Comment("ws")
44 | Whitespace("\n\n")
45 | CData("")
46 | CData("")
47 | Whitespace("\n")
48 | CData("")
49 | CData("")
50 | Whitespace("\n")
51 | CData("")
52 | Comment("ws")
53 | CData("")
54 | Whitespace("\n")
55 | CData("")
56 | StartElement(br)
57 | EndElement(br)
58 | CData("")
59 | Whitespace("\n")
60 | CData("")
61 | StartElement(s)
62 | EndElement(s)
63 | CData("")
64 | Whitespace("\n")
65 | CData("")
66 | StartElement(s)
67 | EndElement(s)
68 | CData("")
69 | Whitespace("\n")
70 | CData("")
71 | StartElement(s)
72 | Comment("ws")
73 | EndElement(s)
74 | CData("")
75 | Whitespace("\n\n")
76 | Comment("ws")
77 | CData("")
78 | Whitespace("\n")
79 | CData("")
80 | Whitespace("\n\n")
81 | Comment("ws")
82 | Comment("ws")
83 | Whitespace("\n")
84 | Comment("ws")
85 | Comment("ws")
86 | Whitespace("\n")
87 | Comment("ws")
88 | CData("")
89 | Comment("ws")
90 | Whitespace("\n")
91 | Comment("ws")
92 | StartElement(br)
93 | EndElement(br)
94 | Comment("ws")
95 | Whitespace("\n")
96 | Comment("ws")
97 | StartElement(s)
98 | EndElement(s)
99 | Comment("ws")
100 | Whitespace("\n")
101 | Comment("ws")
102 | StartElement(s)
103 | EndElement(s)
104 | Comment("ws")
105 | Whitespace("\n")
106 | Comment("ws")
107 | StartElement(s)
108 | CData("")
109 | EndElement(s)
110 | Comment("ws")
111 | Whitespace("\n\n\n")
112 | CData("")
113 | Comment("ws")
114 | Whitespace("\n")
115 | Comment("ws")
116 | Whitespace("\n\n")
117 | CData("")
118 | CData("")
119 | Whitespace("\n")
120 | CData("")
121 | CData("")
122 | Whitespace("\n")
123 | CData("")
124 | Comment("ws")
125 | CData("")
126 | Whitespace("\n")
127 | CData("")
128 | StartElement(br)
129 | EndElement(br)
130 | CData("")
131 | Whitespace("\n")
132 | CData("")
133 | StartElement(s)
134 | EndElement(s)
135 | CData("")
136 | Whitespace("\n")
137 | CData("")
138 | StartElement(s)
139 | EndElement(s)
140 | CData("")
141 | Whitespace("\n")
142 | CData("")
143 | StartElement(s)
144 | Comment("ws")
145 | EndElement(s)
146 | CData("")
147 | Whitespace("\n\n\n")
148 | Comment("noWS")
149 | Whitespace("\n\n")
150 | Comment("ws")
151 | CData("")
152 | Whitespace("\n")
153 | CData("")
154 | Whitespace("\n\n")
155 | Comment("ws")
156 | Comment("ws")
157 | Whitespace("\n")
158 | Comment("ws")
159 | Comment("ws")
160 | Whitespace("\n")
161 | Comment("ws")
162 | CData("")
163 | Comment("ws")
164 | Whitespace("\n")
165 | Comment("ws")
166 | StartElement(br)
167 | EndElement(br)
168 | Comment("ws")
169 | Whitespace("\n")
170 | Comment("ws")
171 | StartElement(s)
172 | EndElement(s)
173 | Comment("ws")
174 | Whitespace("\n")
175 | Comment("ws")
176 | StartElement(s)
177 | EndElement(s)
178 | Comment("ws")
179 | Whitespace("\n")
180 | Comment("ws")
181 | StartElement(s)
182 | CData("")
183 | EndElement(s)
184 | Comment("ws")
185 | Whitespace("\n\n\n")
186 | CData("")
187 | Comment("ws")
188 | Whitespace("\n")
189 | Comment("ws")
190 | Whitespace("\n\n")
191 | CData("")
192 | CData("")
193 | Whitespace("\n")
194 | CData("")
195 | CData("")
196 | Whitespace("\n")
197 | CData("")
198 | Comment("ws")
199 | CData("")
200 | Whitespace("\n")
201 | CData("")
202 | StartElement(br)
203 | EndElement(br)
204 | CData("")
205 | Whitespace("\n")
206 | CData("")
207 | StartElement(s)
208 | EndElement(s)
209 | CData("")
210 | Whitespace("\n")
211 | CData("")
212 | StartElement(s)
213 | EndElement(s)
214 | CData("")
215 | Whitespace("\n")
216 | CData("")
217 | StartElement(s)
218 | Comment("ws")
219 | EndElement(s)
220 | CData("")
221 | Whitespace("\n\n")
222 | Comment("ws")
223 | CData("")
224 | Whitespace("\n\n")
225 | CData("")
226 | Comment("ws")
227 | Comment("ws")
228 | Comment("ws")
229 | Comment("ws")
230 | Comment("ws")
231 | CData("")
232 | Comment("ws")
233 | Comment("ws")
234 | StartElement(br)
235 | EndElement(br)
236 | Comment("ws")
237 | Comment("ws")
238 | StartElement(s)
239 | EndElement(s)
240 | Comment("ws")
241 | Comment("ws")
242 | StartElement(s)
243 | EndElement(s)
244 | Comment("ws")
245 | Comment("ws")
246 | StartElement(s)
247 | CData("")
248 | EndElement(s)
249 | Comment("ws")
250 | CData("")
251 | Comment("ws")
252 | Comment("ws")
253 | CData("")
254 | CData("")
255 | CData("")
256 | CData("")
257 | CData("")
258 | Comment("ws")
259 | CData("")
260 | CData("")
261 | StartElement(br)
262 | EndElement(br)
263 | CData("")
264 | CData("")
265 | StartElement(s)
266 | EndElement(s)
267 | CData("")
268 | CData("")
269 | StartElement(s)
270 | EndElement(s)
271 | CData("")
272 | CData("")
273 | StartElement(s)
274 | Comment("ws")
275 | EndElement(s)
276 | CData("")
277 | Whitespace("\n")
278 | EndElement(el)
279 | EndDocument
280 |
--------------------------------------------------------------------------------
/tests/documents/sample_8_wsch.txt:
--------------------------------------------------------------------------------
1 | StartDocument(1.0, UTF-8)
2 | StartElement(el)
3 | Characters("\n")
4 | Comment("ws")
5 | CData("")
6 | Characters("\n")
7 | CData("")
8 | Characters("\n\n")
9 | Comment("ws")
10 | Comment("ws")
11 | Characters("\n")
12 | Comment("ws")
13 | Comment("ws")
14 | Characters("\n")
15 | Comment("ws")
16 | CData("")
17 | Comment("ws")
18 | Characters("\n")
19 | Comment("ws")
20 | StartElement(br)
21 | EndElement(br)
22 | Comment("ws")
23 | Characters("\n")
24 | Comment("ws")
25 | StartElement(s)
26 | EndElement(s)
27 | Comment("ws")
28 | Characters("\n")
29 | Comment("ws")
30 | StartElement(s)
31 | EndElement(s)
32 | Comment("ws")
33 | Characters("\n")
34 | Comment("ws")
35 | StartElement(s)
36 | CData("")
37 | EndElement(s)
38 | Comment("ws")
39 | Characters("\n\n\n")
40 | CData("")
41 | Comment("ws")
42 | Characters("\n")
43 | Comment("ws")
44 | Characters("\n\n")
45 | CData("")
46 | CData("")
47 | Characters("\n")
48 | CData("")
49 | CData("")
50 | Characters("\n")
51 | CData("")
52 | Comment("ws")
53 | CData("")
54 | Characters("\n")
55 | CData("")
56 | StartElement(br)
57 | EndElement(br)
58 | CData("")
59 | Characters("\n")
60 | CData("")
61 | StartElement(s)
62 | EndElement(s)
63 | CData("")
64 | Characters("\n")
65 | CData("")
66 | StartElement(s)
67 | EndElement(s)
68 | CData("")
69 | Characters("\n")
70 | CData("")
71 | StartElement(s)
72 | Comment("ws")
73 | EndElement(s)
74 | CData("")
75 | Characters("\n\n")
76 | Comment("ws")
77 | CData("")
78 | Characters("\n")
79 | CData("")
80 | Characters("\n\n")
81 | Comment("ws")
82 | Comment("ws")
83 | Characters("\n")
84 | Comment("ws")
85 | Comment("ws")
86 | Characters("\n")
87 | Comment("ws")
88 | CData("")
89 | Comment("ws")
90 | Characters("\n")
91 | Comment("ws")
92 | StartElement(br)
93 | EndElement(br)
94 | Comment("ws")
95 | Characters("\n")
96 | Comment("ws")
97 | StartElement(s)
98 | EndElement(s)
99 | Comment("ws")
100 | Characters("\n")
101 | Comment("ws")
102 | StartElement(s)
103 | EndElement(s)
104 | Comment("ws")
105 | Characters("\n")
106 | Comment("ws")
107 | StartElement(s)
108 | CData("")
109 | EndElement(s)
110 | Comment("ws")
111 | Characters("\n\n\n")
112 | CData("")
113 | Comment("ws")
114 | Characters("\n")
115 | Comment("ws")
116 | Characters("\n\n")
117 | CData("")
118 | CData("")
119 | Characters("\n")
120 | CData("")
121 | CData("")
122 | Characters("\n")
123 | CData("")
124 | Comment("ws")
125 | CData("")
126 | Characters("\n")
127 | CData("")
128 | StartElement(br)
129 | EndElement(br)
130 | CData("")
131 | Characters("\n")
132 | CData("")
133 | StartElement(s)
134 | EndElement(s)
135 | CData("")
136 | Characters("\n")
137 | CData("")
138 | StartElement(s)
139 | EndElement(s)
140 | CData("")
141 | Characters("\n")
142 | CData("")
143 | StartElement(s)
144 | Comment("ws")
145 | EndElement(s)
146 | CData("")
147 | Characters("\n\n\n")
148 | Comment("noWS")
149 | Characters("\n\n")
150 | Comment("ws")
151 | CData("")
152 | Characters("\n")
153 | CData("")
154 | Characters("\n\n")
155 | Comment("ws")
156 | Comment("ws")
157 | Characters("\n")
158 | Comment("ws")
159 | Comment("ws")
160 | Characters("\n")
161 | Comment("ws")
162 | CData("")
163 | Comment("ws")
164 | Characters("\n")
165 | Comment("ws")
166 | StartElement(br)
167 | EndElement(br)
168 | Comment("ws")
169 | Characters("\n")
170 | Comment("ws")
171 | StartElement(s)
172 | EndElement(s)
173 | Comment("ws")
174 | Characters("\n")
175 | Comment("ws")
176 | StartElement(s)
177 | EndElement(s)
178 | Comment("ws")
179 | Characters("\n")
180 | Comment("ws")
181 | StartElement(s)
182 | CData("")
183 | EndElement(s)
184 | Comment("ws")
185 | Characters("\n\n\n")
186 | CData("")
187 | Comment("ws")
188 | Characters("\n")
189 | Comment("ws")
190 | Characters("\n\n")
191 | CData("")
192 | CData("")
193 | Characters("\n")
194 | CData("")
195 | CData("")
196 | Characters("\n")
197 | CData("")
198 | Comment("ws")
199 | CData("")
200 | Characters("\n")
201 | CData("")
202 | StartElement(br)
203 | EndElement(br)
204 | CData("")
205 | Characters("\n")
206 | CData("")
207 | StartElement(s)
208 | EndElement(s)
209 | CData("")
210 | Characters("\n")
211 | CData("")
212 | StartElement(s)
213 | EndElement(s)
214 | CData("")
215 | Characters("\n")
216 | CData("")
217 | StartElement(s)
218 | Comment("ws")
219 | EndElement(s)
220 | CData("")
221 | Characters("\n\n")
222 | Comment("ws")
223 | CData("")
224 | Characters("\n\n")
225 | CData("")
226 | Comment("ws")
227 | Comment("ws")
228 | Comment("ws")
229 | Comment("ws")
230 | Comment("ws")
231 | CData("")
232 | Comment("ws")
233 | Comment("ws")
234 | StartElement(br)
235 | EndElement(br)
236 | Comment("ws")
237 | Comment("ws")
238 | StartElement(s)
239 | EndElement(s)
240 | Comment("ws")
241 | Comment("ws")
242 | StartElement(s)
243 | EndElement(s)
244 | Comment("ws")
245 | Comment("ws")
246 | StartElement(s)
247 | CData("")
248 | EndElement(s)
249 | Comment("ws")
250 | CData("")
251 | Comment("ws")
252 | Comment("ws")
253 | CData("")
254 | CData("")
255 | CData("")
256 | CData("")
257 | CData("")
258 | Comment("ws")
259 | CData("")
260 | CData("")
261 | StartElement(br)
262 | EndElement(br)
263 | CData("")
264 | CData("")
265 | StartElement(s)
266 | EndElement(s)
267 | CData("")
268 | CData("")
269 | StartElement(s)
270 | EndElement(s)
271 | CData("")
272 | CData("")
273 | StartElement(s)
274 | Comment("ws")
275 | EndElement(s)
276 | CData("")
277 | Characters("\n")
278 | EndElement(el)
279 | EndDocument
280 |
--------------------------------------------------------------------------------
/tests/oasis.fail.txt:
--------------------------------------------------------------------------------
1 | o-p04pass1 p04pass1.xml names with all valid ASCII characters, and one from each other class in NameChar ; 5:8 Element A.-:̀· prefix is unbound
2 | o-p05pass1 p05pass1.xml various valid Name constructions ; 2:8 Element A:._-0 prefix is unbound
3 | o-p09fail1 p09fail1.xml EntityValue excludes '%'
4 | o-p09fail2 p09fail2.xml EntityValue excludes '&'
5 | o-p12fail1 p12fail1.xml '"' excluded
6 | o-p12fail2 p12fail2.xml '\' excluded
7 | o-p12fail3 p12fail3.xml entity references excluded
8 | o-p12fail4 p12fail4.xml '>' excluded
9 | o-p12fail5 p12fail5.xml '<' excluded
10 | o-p12fail6 p12fail6.xml built-in entity refs excluded
11 | o-p12fail7 p12fail7.xml The public ID has a tab character, which is disallowed
12 | o-p30fail1 p30fail1.xml An XML declaration is not the same as a TextDecl
13 | o-p31fail1 p31fail1.xml external subset excludes doctypedecl
14 | o-p45fail2 p45fail2.xml S before contentspec is required.
15 | o-p45fail3 p45fail3.xml only one content spec
16 | o-p45fail4 p45fail4.xml no comments in declarations (contrast with SGML)
17 | o-p46fail1 p46fail1.xml no parens on declared content
18 | o-p46fail2 p46fail2.xml no inclusions (contrast with SGML)
19 | o-p46fail3 p46fail3.xml no exclusions (contrast with SGML)
20 | o-p46fail4 p46fail4.xml no space before occurrence
21 | o-p46fail5 p46fail5.xml single group
22 | o-p46fail6 p46fail6.xml can't be both declared and modeled
23 | o-p47fail1 p47fail1.xml Invalid operator '|' must match previous operator ','
24 | o-p47fail2 p47fail2.xml Illegal character '-' in Element-content model
25 | o-p47fail3 p47fail3.xml Optional character must follow a name or list
26 | o-p47fail4 p47fail4.xml Illegal space before optional character
27 | o-p48fail1 p48fail1.xml Illegal space before optional character
28 | o-p48fail2 p48fail2.xml Illegal space before optional character
29 | o-p51fail1 p51fail1.xml occurrence on #PCDATA group must be *
30 | o-p51fail2 p51fail2.xml occurrence on #PCDATA group must be *
31 | o-p51fail3 p51fail3.xml #PCDATA must come first
32 | o-p51fail4 p51fail4.xml occurrence on #PCDATA group must be *
33 | o-p51fail5 p51fail5.xml only '|' connectors
34 | o-p51fail6 p51fail6.xml Only '|' connectors and occurrence on #PCDATA group must be *
35 | o-p51fail7 p51fail7.xml no nested groups
36 | o-p52fail1 p52fail1.xml A name is required
37 | o-p53fail1 p53fail1.xml S is required before default
38 | o-p53fail2 p53fail2.xml S is required before type
39 | o-p53fail3 p53fail3.xml type is required
40 | o-p53fail4 p53fail4.xml default is required
41 | o-p53fail5 p53fail5.xml name is requried
42 | o-p54fail1 p54fail1.xml don't pass unknown attribute types
43 | o-p55fail1 p55fail1.xml must be upper case
44 | o-p56fail1 p56fail1.xml no IDS type
45 | o-p56fail2 p56fail2.xml no NUMBER type
46 | o-p56fail3 p56fail3.xml no NAME type
47 | o-p56fail4 p56fail4.xml no ENTITYS type - types must be upper case
48 | o-p56fail5 p56fail5.xml types must be upper case
49 | o-p57fail1 p57fail1.xml no keyword for NMTOKEN enumeration
50 | o-p58fail1 p58fail1.xml at least one value required
51 | o-p58fail2 p58fail2.xml separator must be '|'
52 | o-p58fail3 p58fail3.xml notations are NAMEs, not NMTOKENs -- note: Leaving the invalid notation undeclared would cause a validating parser to fail without checking the name syntax, so the notation is declared with an invalid name. A parser that reports error positions should report an error at the AttlistDecl on line 6, before reaching the notation declaration.
53 | o-p58fail4 p58fail4.xml NOTATION must be upper case
54 | o-p58fail5 p58fail5.xml S after keyword is required
55 | o-p58fail6 p58fail6.xml parentheses are require
56 | o-p58fail7 p58fail7.xml values are unquoted
57 | o-p58fail8 p58fail8.xml values are unquoted
58 | o-p59fail1 p59fail1.xml at least one required
59 | o-p59fail2 p59fail2.xml separator must be ","
60 | o-p59fail3 p59fail3.xml values are unquoted
61 | o-p60fail1 p60fail1.xml keywords must be upper case
62 | o-p60fail2 p60fail2.xml S is required after #FIXED
63 | o-p60fail3 p60fail3.xml only #FIXED has both keyword and value
64 | o-p60fail4 p60fail4.xml #FIXED required value
65 | o-p60fail5 p60fail5.xml only one default type
66 | o-p61fail1 p61fail1.xml no other types, including TEMP, which is valid in SGML
67 | o-p62fail1 p62fail1.xml INCLUDE must be upper case
68 | o-p62fail2 p62fail2.xml no spaces in terminating delimiter
69 | o-p63fail1 p63fail1.xml IGNORE must be upper case
70 | o-p63fail2 p63fail2.xml delimiters must be balanced
71 | o-p64fail1 p64fail1.xml section delimiters must balance
72 | o-p64fail2 p64fail2.xml section delimiters must balance
73 | o-p72fail2 p72fail2.xml S is required after '%'
74 | o-p73fail2 p73fail2.xml Only one replacement value
75 | o-p73fail3 p73fail3.xml No NDataDecl on replacement text
76 | o-p74fail1 p74fail1.xml no NDataDecls on parameter entities
77 | o-p74fail3 p74fail3.xml only one value
78 | o-p75fail1 p75fail1.xml S required after "PUBLIC"
79 | o-p75fail2 p75fail2.xml S required after "SYSTEM"
80 | o-p75fail3 p75fail3.xml S required between literals
81 | o-p75fail4 p75fail4.xml "SYSTEM" implies only one literal
82 | o-p75fail5 p75fail5.xml only one keyword
83 | o-p75fail6 p75fail6.xml "PUBLIC" requires two literals (contrast with SGML)
84 | o-p76fail1 p76fail1.xml S is required before "NDATA"
85 | o-p76fail2 p76fail2.xml "NDATA" is upper-case
86 | o-p76fail3 p76fail3.xml notation name is required
87 | o-p76fail4 p76fail4.xml notation names are Names
88 | o-p11pass1 p11pass1.xml system literals may not contain URI fragments
89 |
--------------------------------------------------------------------------------
/src/reader/parser/inside_processing_instruction.rs:
--------------------------------------------------------------------------------
1 | use crate::common::{is_name_char, is_name_start_char, is_whitespace_char};
2 | use crate::reader::error::SyntaxError;
3 |
4 | use crate::reader::events::XmlEvent;
5 | use crate::reader::lexer::Token;
6 |
7 | use super::{DeclarationSubstate, Encountered, ProcessingInstructionSubstate, PullParser, Result, State};
8 |
9 | impl PullParser {
10 | pub fn inside_processing_instruction(&mut self, t: Token, s: ProcessingInstructionSubstate) -> Option {
11 | match s {
12 | ProcessingInstructionSubstate::PIInsideName => match t {
13 | Token::Character(c) if self.buf.is_empty() && is_name_start_char(c) ||
14 | self.buf_has_data() && is_name_char(c) => {
15 | if self.buf.len() > self.config.max_name_length {
16 | return Some(self.error(SyntaxError::ExceededConfiguredLimit));
17 | }
18 | self.buf.push(c);
19 | None
20 | },
21 |
22 | Token::ProcessingInstructionEnd => {
23 | // self.buf contains PI name
24 | let name = self.take_buf();
25 |
26 | // Don't need to check for declaration because it has mandatory attributes
27 | // but there is none
28 | match &*name {
29 | // Name is empty, it is an error
30 | "" => Some(self.error(SyntaxError::ProcessingInstructionWithoutName)),
31 |
32 | // Found
35 | Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))),
36 |
37 | // All is ok, emitting event
38 | _ => {
39 | debug_assert!(self.next_event.is_none(), "{:?}", self.next_event);
40 | // can't have a PI before ` {
59 | // self.buf contains PI name
60 | let name = self.take_buf();
61 |
62 | match &*name {
63 | // We have not ever encountered an element and have not parsed XML declaration
64 | "xml" if self.encountered == Encountered::None =>
65 | self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeVersion)),
66 |
67 | // Found
70 | Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))),
71 |
72 | // All is ok, starting parsing PI data
73 | _ => {
74 | self.data.name = name;
75 | // can't have a PI before ` {
83 | let buf = self.take_buf();
84 | Some(self.error(SyntaxError::UnexpectedProcessingInstruction(buf.into(), t)))
85 | },
86 | },
87 |
88 | ProcessingInstructionSubstate::PIInsideData => match t {
89 | Token::ProcessingInstructionEnd => {
90 | let name = self.data.take_name();
91 | let data = self.take_buf();
92 | self.into_state_emit(
93 | State::OutsideTag,
94 | Ok(XmlEvent::ProcessingInstruction { name, data: Some(data) }),
95 | )
96 | },
97 |
98 | Token::Character(c) if !self.is_valid_xml_char(c) => {
99 | Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
100 | },
101 |
102 | // Any other token should be treated as plain characters
103 | _ => {
104 | if self.buf.len() > self.config.max_data_length {
105 | return Some(self.error(SyntaxError::ExceededConfiguredLimit));
106 | }
107 | t.push_to_string(&mut self.buf);
108 | None
109 | },
110 | },
111 | }
112 | }
113 | }
114 |
--------------------------------------------------------------------------------
/src/common.rs:
--------------------------------------------------------------------------------
1 | //! Contains common types and functions used throughout the library.
2 |
3 | use std::fmt;
4 |
5 | /// Represents a position inside some textual document (zero-based row and column).
6 | #[derive(Copy, Clone, PartialEq, Eq)]
7 | pub struct TextPosition {
8 |     #[doc(hidden)]
9 |     pub row: u64, // zero-based line index
10 |
11 |     #[doc(hidden)]
12 |     pub column: u64, // zero-based column index within the row
13 | }
14 |
15 | impl TextPosition {
16 |     /// Creates a new position initialized to the beginning of the document
17 |     #[inline]
18 |     #[must_use]
19 |     pub const fn new() -> Self {
20 |         Self { row: 0, column: 0 }
21 |     }
22 |
23 |     /// Advances the position in a line by `count` columns
24 |     #[inline]
25 |     pub fn advance(&mut self, count: u8) {
26 |         self.column += u64::from(count);
27 |     }
28 |
29 |     #[doc(hidden)]
30 |     #[deprecated]
31 |     pub fn advance_to_tab(&mut self, width: u8) {
32 |         let width = u64::from(width); // tab stops sit at multiples of `width`
33 |         self.column += self.column.checked_rem(width).map_or(0, |past| width - past); // width 0: no stops, stay put instead of panicking
34 |     }
35 |
36 |     /// Advances the position to the beginning of the next line
37 |     #[inline]
38 |     pub fn new_line(&mut self) {
39 |         self.column = 0;
40 |         self.row += 1;
41 |     }
42 |
43 |     /// Row, counting from 0. Add 1 to display as users expect!
44 |     #[must_use]
45 |     pub fn row(&self) -> u64 {
46 |         self.row
47 |     }
48 |
49 |     /// Column, counting from 0. Add 1 to display as users expect!
50 |     #[must_use]
51 |     pub fn column(&self) -> u64 {
52 |         self.column
53 |     }
54 | }
55 |
56 | impl fmt::Debug for TextPosition {
57 |     fn fmt(&self, out: &mut fmt::Formatter<'_>) -> fmt::Result {
58 |         <Self as fmt::Display>::fmt(self, out) // debug output reuses the `Display` text
59 |     }
60 | }
61 |
62 | impl fmt::Display for TextPosition {
63 |     #[inline]
64 |     fn fmt(&self, out: &mut fmt::Formatter<'_>) -> fmt::Result {
65 |         out.write_fmt(format_args!("{}:{}", self.row + 1, self.column + 1)) // stored zero-based, shown one-based
66 |     }
67 | }
68 |
69 | /// Gets the position in the document corresponding to the object.
70 | ///
71 | /// This trait is implemented by parsers, lexers and errors, and by `TextPosition` itself.
72 | pub trait Position {
73 | /// Returns the current position or a position corresponding to the object.
74 | fn position(&self) -> TextPosition;
75 | }
76 |
77 | impl Position for TextPosition {
78 |     #[inline]
79 |     fn position(&self) -> TextPosition {
80 |         Self { row: self.row, column: self.column } // a position is its own position
81 |     }
82 | }
83 |
84 | /// XML version enumeration.
85 | #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] // derived ordering follows declaration order: Version10 < Version11
86 | pub enum XmlVersion {
87 | /// XML version 1.0.
88 | Version10,
89 |
90 | /// XML version 1.1.
91 | Version11,
92 | }
93 |
94 | impl fmt::Display for XmlVersion {
95 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
96 |         f.pad(match self { // `pad` is what `str`'s `Display` does, so width/precision flags still apply
97 |             Self::Version10 => "1.0",
98 |             Self::Version11 => "1.1",
99 |         })
100 |     }
101 | }
102 |
103 | impl fmt::Debug for XmlVersion {
104 |     fn fmt(&self, out: &mut fmt::Formatter<'_>) -> fmt::Result {
105 |         <Self as fmt::Display>::fmt(self, out) // `{:?}` prints the bare version number, like `{}`
106 |     }
107 | }
108 |
109 | /// Tells whether `c` is one of the four XML white space (`S`) characters
110 | /// (space, tab, line feed, carriage return); see XML 1.1, [section 2.3][1].
111 | ///
112 | /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn
113 | #[must_use]
114 | #[inline]
115 | pub const fn is_whitespace_char(c: char) -> bool {
116 |     c == ' ' || c == '\t' || c == '\n' || c == '\r'
117 | }
118 |
119 | /// Tells whether `s` consists solely of XML white space (`S`) characters:
120 | /// space, tab, line feed and carriage return. An empty string qualifies.
121 | /// All four are ASCII, so the check can run over the raw bytes.
122 | pub fn is_whitespace_str(s: &str) -> bool {
123 |     !s.bytes().any(|b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
124 | }
125 |
126 | /// Reports whether `c` is a valid character in XML 1.0
127 | #[must_use]
128 | pub const fn is_xml10_char(c: char) -> bool {
129 |     match c { '\t' | '\n' | '\r' | ' '..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'.. => true, _ => false }
130 | }
131 |
132 | /// Reports whether `c` is a valid character in XML 1.1
133 | #[must_use]
134 | pub const fn is_xml11_char(c: char) -> bool {
135 |     match c { '\u{01}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'.. => true, _ => false }
136 | }
137 |
138 | /// Reports whether `c` is valid in XML 1.1 and outside the restricted character set
139 | #[must_use]
140 | pub const fn is_xml11_char_not_restricted(c: char) -> bool {
141 |     let restricted = matches!(c, '\u{01}'..='\u{08}' | '\u{0B}' | '\u{0C}' | '\u{0E}'..='\u{1F}' | '\u{7F}'..='\u{84}' | '\u{86}'..='\u{9F}');
142 |     !restricted && is_xml11_char(c)
143 | }
144 |
145 | /// Tells whether `c` may begin a name, i.e. is a `NameStartChar`
146 | /// in the sense of the XML 1.1 specification, [section 2.3][1].
147 | ///
148 | /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn
149 | #[must_use]
150 | pub const fn is_name_start_char(c: char) -> bool {
151 |     match c {
152 |         ':' | '_' | 'A'..='Z' | 'a'..='z'
153 |         | '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}'
154 |         | '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}'
155 |         | '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}'
156 |         | '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}'
157 |         | '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' | '\u{10000}'..='\u{EFFFF}' => true,
158 |         _ => false,
159 |     }
160 | }
161 |
162 | /// Tells whether `c` may appear inside a name, i.e. is a `NameChar`
163 | /// in the sense of the XML 1.1 specification, [section 2.3][1].
164 | ///
165 | /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn
166 | #[must_use]
167 | pub const fn is_name_char(c: char) -> bool {
168 |     match c {
169 |         // characters that may continue a name but not start one
170 |         '-' | '.' | '0'..='9' | '\u{B7}' => true,
171 |         '\u{300}'..='\u{36F}' | '\u{203F}'..='\u{2040}' => true,
172 |         // everything else is a name character only if it can also start a name
173 |         _ => is_name_start_char(c),
174 |     }
175 | }
176 |
--------------------------------------------------------------------------------
/src/reader.rs:
--------------------------------------------------------------------------------
1 | //! Contains high-level interface for a pull-based XML parser.
2 | //!
3 | //! The most important type in this module is `EventReader`, which provides an iterator
4 | //! view for events in XML document.
5 |
6 | use std::io::Read;
7 | use std::iter::FusedIterator;
8 | use std::result;
9 |
10 | use crate::common::{Position, TextPosition};
11 |
12 | pub use self::config::ParserConfig;
13 | pub use self::error::{Error, ErrorKind};
14 | pub use self::events::XmlEvent;
15 |
16 | // back compat
17 | #[doc(hidden)]
18 | #[deprecated(note = "Merged into ParserConfig")]
19 | pub type ParserConfig2 = ParserConfig;
20 |
21 | use self::parser::PullParser;
22 |
23 | mod config;
24 | mod error;
25 | mod events;
26 | mod indexset;
27 | mod lexer;
28 | mod parser;
29 |
30 | /// A result type yielded by `XmlReader`.
31 | pub type Result = result::Result;
32 |
33 | /// A wrapper around an `std::io::Read` instance which provides pull-based XML parsing.
34 | ///
35 | /// The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow.
36 | pub struct EventReader {
37 | source: R, // the wrapped stream; lent to the parser on every `next` call
38 | parser: PullParser, // parser state, built from the configuration in `new_with_config`
39 | }
40 |
41 | impl EventReader {
42 | /// Creates a new reader, consuming the given stream. The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow.
43 | #[inline]
44 | pub fn new(source: R) -> Self {
45 | Self::new_with_config(source, ParserConfig::new())
46 | }
47 |
48 | /// Creates a new reader with the provided configuration, consuming the given stream. The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow.
49 | #[inline]
50 | pub fn new_with_config(source: R, config: impl Into) -> Self {
51 | Self {
52 | source,
53 | parser: PullParser::new(config),
54 | }
55 | }
56 |
57 | /// Pulls and returns next XML event from the stream.
58 | ///
59 | /// If this returns [Err] or [`XmlEvent::EndDocument`] then further calls to
60 | /// this method will return this event again.
61 | #[inline]
62 | #[allow(clippy::should_implement_trait)] // deliberately named `next` although this is an inherent method
63 | pub fn next(&mut self) -> Result {
64 | self.parser.next(&mut self.source)
65 | }
66 |
67 | /// Skips all XML events until the next end tag at the current level.
68 | ///
69 | /// Convenience function that is useful for the case where you have
70 | /// encountered a start tag that is of no interest and want to
71 | /// skip the entire XML subtree until the corresponding end tag. Errors from `next` are propagated; reaching `EndDocument` first is reported as `ErrorKind::UnexpectedEof`.
72 | #[inline]
73 | pub fn skip(&mut self) -> Result<()> {
74 | let mut depth = 1; // the element being skipped counts as already open
75 |
76 | while depth > 0 {
77 | match self.next()? {
78 | XmlEvent::StartElement { .. } => depth += 1,
79 | XmlEvent::EndElement { .. } => depth -= 1,
80 | XmlEvent::EndDocument => return Err(Error { // document ended before the matching end tag
81 | kind: ErrorKind::UnexpectedEof,
82 | pos: self.parser.position(),
83 | }),
84 | _ => {},
85 | }
86 | }
87 |
88 | Ok(())
89 | }
90 |
91 | /// Access underlying reader (shared borrow)
92 | ///
93 | /// Using it directly while the event reader is parsing is not recommended
94 | pub fn source(&self) -> &R { &self.source }
95 |
96 | /// Access underlying reader (mutable borrow)
97 | ///
98 | /// Using it directly while the event reader is parsing is not recommended
99 | pub fn source_mut(&mut self) -> &mut R { &mut self.source }
100 |
101 | /// Unwraps this `EventReader`, returning the underlying reader.
102 | ///
103 | /// Note that this operation is destructive; unwrapping the reader and wrapping it
104 | /// again with `EventReader::new()` will create a fresh reader which will attempt
105 | /// to parse an XML document from the beginning.
106 | pub fn into_inner(self) -> R {
107 | self.source
108 | }
109 |
110 | /// Returns the DOCTYPE of the document if it has already been seen
111 | ///
112 | /// Available only after the root `StartElement` event
113 | #[inline]
114 | #[deprecated(note = "there is `XmlEvent::Doctype` now")]
115 | #[allow(deprecated)]
116 | pub fn doctype(&self) -> Option<&str> {
117 | self.parser.doctype()
118 | }
119 | }
120 |
121 | impl Position for EventReader {
122 | /// Returns the position of the last event produced by the reader.
123 | #[inline]
124 | fn position(&self) -> TextPosition {
125 | self.parser.position() // delegates to the parser's own position
126 | }
127 | }
128 |
129 | impl IntoIterator for EventReader {
130 | type IntoIter = Events;
131 | type Item = Result;
132 |
133 | fn into_iter(self) -> Events {
134 | Events { reader: self, finished: false } // `finished` is set by `Events::next` after `EndDocument` or an error
135 | }
136 | }
137 |
138 | /// An iterator over XML events created from some type implementing `Read`.
139 | ///
140 | /// When the next event is `xml::event::Error` or `xml::event::EndDocument`, then
141 | /// it will be returned by the iterator once, and then it will stop producing events.
142 | pub struct Events {
143 | reader: EventReader, // the reader every event is pulled from
144 | finished: bool, // true once `EndDocument` or an error has been yielded
145 | }
146 |
147 | impl Events {
148 | /// Unwraps the iterator, returning the internal `EventReader`.
149 | #[inline]
150 | pub fn into_inner(self) -> EventReader {
151 | self.reader
152 | }
153 |
154 | /// Access the underlying reader (shared borrow)
155 | ///
156 | /// It's not recommended to use it while the events are still being parsed
157 | pub fn source(&self) -> &R { &self.reader.source }
158 |
159 | /// Access the underlying reader (mutable borrow)
160 | ///
161 | /// It's not recommended to use it while the events are still being parsed
162 | pub fn source_mut(&mut self) -> &mut R { &mut self.reader.source }
163 | }
164 |
165 | impl FusedIterator for Events { // marker impl: adds no methods
166 | }
167 |
168 | impl Iterator for Events {
169 | type Item = Result;
170 |
171 | #[inline]
172 | fn next(&mut self) -> Option> {
173 | if self.finished && !self.reader.parser.is_ignoring_end_of_stream() { // once finished, stop, unless the parser reports it is ignoring end of stream
174 | None
175 | } else {
176 | let ev = self.reader.next();
177 | if let Ok(XmlEvent::EndDocument) | Err(_) = ev { // terminal events are still yielded, then mark the iterator finished
178 | self.finished = true;
179 | }
180 | Some(ev)
181 | }
182 | }
183 | }
184 |
185 | impl<'r> EventReader<&'r [u8]> {
186 | /// A convenience method to create an `EventReader` from a string slice.
187 | #[inline]
188 | #[must_use]
189 | #[allow(clippy::should_implement_trait)] // inherent `from_str`, not the `FromStr` trait method
190 | pub fn from_str(source: &'r str) -> Self {
191 | EventReader::new(source.as_bytes()) // the reader borrows the string's bytes
192 | }
193 | }
194 |
--------------------------------------------------------------------------------
/src/writer/config.rs:
--------------------------------------------------------------------------------
1 | //! Contains emitter configuration structure.
2 |
3 | use crate::writer::EventWriter;
4 | use std::borrow::Cow;
5 | use std::io::Write;
6 |
7 | /// Emitter configuration structure.
8 | ///
9 | /// This structure contains various options which control XML document emitter behavior.
10 | #[derive(Clone, PartialEq, Eq, Debug)]
11 | pub struct EmitterConfig {
12 | /// Line separator used to separate lines in formatted output. Default is `"\n"`.
13 | pub line_separator: Cow<'static, str>,
14 |
15 | /// A string which will be used for a single level of indentation. Default is `"  "`
16 | /// (two spaces).
17 | pub indent_string: Cow<'static, str>,
18 |
19 | /// Whether or not the emitted document should be indented. Default is false.
20 | ///
21 | /// The emitter is capable of performing automatic indentation of the emitted XML document.
22 | /// It is done in stream-like fashion and does not require the knowledge of the whole
23 | /// document in advance.
24 | ///
25 | /// Sometimes, however, automatic indentation is undesirable, e.g. when you want to keep
26 | /// existing layout when processing an existing XML document. Also the indentation algorithm
27 | /// is not thoroughly tested. Hence by default it is disabled.
28 | pub perform_indent: bool,
29 |
30 | /// Whether or not characters in output events will be escaped. Default is true.
31 | ///
32 | /// The emitter can automatically escape characters which can't appear in PCDATA sections
33 | /// or element attributes of an XML document, like `<` or `"` (in attributes). This may
34 | /// introduce some overhead because then every corresponding piece of character data
35 | /// should be scanned for invalid characters.
36 | ///
37 | /// If this option is disabled, the XML writer may produce non-well-formed documents, so
38 | /// use `false` value for this option with care.
39 | pub perform_escaping: bool,
40 |
41 | /// Whether or not to write XML document declaration at the beginning of a document.
42 | /// Default is true.
43 | ///
44 | /// This option controls whether the document declaration should be emitted automatically
45 | /// before a root element is written if it was not emitted explicitly by the user.
46 | pub write_document_declaration: bool,
47 |
48 | /// Whether or not to convert elements with empty content to empty elements. Default is true.
49 | ///
50 | /// This option allows turning elements like `<a></a>` (an element with empty content)
51 | /// into `<a/>` (an empty element).
52 | pub normalize_empty_elements: bool,
53 |
54 | /// Whether or not to emit CDATA events as plain characters. Default is false.
55 | ///
56 | /// This option forces the emitter to convert CDATA events into regular character events,
57 | /// performing all the necessary escaping beforehand. This may be occasionally useful
58 | /// for feeding the document into incorrect parsers which do not support CDATA.
59 | pub cdata_to_characters: bool,
60 |
61 | /// Whether or not to keep element names to support `EndElement` events without explicit names.
62 | /// Default is true.
63 | ///
64 | /// This option makes the emitter keep names of written elements in order to allow
65 | /// omitting names when writing closing element tags. This could incur some memory overhead.
66 | pub keep_element_names_stack: bool,
67 |
68 | /// Whether or not to automatically insert leading and trailing spaces in emitted comments,
69 | /// if necessary. Default is true.
70 | ///
71 | /// This is a convenience option so that the user does not have to add spaces before and after
72 | /// the comment text to get prettier comments: `<!-- something -->` instead of
73 | /// `<!--something-->`.
74 | pub autopad_comments: bool,
75 |
76 | /// Whether or not to automatically insert spaces before the trailing `/>` in self-closing
77 | /// elements. Default is true.
78 | ///
79 | /// This option is only meaningful if `normalize_empty_elements` is true. For example, the
80 | /// element `<a></a>` would be unaffected. When `normalize_empty_elements` is true, then when
81 | /// this option is also true, the same element would appear `<a />`. If this option is false,
82 | /// then the same element would appear `<a/>`.
83 | pub pad_self_closing: bool,
84 | }
85 |
86 | impl EmitterConfig {
87 |     /// Creates an emitter configuration with default values.
88 |     ///
89 |     /// You can tweak default options with builder-like pattern:
90 |     ///
91 |     /// ```rust
92 |     /// use xml::writer::EmitterConfig;
93 |     ///
94 |     /// let config = EmitterConfig::new()
95 |     ///     .line_separator("\r\n")
96 |     ///     .perform_indent(true)
97 |     ///     .normalize_empty_elements(false);
98 |     /// ```
99 |     #[inline]
100 |     #[must_use]
101 |     pub fn new() -> Self {
102 |         Self {
103 |             line_separator: "\n".into(),
104 |             indent_string: "  ".into(), // two spaces, as documented on the field
105 |             perform_indent: false,
106 |             perform_escaping: true,
107 |             write_document_declaration: true,
108 |             normalize_empty_elements: true,
109 |             cdata_to_characters: false,
110 |             keep_element_names_stack: true,
111 |             autopad_comments: true,
112 |             pad_self_closing: true,
113 |         }
114 |     }
115 |
116 |     /// Creates an XML writer with this configuration.
117 |     ///
118 |     /// This is a convenience method for configuring and creating a writer at the same time:
119 |     ///
120 |     /// ```rust
121 |     /// use xml::writer::EmitterConfig;
122 |     ///
123 |     /// let mut target: Vec<u8> = Vec::new();
124 |     ///
125 |     /// let writer = EmitterConfig::new()
126 |     ///     .line_separator("\r\n")
127 |     ///     .perform_indent(true)
128 |     ///     .normalize_empty_elements(false)
129 |     ///     .create_writer(&mut target);
130 |     /// ```
131 |     ///
132 |     /// This method is exactly equivalent to calling `EventWriter::new_with_config()` with
133 |     /// this configuration object.
134 |     #[inline]
135 |     pub fn create_writer<W: Write>(self, sink: W) -> EventWriter<W> {
136 |         EventWriter::new_with_config(sink, self)
137 |     }
138 | }
139 |
140 | impl Default for EmitterConfig {
141 |     #[inline]
142 |     fn default() -> Self {
143 |         EmitterConfig::new() // the defaults live in one place: `new`
144 |     }
145 | }
146 |
147 | // Builder-style setters, one per public option of `EmitterConfig`.
148 | gen_setters!(EmitterConfig,
149 |     line_separator: into Cow<'static, str>,
150 |     indent_string: into Cow<'static, str>,
151 |     perform_indent: val bool,
152 |     perform_escaping: val bool,
153 |     write_document_declaration: val bool,
154 |     normalize_empty_elements: val bool,
155 |     cdata_to_characters: val bool,
156 |     keep_element_names_stack: val bool,
157 |     autopad_comments: val bool,
158 |     pad_self_closing: val bool
159 | );
160 |
--------------------------------------------------------------------------------
/src/reader/parser/inside_opening_tag.rs:
--------------------------------------------------------------------------------
1 | use crate::attribute::OwnedAttribute;
2 | use crate::common::{is_name_start_char, is_whitespace_char};
3 | use crate::namespace;
4 | use crate::reader::error::SyntaxError;
5 |
6 | use crate::reader::lexer::Token;
7 |
8 | use super::{OpeningTagSubstate, PullParser, QualifiedNameTarget, Result, State};
9 |
10 | impl PullParser {
11 | pub fn inside_opening_tag(&mut self, t: Token, s: OpeningTagSubstate) -> Option { // handles one token `t` while in opening-tag sub-state `s`
12 | let max_attrs = self.config.max_attributes; // read up front; used by the attribute-value closure below
13 | match s {
14 | OpeningTagSubstate::InsideName => self.read_qualified_name(t, QualifiedNameTarget::OpeningTagNameTarget, |this, token, name| {
15 | match name.prefix_ref() {
16 | Some(prefix) if prefix == namespace::NS_XML_PREFIX ||
17 | prefix == namespace::NS_XMLNS_PREFIX => // element names may not use either reserved prefix
18 | Some(this.error(SyntaxError::InvalidNamePrefix(prefix.into()))),
19 | _ => {
20 | this.data.element_name = Some(name.clone());
21 | match token {
22 | Token::TagEnd => this.emit_start_element(false),
23 | Token::EmptyTagEnd => this.emit_start_element(true),
24 | Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)),
25 | _ => {
26 | debug_assert!(false, "unreachable");
27 | None
28 | },
29 | }
30 | }
31 | }
32 | }),
33 |
34 | OpeningTagSubstate::InsideTag => match t {
35 | Token::TagEnd => self.emit_start_element(false),
36 | Token::EmptyTagEnd => self.emit_start_element(true),
37 | Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
38 | Token::Character(c) if is_name_start_char(c) => { // first character of an attribute name
39 | if self.buf.len() > self.config.max_name_length {
40 | return Some(self.error(SyntaxError::ExceededConfiguredLimit));
41 | }
42 | self.buf.push(c);
43 | self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName))
44 | },
45 | _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))),
46 | },
47 |
48 | OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
49 | // check that no attribute with such name is already present
50 | // if there is one, XML is not well-formed
51 | if this.data.attributes.contains(&name) {
52 | return Some(this.error(SyntaxError::RedefinedAttribute(name.to_string().into())))
53 | }
54 |
55 | this.data.attr_name = Some(name);
56 | match token {
57 | Token::EqualsSign => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)),
58 | Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)),
59 | _ => Some(this.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) // likely unreachable; NOTE(review): reports the outer `t`, presumably the same token as `token` - confirm
60 | }
61 | }),
62 |
63 | OpeningTagSubstate::AfterAttributeName => match t {
64 | Token::EqualsSign => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)),
65 | Token::Character(c) if is_whitespace_char(c) => None,
66 | _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t)))
67 | },
68 |
69 | OpeningTagSubstate::InsideAttributeValue => self.read_attribute_value(t, |this, value| {
70 | let name = this.data.take_attr_name()?; // will always succeed here
71 | match name.prefix_ref() {
72 | // declaring a new prefix; it is sufficient to check prefix only
73 | // because "xmlns" prefix is reserved
74 | Some(namespace::NS_XMLNS_PREFIX) => {
75 | let ln = &*name.local_name;
76 | if ln == namespace::NS_XMLNS_PREFIX {
77 | Some(this.error(SyntaxError::CannotRedefineXmlnsPrefix))
78 | } else if ln == namespace::NS_XML_PREFIX && &*value != namespace::NS_XML_URI {
79 | Some(this.error(SyntaxError::CannotRedefineXmlPrefix))
80 | } else if value.is_empty() {
81 | Some(this.error(SyntaxError::CannotUndefinePrefix(ln.into())))
82 | } else {
83 | this.nst.put(name.local_name.clone(), value); // not stored as an attribute, so not counted against `max_attrs`
84 | this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
85 | }
86 | },
87 |
88 | // declaring default namespace
89 | None if &*name.local_name == namespace::NS_XMLNS_PREFIX =>
90 | match &*value {
91 | namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX | namespace::NS_XML_URI | namespace::NS_XMLNS_URI =>
92 | Some(this.error(SyntaxError::InvalidDefaultNamespace(value.into()))),
93 | _ => {
94 | this.nst.put(namespace::NS_NO_PREFIX, value.clone());
95 | this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
96 | }
97 | },
98 |
99 | // regular attribute
100 | _ => {
101 | if this.data.attributes.len() >= max_attrs { // refuse to store more than the configured number of attributes
102 | return Some(this.error(SyntaxError::ExceededConfiguredLimit));
103 | }
104 | this.data.attributes.push(OwnedAttribute { name, value });
105 | this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
106 | },
107 | }
108 | }),
109 |
110 | OpeningTagSubstate::AfterAttributeValue => match t {
111 | Token::Character(c) if is_whitespace_char(c) => { // whitespace is the only way back to `InsideTag`, where the next attribute may start
112 | self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag))
113 | },
114 | Token::TagEnd => self.emit_start_element(false),
115 | Token::EmptyTagEnd => self.emit_start_element(true),
116 | _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))),
117 | },
118 | }
119 | }
120 | }
121 |
--------------------------------------------------------------------------------
/Changelog.md:
--------------------------------------------------------------------------------
1 | ## Version 1.0.0
2 |
3 | * Added `Doctype` event
4 | * Marked structs as `#[non_exhaustive]`
5 | * Merged `ParserConfig2` back into `ParserConfig`
6 | * Added option to the writer to pass through XML markup unmodified
7 | * `xml-analyze` binary has been moved to examples
8 | * Writer escapes `--` in comments and `]]>` in CDATA
9 |
10 | ## Version 0.8.27
11 |
12 | * Added detection of invalid `<` in attributes
13 |
14 | ## Version 0.8.26
15 |
16 | * Fixed buffering of files with a broken UTF-16 encoding
17 |
18 | ## Version 0.8.25
19 |
20 | * `TryFrom` for converting from reader to writer events, to make `.as_writer_event()` more discoverable.
21 |
22 | ## Version 0.8.24
23 |
24 | * Fixed reporting of line/column position of CDATA when trimming whitespace
25 |
26 | ## Version 0.8.23
27 |
28 | * StartDocument event will consistently use uppercase "UTF-8" name for encoding when the document did not declare it explicitly, but beware that documents can still use lowercase encoding names, so you must always use case-insensitive comparisons.
29 |
30 | ## Version 0.8.22
31 |
32 | * Ability to retrieve the whole DOCTYPE. For backwards compatibility, it's a getter on the reader, not an event.
33 |
34 | ## Version 0.8.21
35 |
36 | * Added `EventWriter::inner_ref`
37 | * ~15% performance improvement
38 |
39 | ## Version 0.8.20
40 |
41 | * Fixed escaping of literal `]]>` in CDATA
42 |
43 | ## Version 0.8.19
44 |
45 | * Fixed whitespace event when parsing DOCTYPE with internal subset
46 |
47 | ## Version 0.8.18
48 |
49 | * Option to tolerate invalid entities and chars
50 |
51 | ## Version 0.8.17
52 |
53 | * Added configuration for document size/complexity limits.
54 |
55 | ## Version 0.8.16
56 |
57 | * Fixed error line numbers when parsing CDATA as characters
58 |
59 | ## Version 0.8.15
60 |
61 | * Improved speed of parsing elements with huge number of attributes
62 |
63 | ## Version 0.8.14
64 |
65 | * Fixed error line numbers when ignoring comments
66 |
67 | ## Version 0.8.13
68 |
69 | * Backward-compatibility fix
70 |
71 | ## Version 0.8.12
72 |
73 | * Improved conformance of parsing invalid codepoints, XML prolog
74 | * Reduced number of allocations
75 |
76 | ## Version 0.8.11
77 |
78 | * Improved conformance of PI
79 | * Forbidden invalid multiple root elements, unless an option allowing them is enabled.
80 |
81 | ## Version 0.8.10
82 |
83 | * Improved parsing conformance
84 | * Internal error handling improvements
85 |
86 | ## Version 0.8.9
87 |
88 | * Added support for UTF-16 and ASCII
89 | * Fixed CDATA parsing
90 | * Added PE entities parsing
91 |
92 | ## Version 0.8.8
93 |
94 | * Added recursive entity expansion (with length protection)
95 | * Expanded parsing of DTD
96 |
97 | ## Version 0.8.7
98 |
99 | * Basic parsing of DTD internal subset
100 | * Speed improvements
101 |
102 | ## Version 0.8.6
103 |
104 | * Fixed parsing of incorrectly nested comments and processing instructions
105 |
106 | ## Version 0.8.5
107 |
108 | * Updated source code to edition 2018 and fixed/updated some Rust idioms.
109 |
110 | ## Version 0.8.4
111 |
112 | * Fixed recognition of `?>`, `]]>` and `/>` tokens as characters.
113 | * Fixed writer output operations to use `write_all` to ensure that the data
114 | is written fully.
115 | * The document declaration is now written before any characters automatically.
116 |
117 | ## Version 0.8.3
118 |
119 | * Added a new parser option, `ignore_root_level_whitespace`, which makes the parser
120 | skip emitting whitespace events outside of the root element when set to `true`.
121 | This helps with certain tasks like canonicalization.
122 |
123 | ## Version 0.8.2
124 |
125 | * Added a new parser option, `replace_unknown_entity_references`, which allows ignoring
126 | invalid Unicode code points and replacing them with a Unicode "replacement character"
127 | during parsing. This can be helpful to deal with e.g. UTF-16 surrogate pairs.
128 | * Added a new emitter option, `pad_self_closing`, which determines the style of the self-closing
129 | elements when they are emitted: `<a />` (`true`) vs `<a/>` (`false`).
130 |
131 | ## Version 0.8.1
132 |
133 | * Fixed various issues with tests introduced by updates in Rust.
134 | * Adjusted the lexer to ignore contents of the `<!DOCTYPE>` tag.
135 | * Removed unnecessary unsafety in tests.
136 | * Added tests for doc comments in the readme file.
137 | * Switched to GitHub Actions from Travis CI.
138 |
139 | ## Version 0.8.0
140 |
141 | * Same as 0.7.1, with 0.7.1 being yanked because of the incorrect semver bump.
142 |
143 | ## Version 0.7.1
144 |
145 | * Removed dependency on bitflags.
146 | * Added the `XmlWriter::inner_mut()` method.
147 | * Fixed some rustdoc warnings.
148 |
149 | ## Version 0.7.0
150 |
151 | * Same as 0.6.2, with 0.6.2 being yanked because of the incompatible bump of minimum required version of rustc.
152 |
153 | ## Version 0.6.2
154 |
155 | * Bumped `bitflags` to 1.0.
156 |
157 | ## Version 0.6.1
158 |
159 | * Fixed the writer to escape some special characters when writing attribute values.
160 |
161 | ## Version 0.6.0
162 |
163 | * Changed the target type of extra entities from `char` to `String`. This is an incompatible
164 | change.
165 |
166 | ## Version 0.5.0
167 |
168 | * Added support for ignoring EOF errors in order to read documents from streams incrementally.
169 | * Bumped `bitflags` to 0.9.
170 |
171 | ## Version 0.4.1
172 |
173 | * Added missing `Debug` implementation to `xml::writer::XmlEvent`.
174 |
175 | ## Version 0.4.0
176 |
177 | * Bumped version number, since changes introduced in 0.3.7 break backwards compatibility.
178 |
179 | ## Version 0.3.8
180 |
181 | * Fixed a problem introduced in 0.3.7 with entities in attributes causing parsing errors.
182 |
183 | ## Version 0.3.7
184 |
185 | * Fixed the problem with parsing non-whitespace character entities as whitespace (issue #140).
186 | * Added support for configuring custom entities in the parser configuration.
187 |
188 | ## Version 0.3.6
189 |
190 | * Added an `Error` implementation for `EmitterError`.
191 | * Fixed escaping of strings with multi-byte code points.
192 |
193 | ## Version 0.3.5
194 |
195 | * Added `Debug` implementation for `XmlVersion`.
196 | * Fixed some failing tests.
197 |
198 | ## Version 0.3.3
199 |
200 | * Updated `bitflags` to 0.7.
201 |
202 | ## Version 0.3.2
203 |
204 | * Added `From<io::Error>` for `xml::reader::Error`, which improves usability of working with parsing errors.
205 |
206 | ## Version 0.3.1
207 |
208 | * Bumped `bitflags` dependency to 0.4, some internal warning fixes.
209 |
210 | ## Version 0.3.0
211 |
212 | * Changed error handling in `EventReader` - now I/O errors are properly bubbled up from the lexer.
213 |
214 | ## Version 0.2.4
215 |
216 | * Fixed #112 - incorrect handling of namespace redefinitions when writing a document.
217 |
218 | ## Version 0.2.3
219 |
220 | * Added `into_inner()` methods to `EventReader` and `EventWriter`.
221 |
222 | ## Version 0.2.2
223 |
224 | * Using `join` instead of the deprecated `connect`.
225 | * Added a simple XML analyzer program which demonstrates library usage and can be used to check XML documents for well-formedness.
226 | * Fixed incorrect handling of unqualified attribute names (#107).
227 | * Added this changelog.
228 |
229 | ## Version 0.2.1
230 |
231 | * Fixed #105 - incorrect handling of double dashes.
232 |
233 | ## Version 0.2.0
234 |
235 | * Major update, includes proper document writing support and significant architecture changes.
236 |
--------------------------------------------------------------------------------
/tests/xmltest.fail.txt:
--------------------------------------------------------------------------------
1 | not-wf-sa-003 003.xml Processing Instruction target name is required.
2 | not-wf-sa-054 054.xml PUBLIC requires two literals.
3 | not-wf-sa-056 056.xml Invalid Document Type Definition format - misplaced comment.
4 | not-wf-sa-057 057.xml This isn't SGML; comments can't exist in declarations.
5 | not-wf-sa-058 058.xml Invalid character , in ATTLIST enumeration
6 | not-wf-sa-059 059.xml String literal must be in quotes.
7 | not-wf-sa-060 060.xml Invalid type NAME defined in ATTLIST.
8 | not-wf-sa-061 061.xml External entity declarations require whitespace between public and system IDs.
9 | not-wf-sa-064 064.xml Space is required between attribute type and default values in declarations.
10 | not-wf-sa-065 065.xml Space is required between attribute name and type in declarations.
11 | not-wf-sa-066 066.xml Required whitespace is missing.
12 | not-wf-sa-067 067.xml Space is required between attribute type and default values in declarations.
13 | not-wf-sa-068 068.xml Space is required between NOTATION keyword and list of enumerated choices in declarations.
14 | not-wf-sa-069 069.xml Space is required before an NDATA entity annotation.
15 | not-wf-sa-074 074.xml Internal general parsed entities are only well formed if they match the "content" production.
16 | not-wf-sa-075 075.xml ENTITY can't reference itself directly or indirectly.
17 | not-wf-sa-077 077.xml Undefined ENTITY bar.
18 | not-wf-sa-078 078.xml Undefined ENTITY foo.
19 | not-wf-sa-079 079.xml ENTITY can't reference itself directly or indirectly.
20 | not-wf-sa-080 080.xml ENTITY can't reference itself directly or indirectly.
21 | not-wf-sa-081 081.xml This tests the No External Entity References WFC, since the entity is referred to within an attribute.
22 | not-wf-sa-082 082.xml This tests the No External Entity References WFC, since the entity is referred to within an attribute.
23 | not-wf-sa-083 083.xml Undefined NOTATION n.
24 | not-wf-sa-084 084.xml Tests the Parsed Entity WFC by referring to an unparsed entity. (This precedes the error of not declaring that entity's notation, which may be detected any time before the DTD parsing is completed.)
25 | not-wf-sa-085 085.xml Public IDs may not contain "[".
26 | not-wf-sa-086 086.xml Public IDs may not contain "[".
27 | not-wf-sa-087 087.xml Public IDs may not contain "[".
28 | not-wf-sa-089 089.xml Parameter entities "are" always parsed; NDATA annotations are not permitted.
29 | not-wf-sa-091 091.xml Parameter entities "are" always parsed; NDATA annotations are not permitted.
30 | not-wf-sa-104 104.xml Internal general parsed entities are only well formed if they match the "content" production.
31 | not-wf-sa-115 115.xml The replacement text of this entity is an illegal character reference, which must be rejected when it is parsed in the context of an attribute value.
32 | not-wf-sa-116 116.xml Internal general parsed entities are only well formed if they match the "content" production. This is a partial character reference, not a full one.
33 | not-wf-sa-117 117.xml Internal general parsed entities are only well formed if they match the "content" production. This is a partial character reference, not a full one.
34 | not-wf-sa-119 119.xml Internal general parsed entities are only well formed if they match the "content" production. This is a partial character reference, not a full one.
35 | not-wf-sa-122 122.xml Invalid syntax mixed connectors are used.
36 | not-wf-sa-123 123.xml Invalid syntax mismatched parenthesis.
37 | not-wf-sa-124 124.xml Invalid format of Mixed-content declaration.
38 | not-wf-sa-125 125.xml Invalid syntax extra set of parenthesis not necessary.
39 | not-wf-sa-126 126.xml Invalid syntax Mixed-content must be defined as zero or more.
40 | not-wf-sa-127 127.xml Invalid syntax Mixed-content must be defined as zero or more.
41 | not-wf-sa-128 128.xml Invalid CDATA syntax.
42 | not-wf-sa-129 129.xml Invalid syntax for Element Type Declaration.
43 | not-wf-sa-130 130.xml Invalid syntax for Element Type Declaration.
44 | not-wf-sa-131 131.xml Invalid syntax for Element Type Declaration.
45 | not-wf-sa-132 132.xml Invalid syntax mixed connectors used.
46 | not-wf-sa-133 133.xml Illegal whitespace before optional character causes syntax error.
47 | not-wf-sa-134 134.xml Illegal whitespace before optional character causes syntax error.
48 | not-wf-sa-135 135.xml Invalid character used as connector.
49 | not-wf-sa-136 136.xml Tag omission is invalid in XML.
50 | not-wf-sa-137 137.xml Space is required before a content model.
51 | not-wf-sa-138 138.xml Invalid syntax for content particle.
52 | not-wf-sa-139 139.xml The element-content model should not be empty.
53 | not-wf-sa-149 149.xml XML Declaration may not be within a DTD.
54 | not-wf-sa-158 158.xml SGML-ism: "#NOTATION gif" can't have attributes.
55 | not-wf-sa-159 159.xml Uses '&' unquoted in an entity declaration, which is illegal syntax for an entity reference.
56 | not-wf-sa-160 160.xml Violates the PEs in Internal Subset WFC by using a PE reference within a declaration.
57 | not-wf-sa-161 161.xml Violates the PEs in Internal Subset WFC by using a PE reference within a declaration.
58 | not-wf-sa-162 162.xml Violates the PEs in Internal Subset WFC by using a PE reference within a declaration.
59 | not-wf-sa-164 164.xml Invalid placement of Parameter entity reference.
60 | not-wf-sa-180 180.xml The Entity Declared WFC requires entities to be declared before they are used in an attribute list declaration.
61 | not-wf-sa-181 181.xml Internal parsed entities must match the content production to be well formed.
62 | not-wf-sa-182 182.xml Internal parsed entities must match the content production to be well formed.
63 | not-wf-sa-183 183.xml Mixed content declarations may not include content particles.
64 | not-wf-sa-184 184.xml In mixed content models, element names must not be parenthesized.
65 | not-wf-not-sa-001 001.xml Conditional sections must be properly terminated ("]>" used instead of "]]>").
66 | not-wf-not-sa-002 002.xml Processing instruction target names may not be "XML" in any combination of cases.
67 | not-wf-not-sa-003 003.xml Conditional sections must be properly terminated ("]]>" omitted).
68 | not-wf-not-sa-004 004.xml Conditional sections must be properly terminated ("]]>" omitted).
69 | not-wf-not-sa-005 005.xml Tests the Entity Declared VC by referring to an undefined parameter entity within an external entity.
70 | not-wf-not-sa-006 006.xml Conditional sections need a '[' after the INCLUDE or IGNORE.
71 | not-wf-not-sa-007 007.xml A declaration may not begin any external entity; it's only found once, in the document entity.
72 | not-wf-not-sa-008 008.xml In DTDs, the '%' character must be part of a parameter entity reference.
73 | not-wf-not-sa-009 009.xml This test violates WFC:PE Between Declarations in Production 28a. The last character of a markup declaration is not contained in the same parameter-entity text replacement.
74 | not-wf-ext-sa-001 001.xml Tests the No Recursion WFC by having an external general entity be self-recursive.
75 | not-wf-ext-sa-002 002.xml External entities have "text declarations", which do not permit the "standalone=..." attribute that's allowed in XML declarations.
76 | not-wf-ext-sa-003 003.xml Only one text declaration is permitted; a second one looks like an illegal processing instruction (target names of "xml" in any case are not allowed).
77 | valid-sa-012 012.xml Uses a legal XML 1.0 name consisting of a single colon character (disallowed by the latest XML Namespaces draft).; 5:7 Qualified name is invalid: :
78 | valid-not-sa-031 031.xml Expands a general entity which contains a CDATA section with what looks like a markup declaration (but is just text since it's in a CDATA section).; 2:8 Unexpected entity: e
79 |
--------------------------------------------------------------------------------
/tests/xmlconf.rs:
--------------------------------------------------------------------------------
1 | //! W3C XML conformance test suite
2 |
3 | use std::collections::{HashMap, HashSet};
4 | use std::fs::File;
5 | use std::io::BufReader;
6 | use std::path::Path;
7 | use std::process::Command;
8 | use std::sync::Mutex;
9 | use xml::reader::XmlEvent;
10 | use xml::{EventWriter, ParserConfig};
11 |
12 | static UNZIP: Mutex<()> = Mutex::new(());
13 |
14 | fn ensure_unzipped() { // Unpacks the bundled conformance-suite archive unless `tests/xmlconf` already exists.
15 | let _g = UNZIP.lock().expect("unzip already failed"); // held for the whole check-and-extract; a poisoned lock means an earlier holder panicked
16 |
17 | // The test suite license only allows redistribution of the unmodified zip, so it is extracted on demand with the external `unzip` command.
18 | if !Path::new("tests/xmlconf").exists() {
19 | assert!(Command::new("unzip")
20 | .current_dir("tests")
21 | .arg("xmlts20130923.zip")
22 | .status().unwrap().success(), "must unzip"); // panics if `unzip` cannot be spawned or exits unsuccessfully
23 | }
24 | }
25 |
26 | #[track_caller]
27 | fn run_suite(suite_rel_path: &str) { // Runs one suite manifest (path relative to `tests/`) under five different parser configurations.
28 | run_suite_with_config(suite_rel_path, ParserConfig::default().allow_multiple_root_elements(true));
29 | run_suite_with_config(suite_rel_path, ParserConfig::default().coalesce_characters(false));
30 | run_suite_with_config(suite_rel_path, ParserConfig::default().ignore_comments(false));
31 | run_suite_with_config(suite_rel_path, ParserConfig::new().trim_whitespace(true).whitespace_to_characters(true).cdata_to_characters(true).ignore_comments(true).coalesce_characters(true));
32 | run_suite_with_config(suite_rel_path, ParserConfig::default().allow_multiple_root_elements(false).ignore_root_level_whitespace(false));
33 | }
34 |
35 | #[track_caller]
36 | fn run_suite_with_config(suite_rel_path: &str, parser_config: ParserConfig) { // Runs every TEST entry of a suite manifest and checks the outcome against the suite's `*.fail.txt` list.
37 | ensure_unzipped();
38 |
39 | let suite_path = Path::new("tests").join(suite_rel_path);
40 | let known_failures_file_path = Path::new("tests").join(suite_path.with_extension("fail.txt").file_name().unwrap());
41 | let mut new_known_failures_file = if std::env::var("PRINT_SPEC").map_or(false, |val| val == "1") { Some(String::new()) } else { None }; // PRINT_SPEC=1 regenerates the known-failures file instead of asserting
42 |
43 | let known_broken_test_ids: HashSet<_> = std::fs::read_to_string(&known_failures_file_path).unwrap_or_default().lines()
44 | .map(|l| l.trim().split(' ').next().unwrap().to_string()).collect(); // the test id is the first space-separated token of each line
45 |
46 | let root = suite_path.parent().unwrap();
47 | let mut parsed = 0;
48 |
49 | let f = BufReader::new(File::open(&suite_path)
50 | .map_err(|e| format!("{}: {e}", suite_path.display())).unwrap());
51 | let r = ParserConfig::default().allow_multiple_root_elements(true).create_reader(f); // the manifest itself is always read with this fixed config, not `parser_config`
52 | let mut desc = String::new();
53 | let mut attr = HashMap::<String, String>::new();
54 | for e in r {
55 | let e = e.map_err(|e| format!("{}: {e}", suite_path.display())).expect("testsuite validity");
56 | match e {
57 | XmlEvent::Characters(chr) => {
58 | desc.push_str(&chr.replace('\n', " ").replace("  ", " ").replace("  ", " ")); // flatten newlines, then collapse runs of spaces in the description
59 | },
60 | XmlEvent::EndElement { name } if name.local_name == "TEST" => {
61 | let path = root.join(&attr["URI"]);
62 | let test_type = attr["TYPE"].as_str();
63 | let id = attr.get("ID").map(|a| a.as_str()).unwrap_or_else(|| path.file_stem().unwrap().to_str().unwrap());
64 |
65 | if attr.get("EDITION").map(|s| s.as_str()) == Some("1 2 3 4") {
66 | // tests obsolete things changed in edition 5
67 | continue;
68 | }
69 |
70 | let res = match test_type {
71 | "valid" => expect_well_formed(&path, &desc, parser_config.clone()),
72 | "invalid" => expect_well_formed(&path, &desc, parser_config.clone()), // invalid is still well-formed
73 | "not-wf" | "error" => expect_ill_formed(&path, &desc),
74 | other => unimplemented!("{other}?? type"),
75 | };
76 |
77 | if let Some(out) = new_known_failures_file.as_mut() {
78 | if let Err(e) = res {
79 | use std::fmt::Write;
80 | writeln!(out, "{id} {}", e.to_string().replace('\n', " ")).unwrap();
81 | }
82 | } else {
83 | let known_bad = known_broken_test_ids.contains(id);
84 | match res {
85 | Err(_) if known_bad => {},
86 | Err(e) => panic!("{suite_rel_path} failed on {} ({id})\n{e}", path.display()),
87 | Ok(()) if known_bad => panic!("expected {} ({id}) to fail, but it passes {test_type} of {suite_rel_path} now\n{desc}", path.display()),
88 | Ok(()) => {},
89 | }
90 | }
91 |
92 | parsed += 1;
93 | },
94 | XmlEvent::StartElement { name, attributes, namespace: _ } if name.local_name == "TEST" => {
95 | desc.clear();
96 | attr = attributes.into_iter().map(|a| (a.name.local_name, a.value)).collect();
97 | },
98 | _ => {},
99 | }
100 | }
101 | if let Some(out) = new_known_failures_file {
102 | if out.is_empty() {
103 | let _ = std::fs::remove_file(known_failures_file_path); // nothing fails any more: drop the stale list, ignoring a missing file
104 | } else {
105 | std::fs::write(known_failures_file_path, out).unwrap();
106 | }
107 | }
108 | assert!(parsed > 0); // guards against a manifest in which no TEST entry was run
109 | }
110 |
111 | #[track_caller]
112 | fn expect_well_formed(xml_path: &Path, msg: &str, parser_config: ParserConfig) -> Result<(), Box<dyn std::error::Error>> { // Ok if the file parses without error and has at least one element; every event is also replayed through an `EventWriter`.
113 | let f = BufReader::new(File::open(xml_path).expect("testcase"));
114 | let r = parser_config.create_reader(f);
115 | let mut w = EventWriter::new(Vec::new());
116 | let mut seen_any = false;
117 | let mut writes_failed = None;
118 | let mut document_started = false;
119 | for e in r {
120 | let e = e.map_err(|e| format!("{} {msg}; {e}", xml_path.file_name().and_then(std::ffi::OsStr::to_str).unwrap()))?; // a parse error becomes the returned `Err`
121 | match e {
122 | XmlEvent::EndElement { .. } => {
123 | seen_any = true;
124 | },
125 | XmlEvent::StartDocument { .. } => {
126 | if document_started { return Err("document started twice".into()); }
127 | document_started = true;
128 | },
129 | _ => {},
130 | }
131 | if let Some(e) = e.as_writer_event() {
132 | if let Err(e) = w.write(e) {
133 | writes_failed = Some(e); // remember the failure but keep parsing; only the most recent one is kept
134 | }
135 | }
136 | }
137 | if !seen_any {
138 | return Err("no elements found".into());
139 | }
140 | if let Some(e) = writes_failed {
141 | panic!("{} write failed on {e}", xml_path.display()); // writer failures panic instead of returning `Err`
142 | }
143 | Ok(())
144 | }
145 |
146 | #[track_caller]
147 | fn expect_ill_formed(xml_path: &Path, msg: &str) -> Result<(), Box<dyn std::error::Error>> { // Ok as soon as the reader reports any error; Err (file name plus `msg`) if the whole document parses cleanly.
148 | let f = BufReader::new(File::open(xml_path)?);
149 | let r = ParserConfig::new().allow_multiple_root_elements(false).create_reader(f); // always uses this fixed config, independent of the suite's config
150 | for e in r {
151 | if e.is_err() {
152 | return Ok(());
153 | }
154 | }
155 | Err(format!("{} {msg}", xml_path.file_name().and_then(std::ffi::OsStr::to_str).unwrap()).into())
156 | }
157 |
158 | #[test] // Each test below passes one suite manifest (path relative to `tests/`) to `run_suite`.
159 | fn eduni_errata_2e() {
160 | run_suite("xmlconf/eduni/errata-2e/errata2e.xml");
161 | }
162 |
163 | #[test]
164 | fn eduni_errata_3e() {
165 | run_suite("xmlconf/eduni/errata-3e/errata3e.xml");
166 | }
167 |
168 | #[test]
169 | fn eduni_errata_4e() {
170 | run_suite("xmlconf/eduni/errata-4e/errata4e.xml");
171 | }
172 |
173 | #[test]
174 | fn eduni_misc_ht() {
175 | run_suite("xmlconf/eduni/misc/ht-bh.xml");
176 | }
177 |
178 | #[test]
179 | fn eduni_namespaces_10() {
180 | run_suite("xmlconf/eduni/namespaces/1.0/rmt-ns10.xml");
181 | }
182 |
183 | #[test]
184 | fn eduni_namespaces_11() {
185 | run_suite("xmlconf/eduni/namespaces/1.1/rmt-ns11.xml");
186 | }
187 |
188 | #[test]
189 | fn eduni_namespaces_errata() {
190 | run_suite("xmlconf/eduni/namespaces/errata-1e/errata1e.xml");
191 | }
192 |
193 | #[test]
194 | fn eduni_xml_11() {
195 | run_suite("xmlconf/eduni/xml-1.1/xml11.xml");
196 | }
197 |
198 | #[test]
199 | fn ibm_oasis_valid() {
200 | run_suite("xmlconf/ibm/ibm_oasis_valid.xml");
201 | }
202 |
203 | #[test]
204 | fn ibm_xml_11() {
205 | run_suite("xmlconf/ibm/xml-1.1/ibm_valid.xml");
206 | }
207 |
208 | #[test]
209 | fn oasis() {
210 | run_suite("xmlconf/oasis/oasis.xml");
211 | }
212 |
213 | #[test]
214 | fn sun_valid() {
215 | run_suite("xmlconf/sun/sun-valid.xml");
216 | }
217 |
218 | #[test]
219 | fn sun_ill_formed() {
220 | run_suite("xmlconf/sun/sun-not-wf.xml");
221 | }
222 |
223 | #[test]
224 | fn japanese() {
225 | run_suite("xmlconf/japanese/japanese.xml");
226 | }
227 |
228 | #[test]
229 | fn xmltest() {
230 | run_suite("xmlconf/xmltest/xmltest.xml");
231 | }
232 |
233 | #[test]
234 | fn own_tests() {
235 | run_suite("tests.xml"); // the only manifest that lives directly in `tests/` rather than under `xmlconf/`
236 | }
237 |
--------------------------------------------------------------------------------
/benches/bench.rs:
--------------------------------------------------------------------------------
1 | #![feature(test)]
2 |
3 | extern crate test;
4 | use test::Bencher;
5 | use xml::{EventReader, EventWriter};
6 |
7 | #[bench]
8 | fn read(bencher: &mut Bencher) { // Benchmarks parsing `tests/documents/sample_1.xml` from memory, unwrapping every event.
9 | let xml = std::fs::read("tests/documents/sample_1.xml").unwrap(); // loaded once, outside the timed closure
10 | bencher.iter(move || {
11 | let parser = EventReader::new(xml.as_slice());
12 | for e in parser {
13 | e.unwrap(); // any parse error aborts the benchmark
14 | }
15 | });
16 | }
17 |
18 | #[bench]
19 | fn read_lots_attrs(bencher: &mut Bencher) { // Benchmarks parsing an inline XML document; judging by the name, one with many attributes.
20 | let xml = r#""#; // NOTE(review): the literal is empty and the gutter jumps from 20 to 23, so the fixture looks truncated in this copy; restore it from upstream before running
23 | bencher.iter(move || {
24 | let parser = EventReader::new(xml.as_bytes());
25 | for e in parser {
26 | e.unwrap(); // any parse error aborts the benchmark
27 | }
28 | });
29 | }
30 |
31 | #[bench]
32 | fn write(bencher: &mut Bencher) { // Benchmarks writing the events of `sample_1.xml` through an `EventWriter` backed by a fresh `Vec`.
33 | let xml = std::fs::read("tests/documents/sample_1.xml").unwrap();
34 | let events: Vec<_> = EventReader::new(xml.as_slice()).into_iter().map(|e| e.unwrap()).collect(); // parsed once up front so only writing is timed
35 | let events: Vec<_> = events.iter().filter_map(|e| e.as_writer_event()).collect(); // keep only reader events that convert to a writer event
36 |
37 | bencher.iter(move || {
38 | let mut serializer = EventWriter::new(Vec::new());
39 | for e in &events {
40 | serializer.write((*e).clone()).unwrap();
41 | }
42 | serializer.into_inner() // returned from the closure as the iteration's result
43 | });
44 | }
45 |
--------------------------------------------------------------------------------
/src/reader/events.rs:
--------------------------------------------------------------------------------
1 | //! Contains `XmlEvent` datatype, instances of which are emitted by the parser.
2 |
3 | use crate::attribute::OwnedAttribute;
4 | use crate::common::XmlVersion;
5 | use crate::name::OwnedName;
6 | use crate::namespace::Namespace;
7 | use std::fmt;
8 |
9 | /// An element of an XML input stream.
10 | ///
11 | /// Items of this enum are emitted by `reader::EventReader`. They correspond to different
12 | /// elements of an XML document.
13 | #[derive(PartialEq, Clone)]
14 | pub enum XmlEvent {
15 | /// Corresponds to XML document declaration.
16 | ///
17 | /// This event is always emitted before any other event. It is emitted
18 | /// even if the actual declaration is not present in the document.
19 | StartDocument {
20 | /// XML version.
21 | ///
22 | /// If XML declaration is not present, defaults to `Version10`.
23 | version: XmlVersion,
24 |
25 | /// XML document encoding.
26 | ///
27 | /// If XML declaration is not present or does not contain `encoding` attribute,
28 | /// defaults to `"UTF-8"`. This field is currently used for no other purpose than
29 | /// informational.
30 | encoding: String,
31 |
32 | /// XML standalone declaration.
33 | ///
34 | /// If XML declaration is not present or does not contain `standalone` attribute,
35 | /// defaults to `None`. This field is currently used for no other purpose than
36 | /// informational.
37 | standalone: Option<bool>,
38 | },
39 |
40 | /// Denotes the end of the document stream.
41 | ///
42 | /// This event is always emitted after any other event (except `Error`). After it
43 | /// is emitted for the first time, it will always be emitted on next event pull attempts.
44 | EndDocument,
45 |
46 | /// Denotes an XML processing instruction.
47 | ///
48 | /// This event contains a processing instruction target (`name`) and opaque `data`. It
49 | /// is up to the application to process them.
50 | ProcessingInstruction {
51 | /// Processing instruction target.
52 | name: String,
53 |
54 | /// Processing instruction content.
55 | data: Option<String>,
56 | },
57 |
58 | /// Denotes a beginning of an XML element.
59 | ///
60 | /// This event is emitted after parsing opening tags or after parsing bodiless tags. In the
61 | /// latter case `EndElement` event immediately follows.
62 | StartElement {
63 | /// Qualified name of the element.
64 | name: OwnedName,
65 |
66 | /// A list of attributes associated with the element.
67 | ///
68 | /// The parser rejects duplicate attribute names, and namespace declarations (`xmlns`, `xmlns:*`) are not included here.
69 | attributes: Vec<OwnedAttribute>,
70 |
71 | /// Contents of the namespace mapping at this point of the document.
72 | namespace: Namespace,
73 | },
74 |
75 | /// Denotes an end of an XML element.
76 | ///
77 | /// This event is emitted after parsing closing tags or after parsing bodiless tags. In the
78 | /// latter case it is emitted immediately after corresponding `StartElement` event.
79 | EndElement {
80 | /// Qualified name of the element.
81 | name: OwnedName,
82 | },
83 |
84 | /// Denotes CDATA content.
85 | ///
86 | /// This event contains unparsed data. No unescaping will be performed.
87 | ///
88 | /// It is possible to configure a parser to emit `Characters` event instead of `CData`. See
89 | /// the `ParserConfig` structure for more information.
90 | CData(String),
91 |
92 | /// Denotes a comment.
93 | ///
94 | /// It is possible to configure a parser to ignore comments, so this event will never be emitted.
95 | /// See the `ParserConfig` structure for more information.
96 | Comment(String),
97 |
98 | /// Denotes character data outside of tags.
99 | ///
100 | /// Contents of this event will always be unescaped, so no entities like `&lt;` or `&amp;` or `&#123;`
101 | /// will appear in it.
102 | ///
103 | /// It is possible to configure a parser to trim leading and trailing whitespace for this event.
104 | /// See the `ParserConfig` structure for more information.
105 | Characters(String),
106 |
107 | /// Denotes a chunk of whitespace outside of tags.
108 | ///
109 | /// It is possible to configure a parser to emit `Characters` event instead of `Whitespace`.
110 | /// See the `ParserConfig` structure for more information. When combined with whitespace
111 | /// trimming, it will eliminate standalone whitespace from the event stream completely.
112 | Whitespace(String),
113 | /// The whole DOCTYPE markup
114 | Doctype {
115 | /// Everything including `<` and `>`
116 | syntax: String,
117 | },
118 | }
119 |
120 | impl fmt::Debug for XmlEvent { // Compact `Variant(fields…)` rendering; it allocates helper strings, hence `#[cold]` on `fmt`.
121 | #[cold]
122 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
123 | match self {
124 | Self::StartDocument { version, encoding, standalone } =>
125 | write!(f, "StartDocument({}, {}, {:?})", version, *encoding, standalone),
126 | Self::EndDocument =>
127 | write!(f, "EndDocument"),
128 | Self::ProcessingInstruction { name, data } =>
129 | write!(f, "ProcessingInstruction({}{})", *name, match data {
130 | Some(data) => format!(", {data}"),
131 | None => String::new()
132 | }),
133 | Self::StartElement { name, attributes, namespace: Namespace(namespace) } =>
134 | write!(f, "StartElement({}, {:?}{})", name, namespace, if attributes.is_empty() {
135 | String::new()
136 | } else {
137 | let attributes: Vec<String> = attributes.iter().map(
138 | |a| format!("{} -> {}", a.name, a.value)
139 | ).collect();
140 | format!(", [{}]", attributes.join(", ")) // the attribute list is printed only when non-empty
141 | }),
142 | Self::EndElement { name } =>
143 | write!(f, "EndElement({name})"),
144 | Self::Comment(data) =>
145 | write!(f, "Comment({data})"),
146 | Self::CData(data) =>
147 | write!(f, "CData({data})"),
148 | Self::Characters(data) =>
149 | write!(f, "Characters({data})"),
150 | Self::Whitespace(data) =>
151 | write!(f, "Whitespace({data})"),
152 | Self::Doctype { syntax } =>
153 | write!(f, "Doctype({syntax})"),
154 | }
155 | }
156 | }
157 |
158 | impl XmlEvent {
159 | /// Obtains a writer event from this reader event.
160 | ///
161 | /// This method is useful for streaming processing of XML documents where the output
162 | /// is also an XML document. With this method it is possible to process some events
163 | /// while passing other events through to the writer unchanged:
164 | ///
165 | /// ```rust
166 | /// use std::str;
167 | ///
168 | /// use xml::reader::XmlEvent as ReaderEvent;
169 | /// use xml::writer::XmlEvent as WriterEvent;
170 | /// use xml::{EventReader, EventWriter};
171 | ///
172 | /// let mut input: &[u8] = b"world";
173 | /// let mut output: Vec = Vec::new();
174 | ///
175 | /// {
176 | /// let mut reader = EventReader::new(&mut input);
177 | /// let mut writer = EventWriter::new(&mut output);
178 | ///
179 | /// for e in reader {
180 | /// match e.unwrap() {
181 | /// ReaderEvent::Characters(s) =>
182 | /// writer.write(WriterEvent::characters(&s.to_uppercase())).unwrap(),
183 | /// e => if let Some(e) = e.as_writer_event() {
184 | /// writer.write(e).unwrap()
185 | /// }
186 | /// }
187 | /// }
188 | /// }
189 | ///
190 | /// assert_eq!(
191 | /// str::from_utf8(&output).unwrap(),
192 | /// r#"WORLD"#
193 | /// );
194 | /// ```
195 | ///
196 | /// Note that this API may change or get additions in future to improve its ergonomics.
197 | #[must_use]
198 | pub fn as_writer_event(&self) -> Option> {
199 | match self {
200 | Self::StartDocument { version, encoding, standalone } =>
201 | Some(crate::writer::events::XmlEvent::StartDocument {
202 | version: *version,
203 | encoding: Some(encoding),
204 | standalone: *standalone
205 | }),
206 | Self::ProcessingInstruction { name, data } =>
207 | Some(crate::writer::events::XmlEvent::ProcessingInstruction {
208 | name,
209 | data: data.as_ref().map(|s| &**s)
210 | }),
211 | Self::StartElement { name, attributes, namespace } =>
212 | Some(crate::writer::events::XmlEvent::StartElement {
213 | name: name.borrow(),
214 | attributes: attributes.iter().map(|a| a.borrow()).collect(),
215 | namespace: namespace.borrow(),
216 | }),
217 | Self::EndElement { name } =>
218 | Some(crate::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }),
219 | Self::Comment(data) => Some(crate::writer::events::XmlEvent::Comment(data)),
220 | Self::CData(data) => Some(crate::writer::events::XmlEvent::CData(data)),
221 | Self::Characters(data) |
222 | Self::Whitespace(data) => Some(crate::writer::events::XmlEvent::Characters(data)),
223 | Self::Doctype { syntax } => Some(crate::writer::events::XmlEvent::Doctype(syntax)),
224 | Self::EndDocument => None,
225 | }
226 | }
227 | }
228 |
--------------------------------------------------------------------------------
/src/reader/parser/outside_tag.rs:
--------------------------------------------------------------------------------
1 | use crate::common::is_whitespace_char;
2 | use crate::reader::error::SyntaxError;
3 | use crate::reader::events::XmlEvent;
4 | use crate::reader::lexer::Token;
5 |
6 | use super::{
7 | ClosingTagSubstate, DoctypeSubstate, Encountered, OpeningTagSubstate,
8 | ProcessingInstructionSubstate, PullParser, Result, State,
9 | };
10 |
11 | impl PullParser {
12 | pub fn outside_tag(&mut self, t: Token) -> Option {
13 | match t {
14 | Token::Character(c) => {
15 | if is_whitespace_char(c) {
16 | // skip whitespace outside of the root element
17 | if (self.config.trim_whitespace && self.buf.is_empty()) ||
18 | (self.depth() == 0 && self.config.ignore_root_level_whitespace) {
19 | return None;
20 | }
21 | } else {
22 | self.inside_whitespace = false;
23 | if self.depth() == 0 {
24 | return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
25 | }
26 | }
27 |
28 | if !self.is_valid_xml_char_not_restricted(c) {
29 | return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)));
30 | }
31 |
32 | if self.buf.is_empty() {
33 | self.push_pos();
34 | } else if self.buf.len() > self.config.max_data_length {
35 | return Some(self.error(SyntaxError::ExceededConfiguredLimit));
36 | }
37 | self.buf.push(c);
38 | None
39 | },
40 |
41 | Token::CommentEnd | Token::TagEnd | Token::EqualsSign |
42 | Token::DoubleQuote | Token::SingleQuote |
43 | Token::ProcessingInstructionEnd | Token::EmptyTagEnd => {
44 | if self.depth() == 0 {
45 | return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
46 | }
47 | self.inside_whitespace = false;
48 |
49 | if let Some(s) = t.as_static_str() {
50 | if self.buf.is_empty() {
51 | self.push_pos();
52 | } else if self.buf.len() > self.config.max_data_length {
53 | return Some(self.error(SyntaxError::ExceededConfiguredLimit));
54 | }
55 |
56 | self.buf.push_str(s);
57 | }
58 | None
59 | },
60 |
61 | Token::ReferenceStart if self.depth() > 0 => {
62 | self.state_after_reference = State::OutsideTag;
63 | self.into_state_continue(State::InsideReference)
64 | },
65 |
66 | Token::ReferenceEnd if self.depth() > 0 => { // Semi-colon in a text outside an entity
67 | self.inside_whitespace = false;
68 | if self.buf.len() > self.config.max_data_length {
69 | return Some(self.error(SyntaxError::ExceededConfiguredLimit));
70 | }
71 | Token::ReferenceEnd.push_to_string(&mut self.buf);
72 | None
73 | },
74 |
75 | Token::CommentStart if self.config.coalesce_characters && self.config.ignore_comments => {
76 | let next_event = self.set_encountered(Encountered::Comment);
77 | // We need to switch the lexer into a comment mode inside comments
78 | self.into_state(State::InsideComment, next_event)
79 | }
80 |
81 | Token::CDataStart if self.depth() > 0 && self.config.coalesce_characters && self.config.cdata_to_characters => {
82 | if self.buf.is_empty() {
83 | self.push_pos(); // CDataEnd will pop pos if the buffer remains empty
84 | }
85 | // if coalescing chars, continue without event
86 | self.into_state_continue(State::InsideCData)
87 | },
88 |
89 | _ => {
90 | // Encountered some markup event, flush the buffer as characters
91 | // or a whitespace
92 | let mut next_event = if self.buf_has_data() {
93 | let buf = self.take_buf();
94 | if self.inside_whitespace && self.config.trim_whitespace {
95 | // there will be no event emitted for this, but start of buffering has pushed a pos
96 | self.next_pos();
97 | None
98 | } else if self.inside_whitespace && !self.config.whitespace_to_characters {
99 | debug_assert!(buf.chars().all(|ch| ch.is_whitespace()), "ws={buf:?}");
100 | Some(Ok(XmlEvent::Whitespace(buf)))
101 | } else if self.config.trim_whitespace {
102 | Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into())))
103 | } else {
104 | Some(Ok(XmlEvent::Characters(buf)))
105 | }
106 | } else { None };
107 | self.inside_whitespace = true; // Reset inside_whitespace flag
108 |
109 | // pos is popped whenever an event is emitted, so pushes must happen only if there will be an event to balance it
110 | // and ignored comments don't pop
111 | if t != Token::CommentStart || !self.config.ignore_comments {
112 | self.push_pos();
113 | }
114 | match t {
115 | Token::OpeningTagStart if self.depth() > 0 || self.encountered < Encountered::Element || self.config.allow_multiple_root_elements => {
116 | if let Some(e) = self.set_encountered(Encountered::Element) {
117 | next_event = Some(e);
118 | }
119 | self.nst.push_empty();
120 | self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event)
121 | },
122 |
123 | Token::ClosingTagStart if self.depth() > 0 =>
124 | self.into_state(State::InsideClosingTag(ClosingTagSubstate::CTInsideName), next_event),
125 |
126 | Token::CommentStart => {
127 | if let Some(e) = self.set_encountered(Encountered::Comment) {
128 | next_event = Some(e);
129 | }
130 | // We need to switch the lexer into a comment mode inside comments
131 | self.into_state(State::InsideComment, next_event)
132 | },
133 |
134 | Token::DoctypeStart if self.encountered < Encountered::Doctype => {
135 | if let Some(e) = self.set_encountered(Encountered::Doctype) {
136 | next_event = Some(e);
137 | }
138 | self.data.doctype = Some(Token::DoctypeStart.to_string());
139 |
140 | self.push_pos();
141 | self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event)
142 | },
143 |
144 | Token::ProcessingInstructionStart =>
145 | self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event),
146 |
147 | Token::CDataStart if self.depth() > 0 => {
148 | self.into_state(State::InsideCData, next_event)
149 | },
150 |
151 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
152 | }
153 | },
154 | }
155 | }
156 |
157 | pub fn document_start(&mut self, t: Token) -> Option {
158 | debug_assert!(self.encountered < Encountered::Declaration);
159 |
160 | match t {
161 | Token::Character(c) => {
162 | let next_event = self.set_encountered(Encountered::AnyChars);
163 |
164 | if !is_whitespace_char(c) {
165 | return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
166 | }
167 | self.inside_whitespace = true;
168 |
169 | // skip whitespace outside of the root element
170 | if (self.config.trim_whitespace && self.buf.is_empty()) ||
171 | (self.depth() == 0 && self.config.ignore_root_level_whitespace) {
172 | return self.into_state(State::OutsideTag, next_event);
173 | }
174 |
175 | self.push_pos();
176 | self.buf.push(c);
177 | self.into_state(State::OutsideTag, next_event)
178 | },
179 |
180 | Token::CommentStart => {
181 | let next_event = self.set_encountered(Encountered::Comment);
182 | self.into_state(State::InsideComment, next_event)
183 | },
184 |
185 | Token::OpeningTagStart => {
186 | let next_event = self.set_encountered(Encountered::Element);
187 | self.nst.push_empty();
188 | self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event)
189 | },
190 |
191 | Token::DoctypeStart => {
192 | let next_event = self.set_encountered(Encountered::Doctype);
193 | self.data.doctype = Some(Token::DoctypeStart.to_string());
194 |
195 | self.push_pos();
196 | self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event)
197 | },
198 |
199 | Token::ProcessingInstructionStart => {
200 | self.push_pos();
201 | self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName))
202 | },
203 |
204 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
205 | }
206 | }
207 | }
208 |
--------------------------------------------------------------------------------
/src/reader/parser/inside_declaration.rs:
--------------------------------------------------------------------------------
1 | use crate::common::{is_whitespace_char, XmlVersion};
2 | use crate::reader::error::SyntaxError;
3 | use crate::reader::events::XmlEvent;
4 | use crate::reader::lexer::Token;
5 | use crate::util::Encoding;
6 |
7 | use super::{
8 | DeclarationSubstate, Encountered, PullParser, QualifiedNameTarget, Result, State,
9 | DEFAULT_VERSION,
10 | };
11 |
12 | impl PullParser {
13 | #[inline(never)]
14 | fn emit_start_document(&mut self) -> Option {
15 | debug_assert!(self.encountered == Encountered::None);
16 | self.encountered = Encountered::Declaration;
17 |
18 | let version = self.data.version;
19 | let encoding = self.data.take_encoding();
20 | let standalone = self.data.standalone;
21 |
22 | if let Some(new_encoding) = encoding.as_deref() {
23 | let new_encoding = match new_encoding.parse() {
24 | Ok(e) => e,
25 | Err(_) if self.config.ignore_invalid_encoding_declarations => Encoding::Latin1,
26 | Err(_) => return Some(self.error(SyntaxError::UnsupportedEncoding(new_encoding.into()))),
27 | };
28 | let current_encoding = self.lexer.encoding();
29 | if current_encoding != new_encoding {
30 | let set = match (current_encoding, new_encoding) {
31 | (Encoding::Unknown | Encoding::Default, new) if new != Encoding::Utf16 => new,
32 | (Encoding::Utf16Be | Encoding::Utf16Le, Encoding::Utf16) => current_encoding,
33 | _ if self.config.ignore_invalid_encoding_declarations => current_encoding,
34 | _ => return Some(self.error(SyntaxError::ConflictingEncoding(new_encoding, current_encoding))),
35 | };
36 | self.lexer.set_encoding(set);
37 | }
38 | }
39 |
40 | let current_encoding = self.lexer.encoding();
41 | self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument {
42 | version: version.unwrap_or(DEFAULT_VERSION),
43 | encoding: encoding.unwrap_or_else(move || current_encoding.to_string()),
44 | standalone
45 | }))
46 | }
47 |
48 | // TODO: remove redundancy via macros or extra methods
49 | pub fn inside_declaration(&mut self, t: Token, s: DeclarationSubstate) -> Option {
50 |
51 | match s {
52 | DeclarationSubstate::BeforeVersion => match t {
53 | Token::Character('v') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersion)),
54 | Token::Character(c) if is_whitespace_char(c) => None, // continue
55 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
56 | },
57 |
58 | DeclarationSubstate::InsideVersion => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
59 | match &*name.local_name {
60 | "ersion" if name.namespace.is_none() =>
61 | this.into_state_continue(State::InsideDeclaration(
62 | if token == Token::EqualsSign {
63 | DeclarationSubstate::InsideVersionValue
64 | } else {
65 | DeclarationSubstate::AfterVersion
66 | }
67 | )),
68 | _ => Some(this.error(SyntaxError::UnexpectedNameInsideXml(name.to_string().into()))),
69 | }
70 | }),
71 |
72 | DeclarationSubstate::AfterVersion => match t {
73 | Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersionValue)),
74 | Token::Character(c) if is_whitespace_char(c) => None,
75 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
76 | },
77 |
78 | DeclarationSubstate::InsideVersionValue => self.read_attribute_value(t, |this, value| {
79 | this.data.version = match &*value {
80 | "1.0" => Some(XmlVersion::Version10),
81 | "1.1" => Some(XmlVersion::Version11),
82 | _ => None
83 | };
84 | if this.data.version.is_some() {
85 | this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterVersionValue))
86 | } else {
87 | Some(this.error(SyntaxError::UnexpectedXmlVersion(value.into())))
88 | }
89 | }),
90 |
91 | DeclarationSubstate::AfterVersionValue => match t {
92 | Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeEncoding)),
93 | Token::ProcessingInstructionEnd => self.emit_start_document(),
94 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
95 | },
96 |
97 | DeclarationSubstate::BeforeEncoding => match t {
98 | Token::Character('e') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncoding)),
99 | Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)),
100 | Token::ProcessingInstructionEnd => self.emit_start_document(),
101 | Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
102 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
103 | },
104 |
105 | DeclarationSubstate::InsideEncoding => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
106 | match &*name.local_name {
107 | "ncoding" if name.namespace.is_none() =>
108 | this.into_state_continue(State::InsideDeclaration(
109 | if token == Token::EqualsSign { DeclarationSubstate::InsideEncodingValue } else { DeclarationSubstate::AfterEncoding }
110 | )),
111 | _ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into())))
112 | }
113 | }),
114 |
115 | DeclarationSubstate::AfterEncoding => match t {
116 | Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncodingValue)),
117 | Token::Character(c) if is_whitespace_char(c) => None,
118 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
119 | },
120 |
121 | DeclarationSubstate::InsideEncodingValue => self.read_attribute_value(t, |this, value| {
122 | this.data.encoding = Some(value);
123 | this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterEncodingValue))
124 | }),
125 |
126 | DeclarationSubstate::AfterEncodingValue => match t {
127 | Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl)),
128 | Token::ProcessingInstructionEnd => self.emit_start_document(),
129 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
130 | },
131 |
132 | DeclarationSubstate::BeforeStandaloneDecl => match t {
133 | Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)),
134 | Token::ProcessingInstructionEnd => self.emit_start_document(),
135 | Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
136 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
137 | },
138 |
139 | DeclarationSubstate::InsideStandaloneDecl => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
140 | match &*name.local_name {
141 | "tandalone" if name.namespace.is_none() =>
142 | this.into_state_continue(State::InsideDeclaration(
143 | if token == Token::EqualsSign {
144 | DeclarationSubstate::InsideStandaloneDeclValue
145 | } else {
146 | DeclarationSubstate::AfterStandaloneDecl
147 | }
148 | )),
149 | _ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into()))),
150 | }
151 | }),
152 |
153 | DeclarationSubstate::AfterStandaloneDecl => match t {
154 | Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDeclValue)),
155 | Token::Character(c) if is_whitespace_char(c) => None,
156 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
157 | },
158 |
159 | DeclarationSubstate::InsideStandaloneDeclValue => self.read_attribute_value(t, |this, value| {
160 | let standalone = match &*value {
161 | "yes" => Some(true),
162 | "no" => Some(false),
163 | _ => None
164 | };
165 | if standalone.is_some() {
166 | this.data.standalone = standalone;
167 | this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterStandaloneDeclValue))
168 | } else {
169 | Some(this.error(SyntaxError::InvalidStandaloneDeclaration(value.into())))
170 | }
171 | }),
172 |
173 | DeclarationSubstate::AfterStandaloneDeclValue => match t {
174 | Token::ProcessingInstructionEnd => self.emit_start_document(),
175 | Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
176 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
177 | },
178 | }
179 | }
180 | }
181 |
--------------------------------------------------------------------------------
/src/writer/events.rs:
--------------------------------------------------------------------------------
1 | //! Contains `XmlEvent` datatype, instances of which are consumed by the writer.
2 |
3 | use std::borrow::Cow;
4 |
5 | use crate::attribute::Attribute;
6 | use crate::common::XmlVersion;
7 | use crate::name::Name;
8 | use crate::namespace::{Namespace, NS_NO_PREFIX};
9 | use crate::reader::ErrorKind;
10 |
11 | /// A part of an XML output stream.
12 | ///
13 | /// Objects of this enum are consumed by `EventWriter`. They correspond to different parts of
14 | /// an XML document.
15 | #[derive(Debug, Clone)]
16 | #[non_exhaustive]
17 | pub enum XmlEvent<'a> {
18 | /// Corresponds to XML document declaration.
19 | ///
20 | /// This event should always be written before any other event. If it is not written
21 | /// at all, a default XML declaration will be outputted if the corresponding option
22 | /// is set in the configuration. Otherwise an error will be returned.
23 | StartDocument {
24 | /// XML version.
25 | ///
26 | /// Defaults to `XmlVersion::Version10`.
27 | version: XmlVersion,
28 |
29 | /// XML document encoding.
30 | ///
31 | /// Defaults to `Some("UTF-8")`.
32 | encoding: Option<&'a str>,
33 |
34 | /// XML standalone declaration.
35 | ///
36 | /// Defaults to `None`.
37 | standalone: Option,
38 | },
39 |
40 | /// Denotes an XML processing instruction.
41 | ProcessingInstruction {
42 | /// Processing instruction target.
43 | name: &'a str,
44 |
45 | /// Processing instruction content.
46 | data: Option<&'a str>,
47 | },
48 |
49 | /// Denotes a beginning of an XML element.
50 | StartElement {
51 | /// Qualified name of the element.
52 | name: Name<'a>,
53 |
54 | /// A list of attributes associated with the element.
55 | ///
56 | /// Currently attributes are not checked for duplicates (TODO). Attribute values
57 | /// will be escaped, and all characters invalid for attribute values like `"` or `<`
58 | /// will be changed into character entities.
59 | attributes: Cow<'a, [Attribute<'a>]>,
60 |
61 | /// Contents of the namespace mapping at this point of the document.
62 | ///
63 | /// This mapping will be inspected for "new" entries, and if at this point of the document
64 | /// a particular pair of prefix and namespace URI is already defined, no namespace
65 | /// attributes will be emitted.
66 | namespace: Cow<'a, Namespace>,
67 | },
68 |
69 | /// Denotes an end of an XML element.
70 | EndElement {
71 | /// Optional qualified name of the element.
72 | ///
73 | /// If `None`, then it is assumed that the element name should be the last valid one.
74 | /// If `Some` and element names tracking is enabled, then the writer will check it for
75 | /// correctness.
76 | name: Option>,
77 | },
78 |
79 | /// Denotes CDATA content.
80 | ///
81 | /// This event contains unparsed data, and no escaping will be performed when writing it
82 | /// to the output stream.
83 | CData(&'a str),
84 |
85 | /// Denotes a comment.
86 | ///
87 | /// The string will be checked for invalid sequences and error will be returned by the
88 | /// write operation
89 | Comment(&'a str),
90 |
91 | /// Denotes character data outside of tags.
92 | ///
93 | /// Contents of this event will be escaped if `perform_escaping` option is enabled,
94 | /// that is, every character invalid for PCDATA will appear as a character entity.
95 | Characters(&'a str),
96 |
97 | /// Emits raw characters which will never be escaped.
98 | ///
99 | /// This event is only used for writing to an output stream, there is no equivalent
100 | /// reader event. Care must be taken when using this event, as it can easily result
101 | /// non-well-formed documents.
102 | RawCharacters(&'a str),
103 |
104 | /// Syntax of the `DOCTYPE`, everyhing including `<` and `>`
105 | Doctype(&'a str),
106 | }
107 |
108 | impl<'a> XmlEvent<'a> {
109 | /// Returns an writer event for a processing instruction.
110 | #[inline]
111 | #[must_use]
112 | pub const fn processing_instruction(name: &'a str, data: Option<&'a str>) -> Self {
113 | XmlEvent::ProcessingInstruction { name, data }
114 | }
115 |
116 | /// Returns a builder for a starting element.
117 | ///
118 | /// This builder can then be used to tweak attributes and namespace starting at
119 | /// this element.
120 | #[inline]
121 | pub fn start_element(name: S) -> StartElementBuilder<'a> where S: Into> {
122 | StartElementBuilder {
123 | name: name.into(),
124 | attributes: Vec::new(),
125 | namespace: Namespace::empty(),
126 | }
127 | }
128 |
129 | /// Returns a builder for an closing element.
130 | ///
131 | /// This method, unline `start_element()`, does not accept a name because by default
132 | /// the writer is able to determine it automatically. However, when this functionality
133 | /// is disabled, it is possible to specify the name with `name()` method on the builder.
134 | #[inline]
135 | #[must_use]
136 | pub const fn end_element() -> EndElementBuilder<'a> {
137 | EndElementBuilder { name: None }
138 | }
139 |
140 | /// Returns a CDATA event.
141 | ///
142 | /// Naturally, the provided string won't be escaped, except for closing CDATA token `]]>`
143 | /// (depending on the configuration).
144 | #[inline]
145 | #[must_use]
146 | pub const fn cdata(data: &'a str) -> Self {
147 | XmlEvent::CData(data)
148 | }
149 |
150 | /// Returns a regular characters (PCDATA) event.
151 | ///
152 | /// All offending symbols, in particular, `&` and `<`, will be escaped by the writer.
153 | #[inline]
154 | #[must_use]
155 | pub const fn characters(data: &'a str) -> Self {
156 | XmlEvent::Characters(data)
157 | }
158 |
159 | /// Returns a raw characters event.
160 | ///
161 | /// No escaping takes place.
162 | /// This event is only used for writing to an output stream, there is no equivalent
163 | /// reader event. Care must be taken when using this event, as it can easily result
164 | /// non-well-formed documents.
165 | #[inline]
166 | #[must_use]
167 | pub const fn raw_characters(data: &'a str) -> Self {
168 | XmlEvent::RawCharacters(data)
169 | }
170 |
171 | /// Returns a comment event.
172 | #[inline]
173 | #[must_use]
174 | pub const fn comment(data: &'a str) -> Self {
175 | XmlEvent::Comment(data)
176 | }
177 | }
178 |
179 | impl<'a> From<&'a str> for XmlEvent<'a> {
180 | #[inline]
181 | fn from(s: &'a str) -> Self {
182 | XmlEvent::Characters(s)
183 | }
184 | }
185 |
186 | /// A builder for a closing element event.
187 | pub struct EndElementBuilder<'a> {
188 | name: Option>,
189 | }
190 |
191 | /// A builder for a closing element event.
192 | impl<'a> EndElementBuilder<'a> {
193 | /// Sets the name of this closing element.
194 | ///
195 | /// Usually the writer is able to determine closing element names automatically. If
196 | /// this functionality is enabled (by default it is), then this name is checked for correctness.
197 | /// It is possible, however, to disable such behavior; then the user must ensure that
198 | /// closing element name is correct manually.
199 | #[inline]
200 | #[must_use]
201 | pub fn name(mut self, name: N) -> Self where N: Into> {
202 | self.name = Some(name.into());
203 | self
204 | }
205 | }
206 |
207 | impl<'a> From> for XmlEvent<'a> {
208 | fn from(b: EndElementBuilder<'a>) -> Self {
209 | XmlEvent::EndElement { name: b.name }
210 | }
211 | }
212 |
213 | /// A builder for a starting element event.
214 | pub struct StartElementBuilder<'a> {
215 | name: Name<'a>,
216 | attributes: Vec>,
217 | namespace: Namespace,
218 | }
219 |
220 | impl<'a> StartElementBuilder<'a> {
221 | /// Sets an attribute value of this element to the given string.
222 | ///
223 | /// This method can be used to add attributes to the starting element. Name is a qualified
224 | /// name; its namespace is ignored, but its prefix is checked for correctness, that is,
225 | /// it is checked that the prefix is bound to some namespace in the current context.
226 | ///
227 | /// Currently attributes are not checked for duplicates. Note that duplicate attributes
228 | /// are a violation of XML document well-formedness.
229 | ///
230 | /// The writer checks that you don't specify reserved prefix names, for example `xmlns`.
231 | #[inline]
232 | #[must_use]
233 | pub fn attr(mut self, name: N, value: &'a str) -> Self
234 | where N: Into> {
235 | self.attributes.push(Attribute::new(name.into(), value));
236 | self
237 | }
238 |
239 | /// Adds a namespace to the current namespace context.
240 | ///
241 | /// If no namespace URI was bound to the provided prefix at this point of the document,
242 | /// then the mapping from the prefix to the provided namespace URI will be written as
243 | /// a part of this element attribute set.
244 | ///
245 | /// If the same namespace URI was bound to the provided prefix at this point of the document,
246 | /// then no namespace attributes will be emitted.
247 | ///
248 | /// If some other namespace URI was bound to the provided prefix at this point of the document,
249 | /// then another binding will be added as a part of this element attribute set, shadowing
250 | /// the outer binding.
251 | #[inline]
252 | #[must_use]
253 | pub fn ns(mut self, prefix: S1, uri: S2) -> Self
254 | where S1: Into, S2: Into
255 | {
256 | self.namespace.put(prefix, uri);
257 | self
258 | }
259 |
260 | /// Adds a default namespace mapping to the current namespace context.
261 | ///
262 | /// Same rules as for `ns()` are also valid for the default namespace mapping.
263 | #[inline]
264 | #[must_use]
265 | pub fn default_ns(mut self, uri: S) -> Self
266 | where S: Into {
267 | self.namespace.put(NS_NO_PREFIX, uri);
268 | self
269 | }
270 | }
271 |
272 | impl<'a> From> for XmlEvent<'a> {
273 | #[inline]
274 | fn from(b: StartElementBuilder<'a>) -> Self {
275 | XmlEvent::StartElement {
276 | name: b.name,
277 | attributes: Cow::Owned(b.attributes),
278 | namespace: Cow::Owned(b.namespace),
279 | }
280 | }
281 | }
282 |
283 | impl<'a> TryFrom<&'a crate::reader::XmlEvent> for XmlEvent<'a> {
284 | type Error = crate::reader::Error;
285 |
286 | fn try_from(event: &crate::reader::XmlEvent) -> Result, Self::Error> {
287 | Ok(event.as_writer_event().ok_or(ErrorKind::UnexpectedEof)?)
288 | }
289 | }
290 |
--------------------------------------------------------------------------------
/src/name.rs:
--------------------------------------------------------------------------------
1 | //! Contains XML qualified names manipulation types and functions.
2 |
3 | use std::fmt;
4 | use std::str::FromStr;
5 |
6 | use crate::namespace::NS_NO_PREFIX;
7 |
/// A borrowed, qualified XML name.
///
/// Every qualified name has a local part. A prefix is optional; when a document is being
/// read, a name that carries a prefix must also carry a namespace URI, although the type
/// does not enforce this statically (see below). A namespace may be present without a
/// prefix, in which case the default, empty prefix is assumed.
///
/// When a document is being written, the namespace URI may be left out and only the prefix
/// given. The writer then checks that the specified prefix is bound to some URI in the
/// current namespace context. When both prefix and namespace URI are given, the writer
/// checks that the current namespace context holds exactly that prefix-to-URI binding.
///
/// # Prefixes and URIs
///
/// A prefixed name without the namespace bound to that prefix is meaningless, so a
/// qualified name with a prefix should always carry a proper namespace URI. The URI for a
/// prefix, however, can only be looked up in a context, and such a context exists only
/// while a document is parsed (or when it is built by hand for writing). Tying a name to
/// a context statically seems impractical, though this may change in future.
///
/// # Conversions
///
/// `Name` provides `From` implementations for strings and tuples. For example:
///
/// ```rust
/// # use xml::name::Name;
/// let n1: Name = "p:some-name".into();
/// let n2: Name = ("p", "some-name").into();
///
/// assert_eq!(n1, n2);
/// assert_eq!(n1.local_name, "some-name");
/// assert_eq!(n1.prefix, Some("p"));
/// assert!(n1.namespace.is_none());
/// ```
///
/// These exist to make it easy to name XML elements when writing documents.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Name<'a> {
    /// The local part of the name, e.g. `string` in `xsi:string`.
    pub local_name: &'a str,

    /// The namespace URI, e.g. `http://www.w3.org/2000/xmlns/`, if any.
    pub namespace: Option<&'a str>,

    /// The prefix, e.g. `xsi` in `xsi:string`, if any.
    pub prefix: Option<&'a str>,
}
57 |
58 | impl<'a> From<&'a str> for Name<'a> {
59 | fn from(s: &'a str) -> Self {
60 | if let Some((prefix, name)) = s.split_once(':') {
61 | Name::prefixed(name, prefix)
62 | } else {
63 | Name::local(s)
64 | }
65 | }
66 | }
67 |
68 | impl<'a> From<(&'a str, &'a str)> for Name<'a> {
69 | fn from((prefix, name): (&'a str, &'a str)) -> Self {
70 | Name::prefixed(name, prefix)
71 | }
72 | }
73 |
74 | impl fmt::Display for Name<'_> {
75 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
76 | if let Some(namespace) = self.namespace {
77 | write!(f, "{{{namespace}}}")?;
78 | }
79 |
80 | if let Some(prefix) = self.prefix {
81 | write!(f, "{prefix}:")?;
82 | }
83 |
84 | f.write_str(self.local_name)
85 | }
86 | }
87 |
88 | impl<'a> Name<'a> {
89 | /// Returns an owned variant of the qualified name.
90 | #[must_use]
91 | pub fn to_owned(&self) -> OwnedName {
92 | OwnedName {
93 | local_name: self.local_name.into(),
94 | namespace: self.namespace.map(std::convert::Into::into),
95 | prefix: self.prefix.map(std::convert::Into::into),
96 | }
97 | }
98 |
99 | /// Returns a new `Name` instance representing plain local name.
100 | #[inline]
101 | #[must_use]
102 | pub const fn local(local_name: &str) -> Name<'_> {
103 | Name {
104 | local_name,
105 | prefix: None,
106 | namespace: None,
107 | }
108 | }
109 |
110 | /// Returns a new `Name` instance with the given local name and prefix.
111 | #[inline]
112 | #[must_use]
113 | pub const fn prefixed(local_name: &'a str, prefix: &'a str) -> Self {
114 | Name {
115 | local_name,
116 | namespace: None,
117 | prefix: Some(prefix),
118 | }
119 | }
120 |
121 | /// Returns a new `Name` instance representing a qualified name with or without a prefix and
122 | /// with a namespace URI.
123 | #[inline]
124 | #[must_use]
125 | pub const fn qualified(local_name: &'a str, namespace: &'a str, prefix: Option<&'a str>) -> Self {
126 | Name {
127 | local_name,
128 | namespace: Some(namespace),
129 | prefix,
130 | }
131 | }
132 |
133 | /// Returns a correct XML representation of this local name and prefix.
134 | ///
135 | /// This method is different from the autoimplemented `to_string()` because it does not
136 | /// include namespace URI in the result.
137 | #[must_use]
138 | pub fn to_repr(&self) -> String {
139 | self.repr_display().to_string()
140 | }
141 |
142 | /// Returns a structure which can be displayed with `std::fmt` machinery to obtain this
143 | /// local name and prefix.
144 | ///
145 | /// This method is needed for efficiency purposes in order not to create unnecessary
146 | /// allocations.
147 | #[inline]
148 | #[must_use]
149 | pub const fn repr_display(&self) -> ReprDisplay<'_, '_> {
150 | ReprDisplay(self)
151 | }
152 |
153 | /// Returns either a prefix of this name or `namespace::NS_NO_PREFIX` constant.
154 | #[inline]
155 | #[must_use]
156 | pub fn prefix_repr(&self) -> &str {
157 | self.prefix.unwrap_or(NS_NO_PREFIX)
158 | }
159 | }
160 |
161 | /// A wrapper around `Name` whose `Display` implementation prints the wrapped name as it is
162 | /// displayed in an XML document.
163 | pub struct ReprDisplay<'a, 'b>(&'a Name<'b>);
164 |
165 | impl<'a, 'b: 'a> fmt::Display for ReprDisplay<'a, 'b> {
166 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
167 | match self.0.prefix {
168 | Some(prefix) => write!(f, "{}:{}", prefix, self.0.local_name),
169 | None => self.0.local_name.fmt(f),
170 | }
171 | }
172 | }
173 |
/// An owned variant of `Name`.
///
/// Everything about `Name` applies to this structure as well; the only difference is that
/// every component is stored as an owned `String` instead of a borrowed string slice.
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
pub struct OwnedName {
    /// A local name, e.g. `string` in `xsi:string`.
    pub local_name: String,

    /// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`.
    pub namespace: Option<String>,

    /// A name prefix, e.g. `xsi` in `xsi:string`.
    pub prefix: Option<String>,
}
188 |
189 | impl fmt::Display for OwnedName {
190 | #[inline]
191 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
192 | fmt::Display::fmt(&self.borrow(), f)
193 | }
194 | }
195 |
196 | impl OwnedName {
197 | /// Constructs a borrowed `Name` based on this owned name.
198 | #[must_use]
199 | #[inline]
200 | pub fn borrow(&self) -> Name<'_> {
201 | Name {
202 | local_name: &self.local_name,
203 | namespace: self.namespace.as_deref(),
204 | prefix: self.prefix.as_deref(),
205 | }
206 | }
207 |
208 | /// Returns a new `OwnedName` instance representing a plain local name.
209 | #[inline]
210 | pub fn local(local_name: S) -> Self where S: Into {
211 | Self {
212 | local_name: local_name.into(),
213 | namespace: None,
214 | prefix: None,
215 | }
216 | }
217 |
218 | /// Returns a new `OwnedName` instance representing a qualified name with or without
219 | /// a prefix and with a namespace URI.
220 | #[inline]
221 | pub fn qualified(local_name: S1, namespace: S2, prefix: Option) -> Self
222 | where S1: Into, S2: Into, S3: Into
223 | {
224 | Self {
225 | local_name: local_name.into(),
226 | namespace: Some(namespace.into()),
227 | prefix: prefix.map(std::convert::Into::into),
228 | }
229 | }
230 |
231 | /// Returns an optional prefix by reference, equivalent to `self.borrow().prefix`
232 | /// but avoids extra work.
233 | #[inline]
234 | #[must_use]
235 | pub fn prefix_ref(&self) -> Option<&str> {
236 | self.prefix.as_deref()
237 | }
238 |
239 | /// Returns an optional namespace by reference, equivalen to `self.borrow().namespace`
240 | /// but avoids extra work.
241 | #[inline]
242 | #[must_use]
243 | pub fn namespace_ref(&self) -> Option<&str> {
244 | self.namespace.as_deref()
245 | }
246 | }
247 |
248 | impl<'a> From> for OwnedName {
249 | #[inline]
250 | fn from(n: Name<'a>) -> Self {
251 | n.to_owned()
252 | }
253 | }
254 |
255 | impl FromStr for OwnedName {
256 | type Err = ();
257 |
258 | /// Parses the given string slice into a qualified name.
259 | ///
260 | /// This function, when finishes sucessfully, always return a qualified
261 | /// name without a namespace (`name.namespace == None`). It should be filled later
262 | /// using proper `NamespaceStack`.
263 | ///
264 | /// It is supposed that all characters in the argument string are correct
265 | /// as defined by the XML specification. No additional checks except a check
266 | /// for emptiness are done.
267 | fn from_str(s: &str) -> Result {
268 | let mut it = s.split(':');
269 |
270 | let r = match (it.next(), it.next(), it.next()) {
271 | (Some(prefix), Some(local_name), None) if !prefix.is_empty() &&
272 | !local_name.is_empty() =>
273 | Some((local_name.into(), Some(prefix.into()))),
274 | (Some(local_name), None, None) if !local_name.is_empty() =>
275 | Some((local_name.into(), None)),
276 | (_, _, _) => None
277 | };
278 | r.map(|(local_name, prefix)| Self {
279 | local_name,
280 | namespace: None,
281 | prefix
282 | }).ok_or(())
283 | }
284 | }
285 |
#[cfg(test)]
mod tests {
    use super::OwnedName;

    /// Checks `OwnedName`'s `FromStr` implementation: `prefix:name` and `name` parse into
    /// names without a namespace, while empty parts and extra colons are rejected.
    #[test]
    fn test_owned_name_from_str() {
        assert_eq!("prefix:name".parse(), Ok(OwnedName {
            local_name: "name".into(),
            namespace: None,
            prefix: Some("prefix".into())
        }));

        assert_eq!("name".parse(), Ok(OwnedName {
            local_name: "name".into(),
            namespace: None,
            prefix: None
        }));

        assert_eq!("".parse(), Err::<OwnedName, ()>(()));
        assert_eq!(":".parse(), Err::<OwnedName, ()>(()));
        assert_eq!(":a".parse(), Err::<OwnedName, ()>(()));
        assert_eq!("a:".parse(), Err::<OwnedName, ()>(()));
        assert_eq!("a:b:c".parse(), Err::<OwnedName, ()>(()));
    }
}
311 |
--------------------------------------------------------------------------------