├── redirect-xml-rs ├── README.md ├── src │ └── lib.rs └── Cargo.toml ├── tests ├── cases │ ├── quote.xml │ ├── xmlnsquote.xml │ └── autosar.xml ├── xmlts20130923.zip ├── documents │ ├── sample_6.xml │ ├── sample_5.xml │ ├── sample_5_short.txt │ ├── sample_6_full.txt │ ├── sample_7.xml │ ├── sample_3.xml │ ├── sample_4.xml │ ├── sample_4_short.txt │ ├── sample_3_short.txt │ ├── sample_7_full.txt │ ├── sample_2.xml │ ├── sample_4_full.txt │ ├── sample_3_full.txt │ ├── sample_2_short.txt │ ├── sample_1.xml │ ├── sample_1_short.txt │ ├── sample_2_full.txt │ ├── sample_8_coalesce_all.txt │ ├── sample_8_coalesce_cwscdch.txt │ ├── sample_1_full.txt │ ├── sample_8.xml │ ├── sample_8_c.txt │ ├── sample_8_coalesce_wscdch.txt │ ├── sample_8_wscdch.txt │ ├── sample_8_full.txt │ └── sample_8_wsch.txt ├── rmt-ns11.fail.txt ├── tests.xml ├── sun-valid.fail.txt ├── errata3e.fail.txt ├── ibm_oasis_valid.fail.txt ├── ibm_valid.fail.txt ├── xml11.fail.txt ├── errata2e.fail.txt ├── errata4e.fail.txt ├── rmt-ns10.fail.txt ├── sun-not-wf.fail.txt ├── streaming.rs ├── oasis.fail.txt ├── xmltest.fail.txt └── xmlconf.rs ├── .github ├── FUNDING.yml └── workflows │ └── main.yml ├── .gitignore ├── README.md ├── .rustfmt.toml ├── Cargo.toml ├── src ├── lib.rs ├── macros.rs ├── reader │ ├── parser │ │ ├── inside_comment.rs │ │ ├── inside_cdata.rs │ │ ├── inside_closing_tag_name.rs │ │ ├── inside_reference.rs │ │ ├── inside_processing_instruction.rs │ │ ├── inside_opening_tag.rs │ │ ├── outside_tag.rs │ │ └── inside_declaration.rs │ ├── indexset.rs │ └── events.rs ├── attribute.rs ├── writer.rs ├── escape.rs ├── common.rs ├── reader.rs ├── writer │ ├── config.rs │ └── events.rs └── name.rs ├── LICENSE ├── examples ├── rewrite.rs ├── print_events.rs └── xml-analyze.rs ├── Changelog.md └── benches └── bench.rs /redirect-xml-rs/README.md: -------------------------------------------------------------------------------- 1 | ../README.md 
-------------------------------------------------------------------------------- /tests/cases/quote.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: ["kornelski"] 2 | liberapay: ["kornel"] 3 | -------------------------------------------------------------------------------- /tests/cases/xmlnsquote.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/xmlts20130923.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netvl/xml-rs/HEAD/tests/xmlts20130923.zip -------------------------------------------------------------------------------- /tests/documents/sample_6.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Hello 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.swo 3 | /doc 4 | *~ 5 | target/ 6 | Cargo.lock 7 | .idea/ 8 | *.iml 9 | /tests/xmlconf/ 10 | .DS_Store 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | The `xml-rs` project has a new home 2 | =================================== 3 | 4 | The current repository is: 5 | 6 | https://github.com/kornelski/xml-rs 7 | 8 | -------------------------------------------------------------------------------- /tests/documents/sample_5.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 |

4 | test ©≂̸ 5 |

6 | 7 | 8 | -------------------------------------------------------------------------------- /.rustfmt.toml: -------------------------------------------------------------------------------- 1 | # rustfmt is too aggressive and introduces too many inconsistencies and questionable choices to be applied unconditionally 2 | # please do not use it. 3 | disable_all_formatting = true 4 | -------------------------------------------------------------------------------- /tests/rmt-ns11.fail.txt: -------------------------------------------------------------------------------- 1 | rmt-ns11-003 003.xml 1.1 style prefix unbinding ; 10:16 Cannot undefine prefix 'a' 2 | rmt-ns11-004 004.xml 1.1 style prefix unbinding and rebinding ; 11:16 Cannot undefine prefix 'a' 3 | -------------------------------------------------------------------------------- /tests/documents/sample_5_short.txt: -------------------------------------------------------------------------------- 1 | StartDocument(1.0, utf-8) 2 | Doctype("") 3 | StartElement(p) 4 | StartElement(a) 5 | Characters("test ©≂̸") 6 | EndElement(a) 7 | EndElement(p) 8 | EndDocument 9 | -------------------------------------------------------------------------------- /tests/documents/sample_6_full.txt: -------------------------------------------------------------------------------- 1 | StartDocument(1.0, UTF-8) 2 | Whitespace("\n") 3 | ProcessingInstruction(xml-stylesheet="href=\"doc.xsl\"") 4 | Whitespace("\n\n") 5 | StartElement(doc) 6 | Characters("Hello") 7 | EndElement(doc) 8 | EndDocument 9 | -------------------------------------------------------------------------------- /tests/tests.xml: -------------------------------------------------------------------------------- 1 | 2 | issue 152 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /tests/sun-valid.fail.txt: -------------------------------------------------------------------------------- 1 | not-sa03 not-sa03.xml A non-standalone 
document is valid if declared as such.; 19:20 Unexpected entity: internal 2 | v-pe00 pe00.xml Tests construction of internal entity replacement text, using an example in the XML specification. ; 2:12 Unexpected entity: book 3 | -------------------------------------------------------------------------------- /tests/documents/sample_7.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | why"> 6 | ]> 7 | 8 | omg why 9 | &ersand; 10 | &rsq; 11 | &lb; 12 | &omg; 13 | 14 | -------------------------------------------------------------------------------- /tests/errata3e.fail.txt: -------------------------------------------------------------------------------- 1 | rmt-e3e-12 E12.xml Default values for attributes may not contain references to external entities. 2 | rmt-e3e-13 E13.xml Even internal parameter entity references are enough to make undeclared entities into mere validity errors rather than well-formedness errors. ; 7:11 Unexpected entity: ent2 3 | -------------------------------------------------------------------------------- /tests/documents/sample_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | test 5 | kkss" = ddd' > 6 | ddddd!e3--> 5 | test 6 | kkss" = ddd' > 7 | ddddd!e3--> 3 | 4 | 5 | 6 | 7 | 8 |
9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 |
44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 |
58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 |
70 | 71 | 72 | 73 | 74 | 75 | 76 |

77 | 78 | -------------------------------------------------------------------------------- /examples/rewrite.rs: -------------------------------------------------------------------------------- 1 | //! See for a real-world example. 2 | 3 | use std::fs::File; 4 | use std::io::BufReader; 5 | use std::path::Path; 6 | use xml::EmitterConfig; 7 | use xml::reader::{ParserConfig, Result}; 8 | 9 | fn main() -> Result<(), Box> { 10 | let arg = std::env::args_os().nth(1); 11 | let file_path = Path::new(arg.as_deref().unwrap_or("tests/documents/sample_1.xml".as_ref())); 12 | let file = BufReader::new(File::open(file_path) 13 | .map_err(|e| format!("Can't open {}: {e}", file_path.display()))?); 14 | 15 | let mut reader = ParserConfig::default() 16 | .ignore_root_level_whitespace(true) 17 | .ignore_comments(false) 18 | .cdata_to_characters(true) 19 | .coalesce_characters(true) 20 | .create_reader(file); 21 | 22 | let stdout = std::io::stdout().lock(); 23 | 24 | let mut writer = EmitterConfig::default() 25 | .create_writer(stdout); 26 | 27 | loop { 28 | let reader_event = reader.next()?; 29 | 30 | match reader_event { 31 | xml::reader::XmlEvent::EndDocument => break, 32 | xml::reader::XmlEvent::StartElement { name, mut attributes, namespace } => { 33 | let event = xml::writer::XmlEvent::StartElement { 34 | name: name.borrow(), 35 | namespace: namespace.borrow(), 36 | attributes: attributes.iter_mut().map(|attr| { 37 | attr.value = alternating_caps(&attr.value); 38 | attr.borrow() 39 | }).collect(), 40 | }; 41 | writer.write(event)?; 42 | }, 43 | xml::reader::XmlEvent::Characters(text) => { 44 | let text = alternating_caps(&text); 45 | let event = xml::writer::XmlEvent::Characters(&text); 46 | writer.write(event)?; 47 | }, 48 | xml::reader::XmlEvent::Comment(text) => { 49 | let text = alternating_caps(&text); 50 | let event = xml::writer::XmlEvent::Comment(&text); 51 | writer.write(event)?; 52 | }, 53 | other => { 54 | if let Some(writer_event) = other.as_writer_event() { 55 | 
writer.write(writer_event)?; 56 | } 57 | }, 58 | } 59 | } 60 | Ok(()) 61 | } 62 | 63 | fn alternating_caps(text: &str) -> String { 64 | text.chars().enumerate() 65 | .map(|(i, ch)| if i&1==0 { ch.to_ascii_uppercase() } else { ch.to_ascii_lowercase() }) 66 | .collect() 67 | } 68 | -------------------------------------------------------------------------------- /tests/sun-not-wf.fail.txt: -------------------------------------------------------------------------------- 1 | attlist01 attlist01.xml SGML's NUTOKEN is not allowed. 2 | attlist02 attlist02.xml SGML's NUTOKENS attribute type is not allowed. 3 | attlist03 attlist03.xml Comma doesn't separate enumerations, unlike in SGML. 4 | attlist04 attlist04.xml SGML's NUMBER attribute type is not allowed. 5 | attlist05 attlist05.xml SGML's NUMBERS attribute type is not allowed. 6 | attlist06 attlist06.xml SGML's NAME attribute type is not allowed. 7 | attlist07 attlist07.xml SGML's NAMES attribute type is not allowed. 8 | attlist08 attlist08.xml SGML's #CURRENT is not allowed. 9 | attlist09 attlist09.xml SGML's #CONREF is not allowed. 10 | cond01 cond01.xml Only INCLUDE and IGNORE are conditional section keywords 11 | cond02 cond02.xml Must have keyword in conditional sections 12 | content01 content01.xml No whitespace before "?" in content model 13 | content02 content02.xml No whitespace before "*" in content model 14 | content03 content03.xml No whitespace before "+" in content model 15 | decl01 decl01.xml External entities may not have standalone decls. 16 | nwf-dtd00 dtd00.xml Comma mandatory in content model 17 | nwf-dtd01 dtd01.xml Can't mix comma and vertical bar in content models 18 | dtd04 dtd04.xml PUBLIC literal must be quoted 19 | dtd05 dtd05.xml SYSTEM identifier must be quoted 20 | dtd07 dtd07.xml Text declarations (which optionally begin any external entity) are required to have "encoding=...". 
21 | encoding07 encoding07.xml Text declarations (which optionally begin any external entity) are required to have "encoding=...". 22 | pi pi.xml No space between PI target name and data 23 | pubid01 pubid01.xml Illegal entity ref in public ID 24 | pubid02 pubid02.xml Illegal characters in public ID 25 | pubid03 pubid03.xml Illegal characters in public ID 26 | pubid04 pubid04.xml Illegal characters in public ID 27 | pubid05 pubid05.xml SGML-ism: public ID without system ID 28 | sgml04 sgml04.xml ATTLIST declarations apply to only one element, unlike SGML 29 | sgml05 sgml05.xml ELEMENT declarations apply to only one element, unlike SGML 30 | sgml06 sgml06.xml ATTLIST declarations are never global, unlike in SGML 31 | sgml07 sgml07.xml SGML Tag minimization specifications are not allowed 32 | sgml08 sgml08.xml SGML Tag minimization specifications are not allowed 33 | sgml09 sgml09.xml SGML Content model exception specifications are not allowed 34 | sgml10 sgml10.xml SGML Content model exception specifications are not allowed 35 | sgml11 sgml11.xml CDATA is not a valid content model spec 36 | sgml12 sgml12.xml RCDATA is not a valid content model spec 37 | sgml13 sgml13.xml SGML Unordered content models not allowed 38 | -------------------------------------------------------------------------------- /examples/print_events.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::BufReader; 3 | use xml::common::Position; 4 | use xml::reader::{ParserConfig, XmlEvent}; 5 | 6 | fn main() { 7 | let file_path = std::env::args_os().nth(1).expect("Please specify a path to an XML file"); 8 | let file = File::open(file_path).unwrap(); 9 | 10 | let mut reader = ParserConfig::default() 11 | .ignore_root_level_whitespace(false) 12 | .create_reader(BufReader::new(file)); 13 | 14 | loop { 15 | match reader.next() { 16 | Ok(e) => { 17 | print!("{}\t", reader.position()); 18 | 19 | match e { 20 | XmlEvent::StartDocument 
{ version, encoding, .. } => { 21 | println!("StartDocument({version}, {encoding})"); 22 | }, 23 | XmlEvent::EndDocument => { 24 | println!("EndDocument"); 25 | break; 26 | }, 27 | XmlEvent::ProcessingInstruction { name, data } => { 28 | println!("ProcessingInstruction({name}={:?})", data.as_deref().unwrap_or_default()); 29 | }, 30 | XmlEvent::StartElement { name, attributes, .. } => { 31 | if attributes.is_empty() { 32 | println!("StartElement({name})"); 33 | } else { 34 | let attrs: Vec<_> = attributes 35 | .iter() 36 | .map(|a| format!("{}={:?}", &a.name, a.value)) 37 | .collect(); 38 | println!("StartElement({name} [{}])", attrs.join(", ")); 39 | } 40 | }, 41 | XmlEvent::EndElement { name } => { 42 | println!("EndElement({name})"); 43 | }, 44 | XmlEvent::Comment(data) => { 45 | println!(r#"Comment("{}")"#, data.escape_debug()); 46 | }, 47 | XmlEvent::CData(data) => println!(r#"CData("{}")"#, data.escape_debug()), 48 | XmlEvent::Characters(data) => { 49 | println!(r#"Characters("{}")"#, data.escape_debug()); 50 | }, 51 | XmlEvent::Whitespace(data) => { 52 | println!(r#"Whitespace("{}")"#, data.escape_debug()); 53 | }, 54 | XmlEvent::Doctype { syntax } => { 55 | println!(r#"Doctype("{}")"#, syntax.escape_debug()); 56 | }, 57 | } 58 | }, 59 | Err(e) => { 60 | eprintln!("Error at {}: {e}", reader.position()); 61 | break; 62 | }, 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/attribute.rs: -------------------------------------------------------------------------------- 1 | //! Contains XML attributes manipulation types and functions. 2 | 3 | use std::fmt; 4 | 5 | use crate::escape::{AttributeEscapes, Escaped}; 6 | use crate::name::{Name, OwnedName}; 7 | 8 | /// A borrowed version of an XML attribute. 9 | /// 10 | /// Consists of a borrowed qualified name and a borrowed string value. 11 | #[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)] 12 | pub struct Attribute<'a> { 13 | /// Attribute name. 
14 | pub name: Name<'a>, 15 | 16 | /// Attribute value. 17 | pub value: &'a str, 18 | } 19 | 20 | impl fmt::Display for Attribute<'_> { 21 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 22 | write!(f, "{}=\"{}\"", self.name, Escaped::::new(self.value)) 23 | } 24 | } 25 | 26 | impl<'a> Attribute<'a> { 27 | /// Creates an owned attribute out of this borrowed one. 28 | #[inline] 29 | #[must_use] 30 | pub fn to_owned(&self) -> OwnedAttribute { 31 | OwnedAttribute { 32 | name: self.name.into(), 33 | value: self.value.into(), 34 | } 35 | } 36 | 37 | /// Creates a borrowed attribute using the provided borrowed name and a borrowed string value. 38 | #[inline] 39 | #[must_use] 40 | pub const fn new(name: Name<'a>, value: &'a str) -> Self { 41 | Attribute { name, value } 42 | } 43 | } 44 | 45 | /// An owned version of an XML attribute. 46 | /// 47 | /// Consists of an owned qualified name and an owned string value. 48 | #[derive(Clone, Eq, PartialEq, Hash, Debug)] 49 | pub struct OwnedAttribute { 50 | /// Attribute name. 51 | pub name: OwnedName, 52 | 53 | /// Attribute value. 54 | pub value: String, 55 | } 56 | 57 | impl OwnedAttribute { 58 | /// Returns a borrowed `Attribute` out of this owned one. 59 | #[must_use] 60 | #[inline] 61 | pub fn borrow(&self) -> Attribute<'_> { 62 | Attribute { 63 | name: self.name.borrow(), 64 | value: &self.value, 65 | } 66 | } 67 | 68 | /// Creates a new owned attribute using the provided owned name and an owned string value. 
69 | #[inline] 70 | pub fn new>(name: OwnedName, value: S) -> Self { 71 | Self { name, value: value.into() } 72 | } 73 | } 74 | 75 | impl fmt::Display for OwnedAttribute { 76 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 77 | write!(f, "{}=\"{}\"", self.name, Escaped::::new(&self.value)) 78 | } 79 | } 80 | 81 | #[cfg(test)] 82 | mod tests { 83 | use super::Attribute; 84 | 85 | use crate::name::Name; 86 | 87 | #[test] 88 | fn attribute_display() { 89 | let attr = Attribute::new( 90 | Name::qualified("attribute", "urn:namespace", Some("n")), 91 | "its value with > & \" ' < weird symbols", 92 | ); 93 | 94 | assert_eq!( 95 | &*attr.to_string(), 96 | "{urn:namespace}n:attribute=\"its value with > & " ' < weird symbols\"" 97 | ); 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /examples/xml-analyze.rs: -------------------------------------------------------------------------------- 1 | #![forbid(unsafe_code)] 2 | 3 | use std::collections::HashSet; 4 | use std::fs::File; 5 | use std::io::{self, BufReader, Read}; 6 | use std::{cmp, env}; 7 | 8 | use xml::ParserConfig; 9 | use xml::reader::XmlEvent; 10 | 11 | fn main() -> Result<(), Box> { 12 | let mut file; 13 | let mut stdin; 14 | let source: &mut dyn Read = if let Some(file_name) = env::args().nth(1) { 15 | file = File::open(file_name).map_err(|e| format!("Cannot open input file: {e}"))?; 16 | &mut file 17 | } else { 18 | stdin = io::stdin(); 19 | &mut stdin 20 | }; 21 | 22 | let reader = ParserConfig::new() 23 | .whitespace_to_characters(true) 24 | .ignore_comments(false) 25 | .create_reader(BufReader::new(source)); 26 | 27 | let mut processing_instructions = 0; 28 | let mut elements = 0; 29 | let mut character_blocks = 0; 30 | let mut cdata_blocks = 0; 31 | let mut characters = 0; 32 | let mut comment_blocks = 0; 33 | let mut comment_characters = 0; 34 | let mut namespaces = HashSet::new(); 35 | let mut depth = 0; 36 | let mut max_depth = 0; 37 | 38 | 
for e in reader { 39 | let e = e.map_err(|e| format!("Error parsing XML document: {e}"))?; 40 | match e { 41 | XmlEvent::StartDocument { version, encoding, standalone } => { 42 | println!( 43 | "XML document version {}, encoded in {}, {}standalone", 44 | version, encoding, if standalone.unwrap_or(false) { "" } else { "not " } 45 | ); 46 | }, 47 | XmlEvent::Doctype { syntax } => { 48 | println!("The Doctype is: {syntax}"); 49 | }, 50 | XmlEvent::EndDocument => println!("Document finished"), 51 | XmlEvent::ProcessingInstruction { .. } => processing_instructions += 1, 52 | XmlEvent::Whitespace(_) => {}, // can't happen due to configuration 53 | XmlEvent::Characters(s) => { 54 | character_blocks += 1; 55 | characters += s.len(); 56 | }, 57 | XmlEvent::CData(s) => { 58 | cdata_blocks += 1; 59 | characters += s.len(); 60 | }, 61 | XmlEvent::Comment(s) => { 62 | comment_blocks += 1; 63 | comment_characters += s.len(); 64 | }, 65 | XmlEvent::StartElement { namespace, .. } => { 66 | depth += 1; 67 | max_depth = cmp::max(max_depth, depth); 68 | elements += 1; 69 | namespaces.extend(namespace.0.into_values()); 70 | }, 71 | XmlEvent::EndElement { .. 
} => { 72 | depth -= 1; 73 | }, 74 | } 75 | } 76 | 77 | namespaces.remove(xml::namespace::NS_EMPTY_URI); 78 | namespaces.remove(xml::namespace::NS_XMLNS_URI); 79 | namespaces.remove(xml::namespace::NS_XML_URI); 80 | 81 | println!("Elements: {elements}, maximum depth: {max_depth}"); 82 | println!("Namespaces (excluding built-in): {}", namespaces.len()); 83 | println!("Characters: {characters}, characters blocks: {character_blocks}, CDATA blocks: {cdata_blocks}"); 84 | println!("Comment blocks: {comment_blocks}, comment characters: {comment_characters}"); 85 | println!("Processing instructions (excluding built-in): {processing_instructions}"); 86 | 87 | Ok(()) 88 | } 89 | -------------------------------------------------------------------------------- /src/reader/parser/inside_reference.rs: -------------------------------------------------------------------------------- 1 | use super::{PullParser, Result, State}; 2 | use crate::common::{is_name_char, is_name_start_char, is_whitespace_char}; 3 | use crate::reader::error::SyntaxError; 4 | use crate::reader::lexer::Token; 5 | use std::char; 6 | 7 | impl PullParser { 8 | pub fn inside_reference(&mut self, t: Token) -> Option { 9 | match t { 10 | Token::Character(c) if !self.data.ref_data.is_empty() && is_name_char(c) || 11 | self.data.ref_data.is_empty() && (is_name_start_char(c) || c == '#') => { 12 | self.data.ref_data.push(c); 13 | None 14 | }, 15 | 16 | Token::ReferenceEnd => { 17 | let name = self.data.take_ref_data(); 18 | if name.is_empty() { 19 | return Some(self.error(SyntaxError::EmptyEntity)); 20 | } 21 | 22 | let c = match &*name { 23 | "lt" => Some('<'), 24 | "gt" => Some('>'), 25 | "amp" => Some('&'), 26 | "apos" => Some('\''), 27 | "quot" => Some('"'), 28 | _ if name.starts_with('#') => match self.numeric_reference_from_str(&name[1..]) { 29 | Ok(c) => Some(c), 30 | Err(e) => return Some(self.error(e)), 31 | }, 32 | _ => None, 33 | }; 34 | if let Some(c) = c { 35 | self.buf.push(c); 36 | } else if let 
Some(v) = self.config.extra_entities.get(&name) { 37 | self.buf.push_str(v); 38 | } else if let Some(v) = self.entities.get(&name) { 39 | if self.state_after_reference == State::OutsideTag { 40 | // an entity can expand to *elements*, so outside of a tag it needs a full reparse 41 | if let Err(e) = self.lexer.reparse(v) { 42 | return Some(Err(e)); 43 | } 44 | } else { 45 | // however, inside attributes it's not allowed to affect attribute quoting, 46 | // so it can't be fed to the lexer 47 | self.buf.push_str(v); 48 | } 49 | } else { 50 | return Some(self.error(SyntaxError::UnexpectedEntity(name.into()))); 51 | } 52 | let prev_st = self.state_after_reference; 53 | if prev_st == State::OutsideTag && !is_whitespace_char(self.buf.chars().last().unwrap_or('\0')) { 54 | self.inside_whitespace = false; 55 | } 56 | self.into_state_continue(prev_st) 57 | }, 58 | 59 | _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), 60 | } 61 | } 62 | 63 | pub(crate) fn numeric_reference_from_str(&self, num_str: &str) -> std::result::Result { 64 | let val = if let Some(hex) = num_str.strip_prefix('x') { 65 | u32::from_str_radix(hex, 16).map_err(move |_| SyntaxError::InvalidNumericEntity(num_str.into()))? 66 | } else { 67 | num_str.parse::().map_err(move |_| SyntaxError::InvalidNumericEntity(num_str.into()))? 
68 | }; 69 | match char::from_u32(val) { 70 | Some(c) if self.is_valid_xml_char(c) => Ok(c), 71 | Some(_) if self.config.replace_unknown_entity_references => Ok('\u{fffd}'), 72 | None if self.config.replace_unknown_entity_references => Ok('\u{fffd}'), 73 | _ => Err(SyntaxError::InvalidCharacterEntity(val)), 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/reader/indexset.rs: -------------------------------------------------------------------------------- 1 | use crate::attribute::OwnedAttribute; 2 | use crate::name::OwnedName; 3 | 4 | use std::collections::hash_map::RandomState; 5 | use std::collections::HashSet; 6 | use std::hash::{BuildHasher, Hash, Hasher}; 7 | 8 | /// An ordered set 9 | pub struct AttributesSet { 10 | vec: Vec, 11 | /// Uses a no-op hasher, because these u64s are hashes already 12 | may_contain: HashSet, 13 | /// This is real hasher for the `OwnedName` 14 | hasher: RandomState, 15 | } 16 | 17 | /// Use linear search and don't allocate `HashSet` if there are few attributes, 18 | /// because allocation costs more than a few comparisons. 
19 | const HASH_THRESHOLD: usize = 8; 20 | 21 | impl AttributesSet { 22 | pub fn new() -> Self { 23 | Self { 24 | vec: Vec::new(), 25 | hasher: RandomState::new(), 26 | may_contain: HashSet::default(), 27 | } 28 | } 29 | 30 | fn hash(&self, val: &OwnedName) -> u64 { 31 | let mut h = self.hasher.build_hasher(); 32 | val.hash(&mut h); 33 | h.finish() 34 | } 35 | 36 | pub fn len(&self) -> usize { 37 | self.vec.len() 38 | } 39 | 40 | pub fn contains(&self, name: &OwnedName) -> bool { 41 | // fall back to linear search only on duplicate or hash collision 42 | (self.vec.len() < HASH_THRESHOLD || self.may_contain.contains(&self.hash(name))) && 43 | self.vec.iter().any(move |a| &a.name == name) 44 | } 45 | 46 | pub fn push(&mut self, attr: OwnedAttribute) { 47 | if self.vec.len() >= HASH_THRESHOLD { 48 | if self.vec.len() == HASH_THRESHOLD { 49 | self.may_contain.reserve(HASH_THRESHOLD * 2); 50 | for attr in &self.vec { 51 | self.may_contain.insert(self.hash(&attr.name)); 52 | } 53 | } 54 | self.may_contain.insert(self.hash(&attr.name)); 55 | } 56 | self.vec.push(attr); 57 | } 58 | 59 | pub fn into_vec(self) -> Vec { 60 | self.vec 61 | } 62 | } 63 | 64 | #[test] 65 | fn indexset() { 66 | let mut s = AttributesSet::new(); 67 | let not_here = OwnedName { 68 | local_name: "attr1000".into(), 69 | namespace: Some("test".into()), 70 | prefix: None, 71 | }; 72 | 73 | // this test will take a lot of time if the `contains()` is linear, and the loop is quadratic 74 | for i in 0..50000 { 75 | let name = OwnedName { 76 | local_name: format!("attr{i}"), namespace: None, prefix: None, 77 | }; 78 | assert!(!s.contains(&name)); 79 | 80 | s.push(OwnedAttribute { name, value: String::new() }); 81 | assert!(!s.contains(¬_here)); 82 | } 83 | 84 | assert!(s.contains(&OwnedName { 85 | local_name: "attr1234".into(), namespace: None, prefix: None, 86 | })); 87 | assert!(s.contains(&OwnedName { 88 | local_name: "attr0".into(), namespace: None, prefix: None, 89 | })); 90 | 
assert!(s.contains(&OwnedName { 91 | local_name: "attr49999".into(), namespace: None, prefix: None, 92 | })); 93 | } 94 | 95 | /// Hasher that does nothing except passing u64 through 96 | struct U64Hasher(u64); 97 | 98 | impl Hasher for U64Hasher { 99 | fn finish(&self) -> u64 { self.0 } 100 | fn write(&mut self, slice: &[u8]) { 101 | for &v in slice { self.0 ^= u64::from(v) } // unused in practice 102 | } 103 | fn write_u64(&mut self, i: u64) { 104 | self.0 ^= i; 105 | } 106 | } 107 | 108 | #[derive(Default)] 109 | struct U64HasherBuilder; 110 | 111 | impl BuildHasher for U64HasherBuilder { 112 | type Hasher = U64Hasher; 113 | fn build_hasher(&self) -> U64Hasher { U64Hasher(0) } 114 | } 115 | -------------------------------------------------------------------------------- /tests/documents/sample_8_c.txt: -------------------------------------------------------------------------------- 1 | StartDocument(1.0, UTF-8) 2 | StartElement(el) 3 | Whitespace("\n") 4 | CData("") 5 | Whitespace("\n") 6 | CData("") 7 | Whitespace("\n\n") 8 | Whitespace("\n") 9 | Whitespace("\n") 10 | CData("") 11 | Whitespace("\n") 12 | StartElement(br) 13 | EndElement(br) 14 | Whitespace("\n") 15 | StartElement(s) 16 | EndElement(s) 17 | Whitespace("\n") 18 | StartElement(s) 19 | EndElement(s) 20 | Whitespace("\n") 21 | StartElement(s) 22 | CData("") 23 | EndElement(s) 24 | Whitespace("\n\n\n") 25 | CData("") 26 | Whitespace("\n") 27 | Whitespace("\n\n") 28 | CData("") 29 | CData("") 30 | Whitespace("\n") 31 | CData("") 32 | CData("") 33 | Whitespace("\n") 34 | CData("") 35 | CData("") 36 | Whitespace("\n") 37 | CData("") 38 | StartElement(br) 39 | EndElement(br) 40 | CData("") 41 | Whitespace("\n") 42 | CData("") 43 | StartElement(s) 44 | EndElement(s) 45 | CData("") 46 | Whitespace("\n") 47 | CData("") 48 | StartElement(s) 49 | EndElement(s) 50 | CData("") 51 | Whitespace("\n") 52 | CData("") 53 | StartElement(s) 54 | EndElement(s) 55 | CData("") 56 | Whitespace("\n\n") 57 | CData("") 58 | 
Whitespace("\n") 59 | CData("") 60 | Whitespace("\n\n") 61 | Whitespace("\n") 62 | Whitespace("\n") 63 | CData("") 64 | Whitespace("\n") 65 | StartElement(br) 66 | EndElement(br) 67 | Whitespace("\n") 68 | StartElement(s) 69 | EndElement(s) 70 | Whitespace("\n") 71 | StartElement(s) 72 | EndElement(s) 73 | Whitespace("\n") 74 | StartElement(s) 75 | CData("") 76 | EndElement(s) 77 | Whitespace("\n\n\n") 78 | CData("") 79 | Whitespace("\n") 80 | Whitespace("\n\n") 81 | CData("") 82 | CData("") 83 | Whitespace("\n") 84 | CData("") 85 | CData("") 86 | Whitespace("\n") 87 | CData("") 88 | CData("") 89 | Whitespace("\n") 90 | CData("") 91 | StartElement(br) 92 | EndElement(br) 93 | CData("") 94 | Whitespace("\n") 95 | CData("") 96 | StartElement(s) 97 | EndElement(s) 98 | CData("") 99 | Whitespace("\n") 100 | CData("") 101 | StartElement(s) 102 | EndElement(s) 103 | CData("") 104 | Whitespace("\n") 105 | CData("") 106 | StartElement(s) 107 | EndElement(s) 108 | CData("") 109 | Whitespace("\n\n\n") 110 | Whitespace("\n\n") 111 | CData("") 112 | Whitespace("\n") 113 | CData("") 114 | Whitespace("\n\n") 115 | Whitespace("\n") 116 | Whitespace("\n") 117 | CData("") 118 | Whitespace("\n") 119 | StartElement(br) 120 | EndElement(br) 121 | Whitespace("\n") 122 | StartElement(s) 123 | EndElement(s) 124 | Whitespace("\n") 125 | StartElement(s) 126 | EndElement(s) 127 | Whitespace("\n") 128 | StartElement(s) 129 | CData("") 130 | EndElement(s) 131 | Whitespace("\n\n\n") 132 | CData("") 133 | Whitespace("\n") 134 | Whitespace("\n\n") 135 | CData("") 136 | CData("") 137 | Whitespace("\n") 138 | CData("") 139 | CData("") 140 | Whitespace("\n") 141 | CData("") 142 | CData("") 143 | Whitespace("\n") 144 | CData("") 145 | StartElement(br) 146 | EndElement(br) 147 | CData("") 148 | Whitespace("\n") 149 | CData("") 150 | StartElement(s) 151 | EndElement(s) 152 | CData("") 153 | Whitespace("\n") 154 | CData("") 155 | StartElement(s) 156 | EndElement(s) 157 | CData("") 158 | 
Whitespace("\n") 159 | CData("") 160 | StartElement(s) 161 | EndElement(s) 162 | CData("") 163 | Whitespace("\n\n") 164 | CData("") 165 | Whitespace("\n\n") 166 | CData("") 167 | CData("") 168 | StartElement(br) 169 | EndElement(br) 170 | StartElement(s) 171 | EndElement(s) 172 | StartElement(s) 173 | EndElement(s) 174 | StartElement(s) 175 | CData("") 176 | EndElement(s) 177 | CData("") 178 | CData("") 179 | CData("") 180 | CData("") 181 | CData("") 182 | CData("") 183 | CData("") 184 | CData("") 185 | StartElement(br) 186 | EndElement(br) 187 | CData("") 188 | CData("") 189 | StartElement(s) 190 | EndElement(s) 191 | CData("") 192 | CData("") 193 | StartElement(s) 194 | EndElement(s) 195 | CData("") 196 | CData("") 197 | StartElement(s) 198 | EndElement(s) 199 | CData("") 200 | Whitespace("\n") 201 | EndElement(el) 202 | EndDocument 203 | -------------------------------------------------------------------------------- /tests/documents/sample_8_coalesce_wscdch.txt: -------------------------------------------------------------------------------- 1 | StartDocument(1.0, UTF-8) 2 | StartElement(el) 3 | Characters("\n") 4 | Comment("ws") 5 | Characters("\n\n\n") 6 | Comment("ws") 7 | Comment("ws") 8 | Characters("\n") 9 | Comment("ws") 10 | Comment("ws") 11 | Characters("\n") 12 | Comment("ws") 13 | Comment("ws") 14 | Characters("\n") 15 | Comment("ws") 16 | StartElement(br) 17 | EndElement(br) 18 | Comment("ws") 19 | Characters("\n") 20 | Comment("ws") 21 | StartElement(s) 22 | EndElement(s) 23 | Comment("ws") 24 | Characters("\n") 25 | Comment("ws") 26 | StartElement(s) 27 | EndElement(s) 28 | Comment("ws") 29 | Characters("\n") 30 | Comment("ws") 31 | StartElement(s) 32 | EndElement(s) 33 | Comment("ws") 34 | Characters("\n\n\n") 35 | Comment("ws") 36 | Characters("\n") 37 | Comment("ws") 38 | Characters("\n\n\n\n") 39 | Comment("ws") 40 | Characters("\n") 41 | StartElement(br) 42 | EndElement(br) 43 | Characters("\n") 44 | StartElement(s) 45 | EndElement(s) 46 | 
Characters("\n") 47 | StartElement(s) 48 | EndElement(s) 49 | Characters("\n") 50 | StartElement(s) 51 | Comment("ws") 52 | EndElement(s) 53 | Characters("\n\n") 54 | Comment("ws") 55 | Characters("\n\n\n") 56 | Comment("ws") 57 | Comment("ws") 58 | Characters("\n") 59 | Comment("ws") 60 | Comment("ws") 61 | Characters("\n") 62 | Comment("ws") 63 | Comment("ws") 64 | Characters("\n") 65 | Comment("ws") 66 | StartElement(br) 67 | EndElement(br) 68 | Comment("ws") 69 | Characters("\n") 70 | Comment("ws") 71 | StartElement(s) 72 | EndElement(s) 73 | Comment("ws") 74 | Characters("\n") 75 | Comment("ws") 76 | StartElement(s) 77 | EndElement(s) 78 | Comment("ws") 79 | Characters("\n") 80 | Comment("ws") 81 | StartElement(s) 82 | EndElement(s) 83 | Comment("ws") 84 | Characters("\n\n\n") 85 | Comment("ws") 86 | Characters("\n") 87 | Comment("ws") 88 | Characters("\n\n\n\n") 89 | Comment("ws") 90 | Characters("\n") 91 | StartElement(br) 92 | EndElement(br) 93 | Characters("\n") 94 | StartElement(s) 95 | EndElement(s) 96 | Characters("\n") 97 | StartElement(s) 98 | EndElement(s) 99 | Characters("\n") 100 | StartElement(s) 101 | Comment("ws") 102 | EndElement(s) 103 | Characters("\n\n\n") 104 | Comment("noWS") 105 | Characters("\n\n") 106 | Comment("ws") 107 | Characters("\n\n\n") 108 | Comment("ws") 109 | Comment("ws") 110 | Characters("\n") 111 | Comment("ws") 112 | Comment("ws") 113 | Characters("\n") 114 | Comment("ws") 115 | Comment("ws") 116 | Characters("\n") 117 | Comment("ws") 118 | StartElement(br) 119 | EndElement(br) 120 | Comment("ws") 121 | Characters("\n") 122 | Comment("ws") 123 | StartElement(s) 124 | EndElement(s) 125 | Comment("ws") 126 | Characters("\n") 127 | Comment("ws") 128 | StartElement(s) 129 | EndElement(s) 130 | Comment("ws") 131 | Characters("\n") 132 | Comment("ws") 133 | StartElement(s) 134 | EndElement(s) 135 | Comment("ws") 136 | Characters("\n\n\n") 137 | Comment("ws") 138 | Characters("\n") 139 | Comment("ws") 140 | Characters("\n\n\n\n") 
141 | Comment("ws") 142 | Characters("\n") 143 | StartElement(br) 144 | EndElement(br) 145 | Characters("\n") 146 | StartElement(s) 147 | EndElement(s) 148 | Characters("\n") 149 | StartElement(s) 150 | EndElement(s) 151 | Characters("\n") 152 | StartElement(s) 153 | Comment("ws") 154 | EndElement(s) 155 | Characters("\n\n") 156 | Comment("ws") 157 | Characters("\n\n") 158 | Comment("ws") 159 | Comment("ws") 160 | Comment("ws") 161 | Comment("ws") 162 | Comment("ws") 163 | Comment("ws") 164 | Comment("ws") 165 | StartElement(br) 166 | EndElement(br) 167 | Comment("ws") 168 | Comment("ws") 169 | StartElement(s) 170 | EndElement(s) 171 | Comment("ws") 172 | Comment("ws") 173 | StartElement(s) 174 | EndElement(s) 175 | Comment("ws") 176 | Comment("ws") 177 | StartElement(s) 178 | EndElement(s) 179 | Comment("ws") 180 | Comment("ws") 181 | Comment("ws") 182 | Comment("ws") 183 | StartElement(br) 184 | EndElement(br) 185 | StartElement(s) 186 | EndElement(s) 187 | StartElement(s) 188 | EndElement(s) 189 | StartElement(s) 190 | Comment("ws") 191 | EndElement(s) 192 | Characters("\n") 193 | EndElement(el) 194 | EndDocument 195 | -------------------------------------------------------------------------------- /tests/documents/sample_8_wscdch.txt: -------------------------------------------------------------------------------- 1 | StartDocument(1.0, UTF-8) 2 | StartElement(el) 3 | Characters("\n") 4 | Comment("ws") 5 | Characters("\n") 6 | Characters("\n\n") 7 | Comment("ws") 8 | Comment("ws") 9 | Characters("\n") 10 | Comment("ws") 11 | Comment("ws") 12 | Characters("\n") 13 | Comment("ws") 14 | Comment("ws") 15 | Characters("\n") 16 | Comment("ws") 17 | StartElement(br) 18 | EndElement(br) 19 | Comment("ws") 20 | Characters("\n") 21 | Comment("ws") 22 | StartElement(s) 23 | EndElement(s) 24 | Comment("ws") 25 | Characters("\n") 26 | Comment("ws") 27 | StartElement(s) 28 | EndElement(s) 29 | Comment("ws") 30 | Characters("\n") 31 | Comment("ws") 32 | StartElement(s) 33 | 
EndElement(s) 34 | Comment("ws") 35 | Characters("\n\n\n") 36 | Comment("ws") 37 | Characters("\n") 38 | Comment("ws") 39 | Characters("\n\n") 40 | Characters("\n") 41 | Characters("\n") 42 | Comment("ws") 43 | Characters("\n") 44 | StartElement(br) 45 | EndElement(br) 46 | Characters("\n") 47 | StartElement(s) 48 | EndElement(s) 49 | Characters("\n") 50 | StartElement(s) 51 | EndElement(s) 52 | Characters("\n") 53 | StartElement(s) 54 | Comment("ws") 55 | EndElement(s) 56 | Characters("\n\n") 57 | Comment("ws") 58 | Characters("\n") 59 | Characters("\n\n") 60 | Comment("ws") 61 | Comment("ws") 62 | Characters("\n") 63 | Comment("ws") 64 | Comment("ws") 65 | Characters("\n") 66 | Comment("ws") 67 | Comment("ws") 68 | Characters("\n") 69 | Comment("ws") 70 | StartElement(br) 71 | EndElement(br) 72 | Comment("ws") 73 | Characters("\n") 74 | Comment("ws") 75 | StartElement(s) 76 | EndElement(s) 77 | Comment("ws") 78 | Characters("\n") 79 | Comment("ws") 80 | StartElement(s) 81 | EndElement(s) 82 | Comment("ws") 83 | Characters("\n") 84 | Comment("ws") 85 | StartElement(s) 86 | EndElement(s) 87 | Comment("ws") 88 | Characters("\n\n\n") 89 | Comment("ws") 90 | Characters("\n") 91 | Comment("ws") 92 | Characters("\n\n") 93 | Characters("\n") 94 | Characters("\n") 95 | Comment("ws") 96 | Characters("\n") 97 | StartElement(br) 98 | EndElement(br) 99 | Characters("\n") 100 | StartElement(s) 101 | EndElement(s) 102 | Characters("\n") 103 | StartElement(s) 104 | EndElement(s) 105 | Characters("\n") 106 | StartElement(s) 107 | Comment("ws") 108 | EndElement(s) 109 | Characters("\n\n\n") 110 | Comment("noWS") 111 | Characters("\n\n") 112 | Comment("ws") 113 | Characters("\n") 114 | Characters("\n\n") 115 | Comment("ws") 116 | Comment("ws") 117 | Characters("\n") 118 | Comment("ws") 119 | Comment("ws") 120 | Characters("\n") 121 | Comment("ws") 122 | Comment("ws") 123 | Characters("\n") 124 | Comment("ws") 125 | StartElement(br) 126 | EndElement(br) 127 | Comment("ws") 128 | 
Characters("\n") 129 | Comment("ws") 130 | StartElement(s) 131 | EndElement(s) 132 | Comment("ws") 133 | Characters("\n") 134 | Comment("ws") 135 | StartElement(s) 136 | EndElement(s) 137 | Comment("ws") 138 | Characters("\n") 139 | Comment("ws") 140 | StartElement(s) 141 | EndElement(s) 142 | Comment("ws") 143 | Characters("\n\n\n") 144 | Comment("ws") 145 | Characters("\n") 146 | Comment("ws") 147 | Characters("\n\n") 148 | Characters("\n") 149 | Characters("\n") 150 | Comment("ws") 151 | Characters("\n") 152 | StartElement(br) 153 | EndElement(br) 154 | Characters("\n") 155 | StartElement(s) 156 | EndElement(s) 157 | Characters("\n") 158 | StartElement(s) 159 | EndElement(s) 160 | Characters("\n") 161 | StartElement(s) 162 | Comment("ws") 163 | EndElement(s) 164 | Characters("\n\n") 165 | Comment("ws") 166 | Characters("\n\n") 167 | Comment("ws") 168 | Comment("ws") 169 | Comment("ws") 170 | Comment("ws") 171 | Comment("ws") 172 | Comment("ws") 173 | Comment("ws") 174 | StartElement(br) 175 | EndElement(br) 176 | Comment("ws") 177 | Comment("ws") 178 | StartElement(s) 179 | EndElement(s) 180 | Comment("ws") 181 | Comment("ws") 182 | StartElement(s) 183 | EndElement(s) 184 | Comment("ws") 185 | Comment("ws") 186 | StartElement(s) 187 | EndElement(s) 188 | Comment("ws") 189 | Comment("ws") 190 | Comment("ws") 191 | Comment("ws") 192 | StartElement(br) 193 | EndElement(br) 194 | StartElement(s) 195 | EndElement(s) 196 | StartElement(s) 197 | EndElement(s) 198 | StartElement(s) 199 | Comment("ws") 200 | EndElement(s) 201 | Characters("\n") 202 | EndElement(el) 203 | EndDocument 204 | -------------------------------------------------------------------------------- /src/writer.rs: -------------------------------------------------------------------------------- 1 | //! Contains high-level interface for an events-based XML emitter. 2 | //! 3 | //! The most important type in this module is `EventWriter` which allows writing an XML document 4 | //! to some output stream. 
5 | 6 | pub use self::config::EmitterConfig; 7 | pub use self::emitter::EmitterError as Error; 8 | pub use self::emitter::Result; 9 | pub use self::events::XmlEvent; 10 | 11 | use self::emitter::Emitter; 12 | 13 | use std::io::prelude::*; 14 | 15 | mod config; 16 | mod emitter; 17 | pub mod events; 18 | 19 | /// A wrapper around an `std::io::Write` instance which emits XML document according to provided 20 | /// events. 21 | pub struct EventWriter { 22 | sink: W, 23 | emitter: Emitter, 24 | } 25 | 26 | impl EventWriter { 27 | /// Creates a new `EventWriter` out of an `std::io::Write` instance using the default 28 | /// configuration. 29 | #[inline] 30 | pub fn new(sink: W) -> Self { 31 | Self::new_with_config(sink, EmitterConfig::new()) 32 | } 33 | 34 | /// Creates a new `EventWriter` out of an `std::io::Write` instance using the provided 35 | /// configuration. 36 | #[inline] 37 | pub fn new_with_config(sink: W, config: EmitterConfig) -> Self { 38 | Self { 39 | sink, 40 | emitter: Emitter::new(config), 41 | } 42 | } 43 | 44 | /// Writes the next piece of XML document according to the provided event. 45 | /// 46 | /// Note that output data may not exactly correspond to the written event because 47 | /// of various configuration options. For example, `XmlEvent::EndElement` may 48 | /// correspond to a separate closing element or it may cause writing an empty element. 49 | /// Another example is that `XmlEvent::CData` may be represented as characters in 50 | /// the output stream. 
51 | pub fn write<'a, E>(&mut self, event: E) -> Result<()> where E: Into> { 52 | match event.into() { 53 | XmlEvent::StartDocument { version, encoding, standalone } => 54 | self.emitter.emit_start_document(&mut self.sink, version, encoding.unwrap_or("UTF-8"), standalone), 55 | XmlEvent::ProcessingInstruction { name, data } => 56 | self.emitter.emit_processing_instruction(&mut self.sink, name, data), 57 | XmlEvent::StartElement { name, attributes, namespace } => { 58 | self.emitter.namespace_stack_mut().push_empty().checked_target().extend(namespace.as_ref()); 59 | self.emitter.emit_start_element(&mut self.sink, name, &attributes) 60 | }, 61 | XmlEvent::EndElement { name } => { 62 | let r = self.emitter.emit_end_element(&mut self.sink, name); 63 | self.emitter.namespace_stack_mut().try_pop(); 64 | r 65 | }, 66 | XmlEvent::Comment(content) => self.emitter.emit_comment(&mut self.sink, content), 67 | XmlEvent::CData(content) => self.emitter.emit_cdata(&mut self.sink, content), 68 | XmlEvent::Characters(content) => self.emitter.emit_characters(&mut self.sink, content), 69 | XmlEvent::RawCharacters(content) => self.emitter.emit_raw_characters(&mut self.sink, content), 70 | XmlEvent::Doctype(content) => self.emitter.emit_raw_characters(&mut self.sink, content), 71 | } 72 | } 73 | 74 | /// Returns a mutable reference to the underlying `Writer`. 75 | /// 76 | /// Note that having a reference to the underlying sink makes it very easy to emit invalid XML 77 | /// documents. Use this method with care. Valid use cases for this method include accessing 78 | /// methods like `Write::flush`, which do not emit new data but rather change the state 79 | /// of the stream itself. 80 | pub fn inner_mut(&mut self) -> &mut W { 81 | &mut self.sink 82 | } 83 | 84 | /// Returns an immutable reference to the underlying `Writer`. 85 | pub fn inner_ref(&self) -> &W { 86 | &self.sink 87 | } 88 | 89 | /// Unwraps this `EventWriter`, returning the underlying writer. 
90 | /// 91 | /// Note that this is a destructive operation: unwrapping a writer and then wrapping 92 | /// it again with `EventWriter::new()` will create a fresh writer whose state will be 93 | /// blank; for example, accumulated namespaces will be reset. 94 | pub fn into_inner(self) -> W { 95 | self.sink 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /tests/streaming.rs: -------------------------------------------------------------------------------- 1 | #![forbid(unsafe_code)] 2 | 3 | use std::io::{Cursor, Write}; 4 | 5 | use xml::EventReader; 6 | use xml::reader::{ParserConfig, XmlEvent}; 7 | 8 | macro_rules! assert_match { 9 | ($actual:expr, $expected:pat) => { 10 | match $actual { 11 | $expected => {}, 12 | _ => panic!("assertion failed: `(left matches right)` \ 13 | (left: `{:?}`, right: `{}`", $actual, stringify!($expected)) 14 | } 15 | }; 16 | ($actual:expr, $expected:pat if $guard:expr) => { 17 | match $actual { 18 | $expected if $guard => {}, 19 | _ => panic!("assertion failed: `(left matches right)` \ 20 | (left: `{:?}`, right: `{} if {}`", 21 | $actual, stringify!($expected), stringify!($guard)) 22 | } 23 | }; 24 | } 25 | 26 | fn write_and_reset_position(c: &mut Cursor, data: &[u8]) where Cursor: Write { 27 | let p = c.position(); 28 | c.write_all(data).unwrap(); 29 | c.set_position(p); 30 | } 31 | 32 | #[test] 33 | fn reading_streamed_content() { 34 | let buf = Cursor::new(b"".to_vec()); 35 | let reader = EventReader::new(buf); 36 | 37 | let mut it = reader.into_iter(); 38 | 39 | assert_match!(it.next(), Some(Ok(XmlEvent::StartDocument { .. }))); 40 | assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "root"); 41 | 42 | write_and_reset_position(it.source_mut(), b"content"); 43 | assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. 
})) if name.local_name == "child-1"); 44 | assert_match!(it.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content"); 45 | assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-1"); 46 | 47 | write_and_reset_position(it.source_mut(), b""); 48 | assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-2"); 49 | assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-2"); 50 | 51 | write_and_reset_position(it.source_mut(), b""); 52 | assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-3"); 53 | assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-3"); 54 | // doesn't seem to work because of how tags parsing is done 55 | // write_and_reset_position(it.source_mut(), b"some text"); 56 | // assert_match!(it.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "some text"); 57 | 58 | write_and_reset_position(it.source_mut(), b""); 59 | assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "root"); 60 | assert_match!(it.next(), Some(Ok(XmlEvent::EndDocument))); 61 | assert_match!(it.next(), None); 62 | } 63 | 64 | #[test] 65 | fn reading_streamed_content2() { 66 | let buf = Cursor::new(b"".to_vec()); 67 | let mut config = ParserConfig::new(); 68 | config.ignore_end_of_stream = true; 69 | let readerb = EventReader::new_with_config(buf, config); 70 | 71 | let mut reader = readerb.into_iter(); 72 | 73 | assert_match!(reader.next(), Some(Ok(XmlEvent::StartDocument { .. }))); 74 | assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "root"); 75 | 76 | write_and_reset_position(reader.source_mut(), b"content"); 77 | assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. 
})) if name.local_name == "child-1"); 78 | assert_match!(reader.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content"); 79 | assert_match!(reader.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-1"); 80 | 81 | write_and_reset_position(reader.source_mut(), b"content"); 82 | 83 | assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-2"); 84 | assert_match!(reader.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content"); 85 | assert_match!(reader.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-2"); 86 | assert_match!(reader.next(), Some(Err(_))); 87 | write_and_reset_position(reader.source_mut(), b""); 88 | assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-3"); 89 | write_and_reset_position(reader.source_mut(), b" { 92 | panic!("At this point, parser must not detect something."); 93 | }, 94 | Some(Err(_)) => {}, 95 | } 96 | write_and_reset_position(reader.source_mut(), b" />"); 97 | assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-4"); 98 | } 99 | -------------------------------------------------------------------------------- /src/escape.rs: -------------------------------------------------------------------------------- 1 | //! Contains functions for performing XML special characters escaping. 
// Functions for escaping XML special characters.

use std::borrow::Cow;
use std::fmt::{Display, Formatter, Result};
use std::marker::PhantomData;

/// A table of byte-to-entity replacements for one XML context.
///
/// Implementations must return `Some` only for ASCII bytes: the escaping code slices the
/// input string at the position of every escapable byte, which is only a valid `str`
/// boundary when that byte is a complete one-byte character.
pub(crate) trait Escapes {
    /// Returns the replacement text for `c`, or `None` if `c` is written unchanged.
    fn escape(c: u8) -> Option<&'static str>;

    /// Returns `true` if `c` has a replacement in this table.
    fn byte_needs_escaping(c: u8) -> bool {
        Self::escape(c).is_some()
    }

    /// Returns `true` if any byte of `s` has a replacement in this table.
    fn str_needs_escaping(s: &str) -> bool {
        s.bytes().any(Self::byte_needs_escaping)
    }
}

/// A borrowed string that is escaped according to `E` when it is formatted with `Display`.
pub(crate) struct Escaped<'a, E: Escapes> {
    _escape_phantom: PhantomData<E>,
    to_escape: &'a str,
}

impl<'a, E: Escapes> Escaped<'a, E> {
    /// Wraps `s` without copying it; escaping happens when the value is formatted.
    pub const fn new(s: &'a str) -> Self {
        Escaped {
            _escape_phantom: PhantomData,
            to_escape: s,
        }
    }
}

impl<E: Escapes> Display for Escaped<'_, E> {
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        let mut rest = self.to_escape;

        // Find the next occurrence of an escapable byte together with its replacement in a
        // single lookup, so there is no second `escape` call to unwrap.
        while let Some((idx, replacement)) = rest
            .bytes()
            .enumerate()
            .find_map(|(i, b)| E::escape(b).map(|r| (i, r)))
        {
            // Everything before the escapable byte is copied through verbatim.
            f.write_str(&rest[..idx])?;
            f.write_str(replacement)?;
            // Skip the escaped byte; it is ASCII, so `idx + 1` is a char boundary.
            rest = &rest[idx + 1..];
        }

        f.write_str(rest)
    }
}

/// Escapes `s` with table `E`, borrowing the input when nothing needs to be replaced.
fn escape_str<E: Escapes>(s: &str) -> Cow<'_, str> {
    if E::str_needs_escaping(s) {
        Cow::Owned(Escaped::<E>::new(s).to_string())
    } else {
        Cow::Borrowed(s)
    }
}

/// Declares a unit struct `$name` and implements `Escapes` for it from a list of
/// `byte => replacement` pairs; every byte not listed maps to `None`.
macro_rules! escapes {
    {
        $name: ident,
        $($k: expr => $v: expr),* $(,)?
    } => {
        pub(crate) struct $name;

        impl Escapes for $name {
            fn escape(c: u8) -> Option<&'static str> {
                match c {
                    $( $k => Some($v),)*
                    _ => None
                }
            }
        }
    };
}

escapes!(
    AttributeEscapes,
    b'<'  => "&lt;",
    b'>'  => "&gt;",
    b'"'  => "&quot;",
    b'\'' => "&apos;",
    b'&'  => "&amp;",
    b'\n' => "&#xA;",
    b'\r' => "&#xD;",
);

escapes!(
    PcDataEscapes,
    b'<' => "&lt;",
    b'>' => "&gt;",
    b'&' => "&amp;",
);

/// Performs escaping of common XML characters inside an attribute value.
///
/// This function replaces several important markup characters with their
/// entity equivalents:
///
/// * `<` → `&lt;`
/// * `>` → `&gt;`
/// * `"` → `&quot;`
/// * `'` → `&apos;`
/// * `&` → `&amp;`
///
/// The following characters are escaped so that attributes are printed on
/// a single line:
/// * `\n` → `&#xA;`
/// * `\r` → `&#xD;`
///
/// The resulting string is safe to use inside XML attribute values or in PCDATA sections.
///
/// Does not perform allocations if the given string does not contain escapable characters.
#[inline]
#[must_use]
pub fn escape_str_attribute(s: &str) -> Cow<'_, str> {
    escape_str::<AttributeEscapes>(s)
}

/// Performs escaping of common XML characters inside PCDATA.
///
/// This function replaces several important markup characters with their
/// entity equivalents:
///
/// * `<` → `&lt;`
/// * `>` → `&gt;`
/// * `&` → `&amp;`
///
/// The resulting string is safe to use inside PCDATA sections but NOT inside attribute values.
///
/// Does not perform allocations if the given string does not contain escapable characters.
#[inline]
#[must_use]
pub fn escape_str_pcdata(s: &str) -> Cow<'_, str> {
    escape_str::<PcDataEscapes>(s)
}

#[cfg(test)]
mod tests {
    use super::{escape_str_attribute, escape_str_pcdata};

    #[test]
    fn test_escape_str_attribute() {
        assert_eq!(escape_str_attribute("<>'\"&\n\r"), "&lt;&gt;&apos;&quot;&amp;&#xA;&#xD;");
        assert_eq!(escape_str_attribute("no_escapes"), "no_escapes");
    }

    #[test]
    fn test_escape_str_pcdata() {
        assert_eq!(escape_str_pcdata("<>&"), "&lt;&gt;&amp;");
        assert_eq!(escape_str_pcdata("no_escapes"), "no_escapes");
    }

    #[test]
    fn test_escape_multibyte_code_points() {
        assert_eq!(escape_str_attribute("☃<"), "☃&lt;");
        assert_eq!(escape_str_pcdata("☃<"), "☃&lt;");
    }
}
Comment("ws") 53 | CData("") 54 | Whitespace("\n") 55 | CData("") 56 | StartElement(br) 57 | EndElement(br) 58 | CData("") 59 | Whitespace("\n") 60 | CData("") 61 | StartElement(s) 62 | EndElement(s) 63 | CData("") 64 | Whitespace("\n") 65 | CData("") 66 | StartElement(s) 67 | EndElement(s) 68 | CData("") 69 | Whitespace("\n") 70 | CData("") 71 | StartElement(s) 72 | Comment("ws") 73 | EndElement(s) 74 | CData("") 75 | Whitespace("\n\n") 76 | Comment("ws") 77 | CData("") 78 | Whitespace("\n") 79 | CData("") 80 | Whitespace("\n\n") 81 | Comment("ws") 82 | Comment("ws") 83 | Whitespace("\n") 84 | Comment("ws") 85 | Comment("ws") 86 | Whitespace("\n") 87 | Comment("ws") 88 | CData("") 89 | Comment("ws") 90 | Whitespace("\n") 91 | Comment("ws") 92 | StartElement(br) 93 | EndElement(br) 94 | Comment("ws") 95 | Whitespace("\n") 96 | Comment("ws") 97 | StartElement(s) 98 | EndElement(s) 99 | Comment("ws") 100 | Whitespace("\n") 101 | Comment("ws") 102 | StartElement(s) 103 | EndElement(s) 104 | Comment("ws") 105 | Whitespace("\n") 106 | Comment("ws") 107 | StartElement(s) 108 | CData("") 109 | EndElement(s) 110 | Comment("ws") 111 | Whitespace("\n\n\n") 112 | CData("") 113 | Comment("ws") 114 | Whitespace("\n") 115 | Comment("ws") 116 | Whitespace("\n\n") 117 | CData("") 118 | CData("") 119 | Whitespace("\n") 120 | CData("") 121 | CData("") 122 | Whitespace("\n") 123 | CData("") 124 | Comment("ws") 125 | CData("") 126 | Whitespace("\n") 127 | CData("") 128 | StartElement(br) 129 | EndElement(br) 130 | CData("") 131 | Whitespace("\n") 132 | CData("") 133 | StartElement(s) 134 | EndElement(s) 135 | CData("") 136 | Whitespace("\n") 137 | CData("") 138 | StartElement(s) 139 | EndElement(s) 140 | CData("") 141 | Whitespace("\n") 142 | CData("") 143 | StartElement(s) 144 | Comment("ws") 145 | EndElement(s) 146 | CData("") 147 | Whitespace("\n\n\n") 148 | Comment("noWS") 149 | Whitespace("\n\n") 150 | Comment("ws") 151 | CData("") 152 | Whitespace("\n") 153 | CData("") 154 | 
Whitespace("\n\n") 155 | Comment("ws") 156 | Comment("ws") 157 | Whitespace("\n") 158 | Comment("ws") 159 | Comment("ws") 160 | Whitespace("\n") 161 | Comment("ws") 162 | CData("") 163 | Comment("ws") 164 | Whitespace("\n") 165 | Comment("ws") 166 | StartElement(br) 167 | EndElement(br) 168 | Comment("ws") 169 | Whitespace("\n") 170 | Comment("ws") 171 | StartElement(s) 172 | EndElement(s) 173 | Comment("ws") 174 | Whitespace("\n") 175 | Comment("ws") 176 | StartElement(s) 177 | EndElement(s) 178 | Comment("ws") 179 | Whitespace("\n") 180 | Comment("ws") 181 | StartElement(s) 182 | CData("") 183 | EndElement(s) 184 | Comment("ws") 185 | Whitespace("\n\n\n") 186 | CData("") 187 | Comment("ws") 188 | Whitespace("\n") 189 | Comment("ws") 190 | Whitespace("\n\n") 191 | CData("") 192 | CData("") 193 | Whitespace("\n") 194 | CData("") 195 | CData("") 196 | Whitespace("\n") 197 | CData("") 198 | Comment("ws") 199 | CData("") 200 | Whitespace("\n") 201 | CData("") 202 | StartElement(br) 203 | EndElement(br) 204 | CData("") 205 | Whitespace("\n") 206 | CData("") 207 | StartElement(s) 208 | EndElement(s) 209 | CData("") 210 | Whitespace("\n") 211 | CData("") 212 | StartElement(s) 213 | EndElement(s) 214 | CData("") 215 | Whitespace("\n") 216 | CData("") 217 | StartElement(s) 218 | Comment("ws") 219 | EndElement(s) 220 | CData("") 221 | Whitespace("\n\n") 222 | Comment("ws") 223 | CData("") 224 | Whitespace("\n\n") 225 | CData("") 226 | Comment("ws") 227 | Comment("ws") 228 | Comment("ws") 229 | Comment("ws") 230 | Comment("ws") 231 | CData("") 232 | Comment("ws") 233 | Comment("ws") 234 | StartElement(br) 235 | EndElement(br) 236 | Comment("ws") 237 | Comment("ws") 238 | StartElement(s) 239 | EndElement(s) 240 | Comment("ws") 241 | Comment("ws") 242 | StartElement(s) 243 | EndElement(s) 244 | Comment("ws") 245 | Comment("ws") 246 | StartElement(s) 247 | CData("") 248 | EndElement(s) 249 | Comment("ws") 250 | CData("") 251 | Comment("ws") 252 | Comment("ws") 253 | CData("") 
254 | CData("") 255 | CData("") 256 | CData("") 257 | CData("") 258 | Comment("ws") 259 | CData("") 260 | CData("") 261 | StartElement(br) 262 | EndElement(br) 263 | CData("") 264 | CData("") 265 | StartElement(s) 266 | EndElement(s) 267 | CData("") 268 | CData("") 269 | StartElement(s) 270 | EndElement(s) 271 | CData("") 272 | CData("") 273 | StartElement(s) 274 | Comment("ws") 275 | EndElement(s) 276 | CData("") 277 | Whitespace("\n") 278 | EndElement(el) 279 | EndDocument 280 | -------------------------------------------------------------------------------- /tests/documents/sample_8_wsch.txt: -------------------------------------------------------------------------------- 1 | StartDocument(1.0, UTF-8) 2 | StartElement(el) 3 | Characters("\n") 4 | Comment("ws") 5 | CData("") 6 | Characters("\n") 7 | CData("") 8 | Characters("\n\n") 9 | Comment("ws") 10 | Comment("ws") 11 | Characters("\n") 12 | Comment("ws") 13 | Comment("ws") 14 | Characters("\n") 15 | Comment("ws") 16 | CData("") 17 | Comment("ws") 18 | Characters("\n") 19 | Comment("ws") 20 | StartElement(br) 21 | EndElement(br) 22 | Comment("ws") 23 | Characters("\n") 24 | Comment("ws") 25 | StartElement(s) 26 | EndElement(s) 27 | Comment("ws") 28 | Characters("\n") 29 | Comment("ws") 30 | StartElement(s) 31 | EndElement(s) 32 | Comment("ws") 33 | Characters("\n") 34 | Comment("ws") 35 | StartElement(s) 36 | CData("") 37 | EndElement(s) 38 | Comment("ws") 39 | Characters("\n\n\n") 40 | CData("") 41 | Comment("ws") 42 | Characters("\n") 43 | Comment("ws") 44 | Characters("\n\n") 45 | CData("") 46 | CData("") 47 | Characters("\n") 48 | CData("") 49 | CData("") 50 | Characters("\n") 51 | CData("") 52 | Comment("ws") 53 | CData("") 54 | Characters("\n") 55 | CData("") 56 | StartElement(br) 57 | EndElement(br) 58 | CData("") 59 | Characters("\n") 60 | CData("") 61 | StartElement(s) 62 | EndElement(s) 63 | CData("") 64 | Characters("\n") 65 | CData("") 66 | StartElement(s) 67 | EndElement(s) 68 | CData("") 69 | 
Characters("\n") 70 | CData("") 71 | StartElement(s) 72 | Comment("ws") 73 | EndElement(s) 74 | CData("") 75 | Characters("\n\n") 76 | Comment("ws") 77 | CData("") 78 | Characters("\n") 79 | CData("") 80 | Characters("\n\n") 81 | Comment("ws") 82 | Comment("ws") 83 | Characters("\n") 84 | Comment("ws") 85 | Comment("ws") 86 | Characters("\n") 87 | Comment("ws") 88 | CData("") 89 | Comment("ws") 90 | Characters("\n") 91 | Comment("ws") 92 | StartElement(br) 93 | EndElement(br) 94 | Comment("ws") 95 | Characters("\n") 96 | Comment("ws") 97 | StartElement(s) 98 | EndElement(s) 99 | Comment("ws") 100 | Characters("\n") 101 | Comment("ws") 102 | StartElement(s) 103 | EndElement(s) 104 | Comment("ws") 105 | Characters("\n") 106 | Comment("ws") 107 | StartElement(s) 108 | CData("") 109 | EndElement(s) 110 | Comment("ws") 111 | Characters("\n\n\n") 112 | CData("") 113 | Comment("ws") 114 | Characters("\n") 115 | Comment("ws") 116 | Characters("\n\n") 117 | CData("") 118 | CData("") 119 | Characters("\n") 120 | CData("") 121 | CData("") 122 | Characters("\n") 123 | CData("") 124 | Comment("ws") 125 | CData("") 126 | Characters("\n") 127 | CData("") 128 | StartElement(br) 129 | EndElement(br) 130 | CData("") 131 | Characters("\n") 132 | CData("") 133 | StartElement(s) 134 | EndElement(s) 135 | CData("") 136 | Characters("\n") 137 | CData("") 138 | StartElement(s) 139 | EndElement(s) 140 | CData("") 141 | Characters("\n") 142 | CData("") 143 | StartElement(s) 144 | Comment("ws") 145 | EndElement(s) 146 | CData("") 147 | Characters("\n\n\n") 148 | Comment("noWS") 149 | Characters("\n\n") 150 | Comment("ws") 151 | CData("") 152 | Characters("\n") 153 | CData("") 154 | Characters("\n\n") 155 | Comment("ws") 156 | Comment("ws") 157 | Characters("\n") 158 | Comment("ws") 159 | Comment("ws") 160 | Characters("\n") 161 | Comment("ws") 162 | CData("") 163 | Comment("ws") 164 | Characters("\n") 165 | Comment("ws") 166 | StartElement(br) 167 | EndElement(br) 168 | Comment("ws") 169 | 
Characters("\n") 170 | Comment("ws") 171 | StartElement(s) 172 | EndElement(s) 173 | Comment("ws") 174 | Characters("\n") 175 | Comment("ws") 176 | StartElement(s) 177 | EndElement(s) 178 | Comment("ws") 179 | Characters("\n") 180 | Comment("ws") 181 | StartElement(s) 182 | CData("") 183 | EndElement(s) 184 | Comment("ws") 185 | Characters("\n\n\n") 186 | CData("") 187 | Comment("ws") 188 | Characters("\n") 189 | Comment("ws") 190 | Characters("\n\n") 191 | CData("") 192 | CData("") 193 | Characters("\n") 194 | CData("") 195 | CData("") 196 | Characters("\n") 197 | CData("") 198 | Comment("ws") 199 | CData("") 200 | Characters("\n") 201 | CData("") 202 | StartElement(br) 203 | EndElement(br) 204 | CData("") 205 | Characters("\n") 206 | CData("") 207 | StartElement(s) 208 | EndElement(s) 209 | CData("") 210 | Characters("\n") 211 | CData("") 212 | StartElement(s) 213 | EndElement(s) 214 | CData("") 215 | Characters("\n") 216 | CData("") 217 | StartElement(s) 218 | Comment("ws") 219 | EndElement(s) 220 | CData("") 221 | Characters("\n\n") 222 | Comment("ws") 223 | CData("") 224 | Characters("\n\n") 225 | CData("") 226 | Comment("ws") 227 | Comment("ws") 228 | Comment("ws") 229 | Comment("ws") 230 | Comment("ws") 231 | CData("") 232 | Comment("ws") 233 | Comment("ws") 234 | StartElement(br) 235 | EndElement(br) 236 | Comment("ws") 237 | Comment("ws") 238 | StartElement(s) 239 | EndElement(s) 240 | Comment("ws") 241 | Comment("ws") 242 | StartElement(s) 243 | EndElement(s) 244 | Comment("ws") 245 | Comment("ws") 246 | StartElement(s) 247 | CData("") 248 | EndElement(s) 249 | Comment("ws") 250 | CData("") 251 | Comment("ws") 252 | Comment("ws") 253 | CData("") 254 | CData("") 255 | CData("") 256 | CData("") 257 | CData("") 258 | Comment("ws") 259 | CData("") 260 | CData("") 261 | StartElement(br) 262 | EndElement(br) 263 | CData("") 264 | CData("") 265 | StartElement(s) 266 | EndElement(s) 267 | CData("") 268 | CData("") 269 | StartElement(s) 270 | EndElement(s) 271 | 
CData("") 272 | CData("") 273 | StartElement(s) 274 | Comment("ws") 275 | EndElement(s) 276 | CData("") 277 | Characters("\n") 278 | EndElement(el) 279 | EndDocument 280 | -------------------------------------------------------------------------------- /tests/oasis.fail.txt: -------------------------------------------------------------------------------- 1 | o-p04pass1 p04pass1.xml names with all valid ASCII characters, and one from each other class in NameChar ; 5:8 Element A.-:̀· prefix is unbound 2 | o-p05pass1 p05pass1.xml various valid Name constructions ; 2:8 Element A:._-0 prefix is unbound 3 | o-p09fail1 p09fail1.xml EntityValue excludes '%' 4 | o-p09fail2 p09fail2.xml EntityValue excludes '&' 5 | o-p12fail1 p12fail1.xml '"' excluded 6 | o-p12fail2 p12fail2.xml '\' excluded 7 | o-p12fail3 p12fail3.xml entity references excluded 8 | o-p12fail4 p12fail4.xml '>' excluded 9 | o-p12fail5 p12fail5.xml '<' excluded 10 | o-p12fail6 p12fail6.xml built-in entity refs excluded 11 | o-p12fail7 p12fail7.xml The public ID has a tab character, which is disallowed 12 | o-p30fail1 p30fail1.xml An XML declaration is not the same as a TextDecl 13 | o-p31fail1 p31fail1.xml external subset excludes doctypedecl 14 | o-p45fail2 p45fail2.xml S before contentspec is required. 
15 | o-p45fail3 p45fail3.xml only one content spec 16 | o-p45fail4 p45fail4.xml no comments in declarations (contrast with SGML) 17 | o-p46fail1 p46fail1.xml no parens on declared content 18 | o-p46fail2 p46fail2.xml no inclusions (contrast with SGML) 19 | o-p46fail3 p46fail3.xml no exclusions (contrast with SGML) 20 | o-p46fail4 p46fail4.xml no space before occurrence 21 | o-p46fail5 p46fail5.xml single group 22 | o-p46fail6 p46fail6.xml can't be both declared and modeled 23 | o-p47fail1 p47fail1.xml Invalid operator '|' must match previous operator ',' 24 | o-p47fail2 p47fail2.xml Illegal character '-' in Element-content model 25 | o-p47fail3 p47fail3.xml Optional character must follow a name or list 26 | o-p47fail4 p47fail4.xml Illegal space before optional character 27 | o-p48fail1 p48fail1.xml Illegal space before optional character 28 | o-p48fail2 p48fail2.xml Illegal space before optional character 29 | o-p51fail1 p51fail1.xml occurrence on #PCDATA group must be * 30 | o-p51fail2 p51fail2.xml occurrence on #PCDATA group must be * 31 | o-p51fail3 p51fail3.xml #PCDATA must come first 32 | o-p51fail4 p51fail4.xml occurrence on #PCDATA group must be * 33 | o-p51fail5 p51fail5.xml only '|' connectors 34 | o-p51fail6 p51fail6.xml Only '|' connectors and occurrence on #PCDATA group must be * 35 | o-p51fail7 p51fail7.xml no nested groups 36 | o-p52fail1 p52fail1.xml A name is required 37 | o-p53fail1 p53fail1.xml S is required before default 38 | o-p53fail2 p53fail2.xml S is required before type 39 | o-p53fail3 p53fail3.xml type is required 40 | o-p53fail4 p53fail4.xml default is required 41 | o-p53fail5 p53fail5.xml name is requried 42 | o-p54fail1 p54fail1.xml don't pass unknown attribute types 43 | o-p55fail1 p55fail1.xml must be upper case 44 | o-p56fail1 p56fail1.xml no IDS type 45 | o-p56fail2 p56fail2.xml no NUMBER type 46 | o-p56fail3 p56fail3.xml no NAME type 47 | o-p56fail4 p56fail4.xml no ENTITYS type - types must be upper case 48 | o-p56fail5 
p56fail5.xml types must be upper case 49 | o-p57fail1 p57fail1.xml no keyword for NMTOKEN enumeration 50 | o-p58fail1 p58fail1.xml at least one value required 51 | o-p58fail2 p58fail2.xml separator must be '|' 52 | o-p58fail3 p58fail3.xml notations are NAMEs, not NMTOKENs -- note: Leaving the invalid notation undeclared would cause a validating parser to fail without checking the name syntax, so the notation is declared with an invalid name. A parser that reports error positions should report an error at the AttlistDecl on line 6, before reaching the notation declaration. 53 | o-p58fail4 p58fail4.xml NOTATION must be upper case 54 | o-p58fail5 p58fail5.xml S after keyword is required 55 | o-p58fail6 p58fail6.xml parentheses are require 56 | o-p58fail7 p58fail7.xml values are unquoted 57 | o-p58fail8 p58fail8.xml values are unquoted 58 | o-p59fail1 p59fail1.xml at least one required 59 | o-p59fail2 p59fail2.xml separator must be "," 60 | o-p59fail3 p59fail3.xml values are unquoted 61 | o-p60fail1 p60fail1.xml keywords must be upper case 62 | o-p60fail2 p60fail2.xml S is required after #FIXED 63 | o-p60fail3 p60fail3.xml only #FIXED has both keyword and value 64 | o-p60fail4 p60fail4.xml #FIXED required value 65 | o-p60fail5 p60fail5.xml only one default type 66 | o-p61fail1 p61fail1.xml no other types, including TEMP, which is valid in SGML 67 | o-p62fail1 p62fail1.xml INCLUDE must be upper case 68 | o-p62fail2 p62fail2.xml no spaces in terminating delimiter 69 | o-p63fail1 p63fail1.xml IGNORE must be upper case 70 | o-p63fail2 p63fail2.xml delimiters must be balanced 71 | o-p64fail1 p64fail1.xml section delimiters must balance 72 | o-p64fail2 p64fail2.xml section delimiters must balance 73 | o-p72fail2 p72fail2.xml S is required after '%' 74 | o-p73fail2 p73fail2.xml Only one replacement value 75 | o-p73fail3 p73fail3.xml No NDataDecl on replacement text 76 | o-p74fail1 p74fail1.xml no NDataDecls on parameter entities 77 | o-p74fail3 p74fail3.xml only one value 78 
| o-p75fail1 p75fail1.xml S required after "PUBLIC" 79 | o-p75fail2 p75fail2.xml S required after "SYSTEM" 80 | o-p75fail3 p75fail3.xml S required between literals 81 | o-p75fail4 p75fail4.xml "SYSTEM" implies only one literal 82 | o-p75fail5 p75fail5.xml only one keyword 83 | o-p75fail6 p75fail6.xml "PUBLIC" requires two literals (contrast with SGML) 84 | o-p76fail1 p76fail1.xml S is required before "NDATA" 85 | o-p76fail2 p76fail2.xml "NDATA" is upper-case 86 | o-p76fail3 p76fail3.xml notation name is required 87 | o-p76fail4 p76fail4.xml notation names are Names 88 | o-p11pass1 p11pass1.xml system literals may not contain URI fragments 89 | -------------------------------------------------------------------------------- /src/reader/parser/inside_processing_instruction.rs: -------------------------------------------------------------------------------- 1 | use crate::common::{is_name_char, is_name_start_char, is_whitespace_char}; 2 | use crate::reader::error::SyntaxError; 3 | 4 | use crate::reader::events::XmlEvent; 5 | use crate::reader::lexer::Token; 6 | 7 | use super::{DeclarationSubstate, Encountered, ProcessingInstructionSubstate, PullParser, Result, State}; 8 | 9 | impl PullParser { 10 | pub fn inside_processing_instruction(&mut self, t: Token, s: ProcessingInstructionSubstate) -> Option { 11 | match s { 12 | ProcessingInstructionSubstate::PIInsideName => match t { 13 | Token::Character(c) if self.buf.is_empty() && is_name_start_char(c) || 14 | self.buf_has_data() && is_name_char(c) => { 15 | if self.buf.len() > self.config.max_name_length { 16 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); 17 | } 18 | self.buf.push(c); 19 | None 20 | }, 21 | 22 | Token::ProcessingInstructionEnd => { 23 | // self.buf contains PI name 24 | let name = self.take_buf(); 25 | 26 | // Don't need to check for declaration because it has mandatory attributes 27 | // but there is none 28 | match &*name { 29 | // Name is empty, it is an error 30 | "" => 
Some(self.error(SyntaxError::ProcessingInstructionWithoutName)), 31 | 32 | // Found 35 | Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))), 36 | 37 | // All is ok, emitting event 38 | _ => { 39 | debug_assert!(self.next_event.is_none(), "{:?}", self.next_event); 40 | // can't have a PI before ` { 59 | // self.buf contains PI name 60 | let name = self.take_buf(); 61 | 62 | match &*name { 63 | // We have not ever encountered an element and have not parsed XML declaration 64 | "xml" if self.encountered == Encountered::None => 65 | self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeVersion)), 66 | 67 | // Found 70 | Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))), 71 | 72 | // All is ok, starting parsing PI data 73 | _ => { 74 | self.data.name = name; 75 | // can't have a PI before ` { 83 | let buf = self.take_buf(); 84 | Some(self.error(SyntaxError::UnexpectedProcessingInstruction(buf.into(), t))) 85 | }, 86 | }, 87 | 88 | ProcessingInstructionSubstate::PIInsideData => match t { 89 | Token::ProcessingInstructionEnd => { 90 | let name = self.data.take_name(); 91 | let data = self.take_buf(); 92 | self.into_state_emit( 93 | State::OutsideTag, 94 | Ok(XmlEvent::ProcessingInstruction { name, data: Some(data) }), 95 | ) 96 | }, 97 | 98 | Token::Character(c) if !self.is_valid_xml_char(c) => { 99 | Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) 100 | }, 101 | 102 | // Any other token should be treated as plain characters 103 | _ => { 104 | if self.buf.len() > self.config.max_data_length { 105 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); 106 | } 107 | t.push_to_string(&mut self.buf); 108 | None 109 | }, 110 | }, 111 | } 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/common.rs: -------------------------------------------------------------------------------- 1 | //! 
Contains common types and functions used throughout the library. 2 | 3 | use std::fmt; 4 | 5 | /// Represents a position inside some textual document. 6 | #[derive(Copy, Clone, PartialEq, Eq)] 7 | pub struct TextPosition { 8 | #[doc(hidden)] 9 | pub row: u64, 10 | 11 | #[doc(hidden)] 12 | pub column: u64, 13 | } 14 | 15 | impl TextPosition { 16 | /// Creates a new position initialized to the beginning of the document 17 | #[inline] 18 | #[must_use] 19 | pub const fn new() -> Self { 20 | Self { row: 0, column: 0 } 21 | } 22 | 23 | /// Advances the position in a line 24 | #[inline] 25 | pub fn advance(&mut self, count: u8) { 26 | self.column += u64::from(count); 27 | } 28 | 29 | #[doc(hidden)] 30 | #[deprecated] 31 | pub fn advance_to_tab(&mut self, width: u8) { 32 | let width = u64::from(width); 33 | self.column += width - self.column % width; 34 | } 35 | 36 | /// Advances the position to the beginning of the next line 37 | #[inline] 38 | pub fn new_line(&mut self) { 39 | self.column = 0; 40 | self.row += 1; 41 | } 42 | 43 | /// Row, counting from 0. Add 1 to display as users expect! 44 | #[must_use] 45 | pub fn row(&self) -> u64 { 46 | self.row 47 | } 48 | 49 | /// Column, counting from 0. Add 1 to display as users expect! 50 | #[must_use] 51 | pub fn column(&self) -> u64 { 52 | self.column 53 | } 54 | } 55 | 56 | impl fmt::Debug for TextPosition { 57 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 58 | fmt::Display::fmt(self, f) 59 | } 60 | } 61 | 62 | impl fmt::Display for TextPosition { 63 | #[inline] 64 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 65 | write!(f, "{}:{}", self.row + 1, self.column + 1) 66 | } 67 | } 68 | 69 | /// Get the position in the document corresponding to the object 70 | /// 71 | /// This trait is implemented by parsers, lexers and errors. 72 | pub trait Position { 73 | /// Returns the current position or a position corresponding to the object. 
74 | fn position(&self) -> TextPosition; 75 | } 76 | 77 | impl Position for TextPosition { 78 | #[inline] 79 | fn position(&self) -> TextPosition { 80 | *self 81 | } 82 | } 83 | 84 | /// XML version enumeration. 85 | #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] 86 | pub enum XmlVersion { 87 | /// XML version 1.0. 88 | Version10, 89 | 90 | /// XML version 1.1. 91 | Version11, 92 | } 93 | 94 | impl fmt::Display for XmlVersion { 95 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 96 | match *self { 97 | Self::Version10 => "1.0", 98 | Self::Version11 => "1.1", 99 | }.fmt(f) 100 | } 101 | } 102 | 103 | impl fmt::Debug for XmlVersion { 104 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 105 | fmt::Display::fmt(self, f) 106 | } 107 | } 108 | 109 | /// Checks whether the given character is a white space character (`S`) 110 | /// as is defined by XML 1.1 specification, [section 2.3][1]. 111 | /// 112 | /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn 113 | #[must_use] 114 | #[inline] 115 | pub const fn is_whitespace_char(c: char) -> bool { 116 | matches!(c, '\x20' | '\x0a' | '\x09' | '\x0d') 117 | } 118 | 119 | /// Checks whether the given string is compound only by white space 120 | /// characters (`S`) using the previous `is_whitespace_char` to check 121 | /// all characters of this string 122 | pub fn is_whitespace_str(s: &str) -> bool { 123 | s.chars().all(is_whitespace_char) 124 | } 125 | 126 | /// Is it a valid character in XML 1.0 127 | #[must_use] 128 | pub const fn is_xml10_char(c: char) -> bool { 129 | matches!(c, '\u{09}' | '\u{0A}' | '\u{0D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..) 130 | } 131 | 132 | /// Is it a valid character in XML 1.1 133 | #[must_use] 134 | pub const fn is_xml11_char(c: char) -> bool { 135 | matches!(c, '\u{01}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..) 
136 | } 137 | 138 | /// Is it a valid character in XML 1.1 but not part of the restricted character set 139 | #[must_use] 140 | pub const fn is_xml11_char_not_restricted(c: char) -> bool { 141 | is_xml11_char(c) && 142 | !matches!(c, '\u{01}'..='\u{08}' | '\u{0B}'..='\u{0C}' | '\u{0E}'..='\u{1F}' | '\u{7F}'..='\u{84}' | '\u{86}'..='\u{9F}') 143 | } 144 | 145 | /// Checks whether the given character is a name start character (`NameStartChar`) 146 | /// as is defined by XML 1.1 specification, [section 2.3][1]. 147 | /// 148 | /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn 149 | #[must_use] 150 | pub const fn is_name_start_char(c: char) -> bool { 151 | matches!(c, 152 | ':' | 'A'..='Z' | '_' | 'a'..='z' | 153 | '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' | 154 | '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' | 155 | '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | 156 | '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | 157 | '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' | 158 | '\u{10000}'..='\u{EFFFF}' 159 | ) 160 | } 161 | 162 | /// Checks whether the given character is a name character (`NameChar`) 163 | /// as is defined by XML 1.1 specification, [section 2.3][1]. 164 | /// 165 | /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn 166 | #[must_use] 167 | pub const fn is_name_char(c: char) -> bool { 168 | if is_name_start_char(c) { 169 | return true; 170 | } 171 | matches!(c, 172 | '-' | '.' | '0'..='9' | '\u{B7}' | 173 | '\u{300}'..='\u{36F}' | '\u{203F}'..='\u{2040}' 174 | ) 175 | } 176 | -------------------------------------------------------------------------------- /src/reader.rs: -------------------------------------------------------------------------------- 1 | //! Contains high-level interface for a pull-based XML parser. 2 | //! 3 | //! The most important type in this module is `EventReader`, which provides an iterator 4 | //! view for events in XML document. 
5 | 6 | use std::io::Read; 7 | use std::iter::FusedIterator; 8 | use std::result; 9 | 10 | use crate::common::{Position, TextPosition}; 11 | 12 | pub use self::config::ParserConfig; 13 | pub use self::error::{Error, ErrorKind}; 14 | pub use self::events::XmlEvent; 15 | 16 | // back compat 17 | #[doc(hidden)] 18 | #[deprecated(note = "Merged into ParserConfig")] 19 | pub type ParserConfig2 = ParserConfig; 20 | 21 | use self::parser::PullParser; 22 | 23 | mod config; 24 | mod error; 25 | mod events; 26 | mod indexset; 27 | mod lexer; 28 | mod parser; 29 | 30 | /// A result type yielded by `XmlReader`. 31 | pub type Result = result::Result; 32 | 33 | /// A wrapper around an `std::io::Read` instance which provides pull-based XML parsing. 34 | /// 35 | /// The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow. 36 | pub struct EventReader { 37 | source: R, 38 | parser: PullParser, 39 | } 40 | 41 | impl EventReader { 42 | /// Creates a new reader, consuming the given stream. The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow. 43 | #[inline] 44 | pub fn new(source: R) -> Self { 45 | Self::new_with_config(source, ParserConfig::new()) 46 | } 47 | 48 | /// Creates a new reader with the provided configuration, consuming the given stream. The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow. 49 | #[inline] 50 | pub fn new_with_config(source: R, config: impl Into) -> Self { 51 | Self { 52 | source, 53 | parser: PullParser::new(config), 54 | } 55 | } 56 | 57 | /// Pulls and returns next XML event from the stream. 58 | /// 59 | /// If this returns [Err] or [`XmlEvent::EndDocument`] then further calls to 60 | /// this method will return this event again. 61 | #[inline] 62 | #[allow(clippy::should_implement_trait)] 63 | pub fn next(&mut self) -> Result { 64 | self.parser.next(&mut self.source) 65 | } 66 | 67 | /// Skips all XML events until the next end tag at the current level.
68 | /// 69 | /// Convenience function that is useful for the case where you have 70 | /// encountered a start tag that is of no interest and want to 71 | /// skip the entire XML subtree until the corresponding end tag. 72 | #[inline] 73 | pub fn skip(&mut self) -> Result<()> { 74 | let mut depth = 1; 75 | 76 | while depth > 0 { 77 | match self.next()? { 78 | XmlEvent::StartElement { .. } => depth += 1, 79 | XmlEvent::EndElement { .. } => depth -= 1, 80 | XmlEvent::EndDocument => return Err(Error { 81 | kind: ErrorKind::UnexpectedEof, 82 | pos: self.parser.position(), 83 | }), 84 | _ => {}, 85 | } 86 | } 87 | 88 | Ok(()) 89 | } 90 | 91 | /// Access underlying reader 92 | /// 93 | /// Using it directly while the event reader is parsing is not recommended 94 | pub fn source(&self) -> &R { &self.source } 95 | 96 | /// Access underlying reader 97 | /// 98 | /// Using it directly while the event reader is parsing is not recommended 99 | pub fn source_mut(&mut self) -> &mut R { &mut self.source } 100 | 101 | /// Unwraps this `EventReader`, returning the underlying reader. 102 | /// 103 | /// Note that this operation is destructive; unwrapping the reader and wrapping it 104 | /// again with `EventReader::new()` will create a fresh reader which will attempt 105 | /// to parse an XML document from the beginning. 106 | pub fn into_inner(self) -> R { 107 | self.source 108 | } 109 | 110 | /// Returns the DOCTYPE of the document if it has already been seen 111 | /// 112 | /// Available only after the root `StartElement` event 113 | #[inline] 114 | #[deprecated(note = "there is `XmlEvent::Doctype` now")] 115 | #[allow(deprecated)] 116 | pub fn doctype(&self) -> Option<&str> { 117 | self.parser.doctype() 118 | } 119 | } 120 | 121 | impl Position for EventReader { 122 | /// Returns the position of the last event produced by the reader. 
123 | #[inline] 124 | fn position(&self) -> TextPosition { 125 | self.parser.position() 126 | } 127 | } 128 | 129 | impl IntoIterator for EventReader { 130 | type IntoIter = Events; 131 | type Item = Result; 132 | 133 | fn into_iter(self) -> Events { 134 | Events { reader: self, finished: false } 135 | } 136 | } 137 | 138 | /// An iterator over XML events created from some type implementing `Read`. 139 | /// 140 | /// When the next event is `xml::event::Error` or `xml::event::EndDocument`, then 141 | /// it will be returned by the iterator once, and then it will stop producing events. 142 | pub struct Events { 143 | reader: EventReader, 144 | finished: bool, 145 | } 146 | 147 | impl Events { 148 | /// Unwraps the iterator, returning the internal `EventReader`. 149 | #[inline] 150 | pub fn into_inner(self) -> EventReader { 151 | self.reader 152 | } 153 | 154 | /// Access the underlying reader 155 | /// 156 | /// It's not recommended to use it while the events are still being parsed 157 | pub fn source(&self) -> &R { &self.reader.source } 158 | 159 | /// Access the underlying reader 160 | /// 161 | /// It's not recommended to use it while the events are still being parsed 162 | pub fn source_mut(&mut self) -> &mut R { &mut self.reader.source } 163 | } 164 | 165 | impl FusedIterator for Events { 166 | } 167 | 168 | impl Iterator for Events { 169 | type Item = Result; 170 | 171 | #[inline] 172 | fn next(&mut self) -> Option> { 173 | if self.finished && !self.reader.parser.is_ignoring_end_of_stream() { 174 | None 175 | } else { 176 | let ev = self.reader.next(); 177 | if let Ok(XmlEvent::EndDocument) | Err(_) = ev { 178 | self.finished = true; 179 | } 180 | Some(ev) 181 | } 182 | } 183 | } 184 | 185 | impl<'r> EventReader<&'r [u8]> { 186 | /// A convenience method to create an `XmlReader` from a string slice. 
187 | #[inline] 188 | #[must_use] 189 | #[allow(clippy::should_implement_trait)] 190 | pub fn from_str(source: &'r str) -> Self { 191 | EventReader::new(source.as_bytes()) 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /src/writer/config.rs: -------------------------------------------------------------------------------- 1 | //! Contains emitter configuration structure. 2 | 3 | use crate::writer::EventWriter; 4 | use std::borrow::Cow; 5 | use std::io::Write; 6 | 7 | /// Emitter configuration structure. 8 | /// 9 | /// This structure contains various options which control XML document emitter behavior. 10 | #[derive(Clone, PartialEq, Eq, Debug)] 11 | pub struct EmitterConfig { 12 | /// Line separator used to separate lines in formatted output. Default is `"\n"`. 13 | pub line_separator: Cow<'static, str>, 14 | 15 | /// A string which will be used for a single level of indentation. Default is `" "` 16 | /// (two spaces). 17 | pub indent_string: Cow<'static, str>, 18 | 19 | /// Whether or not the emitted document should be indented. Default is false. 20 | /// 21 | /// The emitter is capable of performing automatic indentation of the emitted XML document. 22 | /// It is done in stream-like fashion and does not require the knowledge of the whole 23 | /// document in advance. 24 | /// 25 | /// Sometimes, however, automatic indentation is undesirable, e.g. when you want to keep 26 | /// existing layout when processing an existing XML document. Also the indentation algorithm 27 | /// is not thoroughly tested. Hence by default it is disabled. 28 | pub perform_indent: bool, 29 | 30 | /// Whether or not characters in output events will be escaped. Default is true. 31 | /// 32 | /// The emitter can automatically escape characters which can't appear in PCDATA sections 33 | /// or element attributes of an XML document, like `<` or `"` (in attributes).
This may 34 | /// introduce some overhead because then every corresponding piece of character data 35 | /// should be scanned for invalid characters. 36 | /// 37 | /// If this option is disabled, the XML writer may produce non-well-formed documents, so 38 | /// use `false` value for this option with care. 39 | pub perform_escaping: bool, 40 | 41 | /// Whether or not to write XML document declaration at the beginning of a document. 42 | /// Default is true. 43 | /// 44 | /// This option controls whether the document declaration should be emitted automatically 45 | /// before a root element is written if it was not emitted explicitly by the user. 46 | pub write_document_declaration: bool, 47 | 48 | /// Whether or not to convert elements with empty content to empty elements. Default is true. 49 | /// 50 | /// This option allows turning elements like `` (an element with empty content) 51 | /// into `` (an empty element). 52 | pub normalize_empty_elements: bool, 53 | 54 | /// Whether or not to emit CDATA events as plain characters. Default is false. 55 | /// 56 | /// This option forces the emitter to convert CDATA events into regular character events, 57 | /// performing all the necessary escaping beforehand. This may be occasionally useful 58 | /// for feeding the document into incorrect parsers which do not support CDATA. 59 | pub cdata_to_characters: bool, 60 | 61 | /// Whether or not to keep element names to support `EndElement` events without explicit names. 62 | /// Default is true. 63 | /// 64 | /// This option makes the emitter to keep names of written elements in order to allow 65 | /// omitting names when writing closing element tags. This could incur some memory overhead. 66 | pub keep_element_names_stack: bool, 67 | 68 | /// Whether or not to automatically insert leading and trailing spaces in emitted comments, 69 | /// if necessary. Default is true. 
70 | /// 71 | /// This is a convenience option in order for the user not to append spaces before and after 72 | /// comments text in order to get more pretty comments: `` instead of 73 | /// ``. 74 | pub autopad_comments: bool, 75 | 76 | /// Whether or not to automatically insert spaces before the trailing `/>` in self-closing 77 | /// elements. Default is true. 78 | /// 79 | /// This option is only meaningful if `normalize_empty_elements` is true. For example, the 80 | /// element `` would be unaffected. When `normalize_empty_elements` is true, then when 81 | /// this option is also true, the same element would appear ``. If this option is false, 82 | /// then the same element would appear ``. 83 | pub pad_self_closing: bool, 84 | } 85 | 86 | impl EmitterConfig { 87 | /// Creates an emitter configuration with default values. 88 | /// 89 | /// You can tweak default options with builder-like pattern: 90 | /// 91 | /// ```rust 92 | /// use xml::writer::EmitterConfig; 93 | /// 94 | /// let config = EmitterConfig::new() 95 | /// .line_separator("\r\n") 96 | /// .perform_indent(true) 97 | /// .normalize_empty_elements(false); 98 | /// ``` 99 | #[inline] 100 | #[must_use] 101 | pub fn new() -> Self { 102 | Self { 103 | line_separator: "\n".into(), 104 | indent_string: " ".into(), // two spaces 105 | perform_indent: false, 106 | perform_escaping: true, 107 | write_document_declaration: true, 108 | normalize_empty_elements: true, 109 | cdata_to_characters: false, 110 | keep_element_names_stack: true, 111 | autopad_comments: true, 112 | pad_self_closing: true, 113 | } 114 | } 115 | 116 | /// Creates an XML writer with this configuration. 
117 | /// 118 | /// This is a convenience method for configuring and creating a writer at the same time: 119 | /// 120 | /// ```rust 121 | /// use xml::writer::EmitterConfig; 122 | /// 123 | /// let mut target: Vec = Vec::new(); 124 | /// 125 | /// let writer = EmitterConfig::new() 126 | /// .line_separator("\r\n") 127 | /// .perform_indent(true) 128 | /// .normalize_empty_elements(false) 129 | /// .create_writer(&mut target); 130 | /// ``` 131 | /// 132 | /// This method is exactly equivalent to calling `EventWriter::new_with_config()` with 133 | /// this configuration object. 134 | #[inline] 135 | pub fn create_writer(self, sink: W) -> EventWriter { 136 | EventWriter::new_with_config(sink, self) 137 | } 138 | } 139 | 140 | impl Default for EmitterConfig { 141 | #[inline] 142 | fn default() -> Self { 143 | Self::new() 144 | } 145 | } 146 | 147 | gen_setters!(EmitterConfig, 148 | line_separator: into Cow<'static, str>, 149 | indent_string: into Cow<'static, str>, 150 | perform_indent: val bool, 151 | write_document_declaration: val bool, 152 | normalize_empty_elements: val bool, 153 | cdata_to_characters: val bool, 154 | keep_element_names_stack: val bool, 155 | autopad_comments: val bool, 156 | pad_self_closing: val bool 157 | ); 158 | -------------------------------------------------------------------------------- /src/reader/parser/inside_opening_tag.rs: -------------------------------------------------------------------------------- 1 | use crate::attribute::OwnedAttribute; 2 | use crate::common::{is_name_start_char, is_whitespace_char}; 3 | use crate::namespace; 4 | use crate::reader::error::SyntaxError; 5 | 6 | use crate::reader::lexer::Token; 7 | 8 | use super::{OpeningTagSubstate, PullParser, QualifiedNameTarget, Result, State}; 9 | 10 | impl PullParser { 11 | pub fn inside_opening_tag(&mut self, t: Token, s: OpeningTagSubstate) -> Option { 12 | let max_attrs = self.config.max_attributes; 13 | match s { 14 | OpeningTagSubstate::InsideName => 
self.read_qualified_name(t, QualifiedNameTarget::OpeningTagNameTarget, |this, token, name| { 15 | match name.prefix_ref() { 16 | Some(prefix) if prefix == namespace::NS_XML_PREFIX || 17 | prefix == namespace::NS_XMLNS_PREFIX => 18 | Some(this.error(SyntaxError::InvalidNamePrefix(prefix.into()))), 19 | _ => { 20 | this.data.element_name = Some(name.clone()); 21 | match token { 22 | Token::TagEnd => this.emit_start_element(false), 23 | Token::EmptyTagEnd => this.emit_start_element(true), 24 | Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)), 25 | _ => { 26 | debug_assert!(false, "unreachable"); 27 | None 28 | }, 29 | } 30 | } 31 | } 32 | }), 33 | 34 | OpeningTagSubstate::InsideTag => match t { 35 | Token::TagEnd => self.emit_start_element(false), 36 | Token::EmptyTagEnd => self.emit_start_element(true), 37 | Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace 38 | Token::Character(c) if is_name_start_char(c) => { 39 | if self.buf.len() > self.config.max_name_length { 40 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); 41 | } 42 | self.buf.push(c); 43 | self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName)) 44 | }, 45 | _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))), 46 | }, 47 | 48 | OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { 49 | // check that no attribute with such name is already present 50 | // if there is one, XML is not well-formed 51 | if this.data.attributes.contains(&name) { 52 | return Some(this.error(SyntaxError::RedefinedAttribute(name.to_string().into()))) 53 | } 54 | 55 | this.data.attr_name = Some(name); 56 | match token { 57 | Token::EqualsSign => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), 58 | Token::Character(c) if is_whitespace_char(c) => 
this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)), 59 | _ => Some(this.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) // likely unreachable 60 | } 61 | }), 62 | 63 | OpeningTagSubstate::AfterAttributeName => match t { 64 | Token::EqualsSign => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), 65 | Token::Character(c) if is_whitespace_char(c) => None, 66 | _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) 67 | }, 68 | 69 | OpeningTagSubstate::InsideAttributeValue => self.read_attribute_value(t, |this, value| { 70 | let name = this.data.take_attr_name()?; // will always succeed here 71 | match name.prefix_ref() { 72 | // declaring a new prefix; it is sufficient to check prefix only 73 | // because "xmlns" prefix is reserved 74 | Some(namespace::NS_XMLNS_PREFIX) => { 75 | let ln = &*name.local_name; 76 | if ln == namespace::NS_XMLNS_PREFIX { 77 | Some(this.error(SyntaxError::CannotRedefineXmlnsPrefix)) 78 | } else if ln == namespace::NS_XML_PREFIX && &*value != namespace::NS_XML_URI { 79 | Some(this.error(SyntaxError::CannotRedefineXmlPrefix)) 80 | } else if value.is_empty() { 81 | Some(this.error(SyntaxError::CannotUndefinePrefix(ln.into()))) 82 | } else { 83 | this.nst.put(name.local_name.clone(), value); 84 | this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue)) 85 | } 86 | }, 87 | 88 | // declaring default namespace 89 | None if &*name.local_name == namespace::NS_XMLNS_PREFIX => 90 | match &*value { 91 | namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX | namespace::NS_XML_URI | namespace::NS_XMLNS_URI => 92 | Some(this.error(SyntaxError::InvalidDefaultNamespace(value.into()))), 93 | _ => { 94 | this.nst.put(namespace::NS_NO_PREFIX, value.clone()); 95 | this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue)) 96 | } 97 | }, 98 | 99 | // regular attribute 100 | _ => { 101 | if 
this.data.attributes.len() >= max_attrs { 102 | return Some(this.error(SyntaxError::ExceededConfiguredLimit)); 103 | } 104 | this.data.attributes.push(OwnedAttribute { name, value }); 105 | this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue)) 106 | }, 107 | } 108 | }), 109 | 110 | OpeningTagSubstate::AfterAttributeValue => match t { 111 | Token::Character(c) if is_whitespace_char(c) => { 112 | self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) 113 | }, 114 | Token::TagEnd => self.emit_start_element(false), 115 | Token::EmptyTagEnd => self.emit_start_element(true), 116 | _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))), 117 | }, 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /Changelog.md: -------------------------------------------------------------------------------- 1 | ## Version 1.0.0 2 | 3 | * Added `Doctype` event 4 | * Marked structs as `#[non_exhaustive]` 5 | * Merged `ParserConfig2` back into `ParserConfig` 6 | * Added option to the writer to pass through XML markup unmodified 7 | * `xml-analyze` binary has been moved to examples 8 | * Writer escapes `--` in comments and `]]>` in CDATA 9 | 10 | ## Version 0.8.27 11 | 12 | * Added detection of invalid `` in CDATA 42 | 43 | ## Version 0.8.19 44 | 45 | * Fixed whitespace event when parsing DOCTYPE with internal subset 46 | 47 | ## Version 0.8.18 48 | 49 | * Option to tolerate invalid entities and chars 50 | 51 | ## Version 0.8.17 52 | 53 | * Added configuration for document size/complexity limits. 
54 | 55 | ## Version 0.8.16 56 | 57 | * Fixed error line numbers when parsing CDATA as characters 58 | 59 | ## Version 0.8.15 60 | 61 | * Improved speed of parsing elements with a huge number of attributes 62 | 63 | ## Version 0.8.14 64 | 65 | * Fixed error line numbers when ignoring comments 66 | 67 | ## Version 0.8.13 68 | 69 | * Backward-compatibility fix 70 | 71 | ## Version 0.8.12 72 | 73 | * Improved conformance of parsing invalid codepoints, XML prolog 74 | * Reduced number of allocations 75 | 76 | ## Version 0.8.11 77 | 78 | * Improved conformance of PI 79 | * Invalid multiple root elements are now forbidden, unless an option allowing them is enabled. 80 | 81 | ## Version 0.8.10 82 | 83 | * Improved parsing conformance 84 | * Internal error handling improvements 85 | 86 | ## Version 0.8.9 87 | 88 | * Added support for UTF-16 and ASCII 89 | * Fixed CDATA parsing 90 | * Added PE entities parsing 91 | 92 | ## Version 0.8.8 93 | 94 | * Added recursive entity expansion (with length protection) 95 | * Expanded parsing of DTD 96 | 97 | ## Version 0.8.7 98 | 99 | * Basic parsing of DTD internal subset 100 | * Speed improvements 101 | 102 | ## Version 0.8.6 103 | 104 | * Fixed parsing of incorrectly nested comments and processing instructions 105 | 106 | ## Version 0.8.5 107 | 108 | * Updated source code to edition 2018 and fixed/updated some Rust idioms. 109 | 110 | ## Version 0.8.4 111 | 112 | * Fixed recognition of `?>`, `]]>` and `/>` tokens as characters. 113 | * Fixed writer output operations to use `write_all` to ensure that the data 114 | is written fully. 115 | * The document declaration is now written before any characters automatically. 116 | 117 | ## Version 0.8.3 118 | 119 | * Added a new parser option, `ignore_root_level_whitespace`, which makes the parser 120 | skip emitting whitespace events outside of the root element when set to `true`. 121 | This helps with certain tasks like canonicalization. 
122 | 123 | ## Version 0.8.2 124 | 125 | * Added a new parser option, `replace_unknown_entity_references`, which allows ignoring 126 | invalid Unicode code points and replacing them with a Unicode "replacement character" 127 | during parsing. This can be helpful to deal with e.g. UTF-16 surrogate pairs. 128 | * Added a new emitter option, `pad_self_closing`, which determines the style of the self-closing 129 | elements when they are emitted: `<a />` (`true`) vs `<a/>` (`false`). 130 | 131 | ## Version 0.8.1 132 | 133 | * Fixed various issues with tests introduced by updates in Rust. 134 | * Adjusted the lexer to ignore contents of the `<!DOCTYPE>` tag. 135 | * Removed unnecessary unsafety in tests. 136 | * Added tests for doc comments in the readme file. 137 | * Switched to GitHub Actions from Travis CI. 138 | 139 | ## Version 0.8.0 140 | 141 | * Same as 0.7.1, with 0.7.1 being yanked because of the incorrect semver bump. 142 | 143 | ## Version 0.7.1 144 | 145 | * Removed dependency on bitflags. 146 | * Added the `XmlWriter::inner_mut()` method. 147 | * Fixed some rustdoc warnings. 148 | 149 | ## Version 0.7.0 150 | 151 | * Same as 0.6.2, with 0.6.2 being yanked because of the incompatible bump of minimum required version of rustc. 152 | 153 | ## Version 0.6.2 154 | 155 | * Bumped `bitflags` to 1.0. 156 | 157 | ## Version 0.6.1 158 | 159 | * Fixed the writer to escape some special characters when writing attribute values. 160 | 161 | ## Version 0.6.0 162 | 163 | * Changed the target type of extra entities from `char` to `String`. This is an incompatible 164 | change. 165 | 166 | ## Version 0.5.0 167 | 168 | * Added support for ignoring EOF errors in order to read documents from streams incrementally. 169 | * Bumped `bitflags` to 0.9. 170 | 171 | ## Version 0.4.1 172 | 173 | * Added missing `Debug` implementation to `xml::writer::XmlEvent`. 174 | 175 | ## Version 0.4.0 176 | 177 | * Bumped version number, since changes introduced in 0.3.7 break backwards compatibility. 
178 | 179 | ## Version 0.3.8 180 | 181 | * Fixed a problem introduced in 0.3.7 with entities in attributes causing parsing errors. 182 | 183 | ## Version 0.3.7 184 | 185 | * Fixed the problem with parsing non-whitespace character entities as whitespace (issue #140). 186 | * Added support for configuring custom entities in the parser configuration. 187 | 188 | ## Version 0.3.6 189 | 190 | * Added an `Error` implementation for `EmitterError`. 191 | * Fixed escaping of strings with multi-byte code points. 192 | 193 | ## Version 0.3.5 194 | 195 | * Added `Debug` implementation for `XmlVersion`. 196 | * Fixed some failing tests. 197 | 198 | ## Version 0.3.3 199 | 200 | * Updated `bitflags` to 0.7. 201 | 202 | ## Version 0.3.2 203 | 204 | * Added `From<io::Error>` for `xml::reader::Error`, which improves usability of working with parsing errors. 205 | 206 | ## Version 0.3.1 207 | 208 | * Bumped `bitflags` dependency to 0.4, some internal warning fixes. 209 | 210 | ## Version 0.3.0 211 | 212 | * Changed error handling in `EventReader` - now I/O errors are properly bubbled up from the lexer. 213 | 214 | ## Version 0.2.4 215 | 216 | * Fixed #112 - incorrect handling of namespace redefinitions when writing a document. 217 | 218 | ## Version 0.2.3 219 | 220 | * Added `into_inner()` methods to `EventReader` and `EventWriter`. 221 | 222 | ## Version 0.2.2 223 | 224 | * Using `join` instead of the deprecated `connect`. 225 | * Added a simple XML analyzer program which demonstrates library usage and can be used to check XML documents for well-formedness. 226 | * Fixed incorrect handling of unqualified attribute names (#107). 227 | * Added this changelog. 228 | 229 | ## Version 0.2.1 230 | 231 | * Fixed #105 - incorrect handling of double dashes. 232 | 233 | ## Version 0.2.0 234 | 235 | * Major update, includes proper document writing support and significant architecture changes. 
236 | -------------------------------------------------------------------------------- /tests/xmltest.fail.txt: -------------------------------------------------------------------------------- 1 | not-wf-sa-003 003.xml Processing Instruction target name is required. 2 | not-wf-sa-054 054.xml PUBLIC requires two literals. 3 | not-wf-sa-056 056.xml Invalid Document Type Definition format - misplaced comment. 4 | not-wf-sa-057 057.xml This isn't SGML; comments can't exist in declarations. 5 | not-wf-sa-058 058.xml Invalid character , in ATTLIST enumeration 6 | not-wf-sa-059 059.xml String literal must be in quotes. 7 | not-wf-sa-060 060.xml Invalid type NAME defined in ATTLIST. 8 | not-wf-sa-061 061.xml External entity declarations require whitespace between public and system IDs. 9 | not-wf-sa-064 064.xml Space is required between attribute type and default values in declarations. 10 | not-wf-sa-065 065.xml Space is required between attribute name and type in declarations. 11 | not-wf-sa-066 066.xml Required whitespace is missing. 12 | not-wf-sa-067 067.xml Space is required between attribute type and default values in declarations. 13 | not-wf-sa-068 068.xml Space is required between NOTATION keyword and list of enumerated choices in declarations. 14 | not-wf-sa-069 069.xml Space is required before an NDATA entity annotation. 15 | not-wf-sa-074 074.xml Internal general parsed entities are only well formed if they match the "content" production. 16 | not-wf-sa-075 075.xml ENTITY can't reference itself directly or indirectly. 17 | not-wf-sa-077 077.xml Undefined ENTITY bar. 18 | not-wf-sa-078 078.xml Undefined ENTITY foo. 19 | not-wf-sa-079 079.xml ENTITY can't reference itself directly or indirectly. 20 | not-wf-sa-080 080.xml ENTITY can't reference itself directly or indirectly. 21 | not-wf-sa-081 081.xml This tests the No External Entity References WFC, since the entity is referred to within an attribute. 
22 | not-wf-sa-082 082.xml This tests the No External Entity References WFC, since the entity is referred to within an attribute. 23 | not-wf-sa-083 083.xml Undefined NOTATION n. 24 | not-wf-sa-084 084.xml Tests the Parsed Entity WFC by referring to an unparsed entity. (This precedes the error of not declaring that entity's notation, which may be detected any time before the DTD parsing is completed.) 25 | not-wf-sa-085 085.xml Public IDs may not contain "[". 26 | not-wf-sa-086 086.xml Public IDs may not contain "[". 27 | not-wf-sa-087 087.xml Public IDs may not contain "[". 28 | not-wf-sa-089 089.xml Parameter entities "are" always parsed; NDATA annotations are not permitted. 29 | not-wf-sa-091 091.xml Parameter entities "are" always parsed; NDATA annotations are not permitted. 30 | not-wf-sa-104 104.xml Internal general parsed entities are only well formed if they match the "content" production. 31 | not-wf-sa-115 115.xml The replacement text of this entity is an illegal character reference, which must be rejected when it is parsed in the context of an attribute value. 32 | not-wf-sa-116 116.xml Internal general parsed entities are only well formed if they match the "content" production. This is a partial character reference, not a full one. 33 | not-wf-sa-117 117.xml Internal general parsed entities are only well formed if they match the "content" production. This is a partial character reference, not a full one. 34 | not-wf-sa-119 119.xml Internal general parsed entities are only well formed if they match the "content" production. This is a partial character reference, not a full one. 35 | not-wf-sa-122 122.xml Invalid syntax mixed connectors are used. 36 | not-wf-sa-123 123.xml Invalid syntax mismatched parenthesis. 37 | not-wf-sa-124 124.xml Invalid format of Mixed-content declaration. 38 | not-wf-sa-125 125.xml Invalid syntax extra set of parenthesis not necessary. 39 | not-wf-sa-126 126.xml Invalid syntax Mixed-content must be defined as zero or more. 
40 | not-wf-sa-127 127.xml Invalid syntax Mixed-content must be defined as zero or more. 41 | not-wf-sa-128 128.xml Invalid CDATA syntax. 42 | not-wf-sa-129 129.xml Invalid syntax for Element Type Declaration. 43 | not-wf-sa-130 130.xml Invalid syntax for Element Type Declaration. 44 | not-wf-sa-131 131.xml Invalid syntax for Element Type Declaration. 45 | not-wf-sa-132 132.xml Invalid syntax mixed connectors used. 46 | not-wf-sa-133 133.xml Illegal whitespace before optional character causes syntax error. 47 | not-wf-sa-134 134.xml Illegal whitespace before optional character causes syntax error. 48 | not-wf-sa-135 135.xml Invalid character used as connector. 49 | not-wf-sa-136 136.xml Tag omission is invalid in XML. 50 | not-wf-sa-137 137.xml Space is required before a content model. 51 | not-wf-sa-138 138.xml Invalid syntax for content particle. 52 | not-wf-sa-139 139.xml The element-content model should not be empty. 53 | not-wf-sa-149 149.xml XML Declaration may not be within a DTD. 54 | not-wf-sa-158 158.xml SGML-ism: "#NOTATION gif" can't have attributes. 55 | not-wf-sa-159 159.xml Uses '&' unquoted in an entity declaration, which is illegal syntax for an entity reference. 56 | not-wf-sa-160 160.xml Violates the PEs in Internal Subset WFC by using a PE reference within a declaration. 57 | not-wf-sa-161 161.xml Violates the PEs in Internal Subset WFC by using a PE reference within a declaration. 58 | not-wf-sa-162 162.xml Violates the PEs in Internal Subset WFC by using a PE reference within a declaration. 59 | not-wf-sa-164 164.xml Invalid placement of Parameter entity reference. 60 | not-wf-sa-180 180.xml The Entity Declared WFC requires entities to be declared before they are used in an attribute list declaration. 61 | not-wf-sa-181 181.xml Internal parsed entities must match the content production to be well formed. 62 | not-wf-sa-182 182.xml Internal parsed entities must match the content production to be well formed. 
63 | not-wf-sa-183 183.xml Mixed content declarations may not include content particles. 64 | not-wf-sa-184 184.xml In mixed content models, element names must not be parenthesized. 65 | not-wf-not-sa-001 001.xml Conditional sections must be properly terminated ("]>" used instead of "]]>"). 66 | not-wf-not-sa-002 002.xml Processing instruction target names may not be "XML" in any combination of cases. 67 | not-wf-not-sa-003 003.xml Conditional sections must be properly terminated ("]]>" omitted). 68 | not-wf-not-sa-004 004.xml Conditional sections must be properly terminated ("]]>" omitted). 69 | not-wf-not-sa-005 005.xml Tests the Entity Declared VC by referring to an undefined parameter entity within an external entity. 70 | not-wf-not-sa-006 006.xml Conditional sections need a '[' after the INCLUDE or IGNORE. 71 | not-wf-not-sa-007 007.xml A declaration may not begin any external entity; it's only found once, in the document entity. 72 | not-wf-not-sa-008 008.xml In DTDs, the '%' character must be part of a parameter entity reference. 73 | not-wf-not-sa-009 009.xml This test violates WFC:PE Between Declarations in Production 28a. The last character of a markup declaration is not contained in the same parameter-entity text replacement. 74 | not-wf-ext-sa-001 001.xml Tests the No Recursion WFC by having an external general entity be self-recursive. 75 | not-wf-ext-sa-002 002.xml External entities have "text declarations", which do not permit the "standalone=..." attribute that's allowed in XML declarations. 76 | not-wf-ext-sa-003 003.xml Only one text declaration is permitted; a second one looks like an illegal processing instruction (target names of "xml" in any case are not allowed). 
77 | valid-sa-012 012.xml Uses a legal XML 1.0 name consisting of a single colon character (disallowed by the latest XML Namespaces draft).; 5:7 Qualified name is invalid: : 78 | valid-not-sa-031 031.xml Expands a general entity which contains a CDATA section with what looks like a markup declaration (but is just text since it's in a CDATA section).; 2:8 Unexpected entity: e 79 | -------------------------------------------------------------------------------- /tests/xmlconf.rs: -------------------------------------------------------------------------------- 1 | //! W3C XML conformance test suite 2 | 3 | use std::collections::{HashMap, HashSet}; 4 | use std::fs::File; 5 | use std::io::BufReader; 6 | use std::path::Path; 7 | use std::process::Command; 8 | use std::sync::Mutex; 9 | use xml::reader::XmlEvent; 10 | use xml::{EventWriter, ParserConfig}; 11 | 12 | static UNZIP: Mutex<()> = Mutex::new(()); 13 | 14 | fn ensure_unzipped() { 15 | let _g = UNZIP.lock().expect("unzip already failed"); 16 | 17 | // test suite license only allows redistribution of unmodified zip! 
18 | if !Path::new("tests/xmlconf").exists() { 19 | assert!(Command::new("unzip") 20 | .current_dir("tests") 21 | .arg("xmlts20130923.zip") 22 | .status().unwrap().success(), "must unzip"); 23 | } 24 | } 25 | 26 | #[track_caller] 27 | fn run_suite(suite_rel_path: &str) { 28 | run_suite_with_config(suite_rel_path, ParserConfig::default().allow_multiple_root_elements(true)); 29 | run_suite_with_config(suite_rel_path, ParserConfig::default().coalesce_characters(false)); 30 | run_suite_with_config(suite_rel_path, ParserConfig::default().ignore_comments(false)); 31 | run_suite_with_config(suite_rel_path, ParserConfig::new().trim_whitespace(true).whitespace_to_characters(true).cdata_to_characters(true).ignore_comments(true).coalesce_characters(true)); 32 | run_suite_with_config(suite_rel_path, ParserConfig::default().allow_multiple_root_elements(false).ignore_root_level_whitespace(false)); 33 | } 34 | 35 | #[track_caller] 36 | fn run_suite_with_config(suite_rel_path: &str, parser_config: ParserConfig) { 37 | ensure_unzipped(); 38 | 39 | let suite_path = Path::new("tests").join(suite_rel_path); 40 | let known_failures_file_path = Path::new("tests").join(suite_path.with_extension("fail.txt").file_name().unwrap()); 41 | let mut new_known_failures_file = if std::env::var("PRINT_SPEC").map_or(false, |val| val == "1") { Some(String::new()) } else { None }; 42 | 43 | let known_broken_test_ids: HashSet<_> = std::fs::read_to_string(&known_failures_file_path).unwrap_or_default().lines() 44 | .map(|l| l.trim().split(' ').next().unwrap().to_string()).collect(); 45 | 46 | let root = suite_path.parent().unwrap(); 47 | let mut parsed = 0; 48 | 49 | let f = BufReader::new(File::open(&suite_path) 50 | .map_err(|e| format!("{}: {e}", suite_path.display())).unwrap()); 51 | let r = ParserConfig::default().allow_multiple_root_elements(true).create_reader(f); 52 | let mut desc = String::new(); 53 | let mut attr = HashMap::<String, String>::new(); 54 | for e in r { 55 | let e = e.map_err(|e| format!("{}: 
{e}", suite_path.display())).expect("testsuite validity"); 56 | match e { 57 | XmlEvent::Characters(chr) => { 58 | desc.push_str(&chr.replace('\n', " ").replace("  ", " ").replace("  ", " ")); 59 | }, 60 | XmlEvent::EndElement { name } if name.local_name == "TEST" => { 61 | let path = root.join(&attr["URI"]); 62 | let test_type = attr["TYPE"].as_str(); 63 | let id = attr.get("ID").map(|a| a.as_str()).unwrap_or_else(|| path.file_stem().unwrap().to_str().unwrap()); 64 | 65 | if attr.get("EDITION").map(|s| s.as_str()) == Some("1 2 3 4") { 66 | // tests obsolete things changed in edition 5 67 | continue; 68 | } 69 | 70 | let res = match test_type { 71 | "valid" => expect_well_formed(&path, &desc, parser_config.clone()), 72 | "invalid" => expect_well_formed(&path, &desc, parser_config.clone()), // invalid is still well-formed 73 | "not-wf" | "error" => expect_ill_formed(&path, &desc), 74 | other => unimplemented!("{other}?? type"), 75 | }; 76 | 77 | if let Some(out) = new_known_failures_file.as_mut() { 78 | if let Err(e) = res { 79 | use std::fmt::Write; 80 | writeln!(out, "{id} {}", e.to_string().replace('\n', " ")).unwrap(); 81 | } 82 | } else { 83 | let known_bad = known_broken_test_ids.contains(id); 84 | match res { 85 | Err(_) if known_bad => {}, 86 | Err(e) => panic!("{suite_rel_path} failed on {} ({id})\n{e}", path.display()), 87 | Ok(()) if known_bad => panic!("expected {} ({id}) to fail, but it passes {test_type} of {suite_rel_path} now\n{desc}", path.display()), 88 | Ok(()) => {}, 89 | } 90 | } 91 | 92 | parsed += 1; 93 | }, 94 | XmlEvent::StartElement { name, attributes, namespace: _ } if name.local_name == "TEST" => { 95 | desc.clear(); 96 | attr = attributes.into_iter().map(|a| (a.name.local_name, a.value)).collect(); 97 | }, 98 | _ => {}, 99 | } 100 | } 101 | if let Some(out) = new_known_failures_file { 102 | if out.is_empty() { 103 | let _ = std::fs::remove_file(known_failures_file_path); 104 | } else { 105 | std::fs::write(known_failures_file_path, 
out).unwrap(); 106 | } 107 | } 108 | assert!(parsed > 0); 109 | } 110 | 111 | #[track_caller] 112 | fn expect_well_formed(xml_path: &Path, msg: &str, parser_config: ParserConfig) -> Result<(), Box<dyn std::error::Error>> { 113 | let f = BufReader::new(File::open(xml_path).expect("testcase")); 114 | let r = parser_config.create_reader(f); 115 | let mut w = EventWriter::new(Vec::new()); 116 | let mut seen_any = false; 117 | let mut writes_failed = None; 118 | let mut document_started = false; 119 | for e in r { 120 | let e = e.map_err(|e| format!("{} {msg}; {e}", xml_path.file_name().and_then(std::ffi::OsStr::to_str).unwrap()))?; 121 | match e { 122 | XmlEvent::EndElement { .. } => { 123 | seen_any = true; 124 | }, 125 | XmlEvent::StartDocument { .. } => { 126 | if document_started { return Err("document started twice".into()); } 127 | document_started = true; 128 | }, 129 | _ => {}, 130 | } 131 | if let Some(e) = e.as_writer_event() { 132 | if let Err(e) = w.write(e) { 133 | writes_failed = Some(e); 134 | } 135 | } 136 | } 137 | if !seen_any { 138 | return Err("no elements found".into()); 139 | } 140 | if let Some(e) = writes_failed { 141 | panic!("{} write failed on {e}", xml_path.display()); 142 | } 143 | Ok(()) 144 | } 145 | 146 | #[track_caller] 147 | fn expect_ill_formed(xml_path: &Path, msg: &str) -> Result<(), Box<dyn std::error::Error>> { 148 | let f = BufReader::new(File::open(xml_path)?); 149 | let r = ParserConfig::new().allow_multiple_root_elements(false).create_reader(f); 150 | for e in r { 151 | if e.is_err() { 152 | return Ok(()); 153 | } 154 | } 155 | Err(format!("{} {msg}", xml_path.file_name().and_then(std::ffi::OsStr::to_str).unwrap()).into()) 156 | } 157 | 158 | #[test] 159 | fn eduni_errata_2e() { 160 | run_suite("xmlconf/eduni/errata-2e/errata2e.xml"); 161 | } 162 | 163 | #[test] 164 | fn eduni_errata_3e() { 165 | run_suite("xmlconf/eduni/errata-3e/errata3e.xml"); 166 | } 167 | 168 | #[test] 169 | fn eduni_errata_4e() { 170 | run_suite("xmlconf/eduni/errata-4e/errata4e.xml"); 171 | } 172 | 
173 | #[test] 174 | fn eduni_misc_ht() { 175 | run_suite("xmlconf/eduni/misc/ht-bh.xml"); 176 | } 177 | 178 | #[test] 179 | fn eduni_namespaces_10() { 180 | run_suite("xmlconf/eduni/namespaces/1.0/rmt-ns10.xml"); 181 | } 182 | 183 | #[test] 184 | fn eduni_namespaces_11() { 185 | run_suite("xmlconf/eduni/namespaces/1.1/rmt-ns11.xml"); 186 | } 187 | 188 | #[test] 189 | fn eduni_namespaces_errata() { 190 | run_suite("xmlconf/eduni/namespaces/errata-1e/errata1e.xml"); 191 | } 192 | 193 | #[test] 194 | fn eduni_xml_11() { 195 | run_suite("xmlconf/eduni/xml-1.1/xml11.xml"); 196 | } 197 | 198 | #[test] 199 | fn ibm_oasis_valid() { 200 | run_suite("xmlconf/ibm/ibm_oasis_valid.xml"); 201 | } 202 | 203 | #[test] 204 | fn ibm_xml_11() { 205 | run_suite("xmlconf/ibm/xml-1.1/ibm_valid.xml"); 206 | } 207 | 208 | #[test] 209 | fn oasis() { 210 | run_suite("xmlconf/oasis/oasis.xml"); 211 | } 212 | 213 | #[test] 214 | fn sun_valid() { 215 | run_suite("xmlconf/sun/sun-valid.xml"); 216 | } 217 | 218 | #[test] 219 | fn sun_ill_formed() { 220 | run_suite("xmlconf/sun/sun-not-wf.xml"); 221 | } 222 | 223 | #[test] 224 | fn japanese() { 225 | run_suite("xmlconf/japanese/japanese.xml"); 226 | } 227 | 228 | #[test] 229 | fn xmltest() { 230 | run_suite("xmlconf/xmltest/xmltest.xml"); 231 | } 232 | 233 | #[test] 234 | fn own_tests() { 235 | run_suite("tests.xml"); 236 | } 237 | -------------------------------------------------------------------------------- /benches/bench.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate test; 4 | use test::Bencher; 5 | use xml::{EventReader, EventWriter}; 6 | 7 | #[bench] 8 | fn read(bencher: &mut Bencher) { 9 | let xml = std::fs::read("tests/documents/sample_1.xml").unwrap(); 10 | bencher.iter(move || { 11 | let parser = EventReader::new(xml.as_slice()); 12 | for e in parser { 13 | e.unwrap(); 14 | } 15 | }); 16 | } 17 | 18 | #[bench] 19 | fn read_lots_attrs(bencher: &mut Bencher) 
{ 20 | let xml = r#""#; 23 | bencher.iter(move || { 24 | let parser = EventReader::new(xml.as_bytes()); 25 | for e in parser { 26 | e.unwrap(); 27 | } 28 | }); 29 | } 30 | 31 | #[bench] 32 | fn write(bencher: &mut Bencher) { 33 | let xml = std::fs::read("tests/documents/sample_1.xml").unwrap(); 34 | let events: Vec<_> = EventReader::new(xml.as_slice()).into_iter().map(|e| e.unwrap()).collect(); 35 | let events: Vec<_> = events.iter().filter_map(|e| e.as_writer_event()).collect(); 36 | 37 | bencher.iter(move || { 38 | let mut serializer = EventWriter::new(Vec::new()); 39 | for e in &events { 40 | serializer.write((*e).clone()).unwrap(); 41 | } 42 | serializer.into_inner() 43 | }); 44 | } 45 | -------------------------------------------------------------------------------- /src/reader/events.rs: -------------------------------------------------------------------------------- 1 | //! Contains `XmlEvent` datatype, instances of which are emitted by the parser. 2 | 3 | use crate::attribute::OwnedAttribute; 4 | use crate::common::XmlVersion; 5 | use crate::name::OwnedName; 6 | use crate::namespace::Namespace; 7 | use std::fmt; 8 | 9 | /// An element of an XML input stream. 10 | /// 11 | /// Items of this enum are emitted by `reader::EventReader`. They correspond to different 12 | /// elements of an XML document. 13 | #[derive(PartialEq, Clone)] 14 | pub enum XmlEvent { 15 | /// Corresponds to XML document declaration. 16 | /// 17 | /// This event is always emitted before any other event. It is emitted 18 | /// even if the actual declaration is not present in the document. 19 | StartDocument { 20 | /// XML version. 21 | /// 22 | /// If XML declaration is not present, defaults to `Version10`. 23 | version: XmlVersion, 24 | 25 | /// XML document encoding. 26 | /// 27 | /// If XML declaration is not present or does not contain `encoding` attribute, 28 | /// defaults to `"UTF-8"`. This field is currently used for no other purpose than 29 | /// informational. 
30 | encoding: String, 31 | 32 | /// XML standalone declaration. 33 | /// 34 | /// If XML declaration is not present or does not contain `standalone` attribute, 35 | /// defaults to `None`. This field is currently used for no other purpose than 36 | /// informational. 37 | standalone: Option<bool>, 38 | }, 39 | 40 | /// Denotes the end of the document stream. 41 | /// 42 | /// This event is always emitted after any other event (except `Error`). After it 43 | /// is emitted for the first time, it will always be emitted on next event pull attempts. 44 | EndDocument, 45 | 46 | /// Denotes an XML processing instruction. 47 | /// 48 | /// This event contains a processing instruction target (`name`) and opaque `data`. It 49 | /// is up to the application to process them. 50 | ProcessingInstruction { 51 | /// Processing instruction target. 52 | name: String, 53 | 54 | /// Processing instruction content. 55 | data: Option<String>, 56 | }, 57 | 58 | /// Denotes a beginning of an XML element. 59 | /// 60 | /// This event is emitted after parsing opening tags or after parsing bodiless tags. In the 61 | /// latter case `EndElement` event immediately follows. 62 | StartElement { 63 | /// Qualified name of the element. 64 | name: OwnedName, 65 | 66 | /// A list of attributes associated with the element. 67 | /// 68 | /// Duplicate attribute names are rejected by the parser (`RedefinedAttribute` error). 69 | attributes: Vec<OwnedAttribute>, 70 | 71 | /// Contents of the namespace mapping at this point of the document. 72 | namespace: Namespace, 73 | }, 74 | 75 | /// Denotes an end of an XML element. 76 | /// 77 | /// This event is emitted after parsing closing tags or after parsing bodiless tags. In the 78 | /// latter case it is emitted immediately after corresponding `StartElement` event. 79 | EndElement { 80 | /// Qualified name of the element. 81 | name: OwnedName, 82 | }, 83 | 84 | /// Denotes CDATA content. 85 | /// 86 | /// This event contains unparsed data. No unescaping will be performed. 
87 | /// 88 | /// It is possible to configure a parser to emit `Characters` event instead of `CData`. See 89 | /// `ParserConfig::cdata_to_characters` for more information. 90 | CData(String), 91 | 92 | /// Denotes a comment. 93 | /// 94 | /// It is possible to configure a parser to ignore comments, so this event will never be emitted. 95 | /// See `ParserConfig::ignore_comments` for more information. 96 | Comment(String), 97 | 98 | /// Denotes character data outside of tags. 99 | /// 100 | /// Contents of this event will always be unescaped, so no entities like `&lt;` or `&amp;` or `&#123;` 101 | /// will appear in it. 102 | /// 103 | /// It is possible to configure a parser to trim leading and trailing whitespace for this event. 104 | /// See `ParserConfig::trim_whitespace` for more information. 105 | Characters(String), 106 | 107 | /// Denotes a chunk of whitespace outside of tags. 108 | /// 109 | /// It is possible to configure a parser to emit `Characters` event instead of `Whitespace`. 110 | /// See `ParserConfig::whitespace_to_characters` for more information. When combined with whitespace 111 | /// trimming, it will eliminate standalone whitespace from the event stream completely. 
112 | Whitespace(String), 113 | /// The whole DOCTYPE markup 114 | Doctype { 115 | /// Everything including `<` and `>` 116 | syntax: String, 117 | }, 118 | } 119 | 120 | impl fmt::Debug for XmlEvent { 121 | #[cold] 122 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 123 | match self { 124 | Self::StartDocument { version, encoding, standalone } => 125 | write!(f, "StartDocument({}, {}, {:?})", version, *encoding, standalone), 126 | Self::EndDocument => 127 | write!(f, "EndDocument"), 128 | Self::ProcessingInstruction { name, data } => 129 | write!(f, "ProcessingInstruction({}{})", *name, match data { 130 | Some(data) => format!(", {data}"), 131 | None => String::new() 132 | }), 133 | Self::StartElement { name, attributes, namespace: Namespace(namespace) } => 134 | write!(f, "StartElement({}, {:?}{})", name, namespace, if attributes.is_empty() { 135 | String::new() 136 | } else { 137 | let attributes: Vec<String> = attributes.iter().map( 138 | |a| format!("{} -> {}", a.name, a.value) 139 | ).collect(); 140 | format!(", [{}]", attributes.join(", ")) 141 | }), 142 | Self::EndElement { name } => 143 | write!(f, "EndElement({name})"), 144 | Self::Comment(data) => 145 | write!(f, "Comment({data})"), 146 | Self::CData(data) => 147 | write!(f, "CData({data})"), 148 | Self::Characters(data) => 149 | write!(f, "Characters({data})"), 150 | Self::Whitespace(data) => 151 | write!(f, "Whitespace({data})"), 152 | Self::Doctype { syntax } => 153 | write!(f, "Doctype({syntax})"), 154 | } 155 | } 156 | } 157 | 158 | impl XmlEvent { 159 | /// Obtains a writer event from this reader event. 160 | /// 161 | /// This method is useful for streaming processing of XML documents where the output 162 | /// is also an XML document. 
With this method it is possible to process some events 163 | /// while passing other events through to the writer unchanged: 164 | /// 165 | /// ```rust 166 | /// use std::str; 167 | /// 168 | /// use xml::reader::XmlEvent as ReaderEvent; 169 | /// use xml::writer::XmlEvent as WriterEvent; 170 | /// use xml::{EventReader, EventWriter}; 171 | /// 172 | /// let mut input: &[u8] = b"<hello>world</hello>"; 173 | /// let mut output: Vec<u8> = Vec::new(); 174 | /// 175 | /// { 176 | /// let mut reader = EventReader::new(&mut input); 177 | /// let mut writer = EventWriter::new(&mut output); 178 | /// 179 | /// for e in reader { 180 | /// match e.unwrap() { 181 | /// ReaderEvent::Characters(s) => 182 | /// writer.write(WriterEvent::characters(&s.to_uppercase())).unwrap(), 183 | /// e => if let Some(e) = e.as_writer_event() { 184 | /// writer.write(e).unwrap() 185 | /// } 186 | /// } 187 | /// } 188 | /// } 189 | /// 190 | /// assert_eq!( 191 | /// str::from_utf8(&output).unwrap(), 192 | /// r#"<?xml version="1.0" encoding="UTF-8"?><hello>WORLD</hello>"# 193 | /// ); 194 | /// ``` 195 | /// 196 | /// Note that this API may change or get additions in future to improve its ergonomics. 
197 | #[must_use] 198 | pub fn as_writer_event(&self) -> Option<crate::writer::events::XmlEvent<'_>> { 199 | match self { 200 | Self::StartDocument { version, encoding, standalone } => 201 | Some(crate::writer::events::XmlEvent::StartDocument { 202 | version: *version, 203 | encoding: Some(encoding), 204 | standalone: *standalone 205 | }), 206 | Self::ProcessingInstruction { name, data } => 207 | Some(crate::writer::events::XmlEvent::ProcessingInstruction { 208 | name, 209 | data: data.as_ref().map(|s| &**s) 210 | }), 211 | Self::StartElement { name, attributes, namespace } => 212 | Some(crate::writer::events::XmlEvent::StartElement { 213 | name: name.borrow(), 214 | attributes: attributes.iter().map(|a| a.borrow()).collect(), 215 | namespace: namespace.borrow(), 216 | }), 217 | Self::EndElement { name } => 218 | Some(crate::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }), 219 | Self::Comment(data) => Some(crate::writer::events::XmlEvent::Comment(data)), 220 | Self::CData(data) => Some(crate::writer::events::XmlEvent::CData(data)), 221 | Self::Characters(data) | 222 | Self::Whitespace(data) => Some(crate::writer::events::XmlEvent::Characters(data)), 223 | Self::Doctype { syntax } => Some(crate::writer::events::XmlEvent::Doctype(syntax)), 224 | Self::EndDocument => None, 225 | } 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /src/reader/parser/outside_tag.rs: -------------------------------------------------------------------------------- 1 | use crate::common::is_whitespace_char; 2 | use crate::reader::error::SyntaxError; 3 | use crate::reader::events::XmlEvent; 4 | use crate::reader::lexer::Token; 5 | 6 | use super::{ 7 | ClosingTagSubstate, DoctypeSubstate, Encountered, OpeningTagSubstate, 8 | ProcessingInstructionSubstate, PullParser, Result, State, 9 | }; 10 | 11 | impl PullParser { 12 | pub fn outside_tag(&mut self, t: Token) -> Option { 13 | match t { 14 | Token::Character(c) => { 15 | if is_whitespace_char(c) { 16 | // 
skip whitespace outside of the root element 17 | if (self.config.trim_whitespace && self.buf.is_empty()) || 18 | (self.depth() == 0 && self.config.ignore_root_level_whitespace) { 19 | return None; 20 | } 21 | } else { 22 | self.inside_whitespace = false; 23 | if self.depth() == 0 { 24 | return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); 25 | } 26 | } 27 | 28 | if !self.is_valid_xml_char_not_restricted(c) { 29 | return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))); 30 | } 31 | 32 | if self.buf.is_empty() { 33 | self.push_pos(); 34 | } else if self.buf.len() > self.config.max_data_length { 35 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); 36 | } 37 | self.buf.push(c); 38 | None 39 | }, 40 | 41 | Token::CommentEnd | Token::TagEnd | Token::EqualsSign | 42 | Token::DoubleQuote | Token::SingleQuote | 43 | Token::ProcessingInstructionEnd | Token::EmptyTagEnd => { 44 | if self.depth() == 0 { 45 | return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); 46 | } 47 | self.inside_whitespace = false; 48 | 49 | if let Some(s) = t.as_static_str() { 50 | if self.buf.is_empty() { 51 | self.push_pos(); 52 | } else if self.buf.len() > self.config.max_data_length { 53 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); 54 | } 55 | 56 | self.buf.push_str(s); 57 | } 58 | None 59 | }, 60 | 61 | Token::ReferenceStart if self.depth() > 0 => { 62 | self.state_after_reference = State::OutsideTag; 63 | self.into_state_continue(State::InsideReference) 64 | }, 65 | 66 | Token::ReferenceEnd if self.depth() > 0 => { // Semi-colon in a text outside an entity 67 | self.inside_whitespace = false; 68 | if self.buf.len() > self.config.max_data_length { 69 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); 70 | } 71 | Token::ReferenceEnd.push_to_string(&mut self.buf); 72 | None 73 | }, 74 | 75 | Token::CommentStart if self.config.coalesce_characters && self.config.ignore_comments => { 76 | let next_event = 
self.set_encountered(Encountered::Comment); 77 | // We need to switch the lexer into a comment mode inside comments 78 | self.into_state(State::InsideComment, next_event) 79 | } 80 | 81 | Token::CDataStart if self.depth() > 0 && self.config.coalesce_characters && self.config.cdata_to_characters => { 82 | if self.buf.is_empty() { 83 | self.push_pos(); // CDataEnd will pop pos if the buffer remains empty 84 | } 85 | // if coalescing chars, continue without event 86 | self.into_state_continue(State::InsideCData) 87 | }, 88 | 89 | _ => { 90 | // Encountered some markup event, flush the buffer as characters 91 | // or a whitespace 92 | let mut next_event = if self.buf_has_data() { 93 | let buf = self.take_buf(); 94 | if self.inside_whitespace && self.config.trim_whitespace { 95 | // there will be no event emitted for this, but start of buffering has pushed a pos 96 | self.next_pos(); 97 | None 98 | } else if self.inside_whitespace && !self.config.whitespace_to_characters { 99 | debug_assert!(buf.chars().all(|ch| ch.is_whitespace()), "ws={buf:?}"); 100 | Some(Ok(XmlEvent::Whitespace(buf))) 101 | } else if self.config.trim_whitespace { 102 | Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into()))) 103 | } else { 104 | Some(Ok(XmlEvent::Characters(buf))) 105 | } 106 | } else { None }; 107 | self.inside_whitespace = true; // Reset inside_whitespace flag 108 | 109 | // pos is popped whenever an event is emitted, so pushes must happen only if there will be an event to balance it 110 | // and ignored comments don't pop 111 | if t != Token::CommentStart || !self.config.ignore_comments { 112 | self.push_pos(); 113 | } 114 | match t { 115 | Token::OpeningTagStart if self.depth() > 0 || self.encountered < Encountered::Element || self.config.allow_multiple_root_elements => { 116 | if let Some(e) = self.set_encountered(Encountered::Element) { 117 | next_event = Some(e); 118 | } 119 | self.nst.push_empty(); 120 | 
self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) 121 | }, 122 | 123 | Token::ClosingTagStart if self.depth() > 0 => 124 | self.into_state(State::InsideClosingTag(ClosingTagSubstate::CTInsideName), next_event), 125 | 126 | Token::CommentStart => { 127 | if let Some(e) = self.set_encountered(Encountered::Comment) { 128 | next_event = Some(e); 129 | } 130 | // We need to switch the lexer into a comment mode inside comments 131 | self.into_state(State::InsideComment, next_event) 132 | }, 133 | 134 | Token::DoctypeStart if self.encountered < Encountered::Doctype => { 135 | if let Some(e) = self.set_encountered(Encountered::Doctype) { 136 | next_event = Some(e); 137 | } 138 | self.data.doctype = Some(Token::DoctypeStart.to_string()); 139 | 140 | self.push_pos(); 141 | self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event) 142 | }, 143 | 144 | Token::ProcessingInstructionStart => 145 | self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event), 146 | 147 | Token::CDataStart if self.depth() > 0 => { 148 | self.into_state(State::InsideCData, next_event) 149 | }, 150 | 151 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))), 152 | } 153 | }, 154 | } 155 | } 156 | 157 | pub fn document_start(&mut self, t: Token) -> Option { 158 | debug_assert!(self.encountered < Encountered::Declaration); 159 | 160 | match t { 161 | Token::Character(c) => { 162 | let next_event = self.set_encountered(Encountered::AnyChars); 163 | 164 | if !is_whitespace_char(c) { 165 | return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); 166 | } 167 | self.inside_whitespace = true; 168 | 169 | // skip whitespace outside of the root element 170 | if (self.config.trim_whitespace && self.buf.is_empty()) || 171 | (self.depth() == 0 && self.config.ignore_root_level_whitespace) { 172 | return self.into_state(State::OutsideTag, next_event); 173 | } 174 | 175 | self.push_pos(); 176 | 
self.buf.push(c); 177 | self.into_state(State::OutsideTag, next_event) 178 | }, 179 | 180 | Token::CommentStart => { 181 | let next_event = self.set_encountered(Encountered::Comment); 182 | self.into_state(State::InsideComment, next_event) 183 | }, 184 | 185 | Token::OpeningTagStart => { 186 | let next_event = self.set_encountered(Encountered::Element); 187 | self.nst.push_empty(); 188 | self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) 189 | }, 190 | 191 | Token::DoctypeStart => { 192 | let next_event = self.set_encountered(Encountered::Doctype); 193 | self.data.doctype = Some(Token::DoctypeStart.to_string()); 194 | 195 | self.push_pos(); 196 | self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event) 197 | }, 198 | 199 | Token::ProcessingInstructionStart => { 200 | self.push_pos(); 201 | self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName)) 202 | }, 203 | 204 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))), 205 | } 206 | } 207 | } 208 | -------------------------------------------------------------------------------- /src/reader/parser/inside_declaration.rs: -------------------------------------------------------------------------------- 1 | use crate::common::{is_whitespace_char, XmlVersion}; 2 | use crate::reader::error::SyntaxError; 3 | use crate::reader::events::XmlEvent; 4 | use crate::reader::lexer::Token; 5 | use crate::util::Encoding; 6 | 7 | use super::{ 8 | DeclarationSubstate, Encountered, PullParser, QualifiedNameTarget, Result, State, 9 | DEFAULT_VERSION, 10 | }; 11 | 12 | impl PullParser { 13 | #[inline(never)] 14 | fn emit_start_document(&mut self) -> Option { 15 | debug_assert!(self.encountered == Encountered::None); 16 | self.encountered = Encountered::Declaration; 17 | 18 | let version = self.data.version; 19 | let encoding = self.data.take_encoding(); 20 | let standalone = self.data.standalone; 21 | 22 | if let Some(new_encoding) = 
encoding.as_deref() { 23 | let new_encoding = match new_encoding.parse() { 24 | Ok(e) => e, 25 | Err(_) if self.config.ignore_invalid_encoding_declarations => Encoding::Latin1, 26 | Err(_) => return Some(self.error(SyntaxError::UnsupportedEncoding(new_encoding.into()))), 27 | }; 28 | let current_encoding = self.lexer.encoding(); 29 | if current_encoding != new_encoding { 30 | let set = match (current_encoding, new_encoding) { 31 | (Encoding::Unknown | Encoding::Default, new) if new != Encoding::Utf16 => new, 32 | (Encoding::Utf16Be | Encoding::Utf16Le, Encoding::Utf16) => current_encoding, 33 | _ if self.config.ignore_invalid_encoding_declarations => current_encoding, 34 | _ => return Some(self.error(SyntaxError::ConflictingEncoding(new_encoding, current_encoding))), 35 | }; 36 | self.lexer.set_encoding(set); 37 | } 38 | } 39 | 40 | let current_encoding = self.lexer.encoding(); 41 | self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument { 42 | version: version.unwrap_or(DEFAULT_VERSION), 43 | encoding: encoding.unwrap_or_else(move || current_encoding.to_string()), 44 | standalone 45 | })) 46 | } 47 | 48 | // TODO: remove redundancy via macros or extra methods 49 | pub fn inside_declaration(&mut self, t: Token, s: DeclarationSubstate) -> Option { 50 | 51 | match s { 52 | DeclarationSubstate::BeforeVersion => match t { 53 | Token::Character('v') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersion)), 54 | Token::Character(c) if is_whitespace_char(c) => None, // continue 55 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))), 56 | }, 57 | 58 | DeclarationSubstate::InsideVersion => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { 59 | match &*name.local_name { 60 | "ersion" if name.namespace.is_none() => 61 | this.into_state_continue(State::InsideDeclaration( 62 | if token == Token::EqualsSign { 63 | DeclarationSubstate::InsideVersionValue 64 | } else { 65 | 
DeclarationSubstate::AfterVersion 66 | } 67 | )), 68 | _ => Some(this.error(SyntaxError::UnexpectedNameInsideXml(name.to_string().into()))), 69 | } 70 | }), 71 | 72 | DeclarationSubstate::AfterVersion => match t { 73 | Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersionValue)), 74 | Token::Character(c) if is_whitespace_char(c) => None, 75 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))), 76 | }, 77 | 78 | DeclarationSubstate::InsideVersionValue => self.read_attribute_value(t, |this, value| { 79 | this.data.version = match &*value { 80 | "1.0" => Some(XmlVersion::Version10), 81 | "1.1" => Some(XmlVersion::Version11), 82 | _ => None 83 | }; 84 | if this.data.version.is_some() { 85 | this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterVersionValue)) 86 | } else { 87 | Some(this.error(SyntaxError::UnexpectedXmlVersion(value.into()))) 88 | } 89 | }), 90 | 91 | DeclarationSubstate::AfterVersionValue => match t { 92 | Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeEncoding)), 93 | Token::ProcessingInstructionEnd => self.emit_start_document(), 94 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))), 95 | }, 96 | 97 | DeclarationSubstate::BeforeEncoding => match t { 98 | Token::Character('e') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncoding)), 99 | Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), 100 | Token::ProcessingInstructionEnd => self.emit_start_document(), 101 | Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace 102 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))), 103 | }, 104 | 105 | DeclarationSubstate::InsideEncoding => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { 106 | match &*name.local_name { 107 | "ncoding" 
if name.namespace.is_none() => 108 | this.into_state_continue(State::InsideDeclaration( 109 | if token == Token::EqualsSign { DeclarationSubstate::InsideEncodingValue } else { DeclarationSubstate::AfterEncoding } 110 | )), 111 | _ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into()))) 112 | } 113 | }), 114 | 115 | DeclarationSubstate::AfterEncoding => match t { 116 | Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncodingValue)), 117 | Token::Character(c) if is_whitespace_char(c) => None, 118 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))), 119 | }, 120 | 121 | DeclarationSubstate::InsideEncodingValue => self.read_attribute_value(t, |this, value| { 122 | this.data.encoding = Some(value); 123 | this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterEncodingValue)) 124 | }), 125 | 126 | DeclarationSubstate::AfterEncodingValue => match t { 127 | Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl)), 128 | Token::ProcessingInstructionEnd => self.emit_start_document(), 129 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))), 130 | }, 131 | 132 | DeclarationSubstate::BeforeStandaloneDecl => match t { 133 | Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), 134 | Token::ProcessingInstructionEnd => self.emit_start_document(), 135 | Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace 136 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))), 137 | }, 138 | 139 | DeclarationSubstate::InsideStandaloneDecl => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { 140 | match &*name.local_name { 141 | "tandalone" if name.namespace.is_none() => 142 | this.into_state_continue(State::InsideDeclaration( 143 | if token == Token::EqualsSign { 144 | 
DeclarationSubstate::InsideStandaloneDeclValue 145 | } else { 146 | DeclarationSubstate::AfterStandaloneDecl 147 | } 148 | )), 149 | _ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into()))), 150 | } 151 | }), 152 | 153 | DeclarationSubstate::AfterStandaloneDecl => match t { 154 | Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDeclValue)), 155 | Token::Character(c) if is_whitespace_char(c) => None, 156 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))), 157 | }, 158 | 159 | DeclarationSubstate::InsideStandaloneDeclValue => self.read_attribute_value(t, |this, value| { 160 | let standalone = match &*value { 161 | "yes" => Some(true), 162 | "no" => Some(false), 163 | _ => None 164 | }; 165 | if standalone.is_some() { 166 | this.data.standalone = standalone; 167 | this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterStandaloneDeclValue)) 168 | } else { 169 | Some(this.error(SyntaxError::InvalidStandaloneDeclaration(value.into()))) 170 | } 171 | }), 172 | 173 | DeclarationSubstate::AfterStandaloneDeclValue => match t { 174 | Token::ProcessingInstructionEnd => self.emit_start_document(), 175 | Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace 176 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))), 177 | }, 178 | } 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /src/writer/events.rs: -------------------------------------------------------------------------------- 1 | //! Contains `XmlEvent` datatype, instances of which are consumed by the writer. 2 | 3 | use std::borrow::Cow; 4 | 5 | use crate::attribute::Attribute; 6 | use crate::common::XmlVersion; 7 | use crate::name::Name; 8 | use crate::namespace::{Namespace, NS_NO_PREFIX}; 9 | use crate::reader::ErrorKind; 10 | 11 | /// A part of an XML output stream. 12 | /// 13 | /// Objects of this enum are consumed by `EventWriter`. 
They correspond to different parts of 14 | /// an XML document. 15 | #[derive(Debug, Clone)] 16 | #[non_exhaustive] 17 | pub enum XmlEvent<'a> { 18 | /// Corresponds to XML document declaration. 19 | /// 20 | /// This event should always be written before any other event. If it is not written 21 | /// at all, a default XML declaration will be outputted if the corresponding option 22 | /// is set in the configuration. Otherwise an error will be returned. 23 | StartDocument { 24 | /// XML version. 25 | /// 26 | /// Defaults to `XmlVersion::Version10`. 27 | version: XmlVersion, 28 | 29 | /// XML document encoding. 30 | /// 31 | /// Defaults to `Some("UTF-8")`. 32 | encoding: Option<&'a str>, 33 | 34 | /// XML standalone declaration. 35 | /// 36 | /// Defaults to `None`. 37 | standalone: Option, 38 | }, 39 | 40 | /// Denotes an XML processing instruction. 41 | ProcessingInstruction { 42 | /// Processing instruction target. 43 | name: &'a str, 44 | 45 | /// Processing instruction content. 46 | data: Option<&'a str>, 47 | }, 48 | 49 | /// Denotes a beginning of an XML element. 50 | StartElement { 51 | /// Qualified name of the element. 52 | name: Name<'a>, 53 | 54 | /// A list of attributes associated with the element. 55 | /// 56 | /// Currently attributes are not checked for duplicates (TODO). Attribute values 57 | /// will be escaped, and all characters invalid for attribute values like `"` or `<` 58 | /// will be changed into character entities. 59 | attributes: Cow<'a, [Attribute<'a>]>, 60 | 61 | /// Contents of the namespace mapping at this point of the document. 62 | /// 63 | /// This mapping will be inspected for "new" entries, and if at this point of the document 64 | /// a particular pair of prefix and namespace URI is already defined, no namespace 65 | /// attributes will be emitted. 66 | namespace: Cow<'a, Namespace>, 67 | }, 68 | 69 | /// Denotes an end of an XML element. 70 | EndElement { 71 | /// Optional qualified name of the element. 
72 | /// 73 | /// If `None`, then it is assumed that the element name should be the last valid one. 74 | /// If `Some` and element names tracking is enabled, then the writer will check it for 75 | /// correctness. 76 | name: Option>, 77 | }, 78 | 79 | /// Denotes CDATA content. 80 | /// 81 | /// This event contains unparsed data, and no escaping will be performed when writing it 82 | /// to the output stream. 83 | CData(&'a str), 84 | 85 | /// Denotes a comment. 86 | /// 87 | /// The string will be checked for invalid sequences and error will be returned by the 88 | /// write operation 89 | Comment(&'a str), 90 | 91 | /// Denotes character data outside of tags. 92 | /// 93 | /// Contents of this event will be escaped if `perform_escaping` option is enabled, 94 | /// that is, every character invalid for PCDATA will appear as a character entity. 95 | Characters(&'a str), 96 | 97 | /// Emits raw characters which will never be escaped. 98 | /// 99 | /// This event is only used for writing to an output stream, there is no equivalent 100 | /// reader event. Care must be taken when using this event, as it can easily result 101 | /// non-well-formed documents. 102 | RawCharacters(&'a str), 103 | 104 | /// Syntax of the `DOCTYPE`, everyhing including `<` and `>` 105 | Doctype(&'a str), 106 | } 107 | 108 | impl<'a> XmlEvent<'a> { 109 | /// Returns an writer event for a processing instruction. 110 | #[inline] 111 | #[must_use] 112 | pub const fn processing_instruction(name: &'a str, data: Option<&'a str>) -> Self { 113 | XmlEvent::ProcessingInstruction { name, data } 114 | } 115 | 116 | /// Returns a builder for a starting element. 117 | /// 118 | /// This builder can then be used to tweak attributes and namespace starting at 119 | /// this element. 
120 | #[inline] 121 | pub fn start_element(name: S) -> StartElementBuilder<'a> where S: Into> { 122 | StartElementBuilder { 123 | name: name.into(), 124 | attributes: Vec::new(), 125 | namespace: Namespace::empty(), 126 | } 127 | } 128 | 129 | /// Returns a builder for an closing element. 130 | /// 131 | /// This method, unline `start_element()`, does not accept a name because by default 132 | /// the writer is able to determine it automatically. However, when this functionality 133 | /// is disabled, it is possible to specify the name with `name()` method on the builder. 134 | #[inline] 135 | #[must_use] 136 | pub const fn end_element() -> EndElementBuilder<'a> { 137 | EndElementBuilder { name: None } 138 | } 139 | 140 | /// Returns a CDATA event. 141 | /// 142 | /// Naturally, the provided string won't be escaped, except for closing CDATA token `]]>` 143 | /// (depending on the configuration). 144 | #[inline] 145 | #[must_use] 146 | pub const fn cdata(data: &'a str) -> Self { 147 | XmlEvent::CData(data) 148 | } 149 | 150 | /// Returns a regular characters (PCDATA) event. 151 | /// 152 | /// All offending symbols, in particular, `&` and `<`, will be escaped by the writer. 153 | #[inline] 154 | #[must_use] 155 | pub const fn characters(data: &'a str) -> Self { 156 | XmlEvent::Characters(data) 157 | } 158 | 159 | /// Returns a raw characters event. 160 | /// 161 | /// No escaping takes place. 162 | /// This event is only used for writing to an output stream, there is no equivalent 163 | /// reader event. Care must be taken when using this event, as it can easily result 164 | /// non-well-formed documents. 165 | #[inline] 166 | #[must_use] 167 | pub const fn raw_characters(data: &'a str) -> Self { 168 | XmlEvent::RawCharacters(data) 169 | } 170 | 171 | /// Returns a comment event. 
172 | #[inline] 173 | #[must_use] 174 | pub const fn comment(data: &'a str) -> Self { 175 | XmlEvent::Comment(data) 176 | } 177 | } 178 | 179 | impl<'a> From<&'a str> for XmlEvent<'a> { 180 | #[inline] 181 | fn from(s: &'a str) -> Self { 182 | XmlEvent::Characters(s) 183 | } 184 | } 185 | 186 | /// A builder for a closing element event. 187 | pub struct EndElementBuilder<'a> { 188 | name: Option>, 189 | } 190 | 191 | /// A builder for a closing element event. 192 | impl<'a> EndElementBuilder<'a> { 193 | /// Sets the name of this closing element. 194 | /// 195 | /// Usually the writer is able to determine closing element names automatically. If 196 | /// this functionality is enabled (by default it is), then this name is checked for correctness. 197 | /// It is possible, however, to disable such behavior; then the user must ensure that 198 | /// closing element name is correct manually. 199 | #[inline] 200 | #[must_use] 201 | pub fn name(mut self, name: N) -> Self where N: Into> { 202 | self.name = Some(name.into()); 203 | self 204 | } 205 | } 206 | 207 | impl<'a> From> for XmlEvent<'a> { 208 | fn from(b: EndElementBuilder<'a>) -> Self { 209 | XmlEvent::EndElement { name: b.name } 210 | } 211 | } 212 | 213 | /// A builder for a starting element event. 214 | pub struct StartElementBuilder<'a> { 215 | name: Name<'a>, 216 | attributes: Vec>, 217 | namespace: Namespace, 218 | } 219 | 220 | impl<'a> StartElementBuilder<'a> { 221 | /// Sets an attribute value of this element to the given string. 222 | /// 223 | /// This method can be used to add attributes to the starting element. Name is a qualified 224 | /// name; its namespace is ignored, but its prefix is checked for correctness, that is, 225 | /// it is checked that the prefix is bound to some namespace in the current context. 226 | /// 227 | /// Currently attributes are not checked for duplicates. Note that duplicate attributes 228 | /// are a violation of XML document well-formedness. 
229 | /// 230 | /// The writer checks that you don't specify reserved prefix names, for example `xmlns`. 231 | #[inline] 232 | #[must_use] 233 | pub fn attr(mut self, name: N, value: &'a str) -> Self 234 | where N: Into> { 235 | self.attributes.push(Attribute::new(name.into(), value)); 236 | self 237 | } 238 | 239 | /// Adds a namespace to the current namespace context. 240 | /// 241 | /// If no namespace URI was bound to the provided prefix at this point of the document, 242 | /// then the mapping from the prefix to the provided namespace URI will be written as 243 | /// a part of this element attribute set. 244 | /// 245 | /// If the same namespace URI was bound to the provided prefix at this point of the document, 246 | /// then no namespace attributes will be emitted. 247 | /// 248 | /// If some other namespace URI was bound to the provided prefix at this point of the document, 249 | /// then another binding will be added as a part of this element attribute set, shadowing 250 | /// the outer binding. 251 | #[inline] 252 | #[must_use] 253 | pub fn ns(mut self, prefix: S1, uri: S2) -> Self 254 | where S1: Into, S2: Into 255 | { 256 | self.namespace.put(prefix, uri); 257 | self 258 | } 259 | 260 | /// Adds a default namespace mapping to the current namespace context. 261 | /// 262 | /// Same rules as for `ns()` are also valid for the default namespace mapping. 
263 | #[inline] 264 | #[must_use] 265 | pub fn default_ns(mut self, uri: S) -> Self 266 | where S: Into { 267 | self.namespace.put(NS_NO_PREFIX, uri); 268 | self 269 | } 270 | } 271 | 272 | impl<'a> From> for XmlEvent<'a> { 273 | #[inline] 274 | fn from(b: StartElementBuilder<'a>) -> Self { 275 | XmlEvent::StartElement { 276 | name: b.name, 277 | attributes: Cow::Owned(b.attributes), 278 | namespace: Cow::Owned(b.namespace), 279 | } 280 | } 281 | } 282 | 283 | impl<'a> TryFrom<&'a crate::reader::XmlEvent> for XmlEvent<'a> { 284 | type Error = crate::reader::Error; 285 | 286 | fn try_from(event: &crate::reader::XmlEvent) -> Result, Self::Error> { 287 | Ok(event.as_writer_event().ok_or(ErrorKind::UnexpectedEof)?) 288 | } 289 | } 290 | -------------------------------------------------------------------------------- /src/name.rs: -------------------------------------------------------------------------------- 1 | //! Contains XML qualified names manipulation types and functions. 2 | 3 | use std::fmt; 4 | use std::str::FromStr; 5 | 6 | use crate::namespace::NS_NO_PREFIX; 7 | 8 | /// Represents a qualified XML name. 9 | /// 10 | /// A qualified name always consists at least of a local name. It can optionally contain 11 | /// a prefix; when reading an XML document, if it contains a prefix, it must also contain a 12 | /// namespace URI, but this is not enforced statically; see below. The name can contain a 13 | /// namespace without a prefix; in that case a default, empty prefix is assumed. 14 | /// 15 | /// When writing XML documents, it is possible to omit the namespace URI, leaving only 16 | /// the prefix. In this case the writer will check that the specifed prefix is bound to some 17 | /// URI in the current namespace context. If both prefix and namespace URI are specified, 18 | /// it is checked that the current namespace context contains this exact correspondence 19 | /// between prefix and namespace URI. 
20 | /// 21 | /// # Prefixes and URIs 22 | /// 23 | /// A qualified name with a prefix must always contain a proper namespace URI --- names with 24 | /// a prefix but without a namespace associated with that prefix are meaningless. However, 25 | /// it is impossible to obtain proper namespace URI by a prefix without a context, and such 26 | /// context is only available when parsing a document (or it can be constructed manually 27 | /// when writing a document). Tying a name to a context statically seems impractical. This 28 | /// may change in future, though. 29 | /// 30 | /// # Conversions 31 | /// 32 | /// `Name` implements some `From` instances for conversion from strings and tuples. For example: 33 | /// 34 | /// ```rust 35 | /// # use xml::name::Name; 36 | /// let n1: Name = "p:some-name".into(); 37 | /// let n2: Name = ("p", "some-name").into(); 38 | /// 39 | /// assert_eq!(n1, n2); 40 | /// assert_eq!(n1.local_name, "some-name"); 41 | /// assert_eq!(n1.prefix, Some("p")); 42 | /// assert!(n1.namespace.is_none()); 43 | /// ``` 44 | /// 45 | /// This is added to support easy specification of XML elements when writing XML documents. 46 | #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] 47 | pub struct Name<'a> { 48 | /// A local name, e.g. `string` in `xsi:string`. 49 | pub local_name: &'a str, 50 | 51 | /// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`. 52 | pub namespace: Option<&'a str>, 53 | 54 | /// A name prefix, e.g. `xsi` in `xsi:string`. 
55 | pub prefix: Option<&'a str>, 56 | } 57 | 58 | impl<'a> From<&'a str> for Name<'a> { 59 | fn from(s: &'a str) -> Self { 60 | if let Some((prefix, name)) = s.split_once(':') { 61 | Name::prefixed(name, prefix) 62 | } else { 63 | Name::local(s) 64 | } 65 | } 66 | } 67 | 68 | impl<'a> From<(&'a str, &'a str)> for Name<'a> { 69 | fn from((prefix, name): (&'a str, &'a str)) -> Self { 70 | Name::prefixed(name, prefix) 71 | } 72 | } 73 | 74 | impl fmt::Display for Name<'_> { 75 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 76 | if let Some(namespace) = self.namespace { 77 | write!(f, "{{{namespace}}}")?; 78 | } 79 | 80 | if let Some(prefix) = self.prefix { 81 | write!(f, "{prefix}:")?; 82 | } 83 | 84 | f.write_str(self.local_name) 85 | } 86 | } 87 | 88 | impl<'a> Name<'a> { 89 | /// Returns an owned variant of the qualified name. 90 | #[must_use] 91 | pub fn to_owned(&self) -> OwnedName { 92 | OwnedName { 93 | local_name: self.local_name.into(), 94 | namespace: self.namespace.map(std::convert::Into::into), 95 | prefix: self.prefix.map(std::convert::Into::into), 96 | } 97 | } 98 | 99 | /// Returns a new `Name` instance representing plain local name. 100 | #[inline] 101 | #[must_use] 102 | pub const fn local(local_name: &str) -> Name<'_> { 103 | Name { 104 | local_name, 105 | prefix: None, 106 | namespace: None, 107 | } 108 | } 109 | 110 | /// Returns a new `Name` instance with the given local name and prefix. 111 | #[inline] 112 | #[must_use] 113 | pub const fn prefixed(local_name: &'a str, prefix: &'a str) -> Self { 114 | Name { 115 | local_name, 116 | namespace: None, 117 | prefix: Some(prefix), 118 | } 119 | } 120 | 121 | /// Returns a new `Name` instance representing a qualified name with or without a prefix and 122 | /// with a namespace URI. 
123 | #[inline] 124 | #[must_use] 125 | pub const fn qualified(local_name: &'a str, namespace: &'a str, prefix: Option<&'a str>) -> Self { 126 | Name { 127 | local_name, 128 | namespace: Some(namespace), 129 | prefix, 130 | } 131 | } 132 | 133 | /// Returns a correct XML representation of this local name and prefix. 134 | /// 135 | /// This method is different from the autoimplemented `to_string()` because it does not 136 | /// include namespace URI in the result. 137 | #[must_use] 138 | pub fn to_repr(&self) -> String { 139 | self.repr_display().to_string() 140 | } 141 | 142 | /// Returns a structure which can be displayed with `std::fmt` machinery to obtain this 143 | /// local name and prefix. 144 | /// 145 | /// This method is needed for efficiency purposes in order not to create unnecessary 146 | /// allocations. 147 | #[inline] 148 | #[must_use] 149 | pub const fn repr_display(&self) -> ReprDisplay<'_, '_> { 150 | ReprDisplay(self) 151 | } 152 | 153 | /// Returns either a prefix of this name or `namespace::NS_NO_PREFIX` constant. 154 | #[inline] 155 | #[must_use] 156 | pub fn prefix_repr(&self) -> &str { 157 | self.prefix.unwrap_or(NS_NO_PREFIX) 158 | } 159 | } 160 | 161 | /// A wrapper around `Name` whose `Display` implementation prints the wrapped name as it is 162 | /// displayed in an XML document. 163 | pub struct ReprDisplay<'a, 'b>(&'a Name<'b>); 164 | 165 | impl<'a, 'b: 'a> fmt::Display for ReprDisplay<'a, 'b> { 166 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 167 | match self.0.prefix { 168 | Some(prefix) => write!(f, "{}:{}", prefix, self.0.local_name), 169 | None => self.0.local_name.fmt(f), 170 | } 171 | } 172 | } 173 | 174 | /// An owned variant of `Name`. 175 | /// 176 | /// Everything about `Name` applies to this structure as well. 177 | #[derive(Clone, PartialEq, Eq, Hash, Debug)] 178 | pub struct OwnedName { 179 | /// A local name, e.g. `string` in `xsi:string`. 
180 | pub local_name: String, 181 | 182 | /// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`. 183 | pub namespace: Option, 184 | 185 | /// A name prefix, e.g. `xsi` in `xsi:string`. 186 | pub prefix: Option, 187 | } 188 | 189 | impl fmt::Display for OwnedName { 190 | #[inline] 191 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 192 | fmt::Display::fmt(&self.borrow(), f) 193 | } 194 | } 195 | 196 | impl OwnedName { 197 | /// Constructs a borrowed `Name` based on this owned name. 198 | #[must_use] 199 | #[inline] 200 | pub fn borrow(&self) -> Name<'_> { 201 | Name { 202 | local_name: &self.local_name, 203 | namespace: self.namespace.as_deref(), 204 | prefix: self.prefix.as_deref(), 205 | } 206 | } 207 | 208 | /// Returns a new `OwnedName` instance representing a plain local name. 209 | #[inline] 210 | pub fn local(local_name: S) -> Self where S: Into { 211 | Self { 212 | local_name: local_name.into(), 213 | namespace: None, 214 | prefix: None, 215 | } 216 | } 217 | 218 | /// Returns a new `OwnedName` instance representing a qualified name with or without 219 | /// a prefix and with a namespace URI. 220 | #[inline] 221 | pub fn qualified(local_name: S1, namespace: S2, prefix: Option) -> Self 222 | where S1: Into, S2: Into, S3: Into 223 | { 224 | Self { 225 | local_name: local_name.into(), 226 | namespace: Some(namespace.into()), 227 | prefix: prefix.map(std::convert::Into::into), 228 | } 229 | } 230 | 231 | /// Returns an optional prefix by reference, equivalent to `self.borrow().prefix` 232 | /// but avoids extra work. 233 | #[inline] 234 | #[must_use] 235 | pub fn prefix_ref(&self) -> Option<&str> { 236 | self.prefix.as_deref() 237 | } 238 | 239 | /// Returns an optional namespace by reference, equivalen to `self.borrow().namespace` 240 | /// but avoids extra work. 
241 | #[inline] 242 | #[must_use] 243 | pub fn namespace_ref(&self) -> Option<&str> { 244 | self.namespace.as_deref() 245 | } 246 | } 247 | 248 | impl<'a> From> for OwnedName { 249 | #[inline] 250 | fn from(n: Name<'a>) -> Self { 251 | n.to_owned() 252 | } 253 | } 254 | 255 | impl FromStr for OwnedName { 256 | type Err = (); 257 | 258 | /// Parses the given string slice into a qualified name. 259 | /// 260 | /// This function, when finishes sucessfully, always return a qualified 261 | /// name without a namespace (`name.namespace == None`). It should be filled later 262 | /// using proper `NamespaceStack`. 263 | /// 264 | /// It is supposed that all characters in the argument string are correct 265 | /// as defined by the XML specification. No additional checks except a check 266 | /// for emptiness are done. 267 | fn from_str(s: &str) -> Result { 268 | let mut it = s.split(':'); 269 | 270 | let r = match (it.next(), it.next(), it.next()) { 271 | (Some(prefix), Some(local_name), None) if !prefix.is_empty() && 272 | !local_name.is_empty() => 273 | Some((local_name.into(), Some(prefix.into()))), 274 | (Some(local_name), None, None) if !local_name.is_empty() => 275 | Some((local_name.into(), None)), 276 | (_, _, _) => None 277 | }; 278 | r.map(|(local_name, prefix)| Self { 279 | local_name, 280 | namespace: None, 281 | prefix 282 | }).ok_or(()) 283 | } 284 | } 285 | 286 | #[cfg(test)] 287 | mod tests { 288 | use super::OwnedName; 289 | 290 | #[test] 291 | fn test_owned_name_from_str() { 292 | assert_eq!("prefix:name".parse(), Ok(OwnedName { 293 | local_name: "name".into(), 294 | namespace: None, 295 | prefix: Some("prefix".into()) 296 | })); 297 | 298 | assert_eq!("name".parse(), Ok(OwnedName { 299 | local_name: "name".into(), 300 | namespace: None, 301 | prefix: None 302 | })); 303 | 304 | assert_eq!("".parse(), Err::(())); 305 | assert_eq!(":".parse(), Err::(())); 306 | assert_eq!(":a".parse(), Err::(())); 307 | assert_eq!("a:".parse(), Err::(())); 308 | 
assert_eq!("a:b:c".parse(), Err::(())); 309 | } 310 | } 311 | --------------------------------------------------------------------------------