├── .gitignore
├── .travis.yml
├── tests
│   ├── lib.rs
│   ├── document_test.rs
│   ├── fixtures
│   │   └── sample.xml
│   ├── xml_document_test.rs
│   ├── element_test.rs
│   ├── selector_test.rs
│   └── querying_by_selectors_test.rs
├── Cargo.toml
├── Cargo.lock
├── LICENSE
├── README.md
└── src
    ├── document
    │   └── mod.rs
    ├── selector.rs
    └── lib.rs
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled files
2 | *.o
3 | *.so
4 | *.rlib
5 | *.dll
6 |
7 | # Executables
8 | *.exe
9 |
10 | # Generated by Cargo
11 | /target/
12 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: rust
2 |
3 | rust:
4 | - stable
5 | - beta
6 | - nightly
7 |
8 | matrix:
9 | allow_failures:
10 | - rust: nightly
11 | - rust: beta
12 |
--------------------------------------------------------------------------------
/tests/lib.rs:
--------------------------------------------------------------------------------
1 | extern crate rquery;
2 |
3 | #[cfg(test)]
4 | mod element_test;
5 |
6 | #[cfg(test)]
7 | mod selector_test;
8 |
9 | #[cfg(test)]
10 | mod xml_document_test;
11 |
12 | #[cfg(test)]
13 | mod document_test;
14 |
15 | #[cfg(test)]
16 | mod querying_by_selectors_test;
17 |
--------------------------------------------------------------------------------
/tests/document_test.rs:
--------------------------------------------------------------------------------
1 | use rquery::Document;
2 |
3 | fn new_document() -> Document {
4 | Document::new_from_xml_string(r#"
5 |
6 |
7 |
8 |
9 |
10 |
11 | "#).unwrap()
12 | }
13 |
14 | #[test]
15 | fn it_captures_the_correct_number_of_elements() {
16 | let document = new_document();
17 |
18 | assert_eq!(document.number_of_elements(), 4);
19 | }
20 |
--------------------------------------------------------------------------------
/tests/fixtures/sample.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | This is some text
4 |
5 | Sample Document
6 | Some unrecognisable scribbling
7 |
8 |
9 |
10 | -
11 | Another Sample
12 |
[http://path.to.somewhere]
13 |
14 |
15 | -
16 | Other Sample
17 |
[http://some.other.path]
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "rquery"
3 | version = "0.4.1"
4 | authors = ["Bryan Yap"]
5 | description = "A simple implementation of a HTML/XML DOM tree which allows simple operations like querying by CSS selectors, makes dealing with XML files less painful."
6 | documentation = "https://yggie.github.io/rquery/rquery"
7 | homepage = "https://github.com/yggie/rquery"
8 | repository = "https://github.com/yggie/rquery"
9 | readme = "README.md"
10 | keywords = ["xml", "DOM", "jquery"]
11 | license = "MIT"
12 |
13 | [[test]]
14 | name = "rquery-tests"
15 | path = "tests/lib.rs"
16 |
17 | [dependencies]
18 | xml-rs = "0.7"
19 |
--------------------------------------------------------------------------------
/Cargo.lock:
--------------------------------------------------------------------------------
1 | [[package]]
2 | name = "bitflags"
3 | version = "1.0.1"
4 | source = "registry+https://github.com/rust-lang/crates.io-index"
5 |
6 | [[package]]
7 | name = "rquery"
8 | version = "0.4.1"
9 | dependencies = [
10 | "xml-rs 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
11 | ]
12 |
13 | [[package]]
14 | name = "xml-rs"
15 | version = "0.7.0"
16 | source = "registry+https://github.com/rust-lang/crates.io-index"
17 | dependencies = [
18 | "bitflags 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
19 | ]
20 |
21 | [metadata]
22 | "checksum bitflags 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b3c30d3802dfb7281680d6285f2ccdaa8c2d8fee41f93805dba5c4cf50dc23cf"
23 | "checksum xml-rs 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3c1cb601d29fe2c2ac60a2b2e5e293994d87a1f6fa9687a31a15270f909be9c2"
24 |
--------------------------------------------------------------------------------
/tests/xml_document_test.rs:
--------------------------------------------------------------------------------
1 | use rquery::Document;
2 |
3 | #[test]
4 | fn it_captures_the_correct_number_of_elements() {
5 | let result = Document::new_from_xml_string(r#"
6 |
7 |
8 |
9 |
10 |
11 |
12 | "#);
13 |
14 | assert!(result.is_ok());
15 | }
16 |
17 | #[test]
18 | fn it_can_be_created_from_a_file() {
19 | let result = Document::new_from_xml_file("tests/fixtures/sample.xml");
20 |
21 | assert!(result.is_ok());
22 | }
23 |
24 | #[test]
25 | fn it_returns_an_error_for_non_existent_files() {
26 | let result = Document::new_from_xml_file("non-existent.why");
27 |
28 | assert!(result.is_err());
29 | }
30 |
/// Feeds a file that exists but contains malformed XML, so that the parse
/// error path is exercised rather than the "cannot open file" path.
#[test]
fn it_returns_an_error_for_invalid_xml_files() {
    use std::io::Write;

    // The fixture is created on the fly so the test does not depend on a
    // checked-in broken file.
    let path = std::env::temp_dir().join("rquery_invalid_xml_fixture.xml");
    std::fs::File::create(&path)
        .and_then(|mut file| file.write_all(b"<open><unclosed></open>"))
        .expect("could not write the temporary fixture");

    let result = Document::new_from_xml_file(
        path.to_str().expect("temporary path is not valid UTF-8"));

    // Best-effort clean-up; a leftover file does not affect the assertion.
    let _ = std::fs::remove_file(&path);

    assert!(result.is_err());
}
37 |
--------------------------------------------------------------------------------
/tests/element_test.rs:
--------------------------------------------------------------------------------
1 | use rquery::Document;
2 |
3 | fn new_document() -> Document {
4 | Document::new_from_xml_string(r#"
5 |
6 |
7 | This is some text
8 |
9 | "#).unwrap()
10 | }
11 |
12 |
13 | #[test]
14 | fn it_knows_its_tag_name() {
15 | let document = new_document();
16 |
17 | let element = document.select("main").unwrap();
18 | assert_eq!(element.tag_name(), "main");
19 | }
20 |
21 | #[test]
22 | fn it_knows_its_attributes() {
23 | let document = new_document();
24 |
25 | let element = document.select("main").unwrap();
26 | assert_eq!(element.attr("type").unwrap(), "simple");
27 | }
28 |
29 | #[test]
30 | fn it_knows_its_inner_text_contents() {
31 | let document = new_document();
32 |
33 |
34 | let element = document.select("main").unwrap();
35 | assert_eq!(element.text().trim(), "This is some text");
36 | }
37 |
38 | #[test]
39 | fn it_knows_its_node_indices() {
40 | let document = new_document();
41 |
42 | let element = document.select("main").unwrap();
43 | assert_eq!(element.node_index(), 1);
44 | }
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Bryan Yap
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # rquery
2 |
3 | [Build Status](https://travis-ci.org/yggie/rquery)
4 | [Documentation](https://yggie.github.io/rquery/rquery)
5 | [crates.io](https://crates.io/crates/rquery)
6 | [MIT License](/LICENSE)
7 |
8 | A simple implementation of a HTML/XML DOM tree which allows simple operations
9 | like querying by CSS selectors, makes dealing with XML files less painful.
10 |
11 | ## Example
12 |
13 | ```rust
14 | extern crate rquery;
15 |
16 | use rquery::Document;
17 |
18 | fn main() {
19 | let document = Document::new_from_xml_file("tests/fixtures/sample.xml").unwrap();
20 |
21 | let title = document.select("title").unwrap();
22 | assert_eq!(title.text(), "Sample Document");
23 | assert_eq!(title.attr("ref").unwrap(), "main-title");
24 |
25 | let item_count = document.select_all("item").unwrap().count();
26 | assert_eq!(item_count, 2);
27 |
28 | let item_titles = document.select_all("item > title").unwrap()
29 | .map(|element| element.text().clone())
30 | .collect::<Vec<String>>()
31 | .join(", ");
32 | assert_eq!(item_titles, "Another Sample, Other Sample");
33 | }
34 | ```
35 |
--------------------------------------------------------------------------------
/tests/selector_test.rs:
--------------------------------------------------------------------------------
1 | use rquery::{ CompoundSelector, MatchType, Scope, Selector };
2 |
3 | fn assert_as_single_tag(compound_selector: &CompoundSelector, tag_name: &str) {
4 | assert_eq!(compound_selector.parts.len(), 1);
5 |
6 | if let &Selector::TagName(ref string) = compound_selector.parts.last().unwrap() {
7 | assert_eq!(string, tag_name)
8 | } else {
9 | panic!(format!("Did not match tag name \"{}\"", tag_name));
10 | }
11 | }
12 |
13 | #[test]
14 | fn it_can_parse_a_single_tag_selector() {
15 | let compound_selectors = CompoundSelector::parse("apples").unwrap();
16 |
17 | assert_eq!(compound_selectors.len(), 1);
18 |
19 | assert_eq!(compound_selectors[0].scope, Scope::IndirectChild);
20 | assert_as_single_tag(&compound_selectors[0], "apples");
21 | }
22 |
23 | #[test]
24 | fn it_can_parse_a_nested_tag_selectors() {
25 | let compound_selectors = CompoundSelector::parse("basket apple").unwrap();
26 |
27 | assert_eq!(compound_selectors.len(), 2);
28 |
29 | assert_eq!(compound_selectors[0].scope, Scope::IndirectChild);
30 | assert_as_single_tag(&compound_selectors[0], "basket");
31 |
32 | assert_eq!(compound_selectors[1].scope, Scope::IndirectChild);
33 | assert_as_single_tag(&compound_selectors[1], "apple");
34 | }
35 |
36 | #[test]
37 | fn it_can_parse_a_direct_child_selector() {
38 | let compound_selectors = CompoundSelector::parse("basket > apple").unwrap();
39 |
40 | assert_eq!(compound_selectors.len(), 2);
41 |
42 | assert_eq!(compound_selectors[0].scope, Scope::IndirectChild);
43 | assert_as_single_tag(&compound_selectors[0], "basket");
44 |
45 | assert_eq!(compound_selectors[1].scope, Scope::DirectChild);
46 | assert_as_single_tag(&compound_selectors[1], "apple");
47 | }
48 |
49 | #[test]
50 | fn it_can_parse_the_attribute_selector() {
51 | let compound_selectors = CompoundSelector::parse(r#"[attribute="true-value"]"#).unwrap();
52 |
53 | assert_eq!(compound_selectors.len(), 1);
54 | assert_eq!(compound_selectors[0].parts.len(), 1);
55 |
56 | if let Selector::Attribute(ref attribute, match_type, ref value) = compound_selectors[0].parts[0] {
57 | assert_eq!(attribute, "attribute");
58 | assert_eq!(match_type, MatchType::Equals);
59 | assert_eq!(value, "true-value");
60 | } else {
61 | panic!("Could not parse the attribute selector");
62 | }
63 | }
64 |
65 | #[test]
66 | fn it_can_parse_the_id_selector() {
67 | let compound_selectors = CompoundSelector::parse("#the-id").unwrap();
68 |
69 | assert_eq!(compound_selectors.len(), 1);
70 | assert_eq!(compound_selectors[0].parts.len(), 1);
71 |
72 | if let Selector::Id(ref value) = compound_selectors[0].parts[0] {
73 | assert_eq!(value, "the-id");
74 | } else {
75 | panic!("Could not parse the ID selector");
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/tests/querying_by_selectors_test.rs:
--------------------------------------------------------------------------------
1 | use rquery::{ Document, Element, SelectError, UnexpectedTokenError };
2 |
3 | pub fn new_document() -> Document {
4 | Document::new_from_xml_string(r#"
5 |
6 |
7 | This is some text
8 |
9 | Simple Sample
10 | Some unrecognisable scribbling
11 |
12 |
13 |
14 | -
15 | Another Sample
16 |
[http://path.to.somewhere]
17 |
18 |
19 | -
20 | Other Sample
21 |
[http://some.other.path]
22 |
23 |
24 |
25 |
26 |
27 |
39 |
40 | "#).unwrap()
41 | }
42 |
43 | #[test]
44 | fn it_supports_the_tag_selector() {
45 | let document = new_document();
46 |
47 | let elements: Vec<&Element> = document.select_all("note").unwrap().collect();
48 |
49 | assert_eq!(elements.len(), 1);
50 | assert_eq!(elements[0].tag_name(), "note");
51 | }
52 |
/// A descendant selector (`related title`) yields every `title` found at any
/// depth below a `related` element.
#[test]
fn it_supports_the_nested_tag_selector() {
    let document = new_document();

    let elements: Vec<&Element> = document.select_all("related title").unwrap().collect();

    assert_eq!(elements.len(), 2);

    let element_tag_names: Vec<String> = elements.iter()
        .map(|el| el.tag_name().to_string())
        .collect();
    assert_eq!(element_tag_names, vec!["title", "title"]);
}
66 |
/// Selecting on the elements returned by an earlier selection gives the same
/// result as the equivalent descendant selector.
#[test]
fn it_supports_nesting_selectors() {
    let document = new_document();

    let elements: Vec<&Element> = document.select_all("related").unwrap()
        .flat_map(|element| element.select_all("title").unwrap())
        .collect();

    assert_eq!(elements.len(), 2);

    let element_tag_names: Vec<String> = elements.iter()
        .map(|el| el.tag_name().to_string())
        .collect();
    assert_eq!(element_tag_names, vec!["title", "title"]);
}
82 |
83 | #[test]
84 | fn it_supports_the_direct_child_tag_selector() {
85 | let document = new_document();
86 |
87 | let elements: Vec<&Element> = document.select_all("sample > title").unwrap().collect();
88 |
89 | assert_eq!(elements.len(), 1);
90 |
91 | let element = elements[0];
92 | assert_eq!(element.tag_name(), "title");
93 | }
94 |
95 | #[test]
96 | fn it_returns_a_no_match_error_when_the_selector_does_not_match_any_element() {
97 | let document = new_document();
98 |
99 | let result = document.select("nonexistentelement");
100 |
101 | if let Err(err) = result {
102 | assert_eq!(err, SelectError::NoMatchError);
103 | } else {
104 | panic!("The select did not result in an error!");
105 | }
106 | }
107 |
108 | #[test]
109 | fn it_returns_a_parse_error_when_the_selector_is_invalid() {
110 | let document = new_document();
111 |
112 | let result = document.select_all("?");
113 |
114 | if let Err(err) = result {
115 | assert_eq!(err, SelectError::ParseError(UnexpectedTokenError('?')));
116 | } else {
117 | panic!("The invalid selector did not result in an error!");
118 | }
119 | }
120 |
121 | #[test]
122 | fn it_supports_the_attribute_selector() {
123 | let document = new_document();
124 |
125 | let elements: Vec<&Element> = document.select_all(r#"[long="false"]"#).unwrap().collect();
126 |
127 | assert_eq!(elements.len(), 1);
128 |
129 | let element = elements[0];
130 | assert_eq!(element.text(), "Some unrecognisable scribbling");
131 | }
132 |
133 | #[test]
134 | fn it_supports_the_id_selector() {
135 | let document = new_document();
136 |
137 | let elements: Vec<&Element> = document.select_all("#id-1").unwrap().collect();
138 |
139 | assert_eq!(elements.len(), 1);
140 |
141 | let element = elements[0];
142 | assert_eq!(element.tag_name(), "item");
143 | assert_eq!(element.attr("id"), Some(&"id-1".to_string()));
144 | }
145 |
146 | #[test]
147 | fn it_supports_the_compound_selectors() {
148 | let document = new_document();
149 |
150 | let elements: Vec<&Element> = document.select_all("div[type=three]").unwrap().collect();
151 |
152 | assert_eq!(elements.len(), 1);
153 |
154 | let element = elements[0];
155 | assert_eq!(element.tag_name(), "div");
156 | assert_eq!(element.attr("type"), Some(&"three".to_string()));
157 | }
158 |
159 | #[test]
160 | fn it_does_not_repeat_elements() {
161 | let document = new_document();
162 |
163 | let unique_count = document.select_all("div").unwrap().count();
164 | assert_eq!(unique_count, 8);
165 |
166 | let direct_nested_count = document.select_all("div > div").unwrap().count();
167 | assert_eq!(direct_nested_count, 5);
168 |
169 | let nested_count = document.select_all("div div").unwrap().count();
170 | assert_eq!(nested_count, 6);
171 | }
172 |
--------------------------------------------------------------------------------
/src/document/mod.rs:
--------------------------------------------------------------------------------
1 | use std::io::{ BufReader, Read };
2 | use std::fs::File;
3 | use std::rc::Rc;
4 | use std::path::Path;
5 | use std::collections::HashMap;
6 |
7 | use xml::reader::{ EventReader, XmlEvent };
8 |
9 | use super::{ Element, SelectError };
10 |
/// The ways in which building a `Document` can fail.
#[derive(Clone, Debug)]
pub enum DocumentError {
    /// The file could not be opened; carries the path that was requested.
    UnableToOpenFile(String),
    /// The XML reader reported an error; carries the reader's message.
    ParseError(String),
}
17 |
18 | /// The DOM tree representation of the parsed document.
19 | #[derive(Clone, Debug)]
20 | pub struct Document {
21 | root: Element,
22 | }
23 |
24 | impl Document {
25 | /// Creates a new document from a byte stream.
26 | pub fn new_from_xml_stream(stream: R) -> Result {
27 | let event_reader = EventReader::new(stream);
28 |
29 | let mut elements: Vec = Vec::new();
30 | let mut next_node_index = 1;
31 |
32 | for event in event_reader {
33 | match event {
34 | Ok(XmlEvent::StartElement { ref name, ref attributes, .. }) => {
35 | let attr_map = attributes.iter()
36 | .fold(HashMap::new(), |mut hash_map, attribute| {
37 | hash_map.insert(attribute.name.local_name.clone(), attribute.value.clone());
38 |
39 | return hash_map;
40 | });
41 |
42 | elements.push(Element {
43 | node_index: next_node_index,
44 | children: None,
45 | tag_name: name.local_name.clone(),
46 | attr_map: attr_map,
47 | text: String::new(),
48 | });
49 | next_node_index = next_node_index + 1;
50 | },
51 |
52 | Ok(XmlEvent::EndElement { ref name, .. }) if elements.last().unwrap().tag_name() == name.local_name => {
53 | let child_node = elements.pop().unwrap();
54 |
55 | if let Some(mut parent) = elements.pop() {
56 | if let Some(ref mut children) = parent.children {
57 | children.push(Rc::new(child_node));
58 | } else {
59 | parent.children = Some(vec!(Rc::new(child_node)));
60 | }
61 |
62 | elements.push(parent);
63 | } else {
64 | return Ok(Document {
65 | root: Element {
66 | node_index: 0,
67 | tag_name: "[root]".to_string(),
68 | children: Some(vec!(Rc::new(child_node))),
69 | attr_map: HashMap::new(),
70 | text: String::new(),
71 | }
72 | });
73 | }
74 | },
75 |
76 | Ok(XmlEvent::Characters(string)) => {
77 | elements.last_mut().unwrap().text.push_str(&string);
78 | },
79 |
80 | Ok(XmlEvent::Whitespace(string)) => {
81 | elements.last_mut().unwrap().text.push_str(&string);
82 | },
83 |
84 | Err(error) => {
85 | return Err(DocumentError::ParseError(error.to_string()));
86 | },
87 |
88 | Ok(_) => { },
89 | }
90 | }
91 |
92 | panic!("Root element was not properly returned!");
93 | }
94 |
95 | /// Creates a new document from a string.
96 | pub fn new_from_xml_string(string: &str) -> Result {
97 | Document::new_from_xml_stream(string.as_bytes())
98 | }
99 |
100 | /// Creates a new document from a file.
101 | pub fn new_from_xml_file(filename: &str) -> Result {
102 | let path = Path::new(filename);
103 |
104 | if let Ok(file) = File::open(path) {
105 | let reader = BufReader::new(file);
106 |
107 | Document::new_from_xml_stream(reader)
108 | } else {
109 | Err(DocumentError::UnableToOpenFile(path.to_str().unwrap().to_string()))
110 | }
111 | }
112 |
113 | /// Returns the total number of elements in the document.
114 | pub fn number_of_elements(&self) -> usize {
115 | self.root.subtree_size() - 1
116 | }
117 |
118 | /// Searches the document for elements matching the given CSS selector.
119 | pub fn select_all<'a>(&'a self, selector: &str) -> Result + 'a>, SelectError> {
120 | self.root.select_all(selector)
121 | }
122 |
123 | /// Just like `select_all` but only returns the first match.
124 | pub fn select<'a>(&'a self, selector: &str) -> Result<&'a Element, SelectError> {
125 | self.root.select(selector)
126 | }
127 | }
128 |
129 | #[test]
130 | fn it_assigns_node_indices_in_monotonically_increasing_order() {
131 | let document = Document::new_from_xml_string(r#"
132 |
133 |
134 | This is some text
135 |
136 | Simple Sample
137 | Some unrecognisable scribbling
138 |
139 |
140 |
141 | -
142 | Another Sample
143 |
[http://path.to.somewhere]
144 |
145 |
146 | -
147 | Other Sample
148 |
[http://some.other.path]
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
165 |
166 |
167 | "#).unwrap();
168 |
169 | assert_eq!(document.root.node_index, 0);
170 |
171 | document.root.children_deep_iter().fold(0, |index, child| {
172 | assert!(index < child.node_index);
173 | child.node_index
174 | });
175 | }
176 |
--------------------------------------------------------------------------------
/src/selector.rs:
--------------------------------------------------------------------------------
1 | use std::str::Chars;
2 | use std::iter::Peekable;
3 |
/// An error which is returned when parsing a selector encounters an unexpected
/// token.
///
/// The wrapped character is the offending token. The parsers in this module
/// report a space (`' '`) when the input ends while a token is still expected.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct UnexpectedTokenError(pub char);

impl std::fmt::Display for UnexpectedTokenError {
    fn fmt(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(formatter, "unexpected token {:?} in selector", self.0)
    }
}

impl std::error::Error for UnexpectedTokenError {}
8 |
9 | /// Represents a component of a parsed CSS selector is used to match a single
10 | /// element.
11 | #[derive(Clone, Debug)]
12 | pub struct CompoundSelector {
13 | /// The scope of the selector.
14 | pub scope: Scope,
15 | /// The individual parts that make up the compound selector.
16 | pub parts: Vec,
17 | }
18 |
/// Describes how a `CompoundSelector` relates to the match that precedes it.
#[derive(Clone, Copy, PartialEq, Debug)]
pub enum Scope {
    /// The element must be an immediate child of the previous match
    /// (e.g. `body > header`).
    DirectChild,
    /// The element may sit at any depth below the previous match
    /// (e.g. `body header`).
    IndirectChild,
}
29 |
30 | /// The individual parts of the `CompoundSelector`. For example, the selector
31 | /// `input[type="radio"]` has two parts, the `TagName` and `Attribute`
32 | /// selectors.
33 | #[derive(Clone, Debug)]
34 | pub enum Selector {
35 | /// Represents an id selector (e.g. `#the-id`)
36 | Id(String),
37 | /// Represents a tag name selector (e.g. `input`)
38 | TagName(String),
39 | /// Represents an attribute selector (e.g. `[type="radio"]`)
40 | Attribute(String, MatchType, String),
41 | }
42 |
/// How an attribute selector compares the attribute against its value.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum MatchType {
    /// The attribute value must be identical to the selector value.
    Equals,
}
49 |
// Checks that `$token_option` holds exactly `$token`. Otherwise it returns
// from the enclosing function with an `UnexpectedTokenError` carrying the
// token that was found, or a space when there was no token at all.
macro_rules! expect_token {
    ($token_option: expr, $token: expr) => {
        match $token_option {
            Some(found) if found == $token => { },
            Some(found) => return Err(UnexpectedTokenError(found)),
            None => return Err(UnexpectedTokenError(' ')),
        }
    }
}
59 |
/// Returns `true` when `c` is an ASCII letter (`a`-`z` or `A`-`Z`).
#[inline]
fn non_digit(c: char) -> bool {
    c.is_ascii_alphabetic()
}
64 |
/// Returns `true` when `c` may appear inside a name or value: an ASCII letter,
/// an ASCII digit, `-` or `_`.
#[inline]
fn allowed_character(c: char) -> bool {
    match c {
        '-' | '_' => true,
        _ => c.is_ascii_alphanumeric(),
    }
}
69 |
/// Returns `true` when `c` opens a new selector part: `#` for an id selector
/// or `[` for an attribute selector.
#[inline]
fn valid_start_token(c: char) -> bool {
    match c {
        '#' | '[' => true,
        _ => false,
    }
}
74 |
75 | fn extract_valid_string(chars: &mut Peekable) -> Result {
76 | extract_valid_string_until_token(chars, ' ')
77 | }
78 |
79 | fn extract_valid_string_until_token(chars: &mut Peekable, stop_token: char) -> Result {
80 | let mut string = String::new();
81 |
82 | while let Some(&c) = chars.peek() {
83 | if c == stop_token {
84 | chars.next().unwrap();
85 | break;
86 | } else if allowed_character(c) {
87 | string.push(chars.next().unwrap());
88 | } else if valid_start_token(c) {
89 | break;
90 | } else {
91 | return Err(UnexpectedTokenError(c));
92 | }
93 | }
94 |
95 | return Ok(string);
96 | }
97 |
98 | impl Selector {
99 | fn create_list(string: &str) -> Result, UnexpectedTokenError> {
100 | let mut selectors = Vec::new();
101 |
102 | let mut chars = string.chars().peekable();
103 | while let Some(&c) = chars.peek() {
104 | match Selector::next_selector(c, &mut chars) {
105 | Ok(selector) =>
106 | selectors.push(selector),
107 |
108 | Err(err) =>
109 | return Err(err),
110 | }
111 | }
112 |
113 | return Ok(selectors);
114 | }
115 |
116 | fn next_selector(c: char, chars: &mut Peekable) -> Result {
117 | if non_digit(c) {
118 | Selector::create_tag_name(chars)
119 | } else if c == '#' {
120 | Selector::create_id(chars)
121 | } else if c == '[' {
122 | Selector::create_attribute(chars)
123 | } else {
124 | Err(UnexpectedTokenError(c))
125 | }
126 | }
127 |
128 | fn create_tag_name(chars: &mut Peekable) -> Result {
129 | extract_valid_string(chars).map(|s| Selector::TagName(s))
130 | }
131 |
132 | fn create_id(chars: &mut Peekable) -> Result {
133 | match chars.next() {
134 | Some('#') =>
135 | return extract_valid_string(chars).map(|s| Selector::Id(s)),
136 |
137 | Some(token) =>
138 | return Err(UnexpectedTokenError(token)),
139 |
140 | None =>
141 | return Err(UnexpectedTokenError(' ')),
142 | }
143 | }
144 |
145 | fn create_attribute(chars: &mut Peekable) -> Result {
146 | expect_token!(chars.next(), '[');
147 |
148 | extract_valid_string_until_token(chars, '=').and_then(|attribute| {
149 | Ok((attribute, MatchType::Equals))
150 | }).and_then(|(attribute, match_type)| {
151 | let result = if Some(&'"') == chars.peek() {
152 | chars.next().unwrap();
153 | let result = extract_valid_string_until_token(chars, '"');
154 | expect_token!(chars.next(), ']');
155 |
156 | result
157 | } else {
158 | extract_valid_string_until_token(chars, ']')
159 | };
160 |
161 | result.map(|value| {
162 | Selector::Attribute(attribute, match_type, value)
163 | })
164 | })
165 | }
166 | }
167 |
168 | struct SelectorParts> {
169 | inner_iter: I,
170 | }
171 |
172 | impl> Iterator for SelectorParts {
173 | type Item = (Scope, String);
174 |
175 | fn next(&mut self) -> Option {
176 | self.inner_iter.next().and_then(|next_part| {
177 | if &next_part == ">" {
178 | Some((Scope::DirectChild, self.inner_iter.next().unwrap()))
179 | } else {
180 | Some((Scope::IndirectChild, next_part))
181 | }
182 | })
183 | }
184 | }
185 |
186 | impl CompoundSelector {
187 | /// Parses the string and converts it to a list of `CompoundSelector`s.
188 | pub fn parse(selector: &str) -> Result, UnexpectedTokenError> {
189 | let normalized_selector = selector.split(">")
190 | .collect::>()
191 | .join(" > ");
192 |
193 | let selector_parts = SelectorParts {
194 | inner_iter: normalized_selector.split_whitespace().into_iter().map(|s| s.to_string()),
195 | };
196 |
197 | selector_parts
198 | .fold(Ok(Vec::new()), |result_so_far, (scope, part)| {
199 | if let Ok(mut compound_selectors) = result_so_far {
200 | Selector::create_list(&part).map(|parts| {
201 | compound_selectors.push(CompoundSelector {
202 | scope: scope,
203 | parts: parts
204 | });
205 |
206 | compound_selectors
207 | })
208 | } else {
209 | result_so_far
210 | }
211 | })
212 | }
213 | }
214 |
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! This library offers the ability to represent XML documents as DOM trees,
2 | //! allowing querying with CSS selectors.
3 | //!
4 | //! ```
5 | //! extern crate rquery;
6 | //!
7 | //! use rquery::Document;
8 | //!
9 | //! fn main() {
10 | //! let document = Document::new_from_xml_file("tests/fixtures/sample.xml").unwrap();
11 | //!
12 | //! let title = document.select("title").unwrap();
13 | //! assert_eq!(title.text(), "Sample Document");
14 | //! assert_eq!(title.attr("ref").unwrap(), "main-title");
15 | //!
16 | //! let item_count = document.select_all("item").unwrap().count();
17 | //! assert_eq!(item_count, 2);
18 | //!
19 | //! let item_titles = document.select_all("item > title").unwrap()
20 | //! .map(|element| element.text().clone())
21 | //! .collect::<Vec<String>>()
22 | //! .join(", ");
23 | //! assert_eq!(item_titles, "Another Sample, Other Sample");
24 | //! }
25 | //! ```
26 |
27 | #![warn(missing_docs)]
28 |
29 | extern crate xml;
30 |
31 | mod selector;
32 | mod document;
33 |
34 | pub use self::document::{Document, DocumentError};
35 | pub use self::selector::{ CompoundSelector, MatchType, Scope, Selector, UnexpectedTokenError };
36 |
37 | use std::rc::Rc;
38 | use std::iter::{ empty, once };
39 | use std::marker::PhantomData;
40 | use std::collections::HashMap;
41 |
/// Represents a single element in the DOM tree.
#[derive(Clone, Debug)]
pub struct Element {
    // Position of the element within its document; see `node_index()`.
    node_index: usize,
    // Name of the element's tag; see `tag_name()`.
    tag_name: String,
    // Direct children in document order; `None` when there are none.
    children: Option<Vec<Rc<Element>>>,
    // Attribute values keyed by attribute name; see `attr()`.
    attr_map: HashMap<String, String>,
    // Text contained within the element; see `text()`.
    text: String,
}
51 |
52 | /// Errors which can be returned when performing a select operation.
53 | #[derive(Clone, Copy, Debug, PartialEq)]
54 | pub enum SelectError {
55 | /// Returned when the selector could not be parsed successfully.
56 | ParseError(UnexpectedTokenError),
57 | /// Returned when there were no matches for the selector.
58 | NoMatchError,
59 | }
60 |
61 | struct UniqueElements<'a, I: Iterator- + 'a> {
62 | next_index: usize,
63 | inner_iter: I,
64 | phantom_data: PhantomData<&'a i32>,
65 | }
66 |
67 | impl<'a, I: Iterator
- > Iterator for UniqueElements<'a, I> {
68 | type Item = &'a Element;
69 |
70 | fn next(&mut self) -> Option {
71 | loop {
72 | match self.inner_iter.next() {
73 | Some(element) if element.node_index < self.next_index => {
74 | println!("SKIPPED");
75 | // do nothing
76 | },
77 |
78 | Some(element) => {
79 | self.next_index = element.node_index + 1;
80 | return Some(element);
81 | },
82 |
83 | None => return None,
84 | }
85 | }
86 | }
87 | }
88 |
89 | impl Element {
90 | /// Searches the elements children for elements matching the given CSS
91 | /// selector.
92 | pub fn select_all<'a>(&'a self, selector: &str) -> Result + 'a>, SelectError> {
93 | CompoundSelector::parse(selector)
94 | .map_err(|err| SelectError::ParseError(err))
95 | .and_then(|compound_selectors| {
96 | let initial_iterator: Box> = Box::new(once(self));
97 |
98 | let iterator = compound_selectors.into_iter()
99 | .fold(initial_iterator, |iter, compound_selector| {
100 | let scope = compound_selector.scope;
101 |
102 | let children_iter = iter
103 | .flat_map(move |child| {
104 | match scope {
105 | Scope::IndirectChild => child.children_deep_iter(),
106 | Scope::DirectChild => child.children_iter(),
107 | }
108 | });
109 |
110 | let matching_children_iter = children_iter
111 | .filter_map(move |child| {
112 | if child.matches(&compound_selector) {
113 | Some(child)
114 | } else {
115 | None
116 | }
117 | });
118 |
119 | let unique_children_iter = UniqueElements {
120 | next_index: 0,
121 | inner_iter: matching_children_iter,
122 | phantom_data: PhantomData,
123 | };
124 |
125 | Box::new(unique_children_iter)
126 | });
127 |
128 | return Ok(iterator);
129 | })
130 | }
131 |
132 | /// Just like `select_all` but only returns the first match.
133 | pub fn select<'a>(&'a self, selector: &str) -> Result<&'a Element, SelectError> {
134 | self.select_all(selector).and_then(|mut iterator| {
135 | if let Some(element) = iterator.next() {
136 | Ok(element)
137 | } else {
138 | Err(SelectError::NoMatchError)
139 | }
140 | })
141 | }
142 |
143 | /// Returns an iterator over the element’s direct children.
144 | pub fn children_iter<'a>(&'a self) -> Box + 'a> {
145 | if let Some(ref children) = self.children {
146 | Box::new(children.iter().map(|node| -> &'a Element { node }))
147 | } else {
148 | Box::new(empty::<&'a Element>())
149 | }
150 | }
151 |
152 | /// Returns an iterator over all the element’s children, including indirect
153 | /// child elements.
154 | pub fn children_deep_iter<'a>(&'a self) -> Box + 'a> {
155 | let iterator = self.children_iter()
156 | .flat_map(|child| once(child).chain(child.children_deep_iter()));
157 |
158 | Box::new(iterator)
159 | }
160 |
161 | /// Returns the size of the DOM subtree, including the current element.
162 | pub fn subtree_size(&self) -> usize {
163 | if let Some(ref children) = self.children {
164 | children.iter().fold(1, |subtotal, child| child.subtree_size() + subtotal)
165 | } else {
166 | 1
167 | }
168 | }
169 |
170 | /// Returns the name of the element’s tag.
171 | pub fn tag_name(&self) -> &str {
172 | &self.tag_name
173 | }
174 |
175 | /// Returns the value of the element attribute if found.
176 | pub fn attr(&self, attr_name: &str) -> Option<&String> {
177 | self.attr_map.get(attr_name)
178 | }
179 |
180 | /// Returns the text contained within the element.
181 | pub fn text(&self) -> &String {
182 | &self.text
183 | }
184 |
185 | /// Returns true if the element matches the given selector.
186 | pub fn matches(&self, compound_selector: &CompoundSelector) -> bool {
187 | compound_selector.parts.iter().all(|part| {
188 | match part {
189 | &Selector::TagName(ref name) =>
190 | self.tag_name() == name,
191 |
192 | &Selector::Id(ref id) =>
193 | self.attr("id") == Some(id),
194 |
195 | &Selector::Attribute(ref attr, MatchType::Equals, ref value) =>
196 | self.attr(attr) == Some(value),
197 | }
198 | })
199 | }
200 |
201 | /// Returns the node index for the element.
202 | pub fn node_index(&self) -> usize {
203 | self.node_index
204 | }
205 | }
206 |
--------------------------------------------------------------------------------