├── .gitignore ├── src ├── traits.rs ├── attribute.rs ├── lib.rs ├── tag.rs ├── dom.rs └── parser.rs ├── Cargo.toml ├── .travis.yml ├── examples ├── domtree.rs └── passthrough.rs └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | **/*.rs.bk 3 | **/*.rs~ 4 | Cargo.lock 5 | -------------------------------------------------------------------------------- /src/traits.rs: -------------------------------------------------------------------------------- 1 | /// A trait for converting a type to HTML representation 2 | pub trait ToHTML { 3 | /// Formats the value to HTML representation. 4 | fn to_html(&self) -> String; 5 | } 6 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "domx" 3 | description = "HTML parser and DOM tree builder" 4 | version = "0.1.0" 5 | authors = ["Henrik Andersson "] 6 | repository = "https://github.com/hean01/domx.git" 7 | license = "GPL-3.0" 8 | keywords = ["HTML", "DOM", "parser"] 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | 3 | rust: 4 | - stable 5 | - beta 6 | - nightly 7 | 8 | matrix: 9 | allow_failures: 10 | - rust: nightly 11 | 12 | branches: 13 | only: 14 | # release tags 15 | - /^v\d+\.\d+\.\d+.*$/ 16 | - master 17 | 18 | notifications: 19 | email: 20 | on_success: never -------------------------------------------------------------------------------- /examples/domtree.rs: -------------------------------------------------------------------------------- 1 | extern crate domx; 2 | use std::fs::File; 3 | use std::io::BufReader; 4 | use domx::ToHTML; 5 | 6 | fn main() { 7 | 8 | if std::env::args().len() != 2 { 9 | println!("Usage: domtree "); 10 | return; 11 | } 12 | 
13 | let filename = std::env::args().nth(1).unwrap(); 14 | let file = File::open(filename).unwrap(); 15 | let mut reader = BufReader::new(file); 16 | 17 | let mut dom = domx::Dom::new(); 18 | dom.parse(&mut reader).unwrap(); 19 | println!("{}", dom.to_html()); 20 | } 21 | -------------------------------------------------------------------------------- /examples/passthrough.rs: -------------------------------------------------------------------------------- 1 | extern crate domx; 2 | use std::fs::File; 3 | use std::io::BufReader; 4 | 5 | struct Dummy; 6 | impl domx::IsParser for Dummy { 7 | fn handle_starttag(self: &mut Self, element: &domx::Tag, attributes: &Vec) { 8 | 9 | let mut av: Vec = Vec::new(); 10 | 11 | av.push(element.to_string()); 12 | 13 | for ref attr in attributes { 14 | av.push(format!("{}", attr)); 15 | } 16 | 17 | print!("<{}>", av.join(" ").as_str()); 18 | } 19 | 20 | fn handle_endtag(self: &mut Self, element: &domx::Tag) { 21 | print!("", element.to_string()); 22 | } 23 | 24 | fn handle_data(self: &mut Self, data: &Vec) { 25 | print!("{}", String::from_utf8(data.clone()).unwrap()); 26 | } 27 | } 28 | 29 | fn main() { 30 | 31 | if std::env::args().len() != 2 { 32 | println!("Usage: passthrough "); 33 | return; 34 | } 35 | 36 | let filename = std::env::args().nth(1).unwrap(); 37 | let file = File::open(filename).unwrap(); 38 | let mut reader = BufReader::new(file); 39 | domx::Parser::parse(&mut reader, &mut Dummy{}).unwrap(); 40 | } 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # domx - HTML Parser and DOM builder 2 | 3 | [![Status](https://img.shields.io/travis/hean01/domx/master.svg)](https://travis-ci.org/hean01/domx) 4 | 5 | __domx__ includes a small HTML [Parser] and [DOM] builder for easing the 6 | work with HTML data as structured data. 
The goal is to be very 7 | resilient against invalid HTML documents, e.g. missing closing tags 8 | etc. In the worst case you just get strange data from the parser. 9 | 10 | The [Parser] itself runs through the HTML document and, by using the 11 | trait [IsParser], implemented by the caller as a handler, you will be 12 | notified when an opening tag, a closing tag or data is parsed. 13 | Information through the callback is provided as a [Tag], a vector of 14 | [Attribute] and data as a vector of u8. See the example below for how to use 15 | the [Parser] and a simple implementation of [IsParser]. 16 | 17 | 18 | The [DOM] builder uses the parser to build up a tree data 19 | structure of the HTML document, which you can traverse and perform 20 | operations on, such as cleaning up the document or just simplifying 21 | it. Running broken HTML, e.g. with missing closing tags, through the DOM and 22 | then saving it gives you a nice, consistent and valid HTML file. 23 | 24 | __domx__ is licensed under GPLv3 25 | -------------------------------------------------------------------------------- /src/attribute.rs: -------------------------------------------------------------------------------- 1 | use std; 2 | 3 | use traits::{ToHTML}; 4 | 5 | /// Attribute representing a HTML attribute name and value. 6 | /// 7 | #[derive(Clone)] 8 | pub struct Attribute { 9 | #[doc(hidden)] 10 | pub name: Vec, 11 | #[doc(hidden)] 12 | pub value: Vec, 13 | } 14 | 15 | impl Attribute { 16 | /// Create new attribute 17 | pub fn new(name: &str, value: &str) -> Attribute { 18 | Attribute { 19 | name: name.to_string().into_bytes(), 20 | value: value.to_string().into_bytes(), 21 | } 22 | } 23 | 24 | /// Create new boolean attribute, eg. 
no value 25 | pub fn new_boolean(name: &str) -> Attribute { 26 | Attribute { 27 | name: name.to_string().into_bytes(), 28 | value: Vec::new(), 29 | } 30 | } 31 | 32 | /// Test if attribute is boolean value 33 | pub fn is_boolean(&self) -> bool { 34 | match self.value.len() { 35 | 0 => true, 36 | _ => false, 37 | } 38 | } 39 | 40 | /// Get attribute name as utf8 encoded string 41 | pub fn name(&self) -> String { 42 | String::from_utf8(self.name.clone()).unwrap() 43 | } 44 | 45 | /// Get attribute value as utf8 encoded string 46 | pub fn value(&self) -> String { 47 | String::from_utf8(self.value.clone()).unwrap() 48 | } 49 | } 50 | 51 | impl ToHTML for Attribute { 52 | fn to_html(&self) -> String { 53 | match self.is_boolean() { 54 | true => self.name(), 55 | false => format!("{}=\"{}\"", self.name(), self.value()) 56 | } 57 | } 58 | } 59 | 60 | impl std::fmt::Display for Attribute { 61 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 62 | match self.is_boolean() { 63 | true => f.write_str(&self.name()), 64 | false => f.write_str(&format!("{}=\"{}\"", self.name(), self.value())) 65 | } 66 | } 67 | } 68 | 69 | #[cfg(test)] 70 | mod tests { 71 | use traits::ToHTML; 72 | use attribute::Attribute; 73 | 74 | #[test] 75 | fn new_boolean_is_boolean() { 76 | let a = Attribute::new_boolean("selected"); 77 | assert_eq!(a.is_boolean(), true); 78 | } 79 | 80 | #[test] 81 | fn new_is_not_boolean() { 82 | let a = Attribute::new("class", "info"); 83 | assert_eq!(a.is_boolean(), false); 84 | } 85 | 86 | #[test] 87 | fn new_with_utf8_value() { 88 | let a = Attribute::new("id", "💖"); 89 | assert_eq!(a.value(), "💖"); 90 | } 91 | 92 | #[test] 93 | fn attribute_to_html() { 94 | let a = Attribute::new("id", "💖"); 95 | assert_eq!(a.to_html(), "id=\"💖\""); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! 
# domx - HTML Parser and DOM builder 2 | //! 3 | //! __domx__ includes a small HTML [Parser] and [DOM] builder for 4 | //! easing the work with HTML data as structured data. The goal is to 5 | //! be very resilient against invalid HTML documents, e.g. missing 6 | //! closing tags etc. In the worst case you just get strange data from the 7 | //! parser. 8 | //! 9 | //! The [Parser] itself runs through the HTML document and, by using the 10 | //! trait [IsParser], implemented by the caller as a handler, you will 11 | //! be notified when an opening tag, a closing tag or data is parsed. 12 | //! Information through the callback is provided as a [Tag], a vector of 13 | //! [Attribute] and data as a vector of u8. See the example below for how to 14 | //! use the [Parser] and a simple implementation of [IsParser]. 15 | //! 16 | //! The [DOM] builder uses the parser to build up a tree data 17 | //! structure of the HTML document, which you can traverse and perform 18 | //! operations on, such as cleaning up the document or just simplifying 19 | //! it. Running broken HTML, e.g. with missing closing tags, through the DOM and 20 | //! then saving it gives you a nice, consistent and valid HTML file. 21 | //! 22 | //! __domx__ is licensed under GPLv3 23 | //! 24 | //! [DOM]: struct.Dom.html 25 | //! [Parser]: struct.Parser.html 26 | //! [IsParser]: trait.IsParser.html 27 | //! [Tag]: enum.Tag.html 28 | //! [Attribute]: struct.Attribute.html 29 | //! 30 | //! 31 | //! # Panics 32 | //! 33 | //! There is only one place a panic!() is called and that is where an 34 | //! unknown HTML tag is encountered. This is temporary and will be 35 | //! removed in a stable release. 36 | //! 37 | //! 38 | //! # Examples 39 | //! 40 | //! Here follows a simple example of how to use the DOM parser to filter 41 | //! an HTML document, removing a few elements together with their children. The 42 | //! retain() method is used with a closure that tests the nodes in the tree, just 43 | //! as one would use the retain function on a Rust std vector. 44 | //! 45 | //! 
```rust 46 | //! #[macro_use] 47 | //! extern crate domx; 48 | //! 49 | //! use domx::{ToHTML, Tag}; 50 | //! 51 | //! fn main() { 52 | //! let mut d = dom!("
An example
\ 53 | //!

Header

Some text

"); 54 | //! 55 | //! println!("BEFORE: {} nodes\n{}", d.len(), d.to_html()); 56 | //! 57 | //! d.retain(|&ref node| { 58 | //! match node.element() { 59 | //! None => true, 60 | //! Some(x) => match *x.tag() { 61 | //! Tag::HEADER => false, 62 | //! Tag::H1 => false, 63 | //! _ => true, 64 | //! } 65 | //! } 66 | //! }); 67 | //! 68 | //! println!("AFTER: {} nodes\n{}", d.len(), d.to_html()); 69 | //! } 70 | //! ``` 71 | //! 72 | //! To use the parser you need to implement the trait IsParser and the 73 | //! three handler callbacks. The following example will show how to do 74 | //! this. 75 | //! 76 | //! ``` 77 | //! extern crate domx; 78 | //! 79 | //! use domx::{Parser, IsParser, Tag, Attribute}; 80 | //! use std::fs::File; 81 | //! use std::io::BufReader; 82 | //! 83 | //! struct MyParser; 84 | //! impl IsParser for MyParser { 85 | //! fn handle_starttag(self: &mut Self, tag: &Tag, attributes: &Vec) { 86 | //! let mut av: Vec = Vec::new(); 87 | //! 88 | //! av.push(tag.to_string()); 89 | //! 90 | //! for ref attr in attributes { 91 | //! av.push(format!("{}", attr)); 92 | //! } 93 | //! 94 | //! print!("<{}>", av.join(" ").as_str()); 95 | //! } 96 | //! 97 | //! fn handle_endtag(self: &mut Self, tag: &Tag) { 98 | //! print!("{}", tag.clone()); 99 | //! } 100 | //! 101 | //! fn handle_data(self: &mut Self, data: &Vec) { 102 | //! print!("{}", String::from_utf8(data.clone()).unwrap()); 103 | //! } 104 | //! } 105 | //! 106 | //! fn main() { 107 | //! 108 | //! if std::env::args().len() != 2 { 109 | //! println!("Usage: passthrough "); 110 | //! return; 111 | //! } 112 | //! 113 | //! let filename = std::env::args().nth(1).unwrap(); 114 | //! let file = File::open(filename).unwrap(); 115 | //! let mut reader = BufReader::new(file); 116 | //! Parser::parse(&mut reader, &mut MyParser{}).unwrap(); 117 | //! } 118 | //! 
// Tail of src/lib.rs from the flattened repository dump, kept verbatim:
// ``` 119 | 120 | mod traits; 121 | pub use traits::{ToHTML}; 122 | 123 | mod tag; 124 | pub use tag::{Tag}; 125 | 126 | mod attribute; 127 | pub use attribute::{Attribute}; 128 | 129 | mod parser; 130 | pub use parser::{Parser, IsParser}; 131 | 132 | #[macro_use] 133 | mod dom; 134 | pub use dom::{Dom}; 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /src/tag.rs: --------------------------------------------------------------------------------

use std::fmt;
use std::str::FromStr;

/// Enumeration that represents HTML tags.
///
/// Every tag known to the crate has its own variant. Any other tag name is
/// carried verbatim in `Tag::Unknown`, so no input is ever rejected.
///
/// Formatting a tag with `Display` yields the name that `FromStr` accepts for
/// that variant, so `tag.to_string().parse::<Tag>()` gives back the same
/// variant.
#[derive(Clone, PartialEq, Debug)]
pub enum Tag {
    Unknown(String),
    A,
    ABBR,
    ACRONYM,
    ADDRESS,
    ARTICLE, // HTML5
    ASIDE, // HTML5
    B,
    BDO,
    BIG,
    BLOCKQUOTE,
    BODY,
    BR,
    BUTTON,
    CANVAS, // HTML5
    CITE,
    CODE,
    DD,
    DFN,
    DIV,
    DL,
    DT,
    EM,
    FIELDSET,
    FIGCAPTION, // HTML5
    FIGURE, // HTML5
    FOOTER, // HTML5
    FORM,
    H1, H2, H3, H4, H5, H6,
    HEAD,
    HEADER, // HTML5
    HGROUP, // HTML5
    HR,
    HTML,
    I,
    IFRAME,
    IMG,
    INPUT,
    KBD,
    LABEL,
    LI,
    LINK,
    MAP,
    MAIN,
    META,
    NAV,
    NOSCRIPT,
    OBJECT,
    OL,
    OPTION,
    OUTPUT, // HTML5
    P,
    PRE,
    Q,
    SAMP,
    SCRIPT,
    SECTION, // HTML5
    SELECT,
    SMALL,
    SPAN,
    STRONG,
    STYLE,
    SUB,
    SUP,
    TABLE,
    TEXTAREA,
    TFOOT,
    TIME,
    TITLE,
    TT,
    UL,
    VAR,
    VIDEO, // HTML5
    WBR, // HTML5
}

impl Tag {
    /// Returns the tag name as written in an HTML document.
    ///
    /// For known variants this is the exact string `from_str` matches on; for
    /// `Tag::Unknown` it is the stored name, unchanged.
    fn as_str(&self) -> &str {
        match *self {
            Tag::Unknown(ref name) => name.as_str(),
            Tag::A => "a",
            // Was "abr", which is not an HTML element and did not parse back.
            Tag::ABBR => "abbr",
            Tag::ACRONYM => "acronym",
            Tag::ADDRESS => "address",
            Tag::ARTICLE => "article",
            Tag::ASIDE => "aside",
            Tag::B => "b",
            Tag::BDO => "bdo",
            Tag::BIG => "big",
            // Was upper-case "BLOCKQUOTE"; every other name is lower-case and
            // `from_str` only recognises the lower-case spelling.
            Tag::BLOCKQUOTE => "blockquote",
            Tag::BODY => "body",
            Tag::BR => "br",
            Tag::BUTTON => "button",
            // Was misspelled "cavas".
            Tag::CANVAS => "canvas",
            Tag::CITE => "cite",
            Tag::CODE => "code",
            Tag::DD => "dd",
            Tag::DFN => "dfn",
            Tag::DIV => "div",
            Tag::DL => "dl",
            Tag::DT => "dt",
            Tag::EM => "em",
            Tag::FIELDSET => "fieldset",
            Tag::FIGCAPTION => "figcaption",
            Tag::FIGURE => "figure",
            Tag::FOOTER => "footer",
            Tag::FORM => "form",
            Tag::H1 => "h1",
            Tag::H2 => "h2",
            Tag::H3 => "h3",
            Tag::H4 => "h4",
            Tag::H5 => "h5",
            Tag::H6 => "h6",
            Tag::HEAD => "head",
            Tag::HEADER => "header",
            Tag::HGROUP => "hgroup",
            Tag::HR => "hr",
            Tag::HTML => "html",
            Tag::I => "i",
            Tag::IFRAME => "iframe",
            Tag::INPUT => "input",
            Tag::IMG => "img",
            // Was misspelled "kdb".
            Tag::KBD => "kbd",
            Tag::LABEL => "label",
            Tag::LI => "li",
            Tag::LINK => "link",
            Tag::MAIN => "main",
            Tag::MAP => "map",
            Tag::META => "meta",
            Tag::NAV => "nav",
            Tag::NOSCRIPT => "noscript",
            Tag::OBJECT => "object",
            Tag::OL => "ol",
            Tag::OPTION => "option",
            Tag::OUTPUT => "output",
            Tag::P => "p",
            Tag::PRE => "pre",
            Tag::Q => "q",
            Tag::SAMP => "samp",
            Tag::SCRIPT => "script",
            Tag::SECTION => "section",
            Tag::SELECT => "select",
            Tag::SMALL => "small",
            Tag::SPAN => "span",
            Tag::STRONG => "strong",
            Tag::STYLE => "style",
            Tag::SUB => "sub",
            Tag::SUP => "sup",
            Tag::TABLE => "table",
            Tag::TEXTAREA => "textarea",
            Tag::TFOOT => "tfoot",
            Tag::TIME => "time",
            Tag::TITLE => "title",
            Tag::TT => "tt",
            Tag::UL => "ul",
            Tag::VAR => "var",
            Tag::VIDEO => "video",
            Tag::WBR => "wbr",
        }
    }
}

/// Writes the bare tag name, without angle brackets.
impl fmt::Display for Tag {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(self.as_str())
    }
}

/// Parse a `Tag` from a string.
///
/// Parsing never fails: a name that is not one of the known tags is returned
/// as `Tag::Unknown` holding a copy of the input. Matching is exact and
/// case-sensitive, so `"DIV"` yields `Tag::Unknown("DIV")`, not `Tag::DIV`.
///
/// # Examples
///
/// ```
/// use domx::Tag;
/// let e = "code".parse::<Tag>().unwrap();
/// println!("<{}>", e);
/// ```
impl FromStr for Tag {
    type Err = ();

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Ok(match s {
            "a" => Tag::A,
            "abbr" => Tag::ABBR,
            "acronym" => Tag::ACRONYM,
            "address" => Tag::ADDRESS,
            "article" => Tag::ARTICLE,
            "aside" => Tag::ASIDE,
            "b" => Tag::B,
            "bdo" => Tag::BDO,
            "big" => Tag::BIG,
            "blockquote" => Tag::BLOCKQUOTE,
            "body" => Tag::BODY,
            "br" => Tag::BR,
            "button" => Tag::BUTTON,
            "canvas" => Tag::CANVAS,
            "cite" => Tag::CITE,
            "code" => Tag::CODE,
            "dd" => Tag::DD,
            "dfn" => Tag::DFN,
            "div" => Tag::DIV,
            "dl" => Tag::DL,
            "dt" => Tag::DT,
            "em" => Tag::EM,
            "fieldset" => Tag::FIELDSET,
            "figcaption" => Tag::FIGCAPTION,
            "figure" => Tag::FIGURE,
            "footer" => Tag::FOOTER,
            "form" => Tag::FORM,
            "h1" => Tag::H1,
            "h2" => Tag::H2,
            "h3" => Tag::H3,
            "h4" => Tag::H4,
            "h5" => Tag::H5,
            "h6" => Tag::H6,
            "head" => Tag::HEAD,
            "header" => Tag::HEADER,
            "hgroup" => Tag::HGROUP,
            "hr" => Tag::HR,
            "html" => Tag::HTML,
            "i" => Tag::I,
            "iframe" => Tag::IFRAME,
            "input" => Tag::INPUT,
            "img" => Tag::IMG,
            "kbd" => Tag::KBD,
            "label" => Tag::LABEL,
            "li" => Tag::LI,
            "link" => Tag::LINK,
            "map" => Tag::MAP,
            "main" => Tag::MAIN,
            "meta" => Tag::META,
            "nav" => Tag::NAV,
            "noscript" => Tag::NOSCRIPT,
            "object" => Tag::OBJECT,
            "ol" => Tag::OL,
            "option" => Tag::OPTION,
            "output" => Tag::OUTPUT,
            "p" => Tag::P,
            "pre" => Tag::PRE,
            "q" => Tag::Q,
            "samp" => Tag::SAMP,
            "script" => Tag::SCRIPT,
            "section" => Tag::SECTION,
            "select" => Tag::SELECT,
            "small" => Tag::SMALL,
            "span" => Tag::SPAN,
            "strong" => Tag::STRONG,
            "style" => Tag::STYLE,
            "sub" => Tag::SUB,
            "sup" => Tag::SUP,
            "table" => Tag::TABLE,
            "textarea" => Tag::TEXTAREA,
            "tfoot" => Tag::TFOOT,
            "time" => Tag::TIME,
            "title" => Tag::TITLE,
            "tt" => Tag::TT,
            "ul" => Tag::UL,
            "var" => Tag::VAR,
            "video" => Tag::VIDEO,
            "wbr" => Tag::WBR,
            // Anything else is kept verbatim instead of being rejected.
            _ => Tag::Unknown(s.to_string()),
        })
    }
}

// Head of src/dom.rs from the flattened repository dump, kept verbatim:
// -------------------------------------------------------------------------------- /src/dom.rs: -------------------------------------------------------------------------------- 1 | use std;
2 | use traits::{ToHTML}; 3 | use tag::{Tag}; 4 | use attribute::{Attribute}; 5 | use parser::{IsParser, Parser}; 6 | 7 | /// Id for node references between nodes 8 | pub type NodeId = usize; 9 | 10 | 11 | const ROOT_NODE_ID: NodeId = 0; 12 | 13 | pub struct NodeElement { 14 | tag: Tag, 15 | attributes: Vec 16 | } 17 | 18 | impl NodeElement { 19 | pub fn tag(&self) -> &Tag { 20 | &self.tag 21 | } 22 | 23 | pub fn attributes(&self) -> &Vec { 24 | &self.attributes 25 | } 26 | } 27 | 28 | impl ToHTML for NodeElement { 29 | fn to_html(&self) -> String { 30 | let mut html: String = "".to_owned(); 31 | html.push_str("<"); 32 | html.push_str(&self.tag().to_string()); 33 | for attr in self.attributes().iter() { 34 | html.push_str(" "); 35 | html.push_str(&attr.to_html()); 36 | } 37 | html.push_str(">"); 38 | html 39 | } 40 | } 41 | 42 | impl std::fmt::Display for NodeElement { 43 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 44 | f.write_str("<").unwrap(); 45 | f.write_str(&format!("{}",self.tag)).unwrap(); 46 | 47 | let mut av: Vec = Vec::new(); 48 | for ref attr in self.attributes.clone() { 49 | av.push(format!("{}", attr)); 50 | } 51 | if av.len() != 0 { 52 | f.write_str(" ").unwrap(); 53 | f.write_str(av.join(" ").as_str()).unwrap(); 54 | } 55 | 56 | f.write_str(">") 57 | } 58 | } 59 | 60 | pub enum NodeData { 61 | Element(NodeElement), 62 | Data(String), 63 | } 64 | 65 | impl ToHTML for NodeData { 66 | fn to_html(&self) -> String { 67 | (match self { 68 | &NodeData::Element(ref x) => x.to_html(), 69 | &NodeData::Data(ref x) => x.to_string() 70 | }).to_string() 71 | } 72 | } 73 | 74 | impl std::fmt::Display for NodeData { 75 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 76 | match self { 77 | &NodeData::Element(ref x) => f.write_str(&format!("{}", x)), 78 | &NodeData::Data(ref x) => f.write_str(x) 79 | } 80 | } 81 | } 82 | 83 | /// Representing a node in the DOM tree 84 | pub struct Node { 85 | id: NodeId, 86 | parent: Option, 
87 | children: Vec, 88 | data: Option, 89 | } 90 | 91 | impl Node { 92 | 93 | /// Create a new element node 94 | pub fn new_element(tag: Tag, attributes: Vec) -> Node { 95 | Node { 96 | id: 0, 97 | parent: None, 98 | children: Vec::new(), 99 | data: Some(NodeData::Element(NodeElement{ 100 | tag: tag, 101 | attributes: attributes, 102 | })), 103 | } 104 | } 105 | 106 | /// Create a new data node 107 | pub fn new_data(data: String) -> Node { 108 | Node { 109 | id: 0, 110 | parent: None, 111 | children: Vec::new(), 112 | data: Some(NodeData::Data(data)), 113 | } 114 | } 115 | 116 | /// Test if node is an element. 117 | pub fn is_element(&self) -> bool { 118 | match self.data.as_ref().unwrap() { 119 | &NodeData::Element(_) => true, 120 | _ => false 121 | } 122 | } 123 | 124 | /// Test if node is data. 125 | pub fn is_data(&self) -> bool { 126 | match self.data.as_ref().unwrap() { 127 | &NodeData::Data(_) => true, 128 | _ => false 129 | } 130 | } 131 | 132 | pub fn element(&self) -> Option<&NodeElement> { 133 | match self.data.as_ref().unwrap() { 134 | &NodeData::Element(ref x) => Some(x), 135 | _ => None 136 | } 137 | } 138 | 139 | pub fn data(&self) -> &NodeData { 140 | self.data.as_ref().unwrap() 141 | } 142 | } 143 | 144 | /// Store used for allocation 145 | struct Store { 146 | nodes: Vec> 147 | } 148 | 149 | impl std::ops::Index for Store { 150 | type Output = Option; 151 | fn index(&self, idx: usize) -> &Option { 152 | &self.nodes[idx] 153 | } 154 | } 155 | 156 | impl std::ops::IndexMut for Store { 157 | fn index_mut(&mut self, idx: usize) -> &mut Option { 158 | &mut self.nodes[idx] 159 | } 160 | } 161 | 162 | impl Store { 163 | pub fn new() -> Store { 164 | Store { 165 | nodes: vec!(Some(Node{ 166 | id: 0, 167 | parent: None, 168 | children: Vec::new(), 169 | data: None 170 | })) 171 | } 172 | } 173 | 174 | /// Add node to store and return NodeId 175 | pub fn add(self: &mut Store, node: Node) -> Result { 176 | 177 | let parent = node.parent.unwrap(); 178 | 
self.nodes.push(Some(node)); 179 | 180 | let id = self.nodes.len() - 1; 181 | self[parent].as_mut().unwrap().children.push(id); 182 | 183 | self.nodes[id].as_mut().unwrap().id = id; 184 | Ok(id) 185 | } 186 | 187 | pub fn is_node(self: &Store, id: NodeId) -> bool { 188 | if id <= self.nodes.len() - 1 && self.nodes[id].is_some() { 189 | return true; 190 | } 191 | return false; 192 | } 193 | 194 | /// Create a new node with parent and return NodeId 195 | pub fn new_node_with_parent(self: &mut Store, parent: NodeId) -> Result { 196 | 197 | // validate parent 198 | if !self.is_node(parent) { 199 | // Invalid parent 200 | return Err(()); 201 | } 202 | 203 | // create and add new node returning new NodeId 204 | self.add(Node{ 205 | id: 0, 206 | parent: Some(parent), 207 | children: Vec::new(), 208 | data: None 209 | }) 210 | } 211 | 212 | fn _recurse(self: &Store, id: NodeId, level: usize, enter: &mut F) 213 | where 214 | F: FnMut(NodeId, usize), 215 | { 216 | match self.nodes[id] { 217 | Some(ref x) => { 218 | for cid in x.children.iter() { 219 | enter(*cid, level); 220 | self._recurse(*cid, level + 1, enter); 221 | } 222 | } 223 | None => () 224 | } 225 | } 226 | 227 | fn _recurse_with_output(self: &Store, id: NodeId, enter: &mut F1, leave: &mut F2, output: &mut String) 228 | where 229 | F1: FnMut(&Node, &mut String), 230 | F2: FnMut(&Node, &mut String), 231 | { 232 | match self.nodes[id] { 233 | Some(ref x) => { 234 | for cid in x.children.iter() { 235 | let node = self.nodes[*cid].as_ref().unwrap(); 236 | enter(node, output); 237 | self._recurse_with_output(*cid, enter, leave, output); 238 | leave(node, output); 239 | } 240 | } 241 | None => () 242 | } 243 | } 244 | 245 | fn _recurse_remove_node(&self, id: NodeId, nodes: &mut Vec) 246 | { 247 | 248 | match self.nodes[id] { 249 | None => (), 250 | Some(ref x) => { 251 | // recurse to leaf and then remove nodes back to top 252 | { 253 | for cid in x.children.iter() { 254 | self._recurse_remove_node(*cid, nodes); 255 | 
} 256 | } 257 | } 258 | }; 259 | 260 | nodes.push(id); 261 | } 262 | 263 | // Get nodes that are not none in storage 264 | pub fn len(&self) -> usize { 265 | let mut cnt = 0; 266 | for n in self.nodes.iter() { 267 | cnt += match n { 268 | &None => 0, 269 | _ => 1, 270 | }; 271 | } 272 | 273 | cnt 274 | } 275 | 276 | pub fn remove(&mut self, id: NodeId) { 277 | 278 | if !self.is_node(id) { 279 | return; 280 | } 281 | 282 | let mut nodes = Vec::new(); 283 | self._recurse_remove_node(id, &mut nodes); 284 | 285 | for nid in nodes.iter() { 286 | { 287 | let parent_id = { self.nodes[*nid].as_mut().unwrap().parent.unwrap() }; 288 | let parent = self[parent_id].as_mut().unwrap(); 289 | parent.children.retain(|&x| x != *nid); 290 | } 291 | self[*nid] = None; 292 | } 293 | } 294 | 295 | pub fn recurse(self: &Store, id: NodeId, mut enter: F) 296 | where 297 | F: FnMut(NodeId, usize), 298 | { 299 | match self.is_node(id) { 300 | true => self._recurse(id, 0, &mut enter), 301 | false => (), 302 | } 303 | } 304 | 305 | pub fn retain(&mut self, mut keep: F) 306 | where 307 | F: FnMut(&Node) -> bool, 308 | { 309 | // recurse into tree and for each node call keep and store 310 | // node to be removed into vector for second remove pass node. 
311 | let mut nodes = Vec::new(); 312 | self.recurse(ROOT_NODE_ID, |id, _| { 313 | 314 | if keep(self[id].as_ref().unwrap()) == false { 315 | nodes.push(id); 316 | }; 317 | }); 318 | 319 | for id in nodes { 320 | self.remove(id); 321 | } 322 | } 323 | } 324 | 325 | impl ToHTML for Store { 326 | fn to_html(&self) -> String { 327 | let mut html = "".to_string().to_owned(); 328 | self._recurse_with_output(ROOT_NODE_ID,&mut |node, output|{ 329 | output.push_str(node.data().to_html().as_str()); 330 | },&mut |node, output|{ 331 | match node.is_element() { 332 | true => { 333 | output.push_str(""); 336 | }, 337 | false => (), 338 | } 339 | }, &mut html); 340 | 341 | html 342 | } 343 | } 344 | 345 | /// Instantiates and parses a HTML document into a DOM tree structure. 346 | /// 347 | /// # Examples 348 | /// 349 | /// ``` 350 | /// # #[macro_use] 351 | /// # extern crate domx; 352 | /// # fn main() { 353 | /// let mut dom = dom!("

Hello world!

"); 354 | /// # } 355 | /// ``` 356 | #[macro_export] macro_rules! dom { 357 | ( $html:expr ) => { 358 | { 359 | let mut temp_dom = $crate::Dom::new(); 360 | let data = format!("{}", $html).into_bytes(); 361 | temp_dom.parse(&mut std::io::BufReader::new(&data[..])).unwrap(); 362 | 363 | temp_dom 364 | } 365 | }; 366 | } 367 | 368 | /// DOM tree data structure builder. 369 | /// 370 | /// Uses [Parser] to build the tree and provides a set of methods to 371 | /// work with the tree. Implement [ToHTML] trait so that one can dump 372 | /// the tree into a HTML document. 373 | /// 374 | /// [ToHtml]: trait.ToHTML.html 375 | /// [Parser]: struct.Parser.html 376 | /// 377 | pub struct Dom { 378 | store: Store, 379 | current: Option 380 | } 381 | 382 | impl Dom { 383 | pub fn new() -> Dom { 384 | Dom { 385 | store: Store::new(), 386 | current: None, 387 | } 388 | } 389 | 390 | /// Parse a HTML buffer and build DOM tree structure. 391 | /// 392 | /// Use the macro [dom!()] for easier use. 393 | /// 394 | /// [dom!()]: macro.dom.html 395 | pub fn parse(self: &mut Self, source: &mut dyn std::io::BufRead) -> Result { 396 | Parser::parse(source, self) 397 | } 398 | 399 | /// Recurse the DOM with a callback for when entering each node. 400 | /// 401 | /// # Examples 402 | /// 403 | /// ``` 404 | /// # #[macro_use] 405 | /// # extern crate domx; 406 | /// # fn main() { 407 | /// let mut d = dom!(""); 408 | /// d.recurse(|id, level| { 409 | /// println!("Enter {} id({})", level, id); 410 | /// }); 411 | /// # } 412 | /// ``` 413 | pub fn recurse(self: &Dom, enter: F) 414 | where 415 | F: FnMut(NodeId, usize), 416 | { 417 | self.store.recurse(ROOT_NODE_ID, enter); 418 | } 419 | 420 | /// Retains only the nodes specified by the predicate. 421 | /// 422 | /// # Examples 423 | /// 424 | /// ``` 425 | /// # #[macro_use] 426 | /// # extern crate domx; 427 | /// # use domx::{Tag,ToHTML}; 428 | /// # fn main() { 429 | /// let mut d = dom!("

remove

Hello World!

remove

"); 430 | /// 431 | /// println!("{}", d); 432 | /// d.retain(|&ref node| { 433 | /// match node.element() { 434 | /// None => true, 435 | /// Some(x) => match x.tag() { 436 | /// &Tag::DIV => false, 437 | /// _ => true, 438 | /// } 439 | /// } 440 | /// }); 441 | /// 442 | /// println!("{}\nLength: {}", d.to_html(), d.len()); 443 | /// # } 444 | /// ``` 445 | pub fn retain(&mut self, keep: F) 446 | where F: FnMut(&Node) -> bool 447 | { 448 | self.store.retain(keep) 449 | } 450 | 451 | pub fn len(&self) -> usize { 452 | self.store.len() - 1 453 | } 454 | } 455 | 456 | impl IsParser for Dom { 457 | fn handle_starttag(self: &mut Self, tag: &Tag, attributes: &Vec) { 458 | let parent = { 459 | match self.current { 460 | Some(x) => x, 461 | None => ROOT_NODE_ID 462 | } 463 | }; 464 | let id = self.store.new_node_with_parent(parent).unwrap(); 465 | self.store[id].as_mut().unwrap().data = Some(NodeData::Element(NodeElement{ 466 | tag: tag.clone(), 467 | attributes: attributes.clone(), 468 | })); 469 | self.current = Some(id); 470 | } 471 | 472 | fn handle_endtag(self: &mut Self, _tag: &Tag) { 473 | self.current = self.store[self.current.unwrap()].as_ref().unwrap().parent; 474 | } 475 | 476 | fn handle_data(self: &mut Self, data: &Vec) { 477 | let parent = { 478 | match self.current { 479 | Some(x) => x, 480 | None => ROOT_NODE_ID 481 | } 482 | }; 483 | let id = self.store.new_node_with_parent(parent).unwrap(); 484 | self.store[id].as_mut().unwrap().data = Some(NodeData::Data(String::from_utf8(data.clone()).unwrap())); 485 | } 486 | } 487 | 488 | impl std::fmt::Display for Dom { 489 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 490 | self.recurse(|id, level| { 491 | let li = vec![0; level]; 492 | let indent = li.iter().fold("".to_string(), |acc, _| acc + " "); 493 | match self.store[id].as_ref().unwrap().data { 494 | None => (), 495 | Some(ref x) => { 496 | match x { 497 | &NodeData::Element(ref x) => f.write_str(&format!("{}node({}) element: {}\n", 
indent, id, x)).unwrap(), 498 | &NodeData::Data(ref x) => f.write_str(&format!("{}node({}) data: {:?}\n", indent, id, x)).unwrap(), 499 | } 500 | } 501 | }; 502 | 503 | }); 504 | f.write_str("") 505 | } 506 | } 507 | 508 | impl std::ops::Index for Dom { 509 | type Output = Node; 510 | fn index(&self, idx: usize) -> &Node { 511 | self.store[idx].as_ref().unwrap() 512 | } 513 | } 514 | 515 | impl std::ops::IndexMut for Dom { 516 | fn index_mut(&mut self, idx: usize) -> &mut Node { 517 | self.store[idx].as_mut().unwrap() 518 | } 519 | } 520 | 521 | impl ToHTML for Dom { 522 | fn to_html(&self) -> String { 523 | self.store.to_html() 524 | } 525 | } 526 | 527 | 528 | #[cfg(test)] 529 | mod tests { 530 | use dom::*; 531 | use tag::Tag; 532 | use attribute::Attribute; 533 | use std::io::BufReader; 534 | 535 | #[test] 536 | fn parse_empty_document() { 537 | let mut dom = ::Dom::new(); 538 | let data = "".to_string().into_bytes(); 539 | assert_eq!(dom.parse(&mut BufReader::new(&data[..])).unwrap(), 0); 540 | } 541 | 542 | #[test] 543 | fn parse_simple_document() { 544 | let dom = dom!("

Hello World!

"); 545 | assert_eq!(dom[3].data().to_string(), "

"); 546 | assert_eq!(dom[6].data().to_string(), "World"); 547 | } 548 | 549 | #[test] 550 | fn node_new_element_to_html() { 551 | let el = "p".parse::().unwrap(); 552 | let attrs = vec!(Attribute::new("id", "myid"), Attribute::new("class", "info data")); 553 | let node = Node::new_element(el, attrs); 554 | assert_eq!(node.element().unwrap().to_html(), "

"); 555 | } 556 | 557 | #[test] 558 | fn dom_retain_all() { 559 | let mut dom = dom!("

Hello World!

"); 560 | dom.retain(|_| { 561 | true 562 | }); 563 | 564 | assert_eq!(dom.len(), 7); 565 | } 566 | 567 | #[test] 568 | fn dom_retain_none() { 569 | let mut dom = dom!("

Hello World!

"); 570 | println!("{}", dom); 571 | dom.retain(|_| { 572 | false 573 | }); 574 | 575 | assert_eq!(dom.len(), 0); 576 | } 577 | 578 | #[test] 579 | fn dom_retain_all_but_p() { 580 | let mut dom = dom!("

Hello World!

"); 581 | println!("{}", dom); 582 | dom.retain(|&ref node| { 583 | match node.element() { 584 | None => true, 585 | Some(x) => match x.tag() { 586 | &Tag::P => false, 587 | _ => true, 588 | } 589 | } 590 | }); 591 | 592 | assert_eq!(dom.len(), 2); 593 | } 594 | } 595 | -------------------------------------------------------------------------------- /src/parser.rs: -------------------------------------------------------------------------------- 1 | use std; 2 | use std::io::BufRead; 3 | 4 | use tag::{Tag}; 5 | use attribute::{Attribute}; 6 | 7 | #[derive(Clone)] 8 | struct ParserTag { 9 | name: String, 10 | pub id: Option, 11 | closing: bool, 12 | data: Vec, 13 | pub attributes: Vec 14 | } 15 | 16 | impl std::fmt::Display for ParserTag { 17 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 18 | match self.closing { 19 | false => f.write_str("<"), 20 | true => f.write_str(" = Vec::new(); 26 | for ref attr in self.attributes.clone() { 27 | av.push(format!("{}", attr)); 28 | } 29 | if av.len() != 0 { 30 | f.write_str(" ").unwrap(); 31 | f.write_str(av.join(" ").as_str()).unwrap(); 32 | } 33 | 34 | f.write_str(">") 35 | } 36 | } 37 | 38 | /// A trait for handling callbacks from [Parser](struct.Parser.html). 39 | /// 40 | pub trait IsParser { 41 | /// This method is called to handle the start tag. 42 | fn handle_starttag(self: &mut Self, tag: &Tag, attributes: &Vec); 43 | 44 | /// This method is called to handle the end tag of a element. 45 | fn handle_endtag(self: &mut Self, tag: &Tag); 46 | 47 | /// This method is called to process arbitrary data. 48 | /// 49 | /// Data beeing text nodes and the content of `````` 50 | /// and `````` tags. 51 | fn handle_data(self: &mut Self, _data: &Vec); 52 | } 53 | 54 | /// Parse a HTML document and provide data through handler [IsParser]. 55 | /// 56 | /// The parser serves the basis for parsing text files formatted in 57 | /// HTML and XHTML. The parser is not based on SGML. 
58 | /// 59 | /// [IsParser]: trait.IsParser.html 60 | /// 61 | pub struct Parser; 62 | 63 | enum ParserState { 64 | FindParserTag, 65 | SkipComment, 66 | ReadParserTagName, 67 | ReadData, 68 | ReadRawData, 69 | ReadAttributeName, 70 | ReadAttributeValue, 71 | } 72 | 73 | impl Parser { 74 | 75 | fn _state_find_tag(buf: &Vec, tag: &mut ParserTag, state: &mut ParserState) -> usize { 76 | let mut processed = 0; 77 | for b in buf { 78 | if *b == '<' as u8 { 79 | tag.name = "".to_string(); 80 | tag.id = None; 81 | tag.data.clear(); 82 | tag.closing = false; 83 | *state = ParserState::ReadParserTagName; 84 | break; 85 | } 86 | processed += 1; 87 | } 88 | return processed; 89 | } 90 | 91 | fn _state_skip_comment(buf: &Vec, state: &mut ParserState) -> usize { 92 | let mut processed = 0; 93 | loop { 94 | if (processed + 3) > buf.len() { 95 | // we need more data to continue 96 | break; 97 | } 98 | 99 | if buf[processed + 0] == '-' as u8 && 100 | buf[processed + 1] == '-' as u8 && 101 | buf[processed + 2] == '>' as u8 { 102 | 103 | *state = ParserState::ReadData; 104 | processed += 3; 105 | break; 106 | } 107 | 108 | processed += 1 109 | } 110 | 111 | return processed; 112 | } 113 | 114 | fn _state_read_tag_name(buf: &Vec, tag: &mut ParserTag, state: &mut ParserState) -> usize { 115 | let mut processed = 0; 116 | 117 | for b in buf { 118 | 119 | match *b as char { 120 | // Skip begin of tag 121 | '<' => processed += 1, 122 | 123 | '\r' | '\n' => { 124 | processed += 1; 125 | }, 126 | 127 | // Comment detected 128 | '!' 
=> { 129 | processed += 1; 130 | *state = match buf[processed] as char { 131 | '-' => ParserState::SkipComment, 132 | _ => ParserState::FindParserTag 133 | }; 134 | break; 135 | }, 136 | 137 | // Closing tag detected 138 | '/' => { 139 | tag.closing = true; 140 | processed += 1; 141 | }, 142 | 143 | // Complete tag name read 144 | '>' | ' ' => { 145 | tag.attributes.clear(); 146 | tag.attributes.push(Attribute{name: Vec::new(), value: Vec::new()}); 147 | *state = ParserState::ReadAttributeName; 148 | break; 149 | }, 150 | 151 | _ => { 152 | tag.name.push(*b as char); 153 | processed += 1; 154 | } 155 | } 156 | } 157 | 158 | return processed; 159 | } 160 | 161 | fn _is_element(buf: &Vec, el: &Option) -> Result { 162 | if el.is_none() { 163 | return Ok(false); 164 | } 165 | 166 | let t = el.as_ref().unwrap().to_string().into_bytes(); 167 | 168 | // println!("buf: {}, tag: {}", buf.len(), t.len()); 169 | if buf.len() < t.len() + 3 { 170 | return Err(()) 171 | } 172 | 173 | let b = match buf[1] as char { 174 | '/' => buf[2..2+t.len()].to_vec(), 175 | _ => buf[..t.len()].to_vec() 176 | }; 177 | 178 | Ok(t == b) 179 | } 180 | 181 | // Read data until next inline element 182 | fn _state_read_data(buf: &Vec, tag: &mut ParserTag, state: &mut ParserState, handler: &mut dyn IsParser) -> usize { 183 | let mut processed = 0; 184 | 185 | for b in buf { 186 | match *b as char { 187 | // Found begin of new tag which means we have read 188 | // available data 189 | '<' => { 190 | if tag.data.len() > 0 { 191 | handler.handle_data(&tag.data); 192 | } 193 | *state = ParserState::FindParserTag; 194 | break; 195 | }, 196 | 197 | // '\r' | '\n' => { 198 | // processed += 1; 199 | // }, 200 | 201 | _ => { 202 | tag.data.push(*b); 203 | processed += 1; 204 | } 205 | } 206 | } 207 | 208 | return processed; 209 | } 210 | 211 | // Read data until closing element, eg 212 | fn _state_read_raw_data(buf: &Vec, tag: &mut ParserTag, state: &mut ParserState, handler: &mut dyn IsParser) -> usize { 
213 | let mut processed = 0; 214 | 215 | for b in buf { 216 | 217 | match *b as char { 218 | 219 | // Found possible begin of closing tag 220 | '<' => { 221 | 222 | match Parser::_is_element(&buf[processed..].to_vec(), &tag.id) { 223 | Err(_) => break, 224 | Ok(x) => { 225 | match x { 226 | true => { 227 | // Found closing tag, lets handle data 228 | if tag.data.len() > 0 { 229 | handler.handle_data(&tag.data); 230 | } 231 | *state = ParserState::FindParserTag; 232 | break; 233 | }, 234 | false => { 235 | // Does not match, just byte add to data 236 | tag.data.push(*b); 237 | processed += 1; 238 | } 239 | } 240 | } 241 | } 242 | }, 243 | 244 | _ => { 245 | tag.data.push(*b); 246 | processed += 1; 247 | } 248 | } 249 | } 250 | 251 | return processed; 252 | } 253 | 254 | 255 | fn _state_read_attribute_value(buf: &Vec, tag: &mut ParserTag, state: &mut ParserState, handler: &mut dyn IsParser ) -> usize { 256 | let mut processed = 0; 257 | for b in buf { 258 | match *b as char { 259 | '>' => { 260 | // pop last attribute if it is an empty placeholder 261 | if tag.attributes.last().as_ref().unwrap().name() == "" { 262 | tag.attributes.pop(); 263 | } 264 | 265 | tag.id = Some(tag.name 266 | .to_lowercase() 267 | .parse::().unwrap()); 268 | 269 | *state = match tag.closing { 270 | false => { 271 | handler.handle_starttag(tag.id.as_ref().unwrap(), &tag.attributes); 272 | match *tag.id.as_ref().unwrap() { 273 | Tag::SCRIPT => ParserState::ReadRawData, 274 | Tag::STYLE => ParserState::ReadRawData, 275 | _ => ParserState::ReadData, 276 | } 277 | }, 278 | true => { 279 | handler.handle_endtag(tag.id.as_ref().unwrap()); 280 | ParserState::ReadData 281 | } 282 | }; 283 | tag.attributes.clear(); 284 | processed += 1; 285 | break; 286 | }, 287 | 288 | '"' | '\'' => { 289 | 290 | let have_value = !(tag.attributes.last().as_mut().unwrap().value() == ""); 291 | 292 | if have_value && *b != ' ' as u8 { 293 | 294 | { 295 | // Trim " and ' from attribute value 296 | let ref mut value = 
tag.attributes.last_mut().unwrap().value; 297 | if value.len() != 0 && (value[0] == '\'' as u8 || value[0] == '"' as u8) { 298 | *value = value[1..].to_vec(); 299 | } 300 | 301 | if value.len() != 0 && (value[value.len() - 1] == '\'' as u8 || value[value.len() - 1] == '"' as u8) { 302 | *value = value[..value.len() - 1].to_vec(); 303 | } 304 | 305 | } 306 | 307 | tag.attributes.push(Attribute{name: Vec::new(), value: Vec::new()}); 308 | *state = ParserState::ReadAttributeName; 309 | processed += 1; 310 | break; 311 | 312 | } else { 313 | 314 | let ref mut value = tag.attributes.last_mut().unwrap().value; 315 | value.push(*b); 316 | processed += 1; 317 | } 318 | }, 319 | 320 | ' ' => { 321 | 322 | let (have_value, is_quoted) = { 323 | let ref mut value = tag.attributes.last_mut().unwrap().value; 324 | match value.is_empty() { 325 | true => (false, false), 326 | false => (true, (value[0] == '"' as u8 || value[0] == '\'' as u8)), 327 | } 328 | }; 329 | 330 | if have_value && !is_quoted { 331 | 332 | tag.attributes.push(Attribute{name: Vec::new(), value: Vec::new()}); 333 | *state = ParserState::ReadAttributeName; 334 | processed += 1; 335 | break; 336 | 337 | } else { 338 | 339 | let ref mut value = tag.attributes.last_mut().unwrap().value; 340 | value.push(*b); 341 | processed += 1; 342 | } 343 | } 344 | 345 | _ => { 346 | let ref mut value = tag.attributes.last_mut().unwrap().value; 347 | value.push(*b); 348 | processed += 1; 349 | } 350 | } 351 | } 352 | return processed; 353 | } 354 | 355 | fn _state_read_attribute_name(buf: &Vec, tag: &mut ParserTag, state: &mut ParserState, handler: &mut dyn IsParser) -> usize { 356 | let mut processed = 0; 357 | 358 | for b in buf { 359 | 360 | match *b as char { 361 | 362 | // Found closing, lets finish up 363 | '>' | '/' => { 364 | 365 | { 366 | if tag.attributes.last().unwrap().name.is_empty() { 367 | // pop last attribute if it is an empty placeholder 368 | tag.attributes.pop(); 369 | } 370 | } 371 | 372 | match 
tag.name.to_lowercase().parse::() { 373 | Ok(x) => tag.id = Some(x), 374 | Err(_) => panic!("Failed to parse element '{}' to enum", tag.name), 375 | } 376 | 377 | *state = match tag.closing { 378 | false => { 379 | handler.handle_starttag(tag.id.as_ref().unwrap(), &tag.attributes); 380 | match *tag.id.as_ref().unwrap() { 381 | Tag::SCRIPT => ParserState::ReadRawData, 382 | _ => ParserState::ReadData, 383 | } 384 | }, 385 | true => { 386 | handler.handle_endtag(tag.id.as_ref().unwrap()); 387 | ParserState::ReadData 388 | } 389 | }; 390 | 391 | processed += 1; 392 | break; 393 | }, 394 | 395 | '=' => { 396 | *state = ParserState::ReadAttributeValue; 397 | processed += 1; 398 | break; 399 | }, 400 | 401 | ' ' => { 402 | if !tag.attributes.last().unwrap().name.is_empty() { 403 | tag.attributes.push(Attribute{name: Vec::new(), value: Vec::new()}); 404 | *state = ParserState::ReadAttributeName; 405 | } 406 | 407 | processed += 1; 408 | }, 409 | 410 | '\n' | '\r' | '\t' => { 411 | processed += 1; 412 | }, 413 | 414 | _ => { 415 | let ref mut name = tag.attributes.last_mut().unwrap().name; 416 | name.push(*b); 417 | processed += 1; 418 | } 419 | } 420 | } 421 | 422 | return processed; 423 | } 424 | 425 | /// Parse a HTML document and call handlers. 
426 | pub fn parse(source: &mut dyn BufRead, handler: &mut dyn IsParser) -> Result { 427 | let mut total_parsed = 0; 428 | let mut state = ParserState::FindParserTag; 429 | 430 | let mut tag = ParserTag { 431 | name: "".to_string(), 432 | id: None, 433 | closing: false, 434 | data: Vec::new(), 435 | attributes: Vec::new() 436 | }; 437 | 438 | 439 | let mut buf = Vec::new(); 440 | let mut end_of_file = false; 441 | loop { 442 | 443 | // If buffer is low and there is still data to be read, read block 444 | if !end_of_file && buf.len() < 64 { 445 | let mut block = [0; 2048]; 446 | let bytes_read = match source.read(&mut block[..]) { 447 | Ok(x) => x, 448 | Err(x) => return Err(x) 449 | }; 450 | 451 | match bytes_read { 452 | 0 => end_of_file = true, 453 | _ => { 454 | let size = buf.len(); 455 | buf.extend_from_slice(&block); 456 | buf = buf[..size+bytes_read].to_vec(); 457 | } 458 | }; 459 | } 460 | 461 | // Break out of loop if buffer is empty 462 | if buf.len() == 0 { 463 | break; 464 | } 465 | 466 | loop { 467 | let processed = match state { 468 | ParserState::FindParserTag => Parser::_state_find_tag(&buf, &mut tag, &mut state), 469 | ParserState::SkipComment => Parser::_state_skip_comment(&buf, &mut state), 470 | ParserState::ReadParserTagName => Parser::_state_read_tag_name(&buf, &mut tag, &mut state), 471 | ParserState::ReadData => Parser::_state_read_data(&buf, &mut tag, &mut state, handler), 472 | ParserState::ReadRawData => Parser::_state_read_raw_data(&buf, &mut tag, &mut state, handler), 473 | ParserState::ReadAttributeName => Parser::_state_read_attribute_name(&buf, &mut tag, &mut state, handler), 474 | ParserState::ReadAttributeValue => Parser::_state_read_attribute_value(&buf, &mut tag, &mut state, handler), 475 | }; 476 | 477 | if processed == 0 { 478 | break; 479 | } 480 | 481 | buf.drain(..processed); 482 | total_parsed += processed; 483 | 484 | if buf.len() == 0 { 485 | break; 486 | } 487 | } 488 | } 489 | 490 | Ok(total_parsed) 491 | } 492 | } 493 
| 494 | #[cfg(test)] 495 | mod tests { 496 | use attribute::Attribute; 497 | use tag::Tag; 498 | use parser::{IsParser}; 499 | use std::io::BufReader; 500 | 501 | struct TestTag { 502 | tag: Tag, 503 | attributes: Vec 504 | } 505 | 506 | struct Dummy { 507 | starttag: Vec, 508 | endtag: Vec, 509 | data: Vec>, 510 | } 511 | 512 | impl Dummy { 513 | pub fn new() -> Dummy { 514 | Dummy{ 515 | starttag: Vec::new(), 516 | endtag: Vec::new(), 517 | data: Vec::new(), 518 | } 519 | } 520 | } 521 | 522 | impl IsParser for Dummy { 523 | fn handle_starttag(self: &mut Self, tag: &Tag, attributes: &Vec) { 524 | self.starttag.push(TestTag{ 525 | tag: tag.clone(), 526 | attributes: attributes.clone(), 527 | }); 528 | } 529 | 530 | fn handle_endtag(self: &mut Self, tag: &Tag) { 531 | self.endtag.push(TestTag{tag: tag.clone(), attributes: Vec::new()}); 532 | } 533 | 534 | fn handle_data(self: &mut Self, data: &Vec) { 535 | self.data.push(data.clone()); 536 | } 537 | } 538 | 539 | 540 | #[test] 541 | fn parse_empty_document() { 542 | let mut p = Dummy::new(); 543 | let data = b""; 544 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 0); 545 | } 546 | 547 | #[test] 548 | fn parse_simple_document() { 549 | let mut p = Dummy::new(); 550 | let data = b"Simple Example

A simple doc

This is a simple html document, as short and simple it can get.

"; 551 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 172); 552 | assert_eq!(p.starttag.len(), 6); 553 | assert_eq!(p.endtag.len(), 6); 554 | assert_eq!(p.data.len(), 3); 555 | assert_eq!(p.starttag[5].attributes.len(), 1); 556 | assert_eq!(p.starttag[0].tag, Tag::HTML); 557 | assert_eq!(p.starttag[1].tag, Tag::HEAD); 558 | assert_eq!(p.starttag[2].tag, Tag::TITLE); 559 | assert_eq!(p.starttag[3].tag, Tag::BODY); 560 | assert_eq!(p.starttag[4].tag, Tag::H1); 561 | assert_eq!(p.starttag[5].tag, Tag::P); 562 | } 563 | 564 | #[test] 565 | fn parse_document_with_inline_comment() { 566 | let mut p = Dummy::new(); 567 | let data = b"Simple<!-- title --> Example

A simple doc

This is a simple html document, as short and simple it can get.

"; 568 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 186); 569 | assert_eq!(p.starttag.len(), 6); 570 | assert_eq!(p.endtag.len(), 6); 571 | assert_eq!(p.data.len(), 4); 572 | assert_eq!(p.starttag[5].attributes.len(), 1); 573 | assert_eq!(String::from_utf8(p.data[0].clone()).unwrap(), "Simple"); 574 | assert_eq!(String::from_utf8(p.data[1].clone()).unwrap(), " Example"); 575 | } 576 | 577 | #[test] 578 | fn parse_document_with_comment() { 579 | let mut p = Dummy::new(); 580 | let data = b" Simple Example

A simple doc

This is a simple html document, as short and simple it can get.

"; 581 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 194); 582 | assert_eq!(p.starttag.len(), 6); 583 | assert_eq!(p.endtag.len(), 6); 584 | assert_eq!(p.data.len(), 5); 585 | assert_eq!(p.starttag[5].attributes.len(), 1); 586 | } 587 | 588 | #[test] 589 | fn parse_tag_with_utf8_data() { 590 | let mut p = Dummy::new(); 591 | let data = "

💖

".to_string().into_bytes(); 592 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 11); 593 | assert_eq!(String::from_utf8(p.data[0].clone()).unwrap(), "💖"); 594 | } 595 | 596 | #[test] 597 | fn parse_attribute_with_utf8_value() { 598 | let mut p = Dummy::new(); 599 | let data = "

Sparkle heart

".to_string().into_bytes(); 600 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 30); 601 | assert_eq!(String::from_utf8(p.data[0].clone()).unwrap(), "Sparkle heart"); 602 | assert_eq!(p.starttag[0].attributes[0].name(), "id"); 603 | assert_eq!(p.starttag[0].attributes[0].value(), "💖"); 604 | } 605 | 606 | #[test] 607 | fn parse_tag_with_one_attribute_without_qouted_value() { 608 | let mut p = Dummy::new(); 609 | let data = b"

Hello world

"; 610 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 23); 611 | assert_eq!(p.starttag[0].attributes[0].name(), "id"); 612 | assert_eq!(p.starttag[0].attributes[0].value(), "1"); 613 | } 614 | 615 | #[test] 616 | fn parse_tag_with_one_attribute_with_doubleqouted_value() { 617 | let mut p = Dummy::new(); 618 | let data = b"

Hello world

"; 619 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 25); 620 | assert_eq!(p.starttag[0].attributes[0].name(), "id"); 621 | assert_eq!(p.starttag[0].attributes[0].value(), "1"); 622 | } 623 | 624 | #[test] 625 | fn parse_tag_with_one_attribute_with_singleqouted_value() { 626 | let mut p = Dummy::new(); 627 | let data = b"

Hello world

"; 628 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 25); 629 | assert_eq!(p.starttag[0].attributes[0].name(), "id"); 630 | assert_eq!(p.starttag[0].attributes[0].value(), "1"); 631 | } 632 | 633 | #[test] 634 | fn parse_tag_with_one_attribute_doubleqouted_with_space_in_value() { 635 | let mut p = Dummy::new(); 636 | let data = b"

Hello world

"; 637 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 37); 638 | assert_eq!(p.starttag[0].attributes[0].name(), "class"); 639 | assert_eq!(p.starttag[0].attributes[0].value(), "info error"); 640 | } 641 | 642 | #[test] 643 | fn parse_tag_with_one_attribute_singlequoted_with_space_in_value() { 644 | let mut p = Dummy::new(); 645 | let data = b"

Hello world

"; 646 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 37); 647 | assert_eq!(p.starttag[0].attributes[0].name(), "class"); 648 | assert_eq!(p.starttag[0].attributes[0].value(), "info error"); 649 | } 650 | 651 | #[test] 652 | fn parse_tag_with_one_attribute_with_space_ending_value() { 653 | let mut p = Dummy::new(); 654 | let data = b"

Hello world

"; 655 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 37); 656 | assert_eq!(p.starttag[0].attributes[0].name(), "id"); 657 | assert_eq!(p.starttag[0].attributes[0].value(), "test"); 658 | assert_eq!(p.starttag[0].attributes[1].name(), "class"); 659 | assert_eq!(p.starttag[0].attributes[1].value(), "info"); 660 | } 661 | 662 | #[test] 663 | fn parse_tag_with_one_attribute_with_space_ending_value2() { 664 | let mut p = Dummy::new(); 665 | let data = b"

Hello world

"; 666 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 27); 667 | assert_eq!(p.starttag[0].attributes[0].name(), "id"); 668 | assert_eq!(p.starttag[0].attributes[0].value(), "test"); 669 | } 670 | 671 | #[test] 672 | fn parse_tag_with_two_attribute() { 673 | let mut p = Dummy::new(); 674 | let data = b"

Hello world

"; 675 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 41); 676 | assert_eq!(p.starttag[0].attributes[0].name(), "id"); 677 | assert_eq!(p.starttag[0].attributes[0].value(), "myid"); 678 | assert_eq!(p.starttag[0].attributes[1].name(), "class"); 679 | assert_eq!(p.starttag[0].attributes[1].value(), "info"); 680 | } 681 | 682 | #[test] 683 | fn parse_tag_with_two_attribute_separated_with_lf() { 684 | let mut p = Dummy::new(); 685 | let data = b"

Hello world

"; 686 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 44); 687 | assert_eq!(p.starttag[0].attributes[0].name(), "id"); 688 | assert_eq!(p.starttag[0].attributes[0].value(), "myid"); 689 | assert_eq!(p.starttag[0].attributes[1].name(), "class"); 690 | assert_eq!(p.starttag[0].attributes[1].value(), "info"); 691 | } 692 | 693 | #[test] 694 | fn parse_tag_with_one_boolean_attribute() { 695 | let mut p = Dummy::new(); 696 | let data = b""; 697 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 37); 698 | assert_eq!(p.starttag[0].attributes[0].name(), "selected"); 699 | assert_eq!(p.starttag[0].attributes[0].is_boolean(), true); 700 | } 701 | 702 | #[test] 703 | fn parse_tag_with_one_boolean_attribute_with_space_ending() { 704 | let mut p = Dummy::new(); 705 | let data = b""; 706 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 38); 707 | assert_eq!(p.starttag[0].attributes[0].name(), "selected"); 708 | assert_eq!(p.starttag[0].attributes[0].is_boolean(), true); 709 | } 710 | 711 | #[test] 712 | fn parse_tag_with_two_attribute_were_first_is_boolean_attribute() { 713 | let mut p = Dummy::new(); 714 | let data = b""; 715 | assert_eq!(::Parser::parse(&mut BufReader::new(&data[..]), &mut p).unwrap(), 47); 716 | assert_eq!(p.starttag[0].attributes[0].name(), "selected"); 717 | assert_eq!(p.starttag[0].attributes[0].is_boolean(), true); 718 | assert_eq!(p.starttag[0].attributes[1].name(), "id"); 719 | assert_eq!(p.starttag[0].attributes[1].value(), "myid"); 720 | } 721 | } 722 | 723 | --------------------------------------------------------------------------------