",
9 | {
10 | "name": "div",
11 | "variant": "normal",
12 | "children": [
13 | {
14 | "name": "div",
15 | "variant": "normal",
16 | "children": [
17 | {
18 | "name": "div",
19 | "variant": "normal"
20 | }
21 | ]
22 | }
23 | ]
24 | }
25 | ]
26 | }
27 |
--------------------------------------------------------------------------------
/tests/snapshots/element__it_can_parse_nested_elements_mixed_children.snap:
--------------------------------------------------------------------------------
1 | ---
2 | source: tests/element.rs
3 | expression: dom
4 | ---
5 | {
6 | "treeType": "documentFragment",
7 | "children": [
8 | {
9 | "name": "div",
10 | "variant": "normal",
11 | "children": [
12 | "comment",
13 | {
14 | "name": "div",
15 | "variant": "void"
16 | },
17 | "\n Hello\n ",
18 | {
19 | "name": "div",
20 | "variant": "normal",
21 | "children": [
22 | "\n World\n "
23 | ]
24 | }
25 | ]
26 | }
27 | ]
28 | }
29 |
--------------------------------------------------------------------------------
/examples/get_all_href/main.rs:
--------------------------------------------------------------------------------
1 | use html_parser::{Dom, Node, Result};
2 |
3 | // This example illustrates how to use the library to get all of the anchor-hrefs from a document.
4 | // Anchors that carry no `href` value are skipped instead of aborting the program.
5 | fn main() -> Result<()> {
6 |     let html = include_str!("./index.html");
7 |     let dom = Dom::parse(html)?;
8 |     let iter = dom.children.get(0).unwrap().into_iter();
9 |     // Look `href` up with `get`: indexing the map panics on an anchor that has no such key.
10 |     let hrefs = iter.filter_map(|item| match item {
11 |         Node::Element(ref element) if element.name == "a" => element.attributes.get("href").cloned().flatten(),
12 |         _ => None,
13 |     });
14 |
15 |     println!("\nThe following links were found:");
16 |     for (index, href) in hrefs.enumerate() {
17 |         println!("{}: {}", index + 1, href)
18 |     }
19 |
20 |     Ok(())
21 | }
22 |
--------------------------------------------------------------------------------
/tests/snapshots/svg__it_can_parse_svg.snap:
--------------------------------------------------------------------------------
1 | ---
2 | source: tests/svg.rs
3 | expression: dom
4 | ---
5 | {
6 | "treeType": "documentFragment",
7 | "children": [
8 | {
9 | "name": "svg",
10 | "variant": "normal",
11 | "attributes": {
12 | "xmlns": "http://www.w3.org/2000/svg",
13 | "xmlns:xlink": "http://www.w3.org/1999/xlink"
14 | },
15 | "children": [
16 | {
17 | "name": "rect",
18 | "variant": "void",
19 | "attributes": {
20 | "height": "100",
21 | "style": "stroke:#ff0000; fill: #0000ff",
22 | "width": "100",
23 | "x": "10",
24 | "y": "10"
25 | }
26 | }
27 | ]
28 | }
29 | ]
30 | }
31 |
--------------------------------------------------------------------------------
/src/dom/span.rs:
--------------------------------------------------------------------------------
1 | use serde::{Serialize};
2 |
3 | /// Span of the information in the parsed source.
4 | #[derive(Debug, Default, Clone, Serialize, PartialEq)]
5 | #[serde(rename_all = "camelCase")] // fields serialize as `startLine`, `endColumn`, ...
6 | pub struct SourceSpan {
7 | pub text: String, // presumably the slice of source covered by the span — confirm against the parser
8 | pub start_line: usize, // NOTE(review): snapshot values suggest 1-based line numbers — confirm
9 | pub end_line: usize,
10 | pub start_column: usize, // NOTE(review): snapshot values suggest 1-based columns — confirm
11 | pub end_column: usize,
12 | }
13 |
14 | impl SourceSpan {
15 | pub fn new( // plain constructor: stores each argument in the field of the same name
16 | text: String,
17 | start_line: usize,
18 | end_line: usize,
19 | start_column: usize,
20 | end_column: usize,
21 | ) -> Self {
22 | Self { // no validation is performed, e.g. start/end ordering is not checked here
23 | text,
24 | start_line,
25 | end_line,
26 | start_column,
27 | end_column,
28 | }
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/tests/bin.rs:
--------------------------------------------------------------------------------
1 | use html_parser::Result;
2 | use indoc::indoc;
3 | use std::io::Write;
4 | use std::process::Command;
5 | use tempfile::NamedTempFile;
6 |
7 | #[test]
8 | fn it_prints_out_processing_error() -> Result<()> {
9 | let html = indoc!(
10 | r#"
11 |
12 |
13 | "#
14 | );
15 |
16 | let mut file = NamedTempFile::new()?;
17 | file.write_all(html.as_bytes())?;
18 |
19 | let output = Command::new("./target/debug/examples/simple_parser")
20 | .arg("-d")
21 | .arg(file.path())
22 | .output()
23 | .unwrap();
24 |
25 | let stdout = String::from_utf8(output.stdout).unwrap();
26 |
27 | assert!(stdout.starts_with("# Failed to create element at rule: el_process_instruct"));
28 | Ok(())
29 | }
30 |
--------------------------------------------------------------------------------
/tests/snapshots/text__it_can_parse_text_in_paragraph_with_weird_formatting.snap:
--------------------------------------------------------------------------------
1 | ---
2 | source: tests/text.rs
3 | expression: dom
4 | ---
5 | {
6 | "treeType": "documentFragment",
7 | "children": [
8 | {
9 | "name": "p",
10 | "variant": "normal",
11 | "children": [
12 | "\n This is a ",
13 | {
14 | "name": "b",
15 | "variant": "normal",
16 | "children": [
17 | "para"
18 | ]
19 | },
20 | "gra",
21 | {
22 | "name": "b",
23 | "variant": "normal",
24 | "children": [
25 | "ph"
26 | ]
27 | },
28 | " with some",
29 | {
30 | "name": "i",
31 | "variant": "normal",
32 | "children": [
33 | " weird "
34 | ]
35 | },
36 | " formatting.\n"
37 | ]
38 | }
39 | ]
40 | }
41 |
--------------------------------------------------------------------------------
/tests/node_iter.rs:
--------------------------------------------------------------------------------
1 | use html_parser::{Dom, Node, Result};
2 | use indoc::indoc;
3 |
4 | #[test]
5 | fn it_can_iter_1() -> Result<()> {
6 | let html = indoc! {"
7 |
8 |
9 |
title
10 |
11 |
12 |
17 |
18 |
19 | "};
20 | let dom = Dom::parse(&html)?;
21 | let root = dom.children.get(0).unwrap().into_iter();
22 | let num_li = root.into_iter().fold(0, |mut acc, curr| match curr {
23 | Node::Element(ref e) => {
24 | if e.name == "li" {
25 | acc += 1;
26 | }
27 | acc
28 | }
29 | _ => acc,
30 | });
31 | assert_eq!(num_li, 3);
32 | Ok(())
33 | }
34 |
--------------------------------------------------------------------------------
/tests/output.rs:
--------------------------------------------------------------------------------
1 | use html_parser::{Dom, Result};
2 | use indoc::indoc;
3 | use insta::assert_json_snapshot;
4 |
5 | #[test]
6 | fn it_can_output_json() -> Result<()> {
7 | assert!(Dom::parse("
")?.to_json().is_ok());
8 | Ok(())
9 | }
10 |
11 | #[test]
12 | fn it_can_output_json_pretty() -> Result<()> {
13 | assert!(Dom::parse("
")?.to_json_pretty().is_ok());
14 | Ok(())
15 | }
16 |
17 | #[test]
18 | fn it_can_output_complex_html_as_json() -> Result<()> {
19 | let html = indoc!(
20 | "
21 |
22 |
Här kan man va
23 |
24 |
25 |
Tjena världen!
26 |
Tänkte bara informera om att Sverige är bättre än Finland i ishockey.
27 |
28 | "
29 | );
30 | let dom = Dom::parse(html)?;
31 | assert_json_snapshot!(dom);
32 | Ok(())
33 | }
34 |
--------------------------------------------------------------------------------
/tests/snapshots/element__it_can_parse_script_with_content.snap:
--------------------------------------------------------------------------------
1 | ---
2 | source: tests/element.rs
3 | expression: dom
4 | ---
5 | {
6 | "treeType": "documentFragment",
7 | "children": [
8 | {
9 | "name": "script",
10 | "variant": "normal",
11 | "children": [
12 | "const person_creator = ({ name, symtoms }) => {\n let person = {}\n person.name = name\n person.symtoms = {}\n for (symtom of symtoms) {\n person.symtoms[symtom] = true\n }\n return person\n }\n \n const main = () => {\n let name = 'mathias'\n let symtoms = ['Dunning-Kruger', 'ACDC', 'Slacker']\n \n setTimeout(() => {\n let person = person_creator({ name, symtoms })\n if (person.symtoms.hasOwnProperty('Dunning-Kruger')) {\n console.log('yeah buddy, that\\'s right')\n }\n }, 1337)\n }\n \n main()"
13 | ]
14 | }
15 | ]
16 | }
17 |
--------------------------------------------------------------------------------
/tests/document.rs:
--------------------------------------------------------------------------------
1 | use html_parser::{Dom, Result};
2 | use indoc::indoc;
3 | use insta::assert_json_snapshot;
4 |
5 | #[test]
6 | fn it_can_parse_minimal_document() -> Result<()> {
7 | let html = "";
8 | let dom = Dom::parse(html)?;
9 | assert_json_snapshot!(dom);
10 | Ok(())
11 | }
12 | #[test]
13 | fn it_can_parse_document_with_comments() -> Result<()> {
14 | let html = indoc!(
15 | r#"
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 | "#
27 | );
28 | let dom = Dom::parse(html)?;
29 | assert_json_snapshot!(dom);
30 | Ok(())
31 | }
32 | #[test]
33 | fn it_error_when_doctype_and_multiple_html() {
34 | let html = "";
35 | assert!(Dom::parse(html).is_err());
36 | }
37 |
--------------------------------------------------------------------------------
/tests/snapshots/element__it_can_deal_with_weird_whitespaces.snap:
--------------------------------------------------------------------------------
1 | ---
2 | source: tests/element.rs
3 | expression: dom
4 | ---
5 | {
6 | "treeType": "documentFragment",
7 | "children": [
8 | "Normal case",
9 | {
10 | "name": "div",
11 | "variant": "normal",
12 | "children": [
13 | " Text "
14 | ]
15 | },
16 | "Whitespaces in opening tag to the left",
17 | {
18 | "name": "div",
19 | "variant": "normal",
20 | "children": [
21 | " Text "
22 | ]
23 | },
24 | "Whitespaces in opening tag to the right",
25 | {
26 | "name": "div",
27 | "variant": "normal",
28 | "children": [
29 | " Text "
30 | ]
31 | },
32 | "Whitespaces in closing tag to the left (should not work)",
33 | "
Text < /div>\n\n",
34 | "Whitespaces in closing tag to the right",
35 | {
36 | "name": "div",
37 | "variant": "normal",
38 | "children": [
39 | " Text "
40 | ]
41 | },
42 | "Whitespaces everywhere (should not work)",
43 | "< div > Text < / div >\n"
44 | ]
45 | }
46 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Mathias Iversen
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "html_parser"
3 | version = "0.7.0"
4 | authors = ["Mathias Iversen"]
5 | edition = "2018"
6 | repository = "https://github.com/mathiversen/html-parser"
7 | license = "MIT"
8 | description = "A simple and general purpose html/xhtml parser"
9 | keywords = ["html", "parser", "json", "pest", "dom"]
10 | categories = ["parsing", "web-programming"]
11 | readme = "README.md"
12 |
13 | [dependencies]
14 | pest = "2.5.7"
15 | pest_derive = "2.5.7"
16 | thiserror = "1.0.40"
17 | serde = { version = "1.0.159", features = ["derive"] }
18 | serde_derive = "1.0.159"
19 | serde_json = "1.0.95"
20 | doc-comment = "0.3.3"
21 |
22 | [dev-dependencies]
23 | indoc = "2.0.1"
24 | insta = { version = "1.29.0", features = ["json"]}
25 | tempfile = "3.5.0"
26 | criterion = "0.4.0"
27 | reqwest = { version = "0.11.16", features = ["blocking", "rustls-tls"] }
28 | clap = { version = "4.2.1", features = ["derive"] }
29 |
30 | [[example]]
31 | name = "get_all_href"
32 | path = "examples/get_all_href/main.rs"
33 |
34 | [[example]]
35 | name = "simple_parser"
36 | path = "examples/simple_parser/main.rs"
37 |
38 | [[bench]]
39 | name = "bench_wikipedia"
40 | harness = false
41 |
--------------------------------------------------------------------------------
/tests/document_fragment.rs:
--------------------------------------------------------------------------------
1 | use html_parser::{Dom, Result};
2 | use insta::assert_json_snapshot;
3 |
4 | #[test]
5 | fn it_can_parse_single_div_as_fragment() -> Result<()> {
6 | let html = "
";
7 | let dom = Dom::parse(html)?;
8 | assert_json_snapshot!(dom);
9 | Ok(())
10 | }
11 | #[test]
12 | fn it_can_parse_single_text_as_fragment() -> Result<()> {
13 | let html = "hello";
14 | let dom = Dom::parse(html)?;
15 | assert_json_snapshot!(dom);
16 | Ok(())
17 | }
18 | #[test]
19 | fn it_can_parse_text_comment_element_as_fragment() -> Result<()> {
20 | let html = "hello
";
21 | let dom = Dom::parse(html)?;
22 | assert_json_snapshot!(dom);
23 | Ok(())
24 | }
25 | #[test]
26 | fn it_error_when_body_is_used_in_fragment_root() {
27 | let html = "
";
28 | assert!(Dom::parse(html).is_err());
29 | }
30 | #[test]
31 | fn it_error_when_head_is_used_in_fragment_root() {
32 | let html = "
";
33 | assert!(Dom::parse(html).is_err());
34 | }
35 | #[test]
36 | fn it_error_when_html_is_used_in_fragment_root() {
37 | let html = "
";
38 | assert!(Dom::parse(html).is_err());
39 | }
40 |
--------------------------------------------------------------------------------
/tests/snapshots/output__it_can_output_complex_html_as_json.snap:
--------------------------------------------------------------------------------
1 | ---
2 | source: tests/output.rs
3 | expression: dom
4 | ---
5 | {
6 | "treeType": "document",
7 | "children": [
8 | {
9 | "name": "html",
10 | "variant": "normal",
11 | "attributes": {
12 | "lang": "sv"
13 | },
14 | "children": [
15 | {
16 | "name": "head",
17 | "variant": "normal",
18 | "children": [
19 | {
20 | "name": "title",
21 | "variant": "normal",
22 | "children": [
23 | "Här kan man va"
24 | ]
25 | }
26 | ]
27 | },
28 | {
29 | "name": "body",
30 | "variant": "normal",
31 | "children": [
32 | {
33 | "name": "h1",
34 | "variant": "normal",
35 | "children": [
36 | "Tjena världen!"
37 | ]
38 | },
39 | {
40 | "name": "p",
41 | "variant": "normal",
42 | "children": [
43 | "Tänkte bara informera om att Sverige är bättre än Finland i ishockey."
44 | ]
45 | }
46 | ]
47 | }
48 | ]
49 | }
50 | ]
51 | }
52 |
--------------------------------------------------------------------------------
/src/dom/formatting.rs:
--------------------------------------------------------------------------------
1 | use crate::error::Error;
2 | use crate::Result;
3 | use crate::Rule;
4 | use pest::error::Error as PestError;
5 |
6 | /// Turns a pest parsing error into this crate's `Error::Parsing`, renaming grammar rules to readable labels.
7 | /// It lives apart from the core parser logic so that file is easier to read; it always returns `Err`.
8 | pub fn error_msg(error: PestError) -> Result { // NOTE(review): generic arguments look lost in extraction — confirm the real signature
9 | let message = error.renamed_rules(|rule| match *rule {
10 | Rule::EOI => "end of input".to_string(),
11 | Rule::doctype => "doctype element".to_string(),
12 | Rule::node_text => "text node".to_string(),
13 | Rule::node_element => "element node".to_string(),
14 | Rule::el_void => "void element".to_string(),
15 | Rule::el_void_xml => "void element with xml ending (/>)".to_string(),
16 | Rule::el_process_instruct => "xml processing instruction".to_string(),
17 | Rule::el_raw_text => "element with raw text (style or script)".to_string(),
18 | Rule::el_normal => "normal element".to_string(),
19 | Rule::el_dangling => "".to_string(), // dangling elements are given an empty label
20 | Rule::attr => "attribute (key=\"value\")".to_string(),
21 | Rule::attr_key => "attribute key".to_string(),
22 | Rule::attr_value => "attribute value".to_string(),
23 | Rule::el_name => "element name".to_string(),
24 | Rule::el_void_name_html => "void element name".to_string(),
25 | // TODO: Continue with this
26 | x => format!("{:?} ", x), // rules without a label fall back to their Debug name plus a trailing space
27 | });
28 | Err(Error::Parsing(message.to_string())) // the renamed pest error, rendered as a string
29 | }
30 |
--------------------------------------------------------------------------------
/tests/text.rs:
--------------------------------------------------------------------------------
1 | use html_parser::{Dom, Result};
2 | use indoc::indoc;
3 | use insta::assert_json_snapshot;
4 |
5 | #[test]
6 | fn it_can_parse_document_with_just_text() -> Result<()> {
7 | let html = "hello world";
8 | let dom = Dom::parse(html)?;
9 | assert_json_snapshot!(dom);
10 | Ok(())
11 | }
12 |
13 | #[test]
14 | fn it_can_parse_document_with_text_and_line_breaks() -> Result<()> {
15 | let html = indoc!(
16 | r"
17 | hello world
18 | here's another line for you!
19 | The end
20 | "
21 | );
22 | let dom = Dom::parse(html)?;
23 | assert_json_snapshot!(dom);
24 | Ok(())
25 | }
26 |
27 | #[test]
28 | fn it_can_parse_document_with_multiple_text_elements() -> Result<()> {
29 | let html = indoc!(
30 | r"
31 | hello world
32 | here's another line for you!
33 |
34 | The end
35 | "
36 | );
37 | let dom = Dom::parse(html)?;
38 | assert_json_snapshot!(dom);
39 | Ok(())
40 | }
41 |
42 | #[test]
43 | fn it_can_parse_text_with_chevron() -> Result<()> {
44 | let html = indoc!(r"hello <> world");
45 | let dom = Dom::parse(html)?;
46 | assert_json_snapshot!(dom);
47 | Ok(())
48 | }
49 |
50 | #[test]
51 | fn it_can_parse_text_in_paragraph_with_weird_formatting() -> Result<()> {
52 | let html = indoc!(r"
53 |
54 | This is a para graph with some weird formatting.
55 |
56 | ");
57 | let dom = Dom::parse(html)?;
58 | assert_json_snapshot!(dom);
59 | Ok(())
60 | }
61 |
--------------------------------------------------------------------------------
/examples/simple_parser/main.rs:
--------------------------------------------------------------------------------
1 | use clap::Parser;
2 | use html_parser::{Dom, Result};
3 | use std::{
4 | fs::File,
5 | io::{self, Read},
6 | path::PathBuf,
7 | };
8 |
9 | #[derive(Debug, Parser)]
10 | /// A simple and general purpose html/xhtml parser.
11 | struct Opt {
12 |     #[arg(short, long)]
13 |     /// Pretty-print the output.
14 |     pretty_print: bool,
15 |
16 |     #[arg(short, long)]
17 |     /// Debug the parser, this will print errors to the console.
18 |     debug: bool,
19 |
20 |     /// Path to the file, or stdin (piped content).
21 |     ///
22 |     /// This argument can either be a path to the html-file that you would like to parse or the
23 |     /// result of stdin. Note: Content over stdin needs to be finite, for now, as it is collected
24 |     /// into a string and then processed by the parser.
25 |     input: Option<PathBuf>,
26 | }
27 |
28 | fn main() -> Result<()> {
29 |     let opt = Opt::parse();
30 |
31 |     let mut content = String::with_capacity(100_000);
32 |
33 |     match opt.input {
34 |         // A path was supplied: read the html from that file.
35 |         Some(path) => {
36 |             let mut file = File::open(path)?;
37 |             file.read_to_string(&mut content)?;
38 |         }
39 |         // No path: read from stdin instead, which enables piping,
40 |         // ex: `cat index.html | html_parser`
41 |         None => {
42 |             let stdin = io::stdin();
43 |             stdin.lock().read_to_string(&mut content)?;
44 |         }
45 |     }
46 |
47 |     let dom = Dom::parse(&content)?;
48 |
49 |     // In debug mode, list every error the parser collected before printing the tree.
50 |     if opt.debug {
51 |         dom.errors.iter().for_each(|error| println!("# {}", error));
52 |     }
53 |
54 |     let json = if opt.pretty_print {
55 |         dom.to_json_pretty()?
56 |     } else {
57 |         dom.to_json()?
58 |     };
59 |     println!("{}", json);
60 |     Ok(())
61 | }
62 |
--------------------------------------------------------------------------------
/tests/svg.rs:
--------------------------------------------------------------------------------
1 | use html_parser::{Dom, Result};
2 | use indoc::indoc;
3 | use insta::assert_json_snapshot;
4 |
5 | #[test]
6 | fn it_can_parse_svg() -> Result<()> {
7 | let html = indoc!(
8 | r#"
9 |
10 |
11 |
12 | "#
13 | );
14 | let dom = Dom::parse(html)?;
15 | assert_json_snapshot!(dom);
16 | Ok(())
17 | }
18 |
19 | #[test]
20 | fn it_can_parse_complex_svg() {
21 | let svg = indoc!(
22 | r#"
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 | It's SVG!
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 | Sorry, your browser does not support inline SVG.
41 |
42 | "#
43 | );
44 | assert!(Dom::parse(&svg).is_ok());
45 | }
46 |
--------------------------------------------------------------------------------
/src/dom/element.rs:
--------------------------------------------------------------------------------
1 | use super::node::Node;
2 | use super::span::SourceSpan;
3 | use serde::{Serialize, Serializer};
4 | use std::collections::{BTreeMap, HashMap};
5 | use std::default::Default;
6 | use std::result::Result;
7 |
8 | /// Normal: `
` or Void: ` `and ` `
9 | #[derive(Debug, Clone, Serialize, PartialEq)]
10 | #[serde(rename_all = "camelCase")]
11 | // TODO: Align with: https://html.spec.whatwg.org/multipage/syntax.html#elements-2
12 | pub enum ElementVariant {
13 | /// A normal element can have children, ex:
.
14 | Normal,
15 | /// A void element can't have children, ex: and
16 | Void,
17 | }
18 |
19 | pub type Attributes = HashMap<String, Option<String>>; // attribute name -> value; presumably `None` when written without a value
20 |
21 | /// Most of the parsed html nodes are elements, except for text
22 | #[derive(Debug, Clone, Serialize, PartialEq)]
23 | #[serde(rename_all = "camelCase")]
24 | pub struct Element {
25 |     /// The id of the element (left out of the serialized output when absent)
26 |     #[serde(skip_serializing_if = "Option::is_none")]
27 |     pub id: Option<String>,
28 |
29 |     /// The name / tag of the element
30 |     pub name: String,
31 |
32 |     /// The element variant, if it is of type void or not
33 |     pub variant: ElementVariant,
34 |
35 |     /// All of the element's attributes, except id and class (serialized in key order)
36 |     #[serde(skip_serializing_if = "HashMap::is_empty")]
37 |     #[serde(serialize_with = "ordered_map")]
38 |     pub attributes: Attributes,
39 |
40 |     /// All of the element's classes
41 |     #[serde(skip_serializing_if = "Vec::is_empty")]
42 |     pub classes: Vec<String>,
43 |
44 |     /// All of the element's child nodes
45 |     #[serde(skip_serializing_if = "Vec::is_empty")]
46 |     pub children: Vec<Node>,
47 |
48 |     /// Span of the element in the parsed source (never serialized)
49 |     #[serde(skip)]
50 |     pub source_span: SourceSpan,
51 | }
52 |
53 | impl Default for Element {
54 |     fn default() -> Self { // an unnamed `Void` element with nothing attached and a default span
55 |         Element {
56 |             id: None,
57 |             name: String::new(),
58 |             variant: ElementVariant::Void,
59 |             classes: Vec::new(),
60 |             attributes: HashMap::new(),
61 |             children: Vec::new(),
62 |             source_span: Default::default(),
63 |         }
64 |     }
65 | }
66 |
67 | fn ordered_map(value: &Attributes, serializer: S) -> Result {
68 | let ordered: BTreeMap<_, _> = value.iter().collect();
69 | ordered.serialize(serializer)
70 | }
71 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Html parser
2 |
3 | A simple and general purpose html/xhtml parser lib/bin, using [Pest](https://pest.rs/).
4 |
5 | ## Features
6 |
7 | - Parse html & xhtml (not xml processing instructions)
8 | - Parse html-documents
9 | - Parse html-fragments
10 | - Parse empty documents
11 | - Parse with the same api for both documents and fragments
12 | - Parse custom, non-standard, elements; `<cat/>`, `<Cat/>` and `<C4-t/>`
13 | - Removes comments
14 | - Removes dangling elements
15 | - Iterate over all nodes in the dom tree
16 |
17 | ## What is it not
18 |
19 | - It's not a high-performance browser-grade parser
20 | - It's not suitable for html validation
21 | - It's not a parser that includes element selection or dom manipulation
22 |
23 | If your requirements match any of the above, then you're most likely looking for one of the crates below:
24 |
25 | - [html5ever](https://crates.io/crates/html5ever)
26 | - [kuchiki](https://crates.io/crates/kuchiki)
27 | - [scraper](https://crates.io/crates/scraper)
28 | - or other crates using the `html5ever` parser
29 |
30 | ## Examples bin
31 |
32 | Parse html file
33 |
34 | ```shell
35 | html_parser index.html
36 |
37 | ```
38 |
39 | Parse stdin with pretty output
40 |
41 | ```shell
42 | curl <website> | html_parser -p
43 | ```
44 |
45 | ## Examples lib
46 |
47 | Parse html document
48 |
49 | ```rust
50 | use html_parser::Dom;
51 |
52 | fn main() {
53 | let html = r#"
54 |
55 |
56 |
57 |
58 | Html parser
59 |
60 |
61 | Hello world
62 |
63 |
64 | "#;
65 |
66 | assert!(Dom::parse(html).is_ok());
67 | }
68 | ```
69 |
70 | Parse html fragment
71 |
72 | ```rust
73 | use html_parser::Dom;
74 |
75 | fn main() {
76 | let html = "
";
77 | assert!(Dom::parse(html).is_ok());
78 | }
79 | ```
80 |
81 | Print to json
82 |
83 | ```rust
84 | use html_parser::{Dom, Result};
85 |
86 | fn main() -> Result<()> {
87 | let html = "
";
88 | let json = Dom::parse(html)?.to_json_pretty()?;
89 | println!("{}", json);
90 | Ok(())
91 | }
92 | ```
93 |
--------------------------------------------------------------------------------
/tests/snapshots/element__it_can_parse_deeply_nested.snap:
--------------------------------------------------------------------------------
1 | ---
2 | source: tests/element.rs
3 | expression: dom
4 | ---
5 | {
6 | "treeType": "documentFragment",
7 | "children": [
8 | {
9 | "name": "div",
10 | "variant": "normal",
11 | "classes": [
12 | "1"
13 | ],
14 | "children": [
15 | {
16 | "name": "div",
17 | "variant": "normal",
18 | "classes": [
19 | "1"
20 | ],
21 | "children": [
22 | {
23 | "name": "div",
24 | "variant": "normal",
25 | "classes": [
26 | "1"
27 | ],
28 | "children": [
29 | {
30 | "name": "div",
31 | "variant": "normal",
32 | "classes": [
33 | "1"
34 | ],
35 | "children": [
36 | {
37 | "name": "div",
38 | "variant": "normal",
39 | "classes": [
40 | "1"
41 | ],
42 | "children": [
43 | {
44 | "name": "div",
45 | "variant": "normal",
46 | "classes": [
47 | "1"
48 | ],
49 | "children": [
50 | {
51 | "name": "div",
52 | "variant": "normal",
53 | "classes": [
54 | "1"
55 | ],
56 | "children": [
57 | {
58 | "name": "div",
59 | "variant": "normal",
60 | "classes": [
61 | "1"
62 | ],
63 | "children": [
64 | "this is deep",
65 | "hello world\n "
66 | ]
67 | }
68 | ]
69 | }
70 | ]
71 | }
72 | ]
73 | }
74 | ]
75 | }
76 | ]
77 | }
78 | ]
79 | }
80 | ]
81 | }
82 | ]
83 | }
84 |
--------------------------------------------------------------------------------
/tests/snapshots/source_span__it_can_generate_source_span.snap:
--------------------------------------------------------------------------------
1 | ---
2 | source: tests/source_span.rs
3 | expression: dom
4 | ---
5 | Dom {
6 | tree_type: DocumentFragment,
7 | children: [
8 | Element(
9 | Element {
10 | id: None,
11 | name: "template",
12 | variant: Normal,
13 | attributes: {},
14 | classes: [],
15 | children: [
16 | Element(
17 | Element {
18 | id: None,
19 | name: "h1",
20 | variant: Normal,
21 | attributes: {},
22 | classes: [],
23 | children: [
24 | Text(
25 | "Header",
26 | ),
27 | ],
28 | source_span: SourceSpan {
29 | text: "Header ",
30 | start_line: 2,
31 | end_line: 2,
32 | start_column: 5,
33 | end_column: 20,
34 | },
35 | },
36 | ),
37 | Element(
38 | Element {
39 | id: None,
40 | name: "p",
41 | variant: Normal,
42 | attributes: {},
43 | classes: [],
44 | children: [
45 | Text(
46 | "Paragraph",
47 | ),
48 | ],
49 | source_span: SourceSpan {
50 | text: "Paragraph
",
51 | start_line: 3,
52 | end_line: 3,
53 | start_column: 5,
54 | end_column: 21,
55 | },
56 | },
57 | ),
58 | ],
59 | source_span: SourceSpan {
60 | text: "\n Header \n Paragraph
\n ",
61 | start_line: 1,
62 | end_line: 4,
63 | start_column: 1,
64 | end_column: 12,
65 | },
66 | },
67 | ),
68 | ],
69 | errors: [],
70 | }
71 |
--------------------------------------------------------------------------------
/tests/websites.rs:
--------------------------------------------------------------------------------
1 | use html_parser::Dom;
2 | use indoc::indoc;
3 |
4 | #[test]
5 | fn it_can_parse_simple() {
6 | let html = indoc!(
7 | r#"
8 |
9 |
10 |
11 |
12 |
13 | Document
14 |
23 |
24 |
25 | Hello world
26 |
27 |
31 |
32 |
33 | "#
34 | );
35 | assert!(Dom::parse(html).is_ok());
36 | }
37 |
38 | #[test]
39 | fn it_can_parse_spotify() {
40 | let resp = reqwest::blocking::get("https://www.spotify.com/se")
41 | .unwrap()
42 | .text()
43 | .unwrap();
44 | assert!(Dom::parse(&resp).is_ok());
45 | }
46 |
47 | #[ignore]
48 | #[test]
49 | fn it_can_parse_facebook() {
50 | let resp = reqwest::blocking::get("https://www.facebook.com/")
51 | .unwrap()
52 | .text()
53 | .unwrap();
54 | assert!(Dom::parse(&resp).is_ok());
55 | }
56 |
57 | #[ignore]
58 | #[test]
59 | fn it_can_parse_amazon() {
60 | let resp = reqwest::blocking::get("https://www.amazon.com/")
61 | .unwrap()
62 | .text()
63 | .unwrap();
64 | assert!(Dom::parse(&resp).is_ok());
65 | }
66 |
67 | #[ignore]
68 | #[test]
69 | fn it_can_parse_apple() {
70 | let resp = reqwest::blocking::get("https://www.apple.com/")
71 | .unwrap()
72 | .text()
73 | .unwrap();
74 | assert!(Dom::parse(&resp).is_ok());
75 | }
76 |
77 | #[ignore]
78 | #[test]
79 | fn it_can_parse_nytimes() {
80 | let resp = reqwest::blocking::get("https://www.nytimes.com/")
81 | .unwrap()
82 | .text()
83 | .unwrap();
84 | assert!(Dom::parse(&resp).is_ok());
85 | }
86 |
87 | #[ignore]
88 | #[test]
89 | fn it_can_parse_wikipedia() {
90 | let resp = reqwest::blocking::get("https://en.wikipedia.org/wiki/Main_Page")
91 | .unwrap()
92 | .text()
93 | .unwrap();
94 | assert!(Dom::parse(&resp).is_ok());
95 | }
96 |
--------------------------------------------------------------------------------
/tests/element_attributes.rs:
--------------------------------------------------------------------------------
1 | use html_parser::{Dom, Result};
2 | use insta::assert_json_snapshot;
3 |
4 | #[test]
5 | fn it_can_parse_double_quote() -> Result<()> {
6 | let html = "
";
7 | let dom = Dom::parse(html)?;
8 | assert_json_snapshot!(dom);
9 | Ok(())
10 | }
11 | #[test]
12 | fn it_can_parse_single_quote() -> Result<()> {
13 | let html = "
";
14 | let dom = Dom::parse(html)?;
15 | assert_json_snapshot!(dom);
16 | Ok(())
17 | }
18 | #[test]
19 | fn it_can_parse_no_quote() -> Result<()> {
20 | let html = "
";
21 | let dom = Dom::parse(html)?;
22 | assert_json_snapshot!(dom);
23 | Ok(())
24 | }
25 | #[test]
26 | fn it_can_parse_attribute_key_mixed_case_symbols() -> Result<()> {
27 | let html = "
";
28 | let dom = Dom::parse(html)?;
29 | assert_json_snapshot!(dom);
30 | Ok(())
31 | }
32 | #[test]
33 | fn it_can_parse_multiple_attributes_single_quote() -> Result<()> {
34 | let html = "
";
35 | let dom = Dom::parse(html)?;
36 | assert_json_snapshot!(dom);
37 | Ok(())
38 | }
39 | #[test]
40 | fn it_can_parse_multiple_attributes_where_whitespace_does_not_matter_for_keys() -> Result<()> {
41 | let html = "
";
42 | let dom = Dom::parse(html)?;
43 | assert_json_snapshot!(dom);
44 | Ok(())
45 | }
46 | #[test]
47 | fn it_can_parse_multiple_attributes_double_quote() -> Result<()> {
48 | let html = "
";
49 | let dom = Dom::parse(html)?;
50 | assert_json_snapshot!(dom);
51 | Ok(())
52 | }
53 | #[test]
54 | fn it_can_parse_multiple_attributes_no_quote() -> Result<()> {
55 | let html = "
";
56 | let dom = Dom::parse(html)?;
57 | assert_json_snapshot!(dom);
58 | Ok(())
59 | }
60 | #[test]
61 | fn it_can_parse_attribute_multiple_values_single_quote() -> Result<()> {
62 | let html = "
";
63 | let dom = Dom::parse(html)?;
64 | assert_json_snapshot!(dom);
65 | Ok(())
66 | }
67 | #[test]
68 | fn it_can_parse_attribute_multiple_values_double_quote() -> Result<()> {
69 | let html = "
";
70 | let dom = Dom::parse(html)?;
71 | assert_json_snapshot!(dom);
72 | Ok(())
73 | }
74 | #[test]
75 | fn it_can_parse_attribute_with_empty_value() -> Result<()> {
76 | let html = " ";
77 | let dom = Dom::parse(html)?;
78 | assert_json_snapshot!(dom);
79 | Ok(())
80 | }
81 |
82 | #[test]
83 | fn it_can_parse_id() -> Result<()> {
84 | let html = " ";
85 | let dom = Dom::parse(html)?;
86 | assert_json_snapshot!(dom);
87 | Ok(())
88 | }
89 | #[test]
90 | fn it_can_parse_classes() -> Result<()> {
91 | let html = " ";
92 | let dom = Dom::parse(html)?;
93 | assert_json_snapshot!(dom);
94 | Ok(())
95 | }
96 | #[test]
97 | fn it_keeps_spaces_for_non_classes() -> Result<()> {
98 | let html = " ";
99 | let dom = Dom::parse(html)?;
100 | assert_json_snapshot!(dom);
101 | Ok(())
102 | }
103 |
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! [![github]](https://github.com/mathiversen/html-parser)
2 | //!
3 | //! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github
4 | //!
5 | //! # Html parser
6 | //!
7 | //! A simple and general purpose html/xhtml parser lib/bin, using [Pest](https://pest.rs/).
8 | //!
9 | //! ## Features
10 | //! - Parse html & xhtml (not xml processing instructions)
11 | //! - Parse html-documents
12 | //! - Parse html-fragments
13 | //! - Parse empty documents
14 | //! - Parse with the same api for both documents and fragments
15 | //! - Parse custom, non-standard, elements; `<cat/>`, `<Cat/>` and `<C4-t/>`
16 | //! - Removes comments
17 | //! - Removes dangling elements
18 | //! - Iterate over all nodes in the dom tree
19 | //!
20 | //! ## What is it not
21 | //!
22 | //! - It's not a high-performance browser-grade parser
23 | //! - It's not suitable for html validation
24 | //! - It's not a parser that includes element selection or dom manipulation
25 | //!
26 | //! If your requirements match any of the above, then you're most likely looking for one of the crates below:
27 | //!
28 | //! - [html5ever](https://crates.io/crates/html5ever)
29 | //! - [kuchiki](https://crates.io/crates/kuchiki)
30 | //! - [scraper](https://crates.io/crates/scraper)
31 | //! - or other crates using the `html5ever` parser
32 | //!
33 | //! ## Examples bin
34 | //!
35 | //! Parse html file
36 | //!
37 | //! ```shell
38 | //! html_parser index.html
39 | //!
40 | //! ```
41 | //!
42 | //! Parse stdin with pretty output
43 | //!
44 | //! ```shell
45 | //! curl <website> | html_parser -p
46 | //! ```
47 | //!
48 | //! ## Examples lib
49 | //!
50 | //! Parse html document
51 | //!
52 | //! ```rust
53 | //! use html_parser::Dom;
54 | //!
55 | //! fn main() {
56 | //! let html = r#"
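57 | //!         <!doctype html>
58 | //!         <html lang="en">
59 | //!             <head>
60 | //!                 <meta charset="utf-8">
61 | //!                 <title>Html parser</title>
62 | //!             </head>
63 | //!             <body>
64 | //!                 <h1 id="a" class="b c">Hello world</h1>
65 | //!                 </h1> <!-- comments & dangling elements are ignored -->
66 | //!             </body>
67 | //!         </html>"#;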
68 | //!
69 | //! assert!(Dom::parse(html).is_ok());
70 | //! }
71 | //! ```
72 | //!
73 | //! Parse html fragment
74 | //!
75 | //! ```rust
76 | //! use html_parser::Dom;
77 | //!
78 | //! fn main() {
79 | //!     let html = "<div id=cat />";
80 | //! assert!(Dom::parse(html).is_ok());
81 | //! }
82 | //! ```
83 | //!
84 | //! Print to json
85 | //!
86 | //! ```rust
87 | //! use html_parser::{Dom, Result};
88 | //!
89 | //! fn main() -> Result<()> {
90 | //!     let html = "<div id=cat />";
91 | //! let json = Dom::parse(html)?.to_json_pretty()?;
92 | //! println!("{}", json);
93 | //! Ok(())
94 | //! }
95 | //! ```
96 |
97 | #![allow(clippy::needless_doctest_main)]
98 |
99 | mod dom;
100 | mod error;
101 | mod grammar;
102 |
103 | use grammar::Rule;
104 |
105 | pub use crate::dom::element::{Element, ElementVariant};
106 | pub use crate::dom::node::Node;
107 | pub use crate::dom::Dom;
108 | pub use crate::dom::DomVariant;
109 | pub use crate::error::Error;
110 | pub use crate::error::Result;
111 |
--------------------------------------------------------------------------------
/src/dom/node.rs:
--------------------------------------------------------------------------------
1 | use super::element::Element;
2 | use serde::Serialize;
3 |
4 | #[derive(Debug, Clone, Serialize, PartialEq)]
5 | #[serde(untagged)]
6 | pub enum Node {
7 | Text(String),
8 | Element(Element),
9 | Comment(String),
10 | }
11 |
12 | impl Node {
13 |     /// Borrows the inner string when this node is a `Text` node; `None` otherwise.
14 |     pub fn text(&self) -> Option<&str> {
15 |         match self {
16 |             Self::Text(content) => Some(content.as_str()),
17 |             Self::Element(_) | Self::Comment(_) => None,
18 |         }
19 |     }
20 |     /// Borrows the inner [`Element`] when this node is an `Element` node; `None` otherwise.
21 |     pub fn element(&self) -> Option<&Element> {
22 |         match self {
23 |             Self::Element(element) => Some(element),
24 |             Self::Text(_) | Self::Comment(_) => None,
25 |         }
26 |     }
27 |     /// Borrows the inner string when this node is a `Comment` node; `None` otherwise.
28 |     pub fn comment(&self) -> Option<&str> {
29 |         match self {
30 |             Self::Comment(content) => Some(content.as_str()),
31 |             Self::Text(_) | Self::Element(_) => None,
32 |         }
33 |     }
34 | }
35 | impl<'a> IntoIterator for &'a Node {
36 |     type Item = &'a Node;
37 |     type IntoIter = NodeIntoIterator<'a>;
38 |
39 |     /// Builds an iterator positioned on this node.
40 |     /// No ancestors have been recorded yet, so the ancestor stack starts out empty.
41 |     fn into_iter(self) -> Self::IntoIter {
42 |         let index = Vec::new();
43 |         NodeIntoIterator { node: self, index }
44 |     }
45 | }
46 |
47 | pub struct NodeIntoIterator<'a> {
48 |     node: &'a Node,
49 |     // Stack of (child index, parent) pairs: pushed when we go down into a child, popped when we go back up the node tree
50 |     index: Vec<(usize, &'a Node)>,
51 | }
52 |
53 | impl<'a> Iterator for NodeIntoIterator<'a> {
54 |     type Item = &'a Node;
55 |
56 |     /// Advances to the next node in document order and returns it.
57 |     ///
58 |     /// The first child of the current node is visited first; when there is none,
59 |     /// the nearest following sibling of the current node, or of one of its
60 |     /// recorded ancestors, is visited instead. The node the iterator was created
61 |     /// from is never yielded itself. Returns `None` once nothing is left.
62 |     fn next(&mut self) -> Option<Self::Item> {
63 |         // Only elements can have children
64 |         let first_child = match self.node {
65 |             Node::Element(e) => e.children.first(),
66 |             _ => None,
67 |         };
68 |
69 |         // Descend: remember the parent and that we are at its child 0
70 |         if let Some(child) = first_child {
71 |             self.index.push((0, self.node));
72 |             self.node = child;
73 |             return Some(child);
74 |         }
75 |
76 |         // Climb: pop ancestors until one of them has a following sibling
77 |         while let Some((sibling_index, parent)) = self.index.pop() {
78 |             let next_sibling = sibling_index + 1;
79 |             let sibling = match parent {
80 |                 Node::Element(e) => e.children.get(next_sibling),
81 |                 _ => None,
82 |             };
83 |             if let Some(sibling) = sibling {
84 |                 // Re-record the parent together with the new child position
85 |                 self.index.push((next_sibling, parent));
86 |                 self.node = sibling;
87 |                 return Some(sibling);
88 |             }
89 |         }
90 |
91 |         // No ancestors left: the traversal is finished
92 |         None
93 |     }
94 | }
109 |
110 | #[cfg(test)]
111 | mod tests {
112 |     use super::*;
113 |
114 |     #[test]
115 |     fn node_utillity_functions() {
116 |         // A text node only answers `text()`
117 |         let text_node = Node::Text(String::from("test"));
118 |         assert_eq!(text_node.text(), Some("test"));
119 |         assert!(text_node.element().is_none());
120 |         assert!(text_node.comment().is_none());
121 |
122 |         // An element node only answers `element()`
123 |         let element_node = Node::Element(Element::default());
124 |         assert!(element_node.text().is_none());
125 |         assert_eq!(element_node.element(), Some(&Element::default()));
126 |         assert!(element_node.comment().is_none());
127 |
128 |         // A comment node only answers `comment()`
129 |         let comment_node = Node::Comment(String::from("test"));
130 |         assert!(comment_node.text().is_none());
131 |         assert!(comment_node.element().is_none());
132 |         assert_eq!(comment_node.comment(), Some("test"));
133 |     }
134 | }
--------------------------------------------------------------------------------
/src/grammar/rules.pest:
--------------------------------------------------------------------------------
1 | //
2 | // HTML
3 | //
4 | html = _{
5 | SOI
6 | ~ node_comment*
7 | ~ doctype?
8 | ~ node*
9 | ~ EOI
10 | }
11 |
12 | //
13 | // DOCTYPE
14 | //
15 | doctype = { WSP* ~ chevron_left_bang ~ ^"doctype" ~ WSP* ~ attr* ~ WSP* ~ chevron_right_normal}
16 |
17 | //
18 | // NODES
19 | //
20 | node = _{ node_comment | node_element | node_text }
21 | node_comment = { WSP* ~ (comment_if | comment_normal) ~ WSP* }
22 | node_text = { (!(node_element | comment_tag_start | chevron_left_bang) ~ ANY)+ }
23 | node_element = { el_void | el_void_xml | el_process_instruct | el_raw_text | el_normal | el_dangling }
24 |
25 | //
26 | // COMMENTS
27 | //
28 | comment_normal = _{ comment_tag_start ~ comment_body ~ comment_tag_end }
29 | comment_body = { (!comment_tag_end ~ ANY)* }
30 | comment_tag_start = _{ chevron_left_bang ~ "--" ~ WSP* }
31 | comment_tag_end = _{ WSP* ~ "--" ~ chevron_right_normal }
32 |
33 | // Compatibility with old IE browsers... This is not necessary for newer browsers
34 | comment_if = _{ comment_if_start ~ comment_if_body ~ comment_if_end }
35 | comment_if_body = { (!comment_if_end ~ ANY)* }
36 | comment_if_start = _{ comment_tag_start ~ "[" ~ ^"if" }
37 | comment_if_end = _{ chevron_left_bang ~ "[" ~ ^"endif" ~ "]" ~ comment_tag_end }
38 |
39 | //
40 | // ATTRIBUTES
41 | //
42 | attr = { attr_key ~ (equal ~ WSP* ~ (attr_non_quoted | attr_quoted ))? }
43 | attr_quoted = ${PUSH(quote) ~ attr_value ~ POP }
44 | attr_non_quoted = @{ !quote ~ (!(WSP | chevron_right) ~ ANY)* }
45 | attr_key = { WSP* ~ ASCII_ALPHA ~ text_chars* ~ WSP* }
46 | attr_value = { WSP* ~ (!PEEK ~ ANY)* ~ WSP* }
47 |
48 | //
49 | // ELEMENTS
50 | //
51 | el_name = @{ ASCII_ALPHA ~ text_chars* }
52 |
53 | // Void element aka self-closing element
54 | // Ex: <meta /> or <link>
55 | el_void_name_html = @{
56 |     ^"area"
57 |     | ^"base"
58 |     | ^"br"
59 |     | ^"col"
60 |     | ^"command"
61 |     | ^"embed"
62 |     | ^"hr"
63 |     | ^"img"
64 |     | ^"input"
65 |     | ^"keygen"
66 |     | ^"link"
67 |     | ^"meta"
68 |     | ^"param"
69 |     | ^"source"
70 |     | ^"track"
71 |     | ^"wbr"
72 | }
74 | // NOTE: This should not have to be a rule, but people don't know what void elements are...
75 | el_void_name_svg = @{
76 | ^"path"
77 | | ^"polygon"
78 | | ^"rect"
79 | | ^"circle"
80 | }
81 | el_void_name = @{ el_void_name_html | el_void_name_svg }
82 | el_void = _{ chevron_left_normal ~ WSP* ~ el_void_name ~ WSP* ~ attr* ~ WSP* ~ (chevron_right_normal | chevron_right_closed) }
83 | el_void_xml = _{ chevron_left_normal ~ WSP* ~ el_name ~ WSP* ~ attr* ~ WSP* ~ chevron_right_closed }
84 |
85 | // Open elements are default element that can take children
86 | // and have both a start tag and an end tag
87 | // Ex: <html></html>
88 | el_normal = _{ el_normal_start ~ (!el_normal_end ~ node)* ~ el_normal_end }
89 | el_normal_start = _{ chevron_left_normal ~ WSP* ~ PUSH(el_name) ~ WSP* ~ attr* ~ WSP* ~ chevron_right_normal}
90 | el_normal_end = { chevron_left_closed ~ WSP* ~ POP ~ WSP* ~ chevron_right_normal}
91 |
92 | // Raw text elements are elements with text/script content that
93 | // might interfere with the normal html syntax
94 | el_raw_text_name = {
95 | ^"style"
96 | | ^"script"
97 | | ^"title"
98 | | ^"textarea"
99 | }
100 | el_raw_text_content = { (!el_raw_text_end ~ ANY)* }
101 | el_raw_text = _{ el_raw_text_start ~ el_raw_text_content ~ el_raw_text_end }
102 | el_raw_text_start = _{ chevron_left_normal ~ WSP* ~ PUSH(el_raw_text_name) ~ WSP* ~ attr* ~ WSP* ~ chevron_right_normal ~ WSP*}
103 | el_raw_text_end = { WSP* ~ chevron_left_closed ~ WSP* ~ POP ~ WSP* ~ chevron_right_normal}
104 |
105 | // XML processing instruction
106 | // Ex: <?xml version="1.0" ?>
107 | el_process_instruct = { chevron_left_question ~ WSP* ~ el_name? ~ WSP* ~ attr* ~ WSP* ~ chevron_right_question }
108 |
109 | // Catch dangling elements
110 | // Ex: <div></div></div>
111 | el_dangling = { chevron_left_closed ~ WSP* ~ el_name ~ WSP* ~ chevron_right_normal}
112 |
113 | //
114 | // SYMBOLS / CHARACTERS
115 | //
116 | text_chars = _{'a'..'z' | 'A'..'Z' | "_" | "-" | ":" |'0'..'9'}
117 |
118 | chevron_left_normal = _{ "<" }
119 | chevron_left_closed = _{ "</" }
120 | chevron_left_bang = _{ "<!" }
121 | chevron_left_question = _{ "<?" }
122 |
123 | chevron_right_normal = _{ ">" }
124 | chevron_right_closed = _{ "/>" }
125 | chevron_right_question = _{ "?>" }
126 | chevron_right = _{
127 | chevron_right_normal
128 | | chevron_right_closed
129 | | chevron_right_question
130 | }
131 |
132 | equal = _{ "=" }
133 | quote_dubble = _{ "\"" }
134 | quote_single = _{ "'" }
135 | quote = _{ quote_dubble | quote_single }
136 | WSP = _{ " " | "\t" | "\r" | "\n" }
137 |
--------------------------------------------------------------------------------
/tests/element.rs:
--------------------------------------------------------------------------------
1 | use html_parser::{Dom, Result};
2 | use indoc::indoc;
3 | use insta::assert_json_snapshot;
4 |
5 | #[test]
6 | fn it_can_parse_one_element() -> Result<()> {
7 | let html = "";
8 | let dom = Dom::parse(html)?;
9 | assert_json_snapshot!(dom);
10 | Ok(())
11 | }
12 | #[test]
13 | fn it_can_parse_one_element_upper_case() -> Result<()> {
14 | let html = "";
15 | let dom = Dom::parse(html)?;
16 | assert_json_snapshot!(dom);
17 | Ok(())
18 | }
19 | #[test]
20 | fn it_can_parse_one_element_mixed_case() -> Result<()> {
21 | let html = "";
22 | let dom = Dom::parse(html)?;
23 | assert_json_snapshot!(dom);
24 | Ok(())
25 | }
26 | #[test]
27 | fn it_can_parse_one_element_mixed_case_numbers() -> Result<()> {
28 | let html = "
";
29 | let dom = Dom::parse(html)?;
30 | assert_json_snapshot!(dom);
31 | Ok(())
32 | }
33 | #[test]
34 | fn it_can_parse_one_element_mixed_case_numbers_symbols() -> Result<()> {
35 | let html = "
";
36 | let dom = Dom::parse(html)?;
37 | assert_json_snapshot!(dom);
38 | Ok(())
39 | }
40 | #[test]
41 | fn it_can_parse_multiple_elements() -> Result<()> {
42 | let html = "
";
43 | let dom = Dom::parse(html)?;
44 | assert_json_snapshot!(dom);
45 | Ok(())
46 | }
47 | #[test]
48 | fn it_can_parse_multiple_open_elements() -> Result<()> {
49 | let html = "
";
50 | let dom = Dom::parse(html)?;
51 | assert_json_snapshot!(dom);
52 | Ok(())
53 | }
54 | #[test]
55 | fn it_can_parse_nested_elements() -> Result<()> {
56 | let html = indoc!(
57 | r"
58 |
61 | "
62 | );
63 | let dom = Dom::parse(html)?;
64 | assert_json_snapshot!(dom);
65 | Ok(())
66 | }
67 | #[test]
68 | fn it_can_parse_nested_elements_mixed_children() -> Result<()> {
69 | let html = indoc!(
70 | r"
71 |
72 |
73 |
74 | Hello
75 |
76 | World
77 |
78 |
79 | "
80 | );
81 | let dom = Dom::parse(html)?;
82 | assert_json_snapshot!(dom);
83 | Ok(())
84 | }
85 | #[test]
86 | fn it_can_parse_deeply_nested() -> Result<()> {
87 | let html = indoc!(
88 | r#"
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 | hello world
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 | "#
108 | );
109 | let dom = Dom::parse(html)?;
110 | assert_json_snapshot!(dom);
111 | Ok(())
112 | }
113 | #[test]
114 | fn it_can_parse_script_with_content() -> Result<()> {
115 | let html = indoc!(
116 | r#"
117 |
142 | "#
143 | );
144 | let dom = Dom::parse(html)?;
145 | assert_json_snapshot!(dom);
146 | Ok(())
147 | }
148 | #[test]
149 | fn it_can_parse_style_with_content() -> Result<()> {
150 | let html = indoc!(
151 | r#"
152 |
162 | "#
163 | );
164 | let dom = Dom::parse(html)?;
165 | assert_json_snapshot!(dom);
166 | Ok(())
167 | }
168 | #[test]
169 | fn it_skips_dangling_elements() -> Result<()> {
170 | let html = indoc!(
171 | "
172 |
173 |
174 |