├── .github ├── dependabot.yml └── workflows │ ├── build.yml │ └── msrv.yml ├── .gitignore ├── CHANGELOG.md ├── Cargo.toml ├── README.md ├── src ├── html.rs ├── http.rs ├── lib.rs ├── opengraph.rs ├── parser.rs └── schema_org.rs └── tests ├── data └── index.html └── integration_test.rs /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "cargo" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | # read-only repo token 4 | # no access to secrets 5 | on: 6 | push: 7 | branches: [ master ] 8 | pull_request: 9 | 10 | env: 11 | CARGO_TERM_COLOR: always 12 | 13 | jobs: 14 | verify-build: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | # checkout repo 19 | - uses: actions/checkout@v3 20 | 21 | - name: Install rust 22 | uses: dtolnay/rust-toolchain@stable 23 | with: 24 | components: clippy, rustfmt 25 | 26 | - name: Generate Cargo.lock 27 | run: cargo generate-lockfile 28 | 29 | # restore cargo cache from previous runs 30 | - name: Rust Cache 31 | uses: Swatinem/rust-cache@v2 32 | with: 33 | # The cache should not be shared between different workflows and jobs. 
34 | shared-key: ${{ github.workflow }}-${{ github.job }} 35 | 36 | # check it builds 37 | - name: Build 38 | run: cargo build --locked --verbose --all-targets --all-features 39 | 40 | # run tests 41 | - name: Run tests 42 | run: cargo test --verbose --all-features 43 | 44 | # make sure all code has been formatted with rustfmt 45 | - name: check rustfmt 46 | run: cargo fmt -- --check --color always 47 | 48 | # run clippy to verify we have no warnings 49 | - name: cargo clippy 50 | env: 51 | RUSTDOCFLAGS: -D warnings 52 | run: cargo clippy --all-targets --all-features 53 | 54 | # check for rustdoc warnings 55 | - name: generate and verify rustdoc 56 | env: 57 | RUSTDOCFLAGS: -D warnings 58 | run: cargo doc --no-deps --document-private-items --workspace --all-features 59 | -------------------------------------------------------------------------------- /.github/workflows/msrv.yml: -------------------------------------------------------------------------------- 1 | name: MSRV 2 | 3 | # read-only repo token 4 | # no access to secrets 5 | on: 6 | push: 7 | branches: [ master ] 8 | pull_request: 9 | 10 | env: 11 | CARGO_TERM_COLOR: always 12 | 13 | jobs: 14 | verify-build: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | # checkout repo 19 | - uses: actions/checkout@v3 20 | 21 | - name: Install rust 22 | # Aligned with `rust-version` in `Cargo.toml` 23 | uses: dtolnay/rust-toolchain@1.63 24 | 25 | - name: Generate Cargo.lock 26 | run: cargo generate-lockfile 27 | 28 | # restore cargo cache from previous runs 29 | - name: Rust Cache 30 | uses: Swatinem/rust-cache@v2 31 | with: 32 | # The cache should not be shared between different workflows and jobs. 
33 | shared-key: ${{ github.workflow }}-${{ github.job }} 34 | 35 | # check it builds 36 | - name: Build 37 | run: cargo build --locked --verbose --all-targets --all-features 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | /target 3 | **/*.rs.bk 4 | Cargo.lock 5 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Version History 2 | 3 | ## Version 2.0.1 4 | 5 | - Specified the MSRV rust-version (1.63) 6 | - Updated dependencies 7 | 8 | ## Version 2.0.0 9 | 10 | Breaking: 11 | - Changed all structs to be `non_exhaustive` 12 | - Moved all structs to the crate root (no re-exports) 13 | 14 | New features: 15 | - Added the ability to specify HTTP request headers 16 | - Collect all links/anchors of the HTML document 17 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "webpage" 3 | description = "Small library to fetch info about a web page: title, description, language, HTTP info, links, RSS feeds, Opengraph, Schema.org, and more" 4 | readme = "README.md" 5 | keywords = ["webpage", "html", "opengraph"] 6 | categories = ["web-programming"] 7 | license = "MIT" 8 | version = "2.0.1" 9 | authors = ["Otto "] 10 | repository = "https://github.com/orottier/webpage-rs" 11 | edition = "2021" 12 | rust-version = "1.63" 13 | 14 | [features] 15 | default = ["curl"] 16 | serde = ["dep:serde"] 17 | 18 | [dependencies] 19 | curl = { version = "0.4.41", optional = true } 20 | html5ever = "0.27" 21 | markup5ever_rcdom = "0.3" 22 | serde = { version = "1.0", optional = true, features = ["derive"] } 23 | serde_json = "1.0" 24 | url = "2.5" 25 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Webpage.rs 2 | 3 | [![crates.io](https://img.shields.io/crates/v/webpage.svg)](https://crates.io/crates/webpage) 4 | [![docs.rs](https://img.shields.io/docsrs/webpage)](https://docs.rs/webpage) 5 | 6 | _Small library to fetch info about a web page: title, description, language, 7 | HTTP info, links, RSS feeds, Opengraph, Schema.org, and more_ 8 | 9 | ## Usage 10 | 11 | ```rust 12 | use webpage::{Webpage, WebpageOptions}; 13 | 14 | let info = Webpage::from_url("http://www.rust-lang.org/en-US/", WebpageOptions::default()) 15 | .expect("Could not read from URL"); 16 | 17 | // the HTTP transfer info 18 | let http = info.http; 19 | 20 | assert_eq!(http.ip, "54.192.129.71".to_string()); 21 | assert!(http.headers[0].starts_with("HTTP")); 22 | assert!(http.body.starts_with("")); 23 | assert_eq!(http.url, "https://www.rust-lang.org/en-US/".to_string()); // followed redirects (HTTPS) 24 | assert_eq!(http.content_type, "text/html".to_string()); 25 | 26 | // the parsed HTML info 27 | let html = info.html; 28 | 29 | assert_eq!(html.title, Some("The Rust Programming Language".to_string())); 30 | assert_eq!(html.description, Some("A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_string())); 31 | assert_eq!(html.opengraph.og_type, "website".to_string()); 32 | ``` 33 | 34 | You can also get HTML info about local data: 35 | 36 | ```rust 37 | use webpage::HTML; 38 | let html = HTML::from_file("index.html", None); 39 | // or let html = HTML::from_string(input, None); 40 | ``` 41 | 42 | ## Features 43 | 44 | ### Serialization 45 | 46 | If you need to be able to serialize the data provided by the library using 47 | [serde](https://serde.rs/), you can specify the `serde` *feature* while 48 | declaring your dependencies in `Cargo.toml`: 49 | 
50 | ```toml 51 | webpage = { version = "2.0", features = ["serde"] } 52 | ``` 53 | 54 | ### No curl dependency 55 | 56 | The `curl` feature is enabled by default but is optional. This is useful if you 57 | do not need a HTTP client but already have the HTML data at hand. 58 | 59 | ## All fields 60 | 61 | ```rust 62 | pub struct Webpage { 63 | pub http: HTTP, // info about the HTTP transfer 64 | pub html: HTML, // info from the parsed HTML doc 65 | } 66 | 67 | pub struct HTTP { 68 | pub ip: String, 69 | pub transfer_time: Duration, 70 | pub redirect_count: u32, 71 | pub content_type: String, 72 | pub response_code: u32, 73 | pub headers: Vec, // raw headers from final request 74 | pub url: String, // effective url 75 | pub body: String, 76 | } 77 | 78 | pub struct HTML { 79 | pub title: Option, 80 | pub description: Option, 81 | 82 | pub url: Option, // canonical url 83 | pub feed: Option, // RSS feed typically 84 | 85 | pub language: Option, // as specified, not detected 86 | pub text_content: String, // all tags stripped from body 87 | pub links: Vec, // all links in the document 88 | 89 | pub meta: HashMap, // flattened down list of meta properties 90 | 91 | pub opengraph: Opengraph, 92 | pub schema_org: Vec, 93 | } 94 | 95 | pub struct Link { 96 | pub url: String, // resolved url of the link 97 | pub text: String, // anchor text 98 | } 99 | 100 | pub struct Opengraph { 101 | pub og_type: String, 102 | pub properties: HashMap, 103 | 104 | pub images: Vec, 105 | pub videos: Vec, 106 | pub audios: Vec, 107 | } 108 | 109 | // Facebook's Opengraph structured data 110 | pub struct OpengraphObject { 111 | pub url: String, 112 | pub properties: HashMap, 113 | } 114 | 115 | // Google's schema.org structured data 116 | pub struct SchemaOrg { 117 | pub schema_type: String, 118 | pub value: serde_json::Value, 119 | } 120 | ``` 121 | 122 | ## Options 123 | 124 | The following HTTP configurations are available: 125 | 126 | ```rust 127 | pub struct WebpageOptions { 128 | 
allow_insecure: false, 129 | follow_location: true, 130 | max_redirections: 5, 131 | timeout: Duration::from_secs(10), 132 | useragent: "Webpage - Rust crate - https://crates.io/crates/webpage".to_string(), 133 | headers: vec!["X-My-Header: 1234".to_string()], 134 | } 135 | 136 | // usage 137 | let mut options = WebpageOptions::default(); 138 | options.allow_insecure = true; 139 | let info = Webpage::from_url(&url, options).expect("Halp, could not fetch"); 140 | ``` 141 | -------------------------------------------------------------------------------- /src/html.rs: -------------------------------------------------------------------------------- 1 | //! Info from the parsed HTML document 2 | 3 | use html5ever::driver::ParseOpts; 4 | use html5ever::parse_document; 5 | use html5ever::tendril::TendrilSink; 6 | use markup5ever_rcdom::RcDom; 7 | use url::Url; 8 | 9 | use std::collections::HashMap; 10 | use std::default::Default; 11 | use std::io; 12 | use std::path::Path; 13 | 14 | use crate::opengraph::Opengraph; 15 | use crate::parser::Parser; 16 | use crate::schema_org::SchemaOrg; 17 | 18 | /// Information regarding the HTML content 19 | #[derive(Debug, Clone)] 20 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 21 | #[non_exhaustive] 22 | pub struct HTML { 23 | /// \ 24 | pub title: Option, 25 | /// meta description 26 | pub description: Option, 27 | /// Canonical URL 28 | pub url: Option, 29 | #[cfg_attr(feature = "serde", serde(skip))] 30 | pub(crate) url_parsed: Option, 31 | /// Feed URL (atom, rss, ..) 
32 | pub feed: Option, 33 | 34 | /// Language as specified in the document 35 | pub language: Option, 36 | /// Text content inside \, all tags stripped 37 | pub text_content: String, 38 | 39 | /// Flattened down list of meta properties 40 | pub meta: HashMap, 41 | /// Opengraph tags 42 | pub opengraph: Opengraph, 43 | /// Schema.org data 44 | pub schema_org: Vec, 45 | /// All links in the document 46 | pub links: Vec, 47 | } 48 | 49 | impl HTML { 50 | fn empty(url: Option) -> Self { 51 | let url_parsed = url.as_ref().and_then(|u| Url::parse(u).ok()); 52 | Self { 53 | title: None, 54 | description: None, 55 | url, 56 | url_parsed, 57 | feed: None, 58 | 59 | language: None, 60 | text_content: String::new(), 61 | 62 | meta: HashMap::new(), 63 | opengraph: Opengraph::empty(), 64 | schema_org: Vec::new(), 65 | links: Vec::new(), 66 | } 67 | } 68 | 69 | /// Construct HTML from RcDom, optionally with a URL set 70 | fn from_dom(dom: RcDom, url: Option) -> Self { 71 | let mut html = Self::empty(url); 72 | let parser = Parser::start(dom.document); 73 | parser.traverse(&mut html); 74 | 75 | html 76 | } 77 | 78 | /// Construct HTML from File, optionally with a URL set 79 | pub fn from_file(path: &str, url: Option) -> Result { 80 | parse_document(RcDom::default(), ParseOpts::default()) 81 | .from_utf8() 82 | .from_file(Path::new(path)) 83 | .map(|dom| Self::from_dom(dom, url)) 84 | } 85 | 86 | /// Construct HTML from String, optionally with a URL set 87 | /// 88 | /// ## Examples 89 | /// ``` 90 | /// use webpage::HTML; 91 | /// 92 | /// let input = String::from("HelloContents"); 93 | /// let html = HTML::from_string(input, None); 94 | /// assert!(html.is_ok()); 95 | /// ``` 96 | pub fn from_string(html: String, url: Option) -> Result { 97 | parse_document(RcDom::default(), ParseOpts::default()) 98 | .from_utf8() 99 | .read_from(&mut html.as_bytes()) 100 | .map(|dom| Self::from_dom(dom, url)) 101 | } 102 | 103 | pub(crate) fn set_url(&mut self, url: Option) { 104 | 
self.url_parsed = url.as_ref().and_then(|url| Url::parse(url).ok()); 105 | self.url = url; 106 | } 107 | } 108 | 109 | /// Information for an `` anchor 110 | #[derive(Debug, Clone, Eq, PartialEq, Hash)] 111 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 112 | #[non_exhaustive] 113 | pub struct Link { 114 | pub url: String, 115 | pub text: String, 116 | } 117 | 118 | #[cfg(test)] 119 | mod tests { 120 | use super::*; 121 | 122 | #[test] 123 | fn from_string() { 124 | let input = "HelloContents Link" 125 | .to_string(); 126 | let html = HTML::from_string(input, Some("https://example.com/".into())); 127 | assert!(html.is_ok()); 128 | 129 | let html = html.unwrap(); 130 | assert_eq!(html.title, Some("Hello".to_string())); 131 | assert!(html.description.is_none()); 132 | assert_eq!(html.text_content, "Contents Link".to_string()); 133 | assert_eq!( 134 | html.links, 135 | vec![Link { 136 | url: "https://example.com/a".into(), 137 | text: "Link".into() 138 | }] 139 | ); 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/http.rs: -------------------------------------------------------------------------------- 1 | //! 
Info about the HTTP transfer 2 | 3 | use std::io; 4 | use std::time::Duration; 5 | 6 | use curl::easy::{Easy, List}; 7 | 8 | use crate::WebpageOptions; 9 | 10 | /// Information regarding the HTTP transfer 11 | #[derive(Debug, Clone)] 12 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 13 | #[non_exhaustive] 14 | pub struct HTTP { 15 | /// The external ip address (v4 or v6) 16 | pub ip: String, 17 | /// Duration of the HTTP call 18 | pub transfer_time: Duration, 19 | /// Number of redirections encountered 20 | pub redirect_count: u32, 21 | /// HTTP content type returned 22 | pub content_type: String, 23 | /// HTTP response code returned 24 | pub response_code: u32, 25 | /// All HTTP response headers 26 | pub headers: Vec, 27 | /// Effective URL that was visited 28 | pub url: String, 29 | /// HTTP body 30 | pub body: String, 31 | } 32 | 33 | impl HTTP { 34 | /// Fetch a webpage from the given URL 35 | /// 36 | /// ## Examples 37 | /// ``` 38 | /// use webpage::HTTP; 39 | /// use webpage::WebpageOptions; 40 | /// 41 | /// let info = HTTP::fetch("http://example.org", WebpageOptions::default()); 42 | /// assert!(info.is_ok()); 43 | /// 44 | /// let info = HTTP::fetch("mal formed or unreachable", WebpageOptions::default()); 45 | /// assert!(info.is_err()); 46 | /// ``` 47 | pub fn fetch(url: &str, options: WebpageOptions) -> Result { 48 | let mut handle = Easy::new(); 49 | 50 | // configure 51 | handle.ssl_verify_peer(!options.allow_insecure)?; 52 | handle.ssl_verify_host(!options.allow_insecure)?; 53 | handle.timeout(options.timeout)?; 54 | handle.follow_location(options.follow_location)?; 55 | handle.max_redirections(options.max_redirections)?; 56 | handle.useragent(&options.useragent)?; 57 | if !options.headers.is_empty() { 58 | let mut list = List::new(); 59 | for header in options.headers.iter() { 60 | list.append(header)?; 61 | } 62 | handle.http_headers(list)?; 63 | } 64 | 65 | handle.url(url)?; 66 | 67 | let mut headers = Vec::new(); 68 | let mut 
body = Vec::new(); 69 | { 70 | let mut transfer = handle.transfer(); 71 | transfer.header_function(|new_data| { 72 | let header = String::from_utf8_lossy(new_data) 73 | .into_owned() 74 | .trim() 75 | .to_string(); 76 | 77 | // clear list on redirects 78 | if header.starts_with("HTTP/") { 79 | headers = Vec::new(); 80 | } 81 | 82 | if !header.is_empty() { 83 | headers.push(header); 84 | } 85 | 86 | true 87 | })?; 88 | 89 | transfer.write_function(|new_data| { 90 | body.extend_from_slice(new_data); 91 | Ok(new_data.len()) 92 | })?; 93 | 94 | transfer.perform()?; 95 | } 96 | 97 | let body = String::from_utf8_lossy(&body).into_owned(); 98 | 99 | Ok(HTTP { 100 | ip: handle.primary_ip()?.unwrap_or("").to_string(), 101 | transfer_time: handle.total_time()?, 102 | redirect_count: handle.redirect_count()?, 103 | content_type: handle.content_type()?.unwrap_or("").to_string(), 104 | response_code: handle.response_code()?, 105 | url: handle.effective_url()?.unwrap_or("").to_string(), 106 | 107 | headers, 108 | body, 109 | }) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! _Small library to fetch info about a web page: title, description, language, HTTP info, links, RSS feeds, Opengraph, Schema.org, and more_ 2 | //! 3 | //! ## Usage 4 | //! 5 | //! ```rust 6 | //! use webpage::{Webpage, WebpageOptions}; 7 | //! 8 | //! let info = Webpage::from_url("http://example.org", WebpageOptions::default()) 9 | //! .expect("Could not read from URL"); 10 | //! 11 | //! // the HTTP transfer info 12 | //! let http = info.http; 13 | //! 14 | //! // assert_eq!(http.ip, "54.192.129.71".to_string()); 15 | //! assert!(http.headers[0].starts_with("HTTP")); 16 | //! assert!(http.body.starts_with("")); 17 | //! assert_eq!(http.url, "http://example.org/".to_string()); // effective url 18 | //! 
assert_eq!(http.content_type, "text/html; charset=UTF-8".to_string()); 19 | //! 20 | //! // the parsed HTML info 21 | //! let html = info.html; 22 | //! 23 | //! assert_eq!(html.title, Some("Example Domain".to_string())); 24 | //! assert_eq!(html.description, None); 25 | //! assert_eq!(html.links.len(), 1); 26 | //! assert_eq!(html.opengraph.og_type, "website".to_string()); 27 | //! ``` 28 | //! 29 | //! You can also get HTML info about local data: 30 | //! 31 | //! ```rust 32 | //! use webpage::HTML; 33 | //! let html = HTML::from_file("index.html", None); 34 | //! // or let html = HTML::from_string(input, None); 35 | //! ``` 36 | //! 37 | //! ## Options 38 | //! 39 | //! The following configurations are available: 40 | //! ```rust 41 | //! pub struct WebpageOptions { 42 | //! allow_insecure: bool, 43 | //! follow_location: bool, 44 | //! max_redirections: u32, 45 | //! timeout: std::time::Duration, 46 | //! useragent: String, 47 | //! headers: Vec, 48 | //! } 49 | //! ``` 50 | //! 51 | //! ```rust 52 | //! use webpage::{Webpage, WebpageOptions}; 53 | //! 54 | //! let mut options = WebpageOptions::default(); 55 | //! options.allow_insecure = true; 56 | //! let info = Webpage::from_url("https://example.org", options).expect("Halp, could not fetch"); 57 | //! 
``` 58 | 59 | mod html; 60 | pub use html::{Link, HTML}; 61 | 62 | #[cfg(feature = "curl")] 63 | mod http; 64 | #[cfg(feature = "curl")] 65 | pub use http::HTTP; 66 | 67 | mod opengraph; 68 | pub use opengraph::{Opengraph, OpengraphObject}; 69 | 70 | mod schema_org; 71 | pub use schema_org::SchemaOrg; 72 | 73 | mod parser; 74 | 75 | #[cfg(feature = "curl")] 76 | use std::time::Duration; 77 | 78 | #[cfg(feature = "serde")] 79 | #[macro_use] 80 | extern crate serde; 81 | 82 | /// All gathered info for a webpage 83 | #[derive(Debug)] 84 | #[cfg(feature = "curl")] 85 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 86 | #[non_exhaustive] 87 | pub struct Webpage { 88 | /// info about the HTTP transfer 89 | pub http: HTTP, 90 | /// info from the parsed HTML doc 91 | pub html: HTML, 92 | } 93 | 94 | /// Configuration options for fetching a webpage 95 | #[derive(Debug)] 96 | #[cfg(feature = "curl")] 97 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 98 | #[non_exhaustive] 99 | pub struct WebpageOptions { 100 | /// Allow fetching over invalid and/or self signed HTTPS connections \[false\] 101 | pub allow_insecure: bool, 102 | /// Follow HTTP redirects \[true\] 103 | pub follow_location: bool, 104 | /// Max number of redirects to follow \[5\] 105 | pub max_redirections: u32, 106 | /// Timeout for the HTTP request \[10 secs\] 107 | pub timeout: Duration, 108 | /// User agent string used for the request \[webpage-rs - \] 109 | pub useragent: String, 110 | /// Custom HTTP headers to send with the request 111 | pub headers: Vec, 112 | } 113 | 114 | #[cfg(feature = "curl")] 115 | impl Default for WebpageOptions { 116 | fn default() -> Self { 117 | Self { 118 | allow_insecure: false, 119 | follow_location: true, 120 | max_redirections: 5, 121 | timeout: Duration::from_secs(10), 122 | useragent: "webpage-rs - https://crates.io/crates/webpage".to_string(), 123 | headers: Vec::new(), 124 | } 125 | } 126 | } 127 | 128 | #[cfg(feature = "curl")] 129 | 
impl Webpage { 130 | /// Fetch a webpage from the given URL, and extract HTML info 131 | /// 132 | /// ## Examples 133 | /// ``` 134 | /// use webpage::{Webpage, WebpageOptions}; 135 | /// 136 | /// let info = Webpage::from_url("http://example.org", WebpageOptions::default()); 137 | /// assert!(info.is_ok()) 138 | /// ``` 139 | pub fn from_url(url: &str, options: WebpageOptions) -> Result { 140 | let http = HTTP::fetch(url, options)?; 141 | 142 | let html = HTML::from_string(http.body.clone(), Some(http.url.clone()))?; 143 | 144 | Ok(Self { http, html }) 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /src/opengraph.rs: -------------------------------------------------------------------------------- 1 | //! OpenGraph information 2 | 3 | use std::collections::HashMap; 4 | 5 | #[derive(Debug, Clone)] 6 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 7 | #[non_exhaustive] 8 | /// Representing [OpenGraph](http://ogp.me/) information 9 | pub struct Opengraph { 10 | /// Opengraph type (article, image, event, ..) 
11 | pub og_type: String, 12 | /// Opengraph properties of this object 13 | pub properties: HashMap, 14 | 15 | /// Images relevant to this object 16 | pub images: Vec, 17 | /// Videos relevant to this object 18 | pub videos: Vec, 19 | /// Audio relevant to this object 20 | pub audios: Vec, 21 | } 22 | 23 | #[derive(Debug, Clone)] 24 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 25 | #[non_exhaustive] 26 | /// Info about an OpenGraph media type 27 | pub struct OpengraphObject { 28 | /// URL describing this object 29 | pub url: String, 30 | /// Properties of the referred object 31 | pub properties: HashMap, 32 | } 33 | 34 | impl OpengraphObject { 35 | pub fn new(url: String) -> Self { 36 | Self { 37 | url, 38 | properties: HashMap::new(), 39 | } 40 | } 41 | } 42 | 43 | impl Opengraph { 44 | pub fn empty() -> Self { 45 | Self { 46 | og_type: "website".to_string(), 47 | properties: HashMap::new(), 48 | 49 | images: vec![], 50 | videos: vec![], 51 | audios: vec![], 52 | } 53 | } 54 | 55 | pub fn extend(&mut self, property: &str, content: String) { 56 | if property == "type" { 57 | self.og_type = content; 58 | } else if property.starts_with("image") { 59 | parse_object("image", property, content, &mut self.images); 60 | } else if property.starts_with("video") { 61 | parse_object("video", property, content, &mut self.videos); 62 | } else if property.starts_with("audio") { 63 | parse_object("audio", property, content, &mut self.audios); 64 | } else { 65 | self.properties.insert(property.to_string(), content); 66 | } 67 | } 68 | } 69 | 70 | fn parse_object( 71 | og_type: &str, 72 | property: &str, 73 | content: String, 74 | collection: &mut Vec, 75 | ) { 76 | let num_images = collection.len(); 77 | 78 | if property == og_type || &property[og_type.len()..] 
== ":url" { 79 | collection.push(OpengraphObject::new(content)); 80 | } else if num_images > 0 && property.len() > og_type.len() + 1 { 81 | let property = &property["image:".len()..]; 82 | collection[num_images - 1] 83 | .properties 84 | .insert(property.to_string(), content); 85 | } 86 | } 87 | 88 | #[cfg(test)] 89 | mod tests { 90 | use super::Opengraph; 91 | 92 | #[test] 93 | fn test_type() { 94 | let mut opengraph = Opengraph::empty(); 95 | assert_eq!(opengraph.og_type, "website"); 96 | 97 | opengraph.extend("type", "article".to_string()); 98 | assert_eq!(opengraph.og_type, "article"); 99 | } 100 | 101 | #[test] 102 | fn test_image() { 103 | let mut opengraph = Opengraph::empty(); 104 | 105 | opengraph.extend("image", "http://example.org/image.png".to_string()); 106 | opengraph.extend( 107 | "image:secure_url", 108 | "https://example.org/image.png".to_string(), 109 | ); 110 | assert_eq!(opengraph.images.len(), 1); 111 | assert_eq!(opengraph.images[0].url, "http://example.org/image.png"); 112 | 113 | let prop = opengraph.images[0].properties.get("secure_url"); 114 | assert!(prop.is_some()); 115 | assert_eq!(prop.unwrap(), "https://example.org/image.png"); 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/parser.rs: -------------------------------------------------------------------------------- 1 | use html5ever::tendril::{fmt::UTF8, Tendril}; 2 | use html5ever::Attribute; 3 | use markup5ever_rcdom::{Handle, NodeData}; 4 | 5 | use crate::html::{Link, HTML}; 6 | use crate::schema_org::SchemaOrg; 7 | 8 | #[derive(Copy, Clone)] 9 | enum Segment { 10 | None, 11 | Head, 12 | Body, 13 | } 14 | 15 | pub struct Parser<'a> { 16 | segment: Segment, 17 | parent: Option<&'a NodeData>, 18 | handle: Handle, 19 | } 20 | 21 | impl<'a> Parser<'a> { 22 | pub fn start(handle: Handle) -> Self { 23 | Parser { 24 | handle, 25 | segment: Segment::None, 26 | parent: None, 27 | } 28 | } 29 | 30 | pub fn traverse(self, html: &mut 
HTML) { 31 | let mut segment = self.segment; 32 | 33 | let handle_ref = &self.handle; 34 | match self.handle.data { 35 | NodeData::Document => (), 36 | NodeData::Doctype { .. } => (), 37 | NodeData::Comment { .. } => (), 38 | 39 | NodeData::Text { ref contents } => { 40 | if let Some(NodeData::Element { ref name, .. }) = self.parent { 41 | let tag_name = name.local.as_ref(); 42 | 43 | process_text( 44 | self.segment, 45 | tag_name, 46 | tendril_to_utf8(&contents.borrow()), 47 | html, 48 | ) 49 | } 50 | } 51 | 52 | NodeData::Element { 53 | ref name, 54 | ref attrs, 55 | .. 56 | } => { 57 | let tag_name = name.local.as_ref(); 58 | 59 | if tag_name == "head" { 60 | segment = Segment::Head; 61 | } else if tag_name == "body" { 62 | segment = Segment::Body; 63 | } 64 | 65 | process_element(segment, tag_name, handle_ref, &attrs.borrow(), html) 66 | } 67 | 68 | NodeData::ProcessingInstruction { .. } => unreachable!(), 69 | } 70 | 71 | for child in self.handle.children.borrow().iter() { 72 | let new_parser = Parser { 73 | segment, 74 | parent: Some(&self.handle.data), 75 | handle: child.clone(), 76 | }; 77 | new_parser.traverse(html); 78 | } 79 | } 80 | } 81 | 82 | fn process_text(segment: Segment, tag_name: &str, contents: &str, html: &mut HTML) { 83 | if let Segment::Body = segment { 84 | if tag_name != "style" && tag_name != "script" && tag_name != "noscript" { 85 | if !html.text_content.is_empty() { 86 | html.text_content.push(' '); 87 | } 88 | html.text_content.push_str(contents); 89 | } 90 | } 91 | } 92 | 93 | fn process_element( 94 | segment: Segment, 95 | tag_name: &str, 96 | handle: &Handle, 97 | attrs: &[Attribute], 98 | html: &mut HTML, 99 | ) { 100 | // process language attribute 101 | if tag_name == "html" || tag_name == "body" { 102 | let language = get_attribute(attrs, "lang"); 103 | if language.is_some() { 104 | html.language = language; 105 | } 106 | } 107 | 108 | // process 109 | if let Segment::Head = segment { 110 | if tag_name == "title" { 111 | 
html.title = text_content(handle); 112 | } 113 | if tag_name == "meta" { 114 | let content = get_attribute(attrs, "content"); 115 | if let Some(content) = content { 116 | let property_opt = get_attribute(attrs, "property") 117 | .or_else(|| get_attribute(attrs, "name")) 118 | .or_else(|| get_attribute(attrs, "http-equiv")); 119 | 120 | if let Some(property) = property_opt { 121 | html.meta.insert(property.clone(), content.clone()); 122 | 123 | if property.starts_with("og:") && property.len() > 3 { 124 | html.opengraph.extend(&property[3..], content); 125 | } else if property == "description" { 126 | html.description = Some(content); 127 | } 128 | } 129 | } 130 | 131 | if let Some(charset) = get_attribute(attrs, "charset") { 132 | html.meta.insert("charset".to_string(), charset); 133 | } 134 | } 135 | if tag_name == "link" { 136 | let rel = get_attribute(attrs, "rel").unwrap_or_default(); 137 | if rel == "canonical" { 138 | html.set_url(get_attribute(attrs, "href")); 139 | } else if rel == "alternate" { 140 | let link_type = get_attribute(attrs, "type").unwrap_or_default(); 141 | if [ 142 | "application/atom+xml", 143 | "application/json", 144 | "application/rdf+xml", 145 | "application/rss+xml", 146 | "application/xml", 147 | "text/xml", 148 | ] 149 | .contains(&&link_type[..]) 150 | { 151 | html.feed = get_attribute(attrs, "href"); 152 | } 153 | } 154 | } 155 | } 156 | 157 | // process ld-json snippets 158 | if tag_name == "script" { 159 | if let Some(script_type) = get_attribute(attrs, "type") { 160 | if script_type == "application/ld+json" { 161 | if let Some(content) = text_content(handle) { 162 | html.schema_org.append(&mut SchemaOrg::from(content)); 163 | } 164 | } 165 | } 166 | } 167 | 168 | if tag_name == "a" { 169 | if let Some(href) = get_attribute(attrs, "href") { 170 | let text = text_content(handle).unwrap_or_default(); 171 | let href = if let Some(url) = &html.url_parsed { 172 | if let Ok(url) = url.join(&href) { 173 | url.to_string() 174 | } else { 
175 | href 176 | } 177 | } else { 178 | href 179 | }; 180 | html.links.push(Link { url: href, text }); 181 | } 182 | } 183 | } 184 | 185 | fn get_attribute(attrs: &[Attribute], name: &str) -> Option { 186 | attrs 187 | .iter() 188 | .find(|attr| attr.name.local.as_ref() == name) 189 | .map(|attr| attr.value.trim().to_string()) 190 | } 191 | 192 | fn text_content(handle: &Handle) -> Option { 193 | // todo paste all the text together 194 | for child in handle.children.borrow().iter() { 195 | if let NodeData::Text { ref contents } = child.data { 196 | let string = tendril_to_utf8(&contents.borrow()).to_string(); 197 | return Some(string.trim().to_string()); 198 | } 199 | } 200 | 201 | None 202 | } 203 | 204 | fn tendril_to_utf8(t: &Tendril) -> &str { 205 | t 206 | } 207 | -------------------------------------------------------------------------------- /src/schema_org.rs: -------------------------------------------------------------------------------- 1 | //! Schema.org information 2 | 3 | use serde_json::{self, Value}; 4 | 5 | /// Representing [Schema.org](https://schema.org/) information (currently only via JSON-LD) 6 | #[derive(Debug, Clone)] 7 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 8 | #[non_exhaustive] 9 | pub struct SchemaOrg { 10 | /// Schema.org type (article, image, event) 11 | pub schema_type: String, 12 | /// Schema.org info 13 | pub value: Value, 14 | } 15 | 16 | impl SchemaOrg { 17 | pub fn from(content: String) -> Vec { 18 | let node: Value = serde_json::from_str(&content).unwrap_or(Value::Null); 19 | 20 | let vals: Vec; 21 | if let Value::Array(arr) = node { 22 | vals = arr; 23 | } else { 24 | vals = vec![node]; 25 | } 26 | 27 | // Some websites place schema.org objects under "@graph", which we want to use as values 28 | let vals: Vec = if let Some(obj) = vals.first().and_then(|v| v.as_object()) { 29 | obj.get("@graph") 30 | .and_then(|v| v.as_array()) 31 | .unwrap_or(&vals) 32 | .to_vec() 33 | } else { 34 | vals 35 | }; 36 | 37 
| vals.into_iter() 38 | .flat_map(|v| { 39 | let type_opt = v["@type"].clone(); 40 | if let Value::String(ref type_val) = type_opt { 41 | return Some(SchemaOrg { 42 | schema_type: type_val.to_string(), 43 | value: v, 44 | }); 45 | } 46 | None 47 | }) 48 | .collect() 49 | } 50 | } 51 | 52 | #[cfg(test)] 53 | mod tests { 54 | use super::SchemaOrg; 55 | 56 | #[test] 57 | fn test_empty() { 58 | let schema = SchemaOrg::from("{}".to_string()); 59 | assert!(schema.is_empty()); 60 | } 61 | 62 | #[test] 63 | fn test_type() { 64 | let schema = SchemaOrg::from("{\"@type\": \"NewsArticle\"}".to_string()); 65 | assert_eq!(schema.len(), 1); 66 | assert_eq!(schema[0].schema_type, "NewsArticle"); 67 | } 68 | 69 | #[test] 70 | fn test_graph() { 71 | let schema = SchemaOrg::from("{\"@context\":\"https://schema.org\",\"@graph\":[{\"@context\":\"https://schema.org\",\"@type\":\"NewsArticle\"}]}".to_string()); 72 | assert_eq!(schema.len(), 1); 73 | assert_eq!(schema[0].schema_type, "NewsArticle"); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /tests/data/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Example Domain 5 | 6 | 7 | 8 | 9 | 40 | 41 | 42 | 43 |
44 |

Example Domain

45 |

This domain is established to be used for illustrative examples in documents. You may use this 46 | domain in examples without prior coordination or asking for permission.

47 |

More information...

48 |
49 | 50 | 51 | -------------------------------------------------------------------------------- /tests/integration_test.rs: -------------------------------------------------------------------------------- 1 | extern crate webpage; 2 | 3 | use std::io::{Read, Write}; 4 | use std::net::TcpListener; 5 | 6 | #[cfg(feature = "curl")] 7 | use webpage::{Webpage, WebpageOptions, HTML}; 8 | 9 | #[test] 10 | fn from_file() { 11 | let path = "tests/data/index.html"; 12 | let html = HTML::from_file(path, None); 13 | assert!(html.is_ok()); 14 | 15 | let html = html.unwrap(); 16 | assert_eq!(html.title, Some("Example Domain".to_string())); 17 | assert!(html.description.is_none()); 18 | } 19 | 20 | #[test] 21 | #[ignore] 22 | #[cfg(feature = "curl")] 23 | fn from_url() { 24 | let url = "https://example.org"; 25 | let webpage = Webpage::from_url(url, WebpageOptions::default()); 26 | assert!(webpage.is_ok()); 27 | 28 | let html = webpage.unwrap().html; 29 | assert_eq!(html.title, Some("Example Domain".to_string())); 30 | assert!(html.description.is_none()); 31 | } 32 | 33 | #[test] 34 | fn test_headers() { 35 | let socket = TcpListener::bind("127.0.0.1:0").unwrap(); // bind to a random port 36 | let url = format!("{}", socket.local_addr().unwrap()); 37 | std::thread::spawn(move || { 38 | let my_headers: Vec = vec!["X-My-Header: 1234".to_string()]; 39 | let mut options = WebpageOptions::default(); 40 | options.headers = my_headers; 41 | let webpage = Webpage::from_url(&url, options); 42 | assert!(webpage.is_ok()); 43 | }); 44 | let mut stream = socket.accept().unwrap().0; 45 | let mut buf = vec![0; 1024]; 46 | let mut read = 0; 47 | let mut request; 48 | loop { 49 | let bytes = stream.read(&mut buf[read..]).unwrap(); 50 | assert_ne!(bytes, 0); 51 | read += bytes; 52 | request = String::from_utf8(buf[..read].to_vec()).unwrap(); 53 | if request.contains("\r\n\r\n") { 54 | break; 55 | } 56 | } 57 | assert!(request.contains("X-My-Header: 1234\r\n")); 58 | stream.write_all(b"HTTP/1.1 200 
OK\r\n\r\n").unwrap(); 59 | } 60 | --------------------------------------------------------------------------------