├── .github ├── dependabot.yml └── workflows │ ├── build.yml │ └── msrv.yml ├── .gitignore ├── CHANGELOG.md ├── Cargo.toml ├── README.md ├── src ├── html.rs ├── http.rs ├── lib.rs ├── opengraph.rs ├── parser.rs └── schema_org.rs └── tests ├── data └── index.html └── integration_test.rs /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "cargo" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | # read-only repo token 4 | # no access to secrets 5 | on: 6 | push: 7 | branches: [ master ] 8 | pull_request: 9 | 10 | env: 11 | CARGO_TERM_COLOR: always 12 | 13 | jobs: 14 | verify-build: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | # checkout repo 19 | - uses: actions/checkout@v3 20 | 21 | - name: Install rust 22 | uses: dtolnay/rust-toolchain@stable 23 | with: 24 | components: clippy, rustfmt 25 | 26 | - name: Generate Cargo.lock 27 | run: cargo generate-lockfile 28 | 29 | # restore cargo cache from previous runs 30 | - name: Rust Cache 31 | uses: Swatinem/rust-cache@v2 32 | with: 33 | # The cache should not be shared between different workflows and jobs. 
34 | shared-key: ${{ github.workflow }}-${{ github.job }} 35 | 36 | # check it builds 37 | - name: Build 38 | run: cargo build --locked --verbose --all-targets --all-features 39 | 40 | # run tests 41 | - name: Run tests 42 | run: cargo test --verbose --all-features 43 | 44 | # make sure all code has been formatted with rustfmt 45 | - name: check rustfmt 46 | run: cargo fmt -- --check --color always 47 | 48 | # run clippy to verify we have no warnings 49 | - name: cargo clippy 50 | env: 51 | RUSTDOCFLAGS: -D warnings 52 | run: cargo clippy --all-targets --all-features 53 | 54 | # check for rustdoc warnings 55 | - name: generate and verify rustdoc 56 | env: 57 | RUSTDOCFLAGS: -D warnings 58 | run: cargo doc --no-deps --document-private-items --workspace --all-features 59 | -------------------------------------------------------------------------------- /.github/workflows/msrv.yml: -------------------------------------------------------------------------------- 1 | name: MSRV 2 | 3 | # read-only repo token 4 | # no access to secrets 5 | on: 6 | push: 7 | branches: [ master ] 8 | pull_request: 9 | 10 | env: 11 | CARGO_TERM_COLOR: always 12 | 13 | jobs: 14 | verify-build: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | # checkout repo 19 | - uses: actions/checkout@v3 20 | 21 | - name: Install rust 22 | # Aligned with `rust-version` in `Cargo.toml` 23 | uses: dtolnay/rust-toolchain@1.63 24 | 25 | - name: Generate Cargo.lock 26 | run: cargo generate-lockfile 27 | 28 | # restore cargo cache from previous runs 29 | - name: Rust Cache 30 | uses: Swatinem/rust-cache@v2 31 | with: 32 | # The cache should not be shared between different workflows and jobs. 
33 | shared-key: ${{ github.workflow }}-${{ github.job }} 34 | 35 | # check it builds 36 | - name: Build 37 | run: cargo build --locked --verbose --all-targets --all-features 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | /target 3 | **/*.rs.bk 4 | Cargo.lock 5 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Version History 2 | 3 | ## Version 2.0.1 4 | 5 | - Specified the MSRV rust-version (1.63) 6 | - Updated dependencies 7 | 8 | ## Version 2.0.0 9 | 10 | Breaking: 11 | - Changed all structs to be `non_exhaustive` 12 | - Moved all structs to the crate root (no re-exports) 13 | 14 | New features: 15 | - Added the ability to specify HTTP request headers 16 | - Collect all links/anchors of the HTML document 17 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "webpage" 3 | description = "Small library to fetch info about a web page: title, description, language, HTTP info, links, RSS feeds, Opengraph, Schema.org, and more" 4 | readme = "README.md" 5 | keywords = ["webpage", "html", "opengraph"] 6 | categories = ["web-programming"] 7 | license = "MIT" 8 | version = "2.0.1" 9 | authors = ["Otto "] 10 | repository = "https://github.com/orottier/webpage-rs" 11 | edition = "2021" 12 | rust-version = "1.63" 13 | 14 | [features] 15 | default = ["curl"] 16 | serde = ["dep:serde"] 17 | 18 | [dependencies] 19 | curl = { version = "0.4.41", optional = true } 20 | html5ever = "0.27" 21 | markup5ever_rcdom = "0.3" 22 | serde = { version = "1.0", optional = true, features = ["derive"] } 23 | serde_json = "1.0" 24 | url = "2.5" 25 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Webpage.rs 2 | 3 | [![crates.io](https://img.shields.io/crates/v/webpage.svg)](https://crates.io/crates/webpage) 4 | [![docs.rs](https://img.shields.io/docsrs/webpage)](https://docs.rs/webpage) 5 | 6 | _Small library to fetch info about a web page: title, description, language, 7 | HTTP info, links, RSS feeds, Opengraph, Schema.org, and more_ 8 | 9 | ## Usage 10 | 11 | ```rust 12 | use webpage::{Webpage, WebpageOptions}; 13 | 14 | let info = Webpage::from_url("http://www.rust-lang.org/en-US/", WebpageOptions::default()) 15 | .expect("Could not read from URL"); 16 | 17 | // the HTTP transfer info 18 | let http = info.http; 19 | 20 | assert_eq!(http.ip, "54.192.129.71".to_string()); 21 | assert!(http.headers[0].starts_with("HTTP")); 22 | assert!(http.body.starts_with("")); 23 | assert_eq!(http.url, "https://www.rust-lang.org/en-US/".to_string()); // followed redirects (HTTPS) 24 | assert_eq!(http.content_type, "text/html".to_string()); 25 | 26 | // the parsed HTML info 27 | let html = info.html; 28 | 29 | assert_eq!(html.title, Some("The Rust Programming Language".to_string())); 30 | assert_eq!(html.description, Some("A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_string())); 31 | assert_eq!(html.opengraph.og_type, "website".to_string()); 32 | ``` 33 | 34 | You can also get HTML info about local data: 35 | 36 | ```rust 37 | use webpage::HTML; 38 | let html = HTML::from_file("index.html", None); 39 | // or let html = HTML::from_string(input, None); 40 | ``` 41 | 42 | ## Features 43 | 44 | ### Serialization 45 | 46 | If you need to be able to serialize the data provided by the library using 47 | [serde](https://serde.rs/), you can specify the `serde` *feature* while 48 | declaring your dependencies in `Cargo.toml`: 49 | 
50 | ```toml 51 | webpage = { version = "2.0", features = ["serde"] } 52 | ``` 53 | 54 | ### No curl dependency 55 | 56 | The `curl` feature is enabled by default but is optional. This is useful if you 57 | do not need a HTTP client but already have the HTML data at hand. 58 | 59 | ## All fields 60 | 61 | ```rust 62 | pub struct Webpage { 63 | pub http: HTTP, // info about the HTTP transfer 64 | pub html: HTML, // info from the parsed HTML doc 65 | } 66 | 67 | pub struct HTTP { 68 | pub ip: String, 69 | pub transfer_time: Duration, 70 | pub redirect_count: u32, 71 | pub content_type: String, 72 | pub response_code: u32, 73 | pub headers: Vec, // raw headers from final request 74 | pub url: String, // effective url 75 | pub body: String, 76 | } 77 | 78 | pub struct HTML { 79 | pub title: Option, 80 | pub description: Option, 81 | 82 | pub url: Option, // canonical url 83 | pub feed: Option, // RSS feed typically 84 | 85 | pub language: Option, // as specified, not detected 86 | pub text_content: String, // all tags stripped from body 87 | pub links: Vec, // all links in the document 88 | 89 | pub meta: HashMap, // flattened down list of meta properties 90 | 91 | pub opengraph: Opengraph, 92 | pub schema_org: Vec, 93 | } 94 | 95 | pub struct Link { 96 | pub url: String, // resolved url of the link 97 | pub text: String, // anchor text 98 | } 99 | 100 | pub struct Opengraph { 101 | pub og_type: String, 102 | pub properties: HashMap, 103 | 104 | pub images: Vec, 105 | pub videos: Vec, 106 | pub audios: Vec, 107 | } 108 | 109 | // Facebook's Opengraph structured data 110 | pub struct OpengraphObject { 111 | pub url: String, 112 | pub properties: HashMap, 113 | } 114 | 115 | // Google's schema.org structured data 116 | pub struct SchemaOrg { 117 | pub schema_type: String, 118 | pub value: serde_json::Value, 119 | } 120 | ``` 121 | 122 | ## Options 123 | 124 | The following HTTP configurations are available: 125 | 126 | ```rust 127 | pub struct WebpageOptions { 128 | 
allow_insecure: false, 129 | follow_location: true, 130 | max_redirections: 5, 131 | timeout: Duration::from_secs(10), 132 | useragent: "Webpage - Rust crate - https://crates.io/crates/webpage".to_string(), 133 | headers: vec!["X-My-Header: 1234".to_string()], 134 | } 135 | 136 | // usage 137 | let mut options = WebpageOptions::default(); 138 | options.allow_insecure = true; 139 | let info = Webpage::from_url(&url, options).expect("Halp, could not fetch"); 140 | ``` 141 | -------------------------------------------------------------------------------- /src/html.rs: -------------------------------------------------------------------------------- 1 | //! Info from the parsed HTML document 2 | 3 | use html5ever::driver::ParseOpts; 4 | use html5ever::parse_document; 5 | use html5ever::tendril::TendrilSink; 6 | use markup5ever_rcdom::RcDom; 7 | use url::Url; 8 | 9 | use std::collections::HashMap; 10 | use std::default::Default; 11 | use std::io; 12 | use std::path::Path; 13 | 14 | use crate::opengraph::Opengraph; 15 | use crate::parser::Parser; 16 | use crate::schema_org::SchemaOrg; 17 | 18 | /// Information regarding the HTML content 19 | #[derive(Debug, Clone)] 20 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 21 | #[non_exhaustive] 22 | pub struct HTML { 23 | /// \ 24 | pub title: Option, 25 | /// meta description 26 | pub description: Option, 27 | /// Canonical URL 28 | pub url: Option, 29 | #[cfg_attr(feature = "serde", serde(skip))] 30 | pub(crate) url_parsed: Option, 31 | /// Feed URL (atom, rss, ..) 
32 | pub feed: Option, 33 | 34 | /// Language as specified in the document 35 | pub language: Option, 36 | /// Text content inside \, all tags stripped 37 | pub text_content: String, 38 | 39 | /// Flattened down list of meta properties 40 | pub meta: HashMap, 41 | /// Opengraph tags 42 | pub opengraph: Opengraph, 43 | /// Schema.org data 44 | pub schema_org: Vec, 45 | /// All links in the document 46 | pub links: Vec, 47 | } 48 | 49 | impl HTML { 50 | fn empty(url: Option) -> Self { 51 | let url_parsed = url.as_ref().and_then(|u| Url::parse(u).ok()); 52 | Self { 53 | title: None, 54 | description: None, 55 | url, 56 | url_parsed, 57 | feed: None, 58 | 59 | language: None, 60 | text_content: String::new(), 61 | 62 | meta: HashMap::new(), 63 | opengraph: Opengraph::empty(), 64 | schema_org: Vec::new(), 65 | links: Vec::new(), 66 | } 67 | } 68 | 69 | /// Construct HTML from RcDom, optionally with a URL set 70 | fn from_dom(dom: RcDom, url: Option) -> Self { 71 | let mut html = Self::empty(url); 72 | let parser = Parser::start(dom.document); 73 | parser.traverse(&mut html); 74 | 75 | html 76 | } 77 | 78 | /// Construct HTML from File, optionally with a URL set 79 | pub fn from_file(path: &str, url: Option) -> Result { 80 | parse_document(RcDom::default(), ParseOpts::default()) 81 | .from_utf8() 82 | .from_file(Path::new(path)) 83 | .map(|dom| Self::from_dom(dom, url)) 84 | } 85 | 86 | /// Construct HTML from String, optionally with a URL set 87 | /// 88 | /// ## Examples 89 | /// ``` 90 | /// use webpage::HTML; 91 | /// 92 | /// let input = String::from("HelloContents"); 93 | /// let html = HTML::from_string(input, None); 94 | /// assert!(html.is_ok()); 95 | /// ``` 96 | pub fn from_string(html: String, url: Option) -> Result { 97 | parse_document(RcDom::default(), ParseOpts::default()) 98 | .from_utf8() 99 | .read_from(&mut html.as_bytes()) 100 | .map(|dom| Self::from_dom(dom, url)) 101 | } 102 | 103 | pub(crate) fn set_url(&mut self, url: Option) { 104 | 
self.url_parsed = url.as_ref().and_then(|url| Url::parse(url).ok()); 105 | self.url = url; 106 | } 107 | } 108 | 109 | /// Information for an `` anchor 110 | #[derive(Debug, Clone, Eq, PartialEq, Hash)] 111 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 112 | #[non_exhaustive] 113 | pub struct Link { 114 | pub url: String, 115 | pub text: String, 116 | } 117 | 118 | #[cfg(test)] 119 | mod tests { 120 | use super::*; 121 | 122 | #[test] 123 | fn from_string() { 124 | let input = "HelloContents Link" 125 | .to_string(); 126 | let html = HTML::from_string(input, Some("https://example.com/".into())); 127 | assert!(html.is_ok()); 128 | 129 | let html = html.unwrap(); 130 | assert_eq!(html.title, Some("Hello".to_string())); 131 | assert!(html.description.is_none()); 132 | assert_eq!(html.text_content, "Contents Link".to_string()); 133 | assert_eq!( 134 | html.links, 135 | vec![Link { 136 | url: "https://example.com/a".into(), 137 | text: "Link".into() 138 | }] 139 | ); 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/http.rs: -------------------------------------------------------------------------------- 1 | //! 
Info about the HTTP transfer 2 | 3 | use std::io; 4 | use std::time::Duration; 5 | 6 | use curl::easy::{Easy, List}; 7 | 8 | use crate::WebpageOptions; 9 | 10 | /// Information regarding the HTTP transfer 11 | #[derive(Debug, Clone)] 12 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 13 | #[non_exhaustive] 14 | pub struct HTTP { 15 | /// The external ip address (v4 or v6) 16 | pub ip: String, 17 | /// Duration of the HTTP call 18 | pub transfer_time: Duration, 19 | /// Number of redirections encountered 20 | pub redirect_count: u32, 21 | /// HTTP content type returned 22 | pub content_type: String, 23 | /// HTTP response code returned 24 | pub response_code: u32, 25 | /// All HTTP response headers 26 | pub headers: Vec, 27 | /// Effective URL that was visited 28 | pub url: String, 29 | /// HTTP body 30 | pub body: String, 31 | } 32 | 33 | impl HTTP { 34 | /// Fetch a webpage from the given URL 35 | /// 36 | /// ## Examples 37 | /// ``` 38 | /// use webpage::HTTP; 39 | /// use webpage::WebpageOptions; 40 | /// 41 | /// let info = HTTP::fetch("http://example.org", WebpageOptions::default()); 42 | /// assert!(info.is_ok()); 43 | /// 44 | /// let info = HTTP::fetch("mal formed or unreachable", WebpageOptions::default()); 45 | /// assert!(info.is_err()); 46 | /// ``` 47 | pub fn fetch(url: &str, options: WebpageOptions) -> Result { 48 | let mut handle = Easy::new(); 49 | 50 | // configure 51 | handle.ssl_verify_peer(!options.allow_insecure)?; 52 | handle.ssl_verify_host(!options.allow_insecure)?; 53 | handle.timeout(options.timeout)?; 54 | handle.follow_location(options.follow_location)?; 55 | handle.max_redirections(options.max_redirections)?; 56 | handle.useragent(&options.useragent)?; 57 | if !options.headers.is_empty() { 58 | let mut list = List::new(); 59 | for header in options.headers.iter() { 60 | list.append(header)?; 61 | } 62 | handle.http_headers(list)?; 63 | } 64 | 65 | handle.url(url)?; 66 | 67 | let mut headers = Vec::new(); 68 | let mut 
body = Vec::new(); 69 | { 70 | let mut transfer = handle.transfer(); 71 | transfer.header_function(|new_data| { 72 | let header = String::from_utf8_lossy(new_data) 73 | .into_owned() 74 | .trim() 75 | .to_string(); 76 | 77 | // clear list on redirects 78 | if header.starts_with("HTTP/") { 79 | headers = Vec::new(); 80 | } 81 | 82 | if !header.is_empty() { 83 | headers.push(header); 84 | } 85 | 86 | true 87 | })?; 88 | 89 | transfer.write_function(|new_data| { 90 | body.extend_from_slice(new_data); 91 | Ok(new_data.len()) 92 | })?; 93 | 94 | transfer.perform()?; 95 | } 96 | 97 | let body = String::from_utf8_lossy(&body).into_owned(); 98 | 99 | Ok(HTTP { 100 | ip: handle.primary_ip()?.unwrap_or("").to_string(), 101 | transfer_time: handle.total_time()?, 102 | redirect_count: handle.redirect_count()?, 103 | content_type: handle.content_type()?.unwrap_or("").to_string(), 104 | response_code: handle.response_code()?, 105 | url: handle.effective_url()?.unwrap_or("").to_string(), 106 | 107 | headers, 108 | body, 109 | }) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! _Small library to fetch info about a web page: title, description, language, HTTP info, links, RSS feeds, Opengraph, Schema.org, and more_ 2 | //! 3 | //! ## Usage 4 | //! 5 | //! ```rust 6 | //! use webpage::{Webpage, WebpageOptions}; 7 | //! 8 | //! let info = Webpage::from_url("http://example.org", WebpageOptions::default()) 9 | //! .expect("Could not read from URL"); 10 | //! 11 | //! // the HTTP transfer info 12 | //! let http = info.http; 13 | //! 14 | //! // assert_eq!(http.ip, "54.192.129.71".to_string()); 15 | //! assert!(http.headers[0].starts_with("HTTP")); 16 | //! assert!(http.body.starts_with("")); 17 | //! assert_eq!(http.url, "http://example.org/".to_string()); // effective url 18 | //! 
assert_eq!(http.content_type, "text/html; charset=UTF-8".to_string()); 19 | //! 20 | //! // the parsed HTML info 21 | //! let html = info.html; 22 | //! 23 | //! assert_eq!(html.title, Some("Example Domain".to_string())); 24 | //! assert_eq!(html.description, None); 25 | //! assert_eq!(html.links.len(), 1); 26 | //! assert_eq!(html.opengraph.og_type, "website".to_string()); 27 | //! ``` 28 | //! 29 | //! You can also get HTML info about local data: 30 | //! 31 | //! ```rust 32 | //! use webpage::HTML; 33 | //! let html = HTML::from_file("index.html", None); 34 | //! // or let html = HTML::from_string(input, None); 35 | //! ``` 36 | //! 37 | //! ## Options 38 | //! 39 | //! The following configurations are available: 40 | //! ```rust 41 | //! pub struct WebpageOptions { 42 | //! allow_insecure: bool, 43 | //! follow_location: bool, 44 | //! max_redirections: u32, 45 | //! timeout: std::time::Duration, 46 | //! useragent: String, 47 | //! headers: Vec, 48 | //! } 49 | //! ``` 50 | //! 51 | //! ```rust 52 | //! use webpage::{Webpage, WebpageOptions}; 53 | //! 54 | //! let mut options = WebpageOptions::default(); 55 | //! options.allow_insecure = true; 56 | //! let info = Webpage::from_url("https://example.org", options).expect("Halp, could not fetch"); 57 | //! 
``` 58 | 59 | mod html; 60 | pub use html::{Link, HTML}; 61 | 62 | #[cfg(feature = "curl")] 63 | mod http; 64 | #[cfg(feature = "curl")] 65 | pub use http::HTTP; 66 | 67 | mod opengraph; 68 | pub use opengraph::{Opengraph, OpengraphObject}; 69 | 70 | mod schema_org; 71 | pub use schema_org::SchemaOrg; 72 | 73 | mod parser; 74 | 75 | #[cfg(feature = "curl")] 76 | use std::time::Duration; 77 | 78 | #[cfg(feature = "serde")] 79 | #[macro_use] 80 | extern crate serde; 81 | 82 | /// All gathered info for a webpage 83 | #[derive(Debug)] 84 | #[cfg(feature = "curl")] 85 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 86 | #[non_exhaustive] 87 | pub struct Webpage { 88 | /// info about the HTTP transfer 89 | pub http: HTTP, 90 | /// info from the parsed HTML doc 91 | pub html: HTML, 92 | } 93 | 94 | /// Configuration options for fetching a webpage 95 | #[derive(Debug)] 96 | #[cfg(feature = "curl")] 97 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 98 | #[non_exhaustive] 99 | pub struct WebpageOptions { 100 | /// Allow fetching over invalid and/or self signed HTTPS connections \[false\] 101 | pub allow_insecure: bool, 102 | /// Follow HTTP redirects \[true\] 103 | pub follow_location: bool, 104 | /// Max number of redirects to follow \[5\] 105 | pub max_redirections: u32, 106 | /// Timeout for the HTTP request \[10 secs\] 107 | pub timeout: Duration, 108 | /// User agent string used for the request \[webpage-rs - \] 109 | pub useragent: String, 110 | /// Custom HTTP headers to send with the request 111 | pub headers: Vec, 112 | } 113 | 114 | #[cfg(feature = "curl")] 115 | impl Default for WebpageOptions { 116 | fn default() -> Self { 117 | Self { 118 | allow_insecure: false, 119 | follow_location: true, 120 | max_redirections: 5, 121 | timeout: Duration::from_secs(10), 122 | useragent: "webpage-rs - https://crates.io/crates/webpage".to_string(), 123 | headers: Vec::new(), 124 | } 125 | } 126 | } 127 | 128 | #[cfg(feature = "curl")] 129 | 
impl Webpage { 130 | /// Fetch a webpage from the given URL, and extract HTML info 131 | /// 132 | /// ## Examples 133 | /// ``` 134 | /// use webpage::{Webpage, WebpageOptions}; 135 | /// 136 | /// let info = Webpage::from_url("http://example.org", WebpageOptions::default()); 137 | /// assert!(info.is_ok()) 138 | /// ``` 139 | pub fn from_url(url: &str, options: WebpageOptions) -> Result { 140 | let http = HTTP::fetch(url, options)?; 141 | 142 | let html = HTML::from_string(http.body.clone(), Some(http.url.clone()))?; 143 | 144 | Ok(Self { http, html }) 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /src/opengraph.rs: -------------------------------------------------------------------------------- 1 | //! OpenGraph information 2 | 3 | use std::collections::HashMap; 4 | 5 | #[derive(Debug, Clone)] 6 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 7 | #[non_exhaustive] 8 | /// Representing [OpenGraph](http://ogp.me/) information 9 | pub struct Opengraph { 10 | /// Opengraph type (article, image, event, ..) 
11 | pub og_type: String, 12 | /// Opengraph properties of this object 13 | pub properties: HashMap, 14 | 15 | /// Images relevant to this object 16 | pub images: Vec, 17 | /// Videos relevant to this object 18 | pub videos: Vec, 19 | /// Audio relevant to this object 20 | pub audios: Vec, 21 | } 22 | 23 | #[derive(Debug, Clone)] 24 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 25 | #[non_exhaustive] 26 | /// Info about an OpenGraph media type 27 | pub struct OpengraphObject { 28 | /// URL describing this object 29 | pub url: String, 30 | /// Properties of the referred object 31 | pub properties: HashMap, 32 | } 33 | 34 | impl OpengraphObject { 35 | pub fn new(url: String) -> Self { 36 | Self { 37 | url, 38 | properties: HashMap::new(), 39 | } 40 | } 41 | } 42 | 43 | impl Opengraph { 44 | pub fn empty() -> Self { 45 | Self { 46 | og_type: "website".to_string(), 47 | properties: HashMap::new(), 48 | 49 | images: vec![], 50 | videos: vec![], 51 | audios: vec![], 52 | } 53 | } 54 | 55 | pub fn extend(&mut self, property: &str, content: String) { 56 | if property == "type" { 57 | self.og_type = content; 58 | } else if property.starts_with("image") { 59 | parse_object("image", property, content, &mut self.images); 60 | } else if property.starts_with("video") { 61 | parse_object("video", property, content, &mut self.videos); 62 | } else if property.starts_with("audio") { 63 | parse_object("audio", property, content, &mut self.audios); 64 | } else { 65 | self.properties.insert(property.to_string(), content); 66 | } 67 | } 68 | } 69 | 70 | fn parse_object( 71 | og_type: &str, 72 | property: &str, 73 | content: String, 74 | collection: &mut Vec, 75 | ) { 76 | let num_images = collection.len(); 77 | 78 | if property == og_type || &property[og_type.len()..] 
== ":url" { 79 | collection.push(OpengraphObject::new(content)); 80 | } else if num_images > 0 && property.len() > og_type.len() + 1 { 81 | let property = &property["image:".len()..]; 82 | collection[num_images - 1] 83 | .properties 84 | .insert(property.to_string(), content); 85 | } 86 | } 87 | 88 | #[cfg(test)] 89 | mod tests { 90 | use super::Opengraph; 91 | 92 | #[test] 93 | fn test_type() { 94 | let mut opengraph = Opengraph::empty(); 95 | assert_eq!(opengraph.og_type, "website"); 96 | 97 | opengraph.extend("type", "article".to_string()); 98 | assert_eq!(opengraph.og_type, "article"); 99 | } 100 | 101 | #[test] 102 | fn test_image() { 103 | let mut opengraph = Opengraph::empty(); 104 | 105 | opengraph.extend("image", "http://example.org/image.png".to_string()); 106 | opengraph.extend( 107 | "image:secure_url", 108 | "https://example.org/image.png".to_string(), 109 | ); 110 | assert_eq!(opengraph.images.len(), 1); 111 | assert_eq!(opengraph.images[0].url, "http://example.org/image.png"); 112 | 113 | let prop = opengraph.images[0].properties.get("secure_url"); 114 | assert!(prop.is_some()); 115 | assert_eq!(prop.unwrap(), "https://example.org/image.png"); 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/parser.rs: -------------------------------------------------------------------------------- 1 | use html5ever::tendril::{fmt::UTF8, Tendril}; 2 | use html5ever::Attribute; 3 | use markup5ever_rcdom::{Handle, NodeData}; 4 | 5 | use crate::html::{Link, HTML}; 6 | use crate::schema_org::SchemaOrg; 7 | 8 | #[derive(Copy, Clone)] 9 | enum Segment { 10 | None, 11 | Head, 12 | Body, 13 | } 14 | 15 | pub struct Parser<'a> { 16 | segment: Segment, 17 | parent: Option<&'a NodeData>, 18 | handle: Handle, 19 | } 20 | 21 | impl<'a> Parser<'a> { 22 | pub fn start(handle: Handle) -> Self { 23 | Parser { 24 | handle, 25 | segment: Segment::None, 26 | parent: None, 27 | } 28 | } 29 | 30 | pub fn traverse(self, html: &mut 
HTML) { 31 | let mut segment = self.segment; 32 | 33 | let handle_ref = &self.handle; 34 | match self.handle.data { 35 | NodeData::Document => (), 36 | NodeData::Doctype { .. } => (), 37 | NodeData::Comment { .. } => (), 38 | 39 | NodeData::Text { ref contents } => { 40 | if let Some(NodeData::Element { ref name, .. }) = self.parent { 41 | let tag_name = name.local.as_ref(); 42 | 43 | process_text( 44 | self.segment, 45 | tag_name, 46 | tendril_to_utf8(&contents.borrow()), 47 | html, 48 | ) 49 | } 50 | } 51 | 52 | NodeData::Element { 53 | ref name, 54 | ref attrs, 55 | .. 56 | } => { 57 | let tag_name = name.local.as_ref(); 58 | 59 | if tag_name == "head" { 60 | segment = Segment::Head; 61 | } else if tag_name == "body" { 62 | segment = Segment::Body; 63 | } 64 | 65 | process_element(segment, tag_name, handle_ref, &attrs.borrow(), html) 66 | } 67 | 68 | NodeData::ProcessingInstruction { .. } => unreachable!(), 69 | } 70 | 71 | for child in self.handle.children.borrow().iter() { 72 | let new_parser = Parser { 73 | segment, 74 | parent: Some(&self.handle.data), 75 | handle: child.clone(), 76 | }; 77 | new_parser.traverse(html); 78 | } 79 | } 80 | } 81 | 82 | fn process_text(segment: Segment, tag_name: &str, contents: &str, html: &mut HTML) { 83 | if let Segment::Body = segment { 84 | if tag_name != "style" && tag_name != "script" && tag_name != "noscript" { 85 | if !html.text_content.is_empty() { 86 | html.text_content.push(' '); 87 | } 88 | html.text_content.push_str(contents); 89 | } 90 | } 91 | } 92 | 93 | fn process_element( 94 | segment: Segment, 95 | tag_name: &str, 96 | handle: &Handle, 97 | attrs: &[Attribute], 98 | html: &mut HTML, 99 | ) { 100 | // process language attribute 101 | if tag_name == "html" || tag_name == "body" { 102 | let language = get_attribute(attrs, "lang"); 103 | if language.is_some() { 104 | html.language = language; 105 | } 106 | } 107 | 108 | // process 109 | if let Segment::Head = segment { 110 | if tag_name == "title" { 111 | 
html.title = text_content(handle); 112 | } 113 | if tag_name == "meta" { 114 | let content = get_attribute(attrs, "content"); 115 | if let Some(content) = content { 116 | let property_opt = get_attribute(attrs, "property") 117 | .or_else(|| get_attribute(attrs, "name")) 118 | .or_else(|| get_attribute(attrs, "http-equiv")); 119 | 120 | if let Some(property) = property_opt { 121 | html.meta.insert(property.clone(), content.clone()); 122 | 123 | if property.starts_with("og:") && property.len() > 3 { 124 | html.opengraph.extend(&property[3..], content); 125 | } else if property == "description" { 126 | html.description = Some(content); 127 | } 128 | } 129 | } 130 | 131 | if let Some(charset) = get_attribute(attrs, "charset") { 132 | html.meta.insert("charset".to_string(), charset); 133 | } 134 | } 135 | if tag_name == "link" { 136 | let rel = get_attribute(attrs, "rel").unwrap_or_default(); 137 | if rel == "canonical" { 138 | html.set_url(get_attribute(attrs, "href")); 139 | } else if rel == "alternate" { 140 | let link_type = get_attribute(attrs, "type").unwrap_or_default(); 141 | if [ 142 | "application/atom+xml", 143 | "application/json", 144 | "application/rdf+xml", 145 | "application/rss+xml", 146 | "application/xml", 147 | "text/xml", 148 | ] 149 | .contains(&&link_type[..]) 150 | { 151 | html.feed = get_attribute(attrs, "href"); 152 | } 153 | } 154 | } 155 | } 156 | 157 | // process ld-json snippets 158 | if tag_name == "script" { 159 | if let Some(script_type) = get_attribute(attrs, "type") { 160 | if script_type == "application/ld+json" { 161 | if let Some(content) = text_content(handle) { 162 | html.schema_org.append(&mut SchemaOrg::from(content)); 163 | } 164 | } 165 | } 166 | } 167 | 168 | if tag_name == "a" { 169 | if let Some(href) = get_attribute(attrs, "href") { 170 | let text = text_content(handle).unwrap_or_default(); 171 | let href = if let Some(url) = &html.url_parsed { 172 | if let Ok(url) = url.join(&href) { 173 | url.to_string() 174 | } else { 
175 | href 176 | } 177 | } else { 178 | href 179 | }; 180 | html.links.push(Link { url: href, text }); 181 | } 182 | } 183 | } 184 | 185 | fn get_attribute(attrs: &[Attribute], name: &str) -> Option { 186 | attrs 187 | .iter() 188 | .find(|attr| attr.name.local.as_ref() == name) 189 | .map(|attr| attr.value.trim().to_string()) 190 | } 191 | 192 | fn text_content(handle: &Handle) -> Option { 193 | // todo paste all the text together 194 | for child in handle.children.borrow().iter() { 195 | if let NodeData::Text { ref contents } = child.data { 196 | let string = tendril_to_utf8(&contents.borrow()).to_string(); 197 | return Some(string.trim().to_string()); 198 | } 199 | } 200 | 201 | None 202 | } 203 | 204 | fn tendril_to_utf8(t: &Tendril) -> &str { 205 | t 206 | } 207 | -------------------------------------------------------------------------------- /src/schema_org.rs: -------------------------------------------------------------------------------- 1 | //! Schema.org information 2 | 3 | use serde_json::{self, Value}; 4 | 5 | /// Representing [Schema.org](https://schema.org/) information (currently only via JSON-LD) 6 | #[derive(Debug, Clone)] 7 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 8 | #[non_exhaustive] 9 | pub struct SchemaOrg { 10 | /// Schema.org type (article, image, event) 11 | pub schema_type: String, 12 | /// Schema.org info 13 | pub value: Value, 14 | } 15 | 16 | impl SchemaOrg { 17 | pub fn from(content: String) -> Vec { 18 | let node: Value = serde_json::from_str(&content).unwrap_or(Value::Null); 19 | 20 | let vals: Vec; 21 | if let Value::Array(arr) = node { 22 | vals = arr; 23 | } else { 24 | vals = vec![node]; 25 | } 26 | 27 | // Some websites place schema.org objects under "@graph", which we want to use as values 28 | let vals: Vec = if let Some(obj) = vals.first().and_then(|v| v.as_object()) { 29 | obj.get("@graph") 30 | .and_then(|v| v.as_array()) 31 | .unwrap_or(&vals) 32 | .to_vec() 33 | } else { 34 | vals 35 | }; 36 | 37 
| vals.into_iter() 38 | .flat_map(|v| { 39 | let type_opt = v["@type"].clone(); 40 | if let Value::String(ref type_val) = type_opt { 41 | return Some(SchemaOrg { 42 | schema_type: type_val.to_string(), 43 | value: v, 44 | }); 45 | } 46 | None 47 | }) 48 | .collect() 49 | } 50 | } 51 | 52 | #[cfg(test)] 53 | mod tests { 54 | use super::SchemaOrg; 55 | 56 | #[test] 57 | fn test_empty() { 58 | let schema = SchemaOrg::from("{}".to_string()); 59 | assert!(schema.is_empty()); 60 | } 61 | 62 | #[test] 63 | fn test_type() { 64 | let schema = SchemaOrg::from("{\"@type\": \"NewsArticle\"}".to_string()); 65 | assert_eq!(schema.len(), 1); 66 | assert_eq!(schema[0].schema_type, "NewsArticle"); 67 | } 68 | 69 | #[test] 70 | fn test_graph() { 71 | let schema = SchemaOrg::from("{\"@context\":\"https://schema.org\",\"@graph\":[{\"@context\":\"https://schema.org\",\"@type\":\"NewsArticle\"}]}".to_string()); 72 | assert_eq!(schema.len(), 1); 73 | assert_eq!(schema[0].schema_type, "NewsArticle"); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /tests/data/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Example Domain 5 | 6 | 7 | 8 | 9 | 40 | 41 | 42 | 43 |
44 |

Example Domain

45 |

This domain is established to be used for illustrative examples in documents. You may use this 46 | domain in examples without prior coordination or asking for permission.

47 |

More information...

48 |
49 | 50 | 51 | -------------------------------------------------------------------------------- /tests/integration_test.rs: -------------------------------------------------------------------------------- 1 | extern crate webpage; 2 | 3 | use std::io::{Read, Write}; 4 | use std::net::TcpListener; 5 | 6 | #[cfg(feature = "curl")] 7 | use webpage::{Webpage, WebpageOptions, HTML}; 8 | 9 | #[test] 10 | fn from_file() { 11 | let path = "tests/data/index.html"; 12 | let html = HTML::from_file(path, None); 13 | assert!(html.is_ok()); 14 | 15 | let html = html.unwrap(); 16 | assert_eq!(html.title, Some("Example Domain".to_string())); 17 | assert!(html.description.is_none()); 18 | } 19 | 20 | #[test] 21 | #[ignore] 22 | #[cfg(feature = "curl")] 23 | fn from_url() { 24 | let url = "https://example.org"; 25 | let webpage = Webpage::from_url(url, WebpageOptions::default()); 26 | assert!(webpage.is_ok()); 27 | 28 | let html = webpage.unwrap().html; 29 | assert_eq!(html.title, Some("Example Domain".to_string())); 30 | assert!(html.description.is_none()); 31 | } 32 | 33 | #[test] 34 | fn test_headers() { 35 | let socket = TcpListener::bind("127.0.0.1:0").unwrap(); // bind to a random port 36 | let url = format!("{}", socket.local_addr().unwrap()); 37 | std::thread::spawn(move || { 38 | let my_headers: Vec = vec!["X-My-Header: 1234".to_string()]; 39 | let mut options = WebpageOptions::default(); 40 | options.headers = my_headers; 41 | let webpage = Webpage::from_url(&url, options); 42 | assert!(webpage.is_ok()); 43 | }); 44 | let mut stream = socket.accept().unwrap().0; 45 | let mut buf = vec![0; 1024]; 46 | let mut read = 0; 47 | let mut request; 48 | loop { 49 | let bytes = stream.read(&mut buf[read..]).unwrap(); 50 | assert_ne!(bytes, 0); 51 | read += bytes; 52 | request = String::from_utf8(buf[..read].to_vec()).unwrap(); 53 | if request.contains("\r\n\r\n") { 54 | break; 55 | } 56 | } 57 | assert!(request.contains("X-My-Header: 1234\r\n")); 58 | stream.write_all(b"HTTP/1.1 200 
OK\r\n\r\n").unwrap(); 59 | } 60 | --------------------------------------------------------------------------------