Directory structure:
├── .gitignore
├── CHANGELOG.md
├── examples
│   └── example.rs
├── Cargo.toml
├── README.md
├── LICENSE
└── src
    └── lib.rs

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/target
**/*.rs.bk
Cargo.lock

--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
# 0.1.0

- Initial release of crate

--------------------------------------------------------------------------------
/examples/example.rs:
--------------------------------------------------------------------------------
extern crate url_scraper;
use url_scraper::UrlScraper;

fn main() {
    let directory = "http://phoronix.com/";

    let scraper = UrlScraper::new(directory).unwrap();
    for (text, url) in scraper.into_iter() {
        println!("{}: {}", text, url);
    }
}

--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "url-scraper"
version = "0.1.2"
description = "Simple HTML URL scraper"
repository = "https://github.com/pop-os/url-scraper"
authors = ["Michael Aaron Murphy"]
license = "MIT"
readme = "README.md"
keywords = ["url", "scraper"]
categories = ["web-programming"]

[dependencies]
reqwest = "0.9"
scraper = "0.12"

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# url-scraper

Rust crate for scraping URLs from HTML pages.

## Example

```rust
extern crate url_scraper;
use url_scraper::UrlScraper;

fn main() {
    let directory = "http://phoronix.com/";

    let scraper = UrlScraper::new(directory).unwrap();
    for (text, url) in scraper.into_iter() {
        println!("{}: {}", text, url);
    }
}
```

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 System76

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
//! Simple library for quickly fetching a list of URLs from a webpage.
//!
//! # Example
//! ```rust,no_run
//! extern crate url_scraper;
//! use url_scraper::UrlScraper;
//!
//! let scraper = UrlScraper::new("http://phoronix.com/").unwrap();
//! for (text, url) in scraper.into_iter() {
//!     println!("{}: {}", text, url);
//! }
//! ```

extern crate reqwest;
extern crate scraper;

use reqwest::{Client, Url};
use scraper::{Html, html::Select, Selector};
use std::fmt;

/// Stores the HTML document in memory.
pub struct UrlScraper {
    url: Url,
    html: Html,
    selector: Selector,
}

impl UrlScraper {
    /// Constructs a new scraper from a given URL.
    pub fn new(url: &str) -> Result<Self, Error> {
        let client = Client::new();
        Self::new_with_client(url, &client)
    }

    /// Use an existing `reqwest::Client` to make a request.
    pub fn new_with_client(url: &str, client: &Client) -> Result<Self, Error> {
        let url = Url::parse(url)?;
        let mut resp = client.get(url.clone()).send()?;
        let html = resp.text()?;

        Ok(Self {
            url,
            html: Html::parse_document(&html),
            selector: Selector::parse("a").expect("failed to create selector"),
        })
    }

    /// In case the HTML has already been fetched in advance, this can be used to parse from it directly.
    pub fn new_with_html(url: &str, html: &str) -> Result<Self, Error> {
        Ok(Self {
            url: Url::parse(url)?,
            html: Html::parse_document(html),
            selector: Selector::parse("a").expect("failed to create selector"),
        })
    }

    /// Fetch the URLs using an iterator.
    pub fn into_iter<'a>(&'a self) -> UrlIter<'a, 'a> {
        UrlIter {
            url: &self.url,
            data: self.html.select(&self.selector),
        }
    }
}

/// An Iterator that returns `(String, Url)` pairs per iteration.
pub struct UrlIter<'a, 'b> {
    url: &'a Url,
    data: Select<'a, 'b>,
}

impl<'a, 'b> Iterator for UrlIter<'a, 'b> {
    type Item = (String, Url);

    fn next(&mut self) -> Option<Self::Item> {
        for element in &mut self.data {
            if let Some(url) = element.value().attr("href") {
                // Skip query-only links, and resolve everything else against the base URL.
                if !url.starts_with('?') {
                    if let Ok(url) = self.url.join(url) {
                        return Some((element.inner_html(), url));
                    }
                }
            }
        }

        None
    }
}

#[derive(Debug)]
pub enum Error {
    UrlParsing { why: reqwest::UrlError },
    Request { why: reqwest::Error },
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let error = match *self {
            Error::UrlParsing { ref why } => format!("failed to parse URL: {}", why),
            Error::Request { ref why } => format!("failure in request: {}", why),
        };
        f.write_str(&error)
    }
}

impl From<reqwest::UrlError> for Error {
    fn from(why: reqwest::UrlError) -> Error {
        Error::UrlParsing { why }
    }
}

impl From<reqwest::Error> for Error {
    fn from(why: reqwest::Error) -> Error {
        Error::Request { why }
    }
}
--------------------------------------------------------------------------------
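
Beyond the README example, the crate's `new_with_html` constructor (defined in `src/lib.rs` above) parses links out of HTML that has already been fetched, with no network request. Below is a minimal sketch of that path; the `example.com` URLs and the inline HTML string are placeholders, not part of the crate.

```rust
extern crate url_scraper;
use url_scraper::UrlScraper;

fn main() {
    // HTML obtained ahead of time (e.g. read from a cache or a file).
    // The base URL is only used to resolve relative hrefs.
    let html = r#"<a href="/news">News</a> <a href="https://example.com/about">About</a>"#;

    // Parse the given string directly instead of issuing a request.
    let scraper = UrlScraper::new_with_html("https://example.com/", html).unwrap();
    for (text, url) in scraper.into_iter() {
        println!("{}: {}", text, url);
    }
}
```

Because `into_iter` borrows `&self` rather than consuming the scraper, the same `UrlScraper` can be iterated more than once after a single fetch or parse.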