Directory structure:
├── .gitignore
├── CHANGELOG.md
├── examples
│   └── example.rs
├── Cargo.toml
├── README.md
├── LICENSE
└── src
    └── lib.rs

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/target
**/*.rs.bk
Cargo.lock

--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
# 0.1.0

- Initial release of crate

--------------------------------------------------------------------------------
/examples/example.rs:
--------------------------------------------------------------------------------
extern crate url_scraper;
use url_scraper::UrlScraper;

fn main() {
    let directory = "http://phoronix.com/";

    let scraper = UrlScraper::new(directory).unwrap();
    for (text, url) in scraper.into_iter() {
        println!("{}: {}", text, url);
    }
}

--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "url-scraper"
version = "0.1.2"
description = "Simple HTML URL scraper"
repository = "https://github.com/pop-os/url-scraper"
authors = ["Michael Aaron Murphy"]
license = "MIT"
readme = "README.md"
keywords = ["url", "scraper"]
categories = ["web-programming"]

[dependencies]
reqwest = "0.9"
scraper = "0.12"

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# url-scraper

Rust crate for scraping URLs from HTML pages.

## Example

```rust
extern crate url_scraper;
use url_scraper::UrlScraper;

fn main() {
    let directory = "http://phoronix.com/";

    let scraper = UrlScraper::new(directory).unwrap();
    for (text, url) in scraper.into_iter() {
        println!("{}: {}", text, url);
    }
}
```

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 System76

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
//! Simple library for quickly fetching a list of URLs from a webpage.
//!
//! # Example
//! ```rust,no_run
//! extern crate url_scraper;
//! use url_scraper::UrlScraper;
//!
//! let scraper = UrlScraper::new("http://phoronix.com/").unwrap();
//! for (text, url) in scraper.into_iter() {
//!     println!("{}: {}", text, url);
//! }
//! ```

extern crate reqwest;
extern crate scraper;

use reqwest::{Client, Url};
use scraper::{Html, html::Select, Selector};
use std::fmt;

/// Stores the HTML document in memory.
pub struct UrlScraper {
    url: Url,
    html: Html,
    selector: Selector,
}

impl UrlScraper {
    /// Constructs a new scraper from a given URL.
    pub fn new(url: &str) -> Result<Self, Error> {
        let client = Client::new();
        Self::new_with_client(url, &client)
    }

    /// Use an existing `reqwest::Client` to make a request.
    pub fn new_with_client(url: &str, client: &Client) -> Result<Self, Error> {
        let url = Url::parse(url)?;
        let mut resp = client.get(url.clone()).send()?;
        let html = resp.text()?;

        Ok(Self {
            url,
            html: Html::parse_document(&html),
            selector: Selector::parse("a").expect("failed to create selector"),
        })
    }

    /// In case the HTML has already been fetched in advance, this can be used to parse from it directly.
    pub fn new_with_html(url: &str, html: &str) -> Result<Self, Error> {
        Ok(Self {
            url: Url::parse(url)?,
            html: Html::parse_document(html),
            selector: Selector::parse("a").expect("failed to create selector"),
        })
    }

    /// Fetch the URLs using an iterator.
    pub fn into_iter<'a>(&'a self) -> UrlIter<'a, 'a> {
        UrlIter {
            url: &self.url,
            data: self.html.select(&self.selector),
        }
    }
}

/// An Iterator that returns `(String, Url)` pairs per iteration.
pub struct UrlIter<'a, 'b> {
    url: &'a Url,
    data: Select<'a, 'b>,
}

impl<'a, 'b> Iterator for UrlIter<'a, 'b> {
    type Item = (String, Url);

    fn next(&mut self) -> Option<Self::Item> {
        for element in &mut self.data {
            if let Some(url) = element.value().attr("href") {
                // Skip query-only links, and resolve everything else against the base URL.
                if !url.starts_with('?') {
                    if let Ok(url) = self.url.join(url) {
                        return Some((element.inner_html(), url));
                    }
                }
            }
        }

        None
    }
}

#[derive(Debug)]
pub enum Error {
    UrlParsing { why: reqwest::UrlError },
    Request { why: reqwest::Error },
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let error = match *self {
            Error::UrlParsing { ref why } => format!("failed to parse URL: {}", why),
            Error::Request { ref why } => format!("failure in request: {}", why),
        };
        f.write_str(&error)
    }
}

impl From<reqwest::UrlError> for Error {
    fn from(why: reqwest::UrlError) -> Error {
        Error::UrlParsing { why }
    }
}

impl From<reqwest::Error> for Error {
    fn from(why: reqwest::Error) -> Error {
        Error::Request { why }
    }
}
--------------------------------------------------------------------------------
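
Beyond the README example, the crate's `new_with_html` constructor (defined in `src/lib.rs` above) parses links out of HTML that has already been fetched, with no network request. Below is a minimal sketch of that path; the `example.com` URLs and the inline HTML string are placeholders, not part of the crate.

```rust
extern crate url_scraper;
use url_scraper::UrlScraper;

fn main() {
    // HTML obtained ahead of time (e.g. read from a cache or a file).
    // The base URL is only used to resolve relative hrefs.
    let html = r#"<a href="/news">News</a> <a href="https://example.com/about">About</a>"#;

    // Parse the given string directly instead of issuing a request.
    let scraper = UrlScraper::new_with_html("https://example.com/", html).unwrap();
    for (text, url) in scraper.into_iter() {
        println!("{}: {}", text, url);
    }
}
```

Because `into_iter` borrows `&self` rather than consuming the scraper, the same `UrlScraper` can be iterated more than once after a single fetch or parse.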