├── .gitignore
├── Cargo.toml
├── LICENSE
├── README.md
└── src
    └── main.rs

/.gitignore:
--------------------------------------------------------------------------------
/target

--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "substack_scraper"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
chrono = "0.4.23"
clap = { version = "4.0.32", features = ["derive"] }
color-eyre = "0.6.2"
derive = "1.0.0"
env_logger = "0.10.0"
fancy-regex = "0.10.0"
features = "0.10.0"
fs-err = "2.9.0"
futures = "0.3.25"
html2text = "0.4.4"
log = "0.4.17"
regex = "1.7.0"
reqwest = { version = "0.11.13", features = ["json"] }
scraper = "0.14.0"
serde = { version = "1.0.152", features = ["derive"] }
tokio = { version = "1.23.0", features = ["full"] }
url = { version = "2.3.1", features = ["serde"] }
voca_rs = "1.15.2"

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Ivy

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Substack Scraper
This tool scrapes Substack blogs for all of their post content and writes it out as raw text files. It was originally intended to produce neural network training data.

*NOTE*: This project currently cannot get around subscriber-only Substack articles; for those it outputs the truncated preview text along with the subscription prompt.

# Usage
```shell
git clone https://github.com/ivyraine/substack_scraper
cargo run -- -w "<space-delimited Substack URLs>"
```
Example:
```sh
cargo run -- -w "https://substack.thewebscraping.club/ https://etiennefd.substack.com/"
```
For debug messages, set the environment variable `RUST_LOG=debug`.

# Contributing
Feel free to open an issue or PR if you have any suggestions or improvements, but I cannot guarantee that I'll get to them! The project is small and has some documentation, so I would encourage putting up a PR if you have a feature you want to add.
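# Output
Each scraped post is written as a plain-text file under a `blogs/` directory, grouped by the site's host name and the post's URL path (see `scrape` in `src/main.rs`). As a rough illustration only (the post slugs below are invented), scraping one of the example sites could produce a layout like:
```
blogs/
└── etiennefd.substack.com/
    └── p/
        ├── some-post
        └── another-post
```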
--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
use reqwest;
use html2text::from_read;
use serde::{Deserialize, Serialize};
use voca_rs::strip::strip_tags;
use tokio::{macros, spawn};
use futures::executor::block_on;
use log::{debug, LevelFilter};
use env_logger::{Builder, Target};
use std::{env, fs, iter};
use std::collections::HashSet;
use std::io::Write;
use std::path::Path;
use std::thread::sleep;
use chrono::Local;
use scraper::{Html, Selector};

use clap::{Parser, Subcommand};
use clap::builder::TypedValueParser;
use color_eyre::eyre;
use color_eyre::eyre::eyre;
use env_logger::Target::Stdout;
use futures::TryFutureExt;
use fancy_regex::Regex;
use reqwest::Url;

#[derive(Parser)]
#[command(author, version, about, long_about = None)]
struct Cli {
    /// Optional name to operate on
    name: Option<String>,

    /// A space-delimited list of substack sites to scrape, such as "https://blog.bytebytego.com/ https://astralcodexten.substack.com/"
    #[clap(short, long, use_value_delimiter = true, value_delimiter = ' ')]
    websites: Vec<String>,
}

#[tokio::main(flavor = "multi_thread")]
async fn main() -> eyre::Result<()> {
    Builder::from_default_env()
        .format(|buf, record| {
            writeln!(buf,
                "{} [{}] - {}",
                Local::now().format("%Y-%m-%dT%H:%M:%S"),
                record.level(),
                record.args()
            )
        })
        .target(Stdout)
        .init();

    let cli = Cli::parse();

    debug!("Websites are {:?}", cli.websites);
    // Convert to the Url type.
    // Remove websites that are empty (the space delimiter can leave empty strings behind).
    let websites = cli.websites.iter().filter(|x| !x.is_empty());

    let websites = websites.into_iter().map(|s|
        Url::parse(&s).unwrap())
        .collect::<Vec<Url>>();

    // let join_handle = tokio::spawn(async move {
    for website in websites {
        scrape(&website).await.expect(&*format!("Failed to scrape {}", website));
    }
    // });

    // Wait for the async functions to complete.
    // join_handle.await.unwrap();
    Ok(())
}

/// One entry in the Substack archive API response; only the post's canonical URL is kept.
#[derive(Deserialize)]
#[derive(Debug)]
struct CanonicalUrl {
    canonical_url: Url,
}

async fn scrape(homepage_url: &Url) -> eyre::Result<()> {
    let post_urls = get_post_urls(homepage_url).await?;

    let mut urls_to_post_content: Vec<(&Url, Vec<String>)> = Vec::new();

    // Get posts' content.
    for post_url in &post_urls {
        let post = get_post_content(post_url).await?;
        urls_to_post_content.push((post_url, post));
    }

    let blog_folder_path = Path::new("blogs").join(Path::new(&homepage_url.host_str().unwrap()));

    // Write each post to a file under blogs/<host>/<post path>.
    for (url, post) in urls_to_post_content {
        let path = Path::new(url.path());
        let path = path.strip_prefix("/").unwrap_or(path);
        let path = blog_folder_path.join(path);
        if let Some(dir) = path.parent() {
            fs_err::create_dir_all(dir)?;
        }
        fs_err::write(&path, post.join("\n").as_bytes())?;
    }
    Ok(())
}

/// Get the text content of a post.
async fn get_post_content(url: &Url) -> eyre::Result<Vec<String>> {
    // TODO wait & retry getting content when hitting rate limit.
    debug!("url is {:?}", url);

    let mut result = Vec::new();
    loop {
        let headers = reqwest::get(url.clone()).await?;
        debug!("headers are {:?}", headers);
        let body = headers.text().await?;

        let fragment = Html::parse_fragment(&body);
        // The following selector looks for <p> elements inside the .available-content container,
        // skipping paragraphs that only wrap subscribe buttons.
        let selector = Selector::parse(".available-content p:not(.button-wrapper)").unwrap();
        for it in fragment.select(&selector) {
            let temp = it.inner_html();
            result.push(cleanup_content(&temp));
        };
        if !result.is_empty() { break };
        // Wait on rate limiter.
        sleep(std::time::Duration::from_secs(1));
        debug!("Retrying...");
    }
    debug!("{:?}", result);
    Ok(result)
}

/// Transform HTML into clean text output.
fn cleanup_content(input: &String) -> String {
    // Drop in-paragraph footnote numbers (a digit immediately following a closing '>'),
    // so they do not survive tag stripping. Assumes footnote anchors always look like that.
    let regex_footnote = Regex::new(r">\d").unwrap();
    let temp = regex_footnote.replace_all(&input, ">").to_string();
    // Strip HTML tags.
    let temp = strip_tags(&temp);
    // Remove HTML encoding artifacts like &nbsp;
    let temp = from_read(temp.as_bytes(), 100);
    let temp = temp.replace("\n", " ");
    temp
}

async fn get_post_urls(homepage_url: &Url) -> eyre::Result<HashSet<Url>> {
    debug!("Scraping {}", homepage_url);

    // Offset of the first post to request on the next iteration.
    let mut page_offset = 0;
    // Number of posts to request per page.
    let page_limit = 12;

    // Article URLs collected so far.
    let mut seen_urls = HashSet::new();

    loop {
        // Get content. The API URL may be subject to change by Substack.
        let current_request_url = format!("{}api/v1/archive?sort=new&search=&offset={}&limit={}", homepage_url, page_offset, page_limit);
        debug!("current_request_url = {}", &current_request_url);

        let post_urls = reqwest::get(&current_request_url)
            .await?
            .json::<Vec<CanonicalUrl>>()
            .await?;

        // An empty page means the archive has been exhausted.
        if post_urls.is_empty() {
            break;
        }
        // Add this page's URLs.
        seen_urls.extend(post_urls.into_iter().map(|it| it.canonical_url));

        page_offset += page_limit;
    }
    debug!("seen_urls = {seen_urls:?}");

    debug!("Finished scraping {}", homepage_url);
    Ok(seen_urls)
}

--------------------------------------------------------------------------------
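The HTML-to-text cleanup in `cleanup_content` is the easiest piece of the scraper to check in isolation. Below is a minimal test sketch that is not part of the repository but could be appended to `src/main.rs`; the input snippet is made up, and the assertions only check that tags and the footnote digit are removed rather than pinning down exact whitespace, since `html2text` decides the final wrapping.

```rust
// Hypothetical test module (not in the original repository); a sketch of how
// cleanup_content could be exercised with a fabricated paragraph similar to
// what the ".available-content p" selector yields.
#[cfg(test)]
mod tests {
    use super::cleanup_content;

    #[test]
    fn cleanup_strips_tags_and_footnote_digits() {
        // Made-up inner_html: bold text plus a footnote anchor "1".
        let input = String::from(
            r#"Hello <strong>world</strong><a class="footnote-anchor" href="#footnote-1">1</a> again"#,
        );
        let output = cleanup_content(&input);

        // All HTML tags should be gone.
        assert!(!output.contains('<') && !output.contains('>'));
        // The readable words survive.
        assert!(output.contains("Hello"));
        assert!(output.contains("world"));
        assert!(output.contains("again"));
        // The footnote number that followed a closing '>' should have been dropped.
        assert!(!output.contains('1'));
    }
}
```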