├── .gitignore
├── Cargo.toml
├── LICENSE
├── README.md
└── src
    └── main.rs

/.gitignore:
--------------------------------------------------------------------------------
/target

--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "substack_scraper"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
chrono = "0.4.23"
clap = { version = "4.0.32", features = ["derive"] }
color-eyre = "0.6.2"
derive = "1.0.0"
env_logger = "0.10.0"
fancy-regex = "0.10.0"
features = "0.10.0"
fs-err = "2.9.0"
futures = "0.3.25"
html2text = "0.4.4"
log = "0.4.17"
regex = "1.7.0"
reqwest = { version = "0.11.13", features = ["json"] }
scraper = "0.14.0"
serde = { version = "1.0.152", features = ["derive"] }
tokio = { version = "1.23.0", features = ["full"] }
url = { version = "2.3.1", features = ["serde"] }
voca_rs = "1.15.2"

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Ivy

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Substack Scraper
This tool scrapes Substack blogs for all of their post content and writes it out as raw text files. It was originally intended to produce neural network training data.

*NOTE*: This project currently cannot get around subscriber-only Substack articles; for those it outputs the truncated preview text along with the subscription prompt.

# Usage
```shell
git clone https://github.com/ivyraine/substack_scraper
cargo run -- -w "<space-delimited Substack URLs>"
```
Example:
```sh
cargo run -- -w "https://substack.thewebscraping.club/ https://etiennefd.substack.com/"
```
For debug messages, set the environment variable `RUST_LOG=debug`.

# Contributing
Feel free to open an issue or PR if you have any suggestions or improvements, but I cannot guarantee that I'll get to them! The project is small and has some documentation, so I would encourage putting up a PR if you have a feature you want to add.
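# Output
Each scraped post is written as a plain-text file under a `blogs/` directory, grouped by the site's host name and the post's URL path (see `scrape` in `src/main.rs`). As a rough illustration only (the post slugs below are invented), scraping one of the example sites could produce a layout like:
```
blogs/
└── etiennefd.substack.com/
    └── p/
        ├── some-post
        └── another-post
```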
--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
use reqwest;
use html2text::from_read;
use serde::{Deserialize, Serialize};
use voca_rs::strip::strip_tags;
use tokio::{macros, spawn};
use futures::executor::block_on;
use log::{debug, LevelFilter};
use env_logger::{Builder, Target};
use std::{env, fs, iter};
use std::collections::HashSet;
use std::io::Write;
use std::path::Path;
use std::thread::sleep;
use chrono::Local;
use scraper::{Html, Selector};

use clap::{Parser, Subcommand};
use clap::builder::TypedValueParser;
use color_eyre::eyre;
use color_eyre::eyre::eyre;
use env_logger::Target::Stdout;
use futures::TryFutureExt;
use fancy_regex::Regex;
use reqwest::Url;

#[derive(Parser)]
#[command(author, version, about, long_about = None)]
struct Cli {
    /// Optional name to operate on
    name: Option<String>,

    /// A space-delimited list of substack sites to scrape, such as "https://blog.bytebytego.com/ https://astralcodexten.substack.com/"
    #[clap(short, long, use_value_delimiter = true, value_delimiter = ' ')]
    websites: Vec<String>,
}

#[tokio::main(flavor = "multi_thread")]
async fn main() -> eyre::Result<()> {
    Builder::from_default_env()
        .format(|buf, record| {
            writeln!(buf,
                "{} [{}] - {}",
                Local::now().format("%Y-%m-%dT%H:%M:%S"),
                record.level(),
                record.args()
            )
        })
        .target(Stdout)
        .init();

    let cli = Cli::parse();

    debug!("Websites are {:?}", cli.websites);
    // Convert to the Url type.
    // Remove websites that are empty (the space delimiter can leave empty strings behind).
    let websites = cli.websites.iter().filter(|x| !x.is_empty());

    let websites = websites.into_iter().map(|s|
        Url::parse(&s).unwrap())
        .collect::<Vec<Url>>();

    // let join_handle = tokio::spawn(async move {
    for website in websites {
        scrape(&website).await.expect(&*format!("Failed to scrape {}", website));
    }
    // });

    // Wait for the async functions to complete.
    // join_handle.await.unwrap();
    Ok(())
}

/// One entry in the Substack archive API response; only the post's canonical URL is kept.
#[derive(Deserialize)]
#[derive(Debug)]
struct CanonicalUrl {
    canonical_url: Url,
}

async fn scrape(homepage_url: &Url) -> eyre::Result<()> {
    let post_urls = get_post_urls(homepage_url).await?;

    let mut urls_to_post_content: Vec<(&Url, Vec<String>)> = Vec::new();

    // Get posts' content.
    for post_url in &post_urls {
        let post = get_post_content(post_url).await?;
        urls_to_post_content.push((post_url, post));
    }

    let blog_folder_path = Path::new("blogs").join(Path::new(&homepage_url.host_str().unwrap()));

    // Write each post to a file under blogs/<host>/<post path>.
    for (url, post) in urls_to_post_content {
        let path = Path::new(url.path());
        let path = path.strip_prefix("/").unwrap_or(path);
        let path = blog_folder_path.join(path);
        if let Some(dir) = path.parent() {
            fs_err::create_dir_all(dir)?;
        }
        fs_err::write(&path, post.join("\n").as_bytes())?;
    }
    Ok(())
}

/// Get the text content of a post.
async fn get_post_content(url: &Url) -> eyre::Result<Vec<String>> {
    // TODO wait & retry getting content when hitting rate limit.
    debug!("url is {:?}", url);

    let mut result = Vec::new();
    loop {
        let headers = reqwest::get(url.clone()).await?;
        debug!("headers are {:?}", headers);
        let body = headers.text().await?;

        let fragment = Html::parse_fragment(&body);
        // The following selector looks for <p> elements inside the .available-content container,
        // skipping paragraphs that only wrap subscribe buttons.
        let selector = Selector::parse(".available-content p:not(.button-wrapper)").unwrap();
        for it in fragment.select(&selector) {
            let temp = it.inner_html();
            result.push(cleanup_content(&temp));
        };
        if !result.is_empty() { break };
        // Wait on rate limiter.
        sleep(std::time::Duration::from_secs(1));
        debug!("Retrying...");
    }
    debug!("{:?}", result);
    Ok(result)
}

/// Transform HTML into clean text output.
fn cleanup_content(input: &String) -> String {
    // Drop in-paragraph footnote numbers (a digit immediately following a closing '>'),
    // so they do not survive tag stripping. Assumes footnote anchors always look like that.
    let regex_footnote = Regex::new(r">\d").unwrap();
    let temp = regex_footnote.replace_all(&input, ">").to_string();
    // Strip HTML tags.
    let temp = strip_tags(&temp);
    // Remove HTML encoding artifacts like &nbsp;
    let temp = from_read(temp.as_bytes(), 100);
    let temp = temp.replace("\n", " ");
    temp
}

async fn get_post_urls(homepage_url: &Url) -> eyre::Result<HashSet<Url>> {
    debug!("Scraping {}", homepage_url);

    // Offset of the first post to request on the next iteration.
    let mut page_offset = 0;
    // Number of posts to request per page.
    let page_limit = 12;

    // Article URLs collected so far.
    let mut seen_urls = HashSet::new();

    loop {
        // Get content. The API URL may be subject to change by Substack.
        let current_request_url = format!("{}api/v1/archive?sort=new&search=&offset={}&limit={}", homepage_url, page_offset, page_limit);
        debug!("current_request_url = {}", &current_request_url);

        let post_urls = reqwest::get(&current_request_url)
            .await?
            .json::<Vec<CanonicalUrl>>()
            .await?;

        // An empty page means the archive has been exhausted.
        if post_urls.is_empty() {
            break;
        }
        // Add this page's URLs.
        seen_urls.extend(post_urls.into_iter().map(|it| it.canonical_url));

        page_offset += page_limit;
    }
    debug!("seen_urls = {seen_urls:?}");

    debug!("Finished scraping {}", homepage_url);
    Ok(seen_urls)
}

--------------------------------------------------------------------------------
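The HTML-to-text cleanup in `cleanup_content` is the easiest piece of the scraper to check in isolation. Below is a minimal test sketch that is not part of the repository but could be appended to `src/main.rs`; the input snippet is made up, and the assertions only check that tags and the footnote digit are removed rather than pinning down exact whitespace, since `html2text` decides the final wrapping.

```rust
// Hypothetical test module (not in the original repository); a sketch of how
// cleanup_content could be exercised with a fabricated paragraph similar to
// what the ".available-content p" selector yields.
#[cfg(test)]
mod tests {
    use super::cleanup_content;

    #[test]
    fn cleanup_strips_tags_and_footnote_digits() {
        // Made-up inner_html: bold text plus a footnote anchor "1".
        let input = String::from(
            r#"Hello <strong>world</strong><a class="footnote-anchor" href="#footnote-1">1</a> again"#,
        );
        let output = cleanup_content(&input);

        // All HTML tags should be gone.
        assert!(!output.contains('<') && !output.contains('>'));
        // The readable words survive.
        assert!(output.contains("Hello"));
        assert!(output.contains("world"));
        assert!(output.contains("again"));
        // The footnote number that followed a closing '>' should have been dropped.
        assert!(!output.contains('1'));
    }
}
```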