├── .gitignore
├── Cargo.toml
├── airbnb
    ├── Cargo.toml
    └── src
    │   └── lib.rs
├── annas_archive
    ├── Cargo.toml
    └── src
    │   └── lib.rs
├── bypass_cloudflare
    ├── Cargo.toml
    └── src
    │   └── lib.rs
├── readme.md
└── scraper
    ├── Cargo.toml
    └── src
        └── main.rs


/.gitignore:
--------------------------------------------------------------------------------
/target
/Cargo.lock
chromedriver*
*chromedriver

--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[workspace]
resolver = "2"
members = ["scraper", "airbnb", "annas_archive", "bypass_cloudflare"]

--------------------------------------------------------------------------------
/airbnb/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "airbnb"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
csv = "1.2.2"
serde = { version = "1.0.188", features = ["derive"] }
thirtyfour = "0.31.0"
tokio = { version = "1.32.0", features = ["full"] }
url = "2.4.1"

--------------------------------------------------------------------------------
/airbnb/src/lib.rs:
--------------------------------------------------------------------------------
//! Scrapes Airbnb search results for a place and writes them to `airbnb.csv`.
//!
//! A WebDriver server (e.g. chromedriver) must be listening on
//! `http://localhost:9515` before [`scrape_airbnb`] is called.

use serde::Serialize;
use std::error::Error;
use std::time::Duration;
use thirtyfour::{
    prelude::{ElementWaitable, WebDriverError},
    By, DesiredCapabilities, WebDriver, WebElement,
};
use tokio::time::sleep;
use url::Url;

/// CSS selector of the "next page" link in the pagination bar.
const NEXT_PAGE_SELECTOR: &str = "#site-content > div > div.p1szzjq8.dir.dir-ltr > div > div > div > nav > div > a.l1ovpqvx.c1ytbx3a.dir.dir-ltr";

/// CSS selector matching one element per listing on a results page.
const HOUSE_SELECTOR: &str = "#site-content > div > div:nth-child(2) > div > div > div > div > div.gsgwcjk.g8ge8f1.g14v8520.dir.dir-ltr > div > div > div.c1l1h97y.dir.dir-ltr > div > div > div > div > div > div.g1qv1ctd.c1v0rf5q.dir.dir-ltr";

/// CSS selector of the header button that opens the location search.
const CHOOSE_PLACE_SELECTOR: &str = "body > div:nth-child(8) > div > div > div:nth-child(1) > div > div.cd56ld.cb80sj1.dir.dir-ltr > div.h1ta6hky.dir.dir-ltr > div > div > div > header > div > div.c1ujpdn9.dir.dir-ltr > div.l1sjr04j.l1x4ovsg.llb1jct.lc9d3st.dir.dir-ltr > div > span.ieg7dag.dir.dir-ltr > button:nth-child(1)";

/// CSS selector of the button that submits the search.
const SEARCH_BUTTON_SELECTOR: &str = "#search-tabpanel > div.ir2ixub.dir.dir-ltr > div.c111bvlt.c1gh7ier.dir.dir-ltr > div.c1ddhymz.cggll98.dir.dir-ltr > div.s1t4vwjw.dir.dir-ltr > button";

/// Searches Airbnb for `place` and writes every listing found to `airbnb.csv`.
///
/// # Errors
///
/// Returns an error if the WebDriver session cannot be created, if a required
/// page element is missing, or if the CSV file cannot be written.
pub async fn scrape_airbnb(place: &str) -> Result<(), Box<dyn Error>> {
    let driver = initialize_driver().await?;
    let url = Url::parse("https://www.airbnb.it/")?;

    driver.goto(url).await?;
    // Fixed pauses give the page time to render before it is queried.
    sleep(Duration::from_secs(2)).await;

    search_location(&driver, place).await?;
    sleep(Duration::from_secs(2)).await;

    scrape_all(driver).await?;

    Ok(())
}

/// Scrapes every result page, then closes the browser session.
///
/// The session is closed even when scraping fails, so a failed run does not
/// leave a browser window behind.
async fn scrape_all(driver: WebDriver) -> Result<(), Box<dyn Error>> {
    let scrape_outcome = scrape_pages(&driver).await;
    let quit_outcome = driver.quit().await;

    // Report the scraping error first: it is the more useful of the two.
    scrape_outcome?;
    quit_outcome?;
    Ok(())
}

/// Writes the listings of the current page and of every following page to
/// `airbnb.csv`.
async fn scrape_pages(driver: &WebDriver) -> Result<(), Box<dyn Error>> {
    scroll_to_bottom(driver).await?;

    let mut wtr = csv::Writer::from_path("airbnb.csv")?;

    loop {
        // Always extract the current page before looking at the pagination,
        // so the last page is never skipped.
        for house_elem in get_house_elements(driver).await? {
            let bnb_details = BnbDetails::from(house_elem).await?;
            wtr.serialize(bnb_details)?;
        }

        // A missing "next" link means this was the last page.
        let next_page_button = match driver.find(By::Css(NEXT_PAGE_SELECTOR)).await {
            Ok(button) => button,
            Err(_) => break,
        };
        if !next_page_button.is_clickable().await? {
            break;
        }
        load_next_page(next_page_button, driver).await?;
    }

    // Dropping the writer would silently discard a failed final write.
    wtr.flush()?;
    Ok(())
}

/// Scrolls to the bottom of the page and waits one second.
async fn scroll_to_bottom(driver: &WebDriver) -> Result<(), WebDriverError> {
    driver
        .execute("window.scrollTo(0, document.body.scrollHeight);", vec![])
        .await?;
    sleep(Duration::from_secs(1)).await;
    Ok(())
}

/// Clicks the "next page" link, waits for the page, then scrolls to its bottom.
async fn load_next_page(
    next_page_button: WebElement,
    driver: &WebDriver,
) -> Result<(), Box<dyn Error>> {
    next_page_button.click().await?;
    sleep(Duration::from_secs(2)).await;

    scroll_to_bottom(driver).await?;

    Ok(())
}

/// Returns the listing elements of the page currently displayed.
async fn get_house_elements(driver: &WebDriver) -> Result<Vec<WebElement>, WebDriverError> {
    driver.find_all(By::Css(HOUSE_SELECTOR)).await
}

/// Opens a maximized Chrome session through the WebDriver server on port 9515.
async fn initialize_driver() -> Result<WebDriver, WebDriverError> {
    let caps = DesiredCapabilities::chrome();
    let driver = WebDriver::new("http://localhost:9515", caps).await?;
    driver.maximize_window().await?;
    Ok(driver)
}

/// Opens the location search, types `place` and submits the search.
async fn search_location(driver: &WebDriver, place: &str) -> Result<(), WebDriverError> {
    click_choose_place(driver).await?;

    write_place(driver, place).await?;

    click_search_button(driver).await?;

    Ok(())
}

/// Clicks the header button that opens the location input.
async fn click_choose_place(driver: &WebDriver) -> Result<(), WebDriverError> {
    driver
        .find(By::Css(CHOOSE_PLACE_SELECTOR))
        .await?
        .click()
        .await?;

    Ok(())
}

/// Waits for the location input to become clickable and types `place` into it.
async fn write_place(driver: &WebDriver, place: &str) -> Result<(), WebDriverError> {
    let input = driver
        .find(By::Css("#bigsearch-query-location-input"))
        .await?;
    input.wait_until().clickable().await?;

    input.send_keys(place).await?;

    Ok(())
}

/// Clicks the button that submits the search form.
async fn click_search_button(driver: &WebDriver) -> Result<(), WebDriverError> {
    driver
        .find(By::Css(SEARCH_BUTTON_SELECTOR))
        .await?
        .click()
        .await?;
    Ok(())
}

/// One CSV row: the text fields read from a single listing element.
#[derive(Debug, Serialize)]
struct BnbDetails {
    title: String,
    description: String,
    host: String,
    availability: String,
    price: String,
    /// Rating text, or `"No ratings available"` when the listing shows none.
    star: String,
}

impl BnbDetails {
    /// Reads every field of a listing from its element.
    ///
    /// Fails if any field other than the rating cannot be found.
    async fn from(house_elem: WebElement) -> Result<Self, WebDriverError> {
        Ok(Self {
            title: Self::get_title(&house_elem).await?,
            description: Self::get_description(&house_elem).await?,
            host: Self::get_host(&house_elem).await?,
            availability: Self::get_availability(&house_elem).await?,
            price: Self::get_price(&house_elem).await?,
            star: Self::get_star(&house_elem).await?,
        })
    }

    /// Returns the text of the first descendant of `parent` matching `selector`.
    async fn css_text(parent: &WebElement, selector: &str) -> Result<String, WebDriverError> {
        parent.find(By::Css(selector)).await?.text().await
    }

    async fn get_title(house_elem: &WebElement) -> Result<String, WebDriverError> {
        Self::css_text(house_elem, "div:nth-child(1)").await
    }

    async fn get_description(house_elem: &WebElement) -> Result<String, WebDriverError> {
        Self::css_text(house_elem, "div:nth-child(2) > span").await
    }

    /// Reads the host text, which the page nests in either one or two spans.
    async fn get_host(house_elem: &WebElement) -> Result<String, WebDriverError> {
        match house_elem
            .find(By::Css("div:nth-child(3) > span > span"))
            .await
        {
            Ok(host) => host.text().await,
            Err(_) => Self::css_text(house_elem, "div:nth-child(3) > span").await,
        }
    }

    async fn get_availability(house_elem: &WebElement) -> Result<String, WebDriverError> {
        Self::css_text(house_elem, "div:nth-child(4) > span > span").await
    }

    async fn get_price(house_elem: &WebElement) -> Result<String, WebDriverError> {
        house_elem
            .find(By::XPath("div[5]/div/div/span[1]/div/span[1]"))
            .await?
            .text()
            .await
    }

    /// Reads the rating text, falling back to a placeholder when the rating
    /// element cannot be found.
    async fn get_star(house_elem: &WebElement) -> Result<String, WebDriverError> {
        match house_elem
            .find(By::Css("span > span.r1dxllyb.dir.dir-ltr"))
            .await
        {
            Ok(star) => star.text().await,
            Err(_) => Ok("No ratings available".into()),
        }
    }
}

--------------------------------------------------------------------------------
/annas_archive/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "annas_archive"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
serde = { version = "1.0.188", features = ["derive"] }
thirtyfour = "0.31.0"
tokio = { version = "1.32.0", features = ["full"] }
url = "2.4.1"
undetected-chromedriver = "0.1.2"
bypass_cloudflare = { path = "../bypass_cloudflare" }
indicatif = "0.17.7"
reqwest = { version = "0.11.21", features = ["stream"] }
futures-util = "0.3.14"
--------------------------------------------------------------------------------
/annas_archive/src/lib.rs:
--------------------------------------------------------------------------------
//! Searches Anna's Archive for a book and downloads the first result.

use bypass_cloudflare::bypass_cloudflare;
use futures_util::StreamExt;
use indicatif::{ProgressBar, ProgressStyle};
use std::error::Error;
use std::fs::File;
use std::io::Write;
use std::time::Duration;
use thirtyfour::{By, WebDriver, WebElement};
use tokio::time::sleep;
use undetected_chromedriver::chrome;
use url::Url;

/// Searches Anna's Archive for `book` and downloads the first result into the
/// current directory.
///
/// # Errors
///
/// Returns an error if the browser cannot be started, if the search returns no
/// result, if an expected page element or link is missing, or if the download
/// fails.
pub async fn scrape_annas_archive(book: &str) -> Result<(), Box<dyn Error>> {
    let driver = chrome().await?;
    driver.maximize_window().await?;

    let url = generate_url(book)?;

    driver.goto(url).await?;
    sleep(Duration::from_secs(1)).await;

    scrape_all(&driver).await?;

    Ok(())
}

/// Builds the search URL for `book`.
///
/// The title is added through the query serializer so that characters such as
/// `&`, `=` or `+` end up inside the `q` value instead of corrupting the query.
fn generate_url(book: &str) -> Result<Url, Box<dyn Error>> {
    let mut url = Url::parse("https://annas-archive.org/search")?;
    url.query_pairs_mut().append_pair("q", book);
    Ok(url)
}

/// Collects the results of the current search page and downloads the first one.
///
/// Returns the information gathered for every result on the page.
async fn scrape_all(driver: &WebDriver) -> Result<Vec<BookInfo>, Box<dyn Error>> {
    driver
        .execute("window.scrollTo(0, document.body.scrollHeight);", vec![])
        .await?;
    sleep(Duration::from_secs(1)).await;

    let books = get_books(driver).await?;

    let books_info = get_books_info(books).await?;

    // An empty result list is a normal outcome of a search, not a bug.
    let chosen_book = books_info.first().ok_or("Unable to get the book")?;

    download_book(chosen_book, driver).await?;
    Ok(books_info)
}

/// Returns the result links of the current search page.
async fn get_books(driver: &WebDriver) -> Result<Vec<WebElement>, Box<dyn Error>> {
    let books = driver
        .find_all(By::Css(
            r"body > main > form > div.flex.w-\[100\%\] > div.min-w-\[0\] > div > div.h-\[125\] > a",
        ))
        .await?;
    Ok(books)
}

/// Reads the name, file extension and detail-page link of every result.
async fn get_books_info(books: Vec<WebElement>) -> Result<Vec<BookInfo>, Box<dyn Error>> {
    let mut books_info = Vec::with_capacity(books.len());
    for book in &books {
        let name = get_book_name(book).await?;

        // The extension is the second comma-separated field of the info line;
        // it is left empty when that field is absent.
        let info = get_book_info(book).await?;
        let extension = info.split(',').nth(1).unwrap_or(" ").trim().to_string();

        let download_link = get_book_link(book).await?;
        books_info.push(BookInfo::new(name, extension, download_link));
    }
    Ok(books_info)
}

/// Returns the title shown for a result.
async fn get_book_name(book: &WebElement) -> Result<String, Box<dyn Error>> {
    let book_name = book
        .find(By::Css(
            r" div.relative.top-\[-1\].pl-4.grow.overflow-hidden > h3",
        ))
        .await?
        .text()
        .await?;
    Ok(book_name)
}

/// Returns the comma-separated info line shown under a result's title.
async fn get_book_info(book: &WebElement) -> Result<String, Box<dyn Error>> {
    let info = book
        .find(By::Css(
            r" div.relative.top-\[-1\].pl-4.grow.overflow-hidden > div.truncate.text-xs.text-gray-500",
        ))
        .await?
        .text()
        .await?;
    Ok(info)
}

/// Opens the download page of `book_info` behind the Cloudflare challenge and
/// downloads the file.
async fn download_book(book_info: &BookInfo, driver: &WebDriver) -> Result<(), Box<dyn Error>> {
    let md5_link = get_md5_link(driver, book_info).await?;
    bypass_cloudflare(driver, md5_link).await?;
    sleep(Duration::from_secs(2)).await;

    download_file(driver, book_info).await?;
    Ok(())
}

/// Opens the detail page of a book and returns the URL of its first download
/// option.
async fn get_md5_link(driver: &WebDriver, book_info: &BookInfo) -> Result<Url, Box<dyn Error>> {
    driver.goto(book_info.download_link.as_str()).await?;
    let cloudflare_link = driver
        .find(By::Css(
            "#md5-panel-downloads > div:nth-child(2) > ul > li:nth-child(1) > a.js-download-link",
        ))
        .await?
        .attr("href")
        .await?
        .ok_or("Unable to get download link")?;
    append_to_base_url(cloudflare_link)
}

/// Reads the direct file link from the last browser window, closes the browser
/// and downloads the file.
async fn download_file(driver: &WebDriver, book_info: &BookInfo) -> Result<(), Box<dyn Error>> {
    let last_window = driver
        .windows()
        .await?
        .pop()
        .ok_or("Unable to get last windows")?;
    driver.switch_to_window(last_window).await?;
    let download_link = driver
        .find(By::Css("body > main > p:nth-child(2) > a"))
        .await?
        .attr("href")
        .await?
        .ok_or("Unable to get book download link")?;
    // The browser is no longer needed: the file itself is fetched with reqwest.
    driver.clone().quit().await?;

    show_progress_and_download(download_link, book_info).await?;

    Ok(())
}

/// Streams `download_link` into a file named after the book while showing a
/// progress bar.
///
/// # Errors
///
/// Returns an error if the server answers with an error status or without a
/// content length, or if the file cannot be created or written.
async fn show_progress_and_download(
    download_link: String,
    book_info: &BookInfo,
) -> Result<(), Box<dyn Error>> {
    // Without the status check an HTML error page would be saved as the book.
    let body = reqwest::get(&download_link).await?.error_for_status()?;

    let total_size = body.content_length().ok_or_else(|| {
        format!(
            "Failed to get content length from '{}'",
            &download_link
        )
    })?;

    let pb = ProgressBar::new(total_size);
    pb.set_style(
        ProgressStyle::default_bar()
            .template(
                "{msg} [{elapsed_precise}] [{bar:40.cyan/blue}] {bytes}/{total_bytes} ({eta})",
            )?
            .progress_chars("#>-"),
    );
    pb.set_message(format!("Downloading {}", book_info.name));

    let path = output_file_name(book_info);
    let mut file = File::create(&path)?;
    let mut downloaded: u64 = 0;

    let mut stream = body.bytes_stream();
    while let Some(item) = stream.next().await {
        let chunk = item.map_err(|e| format!("Error while downloading file: {}", e))?;
        file.write_all(&chunk)
            .map_err(|e| format!("Error while writing to file: {}", e))?;

        downloaded += chunk.len() as u64;
        pb.set_position(downloaded);
    }

    pb.finish_with_message("Done!");
    Ok(())
}

/// Builds the local file name `<name>.<extension>` for a book.
///
/// Both parts come from page text, so they are sanitized first; an empty name
/// becomes `book` and an empty extension is omitted.
fn output_file_name(book_info: &BookInfo) -> String {
    let stem = sanitize_path_component(&book_info.name);
    let stem = if stem.is_empty() {
        "book".to_string()
    } else {
        stem
    };
    let extension = sanitize_path_component(&book_info.extension);
    if extension.is_empty() {
        stem
    } else {
        format!("{}.{}", stem, extension)
    }
}

/// Replaces path separators, characters reserved by common file systems and
/// control characters with `_`, then trims surrounding spaces and dots.
fn sanitize_path_component(raw: &str) -> String {
    let cleaned: String = raw
        .chars()
        .map(|c| match c {
            '/' | '\\' | ':' | '*' | '?' | '"' | '<' | '>' | '|' => '_',
            c if c.is_control() => '_',
            c => c,
        })
        .collect();
    cleaned
        .trim_matches(|c: char| c == '.' || c.is_whitespace())
        .to_string()
}

/// Returns the absolute URL of a result's detail page.
async fn get_book_link(book: &WebElement) -> Result<Url, Box<dyn Error>> {
    let path = book.attr("href").await?.ok_or("Unable to get book link")?;
    append_to_base_url(path)
}

/// Resolves `path` against `https://annas-archive.org/`.
///
/// `path` may be a site-relative path (with or without a query string) or an
/// absolute URL; an absolute URL is returned as is.
fn append_to_base_url(path: String) -> Result<Url, Box<dyn Error>> {
    let base = Url::parse("https://annas-archive.org/")?;
    Ok(base.join(&path)?)
}

/// What is known about one search result.
#[derive(Clone, Debug)]
pub struct BookInfo {
    /// Title as shown in the search results.
    name: String,
    /// File extension read from the result's info line; may be empty.
    extension: String,
    /// URL of the result's detail page.
    download_link: Url,
}

impl BookInfo {
    fn new(name: String, extension: String, download_link: Url) -> Self {
        Self {
            name,
            extension,
            download_link,
        }
    }
}

--------------------------------------------------------------------------------
/bypass_cloudflare/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "bypass_cloudflare"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
thirtyfour = "0.31.0"
# Non-blocking sleep inside async code.
tokio = { version = "1.32.0", features = ["time"] }
url = "2.4.1"
--------------------------------------------------------------------------------
/bypass_cloudflare/src/lib.rs:
--------------------------------------------------------------------------------
//! Opens a Cloudflare-protected page in a new window and clicks its challenge.

use std::error::Error;
use std::time::Duration;
use thirtyfour::{prelude::ElementWaitable, By, WebDriver};
use tokio::time::sleep;
use url::Url;

/// Opens `download_link` in a new browser window, closes the original window
/// and clicks the `#challenge-stage` element inside the page's first frame.
///
/// On success the driver is left focused on the new window, inside that frame.
///
/// # Errors
///
/// Returns an error if no window handle is available, if the frame or the
/// challenge element cannot be found, or if any WebDriver command fails.
pub async fn bypass_cloudflare(
    driver: &WebDriver,
    download_link: Url,
) -> Result<(), Box<dyn Error>> {
    driver
        .execute(
            &format!(r#"window.open("{}", "_blank");"#, download_link.as_str()),
            vec![],
        )
        .await?;
    sleep(Duration::from_secs(3)).await;

    // Close the window the session started with...
    let first_window = driver
        .windows()
        .await?
        .into_iter()
        .next()
        .ok_or("Unable to get first windows")?;
    driver.switch_to_window(first_window).await?;
    driver.close_window().await?;

    // ...and continue in the one that was just opened.
    let last_window = driver
        .windows()
        .await?
        .pop()
        .ok_or("Unable to get last windows")?;
    driver.switch_to_window(last_window).await?;

    driver.enter_frame(0).await?;

    let button = driver.find(By::Css("#challenge-stage")).await?;

    button.wait_until().clickable().await?;
    sleep(Duration::from_secs(2)).await;
    button.click().await?;
    Ok(())
}

--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# Web Scraping using Rust
This repository will contain the source code of my series about web scraping using Rust.

[Scraping airbnb](https://itehax.com/blog/web-scraping-using-rust)

[Rust web scraping course](https://youtu.be/LUhjyYEFXvo)

Currently the repository contains:
- [x] Airbnb scraping.
- [x] Bypass cloudflare.
- [x] Download books from Anna's archive.
- [ ] Downloading video from mp4 file.
- [ ] Download video from streaming website (parse m3u8 playlist and extract js scripts).




--------------------------------------------------------------------------------
/scraper/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "scraper"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
airbnb = { path = "../airbnb" }
annas_archive = { path = "../annas_archive" }
bypass_cloudflare = { path = "../bypass_cloudflare" }
tokio = { version = "1.32.0", features = ["full"] }
undetected-chromedriver = "0.1.2"
url = "2.4.1"
reqwest = "0.11.22"
--------------------------------------------------------------------------------
/scraper/src/main.rs:
--------------------------------------------------------------------------------
use std::error::Error;

/// Entry point of the examples.
///
/// Every example is commented out, so the program does nothing as shipped;
/// uncomment one of them to run the matching scraper.
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    // airbnb::scrape_airbnb("Rome").await?;

    // annas_archive::scrape_annas_archive("questa è l'acqua").await?;

    // let driver = undetected_chromedriver::chrome().await?;
    // let link = url::Url::parse("https://nowsecure.nl")?;
    // bypass_cloudflare::bypass_cloudflare(&driver, link).await?;

    Ok(())
}

--------------------------------------------------------------------------------