├── utils ├── __init__.py ├── logging_utils.py ├── user_interface.py ├── session_utils.py ├── error_handling.py ├── updating_utils.py ├── scraping_utils.py ├── downloading_utils.py └── user_input.py ├── assets └── AO3.ico ├── .gitignore ├── requirements.txt ├── LICENSE ├── README.md └── main.py /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/AO3.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Dramatycznie/AO3_Scraper/HEAD/assets/AO3.ico -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | 3 | __pycache__/ 4 | 5 | venv/ 6 | 7 | app.log 8 | 9 | main.spec 10 | 11 | Downloaded Works/ -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests~=2.31.0 2 | beautifulsoup4~=4.12.2 3 | colorama~=0.4.6 4 | tqdm~=4.66.1 5 | EbookLib~=0.18 6 | pypdf2~=3.0.1 -------------------------------------------------------------------------------- /utils/logging_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def setup_logging(): 5 | logging.basicConfig(filename='app.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 6 | logger = logging.getLogger(__name__) 7 | logger.info("Program started.") 8 | return logger 9 | 10 | 11 | def log_program_closure(logger): 12 | logger.info("Program closed.") 13 | -------------------------------------------------------------------------------- /utils/user_interface.py: -------------------------------------------------------------------------------- 1 | from colorama import Fore 2 | 3 | 4 | # put printing in a different module? 5 | def print_welcome(): 6 | print(Fore.CYAN + """ 7 | _______ _______ _______ _______ _______ ______ _______ _______ _______ ______ 8 | | _ || || | | || || _ | | _ || || || _ | 9 | | |_| || _ ||___ | | _____|| || | || | |_| || _ || ___|| | || 10 | | || | | | ___| | | |_____ | || |_||_ | || |_| || |___ | |_||_ 11 | | || |_| ||___ | |_____ || _|| __ || || ___|| ___|| __ | 12 | | _ || | ___| | _____| || |_ | | | || _ || | | |___ | | | | 13 | |__| |__||_______||_______| |_______||_______||___| |_||__| |__||___| |_______||___| |_| 14 | """ + Fore.RESET) 15 | 16 | 17 | # Prints the goodbye message and exits the program 18 | def print_goodbye(): 19 | print(Fore.CYAN + "\nThank you for using AO3 Scraper!" 
+ Fore.RESET) 20 | input("\nPress Enter to exit.") 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Damatycznie 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /utils/session_utils.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | from . import error_handling 4 | 5 | 6 | # Creates a session and returns the authenticity token 7 | def create_session(logger): 8 | print("\nCreating a session...") 9 | logger.info("Creating a session...") 10 | 11 | # Create a session with custom user agent 12 | headers = { 13 | 'User-Agent': 'Bookmark Scraper Bot' 14 | } 15 | 16 | try: 17 | # Create a session and make a GET request 18 | session = requests.Session() 19 | session.headers.update(headers) 20 | response = session.get("https://archiveofourown.org/users/login") 21 | response.raise_for_status() 22 | 23 | soup = BeautifulSoup(response.content, 'html.parser') 24 | token = soup.find('input', {'name': 'authenticity_token'}) 25 | 26 | if token is None: 27 | error_handling.handle_token_not_found(logger) 28 | return None, None 29 | else: 30 | token = token['value'] 31 | print("\nSession created.") 32 | logger.info("Session created.") 33 | return token, session 34 | 35 | except requests.exceptions.RequestException as error: 36 | error_handling.handle_request_error(error, logger) 37 | return None, None -------------------------------------------------------------------------------- /utils/error_handling.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | # Handles the "Retry later" message (unneeded?) 5 | def handle_retry_later(response, logger): 6 | if "Retry later" in response.text: 7 | logger.error("Received 'Retry later' message. Too many requests, stopping scraping.") 8 | print("\nReceived 'Retry later' message. Please try again later, consider increasing the delay.") 9 | return True 10 | return False 11 | 12 | 13 | # Handles request errors 14 | def handle_request_error(error, logger): 15 | if "429" in str(error): # HTTP 429: Too Many Requests 16 | logger.error("Too many requests, stopping scraping.") 17 | print("\nToo many requests. 
Please try again later, consider increasing the delay.") 18 | else: 19 | logger.error(f"An error occurred while making the request: {error}") 20 | print("\nAn error occurred while making the request. Please try again later. Check the logs for more details.") 21 | 22 | 23 | # Handles invalid input 24 | def handle_invalid_input(context, logger): 25 | logger.error(f"Invalid input: {context}") 26 | print(f"\nInvalid input: {context}") 27 | 28 | 29 | # Handles token not found error 30 | def handle_token_not_found(logger): 31 | logger.error("Authenticity token not found. Cannot log in.") 32 | print("\nAn error occurred while logging in. Skipping. Please try again later. Check the logs for more details.") 33 | 34 | 35 | # Handles parse errors 36 | def handle_parse_error(logger): 37 | logger.error("Error parsing HTML.") 38 | print("\nAn error occurred while parsing the HTML. Please try again later. Check the logs for more details.") 39 | 40 | 41 | # Handles keyboard interrupts 42 | def handle_keyboard_interrupt(logger): 43 | logger.error("Keyboard Interrupt detected.") 44 | print("\nKeyboardInterrupt received. Exiting gracefully...") 45 | sys.exit(0) 46 | -------------------------------------------------------------------------------- /utils/updating_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datetime import datetime 3 | 4 | import ebooklib 5 | from PyPDF2 import PdfReader 6 | from bs4 import BeautifulSoup 7 | from ebooklib import epub 8 | 9 | # Date patterns to get the last date in the file (either Completed, Updated, or Published) 10 | date_patterns = [ 11 | (r'Completed: (\d{4}-\d{2}-\d{2})', 'Completed'), 12 | (r'Updated: (\d{4}-\d{2}-\d{2})', 'Updated'), 13 | (r'Published: (\d{4}-\d{2}-\d{2})', 'Published') 14 | ] 15 | 16 | 17 | # Extracts the date from an EPUB file 18 | def extract_epub_date(file_path): 19 | epub_book = ebooklib.epub.read_epub(file_path) 20 | 21 | text_content = "" 22 | 23 | for item in epub_book.get_items(): 24 | if isinstance(item, ebooklib.epub.EpubHtml): 25 | text_content += item.get_body_content().decode('utf-8') 26 | 27 | for pattern, label in date_patterns: 28 | match = re.search(pattern, text_content) 29 | if match: 30 | file_date = match.group(1) 31 | file_date = datetime.strptime(file_date, "%Y-%m-%d") 32 | return file_date 33 | 34 | return None 35 | 36 | 37 | # Extracts the date from a PDF file 38 | def extract_pdf_date(file_path): 39 | with open(file_path, 'rb') as pdf_file: 40 | pdf_reader = PdfReader(pdf_file) 41 | 42 | text_content = "" 43 | 44 | page = pdf_reader.pages[0] 45 | text_content += page.extract_text() 46 | 47 | for pattern, label in date_patterns: 48 | match = re.search(pattern, text_content) 49 | if match: 50 | file_date = match.group(1) 51 | file_date = datetime.strptime(file_date, "%Y-%m-%d") 52 | return file_date 53 | 54 | return None 55 | 56 | 57 | # Extracts the date from an HTML file 58 | def extract_html_date(file_path): 59 | with open(file_path, 'r', encoding='utf-8') as html_file: 60 | html_content = html_file.read() 61 | 62 | soup = BeautifulSoup(html_content, 'html.parser') 63 | text_content = soup.get_text() 64 | 65 | for pattern, label in date_patterns: 66 | match = re.search(pattern, text_content) 67 | if match: 68 | file_date = match.group(1) 69 | file_date = datetime.strptime(file_date, "%Y-%m-%d") 70 | 71 | return file_date 72 | 73 | return None 74 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # AO3_Scraper 2 | A web scraper that extracts bookmark metadata from Archive of Our Own and saves it to a CSV file. It can also download the bookmarked works and neatly organize them into folders based on fandom. 3 | 4 | **Works on public bookmarks, and on private bookmarks if you log into your AO3 account.** 5 | 6 | ## Table of Contents 7 | - [Features](#features) 8 | - [Dependencies](#dependencies) 9 | - [How to Use](#how-to-use) 10 | - [Contact](#contact) 11 | - [Bug Reports and Feature Requests](#bug-reports-and-feature-requests) 12 | 13 | # Features 14 | 15 | Scrapes or downloads bookmarks from Archive of Our Own. Allows the user to log into their account to access private bookmarks and works that are only available to registered users. 16 | 17 | ### Scraping 18 | - Extracts bookmark metadata such as URL, title, authors, fandoms, warnings, ratings, categories, characters, relationships, tags, word count, date bookmarked, and date updated. 19 | - Writes the extracted data to a CSV file. 20 | 21 | ### Downloading 22 | - Downloads bookmarks into separate folders based on fandom and names the files after the title and authors. 23 | - Supports downloading bookmarked series. 24 | - Allows the user to choose the download format (HTML, MOBI, EPUB, PDF, AZW3). 25 | 26 | ### Updating 27 | - Checks previously downloaded bookmarks for updates and re-downloads any that have changed (HTML, EPUB, PDF). 28 | - Works only if the downloaded files are in the expected folders and use the expected file names. 29 | 30 | 31 | # Dependencies 32 | - Python 3 33 | - pip 34 | - requests 35 | - BeautifulSoup 36 | - tqdm 37 | - colorama 38 | - ebooklib 39 | - pypdf2 40 | 41 | # How to use 42 | Run the script yourself or use the release. 43 | 44 | ### Running the script 45 | - Download or clone the repository. 46 | - Install the required dependencies by running `pip install -r requirements.txt` in the directory that contains the `requirements.txt` file. 47 | - Run the script with `python main.py` from a command prompt or terminal in the same directory. 48 | - Scrape or download the bookmarks. 49 | 50 | ### Use the release 51 | Instead of running the script yourself, you can download the [latest release](https://github.com/Dramatycznie/AO3_Scraper/releases) and run it directly. 52 | 53 | - Download the release. 54 | - Unpack it. 55 | - Run `AO3_Scraper.exe`. 56 | - Scrape or download the bookmarks. 57 | 58 | # Contact 59 | If you have any questions or feedback about this project, please feel free to reach out to me. 
60 | - Email: mellodramat@gmail.com 61 | - GitHub: https://github.com/Dramatycznie 62 | 63 | # Bug reports and feature requests 64 | For bug reports and feature requests, please open an issue on my GitHub repository: https://github.com/Dramatycznie/AO3_Scraper/issues 65 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | 3 | from utils import error_handling 4 | from utils import logging_utils 5 | from utils import user_input 6 | from utils import user_interface 7 | from utils import scraping_utils 8 | from utils import session_utils 9 | from utils import downloading_utils 10 | 11 | 12 | # Main function 13 | def main(): 14 | session = None 15 | logger = logging_utils.setup_logging() 16 | atexit.register(logging_utils.log_program_closure, logger) 17 | try: 18 | user_interface.print_welcome() 19 | log_in = user_input.ask_if_log_in(logger) 20 | 21 | while True: 22 | if log_in and session is None: # Create a session if logging in 23 | token, session = session_utils.create_session(logger) 24 | user_input.get_login_info(token, session, logger) 25 | 26 | # Treat the user as logged in whenever an authenticated session exists (also on later loop iterations) 27 | logged_in = log_in and session is not None 28 | 29 | action = user_input.download_or_scrape(logger) 30 | 31 | # Get the info needed to scrape or download the bookmarks 32 | username, url = user_input.get_username(logged_in, action, logger) 33 | 34 | # Call get_available_pages and check the result 35 | available_pages = user_input.get_available_pages(username, session, url, logger) 36 | 37 | if available_pages is not None: 38 | if action != "download updates": 39 | # Get the range of the pages 40 | page_range = user_input.get_page_range(session, url, logger) 41 | if page_range is None: 42 | continue 43 | start_page, end_page = page_range 44 | else: 45 | # When downloading updates, start from page 1 and end at the last page without asking 46 | start_page = 1 47 | end_page = available_pages 48 | 49 | delay = user_input.get_delay(logger) 50 | 51 | if action in ["download", "download updates"]: 52 | chosen_format = user_input.get_download_format(logger, action) 53 | downloading_utils.download_bookmarks(username, logged_in, start_page, end_page, session, 54 | chosen_format, delay, action, logger) 55 | elif action == "scrape": 56 | scraping_utils.scrape_bookmarks(username, start_page, end_page, session, delay, logger) 57 | 58 | if not user_input.ask_again(logger): 59 | user_interface.print_goodbye() 60 | break # Exit the loop if the user chooses not to try again 61 | 62 | except KeyboardInterrupt: 63 | error_handling.handle_keyboard_interrupt(logger) 64 | 65 | finally: 66 | if session: 67 | session.close() 68 | print("\nSession closed.") 69 | logger.info("Session closed.") 70 | 71 | if __name__ == "__main__": 72 | main() 73 | 74 | -------------------------------------------------------------------------------- /utils/scraping_utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import socket 3 | import time 4 | import requests 5 | from bs4 import BeautifulSoup 6 | from colorama import Fore 7 | from tqdm import tqdm 8 | from . 
import error_handling 9 | 10 | 11 | # Gets the text of an element 12 | def get_element_text(element): 13 | return element.text.strip() if element else "" 14 | 15 | 16 | # Gets the text of a list of elements 17 | def get_element_text_list(elements): 18 | return [element.text.strip() for element in elements] if elements else [] 19 | 20 | 21 | # Scrapes a single bookmark entry 22 | def scrape_single_bookmark(bookmark, csvwriter): 23 | # Get title from the bookmark 24 | title_element = bookmark.select_one("h4 a:nth-of-type(1)") 25 | if title_element: 26 | title = get_element_text(title_element) 27 | else: 28 | return 29 | 30 | # Get the other data from the bookmark 31 | authors = get_element_text_list(bookmark.select("a[rel='author']")) 32 | fandoms = get_element_text_list(bookmark.select(".fandoms a")) 33 | warnings = get_element_text_list(bookmark.select("li.warnings")) 34 | ratings = get_element_text_list(bookmark.select_one("span.rating")) 35 | categories = get_element_text_list(bookmark.select("span.category")) 36 | words = get_element_text(bookmark.select_one("dd.words") or bookmark.select_one("dd")) 37 | tags = get_element_text_list(bookmark.select("li.freeforms")) 38 | characters = get_element_text_list(bookmark.select("li.characters")) 39 | relationships = get_element_text_list(bookmark.select("li.relationships")) 40 | date_bookmarked = get_element_text(bookmark.select_one("div.user p.datetime")) 41 | url = "https://archiveofourown.org" + bookmark.select_one("h4 a:nth-of-type(1)")["href"] 42 | date_updated = get_element_text(bookmark.select_one("p.datetime")) 43 | 44 | # Replace commas with semicolons in ratings and categories (important when bookmark is a series) 45 | ratings = [rating.replace(',', ';') for rating in ratings] 46 | categories = [category.replace(',', ';') for category in categories] 47 | 48 | # Write bookmark data to CSV, replace empty author with "Anonymous" 49 | csvwriter.writerow([ 50 | url, title, '; '.join(authors) if authors else 'Anonymous', '; '.join(fandoms), '; '.join(warnings), 51 | '; '.join(ratings), '; '.join(categories), '; '.join(characters), 52 | '; '.join(relationships), '; '.join(tags), words, date_bookmarked, date_updated 53 | ]) 54 | 55 | 56 | # Scrape the bookmarks of a user 57 | def scrape_bookmarks(username, start_page, end_page, session, delay, logger): 58 | with open(username + '_bookmarks.csv', 'w', newline='', encoding='utf-8') as csvfile: 59 | csvwriter = csv.writer(csvfile) 60 | logger.info(f"CSV file created: {username}_bookmarks.csv") 61 | 62 | # Write header row to CSV file 63 | csvwriter.writerow( 64 | ['URL', 'Title', 'Authors', 'Fandoms', 'Warnings', 'Rating', 'Categories', 'Characters', 65 | 'Relationships', 'Tags', 'Words', 'Date Bookmarked', 'Date Updated']) 66 | logger.info("Header row written to CSV file") 67 | 68 | num_bookmarks = 0 69 | total_pages = end_page - start_page + 1 70 | 71 | # Loop through pages and scrape bookmarks 72 | print() 73 | for page in tqdm(range(start_page, end_page + 1), total=total_pages, desc="Scraping: "): 74 | try: 75 | response = session.get( 76 | f"https://archiveofourown.org/users/{username}/bookmarks?private=true&page={page}") if \ 77 | session else requests.get(f"https://archiveofourown.org/users/{username}/bookmarks?page={page}") 78 | 79 | response.raise_for_status() 80 | 81 | time.sleep(delay) 82 | soup = BeautifulSoup(response.text, 'html.parser') 83 | logger.info(f"Scraping page {page}") 84 | 85 | except (requests.exceptions.RequestException, socket.timeout) as error: 86 | 
error_handling.handle_request_error(error, logger) 87 | return 88 | if error_handling.handle_retry_later(response, logger): 89 | return 90 | 91 | # Loop through each bookmark on the page 92 | for bookmark in soup.select("li.bookmark"): 93 | scrape_single_bookmark(bookmark, csvwriter) 94 | num_bookmarks += 1 95 | 96 | # Print completion message 97 | logger.info(f"Scraping complete. Scraped {num_bookmarks} bookmarks.") 98 | print("\nAll done! \nYour bookmarks have been saved to {}{}{}_bookmarks.csv{}.".format(Fore.CYAN, 99 | username, Fore.RESET, 100 | Fore.RESET)) 101 | print("Scraped {}{}{} bookmarks.".format(Fore.CYAN, num_bookmarks, Fore.RESET)) 102 | -------------------------------------------------------------------------------- /utils/downloading_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from datetime import datetime 4 | import warnings 5 | from colorama import Fore 6 | 7 | import requests 8 | from bs4 import BeautifulSoup 9 | from tqdm import tqdm 10 | 11 | from . import error_handling 12 | from . import updating_utils 13 | 14 | # Before importing ebooklib, filter out the specific warning 15 | warnings.filterwarnings("ignore", category=UserWarning, 16 | message="In the future version we will turn default option ignore_ncx to True.") 17 | 18 | 19 | # Cleans the folder name to be used as a folder name 20 | def clean_folder_name(name): 21 | forbidden_characters = r'<>:"/\|?*' 22 | return ''.join(char for char in name if char not in forbidden_characters) 23 | 24 | 25 | # Cleans the work title to be used as a file name 26 | def clean_work_title(title): 27 | forbidden_characters = r'<>:"/\|?*' 28 | cleaned_title = ''.join(char for char in title if char not in forbidden_characters) 29 | return cleaned_title[:50] 30 | 31 | 32 | # Extracts the work URLs from a page of bookmarks 33 | def extract_work_urls_from_page(url, session, logger): 34 | work_urls = [] 35 | 36 | try: 37 | response = session.get(url) if session else requests.get(url) 38 | response.raise_for_status() # Check for request errors 39 | 40 | soup = BeautifulSoup(response.content, 'html.parser') 41 | 42 | for bookmark in soup.select("li.bookmark"): 43 | work_url_element = bookmark.select_one("h4 a:nth-of-type(1)") 44 | if work_url_element: 45 | work_url = "https://archiveofourown.org" + work_url_element["href"] 46 | if "/series/" in work_url: 47 | series_work_urls = extract_work_urls_from_series(work_url, session, logger) 48 | work_urls.extend(series_work_urls) 49 | else: 50 | work_urls.append(work_url) 51 | 52 | except requests.exceptions.RequestException as error: 53 | error_handling.handle_request_error(error, logger) 54 | 55 | return work_urls 56 | 57 | 58 | # Extracts the work URLs from series 59 | def extract_work_urls_from_series(series_url, session, logger): 60 | series_work_urls = [] 61 | 62 | try: 63 | response = session.get(series_url) if session else requests.get(series_url) 64 | response.raise_for_status() # Check for request errors 65 | 66 | soup = BeautifulSoup(response.content, 'html.parser') 67 | 68 | series_work_elements = soup.select("ul.series.work.index.group li.work.blurb.group") 69 | for series_work_element in series_work_elements: 70 | series_work_url_element = series_work_element.select_one("h4 a:nth-of-type(1)") 71 | if series_work_url_element: 72 | series_work_url = "https://archiveofourown.org" + series_work_url_element["href"] 73 | series_work_urls.append(series_work_url) 74 | 75 | except 
requests.exceptions.RequestException as error: 76 | error_handling.handle_request_error(error, logger) # Handle request error 77 | 78 | return series_work_urls 79 | 80 | 81 | # Downloads the works from the given work URLs 82 | def download_works_from_urls(work_url, session, chosen_format, action, logger): 83 | no_update_needed = False 84 | try: 85 | response = session.get(work_url) if session else requests.get(work_url) 86 | if response.status_code == 200: 87 | soup = BeautifulSoup(response.content, 'html.parser') 88 | 89 | work_title = soup.find("h2", class_="title heading").get_text(strip=True) 90 | author_elements = soup.select(".byline a[rel='author']") 91 | max_display_authors = 3 92 | work_authors = [author.get_text(strip=True) for author in author_elements[:max_display_authors]] 93 | if not work_authors: 94 | work_authors = ["Anonymous"] 95 | 96 | work_fandoms = [fandom.get_text(strip=True) for fandom in soup.select(".fandom a")] 97 | 98 | update_date_element = soup.select_one("dd.status") or soup.select_one("dd.published") 99 | if update_date_element: 100 | update_date_element = datetime.strptime(update_date_element.get_text(strip=True), "%Y-%m-%d") 101 | 102 | download_menu = soup.find("li", class_="download") 103 | if download_menu: 104 | format_links = { 105 | "EPUB": "EPUB", "MOBI": "MOBI", "PDF": "PDF", "HTML": "HTML", "AZW3": "AZW3" 106 | } 107 | 108 | format_name = format_links.get(chosen_format) 109 | if format_name: 110 | format_link = download_menu.find("a", href=True, string=format_name) 111 | if format_link: 112 | format_url = "https://archiveofourown.org" + format_link["href"] 113 | 114 | for fandom in work_fandoms: 115 | cleaned_fandom = clean_folder_name(fandom) 116 | folder_path = os.path.join("Downloaded Works", cleaned_fandom) 117 | os.makedirs(folder_path, exist_ok=True) 118 | 119 | cleaned_work_title = clean_work_title(work_title) 120 | authors_string = ' & '.join(work_authors) 121 | file_name = f"{cleaned_work_title} by {authors_string}.{format_name.lower()}" 122 | file_path = os.path.join(folder_path, file_name) 123 | 124 | if os.path.exists(file_path): 125 | if action == "download updates": 126 | if chosen_format == "EPUB": 127 | file_date = updating_utils.extract_epub_date(file_path) 128 | elif chosen_format == "PDF": 129 | file_date = updating_utils.extract_pdf_date(file_path) 130 | elif chosen_format == "HTML": 131 | file_date = updating_utils.extract_html_date(file_path) 132 | else: 133 | file_date = None 134 | 135 | if file_date is None or (update_date_element is not None and 136 | update_date_element > file_date): 137 | download_file(file_path, format_url, file_name, cleaned_fandom, logger) 138 | else: 139 | logger.info(f"'{file_name}' in '{cleaned_fandom}' does not need to be updated.") 140 | no_update_needed = True # (assumes all works are up-to-date) 141 | tqdm.write(f"{Fore.CYAN}All works are up-to-date.{Fore.RESET}\n") 142 | else: 143 | logger.info(f"'{file_name}' in '{cleaned_fandom}' already exists. 
Skipping.") 144 | else: 145 | download_file(file_path, format_url, file_name, cleaned_fandom, logger) 146 | 147 | except requests.RequestException as error: 148 | error_handling.handle_request_error(error, logger) 149 | 150 | return no_update_needed 151 | 152 | 153 | # Define a separate function for downloading a file 154 | def download_file(file_path, format_url, file_name, cleaned_fandom, logger): 155 | response_format = requests.get(format_url) 156 | if response_format.status_code == 200: 157 | with open(file_path, "wb") as file: 158 | file.write(response_format.content) 159 | logger.info( 160 | f"'{file_name}' downloaded successfully to '{cleaned_fandom}' folder." 161 | ) 162 | 163 | 164 | def download_bookmarks(username, logged_in, start_page, end_page, session, chosen_format, delay, action, logger): 165 | if action == "download updates": 166 | base_url = f"https://archiveofourown.org/users/{username}/bookmarks?bookmark_search%5Bbookmark_query%5D" \ 167 | f"=&bookmark_search%5Bbookmarkable_query%5D=&bookmark_search%5Bexcluded_bookmark_tag_names%5D" \ 168 | f"=&bookmark_search%5Bexcluded_tag_names%5D=&bookmark_search%5Blanguage_id%5D=&bookmark_search" \ 169 | f"%5Bother_bookmark_tag_names%5D=&bookmark_search%5Bother_tag_names%5D=&bookmark_search%5Brec%5D=0" \ 170 | f"&bookmark_search%5Bsort_column%5D=bookmarkable_date&bookmark_search%5Bwith_notes%5D=0&commit" \ 171 | f"=Sort+and+Filter" 172 | logged_in_base_url = base_url + "&private=true" 173 | else: 174 | base_url = f"https://archiveofourown.org/users/{username}/bookmarks" 175 | logged_in_base_url = base_url + "?private=true" 176 | 177 | page_number = start_page 178 | 179 | while page_number <= end_page: 180 | if action == "download updates": 181 | bookmark_page_url = f"{logged_in_base_url}&page={page_number}" if logged_in else \ 182 | f"{base_url}&page={page_number}" 183 | else: 184 | bookmark_page_url = f"{logged_in_base_url}&page={page_number}" if logged_in else f"{base_url}?page=" \ 185 | f"{page_number}" 186 | work_urls = extract_work_urls_from_page(bookmark_page_url, session, logger) 187 | 188 | if not work_urls: 189 | break 190 | 191 | # Loop through extracted work URLs and download 192 | for work_url in tqdm(work_urls, desc=f"Downloading works from page {page_number}", leave=True): 193 | no_update_needed = download_works_from_urls(work_url, session, chosen_format, action, logger) 194 | if no_update_needed: 195 | return 196 | time.sleep(delay) 197 | 198 | page_number += 1 199 | -------------------------------------------------------------------------------- /utils/user_input.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | from bs4 import BeautifulSoup 4 | from colorama import Fore 5 | from . import error_handling 6 | 7 | 8 | # Asks the user if they want to log in 9 | def ask_if_log_in(logger): 10 | while True: 11 | user_choice = input("\nWould you like to log in?\nLogged in users can access private bookmarks and bookmarks " 12 | "that are only visible to logged in users.\n1. Yes\n2. No\n") 13 | 14 | if user_choice == "1": 15 | logger.info("User chose to log in.") 16 | return True 17 | elif user_choice == "2": 18 | logger.info("User chose not to log in.") 19 | return False 20 | else: 21 | error_handling.handle_invalid_input("Please enter a valid choice. 1 or 2.", logger) 22 | 23 | 24 | # Asks if the user wants to log in 25 | def ask_again(logger): 26 | while True: 27 | answer = input("\nWould you like to run the program again? \n1. Yes \n2. 
No\n") 28 | if answer == "1": 29 | logger.info("User chose to try again.") 30 | return True 31 | elif answer == "2": 32 | logger.info("User chose not to try again.") 33 | return False 34 | else: 35 | error_handling.handle_invalid_input("Please enter a valid choice. 1 or 2.", logger) 36 | 37 | 38 | # Gets username/email and password from the user and logs in 39 | def get_login_info(token, session, logger): 40 | if session is None or token is None: 41 | error_handling.handle_token_not_found(logger) 42 | return False 43 | 44 | while True: 45 | # Prompt for user input 46 | username_or_email = input("\nEnter your username or email: ") 47 | password = input("\nEnter your password: ") 48 | print("\nChecking if login is successful...") 49 | 50 | # Create a payload 51 | payload = { 52 | "utf8": "✓", 53 | "authenticity_token": token, 54 | "user[login]": username_or_email, 55 | "user[password]": password, 56 | "commit": "Log in" 57 | } 58 | 59 | try: 60 | # Make a POST request 61 | response = session.post("https://archiveofourown.org/users/login", data=payload) 62 | response.raise_for_status() 63 | 64 | except requests.exceptions.RequestException as error: 65 | error_handling.handle_request_error(error, logger) 66 | continue 67 | 68 | # Check if login was successful 69 | if "Successfully logged in" in response.text: 70 | print(Fore.CYAN + "\nLogin successful." + Fore.RESET) 71 | logger.info("Login successful.") 72 | return True 73 | else: 74 | error_handling.handle_invalid_input("Login failed. Please try again.", logger) 75 | 76 | 77 | # Gets the username of the user whose bookmarks are to be scraped 78 | def get_username(logged_in, action, logger): 79 | while True: 80 | username = input(f"\nEnter the username of the user whose bookmarks you want to download or scrape: ") 81 | if not username: 82 | error_handling.handle_invalid_input("Please enter a username.", logger) 83 | continue 84 | 85 | print("\nChecking if the username is valid...") 86 | 87 | # Check if the username follows guidelines (3-40 characters, alphanumeric and underscore) 88 | username_pattern = r"^[A-Za-z0-9_]{3,40}$" 89 | if not re.match(username_pattern, username): 90 | error_handling.handle_invalid_input("Please enter a valid username.", logger) 91 | continue 92 | logger.info(f"{username} is a valid username.") 93 | 94 | try: 95 | print("\nChecking if username exists...") 96 | response = requests.get(f"https://archiveofourown.org/users/{username}") 97 | response.raise_for_status() 98 | 99 | soup = BeautifulSoup(response.text, 'html.parser') 100 | if len(soup.find_all("div", class_="user")) > 0: 101 | if action == "scrape": 102 | print(f"\nScraping bookmarks of user: {Fore.CYAN}{username}{Fore.RESET}") 103 | elif action == "download": 104 | print(f"\nDownloading bookmarks of user: {Fore.CYAN}{username}{Fore.RESET}") 105 | url = f"https://archiveofourown.org/users/{username}/bookmarks" 106 | 107 | if logged_in: 108 | url += "?private=true" 109 | logger.info(f"{username} exists. URL: {url}") 110 | return username, url 111 | else: 112 | error_handling.handle_invalid_input(f"Username {username} does not exist. 
Please enter a valid " 113 | f"username.", logger) 114 | 115 | except requests.exceptions.RequestException as error: 116 | error_handling.handle_request_error(error, logger) 117 | 118 | 119 | # Gets the number of pages of bookmarks available (with error handling) 120 | def get_available_pages(username, session, url, logger): 121 | try: 122 | # Construct the URL based on the login status 123 | response = session.get(url, timeout=60) if session else requests.get(url, timeout=60) 124 | response.raise_for_status() 125 | 126 | soup = BeautifulSoup(response.text, 'html.parser') 127 | bookmarks = soup.find_all("li", class_="bookmark") 128 | 129 | if len(bookmarks) == 0: 130 | error_handling.handle_invalid_input(f"{username} has no bookmarks.", logger) 131 | return None 132 | 133 | # Extract pagination information 134 | pagination = soup.find("ol", class_="actions") 135 | if pagination is not None: 136 | pagination = pagination.find_all("li") 137 | last_page = int(pagination[-2].text) 138 | else: 139 | error_handling.handle_parse_error(logger) 140 | return None # Return None in case of pagination parse error 141 | 142 | print(f"\nThe user has {Fore.CYAN}{last_page}{Fore.RESET} pages of bookmarks available.") 143 | logger.info(f"{username} has {last_page} pages of bookmarks available.") 144 | return last_page 145 | 146 | except requests.exceptions.RequestException as error: # works ok if not logged in (check for logged in) 147 | error_handling.handle_request_error(error, logger) 148 | return None # Return None to indicate an error 149 | 150 | except (AttributeError, ValueError): 151 | error_handling.handle_parse_error(logger) 152 | return None 153 | 154 | 155 | # Gets the page range from the user 156 | def get_page_range(session, url, logger): 157 | while True: 158 | try: 159 | start_page = int(input("\nEnter the starting page number: ")) 160 | if start_page < 1: 161 | error_handling.handle_invalid_input("The starting page number must be positive.", logger) 162 | continue 163 | 164 | end_page = int(input("\nEnter the ending page number: ")) 165 | if end_page < 1 or end_page < start_page: 166 | error_handling.handle_invalid_input("The ending page number must be positive and greater than the starting page number.", logger) 167 | continue 168 | 169 | # Try to fetch the URL and extract pagination info 170 | try: 171 | # Request the bookmark page and raise HTTPError for bad responses 172 | response = session.get(url, timeout=60) if session else requests.get(url, timeout=60) 173 | response.raise_for_status() 174 | 175 | # Parse the HTML and extract the last page number from pagination 176 | soup = BeautifulSoup(response.text, 'html.parser') 177 | pagination = soup.find("ol", class_="actions") 178 | 179 | if pagination: 180 | pagination = pagination.find_all("li") 181 | last_page = int(pagination[-2].text) # Second-to-last is usually the last page number 182 | else: 183 | error_handling.handle_parse_error(logger) 184 | continue 185 | 186 | # Validate the user-entered page numbers against available pages 187 | if start_page > last_page: 188 | error_handling.handle_invalid_input(f"Starting page {start_page} is out of range. Available starting pages are between 1 - {last_page}.", logger) 189 | continue 190 | if end_page > last_page: 191 | error_handling.handle_invalid_input(f"Ending page {end_page} is out of range. 
The last available page is {last_page}.", logger) 192 | continue 193 | 194 | except requests.exceptions.RequestException as error: 195 | # Handles errors and logs cleanly 196 | error_handling.handle_request_error(error, logger) 197 | return None 198 | except (AttributeError, ValueError): 199 | # Handles broken HTML or unexpected formats 200 | error_handling.handle_parse_error(logger) 201 | return None 202 | 203 | logger.info(f"Page range: {start_page} - {end_page}") 204 | return start_page, end_page 205 | 206 | except ValueError: 207 | error_handling.handle_invalid_input("Please enter a valid number.", logger) 208 | 209 | 210 | # Gets the delay between requests 211 | def get_delay(logger): 212 | while True: 213 | try: 214 | delay = int(input("\nEnter the interval delay between requests (at least 1 second, suggested value " 215 | "is 5 seconds): ")) 216 | if delay < 1: 217 | error_handling.handle_invalid_input("Please enter a delay of at least 1 second.", logger) 218 | continue 219 | break 220 | 221 | except ValueError: 222 | error_handling.handle_invalid_input("Please enter a valid number.", logger) 223 | 224 | logger.info(f"Delay: {delay} seconds") 225 | return delay 226 | 227 | 228 | # Gets the input for the download or scrape choice 229 | def download_or_scrape(logger): 230 | while True: 231 | choice = input("\nDo you want to scrape the bookmarks or download them?\n1. Scrape\n2. Download\n" 232 | "3. Download updates\n") 233 | choices = ['1', '2', '3'] 234 | if choice in choices: 235 | action = ["scrape", "download", "download updates"][int(choice) - 1] 236 | logger.info(f"User chose to {action} bookmarks.") 237 | return action 238 | else: 239 | error_handling.handle_invalid_input("Invalid choice. Please enter a valid number.", logger) 240 | 241 | 242 | # Gets the input for the download format 243 | def get_download_format(logger, action): 244 | while True: 245 | if action == "download updates": 246 | user_format = input("\nChoose the download format:\n1. EPUB\n2. PDF\n3. HTML\n") 247 | formats = ['1', '2', '3'] 248 | else: 249 | user_format = input("\nChoose the download format:\n1. AZW3\n2. EPUB\n3. MOBI\n4. PDF\n5. HTML\n") 250 | formats = ['1', '2', '3', '4', '5'] 251 | 252 | if user_format in formats: 253 | if action == "download updates": 254 | chosen_format = ["EPUB", "PDF", "HTML"][int(user_format) - 1] 255 | else: 256 | chosen_format = ["AZW3", "EPUB", "MOBI", "PDF", "HTML"][int(user_format) - 1] 257 | 258 | print() 259 | 260 | logger.info(f"User chose to download in {chosen_format} format.") 261 | return chosen_format 262 | else: 263 | error_handling.handle_invalid_input("Invalid choice. Please enter a valid number.", logger) 264 | --------------------------------------------------------------------------------