├── utils ├── __init__.py ├── logging_utils.py ├── user_interface.py ├── session_utils.py ├── error_handling.py ├── updating_utils.py ├── scraping_utils.py ├── downloading_utils.py └── user_input.py ├── assets └── AO3.ico ├── .gitignore ├── requirements.txt ├── LICENSE ├── README.md └── main.py /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/AO3.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Dramatycznie/AO3_Scraper/HEAD/assets/AO3.ico -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | 3 | __pycache__/ 4 | 5 | venv/ 6 | 7 | app.log 8 | 9 | main.spec 10 | 11 | Downloaded Works/ -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests~=2.31.0 2 | beautifulsoup4~=4.12.2 3 | colorama~=0.4.6 4 | tqdm~=4.66.1 5 | EbookLib~=0.18 6 | pypdf2~=3.0.1 -------------------------------------------------------------------------------- /utils/logging_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def setup_logging(): 5 | logging.basicConfig(filename='app.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 6 | logger = logging.getLogger(__name__) 7 | logger.info("Program started.") 8 | return logger 9 | 10 | 11 | def log_program_closure(logger): 12 | logger.info("Program closed.") 13 | -------------------------------------------------------------------------------- /utils/user_interface.py: -------------------------------------------------------------------------------- 1 | from colorama import Fore 2 | 3 | 4 | # put printing in a different module? 5 | def print_welcome(): 6 | print(Fore.CYAN + """ 7 | _______ _______ _______ _______ _______ ______ _______ _______ _______ ______ 8 | | _ || || | | || || _ | | _ || || || _ | 9 | | |_| || _ ||___ | | _____|| || | || | |_| || _ || ___|| | || 10 | | || | | | ___| | | |_____ | || |_||_ | || |_| || |___ | |_||_ 11 | | || |_| ||___ | |_____ || _|| __ || || ___|| ___|| __ | 12 | | _ || | ___| | _____| || |_ | | | || _ || | | |___ | | | | 13 | |__| |__||_______||_______| |_______||_______||___| |_||__| |__||___| |_______||___| |_| 14 | """ + Fore.RESET) 15 | 16 | 17 | # Prints the goodbye message and exits the program 18 | def print_goodbye(): 19 | print(Fore.CYAN + "\nThank you for using AO3 Scraper!" 
+ Fore.RESET) 20 | input("\nPress Enter to exit.") 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Damatycznie 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /utils/session_utils.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | from . import error_handling 4 | 5 | 6 | # Creates a session and returns the authenticity token 7 | def create_session(logger): 8 | print("\nCreating a session...") 9 | logger.info("Creating a session...") 10 | 11 | # Create a session with custom user agent 12 | headers = { 13 | 'User-Agent': 'Bookmark Scraper Bot' 14 | } 15 | 16 | try: 17 | # Create a session and make a GET request 18 | session = requests.Session() 19 | session.headers.update(headers) 20 | response = session.get("https://archiveofourown.org/users/login") 21 | response.raise_for_status() 22 | 23 | soup = BeautifulSoup(response.content, 'html.parser') 24 | token = soup.find('input', {'name': 'authenticity_token'}) 25 | 26 | if token is None: 27 | error_handling.handle_token_not_found(logger) 28 | return None, None 29 | else: 30 | token = token['value'] 31 | print("\nSession created.") 32 | logger.info("Session created.") 33 | return token, session 34 | 35 | except requests.exceptions.RequestException as error: 36 | error_handling.handle_request_error(error, logger) 37 | return None, None -------------------------------------------------------------------------------- /utils/error_handling.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | # Handles the "Retry later" message (unneeded?) 5 | def handle_retry_later(response, logger): 6 | if "Retry later" in response.text: 7 | logger.error("Received 'Retry later' message. Too many requests, stopping scraping.") 8 | print("\nReceived 'Retry later' message. Please try again later, consider increasing the delay.") 9 | return True 10 | return False 11 | 12 | 13 | # Handles request errors 14 | def handle_request_error(error, logger): 15 | if "429" in str(error): # HTTP 429: Too Many Requests 16 | logger.error("Too many requests, stopping scraping.") 17 | print("\nToo many requests. 
Please try again later, consider increasing the delay.") 18 | else: 19 | logger.error(f"An error occurred while making the request: {error}") 20 | print("\nAn error occurred while making the request. Please try again later. Check the logs for more details.") 21 | 22 | 23 | # Handles invalid input 24 | def handle_invalid_input(context, logger): 25 | logger.error(f"Invalid input: {context}") 26 | print(f"\nInvalid input: {context}") 27 | 28 | 29 | # Handles token not found error 30 | def handle_token_not_found(logger): 31 | logger.error("Authenticity token not found. Cannot log in.") 32 | print("\nAn error occurred while logging in. Skipping. Please try again later. Check the logs for more details.") 33 | 34 | 35 | # Handles parse errors 36 | def handle_parse_error(logger): 37 | logger.error("Error parsing HTML.") 38 | print("\nAn error occurred while parsing the HTML. Please try again later. Check the logs for more details.") 39 | 40 | 41 | # Handles keyboard interrupts 42 | def handle_keyboard_interrupt(logger): 43 | logger.error("Keyboard Interrupt detected.") 44 | print("\nKeyboardInterrupt received. Exiting gracefully...") 45 | sys.exit(0) 46 | -------------------------------------------------------------------------------- /utils/updating_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datetime import datetime 3 | 4 | import ebooklib 5 | from PyPDF2 import PdfReader 6 | from bs4 import BeautifulSoup 7 | from ebooklib import epub 8 | 9 | # Date patterns to get the last date in the file (either Completed, Updated, or Published) 10 | date_patterns = [ 11 | (r'Completed: (\d{4}-\d{2}-\d{2})', 'Completed'), 12 | (r'Updated: (\d{4}-\d{2}-\d{2})', 'Updated'), 13 | (r'Published: (\d{4}-\d{2}-\d{2})', 'Published') 14 | ] 15 | 16 | 17 | # Extracts the date from an EPUB file 18 | def extract_epub_date(file_path): 19 | epub_book = ebooklib.epub.read_epub(file_path) 20 | 21 | text_content = "" 22 | 23 | for item in epub_book.get_items(): 24 | if isinstance(item, ebooklib.epub.EpubHtml): 25 | text_content += item.get_body_content().decode('utf-8') 26 | 27 | for pattern, label in date_patterns: 28 | match = re.search(pattern, text_content) 29 | if match: 30 | file_date = match.group(1) 31 | file_date = datetime.strptime(file_date, "%Y-%m-%d") 32 | return file_date 33 | 34 | return None 35 | 36 | 37 | # Extracts the date from a PDF file 38 | def extract_pdf_date(file_path): 39 | with open(file_path, 'rb') as pdf_file: 40 | pdf_reader = PdfReader(pdf_file) 41 | 42 | text_content = "" 43 | 44 | page = pdf_reader.pages[0] 45 | text_content += page.extract_text() 46 | 47 | for pattern, label in date_patterns: 48 | match = re.search(pattern, text_content) 49 | if match: 50 | file_date = match.group(1) 51 | file_date = datetime.strptime(file_date, "%Y-%m-%d") 52 | return file_date 53 | 54 | return None 55 | 56 | 57 | # Extracts the date from an HTML file 58 | def extract_html_date(file_path): 59 | with open(file_path, 'r', encoding='utf-8') as html_file: 60 | html_content = html_file.read() 61 | 62 | soup = BeautifulSoup(html_content, 'html.parser') 63 | text_content = soup.get_text() 64 | 65 | for pattern, label in date_patterns: 66 | match = re.search(pattern, text_content) 67 | if match: 68 | file_date = match.group(1) 69 | file_date = datetime.strptime(file_date, "%Y-%m-%d") 70 | 71 | return file_date 72 | 73 | return None 74 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # AO3_Scraper 2 | A web scraper that extracts bookmark metadata from Archive of Our Own and saves it to a CSV file. It can also download the bookmarked works and neatly organize them into folders based on fandom. 3 | 4 | **Works on public bookmarks, and on private bookmarks if you log into your AO3 account.** 5 | 6 | ## Table of Contents 7 | - [Features](#features) 8 | - [Dependencies](#dependencies) 9 | - [How to Use](#how-to-use) 10 | - [Contact](#contact) 11 | - [Bug Reports and Feature Requests](#bug-reports-and-feature-requests) 12 | 13 | # Features 14 | 15 | Scrapes or downloads bookmarks from Archive of Our Own. Allows the user to log into their account to access private bookmarks and works that are only available to registered users. 16 | 17 | ### Scraping 18 | - Extracts bookmark metadata such as URL, title, authors, fandoms, warnings, ratings, categories, characters, relationships, tags, word count, date bookmarked, and date updated. 19 | - Writes the extracted data to a CSV file. 20 | 21 | ### Downloading 22 | - Downloads bookmarks into separate folders based on fandom and names the files after the title and authors. 23 | - Supports downloading bookmarked series. 24 | - Allows the user to choose the download format (HTML, MOBI, EPUB, PDF, AZW3). 25 | 26 | ### Updating 27 | - Checks previously downloaded bookmarks for updates and re-downloads any that have changed (HTML, EPUB, PDF). 28 | - Works only if the downloaded files are in the expected folders and use the expected file names. 29 | 30 | 31 | # Dependencies 32 | - Python 3 33 | - pip 34 | - requests 35 | - BeautifulSoup 36 | - tqdm 37 | - colorama 38 | - ebooklib 39 | - pypdf2 40 | 41 | # How to use 42 | Run the script yourself or use the release. 43 | 44 | ### Running the script 45 | - Download or clone the repository. 46 | - Install the required dependencies by running `pip install -r requirements.txt` in the directory that contains the `requirements.txt` file. 47 | - Run the script with `python main.py` from a command prompt or terminal in the same directory. 48 | - Scrape or download the bookmarks. 49 | 50 | ### Use the release 51 | Instead of running the script yourself, you can download the [latest release](https://github.com/Dramatycznie/AO3_Scraper/releases) and run it directly. 52 | 53 | - Download the release. 54 | - Unpack it. 55 | - Run `AO3_Scraper.exe`. 56 | - Scrape or download the bookmarks. 57 | 58 | # Contact 59 | If you have any questions or feedback about this project, please feel free to reach out to me. 
60 | - Email: mellodramat@gmail.com 61 | - GitHub: https://github.com/Dramatycznie 62 | 63 | # Bug reports and feature requests 64 | For bug reports and feature requests, please open an issue on my GitHub repository: https://github.com/Dramatycznie/AO3_Scraper/issues 65 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | 3 | from utils import error_handling 4 | from utils import logging_utils 5 | from utils import user_input 6 | from utils import user_interface 7 | from utils import scraping_utils 8 | from utils import session_utils 9 | from utils import downloading_utils 10 | 11 | 12 | # Main function 13 | def main(): 14 | session = None 15 | logger = logging_utils.setup_logging() 16 | atexit.register(logging_utils.log_program_closure, logger) 17 | try: 18 | user_interface.print_welcome() 19 | log_in = user_input.ask_if_log_in(logger) 20 | 21 | while True: 22 | if log_in and session is None: # Create a session if logging in 23 | token, session = session_utils.create_session(logger) 24 | user_input.get_login_info(token, session, logger) 25 | 26 | # Treat the user as logged in whenever an authenticated session exists (also on later loop iterations) 27 | logged_in = log_in and session is not None 28 | 29 | action = user_input.download_or_scrape(logger) 30 | 31 | # Get the info needed to scrape or download the bookmarks 32 | username, url = user_input.get_username(logged_in, action, logger) 33 | 34 | # Call get_available_pages and check the result 35 | available_pages = user_input.get_available_pages(username, session, url, logger) 36 | 37 | if available_pages is not None: 38 | if action != "download updates": 39 | # Get the range of the pages 40 | page_range = user_input.get_page_range(session, url, logger) 41 | if page_range is None: 42 | continue 43 | start_page, end_page = page_range 44 | else: 45 | # When downloading updates, start from page 1 and end at the last page without asking 46 | start_page = 1 47 | end_page = available_pages 48 | 49 | delay = user_input.get_delay(logger) 50 | 51 | if action in ["download", "download updates"]: 52 | chosen_format = user_input.get_download_format(logger, action) 53 | downloading_utils.download_bookmarks(username, logged_in, start_page, end_page, session, 54 | chosen_format, delay, action, logger) 55 | elif action == "scrape": 56 | scraping_utils.scrape_bookmarks(username, start_page, end_page, session, delay, logger) 57 | 58 | if not user_input.ask_again(logger): 59 | user_interface.print_goodbye() 60 | break # Exit the loop if the user chooses not to try again 61 | 62 | except KeyboardInterrupt: 63 | error_handling.handle_keyboard_interrupt(logger) 64 | 65 | finally: 66 | if session: 67 | session.close() 68 | print("\nSession closed.") 69 | logger.info("Session closed.") 70 | 71 | if __name__ == "__main__": 72 | main() 73 | 74 | -------------------------------------------------------------------------------- /utils/scraping_utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import socket 3 | import time 4 | import requests 5 | from bs4 import BeautifulSoup 6 | from colorama import Fore 7 | from tqdm import tqdm 8 | from . 
import error_handling 9 | 10 | 11 | # Gets the text of an element 12 | def get_element_text(element): 13 | return element.text.strip() if element else "" 14 | 15 | 16 | # Gets the text of a list of elements 17 | def get_element_text_list(elements): 18 | return [element.text.strip() for element in elements] if elements else [] 19 | 20 | 21 | # Scrapes a single bookmark entry 22 | def scrape_single_bookmark(bookmark, csvwriter): 23 | # Get title from the bookmark 24 | title_element = bookmark.select_one("h4 a:nth-of-type(1)") 25 | if title_element: 26 | title = get_element_text(title_element) 27 | else: 28 | return 29 | 30 | # Get the other data from the bookmark 31 | authors = get_element_text_list(bookmark.select("a[rel='author']")) 32 | fandoms = get_element_text_list(bookmark.select(".fandoms a")) 33 | warnings = get_element_text_list(bookmark.select("li.warnings")) 34 | ratings = get_element_text_list(bookmark.select_one("span.rating")) 35 | categories = get_element_text_list(bookmark.select("span.category")) 36 | words = get_element_text(bookmark.select_one("dd.words") or bookmark.select_one("dd")) 37 | tags = get_element_text_list(bookmark.select("li.freeforms")) 38 | characters = get_element_text_list(bookmark.select("li.characters")) 39 | relationships = get_element_text_list(bookmark.select("li.relationships")) 40 | date_bookmarked = get_element_text(bookmark.select_one("div.user p.datetime")) 41 | url = "https://archiveofourown.org" + bookmark.select_one("h4 a:nth-of-type(1)")["href"] 42 | date_updated = get_element_text(bookmark.select_one("p.datetime")) 43 | 44 | # Replace commas with semicolons in ratings and categories (important when bookmark is a series) 45 | ratings = [rating.replace(',', ';') for rating in ratings] 46 | categories = [category.replace(',', ';') for category in categories] 47 | 48 | # Write bookmark data to CSV, replace empty author with "Anonymous" 49 | csvwriter.writerow([ 50 | url, title, '; '.join(authors) if authors else 'Anonymous', '; '.join(fandoms), '; '.join(warnings), 51 | '; '.join(ratings), '; '.join(categories), '; '.join(characters), 52 | '; '.join(relationships), '; '.join(tags), words, date_bookmarked, date_updated 53 | ]) 54 | 55 | 56 | # Scrape the bookmarks of a user 57 | def scrape_bookmarks(username, start_page, end_page, session, delay, logger): 58 | with open(username + '_bookmarks.csv', 'w', newline='', encoding='utf-8') as csvfile: 59 | csvwriter = csv.writer(csvfile) 60 | logger.info(f"CSV file created: {username}_bookmarks.csv") 61 | 62 | # Write header row to CSV file 63 | csvwriter.writerow( 64 | ['URL', 'Title', 'Authors', 'Fandoms', 'Warnings', 'Rating', 'Categories', 'Characters', 65 | 'Relationships', 'Tags', 'Words', 'Date Bookmarked', 'Date Updated']) 66 | logger.info("Header row written to CSV file") 67 | 68 | num_bookmarks = 0 69 | total_pages = end_page - start_page + 1 70 | 71 | # Loop through pages and scrape bookmarks 72 | print() 73 | for page in tqdm(range(start_page, end_page + 1), total=total_pages, desc="Scraping: "): 74 | try: 75 | response = session.get( 76 | f"https://archiveofourown.org/users/{username}/bookmarks?private=true&page={page}") if \ 77 | session else requests.get(f"https://archiveofourown.org/users/{username}/bookmarks?page={page}") 78 | 79 | response.raise_for_status() 80 | 81 | time.sleep(delay) 82 | soup = BeautifulSoup(response.text, 'html.parser') 83 | logger.info(f"Scraping page {page}") 84 | 85 | except (requests.exceptions.RequestException, socket.timeout) as error: 86 | 
error_handling.handle_request_error(error, logger) 87 | return 88 | if error_handling.handle_retry_later(response, logger): 89 | return 90 | 91 | # Loop through each bookmark on the page 92 | for bookmark in soup.select("li.bookmark"): 93 | scrape_single_bookmark(bookmark, csvwriter) 94 | num_bookmarks += 1 95 | 96 | # Print completion message 97 | logger.info(f"Scraping complete. Scraped {num_bookmarks} bookmarks.") 98 | print("\nAll done! \nYour bookmarks have been saved to {}{}{}_bookmarks.csv{}.".format(Fore.CYAN, 99 | username, Fore.RESET, 100 | Fore.RESET)) 101 | print("Scraped {}{}{} bookmarks.".format(Fore.CYAN, num_bookmarks, Fore.RESET)) 102 | -------------------------------------------------------------------------------- /utils/downloading_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from datetime import datetime 4 | import warnings 5 | from colorama import Fore 6 | 7 | import requests 8 | from bs4 import BeautifulSoup 9 | from tqdm import tqdm 10 | 11 | from . import error_handling 12 | from . import updating_utils 13 | 14 | # Before importing ebooklib, filter out the specific warning 15 | warnings.filterwarnings("ignore", category=UserWarning, 16 | message="In the future version we will turn default option ignore_ncx to True.") 17 | 18 | 19 | # Cleans the folder name to be used as a folder name 20 | def clean_folder_name(name): 21 | forbidden_characters = r'<>:"/\|?*' 22 | return ''.join(char for char in name if char not in forbidden_characters) 23 | 24 | 25 | # Cleans the work title to be used as a file name 26 | def clean_work_title(title): 27 | forbidden_characters = r'<>:"/\|?*' 28 | cleaned_title = ''.join(char for char in title if char not in forbidden_characters) 29 | return cleaned_title[:50] 30 | 31 | 32 | # Extracts the work URLs from a page of bookmarks 33 | def extract_work_urls_from_page(url, session, logger): 34 | work_urls = [] 35 | 36 | try: 37 | response = session.get(url) if session else requests.get(url) 38 | response.raise_for_status() # Check for request errors 39 | 40 | soup = BeautifulSoup(response.content, 'html.parser') 41 | 42 | for bookmark in soup.select("li.bookmark"): 43 | work_url_element = bookmark.select_one("h4 a:nth-of-type(1)") 44 | if work_url_element: 45 | work_url = "https://archiveofourown.org" + work_url_element["href"] 46 | if "/series/" in work_url: 47 | series_work_urls = extract_work_urls_from_series(work_url, session, logger) 48 | work_urls.extend(series_work_urls) 49 | else: 50 | work_urls.append(work_url) 51 | 52 | except requests.exceptions.RequestException as error: 53 | error_handling.handle_request_error(error, logger) 54 | 55 | return work_urls 56 | 57 | 58 | # Extracts the work URLs from series 59 | def extract_work_urls_from_series(series_url, session, logger): 60 | series_work_urls = [] 61 | 62 | try: 63 | response = session.get(series_url) if session else requests.get(series_url) 64 | response.raise_for_status() # Check for request errors 65 | 66 | soup = BeautifulSoup(response.content, 'html.parser') 67 | 68 | series_work_elements = soup.select("ul.series.work.index.group li.work.blurb.group") 69 | for series_work_element in series_work_elements: 70 | series_work_url_element = series_work_element.select_one("h4 a:nth-of-type(1)") 71 | if series_work_url_element: 72 | series_work_url = "https://archiveofourown.org" + series_work_url_element["href"] 73 | series_work_urls.append(series_work_url) 74 | 75 | except 
requests.exceptions.RequestException as error: 76 | error_handling.handle_request_error(error, logger) # Handle request error 77 | 78 | return series_work_urls 79 | 80 | 81 | # Downloads the works from the given work URLs 82 | def download_works_from_urls(work_url, session, chosen_format, action, logger): 83 | no_update_needed = False 84 | try: 85 | response = session.get(work_url) if session else requests.get(work_url) 86 | if response.status_code == 200: 87 | soup = BeautifulSoup(response.content, 'html.parser') 88 | 89 | work_title = soup.find("h2", class_="title heading").get_text(strip=True) 90 | author_elements = soup.select(".byline a[rel='author']") 91 | max_display_authors = 3 92 | work_authors = [author.get_text(strip=True) for author in author_elements[:max_display_authors]] 93 | if not work_authors: 94 | work_authors = ["Anonymous"] 95 | 96 | work_fandoms = [fandom.get_text(strip=True) for fandom in soup.select(".fandom a")] 97 | 98 | update_date_element = soup.select_one("dd.status") or soup.select_one("dd.published") 99 | if update_date_element: 100 | update_date_element = datetime.strptime(update_date_element.get_text(strip=True), "%Y-%m-%d") 101 | 102 | download_menu = soup.find("li", class_="download") 103 | if download_menu: 104 | format_links = { 105 | "EPUB": "EPUB", "MOBI": "MOBI", "PDF": "PDF", "HTML": "HTML", "AZW3": "AZW3" 106 | } 107 | 108 | format_name = format_links.get(chosen_format) 109 | if format_name: 110 | format_link = download_menu.find("a", href=True, string=format_name) 111 | if format_link: 112 | format_url = "https://archiveofourown.org" + format_link["href"] 113 | 114 | for fandom in work_fandoms: 115 | cleaned_fandom = clean_folder_name(fandom) 116 | folder_path = os.path.join("Downloaded Works", cleaned_fandom) 117 | os.makedirs(folder_path, exist_ok=True) 118 | 119 | cleaned_work_title = clean_work_title(work_title) 120 | authors_string = ' & '.join(work_authors) 121 | file_name = f"{cleaned_work_title} by {authors_string}.{format_name.lower()}" 122 | file_path = os.path.join(folder_path, file_name) 123 | 124 | if os.path.exists(file_path): 125 | if action == "download updates": 126 | if chosen_format == "EPUB": 127 | file_date = updating_utils.extract_epub_date(file_path) 128 | elif chosen_format == "PDF": 129 | file_date = updating_utils.extract_pdf_date(file_path) 130 | elif chosen_format == "HTML": 131 | file_date = updating_utils.extract_html_date(file_path) 132 | else: 133 | file_date = None 134 | 135 | if file_date is None or (update_date_element is not None and 136 | update_date_element > file_date): 137 | download_file(file_path, format_url, file_name, cleaned_fandom, logger) 138 | else: 139 | logger.info(f"'{file_name}' in '{cleaned_fandom}' does not need to be updated.") 140 | no_update_needed = True # (assumes all works are up-to-date) 141 | tqdm.write(f"{Fore.CYAN}All works are up-to-date.{Fore.RESET}\n") 142 | else: 143 | logger.info(f"'{file_name}' in '{cleaned_fandom}' already exists. 
Skipping.") 144 | else: 145 | download_file(file_path, format_url, file_name, cleaned_fandom, logger) 146 | 147 | except requests.RequestException as error: 148 | error_handling.handle_request_error(error, logger) 149 | 150 | return no_update_needed 151 | 152 | 153 | # Define a separate function for downloading a file 154 | def download_file(file_path, format_url, file_name, cleaned_fandom, logger): 155 | response_format = requests.get(format_url) 156 | if response_format.status_code == 200: 157 | with open(file_path, "wb") as file: 158 | file.write(response_format.content) 159 | logger.info( 160 | f"'{file_name}' downloaded successfully to '{cleaned_fandom}' folder." 161 | ) 162 | 163 | 164 | def download_bookmarks(username, logged_in, start_page, end_page, session, chosen_format, delay, action, logger): 165 | if action == "download updates": 166 | base_url = f"https://archiveofourown.org/users/{username}/bookmarks?bookmark_search%5Bbookmark_query%5D" \ 167 | f"=&bookmark_search%5Bbookmarkable_query%5D=&bookmark_search%5Bexcluded_bookmark_tag_names%5D" \ 168 | f"=&bookmark_search%5Bexcluded_tag_names%5D=&bookmark_search%5Blanguage_id%5D=&bookmark_search" \ 169 | f"%5Bother_bookmark_tag_names%5D=&bookmark_search%5Bother_tag_names%5D=&bookmark_search%5Brec%5D=0" \ 170 | f"&bookmark_search%5Bsort_column%5D=bookmarkable_date&bookmark_search%5Bwith_notes%5D=0&commit" \ 171 | f"=Sort+and+Filter" 172 | logged_in_base_url = base_url + "&private=true" 173 | else: 174 | base_url = f"https://archiveofourown.org/users/{username}/bookmarks" 175 | logged_in_base_url = base_url + "?private=true" 176 | 177 | page_number = start_page 178 | 179 | while page_number <= end_page: 180 | if action == "download updates": 181 | bookmark_page_url = f"{logged_in_base_url}&page={page_number}" if logged_in else \ 182 | f"{base_url}&page={page_number}" 183 | else: 184 | bookmark_page_url = f"{logged_in_base_url}&page={page_number}" if logged_in else f"{base_url}?page=" \ 185 | f"{page_number}" 186 | work_urls = extract_work_urls_from_page(bookmark_page_url, session, logger) 187 | 188 | if not work_urls: 189 | break 190 | 191 | # Loop through extracted work URLs and download 192 | for work_url in tqdm(work_urls, desc=f"Downloading works from page {page_number}", leave=True): 193 | no_update_needed = download_works_from_urls(work_url, session, chosen_format, action, logger) 194 | if no_update_needed: 195 | return 196 | time.sleep(delay) 197 | 198 | page_number += 1 199 | -------------------------------------------------------------------------------- /utils/user_input.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | from bs4 import BeautifulSoup 4 | from colorama import Fore 5 | from . import error_handling 6 | 7 | 8 | # Asks the user if they want to log in 9 | def ask_if_log_in(logger): 10 | while True: 11 | user_choice = input("\nWould you like to log in?\nLogged in users can access private bookmarks and bookmarks " 12 | "that are only visible to logged in users.\n1. Yes\n2. No\n") 13 | 14 | if user_choice == "1": 15 | logger.info("User chose to log in.") 16 | return True 17 | elif user_choice == "2": 18 | logger.info("User chose not to log in.") 19 | return False 20 | else: 21 | error_handling.handle_invalid_input("Please enter a valid choice. 1 or 2.", logger) 22 | 23 | 24 | # Asks if the user wants to log in 25 | def ask_again(logger): 26 | while True: 27 | answer = input("\nWould you like to run the program again? \n1. Yes \n2. 
No\n") 28 | if answer == "1": 29 | logger.info("User chose to try again.") 30 | return True 31 | elif answer == "2": 32 | logger.info("User chose not to try again.") 33 | return False 34 | else: 35 | error_handling.handle_invalid_input("Please enter a valid choice. 1 or 2.", logger) 36 | 37 | 38 | # Gets username/email and password from the user and logs in 39 | def get_login_info(token, session, logger): 40 | if session is None or token is None: 41 | error_handling.handle_token_not_found(logger) 42 | return False 43 | 44 | while True: 45 | # Prompt for user input 46 | username_or_email = input("\nEnter your username or email: ") 47 | password = input("\nEnter your password: ") 48 | print("\nChecking if login is successful...") 49 | 50 | # Create a payload 51 | payload = { 52 | "utf8": "✓", 53 | "authenticity_token": token, 54 | "user[login]": username_or_email, 55 | "user[password]": password, 56 | "commit": "Log in" 57 | } 58 | 59 | try: 60 | # Make a POST request 61 | response = session.post("https://archiveofourown.org/users/login", data=payload) 62 | response.raise_for_status() 63 | 64 | except requests.exceptions.RequestException as error: 65 | error_handling.handle_request_error(error, logger) 66 | continue 67 | 68 | # Check if login was successful 69 | if "Successfully logged in" in response.text: 70 | print(Fore.CYAN + "\nLogin successful." + Fore.RESET) 71 | logger.info("Login successful.") 72 | return True 73 | else: 74 | error_handling.handle_invalid_input("Login failed. Please try again.", logger) 75 | 76 | 77 | # Gets the username of the user whose bookmarks are to be scraped 78 | def get_username(logged_in, action, logger): 79 | while True: 80 | username = input(f"\nEnter the username of the user whose bookmarks you want to download or scrape: ") 81 | if not username: 82 | error_handling.handle_invalid_input("Please enter a username.", logger) 83 | continue 84 | 85 | print("\nChecking if the username is valid...") 86 | 87 | # Check if the username follows guidelines (3-40 characters, alphanumeric and underscore) 88 | username_pattern = r"^[A-Za-z0-9_]{3,40}$" 89 | if not re.match(username_pattern, username): 90 | error_handling.handle_invalid_input("Please enter a valid username.", logger) 91 | continue 92 | logger.info(f"{username} is a valid username.") 93 | 94 | try: 95 | print("\nChecking if username exists...") 96 | response = requests.get(f"https://archiveofourown.org/users/{username}") 97 | response.raise_for_status() 98 | 99 | soup = BeautifulSoup(response.text, 'html.parser') 100 | if len(soup.find_all("div", class_="user")) > 0: 101 | if action == "scrape": 102 | print(f"\nScraping bookmarks of user: {Fore.CYAN}{username}{Fore.RESET}") 103 | elif action == "download": 104 | print(f"\nDownloading bookmarks of user: {Fore.CYAN}{username}{Fore.RESET}") 105 | url = f"https://archiveofourown.org/users/{username}/bookmarks" 106 | 107 | if logged_in: 108 | url += "?private=true" 109 | logger.info(f"{username} exists. URL: {url}") 110 | return username, url 111 | else: 112 | error_handling.handle_invalid_input(f"Username {username} does not exist. 
Please enter a valid " 113 | f"username.", logger) 114 | 115 | except requests.exceptions.RequestException as error: 116 | error_handling.handle_request_error(error, logger) 117 | 118 | 119 | # Gets the number of pages of bookmarks available (with error handling) 120 | def get_available_pages(username, session, url, logger): 121 | try: 122 | # Construct the URL based on the login status 123 | response = session.get(url, timeout=60) if session else requests.get(url, timeout=60) 124 | response.raise_for_status() 125 | 126 | soup = BeautifulSoup(response.text, 'html.parser') 127 | bookmarks = soup.find_all("li", class_="bookmark") 128 | 129 | if len(bookmarks) == 0: 130 | error_handling.handle_invalid_input(f"{username} has no bookmarks.", logger) 131 | return None 132 | 133 | # Extract pagination information 134 | pagination = soup.find("ol", class_="actions") 135 | if pagination is not None: 136 | pagination = pagination.find_all("li") 137 | last_page = int(pagination[-2].text) 138 | else: 139 | error_handling.handle_parse_error(logger) 140 | return None # Return None in case of pagination parse error 141 | 142 | print(f"\nThe user has {Fore.CYAN}{last_page}{Fore.RESET} pages of bookmarks available.") 143 | logger.info(f"{username} has {last_page} pages of bookmarks available.") 144 | return last_page 145 | 146 | except requests.exceptions.RequestException as error: # works ok if not logged in (check for logged in) 147 | error_handling.handle_request_error(error, logger) 148 | return None # Return None to indicate an error 149 | 150 | except (AttributeError, ValueError): 151 | error_handling.handle_parse_error(logger) 152 | return None 153 | 154 | 155 | # Gets the page range from the user 156 | def get_page_range(session, url, logger): 157 | while True: 158 | try: 159 | start_page = int(input("\nEnter the starting page number: ")) 160 | if start_page < 1: 161 | error_handling.handle_invalid_input("The starting page number must be positive.", logger) 162 | continue 163 | 164 | end_page = int(input("\nEnter the ending page number: ")) 165 | if end_page < 1 or end_page < start_page: 166 | error_handling.handle_invalid_input("The ending page number must be positive and greater than the starting page number.", logger) 167 | continue 168 | 169 | # Try to fetch the URL and extract pagination info 170 | try: 171 | # Request the bookmark page and raise HTTPError for bad responses 172 | response = session.get(url, timeout=60) if session else requests.get(url, timeout=60) 173 | response.raise_for_status() 174 | 175 | # Parse the HTML and extract the last page number from pagination 176 | soup = BeautifulSoup(response.text, 'html.parser') 177 | pagination = soup.find("ol", class_="actions") 178 | 179 | if pagination: 180 | pagination = pagination.find_all("li") 181 | last_page = int(pagination[-2].text) # Second-to-last is usually the last page number 182 | else: 183 | error_handling.handle_parse_error(logger) 184 | continue 185 | 186 | # Validate the user-entered page numbers against available pages 187 | if start_page > last_page: 188 | error_handling.handle_invalid_input(f"Starting page {start_page} is out of range. Available starting pages are between 1 - {last_page}.", logger) 189 | continue 190 | if end_page > last_page: 191 | error_handling.handle_invalid_input(f"Ending page {end_page} is out of range. 
The last available page is {last_page}.", logger) 192 | continue 193 | 194 | except requests.exceptions.RequestException as error: 195 | # Handles errors and logs cleanly 196 | error_handling.handle_request_error(error, logger) 197 | return None 198 | except (AttributeError, ValueError): 199 | # Handles broken HTML or unexpected formats 200 | error_handling.handle_parse_error(logger) 201 | return None 202 | 203 | logger.info(f"Page range: {start_page} - {end_page}") 204 | return start_page, end_page 205 | 206 | except ValueError: 207 | error_handling.handle_invalid_input("Please enter a valid number.", logger) 208 | 209 | 210 | # Gets the delay between requests 211 | def get_delay(logger): 212 | while True: 213 | try: 214 | delay = int(input("\nEnter the interval delay between requests (at least 1 second, suggested value " 215 | "is 5 seconds): ")) 216 | if delay < 1: 217 | error_handling.handle_invalid_input("Please enter a delay of at least 1 second.", logger) 218 | continue 219 | break 220 | 221 | except ValueError: 222 | error_handling.handle_invalid_input("Please enter a valid number.", logger) 223 | 224 | logger.info(f"Delay: {delay} seconds") 225 | return delay 226 | 227 | 228 | # Gets the input for the download or scrape choice 229 | def download_or_scrape(logger): 230 | while True: 231 | choice = input("\nDo you want to scrape the bookmarks or download them?\n1. Scrape\n2. Download\n" 232 | "3. Download updates\n") 233 | choices = ['1', '2', '3'] 234 | if choice in choices: 235 | action = ["scrape", "download", "download updates"][int(choice) - 1] 236 | logger.info(f"User chose to {action} bookmarks.") 237 | return action 238 | else: 239 | error_handling.handle_invalid_input("Invalid choice. Please enter a valid number.", logger) 240 | 241 | 242 | # Gets the input for the download format 243 | def get_download_format(logger, action): 244 | while True: 245 | if action == "download updates": 246 | user_format = input("\nChoose the download format:\n1. EPUB\n2. PDF\n3. HTML\n") 247 | formats = ['1', '2', '3'] 248 | else: 249 | user_format = input("\nChoose the download format:\n1. AZW3\n2. EPUB\n3. MOBI\n4. PDF\n5. HTML\n") 250 | formats = ['1', '2', '3', '4', '5'] 251 | 252 | if user_format in formats: 253 | if action == "download updates": 254 | chosen_format = ["EPUB", "PDF", "HTML"][int(user_format) - 1] 255 | else: 256 | chosen_format = ["AZW3", "EPUB", "MOBI", "PDF", "HTML"][int(user_format) - 1] 257 | 258 | print() 259 | 260 | logger.info(f"User chose to download in {chosen_format} format.") 261 | return chosen_format 262 | else: 263 | error_handling.handle_invalid_input("Invalid choice. Please enter a valid number.", logger) 264 | --------------------------------------------------------------------------------