├── ad.bat ├── .gitattributes ├── requirements.txt ├── ad.txt ├── README.md └── ad.py /ad.bat: -------------------------------------------------------------------------------- 1 | python ad.py -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | tqdm 3 | img2pdf 4 | pywin32 5 | titlecase 6 | pyperclip -------------------------------------------------------------------------------- /ad.txt: -------------------------------------------------------------------------------- 1 | # Your archive.org email 2 | billgates@microsoft.com 3 | # Your archive.org password 4 | toolazy 5 | # Output directory [default - current] 6 | # Image resolution (10 to 0, 0 is the highest), [default 0] 7 | 0 8 | # Maximum number of threads, [default 10] 9 | 10 10 | # Type of output - jpg, pdf, jpgpdf, jpgpdfmeta [default - jpg] 11 | jpgpdfmeta 12 | # Preparation for editing 13 | none 14 | # Book URLs or only IDs 15 | https://archive.org/details/billgatesbiograp0000becr 16 | https://archive.org/details/billgates00woog -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

AD - Archive.org-Downloader

2 | 3 |

4 | Python3 script and ready-to-run Windows tool to download archive.org books in PDF format! 5 |
6 |

7 | 8 | ## About 9 | 10 | Allows you to download books locally from https://archive.org! 11 | 12 | It can download the original JPG files to work on them further, join them into a single PDF without quality loss, or make a special PDF with cover and metadata. 13 | 14 | The tool also has instant clipboard support, and will pick up an individual book URL or a list of them from the clipboard. 15 | 16 | You must remember to create an account on https://archive.org/ for the tool to work. 17 | 18 | ## Getting Started 19 | 20 | You can either install Python, use "pip install -r requirements.txt" and run the tool on any platform. 21 | 22 | Or if you are on Windows - you can download a ready-to-run exe tool. 23 | 24 | Download the ZIP archive with AD.exe and the settings file from the project Releases, make some folder and unpack the ZIP archive - you don't need to install anything else. 25 | 26 | ## Usage 27 | 28 | Open the ad.txt settings file in an editor, read the comments and put your information in the proper place. 29 | 30 | Save the settings file and run the tool, it'll pick up all the new settings. 
31 | 32 | ## Notes 33 | 34 | Tool is based on Archive.org-Downloader original script with many feature additions -------------------------------------------------------------------------------- /ad.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import random, string 3 | from concurrent import futures 4 | from tqdm import tqdm 5 | import time 6 | from datetime import datetime 7 | import os 8 | import sys 9 | import shutil 10 | from titlecase import titlecase 11 | import pyperclip 12 | 13 | def display_error(response, message): 14 | print(message) 15 | print(response) 16 | print(response.text) 17 | sys.exit() 18 | 19 | 20 | # Request from site all information about book, including titles, all metadata and such 21 | 22 | def get_book_infos(session, url): 23 | r = session.get(url).text 24 | infos_url = "https:" + r.split('"url":"')[1].split('"')[0].replace("\\u0026", "&") 25 | response = session.get(infos_url) 26 | data = response.json()['data'] 27 | title = titlecase(data['brOptions']['bookTitle']) # titlecase is more advanced compared to capwords method, but only for english! 28 | title = ''.join( c for c in title if c not in '<>:"/\\|?*' ) # Filter forbidden chars in directory names (Windows & Linux) 29 | title = title[:150] + " " + url.split('/')[4] # Trim the title to avoid long file names and add book URL as modificator 30 | metadata = data['metadata'] 31 | links = [] 32 | for item in data['brOptions']['data']: 33 | for page in item: 34 | links.append(page['uri']) 35 | 36 | if len(links) > 1: 37 | print(f"[+] This book has {len(links)} pages") 38 | return title, links, metadata 39 | else: 40 | print(f"[-] Error while getting links to images of the pages!") 41 | sys.exit() # must raise exeption, not exit! 
42 | 43 | def format_data(content_type, fields): 44 | data = "" 45 | for name, value in fields.items(): 46 | data += f"--{content_type}\x0d\x0aContent-Disposition: form-data; name=\"{name}\"\x0d\x0a\x0d\x0a{value}\x0d\x0a" 47 | data += content_type+"--" 48 | return data 49 | 50 | def login(email, password): 51 | session = requests.Session() 52 | session.get("https://archive.org/account/login") 53 | data = {"username":email, "password":password} 54 | 55 | response = session.post("https://archive.org/account/login", data=data) 56 | if "bad_login" in response.text: 57 | print("[-] Wrong email or password, please check!") 58 | sys.exit() 59 | elif "Successful login" in response.text: 60 | print("[+] Successfully logged in!") 61 | return session 62 | else: 63 | display_error(response, "[-] Error while logging in:") 64 | 65 | def loan(session, book_id, verbose=True): 66 | data = { 67 | "action": "grant_access", 68 | "identifier": book_id 69 | } 70 | # 2022-07-03: This request is done by the website but we don't need to do it here. 71 | # response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data) 72 | data['action'] = "browse_book" 73 | response = session.post("https://archive.org/services/loans/loan/", data=data) 74 | 75 | if response.status_code == 400 : 76 | if response.json()["error"] == "This book is not available to borrow at this time. 
Please try again later.": 77 | print("This book doesn't need to be borrowed") 78 | return session 79 | else : 80 | display_error(response, "Something went wrong when trying to borrow the book.") 81 | 82 | data['action'] = "create_token" 83 | response = session.post("https://archive.org/services/loans/loan/", data=data) 84 | 85 | if "token" in response.text: 86 | if verbose: 87 | print("[+] Successfully loaned this book for one hour") 88 | return session 89 | else: 90 | display_error(response, "Something went wrong when trying to borrow the book, maybe you can't borrow this book.") 91 | 92 | # routine to return loan on selected book id 93 | 94 | def return_loan(session, book_id): 95 | data = { 96 | "action": "return_loan", 97 | "identifier": book_id 98 | } 99 | response = session.post("https://archive.org/services/loans/loan/", data=data) 100 | if response.status_code == 200 and response.json()["success"]: 101 | print("[+] Book returned") 102 | else: 103 | display_error(response, "Something went wrong when trying to return the book") # else if we download multiple books we must not exit! 
104 | 105 | def image_name(pages, page, directory, book_id): 106 | return f"{directory}/{book_id}_{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg" 107 | 108 | def download_one_image(session, link, i, directory, book_id, pages): 109 | image = image_name(pages, i, directory, book_id) 110 | if not os.path.exists(image): 111 | headers = { 112 | "Referer": "https://archive.org/", 113 | "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8", 114 | "Sec-Fetch-Site": "same-site", 115 | "Sec-Fetch-Mode": "no-cors", 116 | "Sec-Fetch-Dest": "image", 117 | } 118 | retry = True 119 | while retry: 120 | try: 121 | response = session.get(link, headers=headers) 122 | if response.status_code == 403: 123 | session = loan(session, book_id, verbose=False) 124 | raise Exception("Borrow again") 125 | elif response.status_code == 200: 126 | retry = False 127 | except KeyboardInterrupt: 128 | raise 129 | except: 130 | time.sleep(1) # Wait 1 second before retrying 131 | 132 | tmpimage = image.replace(".jpg",".tmp") 133 | with open(tmpimage,"wb") as f: 134 | f.write(response.content) 135 | os.rename(tmpimage, image) 136 | 137 | def download(session, n_threads, directory, links, scale, book_id): 138 | print("Downloading pages...") 139 | links = [f"{link}&rotate=0&scale={scale}" for link in links] 140 | pages = len(links) 141 | 142 | tasks = [] 143 | with futures.ThreadPoolExecutor(max_workers=n_threads) as executor: 144 | for link in links: 145 | i = links.index(link) 146 | tasks.append(executor.submit(download_one_image, session=session, link=link, i=i, directory=directory, book_id=book_id, pages=pages)) 147 | for task in tqdm(futures.as_completed(tasks), total=len(tasks)): 148 | pass 149 | 150 | images = [image_name(pages, i, directory, book_id) for i in range(len(links))] 151 | return images 152 | 153 | 154 | # make single PDF file from multiple JPEG files 155 | 156 | def make_pdf(pdf, title, directory): 157 | 158 | file = title+".pdf" 159 | 160 | # Write only if file does 
not exist 161 | 162 | if not os.path.isfile(os.path.join(directory, file)): 163 | with open(os.path.join(directory, file),"wb") as f: 164 | f.write(pdf) 165 | print(f"[+] PDF saved as \"{file}\"") 166 | else: 167 | print(f"[-] PDF file \"{file}\" already present on disk") 168 | 169 | # set all config values 170 | 171 | def process_config(config_file_name): 172 | 173 | global email, password, scale, n_threads, d, outtype, urls, editingprep 174 | 175 | file1 = open(config_file_name, 'r') 176 | 177 | # set default values 178 | 179 | email = "none" 180 | password = "none" 181 | scale = 0 182 | n_threads = 10 183 | d = os.getcwd() 184 | outtype = "jpg" 185 | urls = [] 186 | 187 | editingprep = 'none' 188 | 189 | # Set current parsing mode to none 190 | 191 | mode_pars = "none" 192 | 193 | while True: 194 | # Get next line from file 195 | line = file1.readline() 196 | # if line is empty 197 | # end of file is reached 198 | if not line: 199 | break 200 | line = line.strip('\n ') # remove all accidental spaces and newline 201 | if line.find("#") == 0: 202 | if line.find("# Your archive.org email")==0: 203 | mode_pars = "email" 204 | if line.find("# Your archive.org password")==0: 205 | mode_pars = "password" 206 | if line.find("# Output directory")==0: 207 | mode_pars = "outdir" 208 | if line.find("# Image resolution")==0: 209 | mode_pars = "resolution" 210 | if line.find("# Maximum number of threads")==0: 211 | mode_pars = "threads" 212 | if line.find("# Type of output - jpg")==0: 213 | mode_pars = "outtype" 214 | if line.find("# Folder and file naming")==0: 215 | mode_pars = "naming" 216 | if line.find("# Preparation for editing")==0: 217 | mode_pars = "prep" 218 | 219 | if line.find("# Book URLs")==0: 220 | mode_pars = "urls" 221 | continue 222 | if mode_pars == "email": 223 | email = line 224 | if mode_pars == "password": 225 | password = line 226 | if mode_pars == "outdir": 227 | d = line 228 | if mode_pars == "prep": 229 | editingprep = line 230 | if mode_pars == 
"resolution": 231 | scale = int(line) 232 | if mode_pars == "threads": 233 | n_threads = int(line) 234 | if mode_pars == "outtype": 235 | outtype = line 236 | 237 | if mode_pars == "urls": 238 | urls.append(line.strip()) 239 | else: 240 | mode_pars = "none" 241 | 242 | file1.close() 243 | 244 | def get_clipboard_content(): 245 | clipboard_cont = "" 246 | 247 | clipboard_cont = pyperclip.paste() # Cross platform clipboard support 248 | pyperclip.copy('') # such way we won't reuse old clipboard contents next time, you can uncomment this if you need MacOS and Linux clipboard support 249 | 250 | return clipboard_cont.splitlines() 251 | 252 | def make_pdf_metadata(metadata): 253 | # prepare PDF metadata 254 | # keywords are in 'subject' 255 | # ISBN can be got from isbn': ['9780981803982', '0981803989'] 256 | # 'creator': 'Kingsley, Eve', 'date': '2008' 257 | # sometimes archive metadata is missing 258 | pdfmeta = { } 259 | # ensure metadata are str 260 | for key in ["title", "creator", "associated-names"]: 261 | if key in metadata: 262 | if isinstance(metadata[key], str): 263 | pass 264 | elif isinstance(metadata[key], list): 265 | metadata[key] = "; ".join(metadata[key]) 266 | else: 267 | raise Exception("unsupported metadata type") 268 | # title 269 | if 'title' in metadata: 270 | pdfmeta['title'] = titlecase(metadata['title']) 271 | 272 | # author, we have issue here as we need sometimes to modify names from Rayan, Jack to Jack Rayan 273 | 274 | authors_list = "" 275 | 276 | if 'creator' in metadata: 277 | authors_list = metadata['creator'] 278 | if 'associated-names' in metadata: 279 | if not authors_list == "": 280 | authors_list = authors_list + ";" 281 | authors_list = authors_list + metadata['associated-names'] 282 | 283 | authors_split = authors_list.split(";") 284 | authors_list = "" 285 | 286 | for author in authors_split: 287 | author_res="" 288 | for ch in author: 289 | if ch not in ['0','1','2','3','4','5','6','7','8','9','-']: 290 | author_res+=ch 291 | 
if not author_res.find(",")==-1: 292 | author_split = author_res.split(",") 293 | author_res = author_split[1].strip()+" "+author_split[0].strip() 294 | if authors_list=="": 295 | authors_list = author_res 296 | else: 297 | authors_list = authors_list + " & " + author_res 298 | 299 | pdfmeta['author'] = authors_list 300 | 301 | if 'date' in metadata: 302 | try: 303 | pdfmeta['creationdate'] = datetime.strptime("1 June " + metadata['date'], '%d %B %Y') 304 | pdfmeta['moddate'] = pdfmeta['creationdate'] 305 | except: 306 | pass 307 | # keywords 308 | 309 | pdfmeta['keywords'] = [f"https://archive.org/details/{book_id}"] 310 | 311 | # if 'subject' in metadata: 312 | # if isinstance(metadata['subject'], list): 313 | # pdfmeta['keywords'] = pdfmeta['keywords'] + metadata['subject'] 314 | # else: 315 | # pdfmeta['keywords'] = pdfmeta['keywords'] + [metadata['subject']] 316 | 317 | if 'isbn' in metadata: 318 | if isinstance(metadata['isbn'], list): 319 | pdfmeta['keywords'] = pdfmeta['keywords'] + metadata['isbn'] 320 | else: 321 | pdfmeta['keywords'] = pdfmeta['keywords'] + [metadata['isbn']] 322 | 323 | 324 | # if 'date' in metadata: 325 | # if isinstance(metadata['date'], list): 326 | # pdfmeta['keywords'] = pdfmeta['keywords'] + metadata['date'] 327 | # else: 328 | # pdfmeta['keywords'] = pdfmeta['keywords'] + [metadata['date']] 329 | 330 | return pdfmeta 331 | 332 | 333 | # function to patch DPI values in all jpeg files 334 | 335 | def Patch_DPI(images): 336 | print('Changing DPI settings for all images to 300dpi') 337 | 338 | res_bytes = (300).to_bytes(2, byteorder='big') 339 | inches_value = bytearray([1]) 340 | total_patch = inches_value + res_bytes + res_bytes 341 | 342 | with tqdm(total=len(images)) as t: 343 | for myimage in images: 344 | if os.path.isfile(myimage): 345 | with open(myimage,'r+b') as f: 346 | f.seek(6) # jump to JFIF 347 | jfif_signature = f.read(4) 348 | if jfif_signature == b'JFIF': 349 | f.seek(13) # jump to resolution values 350 | if 
f.read(len(total_patch )) != total_patch: 351 | f.seek(13) # jump to resolution values 352 | f.write(total_patch) 353 | t.update(1) 354 | 355 | 356 | # start of main body 357 | 358 | if __name__ == "__main__": 359 | 360 | print("Archive Downloader 2024.11.3") 361 | 362 | if len(sys.argv) == 1: 363 | print("Note that you can specify configuration file in parameters like AD C:\Path\To\MyConfig.txt") 364 | 365 | myfile = 'ad.txt' 366 | 367 | # use custom configuation file if supplied 368 | 369 | if len(sys.argv) == 2: 370 | myfile = sys.argv[1] 371 | 372 | if not os.path.isfile(myfile): 373 | print("Can't find configuration file, exiting!") 374 | sys.exit() 375 | 376 | process_config(myfile) 377 | 378 | if not os.path.isdir(d): 379 | print(f"Output directory does not exist!") 380 | sys.exit() 381 | 382 | clipboard_list = get_clipboard_content() 383 | 384 | for clip_url in clipboard_list: 385 | if clip_url.startswith("https://archive.org/details/") and not clip_url in urls: 386 | urls.append(clip_url) 387 | 388 | books = [] 389 | 390 | # Check the urls format 391 | for url in urls: 392 | if url.startswith("https://archive.org/details/"): 393 | book_id = list(filter(None, url.split("/")))[3] 394 | books.append((book_id, url)) 395 | elif len(url.split("/")) == 1: 396 | books.append((url, "https://archive.org/details/" + url)) 397 | 398 | if len(books)==0: 399 | print("No correct books URLs to download, exiting!") 400 | sys.exit() 401 | else: 402 | print(f"{len(books)} Book(s) will be downloaded") 403 | 404 | 405 | session = login(email, password) 406 | 407 | for book in books: 408 | book_id = book[0] 409 | url = book[1] 410 | print("="*40) 411 | print(f"Current book: https://archive.org/details/{book_id}") 412 | session = loan(session, book_id) 413 | title, links, metadata = get_book_infos(session, url) 414 | 415 | directory = os.path.join(d, title) # "\\\\?\\" + os.path.join(d, title) 416 | 417 | if not os.path.isdir(directory): 418 | os.makedirs(directory) 419 | 420 | 
if 'title' in metadata: 421 | print("Current book title: "+ titlecase(metadata['title'])) 422 | 423 | images = download(session, n_threads, directory, links, scale, book_id) 424 | 425 | if editingprep != "none": 426 | Patch_DPI(images) 427 | 428 | # make all subdirectories only if in this edit preparation mode 429 | 430 | if not os.path.isdir(os.path.join(directory, "Meta")): 431 | os.makedirs(os.path.join(directory, "Meta")) 432 | if not os.path.isdir(os.path.join(directory, "Cover")): 433 | os.makedirs(os.path.join(directory, "Cover")) 434 | if not os.path.isdir(os.path.join(directory, os.path.join("Cover","Proc"))): 435 | os.makedirs(os.path.join(directory, os.path.join("Cover","Proc"))) 436 | if not os.path.isdir(os.path.join(directory, "Empty")): 437 | os.makedirs(os.path.join(directory, "Empty")) 438 | if not os.path.isdir(os.path.join(directory, "Proc")): 439 | os.makedirs(os.path.join(directory, "Proc")) 440 | if not os.path.isdir(os.path.join(directory, "Proj")): 441 | os.makedirs(os.path.join(directory, "Proj")) 442 | if not os.path.isdir(os.path.join(directory, "Illust")): 443 | os.makedirs(os.path.join(directory, "Illust")) 444 | 445 | 446 | if outtype in ("pdf","jpgpdf","jpgpdfmeta","jpgepub"): # any modes that require creation of PDF or EPUB file 447 | import img2pdf 448 | 449 | pdfmeta = make_pdf_metadata(metadata) 450 | 451 | if outtype=="jpgpdfmeta": 452 | pdf = img2pdf.convert(images[0], **pdfmeta) 453 | 454 | # In preparation mode we place meta PDF in special folder and move first and last image into cover folder 455 | 456 | if editingprep != "none": 457 | make_pdf(pdf, title[:25], os.path.join(directory, "Meta")) # shorten title in case of meta to avoid too long paths 458 | else: 459 | make_pdf(pdf, title, d) 460 | else: 461 | pdf = img2pdf.convert(images, **pdfmeta) 462 | make_pdf(pdf, title, d) 463 | 464 | # move first and last images into separate Covers folder 465 | 466 | if editingprep != "none": 467 | image1 = os.path.split(images[0]) # 
first one 468 | image2 = os.path.split(images[-1]) # last one 469 | image1_path = os.path.join(os.path.join(image1[0],"Cover"), image1[1]) 470 | image2_path = os.path.join(os.path.join(image2[0],"Cover"), image2[1]) 471 | if not os.path.exists(image1_path): 472 | os.rename(images[0], image1_path) 473 | else: 474 | os.remove(images[0]) 475 | if not os.path.exists(image2_path): 476 | os.rename(images[-1], image2_path) 477 | else: 478 | os.remove(images[-1]) 479 | 480 | if outtype=="pdf": 481 | try: 482 | shutil.rmtree(directory) 483 | except OSError as e: 484 | print ("Error: %s - %s." % (e.filename, e.strerror)) 485 | 486 | return_loan(session, book_id) --------------------------------------------------------------------------------