├── ad.bat
├── .gitattributes
├── requirements.txt
├── ad.txt
├── README.md
└── ad.py
/ad.bat:
--------------------------------------------------------------------------------
1 | python ad.py
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | tqdm
3 | img2pdf
4 | pywin32
5 | titlecase
6 | pyperclip
--------------------------------------------------------------------------------
/ad.txt:
--------------------------------------------------------------------------------
1 | # Your archive.org email
2 | billgates@microsoft.com
3 | # Your archive.org password
4 | toolazy
5 | # Output directory [default - current]
6 | # Image resolution (10 to 0, 0 is the highest), [default 0]
7 | 0
8 | # Maximum number of threads, [default 10]
9 | 10
10 | # Type of output - jpg, pdf, jpgpdf, jpgpdfmeta [default - jpg]
11 | jpgpdfmeta
12 | # Preparation for editing
13 | none
14 | # Book URLs or only IDs
15 | https://archive.org/details/billgatesbiograp0000becr
16 | https://archive.org/details/billgates00woog
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AD - Archive.org-Downloader
2 |
3 |
4 | Python 3 script and ready-to-run Windows tool to download archive.org books in PDF format!
5 |
6 |
7 |
8 | ## About
9 |
10 | Lets you download books locally from https://archive.org!
11 |
12 | It can download the original JPG files for further processing, join them into a single PDF without quality loss, or produce a special PDF with cover and metadata.
13 |
14 | The tool also has instant clipboard support and will pick up an individual book URL, or a list of them, from the clipboard.
15 |
16 | You must create an account on https://archive.org/ for the tool to work.
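  |
  | Both full book URLs and bare archive.org identifiers are accepted, for example:
  |
  | ```
  | https://archive.org/details/billgates00woog
  | billgates00woog
  | ```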
17 |
18 | ## Getting Started
19 |
20 | You can install Python, run "pip install -r requirements.txt", and use the tool on any platform.
21 |
22 | Or, if you are on Windows, you can download the ready-to-run EXE tool.
23 |
24 | Download the ZIP archive with AD.exe and the settings file from the project's Releases page, create a folder, and unpack the archive there; you don't need to install anything else.
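  |
  | For example, a from-source setup looks like this (assuming Python 3 and pip are on your PATH):
  |
  | ```
  | pip install -r requirements.txt
  | python ad.py
  | ```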
25 |
26 | ## Usage
27 |
28 | Open the ad.txt settings file in an editor, read the comments, and put your information in the proper places.
29 |
30 | Save the settings file and run the tool; it will pick up all the new settings.
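  |
  | A minimal ad.txt follows the template shipped with the tool (the values below are placeholders):
  |
  | ```
  | # Your archive.org email
  | you@example.com
  | # Your archive.org password
  | yourpassword
  | # Type of output - jpg, pdf, jpgpdf, jpgpdfmeta [default - jpg]
  | pdf
  | # Book URLs or only IDs
  | https://archive.org/details/billgates00woog
  | ```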
31 |
32 | ## Notes
33 |
34 | The tool is based on the original Archive.org-Downloader script, with many feature additions.
--------------------------------------------------------------------------------
/ad.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import random, string
3 | from concurrent import futures
4 | from tqdm import tqdm
5 | import time
6 | from datetime import datetime
7 | import os
8 | import sys
9 | import shutil
10 | from titlecase import titlecase
11 | import pyperclip
12 |
13 | def display_error(response, message):
14 | print(message)
15 | print(response)
16 | print(response.text)
17 | sys.exit()
18 |
19 |
20 | # Request all information about the book from the site: title, metadata and links to the page images
21 |
22 | def get_book_infos(session, url):
23 | r = session.get(url).text
24 | infos_url = "https:" + r.split('"url":"')[1].split('"')[0].replace("\\u0026", "&")
25 | response = session.get(infos_url)
26 | data = response.json()['data']
27 |     title = titlecase(data['brOptions']['bookTitle']) # titlecase handles more cases than string.capwords, but only for English titles
28 |     title = ''.join(c for c in title if c not in '<>:"/\\|?*') # filter characters forbidden in directory names (Windows & Linux)
29 |     title = title[:150] + " " + url.split('/')[4] # trim the title to avoid over-long file names and append the book ID to keep it unique
30 | metadata = data['metadata']
31 | links = []
32 | for item in data['brOptions']['data']:
33 | for page in item:
34 | links.append(page['uri'])
35 |
36 | if len(links) > 1:
37 | print(f"[+] This book has {len(links)} pages")
38 | return title, links, metadata
39 | else:
40 |         print("[-] Error while getting links to images of the pages!")
41 |         sys.exit() # TODO: should raise an exception rather than exit, so one failed book doesn't abort a multi-book run
42 |
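  | # Build a raw multipart/form-data body from a dict of fields, using content_type
  | # as the boundary marker (not currently called anywhere in this script; kept for
  | # requests that need a hand-built multipart body).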
43 | def format_data(content_type, fields):
44 | data = ""
45 | for name, value in fields.items():
46 | data += f"--{content_type}\x0d\x0aContent-Disposition: form-data; name=\"{name}\"\x0d\x0a\x0d\x0a{value}\x0d\x0a"
47 | data += content_type+"--"
48 | return data
49 |
50 | def login(email, password):
51 | session = requests.Session()
52 | session.get("https://archive.org/account/login")
53 | data = {"username":email, "password":password}
54 |
55 | response = session.post("https://archive.org/account/login", data=data)
56 | if "bad_login" in response.text:
57 | print("[-] Wrong email or password, please check!")
58 | sys.exit()
59 | elif "Successful login" in response.text:
60 | print("[+] Successfully logged in!")
61 | return session
62 | else:
63 | display_error(response, "[-] Error while logging in:")
64 |
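  | # Borrow the book for the logged-in account, mirroring what the website does:
  | # "browse_book" opens a browsing session, then "create_token" fetches the access
  | # token that authorizes the page downloads. Freely readable books answer with
  | # HTTP 400 and need no loan at all.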
65 | def loan(session, book_id, verbose=True):
66 | data = {
67 | "action": "grant_access",
68 | "identifier": book_id
69 | }
70 | # 2022-07-03: This request is done by the website but we don't need to do it here.
71 | # response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data)
72 | data['action'] = "browse_book"
73 | response = session.post("https://archive.org/services/loans/loan/", data=data)
74 |
75 |     if response.status_code == 400:
76 |         if response.json()["error"] == "This book is not available to borrow at this time. Please try again later.":
77 |             print("This book doesn't need to be borrowed")
78 |             return session
79 |         else:
80 |             display_error(response, "Something went wrong when trying to borrow the book.")
81 |
82 | data['action'] = "create_token"
83 | response = session.post("https://archive.org/services/loans/loan/", data=data)
84 |
85 | if "token" in response.text:
86 | if verbose:
87 | print("[+] Successfully loaned this book for one hour")
88 | return session
89 | else:
90 | display_error(response, "Something went wrong when trying to borrow the book, maybe you can't borrow this book.")
91 |
92 | # Return the loan on the selected book id
93 |
94 | def return_loan(session, book_id):
95 | data = {
96 | "action": "return_loan",
97 | "identifier": book_id
98 | }
99 | response = session.post("https://archive.org/services/loans/loan/", data=data)
100 | if response.status_code == 200 and response.json()["success"]:
101 | print("[+] Book returned")
102 | else:
103 |         display_error(response, "Something went wrong when trying to return the book") # NOTE: display_error exits; when downloading multiple books this should not abort the whole run
104 |
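  | # Build a zero-padded page file name so the images sort correctly on disk,
  | # e.g. page 7 of a 250-page book becomes "<book_id>_007.jpg".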
105 | def image_name(pages, page, directory, book_id):
106 | return f"{directory}/{book_id}_{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"
107 |
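  | # Download a single page image, retrying until it succeeds: a 403 means the
  | # one-hour loan has expired, so the loan is renewed and the request retried;
  | # any other failure waits a second and tries again. The image is written to a
  | # .tmp file and then renamed, so an interrupted run never leaves a truncated .jpg.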
108 | def download_one_image(session, link, i, directory, book_id, pages):
109 | image = image_name(pages, i, directory, book_id)
110 | if not os.path.exists(image):
111 | headers = {
112 | "Referer": "https://archive.org/",
113 | "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
114 | "Sec-Fetch-Site": "same-site",
115 | "Sec-Fetch-Mode": "no-cors",
116 | "Sec-Fetch-Dest": "image",
117 | }
118 | retry = True
119 | while retry:
120 | try:
121 | response = session.get(link, headers=headers)
122 | if response.status_code == 403:
123 | session = loan(session, book_id, verbose=False)
124 | raise Exception("Borrow again")
125 | elif response.status_code == 200:
126 | retry = False
127 | except KeyboardInterrupt:
128 | raise
129 | except:
130 | time.sleep(1) # Wait 1 second before retrying
131 |
132 | tmpimage = image.replace(".jpg",".tmp")
133 | with open(tmpimage,"wb") as f:
134 | f.write(response.content)
135 | os.rename(tmpimage, image)
136 |
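  | # Download all pages concurrently; "scale" is archive.org's size-reduction
  | # factor (0 = full resolution, larger values = smaller images) and is appended
  | # to every page URL together with rotate=0.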
137 | def download(session, n_threads, directory, links, scale, book_id):
138 | print("Downloading pages...")
139 | links = [f"{link}&rotate=0&scale={scale}" for link in links]
140 | pages = len(links)
141 |
142 | tasks = []
143 | with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
144 |         for i, link in enumerate(links):
145 |             # enumerate avoids a quadratic links.index() lookup and stays correct if the same link appears twice
146 |             tasks.append(executor.submit(download_one_image, session=session, link=link, i=i, directory=directory, book_id=book_id, pages=pages))
147 | for task in tqdm(futures.as_completed(tasks), total=len(tasks)):
148 | pass
149 |
150 | images = [image_name(pages, i, directory, book_id) for i in range(len(links))]
151 | return images
152 |
153 |
154 | # make single PDF file from multiple JPEG files
155 |
156 | def make_pdf(pdf, title, directory):
157 |
158 | file = title+".pdf"
159 |
160 | # Write only if file does not exist
161 |
162 | if not os.path.isfile(os.path.join(directory, file)):
163 | with open(os.path.join(directory, file),"wb") as f:
164 | f.write(pdf)
165 | print(f"[+] PDF saved as \"{file}\"")
166 | else:
167 | print(f"[-] PDF file \"{file}\" already present on disk")
168 |
169 | # set all config values
170 |
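  | # The config is a plain-text file: every line starting with "#" is a section
  | # header that switches the parser into the matching mode, and the non-comment
  | # line(s) that follow are the value(s) for that section. A header with no
  | # value line keeps the built-in default.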
171 | def process_config(config_file_name):
172 |
173 | global email, password, scale, n_threads, d, outtype, urls, editingprep
174 |
175 | file1 = open(config_file_name, 'r')
176 |
177 | # set default values
178 |
179 | email = "none"
180 | password = "none"
181 | scale = 0
182 | n_threads = 10
183 | d = os.getcwd()
184 | outtype = "jpg"
185 | urls = []
186 |
187 | editingprep = 'none'
188 |
189 | # Set current parsing mode to none
190 |
191 | mode_pars = "none"
192 |
193 | while True:
194 | # Get next line from file
195 | line = file1.readline()
196 | # if line is empty
197 | # end of file is reached
198 | if not line:
199 | break
200 |         line = line.strip('\n ') # remove accidental spaces and the trailing newline
  |         if not line:
  |             continue # a blank line would otherwise overwrite the current setting with an empty string
201 | if line.find("#") == 0:
202 | if line.find("# Your archive.org email")==0:
203 | mode_pars = "email"
204 | if line.find("# Your archive.org password")==0:
205 | mode_pars = "password"
206 | if line.find("# Output directory")==0:
207 | mode_pars = "outdir"
208 | if line.find("# Image resolution")==0:
209 | mode_pars = "resolution"
210 | if line.find("# Maximum number of threads")==0:
211 | mode_pars = "threads"
212 | if line.find("# Type of output - jpg")==0:
213 | mode_pars = "outtype"
214 | if line.find("# Folder and file naming")==0:
215 | mode_pars = "naming"
216 | if line.find("# Preparation for editing")==0:
217 | mode_pars = "prep"
218 |
219 | if line.find("# Book URLs")==0:
220 | mode_pars = "urls"
221 | continue
222 | if mode_pars == "email":
223 | email = line
224 | if mode_pars == "password":
225 | password = line
226 | if mode_pars == "outdir":
227 | d = line
228 | if mode_pars == "prep":
229 | editingprep = line
230 | if mode_pars == "resolution":
231 | scale = int(line)
232 | if mode_pars == "threads":
233 | n_threads = int(line)
234 | if mode_pars == "outtype":
235 | outtype = line
236 |
237 | if mode_pars == "urls":
238 | urls.append(line.strip())
239 | else:
240 | mode_pars = "none"
241 |
242 | file1.close()
243 |
244 | def get_clipboard_content():
245 | clipboard_cont = ""
246 |
247 |     clipboard_cont = pyperclip.paste() # cross-platform clipboard read
248 |     pyperclip.copy('') # clear the clipboard so the same URLs aren't picked up again on the next run
249 |
250 | return clipboard_cont.splitlines()
251 |
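  | # Map archive.org item metadata onto the keyword arguments that img2pdf.convert()
  | # accepts for PDF metadata: 'title' -> title, 'creator'/'associated-names' -> author,
  | # 'date' -> creationdate/moddate, and the source URL plus ISBNs -> keywords.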
252 | def make_pdf_metadata(metadata):
253 | # prepare PDF metadata
254 |     # keywords are in 'subject'
255 |     # ISBN comes from metadata['isbn'], e.g. ['9780981803982', '0981803989']
256 |     # other fields look like 'creator': 'Kingsley, Eve', 'date': '2008'
257 |     # sometimes archive metadata fields are missing entirely
258 | pdfmeta = { }
259 | # ensure metadata are str
260 | for key in ["title", "creator", "associated-names"]:
261 | if key in metadata:
262 | if isinstance(metadata[key], str):
263 | pass
264 | elif isinstance(metadata[key], list):
265 | metadata[key] = "; ".join(metadata[key])
266 | else:
267 | raise Exception("unsupported metadata type")
268 | # title
269 | if 'title' in metadata:
270 | pdfmeta['title'] = titlecase(metadata['title'])
271 |
272 |     # author: names sometimes arrive as "Rayan, Jack" and must be flipped to "Jack Rayan"
273 |
274 | authors_list = ""
275 |
276 | if 'creator' in metadata:
277 | authors_list = metadata['creator']
278 | if 'associated-names' in metadata:
279 | if not authors_list == "":
280 | authors_list = authors_list + ";"
281 | authors_list = authors_list + metadata['associated-names']
282 |
283 | authors_split = authors_list.split(";")
284 | authors_list = ""
285 |
286 |     for author in authors_split:
287 |         author_res = ""
288 |         for ch in author:
289 |             if ch not in ['0','1','2','3','4','5','6','7','8','9','-']:
290 |                 author_res += ch # drop digits and hyphens, e.g. life dates like "1955-"
291 | if not author_res.find(",")==-1:
292 | author_split = author_res.split(",")
293 | author_res = author_split[1].strip()+" "+author_split[0].strip()
294 | if authors_list=="":
295 | authors_list = author_res
296 | else:
297 | authors_list = authors_list + " & " + author_res
298 |
299 | pdfmeta['author'] = authors_list
300 |
301 | if 'date' in metadata:
302 | try:
303 | pdfmeta['creationdate'] = datetime.strptime("1 June " + metadata['date'], '%d %B %Y')
304 | pdfmeta['moddate'] = pdfmeta['creationdate']
305 | except:
306 | pass
307 | # keywords
308 |
309 |     pdfmeta['keywords'] = [f"https://archive.org/details/{book_id}"] # book_id is the global set by the main loop
310 |
311 | # if 'subject' in metadata:
312 | # if isinstance(metadata['subject'], list):
313 | # pdfmeta['keywords'] = pdfmeta['keywords'] + metadata['subject']
314 | # else:
315 | # pdfmeta['keywords'] = pdfmeta['keywords'] + [metadata['subject']]
316 |
317 | if 'isbn' in metadata:
318 | if isinstance(metadata['isbn'], list):
319 | pdfmeta['keywords'] = pdfmeta['keywords'] + metadata['isbn']
320 | else:
321 | pdfmeta['keywords'] = pdfmeta['keywords'] + [metadata['isbn']]
322 |
323 |
324 | # if 'date' in metadata:
325 | # if isinstance(metadata['date'], list):
326 | # pdfmeta['keywords'] = pdfmeta['keywords'] + metadata['date']
327 | # else:
328 | # pdfmeta['keywords'] = pdfmeta['keywords'] + [metadata['date']]
329 |
330 | return pdfmeta
331 |
332 |
333 | # function to patch DPI values in all jpeg files
334 |
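  | # In a JFIF APP0 header the "JFIF" identifier sits at byte offsets 6..9, the
  | # density-units byte at offset 13 (1 = dots per inch) and the big-endian X/Y
  | # densities at offsets 14..17; those last five bytes are what get patched below.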
335 | def Patch_DPI(images):
336 | print('Changing DPI settings for all images to 300dpi')
337 |
338 |     res_bytes = (300).to_bytes(2, byteorder='big') # 300 as a big-endian 16-bit value
339 |     inches_value = bytearray([1]) # density units byte: 1 = dots per inch
340 |     total_patch = inches_value + res_bytes + res_bytes # units + X density + Y density
341 |
342 | with tqdm(total=len(images)) as t:
343 | for myimage in images:
344 | if os.path.isfile(myimage):
345 | with open(myimage,'r+b') as f:
346 | f.seek(6) # jump to JFIF
347 | jfif_signature = f.read(4)
348 | if jfif_signature == b'JFIF':
349 | f.seek(13) # jump to resolution values
350 |                         if f.read(len(total_patch)) != total_patch: # patch only when the stored values differ
351 | f.seek(13) # jump to resolution values
352 | f.write(total_patch)
353 | t.update(1)
354 |
355 |
356 | # start of main body
357 |
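  | # Overall flow: read the config, merge in URLs from the clipboard, log in, then
  | # for every book: borrow it, fetch title/metadata/page links, download the pages,
  | # optionally build a PDF, and finally return the loan.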
358 | if __name__ == "__main__":
359 |
360 | print("Archive Downloader 2024.11.3")
361 |
362 | if len(sys.argv) == 1:
363 |         print(r"Note that you can specify a configuration file as a parameter, e.g. AD C:\Path\To\MyConfig.txt")
364 |
365 | myfile = 'ad.txt'
366 |
367 |     # use a custom configuration file if supplied
368 |
369 | if len(sys.argv) == 2:
370 | myfile = sys.argv[1]
371 |
372 | if not os.path.isfile(myfile):
373 | print("Can't find configuration file, exiting!")
374 | sys.exit()
375 |
376 | process_config(myfile)
377 |
378 | if not os.path.isdir(d):
379 |         print("Output directory does not exist!")
380 | sys.exit()
381 |
382 | clipboard_list = get_clipboard_content()
383 |
384 | for clip_url in clipboard_list:
385 |         if clip_url.startswith("https://archive.org/details/") and clip_url not in urls:
386 | urls.append(clip_url)
387 |
388 | books = []
389 |
390 | # Check the urls format
391 | for url in urls:
392 | if url.startswith("https://archive.org/details/"):
393 | book_id = list(filter(None, url.split("/")))[3]
394 | books.append((book_id, url))
395 | elif len(url.split("/")) == 1:
396 | books.append((url, "https://archive.org/details/" + url))
397 |
398 | if len(books)==0:
399 |         print("No valid book URLs to download, exiting!")
400 | sys.exit()
401 | else:
402 | print(f"{len(books)} Book(s) will be downloaded")
403 |
404 |
405 | session = login(email, password)
406 |
407 | for book in books:
408 | book_id = book[0]
409 | url = book[1]
410 | print("="*40)
411 | print(f"Current book: https://archive.org/details/{book_id}")
412 | session = loan(session, book_id)
413 | title, links, metadata = get_book_infos(session, url)
414 |
415 | directory = os.path.join(d, title) # "\\\\?\\" + os.path.join(d, title)
416 |
417 | if not os.path.isdir(directory):
418 | os.makedirs(directory)
419 |
420 | if 'title' in metadata:
421 | print("Current book title: "+ titlecase(metadata['title']))
422 |
423 | images = download(session, n_threads, directory, links, scale, book_id)
424 |
425 | if editingprep != "none":
426 | Patch_DPI(images)
427 |
428 |         # create all the working subdirectories (only needed in this editing-preparation mode)
429 |
430 | if not os.path.isdir(os.path.join(directory, "Meta")):
431 | os.makedirs(os.path.join(directory, "Meta"))
432 | if not os.path.isdir(os.path.join(directory, "Cover")):
433 | os.makedirs(os.path.join(directory, "Cover"))
434 | if not os.path.isdir(os.path.join(directory, os.path.join("Cover","Proc"))):
435 | os.makedirs(os.path.join(directory, os.path.join("Cover","Proc")))
436 | if not os.path.isdir(os.path.join(directory, "Empty")):
437 | os.makedirs(os.path.join(directory, "Empty"))
438 | if not os.path.isdir(os.path.join(directory, "Proc")):
439 | os.makedirs(os.path.join(directory, "Proc"))
440 | if not os.path.isdir(os.path.join(directory, "Proj")):
441 | os.makedirs(os.path.join(directory, "Proj"))
442 | if not os.path.isdir(os.path.join(directory, "Illust")):
443 | os.makedirs(os.path.join(directory, "Illust"))
444 |
445 |
446 | if outtype in ("pdf","jpgpdf","jpgpdfmeta","jpgepub"): # any modes that require creation of PDF or EPUB file
447 | import img2pdf
448 |
449 | pdfmeta = make_pdf_metadata(metadata)
450 |
451 | if outtype=="jpgpdfmeta":
452 | pdf = img2pdf.convert(images[0], **pdfmeta)
453 |
454 | # In preparation mode we place meta PDF in special folder and move first and last image into cover folder
455 |
456 | if editingprep != "none":
457 | make_pdf(pdf, title[:25], os.path.join(directory, "Meta")) # shorten title in case of meta to avoid too long paths
458 | else:
459 | make_pdf(pdf, title, d)
460 | else:
461 | pdf = img2pdf.convert(images, **pdfmeta)
462 | make_pdf(pdf, title, d)
463 |
464 | # move first and last images into separate Covers folder
465 |
466 | if editingprep != "none":
467 | image1 = os.path.split(images[0]) # first one
468 | image2 = os.path.split(images[-1]) # last one
469 | image1_path = os.path.join(os.path.join(image1[0],"Cover"), image1[1])
470 | image2_path = os.path.join(os.path.join(image2[0],"Cover"), image2[1])
471 | if not os.path.exists(image1_path):
472 | os.rename(images[0], image1_path)
473 | else:
474 | os.remove(images[0])
475 | if not os.path.exists(image2_path):
476 | os.rename(images[-1], image2_path)
477 | else:
478 | os.remove(images[-1])
479 |
480 | if outtype=="pdf":
481 | try:
482 | shutil.rmtree(directory)
483 | except OSError as e:
484 |                 print("Error: %s - %s." % (e.filename, e.strerror))
485 |
486 | return_loan(session, book_id)
--------------------------------------------------------------------------------