├── ad.bat
├── .gitattributes
├── requirements.txt
├── ad.txt
├── README.md
└── ad.py
/ad.bat:
--------------------------------------------------------------------------------
1 | python ad.py
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | tqdm
3 | img2pdf
4 | pywin32
5 | titlecase
6 | pyperclip
--------------------------------------------------------------------------------
/ad.txt:
--------------------------------------------------------------------------------
1 | # Your archive.org email
2 | billgates@microsoft.com
3 | # Your archive.org password
4 | toolazy
5 | # Output directory [default - current]
6 | # Image resolution (10 to 0, 0 is the highest), [default 0]
7 | 0
8 | # Maximum number of threads, [default 10]
9 | 10
10 | # Type of output - jpg, pdf, jpgpdf, jpgpdfmeta [default - jpg]
11 | jpgpdfmeta
12 | # Preparation for editing
13 | none
14 | # Book URLs or only IDs
15 | https://archive.org/details/billgatesbiograp0000becr
16 | https://archive.org/details/billgates00woog
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AD - Archive.org-Downloader
2 |
3 |
4 | Python 3 script and ready-to-run Windows tool to download archive.org books in PDF format!
5 |
6 |
7 |
8 | ## About
9 |
10 | Lets you download books locally from https://archive.org!
11 |
12 | It can download the original JPG files for further processing, join them into a single PDF without quality loss, or produce a special PDF with cover and metadata.
13 |
14 | The tool also has instant clipboard support and will pick up an individual book URL, or a list of them, from the clipboard.
15 |
16 | You must create an account on https://archive.org/ for the tool to work.
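  |
  | Both full book URLs and bare archive.org identifiers are accepted, for example:
  |
  | ```
  | https://archive.org/details/billgates00woog
  | billgates00woog
  | ```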
17 |
18 | ## Getting Started
19 |
20 | You can install Python, run "pip install -r requirements.txt", and use the tool on any platform.
21 |
22 | Or, if you are on Windows, you can download the ready-to-run EXE tool.
23 |
24 | Download the ZIP archive with AD.exe and the settings file from the project's Releases page, create a folder, and unpack the archive there; you don't need to install anything else.
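  |
  | For example, a from-source setup looks like this (assuming Python 3 and pip are on your PATH):
  |
  | ```
  | pip install -r requirements.txt
  | python ad.py
  | ```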
25 |
26 | ## Usage
27 |
28 | Open the ad.txt settings file in an editor, read the comments, and put your information in the proper places.
29 |
30 | Save the settings file and run the tool; it will pick up all the new settings.
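  |
  | A minimal ad.txt follows the template shipped with the tool (the values below are placeholders):
  |
  | ```
  | # Your archive.org email
  | you@example.com
  | # Your archive.org password
  | yourpassword
  | # Type of output - jpg, pdf, jpgpdf, jpgpdfmeta [default - jpg]
  | pdf
  | # Book URLs or only IDs
  | https://archive.org/details/billgates00woog
  | ```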
31 |
32 | ## Notes
33 |
34 | The tool is based on the original Archive.org-Downloader script, with many feature additions.
--------------------------------------------------------------------------------
/ad.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import random, string
3 | from concurrent import futures
4 | from tqdm import tqdm
5 | import time
6 | from datetime import datetime
7 | import os
8 | import sys
9 | import shutil
10 | from titlecase import titlecase
11 | import pyperclip
12 |
13 | def display_error(response, message):
14 | print(message)
15 | print(response)
16 | print(response.text)
17 | sys.exit()
18 |
19 |
20 | # Request all information about the book from the site: title, metadata and links to the page images
21 |
22 | def get_book_infos(session, url):
23 | r = session.get(url).text
24 | infos_url = "https:" + r.split('"url":"')[1].split('"')[0].replace("\\u0026", "&")
25 | response = session.get(infos_url)
26 | data = response.json()['data']
27 |     title = titlecase(data['brOptions']['bookTitle']) # titlecase handles more cases than string.capwords, but only for English titles
28 |     title = ''.join(c for c in title if c not in '<>:"/\\|?*') # filter characters forbidden in directory names (Windows & Linux)
29 |     title = title[:150] + " " + url.split('/')[4] # trim the title to avoid over-long file names and append the book ID to keep it unique
30 | metadata = data['metadata']
31 | links = []
32 | for item in data['brOptions']['data']:
33 | for page in item:
34 | links.append(page['uri'])
35 |
36 | if len(links) > 1:
37 | print(f"[+] This book has {len(links)} pages")
38 | return title, links, metadata
39 | else:
40 |         print("[-] Error while getting links to images of the pages!")
41 |         sys.exit() # TODO: should raise an exception rather than exit, so one failed book doesn't abort a multi-book run
42 |
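  | # Build a raw multipart/form-data body from a dict of fields, using content_type
  | # as the boundary marker (not currently called anywhere in this script; kept for
  | # requests that need a hand-built multipart body).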
43 | def format_data(content_type, fields):
44 | data = ""
45 | for name, value in fields.items():
46 | data += f"--{content_type}\x0d\x0aContent-Disposition: form-data; name=\"{name}\"\x0d\x0a\x0d\x0a{value}\x0d\x0a"
47 | data += content_type+"--"
48 | return data
49 |
50 | def login(email, password):
51 | session = requests.Session()
52 | session.get("https://archive.org/account/login")
53 | data = {"username":email, "password":password}
54 |
55 | response = session.post("https://archive.org/account/login", data=data)
56 | if "bad_login" in response.text:
57 | print("[-] Wrong email or password, please check!")
58 | sys.exit()
59 | elif "Successful login" in response.text:
60 | print("[+] Successfully logged in!")
61 | return session
62 | else:
63 | display_error(response, "[-] Error while logging in:")
64 |
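  | # Borrow the book for the logged-in account, mirroring what the website does:
  | # "browse_book" opens a browsing session, then "create_token" fetches the access
  | # token that authorizes the page downloads. Freely readable books answer with
  | # HTTP 400 and need no loan at all.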
65 | def loan(session, book_id, verbose=True):
66 | data = {
67 | "action": "grant_access",
68 | "identifier": book_id
69 | }
70 | # 2022-07-03: This request is done by the website but we don't need to do it here.
71 | # response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data)
72 | data['action'] = "browse_book"
73 | response = session.post("https://archive.org/services/loans/loan/", data=data)
74 |
75 |     if response.status_code == 400:
76 |         if response.json()["error"] == "This book is not available to borrow at this time. Please try again later.":
77 |             print("This book doesn't need to be borrowed")
78 |             return session
79 |         else:
80 |             display_error(response, "Something went wrong when trying to borrow the book.")
81 |
82 | data['action'] = "create_token"
83 | response = session.post("https://archive.org/services/loans/loan/", data=data)
84 |
85 | if "token" in response.text:
86 | if verbose:
87 | print("[+] Successfully loaned this book for one hour")
88 | return session
89 | else:
90 | display_error(response, "Something went wrong when trying to borrow the book, maybe you can't borrow this book.")
91 |
92 | # Return the loan on the selected book id
93 |
94 | def return_loan(session, book_id):
95 | data = {
96 | "action": "return_loan",
97 | "identifier": book_id
98 | }
99 | response = session.post("https://archive.org/services/loans/loan/", data=data)
100 | if response.status_code == 200 and response.json()["success"]:
101 | print("[+] Book returned")
102 | else:
103 |         display_error(response, "Something went wrong when trying to return the book") # NOTE: display_error exits; when downloading multiple books this should not abort the whole run
104 |
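  | # Build a zero-padded page file name so the images sort correctly on disk,
  | # e.g. page 7 of a 250-page book becomes "<book_id>_007.jpg".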
105 | def image_name(pages, page, directory, book_id):
106 | return f"{directory}/{book_id}_{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"
107 |
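  | # Download a single page image, retrying until it succeeds: a 403 means the
  | # one-hour loan has expired, so the loan is renewed and the request retried;
  | # any other failure waits a second and tries again. The image is written to a
  | # .tmp file and then renamed, so an interrupted run never leaves a truncated .jpg.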
108 | def download_one_image(session, link, i, directory, book_id, pages):
109 | image = image_name(pages, i, directory, book_id)
110 | if not os.path.exists(image):
111 | headers = {
112 | "Referer": "https://archive.org/",
113 | "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
114 | "Sec-Fetch-Site": "same-site",
115 | "Sec-Fetch-Mode": "no-cors",
116 | "Sec-Fetch-Dest": "image",
117 | }
118 | retry = True
119 | while retry:
120 | try:
121 | response = session.get(link, headers=headers)
122 | if response.status_code == 403:
123 | session = loan(session, book_id, verbose=False)
124 | raise Exception("Borrow again")
125 | elif response.status_code == 200:
126 | retry = False
127 | except KeyboardInterrupt:
128 | raise
129 | except:
130 | time.sleep(1) # Wait 1 second before retrying
131 |
132 | tmpimage = image.replace(".jpg",".tmp")
133 | with open(tmpimage,"wb") as f:
134 | f.write(response.content)
135 | os.rename(tmpimage, image)
136 |
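  | # Download all pages concurrently; "scale" is archive.org's size-reduction
  | # factor (0 = full resolution, larger values = smaller images) and is appended
  | # to every page URL together with rotate=0.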
137 | def download(session, n_threads, directory, links, scale, book_id):
138 | print("Downloading pages...")
139 | links = [f"{link}&rotate=0&scale={scale}" for link in links]
140 | pages = len(links)
141 |
142 | tasks = []
143 | with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
144 |         for i, link in enumerate(links):
145 |             # enumerate avoids a quadratic links.index() lookup and stays correct if the same link appears twice
146 |             tasks.append(executor.submit(download_one_image, session=session, link=link, i=i, directory=directory, book_id=book_id, pages=pages))
147 | for task in tqdm(futures.as_completed(tasks), total=len(tasks)):
148 | pass
149 |
150 | images = [image_name(pages, i, directory, book_id) for i in range(len(links))]
151 | return images
152 |
153 |
154 | # make single PDF file from multiple JPEG files
155 |
156 | def make_pdf(pdf, title, directory):
157 |
158 | file = title+".pdf"
159 |
160 | # Write only if file does not exist
161 |
162 | if not os.path.isfile(os.path.join(directory, file)):
163 | with open(os.path.join(directory, file),"wb") as f:
164 | f.write(pdf)
165 | print(f"[+] PDF saved as \"{file}\"")
166 | else:
167 | print(f"[-] PDF file \"{file}\" already present on disk")
168 |
169 | # set all config values
170 |
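  | # The config is a plain-text file: every line starting with "#" is a section
  | # header that switches the parser into the matching mode, and the non-comment
  | # line(s) that follow are the value(s) for that section. A header with no
  | # value line keeps the built-in default.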
171 | def process_config(config_file_name):
172 |
173 | global email, password, scale, n_threads, d, outtype, urls, editingprep
174 |
175 | file1 = open(config_file_name, 'r')
176 |
177 | # set default values
178 |
179 | email = "none"
180 | password = "none"
181 | scale = 0
182 | n_threads = 10
183 | d = os.getcwd()
184 | outtype = "jpg"
185 | urls = []
186 |
187 | editingprep = 'none'
188 |
189 | # Set current parsing mode to none
190 |
191 | mode_pars = "none"
192 |
193 | while True:
194 | # Get next line from file
195 | line = file1.readline()
196 | # if line is empty
197 | # end of file is reached
198 | if not line:
199 | break
200 |         line = line.strip('\n ') # remove accidental spaces and the trailing newline
  |         if not line:
  |             continue # a blank line would otherwise overwrite the current setting with an empty string
201 | if line.find("#") == 0:
202 | if line.find("# Your archive.org email")==0:
203 | mode_pars = "email"
204 | if line.find("# Your archive.org password")==0:
205 | mode_pars = "password"
206 | if line.find("# Output directory")==0:
207 | mode_pars = "outdir"
208 | if line.find("# Image resolution")==0:
209 | mode_pars = "resolution"
210 | if line.find("# Maximum number of threads")==0:
211 | mode_pars = "threads"
212 | if line.find("# Type of output - jpg")==0:
213 | mode_pars = "outtype"
214 | if line.find("# Folder and file naming")==0:
215 | mode_pars = "naming"
216 | if line.find("# Preparation for editing")==0:
217 | mode_pars = "prep"
218 |
219 | if line.find("# Book URLs")==0:
220 | mode_pars = "urls"
221 | continue
222 | if mode_pars == "email":
223 | email = line
224 | if mode_pars == "password":
225 | password = line
226 | if mode_pars == "outdir":
227 | d = line
228 | if mode_pars == "prep":
229 | editingprep = line
230 | if mode_pars == "resolution":
231 | scale = int(line)
232 | if mode_pars == "threads":
233 | n_threads = int(line)
234 | if mode_pars == "outtype":
235 | outtype = line
236 |
237 | if mode_pars == "urls":
238 | urls.append(line.strip())
239 | else:
240 | mode_pars = "none"
241 |
242 | file1.close()
243 |
244 | def get_clipboard_content():
245 | clipboard_cont = ""
246 |
247 |     clipboard_cont = pyperclip.paste() # cross-platform clipboard read
248 |     pyperclip.copy('') # clear the clipboard so the same URLs aren't picked up again on the next run
249 |
250 | return clipboard_cont.splitlines()
251 |
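  | # Map archive.org item metadata onto the keyword arguments that img2pdf.convert()
  | # accepts for PDF metadata: 'title' -> title, 'creator'/'associated-names' -> author,
  | # 'date' -> creationdate/moddate, and the source URL plus ISBNs -> keywords.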
252 | def make_pdf_metadata(metadata):
253 | # prepare PDF metadata
254 |     # keywords are in 'subject'
255 |     # ISBN comes from metadata['isbn'], e.g. ['9780981803982', '0981803989']
256 |     # other fields look like 'creator': 'Kingsley, Eve', 'date': '2008'
257 |     # sometimes archive metadata fields are missing entirely
258 | pdfmeta = { }
259 | # ensure metadata are str
260 | for key in ["title", "creator", "associated-names"]:
261 | if key in metadata:
262 | if isinstance(metadata[key], str):
263 | pass
264 | elif isinstance(metadata[key], list):
265 | metadata[key] = "; ".join(metadata[key])
266 | else:
267 | raise Exception("unsupported metadata type")
268 | # title
269 | if 'title' in metadata:
270 | pdfmeta['title'] = titlecase(metadata['title'])
271 |
272 |     # author: names sometimes arrive as "Rayan, Jack" and must be flipped to "Jack Rayan"
273 |
274 | authors_list = ""
275 |
276 | if 'creator' in metadata:
277 | authors_list = metadata['creator']
278 | if 'associated-names' in metadata:
279 | if not authors_list == "":
280 | authors_list = authors_list + ";"
281 | authors_list = authors_list + metadata['associated-names']
282 |
283 | authors_split = authors_list.split(";")
284 | authors_list = ""
285 |
286 |     for author in authors_split:
287 |         author_res = ""
288 |         for ch in author:
289 |             if ch not in ['0','1','2','3','4','5','6','7','8','9','-']:
290 |                 author_res += ch # drop digits and hyphens, e.g. life dates like "1955-"
291 | if not author_res.find(",")==-1:
292 | author_split = author_res.split(",")
293 | author_res = author_split[1].strip()+" "+author_split[0].strip()
294 | if authors_list=="":
295 | authors_list = author_res
296 | else:
297 | authors_list = authors_list + " & " + author_res
298 |
299 | pdfmeta['author'] = authors_list
300 |
301 | if 'date' in metadata:
302 | try:
303 | pdfmeta['creationdate'] = datetime.strptime("1 June " + metadata['date'], '%d %B %Y')
304 | pdfmeta['moddate'] = pdfmeta['creationdate']
305 | except:
306 | pass
307 | # keywords
308 |
309 |     pdfmeta['keywords'] = [f"https://archive.org/details/{book_id}"] # book_id is the global set by the main loop
310 |
311 | # if 'subject' in metadata:
312 | # if isinstance(metadata['subject'], list):
313 | # pdfmeta['keywords'] = pdfmeta['keywords'] + metadata['subject']
314 | # else:
315 | # pdfmeta['keywords'] = pdfmeta['keywords'] + [metadata['subject']]
316 |
317 | if 'isbn' in metadata:
318 | if isinstance(metadata['isbn'], list):
319 | pdfmeta['keywords'] = pdfmeta['keywords'] + metadata['isbn']
320 | else:
321 | pdfmeta['keywords'] = pdfmeta['keywords'] + [metadata['isbn']]
322 |
323 |
324 | # if 'date' in metadata:
325 | # if isinstance(metadata['date'], list):
326 | # pdfmeta['keywords'] = pdfmeta['keywords'] + metadata['date']
327 | # else:
328 | # pdfmeta['keywords'] = pdfmeta['keywords'] + [metadata['date']]
329 |
330 | return pdfmeta
331 |
332 |
333 | # function to patch DPI values in all jpeg files
334 |
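  | # In a JFIF APP0 header the "JFIF" identifier sits at byte offsets 6..9, the
  | # density-units byte at offset 13 (1 = dots per inch) and the big-endian X/Y
  | # densities at offsets 14..17; those last five bytes are what get patched below.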
335 | def Patch_DPI(images):
336 | print('Changing DPI settings for all images to 300dpi')
337 |
338 |     res_bytes = (300).to_bytes(2, byteorder='big') # 300 as a big-endian 16-bit value
339 |     inches_value = bytearray([1]) # density units byte: 1 = dots per inch
340 |     total_patch = inches_value + res_bytes + res_bytes # units + X density + Y density
341 |
342 | with tqdm(total=len(images)) as t:
343 | for myimage in images:
344 | if os.path.isfile(myimage):
345 | with open(myimage,'r+b') as f:
346 | f.seek(6) # jump to JFIF
347 | jfif_signature = f.read(4)
348 | if jfif_signature == b'JFIF':
349 | f.seek(13) # jump to resolution values
350 |                         if f.read(len(total_patch)) != total_patch: # patch only when the stored values differ
351 | f.seek(13) # jump to resolution values
352 | f.write(total_patch)
353 | t.update(1)
354 |
355 |
356 | # start of main body
357 |
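  | # Overall flow: read the config, merge in URLs from the clipboard, log in, then
  | # for every book: borrow it, fetch title/metadata/page links, download the pages,
  | # optionally build a PDF, and finally return the loan.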
358 | if __name__ == "__main__":
359 |
360 | print("Archive Downloader 2024.11.3")
361 |
362 | if len(sys.argv) == 1:
363 |         print(r"Note that you can specify a configuration file as a parameter, e.g. AD C:\Path\To\MyConfig.txt")
364 |
365 | myfile = 'ad.txt'
366 |
367 |     # use a custom configuration file if supplied
368 |
369 | if len(sys.argv) == 2:
370 | myfile = sys.argv[1]
371 |
372 | if not os.path.isfile(myfile):
373 | print("Can't find configuration file, exiting!")
374 | sys.exit()
375 |
376 | process_config(myfile)
377 |
378 | if not os.path.isdir(d):
379 |         print("Output directory does not exist!")
380 | sys.exit()
381 |
382 | clipboard_list = get_clipboard_content()
383 |
384 | for clip_url in clipboard_list:
385 |         if clip_url.startswith("https://archive.org/details/") and clip_url not in urls:
386 | urls.append(clip_url)
387 |
388 | books = []
389 |
390 | # Check the urls format
391 | for url in urls:
392 | if url.startswith("https://archive.org/details/"):
393 | book_id = list(filter(None, url.split("/")))[3]
394 | books.append((book_id, url))
395 | elif len(url.split("/")) == 1:
396 | books.append((url, "https://archive.org/details/" + url))
397 |
398 | if len(books)==0:
399 |         print("No valid book URLs to download, exiting!")
400 | sys.exit()
401 | else:
402 | print(f"{len(books)} Book(s) will be downloaded")
403 |
404 |
405 | session = login(email, password)
406 |
407 | for book in books:
408 | book_id = book[0]
409 | url = book[1]
410 | print("="*40)
411 | print(f"Current book: https://archive.org/details/{book_id}")
412 | session = loan(session, book_id)
413 | title, links, metadata = get_book_infos(session, url)
414 |
415 | directory = os.path.join(d, title) # "\\\\?\\" + os.path.join(d, title)
416 |
417 | if not os.path.isdir(directory):
418 | os.makedirs(directory)
419 |
420 | if 'title' in metadata:
421 | print("Current book title: "+ titlecase(metadata['title']))
422 |
423 | images = download(session, n_threads, directory, links, scale, book_id)
424 |
425 | if editingprep != "none":
426 | Patch_DPI(images)
427 |
428 |         # create all the working subdirectories (only needed in this editing-preparation mode)
429 |
430 | if not os.path.isdir(os.path.join(directory, "Meta")):
431 | os.makedirs(os.path.join(directory, "Meta"))
432 | if not os.path.isdir(os.path.join(directory, "Cover")):
433 | os.makedirs(os.path.join(directory, "Cover"))
434 | if not os.path.isdir(os.path.join(directory, os.path.join("Cover","Proc"))):
435 | os.makedirs(os.path.join(directory, os.path.join("Cover","Proc")))
436 | if not os.path.isdir(os.path.join(directory, "Empty")):
437 | os.makedirs(os.path.join(directory, "Empty"))
438 | if not os.path.isdir(os.path.join(directory, "Proc")):
439 | os.makedirs(os.path.join(directory, "Proc"))
440 | if not os.path.isdir(os.path.join(directory, "Proj")):
441 | os.makedirs(os.path.join(directory, "Proj"))
442 | if not os.path.isdir(os.path.join(directory, "Illust")):
443 | os.makedirs(os.path.join(directory, "Illust"))
444 |
445 |
446 | if outtype in ("pdf","jpgpdf","jpgpdfmeta","jpgepub"): # any modes that require creation of PDF or EPUB file
447 | import img2pdf
448 |
449 | pdfmeta = make_pdf_metadata(metadata)
450 |
451 | if outtype=="jpgpdfmeta":
452 | pdf = img2pdf.convert(images[0], **pdfmeta)
453 |
454 | # In preparation mode we place meta PDF in special folder and move first and last image into cover folder
455 |
456 | if editingprep != "none":
457 | make_pdf(pdf, title[:25], os.path.join(directory, "Meta")) # shorten title in case of meta to avoid too long paths
458 | else:
459 | make_pdf(pdf, title, d)
460 | else:
461 | pdf = img2pdf.convert(images, **pdfmeta)
462 | make_pdf(pdf, title, d)
463 |
464 | # move first and last images into separate Covers folder
465 |
466 | if editingprep != "none":
467 | image1 = os.path.split(images[0]) # first one
468 | image2 = os.path.split(images[-1]) # last one
469 | image1_path = os.path.join(os.path.join(image1[0],"Cover"), image1[1])
470 | image2_path = os.path.join(os.path.join(image2[0],"Cover"), image2[1])
471 | if not os.path.exists(image1_path):
472 | os.rename(images[0], image1_path)
473 | else:
474 | os.remove(images[0])
475 | if not os.path.exists(image2_path):
476 | os.rename(images[-1], image2_path)
477 | else:
478 | os.remove(images[-1])
479 |
480 | if outtype=="pdf":
481 | try:
482 | shutil.rmtree(directory)
483 | except OSError as e:
484 |                 print("Error: %s - %s." % (e.filename, e.strerror))
485 |
486 | return_loan(session, book_id)
--------------------------------------------------------------------------------