├── .gitignore ├── LICENSE ├── README.md ├── api.py ├── archive.png ├── explorer.png ├── requirements.txt ├── ripper.py └── screenshot.png /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | # pytype static type analyzer 137 | .pytype/ 138 | 139 | # Cython debug symbols 140 | cython_debug/ 141 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 James Shiffer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Archive.org Ripper 2 | 3 | This script lets you download books page-by-page from [archive.org](https://archive.org) in the event that there is no PDF link. Any book with a <14 day loan period is like this, as you can see: 4 | 5 | ![](./archive.png) 6 | 7 | The script needs your login credentials to borrow the book, then it will run on its own using your session. 8 | 9 | Do not use this program in an illegal manner. Thanks! 10 | 11 | ## Screenshots 12 | 13 | ![](./screenshot.png) 14 | ![](./explorer.png) 15 | 16 | ## Dependencies 17 | 18 | Install the Python dependencies (creating a virtual environment first is recommended): 19 | 20 | ```sh 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | ## Usage 25 | 26 | Run `python ripper.py -h` to get a list of all possible command line options. 27 | 28 | ## Planned Features 29 | 30 | - Searching for books instead of inputting id directly 31 | 32 | - GUI 33 | -------------------------------------------------------------------------------- /api.py: -------------------------------------------------------------------------------- 1 | # api.py 2 | # Copyright (c) 2020 James Shiffer 3 | # This file contains all the API calls made to archive.org. 4 | 5 | import base64, json, logging, lxml.etree, re, requests, sched, time 6 | 7 | from Crypto.Cipher import AES 8 | from Crypto.Hash import SHA1 9 | from Crypto.Util import Counter 10 | 11 | 12 | class ArchiveReaderClient: 13 | 14 | def __init__(self): 15 | self.session = requests.Session() 16 | self.logged_in = False 17 | self.book_id = None 18 | self.book_meta = {} 19 | self.book_page_urls = [] 20 | self.token = None 21 | self.URL_FORMAT = 'https://archive.org/%s' 22 | self.timer = sched.scheduler(time.time, time.sleep) 23 | 24 | 25 | def borrow_book(self, book_id): 26 | """ 27 | Borrows a book. You should use the scheduler instead of calling this 28 | method directly. 29 | """ 30 | logging.debug('attempting to borrow book') 31 | 32 | # borrowing is done in two phases: 'browse_book' and 'grant_access' 33 | self.book_id = book_id 34 | url = self.URL_FORMAT % 'services/loans/loan/' 35 | res = self.session.post(url, { 36 | 'action': 'browse_book', 37 | 'identifier': book_id 38 | }) 39 | js = res.json() 40 | if 'success' not in js: 41 | err = js['error'] if 'error' in js else 'unknown error' 42 | logging.error('error with action browse_book: %s' % err) 43 | raise AssertionError 44 | 45 | url = self.URL_FORMAT % 'services/loans/loan/searchInside.php' 46 | res = self.session.post(url, { 47 | 'action': 'grant_access', 48 | 'identifier': book_id 49 | }) 50 | js = res.json() 51 | if 'success' not in js: 52 | err = js['error'] if 'error' in js else 'unknown error' 53 | logging.error('error with action grant_access: %s' % err) 54 | raise AssertionError 55 | else: 56 | logging.debug('received book token: %s' % js['value']) 57 | self.token = js['value'] 58 | 59 | 60 | def renew_book(self): 61 | """ 62 | Renews a loaned book, which must be borrowed before calling this method. 63 | You should use the scheduler instead of calling this directly. 64 | """ 65 | if not self.book_id: 66 | logging.error('no book_id; you need to borrow a book first.') 67 | raise AssertionError 68 | 69 | logging.debug('attempting to renew book') 70 | url = self.URL_FORMAT % 'services/loans/loan/' 71 | res = self.session.post(url, { 72 | 'action': 'create_token', 73 | 'identifier': self.book_id 74 | }) 75 | js = res.json() 76 | if 'success' not in js: 77 | err = js['error'] if 'error' in js else 'unknown error' 78 | logging.error('error renewing book: %s' % err) 79 | raise AssertionError 80 | else: 81 | logging.debug('renewed book token: %s' % js['token']) 82 | self.token = js['token'] 83 | 84 | 85 | def schedule_renew_book(self): 86 | """ 87 | Performs one renewal and schedules the next one for two minutes in the future. 88 | """ 89 | logging.debug('time is %d, time to renew book again' % time.time()) 90 | self.renew_book() 91 | self.timer.enter(120, 1, self.schedule_renew_book) 92 | 93 | 94 | def schedule_loan_book(self, book_id): 95 | """ 96 | Borrows a book and then automatically renews it every two minutes. 97 | """ 98 | # first, borrow & renew the book once 99 | logging.debug('scheduler running borrow/renew for the first time') 100 | self.borrow_book(book_id) 101 | self.schedule_renew_book() 102 | 103 | 104 | def fetch_book_metadata(self): 105 | """ 106 | Finds the book metadata, including book title and page URLs, and 107 | returns the page count. 108 | """ 109 | if not self.book_id: 110 | logging.error('no book_id; you need to borrow a book first.') 111 | raise AssertionError 112 | 113 | # archive.org has an endpoint for getting book metadata but its url 114 | # is hidden in inline js 115 | res = self.session.get(self.URL_FORMAT % ('details/' + self.book_id)) 116 | root = lxml.etree.HTML(res.text) 117 | reader_data = root.find('.//input[@class="js-bookreader"]').get('value') 118 | reader = json.loads(reader_data) 119 | if 'url' not in reader: 120 | logging.error('bookreader metadata is missing URL field') 121 | raise AssertionError 122 | 123 | # call the endpoint and voila, we have all the info we could ever 124 | # want about our book. 125 | res = self.session.get('https:' + reader['url']) 126 | js = res.json() 127 | if 'data' not in js: 128 | logging.error('expected data in JSIA response but got none') 129 | raise AssertionError 130 | self.book_meta = js['data'] 131 | logging.debug('title: %s, imagecount: %s' % ( 132 | self.book_meta['metadata']['title'], 133 | self.book_meta['metadata']['imagecount'] 134 | )) 135 | 136 | # we only really need a list of the pages' urls 137 | flattened = [pages for spreads in \ 138 | self.book_meta['brOptions']['data'] for pages in spreads] 139 | self.book_page_urls = list(map(lambda p: p['uri'], flattened)) 140 | return len(self.book_page_urls) 141 | 142 | 143 | def download_page(self, i, scale=0): 144 | """ 145 | Downloads a single page of a book. Call fetch_book_metadata() first. 146 | """ 147 | if not self.book_meta: 148 | logging.error('no book_meta; you must fetch the metadata first.') 149 | raise AssertionError 150 | 151 | if i < 0 or i >= len(self.book_page_urls): 152 | logging.error('page index out of range') 153 | raise IndexError 154 | 155 | img_url = self.book_page_urls[i] + "&scale=%d&rotate=0" % scale 156 | res = self.session.get(img_url, headers={ 157 | 'referer': self.URL_FORMAT % ('details/' + self.book_id) 158 | }) 159 | 160 | image_data = res.content 161 | obfuscation_key = res.headers.get('x-obfuscate') 162 | if obfuscation_key: 163 | version, counter = obfuscation_key.split('|') 164 | if version != '1': 165 | raise AssertionError('obfuscation version %s not supported' % version) 166 | 167 | counter = base64.b64decode(counter) 168 | aes_key = re.sub(r'https?:\/\/.*?\/', '/', img_url) 169 | 170 | # decrypt first 1024 bytes of image data 171 | decrypted_part = self.decrypt(image_data[:1024], aes_key, counter) 172 | # replace the first 1024 bytes of image data with the decrypted data 173 | decrypted_buffer = bytearray(image_data) 174 | decrypted_buffer[:1024] = decrypted_part 175 | image_data = bytes(decrypted_buffer) 176 | 177 | return image_data 178 | 179 | 180 | def decrypt(self, data: bytes, aes_key: str, counter: bytes): 181 | """ 182 | Decrypts the data with AES-CTR using the given key. 183 | """ 184 | # hash the AES key using SHA-1 185 | sha1 = SHA1.new() 186 | sha1.update(aes_key.encode('utf-8')) 187 | aes_key_hashed = sha1.digest()[:16] # use first 16 bytes 188 | 189 | # create a counter of length 64-bit 190 | ctr = Counter.new(64, prefix=counter[:8], initial_value=int.from_bytes(counter[8:], 'big')) 191 | 192 | # create AES cipher in CTR mode 193 | cipher = AES.new(aes_key_hashed, AES.MODE_CTR, counter=ctr) 194 | 195 | # decrypt the buffer 196 | plaintext = cipher.decrypt(data) 197 | 198 | return plaintext 199 | 200 | 201 | def login(self, email, password): 202 | """ 203 | Logs a user in to their archive.org account. 204 | """ 205 | # get cookies 206 | self.session.get(self.URL_FORMAT % 'account/login') 207 | 208 | res = self.session.post(self.URL_FORMAT % 'account/login', { 209 | 'username': email, 210 | 'password': password, 211 | 'remember': True, 212 | 'referer': self.URL_FORMAT % '', 213 | 'login': True, 214 | 'submit_by_js': True 215 | }, headers={ 216 | 'referer': self.URL_FORMAT % 'account/login' 217 | }) 218 | js = res.json() 219 | if js['status'] != 'ok': 220 | logging.error('login responded with status %s, message %s' % \ 221 | (js['status'], js['message'])) 222 | raise AssertionError 223 | else: 224 | logging.debug('user has logged in successfully') 225 | -------------------------------------------------------------------------------- /archive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scoliono/archiveripper/40914d721ed902f7b2c87957e6264e485d46e539/archive.png -------------------------------------------------------------------------------- /explorer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scoliono/archiveripper/40914d721ed902f7b2c87957e6264e485d46e539/explorer.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2025.4.26 2 | charset-normalizer==3.4.2 3 | Deprecated==1.2.18 4 | idna==3.10 5 | img2pdf==0.6.1 6 | lxml==5.4.0 7 | packaging==25.0 8 | pikepdf==9.7.0 9 | pillow==11.2.1 10 | pycryptodome==3.23.0 11 | requests==2.32.3 12 | tqdm==4.67.1 13 | urllib3==2.4.0 14 | wrapt==1.17.2 15 | -------------------------------------------------------------------------------- /ripper.py: -------------------------------------------------------------------------------- 1 | # ripper.py 2 | # Copyright (c) 2020 James Shiffer 3 | # This file contains the main application logic. 4 | 5 | import argparse, api, getpass, img2pdf, logging, os, sys, tqdm 6 | 7 | 8 | def main(): 9 | client = api.ArchiveReaderClient() 10 | logging.basicConfig(level=logging.INFO) 11 | 12 | # Parse book id and credentials 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('id', nargs='?', 15 | help='Look for the book\'s identifier (the part of the url immediately after "https://archive.org/details/").') 16 | parser.add_argument('-u', '--username', help='Your archive.org account\'s email.') 17 | parser.add_argument('-p', '--password', help='Your archive.org account\'s password') 18 | parser.add_argument('-a', '--all-pages', action='store_true', help='Download every page of the book') 19 | parser.add_argument('-s', '--page-start', type=int, help='Download pages starting at page number N and ending at the book\'s last page, or a range if --page-end has been specified') 20 | parser.add_argument('-e', '--page-end', type=int, help='End of the range of page numbers to download') 21 | parser.add_argument('-f', '--format', choices=['pdf'], help='Converts the individual pages into a single file. Currently only PDF is supported') 22 | parser.add_argument('-d', '--output-dir', help='Directory you want the pages to be written to. If undefined the directory will be named the book id') 23 | parser.add_argument('-S', '--scale', default=0, type=int, help='Image resolution of the pages requested, can save bandwidth if the best image quality isn\'t necessary. Higher integers mean smaller resolution, default is 0 (no downscaling)') 24 | args = parser.parse_args() 25 | 26 | id = args.id 27 | username = args.username 28 | password = args.password 29 | 30 | #If any of the credentials isn't specified with cmdline args ask for it interactively 31 | if not args.id: 32 | print('Look for the book\'s identifier (the part of the url immediately after "https://archive.org/details/").') 33 | id = input('Enter it here: ') 34 | logging.debug('received book ID: %s' % id) 35 | if not args.username: 36 | username = input('Enter your archive.org email: ') 37 | if not args.password: 38 | password = getpass.getpass('Enter your archive.org password: ') 39 | 40 | 41 | logging.debug('attempting login with user-supplied credentials') 42 | client.login(username, password) 43 | 44 | logging.debug('attempting to start scheduler') 45 | client.schedule_loan_book(id) 46 | 47 | if not args.output_dir: 48 | dirname = './' + id 49 | else: 50 | dirname = os.path.expanduser(args.output_dir) 51 | 52 | logging.debug('creating output dir "%s"' % dirname) 53 | if os.path.isdir(dirname): 54 | response = input('Output folder %s already exists. Continue? ' \ 55 | % dirname) 56 | if not response.lower().startswith('y'): 57 | return 58 | else: 59 | os.mkdir(dirname) 60 | 61 | page_count = client.fetch_book_metadata() 62 | 63 | start = 0 64 | end = page_count 65 | 66 | if not args.all_pages: 67 | if not args.page_start and not args.page_end: 68 | print('The book is %d pages long. Which pages do you want?' % page_count) 69 | desired_pages = input('Enter a range (eg. 1-15) or leave blank for all: ') 70 | 71 | if desired_pages: 72 | [start, end] = desired_pages.split('-') 73 | start = int(start) - 1 74 | end = int(end) 75 | else: 76 | if args.page_start: start = args.page_start - 1 77 | if args.page_end: end = args.page_end 78 | 79 | logging.debug('planning on fetching pages %d thru %d' % (start, end)) 80 | 81 | total = end - start 82 | for i in tqdm.tqdm(range(start, end)): 83 | logging.debug('downloading page %d (index %d)' % (i + 1, 84 | i)) 85 | contents = client.download_page(i, args.scale) 86 | with open('%s/%d.jpg' % (dirname, i + 1), 'wb') as file: 87 | file.write(contents) 88 | done_count = i + 1 - start 89 | 90 | if args.format == 'pdf': 91 | print('converting images to pdf') 92 | img_list = ['%s/%d.jpg' % (dirname, i + 1) for i in range(start, end)] 93 | with open('%s.pdf' % dirname, 'wb') as pdf: 94 | pdf_data = img2pdf.convert(img_list) 95 | pdf.write(pdf_data) 96 | print('wrote %d pages to %s.pdf' % (total, dirname)) 97 | logging.debug('removing temporary image files') 98 | for img_name in img_list: 99 | os.remove(img_name) 100 | try: 101 | os.rmdir(dirname) 102 | print('removed output dir %s' % dirname) 103 | except OSError: 104 | logging.warning('unable to remove output dir %s; it may not be empty' % dirname) 105 | 106 | print('done') 107 | 108 | if __name__ == '__main__': 109 | main() 110 | -------------------------------------------------------------------------------- /screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scoliono/archiveripper/40914d721ed902f7b2c87957e6264e485d46e539/screenshot.png --------------------------------------------------------------------------------