├── .gitignore
├── LICENSE
├── README.md
├── api.py
├── archive.png
├── explorer.png
├── requirements.txt
├── ripper.py
└── screenshot.png


/.gitignore:
--------------------------------------------------------------------------------
  1 | .vscode/
  2 | 
  3 | # Byte-compiled / optimized / DLL files
  4 | __pycache__/
  5 | *.py[cod]
  6 | *$py.class
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | share/python-wheels/
 26 | *.egg-info/
 27 | .installed.cfg
 28 | *.egg
 29 | MANIFEST
 30 | 
 31 | # PyInstaller
 32 | #  Usually these files are written by a python script from a template
 33 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 34 | *.manifest
 35 | *.spec
 36 | 
 37 | # Installer logs
 38 | pip-log.txt
 39 | pip-delete-this-directory.txt
 40 | 
 41 | # Unit test / coverage reports
 42 | htmlcov/
 43 | .tox/
 44 | .nox/
 45 | .coverage
 46 | .coverage.*
 47 | .cache
 48 | nosetests.xml
 49 | coverage.xml
 50 | *.cover
 51 | *.py,cover
 52 | .hypothesis/
 53 | .pytest_cache/
 54 | cover/
 55 | 
 56 | # Translations
 57 | *.mo
 58 | *.pot
 59 | 
 60 | # Django stuff:
 61 | *.log
 62 | local_settings.py
 63 | db.sqlite3
 64 | db.sqlite3-journal
 65 | 
 66 | # Flask stuff:
 67 | instance/
 68 | .webassets-cache
 69 | 
 70 | # Scrapy stuff:
 71 | .scrapy
 72 | 
 73 | # Sphinx documentation
 74 | docs/_build/
 75 | 
 76 | # PyBuilder
 77 | .pybuilder/
 78 | target/
 79 | 
 80 | # Jupyter Notebook
 81 | .ipynb_checkpoints
 82 | 
 83 | # IPython
 84 | profile_default/
 85 | ipython_config.py
 86 | 
 87 | # pyenv
 88 | #   For a library or package, you might want to ignore these files since the code is
 89 | #   intended to run in multiple environments; otherwise, check them in:
 90 | # .python-version
 91 | 
 92 | # pipenv
 93 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 94 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 95 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 96 | #   install all needed dependencies.
 97 | #Pipfile.lock
 98 | 
 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
100 | __pypackages__/
101 | 
102 | # Celery stuff
103 | celerybeat-schedule
104 | celerybeat.pid
105 | 
106 | # SageMath parsed files
107 | *.sage.py
108 | 
109 | # Environments
110 | .env
111 | .venv
112 | env/
113 | venv/
114 | ENV/
115 | env.bak/
116 | venv.bak/
117 | 
118 | # Spyder project settings
119 | .spyderproject
120 | .spyproject
121 | 
122 | # Rope project settings
123 | .ropeproject
124 | 
125 | # mkdocs documentation
126 | /site
127 | 
128 | # mypy
129 | .mypy_cache/
130 | .dmypy.json
131 | dmypy.json
132 | 
133 | # Pyre type checker
134 | .pyre/
135 | 
136 | # pytype static type analyzer
137 | .pytype/
138 | 
139 | # Cython debug symbols
140 | cython_debug/
141 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 James Shiffer
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Archive.org Ripper
 2 | 
 3 | This script lets you download books page-by-page from [archive.org](https://archive.org) in the event that there is no PDF link. Any book with a <14 day loan period is like this, as you can see:
 4 | 
 5 | ![](./archive.png)
 6 | 
 7 | The script needs your login credentials to borrow the book, then it will run on its own using your session.
 8 | 
 9 | Do not use this program in an illegal manner. Thanks!
10 | 
11 | ## Screenshots
12 | 
13 | ![](./screenshot.png)
14 | ![](./explorer.png)
15 | 
16 | ## Dependencies
17 | 
18 | Install the Python dependencies (creating a virtual environment first is recommended):
19 | 
20 | ```sh
21 | pip install -r requirements.txt
22 | ```
23 | 
24 | ## Usage
25 | 
26 | Run `python ripper.py -h` to get a list of all possible command line options.
27 | 
28 | ## Planned Features
29 | 
30 | - Searching for books instead of inputting id directly
31 | 
32 | - GUI
33 | 


--------------------------------------------------------------------------------
/api.py:
--------------------------------------------------------------------------------
  1 | # api.py
  2 | # Copyright (c) 2020  James Shiffer
  3 | # This file contains all the API calls made to archive.org.
  4 | 
  5 | import base64, json, logging, lxml.etree, re, requests, sched, time
  6 | 
  7 | from Crypto.Cipher import AES
  8 | from Crypto.Hash import SHA1
  9 | from Crypto.Util import Counter
 10 | 
 11 | 
 12 | class ArchiveReaderClient:
 13 | 
 14 |     def __init__(self):
 15 |         self.session = requests.Session()
 16 |         self.logged_in = False
 17 |         self.book_id = None
 18 |         self.book_meta = {}
 19 |         self.book_page_urls = []
 20 |         self.token = None
 21 |         self.URL_FORMAT = 'https://archive.org/%s'
 22 |         self.timer = sched.scheduler(time.time, time.sleep)
 23 | 
 24 | 
 25 |     def borrow_book(self, book_id):
 26 |         """
 27 |         Borrows a book. You should use the scheduler instead of calling this
 28 |         method directly.
 29 |         """
 30 |         logging.debug('attempting to borrow book')
 31 | 
 32 |         # borrowing is done in two phases: 'browse_book' and 'grant_access'
 33 |         self.book_id = book_id
 34 |         url = self.URL_FORMAT % 'services/loans/loan/'
 35 |         res = self.session.post(url, {
 36 |             'action': 'browse_book',
 37 |             'identifier': book_id
 38 |         })
 39 |         js = res.json()
 40 |         if 'success' not in js:
 41 |             err = js['error'] if 'error' in js else 'unknown error'
 42 |             logging.error('error with action browse_book: %s' % err)
 43 |             raise AssertionError
 44 | 
 45 |         url = self.URL_FORMAT % 'services/loans/loan/searchInside.php'
 46 |         res = self.session.post(url, {
 47 |             'action': 'grant_access',
 48 |             'identifier': book_id
 49 |         })
 50 |         js = res.json()
 51 |         if 'success' not in js:
 52 |             err = js['error'] if 'error' in js else 'unknown error'
 53 |             logging.error('error with action grant_access: %s' % err)
 54 |             raise AssertionError
 55 |         else:
 56 |             logging.debug('received book token: %s' % js['value'])
 57 |             self.token = js['value']
 58 | 
 59 | 
 60 |     def renew_book(self):
 61 |         """
 62 |         Renews a loaned book, which must be borrowed before calling this method.
 63 |         You should use the scheduler instead of calling this directly.
 64 |         """
 65 |         if not self.book_id:
 66 |             logging.error('no book_id; you need to borrow a book first.')
 67 |             raise AssertionError
 68 | 
 69 |         logging.debug('attempting to renew book')
 70 |         url = self.URL_FORMAT % 'services/loans/loan/'
 71 |         res = self.session.post(url, {
 72 |             'action': 'create_token',
 73 |             'identifier': self.book_id
 74 |         })
 75 |         js = res.json()
 76 |         if 'success' not in js:
 77 |             err = js['error'] if 'error' in js else 'unknown error'
 78 |             logging.error('error renewing book: %s' % err)
 79 |             raise AssertionError
 80 |         else:
 81 |             logging.debug('renewed book token: %s' % js['token'])
 82 |             self.token = js['token']
 83 | 
 84 | 
 85 |     def schedule_renew_book(self):
 86 |         """
 87 |         Performs one renewal and schedules the next one for two minutes in the future.
 88 |         """
 89 |         logging.debug('time is %d, time to renew book again' % time.time())
 90 |         self.renew_book()
 91 |         self.timer.enter(120, 1, self.schedule_renew_book)
 92 | 
 93 | 
 94 |     def schedule_loan_book(self, book_id):
 95 |         """
 96 |         Borrows a book and then automatically renews it every two minutes.
 97 |         """
 98 |         # first, borrow & renew the book once
 99 |         logging.debug('scheduler running borrow/renew for the first time')
100 |         self.borrow_book(book_id)
101 |         self.schedule_renew_book()
102 | 
103 | 
104 |     def fetch_book_metadata(self):
105 |         """
106 |         Finds the book metadata, including book title and page URLs, and
107 |         returns the page count.
108 |         """
109 |         if not self.book_id:
110 |             logging.error('no book_id; you need to borrow a book first.')
111 |             raise AssertionError
112 | 
113 |         # archive.org has an endpoint for getting book metadata but its url
114 |         # is hidden in inline js
115 |         res = self.session.get(self.URL_FORMAT % ('details/' + self.book_id))
116 |         root = lxml.etree.HTML(res.text)
117 |         reader_data = root.find('.//input[@class="js-bookreader"]').get('value')
118 |         reader = json.loads(reader_data)
119 |         if 'url' not in reader:
120 |             logging.error('bookreader metadata is missing URL field')
121 |             raise AssertionError
122 | 
123 |         # call the endpoint and voila, we have all the info we could ever
124 |         # want about our book.
125 |         res = self.session.get('https:' + reader['url'])
126 |         js = res.json()
127 |         if 'data' not in js:
128 |             logging.error('expected data in JSIA response but got none')
129 |             raise AssertionError
130 |         self.book_meta = js['data']
131 |         logging.debug('title: %s, imagecount: %s' % (
132 |             self.book_meta['metadata']['title'],
133 |             self.book_meta['metadata']['imagecount']
134 |         ))
135 | 
136 |         # we only really need a list of the pages' urls
137 |         flattened = [pages for spreads in \
138 |             self.book_meta['brOptions']['data'] for pages in spreads]
139 |         self.book_page_urls = list(map(lambda p: p['uri'], flattened))
140 |         return len(self.book_page_urls)
141 | 
142 | 
143 |     def download_page(self, i, scale=0):
144 |         """
145 |         Downloads a single page of a book. Call fetch_book_metadata() first.
146 |         """
147 |         if not self.book_meta:
148 |             logging.error('no book_meta; you must fetch the metadata first.')
149 |             raise AssertionError
150 | 
151 |         if i < 0 or i >= len(self.book_page_urls):
152 |             logging.error('page index out of range')
153 |             raise IndexError
154 | 
155 |         img_url = self.book_page_urls[i] + "&scale=%d&rotate=0" % scale
156 |         res = self.session.get(img_url, headers={
157 |             'referer': self.URL_FORMAT % ('details/' + self.book_id)
158 |         })
159 | 
160 |         image_data = res.content
161 |         obfuscation_key = res.headers.get('x-obfuscate')
162 |         if obfuscation_key:
163 |             version, counter = obfuscation_key.split('|')
164 |             if version != '1':
165 |                 raise AssertionError('obfuscation version %s not supported' % version)
166 |             
167 |             counter = base64.b64decode(counter)
168 |             aes_key = re.sub(r'https?:\/\/.*?\/', '/', img_url)
169 | 
170 |             # decrypt first 1024 bytes of image data
171 |             decrypted_part = self.decrypt(image_data[:1024], aes_key, counter)
172 |             # replace the first 1024 bytes of image data with the decrypted data
173 |             decrypted_buffer = bytearray(image_data)
174 |             decrypted_buffer[:1024] = decrypted_part
175 |             image_data = bytes(decrypted_buffer)
176 | 
177 |         return image_data
178 | 
179 | 
180 |     def decrypt(self, data: bytes, aes_key: str, counter: bytes):
181 |         """
182 |         Decrypts the data with AES-CTR using the given key.
183 |         """
184 |         # hash the AES key using SHA-1
185 |         sha1 = SHA1.new()
186 |         sha1.update(aes_key.encode('utf-8'))
187 |         aes_key_hashed = sha1.digest()[:16]  # use first 16 bytes
188 | 
189 |         # create a counter of length 64-bit
190 |         ctr = Counter.new(64, prefix=counter[:8], initial_value=int.from_bytes(counter[8:], 'big'))
191 | 
192 |         # create AES cipher in CTR mode
193 |         cipher = AES.new(aes_key_hashed, AES.MODE_CTR, counter=ctr)
194 | 
195 |         # decrypt the buffer
196 |         plaintext = cipher.decrypt(data)
197 | 
198 |         return plaintext
199 | 
200 | 
201 |     def login(self, email, password):
202 |         """
203 |         Logs a user in to their archive.org account.
204 |         """
205 |         # get cookies
206 |         self.session.get(self.URL_FORMAT % 'account/login')
207 | 
208 |         res = self.session.post(self.URL_FORMAT % 'account/login', {
209 |             'username': email,
210 |             'password': password,
211 |             'remember': True,
212 |             'referer': self.URL_FORMAT % '',
213 |             'login': True,
214 |             'submit_by_js': True
215 |         }, headers={
216 |             'referer': self.URL_FORMAT % 'account/login'
217 |         })
218 |         js = res.json()
219 |         if js['status'] != 'ok':
220 |             logging.error('login responded with status %s, message %s' % \
221 |                 (js['status'], js['message']))
222 |             raise AssertionError
223 |         else:
224 |             logging.debug('user has logged in successfully')
225 | 


--------------------------------------------------------------------------------
/archive.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scoliono/archiveripper/40914d721ed902f7b2c87957e6264e485d46e539/archive.png


--------------------------------------------------------------------------------
/explorer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scoliono/archiveripper/40914d721ed902f7b2c87957e6264e485d46e539/explorer.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | certifi==2025.4.26
 2 | charset-normalizer==3.4.2
 3 | Deprecated==1.2.18
 4 | idna==3.10
 5 | img2pdf==0.6.1
 6 | lxml==5.4.0
 7 | packaging==25.0
 8 | pikepdf==9.7.0
 9 | pillow==11.2.1
10 | pycryptodome==3.23.0
11 | requests==2.32.3
12 | tqdm==4.67.1
13 | urllib3==2.4.0
14 | wrapt==1.17.2
15 | 


--------------------------------------------------------------------------------
/ripper.py:
--------------------------------------------------------------------------------
  1 | # ripper.py
  2 | # Copyright (c) 2020  James Shiffer
  3 | # This file contains the main application logic.
  4 | 
  5 | import argparse, api, getpass, img2pdf, logging, os, sys, tqdm
  6 | 
  7 | 
  8 | def main():
  9 |     client = api.ArchiveReaderClient()
 10 |     logging.basicConfig(level=logging.INFO)
 11 | 
 12 |     # Parse book id and credentials
 13 |     parser = argparse.ArgumentParser()
 14 |     parser.add_argument('id', nargs='?',
 15 |         help='Look for the book\'s identifier (the part of the url immediately after "https://archive.org/details/").')
 16 |     parser.add_argument('-u', '--username', help='Your archive.org account\'s email.')
 17 |     parser.add_argument('-p', '--password', help='Your archive.org account\'s password')
 18 |     parser.add_argument('-a', '--all-pages', action='store_true', help='Download every page of the book')
 19 |     parser.add_argument('-s', '--page-start', type=int, help='Download pages starting at page number N and ending at the book\'s last page, or a range if --page-end has been specified')
 20 |     parser.add_argument('-e', '--page-end', type=int, help='End of the range of page numbers to download')
 21 |     parser.add_argument('-f', '--format', choices=['pdf'], help='Converts the individual pages into a single file. Currently only PDF is supported')
 22 |     parser.add_argument('-d', '--output-dir', help='Directory you want the pages to be written to. If undefined the directory will be named the book id')
 23 |     parser.add_argument('-S', '--scale', default=0, type=int, help='Image resolution of the pages requested, can save bandwidth if the best image quality isn\'t necessary. Higher integers mean smaller resolution, default is 0 (no downscaling)')
 24 |     args = parser.parse_args()
 25 | 
 26 |     id = args.id
 27 |     username = args.username
 28 |     password = args.password
 29 | 
 30 |     #If any of the credentials isn't specified with cmdline args ask for it interactively
 31 |     if not args.id:
 32 |         print('Look for the book\'s identifier (the part of the url immediately after "https://archive.org/details/").')
 33 |         id = input('Enter it here: ')
 34 |         logging.debug('received book ID: %s' % id)
 35 |     if not args.username:
 36 |         username = input('Enter your archive.org email: ')
 37 |     if not args.password:
 38 |         password = getpass.getpass('Enter your archive.org password: ')
 39 | 
 40 | 
 41 |     logging.debug('attempting login with user-supplied credentials')
 42 |     client.login(username, password)
 43 | 
 44 |     logging.debug('attempting to start scheduler')
 45 |     client.schedule_loan_book(id)
 46 | 
 47 |     if not args.output_dir:
 48 |         dirname = './' + id
 49 |     else:
 50 |         dirname = os.path.expanduser(args.output_dir)
 51 | 
 52 |     logging.debug('creating output dir "%s"' % dirname)
 53 |     if os.path.isdir(dirname):
 54 |         response = input('Output folder %s already exists. Continue? ' \
 55 |             % dirname)
 56 |         if not response.lower().startswith('y'):
 57 |             return
 58 |     else:
 59 |         os.mkdir(dirname)
 60 | 
 61 |     page_count = client.fetch_book_metadata()
 62 | 
 63 |     start = 0
 64 |     end = page_count
 65 | 
 66 |     if not args.all_pages:
 67 |         if not args.page_start and not args.page_end:
 68 |             print('The book is %d pages long. Which pages do you want?' % page_count)
 69 |             desired_pages = input('Enter a range (eg. 1-15) or leave blank for all: ')
 70 | 
 71 |             if desired_pages:
 72 |                 [start, end] = desired_pages.split('-')
 73 |                 start = int(start) - 1
 74 |                 end = int(end)
 75 |         else:
 76 |             if args.page_start: start = args.page_start - 1
 77 |             if args.page_end: end = args.page_end
 78 | 
 79 |     logging.debug('planning on fetching pages %d thru %d' % (start, end))
 80 | 
 81 |     total = end - start
 82 |     for i in tqdm.tqdm(range(start, end)):
 83 |         logging.debug('downloading page %d (index %d)' % (i + 1,
 84 |             i))
 85 |         contents = client.download_page(i, args.scale)
 86 |         with open('%s/%d.jpg' % (dirname, i + 1), 'wb') as file:
 87 |             file.write(contents)
 88 |         done_count = i + 1 - start
 89 | 
 90 |     if args.format == 'pdf':
 91 |         print('converting images to pdf')
 92 |         img_list = ['%s/%d.jpg' % (dirname, i + 1) for i in range(start, end)]
 93 |         with open('%s.pdf' % dirname, 'wb') as pdf:
 94 |             pdf_data = img2pdf.convert(img_list)
 95 |             pdf.write(pdf_data)
 96 |             print('wrote %d pages to %s.pdf' % (total, dirname))
 97 |         logging.debug('removing temporary image files')
 98 |         for img_name in img_list:
 99 |             os.remove(img_name)
100 |         try:
101 |             os.rmdir(dirname)
102 |             print('removed output dir %s' % dirname)
103 |         except OSError:
104 |             logging.warning('unable to remove output dir %s; it may not be empty' % dirname)
105 | 
106 |     print('done')
107 | 
108 | if __name__ == '__main__':
109 |     main()
110 | 


--------------------------------------------------------------------------------
/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scoliono/archiveripper/40914d721ed902f7b2c87957e6264e485d46e539/screenshot.png


--------------------------------------------------------------------------------