├── requirements.txt ├── requirements-test.txt ├── setup.cfg ├── .gitignore ├── CHANGELOG.md ├── tox.ini ├── .travis.yml ├── setup.py ├── LICENSE ├── README.md ├── wasapi_client.py └── tests └── test_wasapi_client.py /requirements.txt: -------------------------------------------------------------------------------- 1 | requests>=2.18.1 2 | -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest>=4.6.4 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | *.swo 4 | *.egg 5 | *.egg-info 6 | *.log 7 | *.warc.gz 8 | manifest-*.txt 9 | 10 | .cache/ 11 | .eggs/ 12 | build/ 13 | dist/ 14 | .tox/* 15 | __pycache__/* 16 | 17 | # virtualenv 18 | bin/ 19 | include/ 20 | lib/ 21 | pip-selfcheck.json 22 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 1.1.0 2 | ------ 3 | 4 | * Add support for Webrecorder.io. [#34](https://github.com/unt-libraries/py-wasapi-client/pull/34) 5 | * Fix tests that fail on MacOS due to `qsize`. [#36](https://github.com/unt-libraries/py-wasapi-client/pull/36) 6 | 7 | 1.0.0 8 | ------ 9 | 10 | * Initial release. 
#!/usr/bin/env python
"""Packaging script for py-wasapi-client."""

from setuptools import setup


# The README is reused verbatim as the package's long description.
# Read it with an explicit encoding so the result does not depend on
# the locale of the machine running the build.
with open('README.md', 'r', encoding='utf-8') as readme_f:
    long_description = readme_f.read()


setup(
    name='py-wasapi-client',
    version='1.1.0',
    url='https://github.com/unt-libraries/py-wasapi-client',
    author='University of North Texas Libraries',
    author_email='lauren.ko@unt.edu',
    license='BSD',
    py_modules=['wasapi_client'],
    scripts=['wasapi_client.py'],
    # Fixed typo: "Transer" -> "Transfer".
    description='A client for the Archive-It and Webrecorder WASAPI Data Transfer API',
    long_description=long_description,
    long_description_content_type='text/markdown',
    install_requires=['requests>=2.18.1'],
    entry_points={
        'console_scripts': [
            'wasapi-client=wasapi_client:main'
        ]
    },
    setup_requires=['pytest-runner'],
    tests_require=['pytest'],
    classifiers=[
        'Intended Audience :: System Administrators',
        'License :: OSI Approved :: BSD License',
        'Natural Language :: English',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Topic :: Communications :: File Sharing',
    ],
)
IN NO EVENT SHALL THE 23 | COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 24 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 25 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 26 | OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 27 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 28 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 29 | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 30 | DAMAGE. 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # py-wasapi-client [![Build Status](https://travis-ci.org/unt-libraries/py-wasapi-client.svg)](https://travis-ci.org/unt-libraries/py-wasapi-client) 2 | A client for the WASAPI Data Transfer API. Initially developed according to the 3 | [Archive-It specification](https://github.com/WASAPI-Community/data-transfer-apis/tree/master/ait-specification), the client now additionally supports [Webrecorder.io](https://webrecorder.io/). 4 | 5 | ## Requirements 6 | 7 | * Python 3.4-3.7 8 | 9 | ## Installation 10 | 11 | To run the latest code, the WASAPI client may be downloaded or cloned 12 | from [GitHub](https://github.com/unt-libraries/py-wasapi-client). 
From inside the top-level of the py-wasapi-client directory, 13 | install with: 14 | 15 | ``` 16 | $ python setup.py install 17 | ``` 18 | 19 | Alternatively, the most recent release (not guaranteed to be the latest 20 | code) may be installed from [PyPi](https://pypi.org/project/py-wasapi-client/): 21 | 22 | ``` 23 | $ pip install py-wasapi-client 24 | ``` 25 | 26 | Once installed, run the client at the command line with: 27 | 28 | ``` 29 | $ wasapi-client --help 30 | ``` 31 | 32 | That gives you usage instructions: 33 | 34 | ``` 35 | usage: wasapi-client [-h] [-b BASE_URI] [-d DESTINATION] [-l LOG] [-n] [-v] 36 | [--profile PROFILE | -u USER | -t TOKEN] 37 | [-c | -m | -p PROCESSES | -s | -r] 38 | [--collection COLLECTION [COLLECTION ...]] 39 | [--filename FILENAME] [--crawl CRAWL] 40 | [--crawl-time-after CRAWL_TIME_AFTER] 41 | [--crawl-time-before CRAWL_TIME_BEFORE] 42 | [--crawl-start-after CRAWL_START_AFTER] 43 | [--crawl-start-before CRAWL_START_BEFORE] 44 | 45 | Download WARC files from a WASAPI access point. 
46 | 47 | Acceptable date/time formats are: 48 | 2017-01-01 49 | 2017-01-01T12:34:56 50 | 2017-01-01 12:34:56 51 | 2017-01-01T12:34:56Z 52 | 2017-01-01 12:34:56-0700 53 | 2017 54 | 2017-01 55 | 56 | optional arguments: 57 | -h, --help show this help message and exit 58 | -b BASE_URI, --base-uri BASE_URI 59 | base URI for WASAPI access; default: 60 | https://partner.archive-it.org/wasapi/v1/webdata 61 | -d DESTINATION, --destination DESTINATION 62 | location for storing downloaded files 63 | -l LOG, --log LOG file to which logging should be written 64 | -n, --no-manifest do not generate checksum files (ignored when used in 65 | combination with --manifest) 66 | -v, --verbose log verbosely; -v is INFO, -vv is DEBUG 67 | --profile PROFILE profile to use for API authentication 68 | -u USER, --user USER username for API authentication 69 | -t TOKEN, --token TOKEN 70 | token for API authentication 71 | -c, --count print number of files for download and exit 72 | -m, --manifest generate checksum files only and exit 73 | -p PROCESSES, --processes PROCESSES 74 | number of WARC downloading processes 75 | -s, --size print count and total size of files and exit 76 | -r, --urls list URLs for downloadable files only and exit 77 | 78 | query parameters: 79 | parameters for webdata request 80 | 81 | --collection COLLECTION [COLLECTION ...] 
82 | collection identifier 83 | --filename FILENAME exact webdata filename to download 84 | --crawl CRAWL crawl job identifier 85 | --crawl-time-after CRAWL_TIME_AFTER 86 | request files created on or after this date/time 87 | --crawl-time-before CRAWL_TIME_BEFORE 88 | request files created before this date/time 89 | --crawl-start-after CRAWL_START_AFTER 90 | request files from crawl jobs starting on or after 91 | this date/time 92 | --crawl-start-before CRAWL_START_BEFORE 93 | request files from crawl jobs starting before this 94 | date/time 95 | ``` 96 | 97 | ## Configuration 98 | 99 | When you are using the tool to query an Archive-It or Webrecorder WASAPI 100 | endpoint, you will need to supply a username and password for the API. You have 101 | three options to provide these credentials. 102 | 103 | 1. Supply a username with `-u`, and you will be prompted for a password. 104 | 2. Set an environment variable called 'WASAPI_USER' to supply a username 105 | and a variable called 'WASAPI_PASS' to supply a password. 106 | 3. Supply a profile `--profile` defined in a configuration 107 | file. The configuration file should be at `~/.wasapi-client`. 108 | 109 | An example profile: 110 | 111 | ``` 112 | [unt] 113 | username = exampleUser 114 | password = examplePassword 115 | ``` 116 | 117 | Order of precedence is command line, environment, config file. 118 | 119 | ## Example Usage 120 | 121 | The following command downloads the WARC files available from a crawl 122 | with `crawl id` 256119 and logs program output to a file named 123 | `out.log`. The program will prompt the user to enter the password for 124 | user `myusername`. Downloads are carried out by one process. 125 | 126 | ``` 127 | $ wasapi-client -u myusername --crawl 256119 --log /tmp/out.log -p 1 128 | ``` 129 | 130 | The following command downloads similarly, but user credentials are 131 | supplied by a configuration file. 
132 | 133 | ``` 134 | $ wasapi-client --profile unt --crawl 256119 --log out.log -p 1 135 | ``` 136 | 137 | You may supply an API token instead of user credentials. 138 | 139 | ``` 140 | $ wasapi-client --token thisistheAPItokenIwasgiven --crawl 256119 --log out.log -p 1 141 | ``` 142 | 143 | The following command downloads the WARC files available from crawls 144 | that occurred in the specified time range. Verbose logging is being 145 | written to a file named out.log. Downloads are happening via four 146 | processes and written to a directory at /tmp/wasapi_warcs/. 147 | 148 | ``` 149 | $ wasapi-client --profile unt --crawl-start-after 2016-12-22T13:01:00 --crawl-start-before 2016-12-22T15:11:00 -vv --log out.log -p 4 -d /tmp/wasapi_warcs/ 150 | 151 | ``` 152 | 153 | The following command produces the size and file count of all content 154 | available to the user. 155 | 156 | ``` 157 | $ wasapi-client --profile unt -s 158 | ``` 159 | 160 | The following command gives the user the number of files available by 161 | the given query parameters. 162 | 163 | ``` 164 | $ wasapi-client --profile unt --crawl 256119 -c 165 | ``` 166 | 167 | The following command downloads the file called example.warc.gz to 168 | the current working directory. 169 | 170 | ``` 171 | $ wasapi-client --profile unt --filename example.warc.gz 172 | ``` 173 | 174 | By default, manifest files are generated to provide checksums for the 175 | files to be downloaded. One manifest file is generated for each hash algorithm 176 | provided by the WASAPI access point. The manifest files are written to the 177 | download destination. If you don't want manifest files, use the --no-manifest 178 | flag. 179 | 180 | ``` 181 | $ wasapi-client --profile unt --crawl 256119 --log out.log --no-manifest 182 | ``` 183 | 184 | If you want to generate manifest files for your available webdata files 185 | without actually downloading the webdata files, use the --manifest flag. 
186 | 187 | ``` 188 | $ wasapi-client --profile unt --crawl 256119 --manifest 189 | ``` 190 | 191 | If you would like to produce a list of URLs where your webdata files can 192 | later be downloaded by another tool (such as wget) rather than having 193 | wasapi-client do the downloading, use the --urls flag. 194 | 195 | ``` 196 | $ wasapi-client --profile unt --crawl 256119 --urls 197 | ``` 198 | 199 | To use the client with Webrecorder (not all query parameters may be supported), 200 | supply the base URL with -b. 201 | 202 | ``` 203 | $ wasapi-client -b https://webrecorder.io/api/v1/download/webdata --profile webrecorder --collection my_collection -d warcs 204 | ``` 205 | 206 | ## Run the Tests 207 | 208 | ``` 209 | $ python setup.py test 210 | ``` 211 | 212 | or 213 | 214 | ``` 215 | $ pip install tox 216 | $ tox 217 | ``` 218 | -------------------------------------------------------------------------------- /wasapi_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import atexit 5 | import configparser 6 | import getpass 7 | import hashlib 8 | import logging 9 | import logging.handlers 10 | import math 11 | import multiprocessing 12 | import os 13 | import requests 14 | import re 15 | import sys 16 | from collections import defaultdict 17 | try: 18 | from json.decoder import JSONDecodeError 19 | except ImportError: 20 | class JSONDecodeError(ValueError): 21 | pass 22 | from queue import Empty 23 | from urllib.parse import urlencode 24 | 25 | NAME = 'wasapi_client' if __name__ == '__main__' else __name__ 26 | 27 | LOGGER = logging.getLogger(NAME) 28 | 29 | READ_LIMIT = 1024 * 512 30 | 31 | PROFILE_PATH = os.path.join(os.path.expanduser('~'), '.wasapi-client') 32 | 33 | PRE_SIGNED_REGEX = [re.compile(r'https://.*\.s3.amazonaws\.com/.*[?].*Signature=.+')] 34 | 35 | 36 | def start_listener_logging(log_q, path=''): 37 | formatter = logging.Formatter('%(asctime)s 
def configure_main_logging(log_q, log_level=logging.ERROR):
    """Route the root logger's records onto `log_q`.

    A QueueHandler wrapping `log_q` is attached to the root logger and
    the root logger's level is set to `log_level`. Hooking the root
    logger (rather than this module's logger) means records emitted by
    imported modules are handled as well.
    """
    queue_handler = logging.handlers.QueueHandler(log_q)
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)
    root_logger.addHandler(queue_handler)
def convert_bytes(size):
    """Make a human readable size.

    `size` is a number of bytes. It is scaled down by powers of 1024
    and returned as a string made of the scaled value (rounded to two
    decimal places) immediately followed by a unit label, for example
    '1.5KB'.

    Values smaller than one unit stay in bytes, and values too large
    for the biggest label are expressed in that label ('YB') instead of
    raising an error. Negative values are scaled by their magnitude.
    """
    label = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')
    value = float(size)
    i = 0
    # Divide step by step instead of taking a floating point logarithm:
    # the index can then never go negative (sizes between 0 and 1) or
    # run past the last label, and exact powers of 1024 cannot be
    # misplaced by rounding error in the logarithm.
    while abs(value) >= 1024 and i < len(label) - 1:
        value /= 1024
        i += 1
    return '{}{}'.format(round(value, 2), label[i])
142 | 143 | 144 | class Downloads: 145 | """Handles cycling through all of our query results. 146 | 147 | If download is True, we create a queue of the files that need to be 148 | downloaded. If manifest is True, store the checksums/filenames for 149 | each available hash algorithm. 150 | """ 151 | 152 | def __init__(self, page_uri, auth=None, download=True, destination='', 153 | headers={}): 154 | self.page_uri = page_uri 155 | self.auth = auth 156 | self.download = download 157 | if self.download: 158 | self.get_q = multiprocessing.JoinableQueue() 159 | self.checksums = defaultdict(list) 160 | self.urls = [] 161 | self.destination = '' if destination == '.' else destination 162 | self.headers = headers 163 | self.populate_downloads() 164 | 165 | def populate_downloads(self): 166 | """Repeat webdata requests to gather downloadable file info.""" 167 | session = make_session(self.auth, self.headers) 168 | current_uri = self.page_uri 169 | while current_uri: 170 | webdata = get_webdata(current_uri, session) 171 | for f in webdata['files']: 172 | # Store the first locations URL per file only. 
def download_file(data_file, session, output_path):
    """Download webdata file to disk.

    `data_file` is the DataFile describing what to fetch
    `session` is the authenticated session used for the requests
    `output_path` is where the downloaded content is written

    Each URL in `data_file.locations` is tried in order until one is
    downloaded successfully. If a file that passes `check_exists`
    already sits at `output_path`, nothing is downloaded and
    `data_file.verified` is set to True.

    Returns `data_file`. Raises WASAPIDownloadError when the file could
    not be retrieved from any location or could not be written.
    """
    if check_exists(output_path, data_file.size, data_file.checksums):
        # Don't download the file if it already exists.
        LOGGER.info('{} exists with expected size/checksum'.format(data_file.filename))
        data_file.verified = True
        return data_file
    for location in data_file.locations:

        # If location matches any 'pre-signed' url regex pattern,
        # skip auth for this location. Every pattern is consulted, so
        # the decision no longer hinges on the last pattern only and
        # `sesh` is always bound, even if the pattern list is empty.
        if any(rx.match(location) for rx in PRE_SIGNED_REGEX):
            sesh = requests
        else:
            sesh = session

        try:
            response = sesh.get(location, stream=True)
        except requests.exceptions.RequestException as err:
            # This could be a remote disconnect, read timeout, connection timeout,
            # temporary name resolution issue...
            LOGGER.error('Error downloading {}:\n{}'.format(location, err))
            continue
        try:
            msg = '{}: {} {}'.format(location,
                                     response.status_code,
                                     response.reason)
            if response.status_code == 200:
                try:
                    write_file(response, output_path)
                except OSError as err:
                    LOGGER.error('{}: {}'.format(location, str(err)))
                    break
                # Successful download; don't try alternate locations.
                LOGGER.info(msg)
                return data_file
            else:
                LOGGER.error(msg)
        finally:
            # The request was made with stream=True, so the connection
            # stays checked out until the response is closed.
            response.close()
    # We didn't download successfully; raise error.
    msg = 'FAILED to download {} from {}'.format(data_file.filename,
                                                 data_file.locations)
    raise WASAPIDownloadError(msg)
279 | 280 | Takes a dictionary of hash algorithms and the corresponding 281 | expected value for the file_path provided. The first success 282 | or failure determines if the file is valid. 283 | """ 284 | for algorithm, value in checksums.items(): 285 | read_limit = READ_LIMIT 286 | hash_function = getattr(hashlib, algorithm, None) 287 | if not hash_function and algorithm == 's3etag': 288 | # if etag does not contain a '-', then its just a regular md5 289 | if '-' not in value: 290 | hash_function = hashlib.md5 291 | 292 | # otherwise, its likely a 'double-md5' 293 | # see: https://zihao.me/post/calculating-etag-for-aws-s3-objects/ 294 | else: 295 | hash_function = S3DoubleMD5 296 | # expected chunk size for S3 md5 computation 297 | read_limit = 1024 * 1024 * 8 298 | 299 | if not hash_function: 300 | # The hash algorithm provided is not supported by hashlib. 301 | LOGGER.debug('{} is unsupported'.format(algorithm)) 302 | continue 303 | digest = calculate_sum(hash_function, file_path, read_limit) 304 | if digest == value: 305 | LOGGER.info('Checksum success at: {}'.format(file_path)) 306 | return True 307 | else: 308 | LOGGER.error('Checksum {} mismatch for {}: expected {}, got {}'.format(algorithm, 309 | file_path, 310 | value, 311 | digest)) 312 | return False 313 | # We didn't find a compatible algorithm. 
class S3DoubleMD5:
    """Hasher producing the 'double-md5' style of checksum described at:

    https://zihao.me/post/calculating-etag-for-aws-s3-objects/

    Exposes the `update`/`hexdigest` pair so it can stand in for a
    hashlib hasher. Every call to `update` is treated as one separate
    part and gets its own MD5.
    """

    def __init__(self):
        # One hashlib md5 object per buffer passed to update().
        self.md5s = []

    def update(self, buff):
        """Record the MD5 of `buff` as the next part."""
        part = hashlib.md5(buff)
        self.md5s.append(part)

    def hexdigest(self):
        """Return the checksum string for the parts seen so far.

        With exactly one part this is that part's plain MD5 hex digest.
        Otherwise it is the MD5 of the concatenated binary part digests,
        followed by '-' and the number of parts.
        """
        part_count = len(self.md5s)
        if part_count != 1:
            combined = hashlib.md5()
            for part in self.md5s:
                combined.update(part.digest())
            return '{}-{}'.format(combined.hexdigest(), part_count)
        return self.md5s[0].hexdigest()
def generate_report(result_q):
    """Build the text summary of download results.

    `result_q` holds (result, filename) tuples where result is
    'success' or 'failure'. The summary gives the attempted, successful
    and failed counts. The names of the failed files are listed only
    when there was a mix of successes and failures.
    """
    outcomes = convert_queue(result_q)
    failed_files = outcomes.get('failure', [])
    failure = len(failed_files)
    success = len(outcomes.get('success', []))
    total = success + failure
    parts = [
        'Total downloads attempted: {}\n'.format(total),
        'Successful downloads: {}\n'.format(success),
        'Failed downloads: {}\n'.format(failure),
    ]
    if failure > 0 and total != failure:
        parts.append('Failed files (see log for details):\n')
        parts.extend(' {}\n'.format(filename) for filename in failed_files)
    return ''.join(parts)
class SetQueryParametersAction(argparse.Action):
    """Collect query parameter options into one dict on the namespace.

    The dict lives at `namespace.query_params` and is created the first
    time one of these options is seen. Each entry is keyed by the
    option string with its leading dashes removed.
    """

    def __call__(self, parser, namespace, values, option_string):
        try:
            params = namespace.query_params
        except AttributeError:
            # First query parameter encountered; start the dict.
            params = {}
            namespace.query_params = params
        key = option_string.lstrip('-')
        params[key] = values
443 | default_processes = multiprocessing.cpu_count() 444 | except NotImplementedError: 445 | default_processes = 1 446 | parser = argparse.ArgumentParser(description=description, 447 | formatter_class=argparse.RawDescriptionHelpFormatter) 448 | 449 | parser.add_argument('-b', 450 | '--base-uri', 451 | dest='base_uri', 452 | default='https://partner.archive-it.org/wasapi/v1/webdata', 453 | help='base URI for WASAPI access; default: ' 454 | 'https://partner.archive-it.org/wasapi/v1/webdata') 455 | parser.add_argument('-d', 456 | '--destination', 457 | default='.', 458 | help='location for storing downloaded files') 459 | parser.add_argument('-l', 460 | '--log', 461 | help='file to which logging should be written') 462 | parser.add_argument('-n', 463 | '--no-manifest', 464 | action='store_true', 465 | dest='skip_manifest', 466 | help='do not generate checksum files (ignored' 467 | ' when used in combination with --manifest)') 468 | parser.add_argument('-v', 469 | '--verbose', 470 | action='count', 471 | default=0, 472 | help='log verbosely; -v is INFO, -vv is DEBUG') 473 | 474 | auth_group = parser.add_mutually_exclusive_group() 475 | auth_group.add_argument('--profile', 476 | dest='profile', 477 | help='profile to use for API authentication') 478 | auth_group.add_argument('-u', 479 | '--user', 480 | dest='user', 481 | help='username for API authentication') 482 | auth_group.add_argument('-t', 483 | '--token', 484 | dest='token', 485 | help='token for API authentication') 486 | 487 | out_group = parser.add_mutually_exclusive_group() 488 | out_group.add_argument('-c', 489 | '--count', 490 | action='store_true', 491 | help='print number of files for download and exit') 492 | out_group.add_argument('-m', 493 | '--manifest', 494 | action='store_true', 495 | help='generate checksum files only and exit') 496 | out_group.add_argument('-p', 497 | '--processes', 498 | type=int, 499 | default=default_processes, 500 | help='number of WARC downloading processes') 501 | 
out_group.add_argument('-s', 502 | '--size', 503 | action='store_true', 504 | help='print count and total size of files and exit') 505 | out_group.add_argument('-r', 506 | '--urls', 507 | action='store_true', 508 | help='list URLs for downloadable files only and exit') 509 | 510 | # Arguments to become part of query parameter string 511 | param_group = parser.add_argument_group('query parameters', 512 | 'parameters for webdata request') 513 | param_group.add_argument('--collection', 514 | action=SetQueryParametersAction, 515 | nargs='+', 516 | help='collection identifier') 517 | param_group.add_argument('--filename', 518 | action=SetQueryParametersAction, 519 | help='exact webdata filename to download') 520 | param_group.add_argument('--crawl', 521 | action=SetQueryParametersAction, 522 | help='crawl job identifier') 523 | param_group.add_argument('--crawl-time-after', 524 | action=SetQueryParametersAction, 525 | help='request files created on or after this ' 526 | 'date/time') 527 | param_group.add_argument('--crawl-time-before', 528 | action=SetQueryParametersAction, 529 | help='request files created before this date/time') 530 | param_group.add_argument('--crawl-start-after', 531 | action=SetQueryParametersAction, 532 | help='request files from crawl jobs starting on ' 533 | 'or after this date/time') 534 | param_group.add_argument('--crawl-start-before', 535 | action=SetQueryParametersAction, 536 | help='request files from crawl jobs starting ' 537 | 'before this date/time') 538 | return parser.parse_args(args) 539 | 540 | 541 | def get_credentials_env(): 542 | """Get API credentials from environment variables.""" 543 | env = os.environ.get 544 | auth = (env('WASAPI_USER'), env('WASAPI_PASS')) 545 | if None in auth: 546 | auth = None 547 | else: 548 | LOGGER.debug('Using API credentials from environment variables') 549 | return auth 550 | 551 | 552 | def get_credentials_config(profile, path=PROFILE_PATH): 553 | """Get API credentials from a config file.""" 554 
def get_credentials_config(profile, path=PROFILE_PATH):
    """Get API credentials from a config file.

    Reads the ``username`` and ``password`` options from the ``profile``
    section of the INI-style file at ``path``.

    Returns a (username, password) tuple. Exits the program with a
    message showing the expected file format when the file cannot be
    opened or the section or one of the options is missing.
    """
    config = configparser.ConfigParser()
    try:
        # Open the file in a with-block so the handle is closed as soon
        # as parsing finishes instead of being left to the garbage
        # collector.
        with open(path) as config_file:
            config.read_file(config_file)
        auth = (config.get(profile, 'username'),
                config.get(profile, 'password'))
    except (OSError,
            configparser.NoSectionError,
            configparser.NoOptionError) as err:
        sys.exit('{}: please create config file to supply API credentials with format:\n\n'
                 '[{}]\n'
                 'username = someuser\n'
                 'password = secretpasswd\n'.format(err, profile))
    LOGGER.debug('Using API credentials from {}'.format(path))
    return auth
def main():
    """Run the WASAPI client from the command line.

    Parses the arguments, starts the logging listener, builds the
    webdata request URI and sets up authentication. Depending on the
    options it then prints a size or count summary, writes manifests,
    or lists URLs, and exits. Otherwise it fills the download queue,
    optionally writes manifests, downloads with worker processes and
    prints a report.

    Several paths end the program through sys.exit().
    """
    args = _parse_args()

    # Only the size and count modes are exempt from the requirement
    # that the destination be writable.
    if (not os.access(args.destination, os.W_OK)
            and not args.size
            and not args.count):
        msg = 'Cannot write to destination: {}'.format(args.destination)
        sys.exit(msg)

    # Start log writing process.
    manager = multiprocessing.Manager()
    log_q = manager.Queue()
    try:
        listener = start_listener_logging(log_q, args.log)
    except OSError as err:
        print('Could not open file for logging:', err)
        sys.exit(1)

    # Registered only after the listener exists, so the hook always has
    # a listener to stop.
    @atexit.register
    def stop_listener_logging():
        """Stop listener when exiting program normally."""
        listener.stop()

    # Configure a logger for the main process.
    # args.verbose indexes the list of levels; an index past the end of
    # the list means DEBUG.
    try:
        log_level = [logging.ERROR, logging.INFO, logging.DEBUG][args.verbose]
    except IndexError:
        log_level = logging.DEBUG
    configure_main_logging(log_q, log_level)

    # Generate query string for the webdata request.
    try:
        query = '?{}'.format(urlencode(args.query_params, safe=':', doseq=True))
    except AttributeError:
        # Use empty query if user didn't enter any query parameters.
        query = ''
    webdata_uri = '{}{}'.format(args.base_uri, query)

    # Set up authentication.
    # A token goes into the request headers; otherwise an auth value is
    # looked up (it may be None).
    auth = None
    headers = {}
    if args.token:
        # Set the HTTP Authentication header.
        headers['Authorization'] = 'Token {}'.format(args.token)
    else:
        # Generate authentication tuple for the API calls.
        auth = get_credentials(args.user, args.profile)

    # If user wants the size, don't download files.
    if args.size:
        count, size = get_files_size(webdata_uri, auth, headers)
        print('Number of Files: ', count)
        print('Size of Files: ', convert_bytes(size))
        sys.exit()

    # If user wants a count, don't download files.
    if args.count:
        print('Number of Files: ', get_files_count(webdata_uri, auth, headers))
        sys.exit()

    # Process webdata requests to generate checksum files.
    if args.manifest:
        downloads = Downloads(webdata_uri, auth, download=False,
                              destination=args.destination, headers=headers)
        downloads.generate_manifests()
        sys.exit()

    # Print the URLs for files that can be downloaded; don't download them.
    if args.urls:
        downloads = Downloads(webdata_uri, auth, download=False,
                              destination=args.destination, headers=headers)
        for url in downloads.urls:
            print(url)
        sys.exit()

    # Process webdata requests to fill webdata file queue.
    downloads = Downloads(webdata_uri, auth, download=True,
                          destination=args.destination, headers=headers)

    # Write manifest file(s).
    if not args.skip_manifest:
        downloads.generate_manifests()

    # Download with multiple processes.
    get_q = downloads.get_q
    result_q = manager.Queue()

    download_processes = []
    # Start no more workers than there are queued files. When qsize()
    # raises NotImplementedError, use the requested process count.
    try:
        num_processes = min(args.processes, get_q.qsize())
    except NotImplementedError:
        num_processes = args.processes
    for _ in range(num_processes):
        dp = Downloader(get_q, result_q, log_q, log_level, auth,
                        args.destination, headers=headers)
        dp.start()
        download_processes.append(dp)
    # Wait for every worker to finish, then for the queue to report all
    # of its items as done.
    for dp in download_processes:
        dp.join()
    get_q.join()

    print(generate_report(result_q))
"https://warcs.example.com/webdatafile/AIT-JOB256123-00000.warc.gz", 38 | "https://example.com/download/AIT-JOB256123-00000.warc.gz" 39 | ], 40 | "size": 943100093 41 | }, 42 | { 43 | "account": 1, 44 | "checksums": { 45 | "md5": "748120fd9672b22df5942bb44e9cde81", 46 | "sha1": "54a466421471ef7d8cb4d6bbfb85afd76022a378" 47 | }, 48 | "collection": 7967, 49 | "crawl": 256118, 50 | "crawl-start": "2016-12-22T14:01:53Z", 51 | "crawl-time": "2016-12-22T14:01:58Z", 52 | "filename": "ARCHIVEIT-JOB256118-00000.warc.gz", 53 | "filetype": "warc", 54 | "locations": [ 55 | "https://warcs.example.com/webdatafile/AIT-JOB256118-00000.warc.gz", 56 | "https://example.com/download/AIT-JOB256118-00000.warc.gz" 57 | ], 58 | "size": 6265488 59 | } 60 | ], 61 | "includes-extra": false, 62 | "next": null, 63 | "previous": null, 64 | "request-url": "https://example.com/wasapi/v1/webdata" 65 | }""".split()) 66 | 67 | 68 | NO_FILES = """{ 69 | "count": 0, 70 | "files": [], 71 | "request-url": "https://example.com/wasapi/v1/webdata", 72 | "includes-extra": false, 73 | "next": null, 74 | "previous": null 75 | }""" 76 | 77 | 78 | class MockResponse200: 79 | """A mocked successful requests GET response from WASAPI.""" 80 | 81 | def __init__(self, text=WASAPI_TEXT): 82 | self.status_code = 200 83 | self.text = text 84 | self.reason = 'OK' 85 | 86 | def json(self): 87 | return json.loads(self.text) 88 | 89 | 90 | class MockResponse403: 91 | """A mocked unsuccessful requests GET response from WASAPI.""" 92 | 93 | def __init__(self): 94 | self.status_code = 403 95 | self.reason = 'Forbidden' 96 | 97 | 98 | class Test_make_session: 99 | def test_make_session_auth(self): 100 | auth = ('user', 'pass') 101 | headers = {'Authorization': 'Token lalala'} 102 | session = wc.make_session(auth, headers) 103 | assert session.auth == auth 104 | assert 'Authorization' in session.headers 105 | 106 | def test_make_session_no_auth(self): 107 | session = wc.make_session(None) 108 | assert session.auth is None 109 | 
class Test_get_webdata:
    """Exercise wc.get_webdata with a patched requests session."""

    def test_get_webdata(self):
        """A 200 response yields data matching WASAPI_TEXT."""
        sess = requests.Session()
        ok_response = MockResponse200()
        with patch.object(sess, 'get', return_value=ok_response):
            webdata = wc.get_webdata(WASAPI_URL, sess)
        # WASAPI_TEXT holds no whitespace, so strip it from the dump too.
        dumped = json.dumps(webdata, sort_keys=True)
        assert "".join(dumped.split()) == WASAPI_TEXT

    def test_get_webdata_403_forbidden(self):
        """A 403 response makes get_webdata exit."""
        sess = requests.Session()
        forbidden = MockResponse403()
        with patch.object(sess, 'get', return_value=forbidden), \
                pytest.raises(SystemExit):
            wc.get_webdata(WASAPI_URL, sess)

    def test_get_webdata_ConnectionError(self):
        """A failed host connection makes get_webdata exit."""
        sess = requests.Session()
        with patch.object(sess, 'get',
                          side_effect=requests.exceptions.ConnectionError), \
                pytest.raises(SystemExit):
            wc.get_webdata(WASAPI_URL, sess)

    def test_get_webdata_json_error(self):
        """A 200 response whose body is not JSON makes get_webdata exit."""
        sess = requests.Session()
        not_json = MockResponse200('response text is not json')
        with patch.object(sess, 'get', return_value=not_json), \
                pytest.raises(SystemExit):
            wc.get_webdata(WASAPI_URL, sess)
def test_populate_downloads_multi_page(self, mock_session):
    """Test the queue holds the files from both result pages."""
    # Only the first page points at a following page.
    first_page = MockResponse200(
        WASAPI_TEXT.replace('"next":null', '"next":"http://test?page=2"'))
    last_page = MockResponse200()
    mock_session.return_value.get.side_effect = [first_page, last_page]
    downloads = wc.Downloads(WASAPI_URL, download=True)
    file_queue = downloads.get_q

    # Two pages of two files each; draining the JoinableQueue also
    # avoids a BrokenPipeError.
    expected_items = 4
    for _ in range(expected_items):
        assert isinstance(file_queue.get(), wc.DataFile)
        file_queue.task_done()
    # Nothing beyond those items may be left on the queue.
    assert file_queue.empty()
def test_populate_downloads_manifest_destination(self, mock_session):
    """Test checksum entries carry the destination in their file paths."""
    mock_session.return_value.get.return_value = MockResponse200()
    dest = '{}tmp'.format(os.sep)
    downloads = wc.Downloads(WASAPI_URL, download=False, destination=dest)

    first_path = os.path.normpath('/tmp/AIT-JOB256123-00000.warc.gz')
    second_path = os.path.normpath('/tmp/ARCHIVEIT-JOB256118-00000.warc.gz')
    # (algorithm, expected list of (checksum, path) pairs), checked in order.
    expected = (
        ('md5', [('61f818912d1f39bc9dd15d4b87461110', first_path),
                 ('748120fd9672b22df5942bb44e9cde81', second_path)]),
        ('sha1', [('edef6bca652d75d0587ef411d5f028335341b074', first_path),
                  ('54a466421471ef7d8cb4d6bbfb85afd76022a378', second_path)]),
    )

    assert len(downloads.checksums)
    for algorithm, entries in expected:
        assert downloads.checksums[algorithm] == entries
@patch('requests.Session')
class Test_get_files_count:
    """Check wc.get_files_count against a mocked session."""

    def test_get_files_count(self, mock_session):
        """The default mocked response reports a count of two files."""
        session = mock_session.return_value
        session.get.return_value = MockResponse200()
        assert wc.get_files_count(WASAPI_URL) == 2
class Test_convert_bytes:
    """Check the size strings produced by wc.convert_bytes."""

    # (size in bytes, expected formatted string)
    CASES = [
        (0, '0.0B'),
        (1023, '1023.0B'),
        (1024, '1.0KB'),
        (1024000, '1000.0KB'),
        (1048576, '1.0MB'),
        (1073741824, '1.0GB'),
        (1099511628000, '1.0TB'),
    ]

    @pytest.mark.parametrize(('size', 'expected'), CASES)
    def test_convert_bytes(self, size, expected):
        formatted = wc.convert_bytes(size)
        assert formatted == expected
def test_download_file_not_200(self):
    """Test a 403 from every location raises WASAPIDownloadError."""
    session = requests.Session()
    forbidden = MockResponse403()

    with patch.object(session, 'get', return_value=forbidden) as mock_get:
        with pytest.raises(wc.WASAPIDownloadError) as err:
            wc.download_file(self.data_file, session, self.filename)

    # The error message names both the locations and the file.
    message = err.value.args[0]
    assert str(self.locations) in message
    assert self.filename in message
    # Both locations must have been requested.
    expected_calls = [call(url, stream=True) for url in self.locations[:2]]
    mock_get.assert_has_calls(expected_calls)
def test_download_file_OSError(self):
    """Test an OSError from write_file surfaces as WASAPIDownloadError."""
    session = requests.Session()
    ok_response = MockResponse200('')
    get_patch = patch.object(session, 'get', return_value=ok_response)
    write_patch = patch('wasapi_client.write_file', side_effect=OSError)

    with get_patch as mock_get, write_patch as mock_write_file:
        with pytest.raises(wc.WASAPIDownloadError) as err:
            wc.download_file(self.data_file, session, self.filename)

    message = err.value.args[0]
    for expected in (str(self.locations), self.filename):
        assert expected in message
    # Exactly one request was made (to the first location) and its
    # response was handed to write_file once.
    mock_get.assert_called_once_with(self.locations[0], stream=True)
    mock_write_file.assert_called_once_with(ok_response, self.filename)
def test_download_uses_pre_signed_url(self):
    """Test an s3 URL is fetched with requests.get, not the session."""
    name = 'blah.warc.gz'
    signed_url = 'https://data.s3.amazonaws.com/warcs/blah.warc.gz?Signature=xyz'
    urls = [signed_url, 'http://loc2/blah.warc.gz']
    md5_only = {'md5': '72b484a2610cb54ec22e48c8104ba3bd'}
    data_file = wc.DataFile(urls, name, md5_only, 123456)
    ok_response = MockResponse200('')

    with patch('wasapi_client.write_file') as mock_write_file:
        with patch('requests.get', return_value=ok_response) as mock_get:
            wc.download_file(data_file, requests.Session(), name)

    # One download attempt through requests.get, then one file write.
    mock_get.assert_called_once_with(signed_url, stream=True)
    mock_write_file.assert_called_once_with(ok_response, name)
def test_check_exists_checksum_fail(self):
    """Test a file that fails verification is reported as not existing."""
    path, size = 'path', 123456
    isfile_patch = patch('os.path.isfile', return_value=True)
    getsize_patch = patch('os.path.getsize', return_value=size)
    verify_patch = patch('wasapi_client.verify_file', return_value=False)

    with isfile_patch, getsize_patch, verify_patch as mock_verify:
        exists = wc.check_exists(path, size, {})

    assert not exists
    mock_verify.assert_called_once_with({}, path)
@patch('wasapi_client.calculate_sum')
def test_verify_file_s3etag_algorithm_regular_md5(self, mock_calc_sum):
    """Test an s3etag value without a '-' is checked with plain md5."""
    path = 'dummy/path'
    etag = '72b484a2610cb54ec22e48c8104ba3bd'
    mock_calc_sum.return_value = etag

    verified = wc.verify_file({'s3etag': etag}, path)

    assert verified
    # The hash function handed to calculate_sum must be md5.
    mock_calc_sum.assert_called_once_with(hashlib.md5, path, wc.READ_LIMIT)
class Test_convert_queue:
    """Check wc.convert_queue on a manager-backed queue."""

    def test_convert_queue(self):
        """Each status on the queue maps to the names queued with it."""
        # The with-block shuts the manager down even when an assertion
        # fails; a shutdown() call placed after the assertions would be
        # skipped in that case.
        with multiprocessing.Manager() as m:
            q = m.Queue()
            q.put(('success', 'name1'))
            q.put(('failure', 'name2'))
            dict_from_q = wc.convert_queue(q)
            assert dict_from_q['success'] == ['name1']
            assert dict_from_q['failure'] == ['name2']
def test_generate_report_all_failure(self):
    """Test the report text when every queued result is a failure."""
    # The with-block shuts the manager down even when the assertion
    # fails; a shutdown() call placed after it would be skipped.
    with multiprocessing.Manager() as m:
        q = m.Queue()
        q.put(('failure', 'name1'))
        q.put(('failure', 'name2'))
        report = wc.generate_report(q)
        assert report == ('Total downloads attempted: 2\n'
                          'Successful downloads: 0\n'
                          'Failed downloads: 2\n')
@patch('wasapi_client.download_file')
def test_run_WASAPIDownloadError(self, mock_download):
    """Test downloader when downloads fail.

    Every call to the patched download_file raises WASAPIDownloadError,
    so each of the two queued files is expected to produce one log
    record carrying the error message and one 'failure' result.
    """
    expected_error = 'WD Error'
    mock_download.side_effect = wc.WASAPIDownloadError(expected_error)
    # Create a queue holding two sets of file data.
    get_q = multiprocessing.JoinableQueue()
    for _ in (1, 2):
        get_q.put(self.data_file)
    manager = multiprocessing.Manager()
    result_q = manager.Queue()
    log_q = manager.Queue()
    p = wc.Downloader(get_q, result_q, log_q)
    # NOTE(review): both start() and run() are called on the same
    # Downloader; presumably start() runs it in a child process and
    # run() executes it again in this process -- confirm against the
    # Downloader class.
    p.start()
    p.run()
    # If the join doesn't block, the queue is fully processed.
    get_q.join()
    for _ in (1, 2):
        assert log_q.get().msg == expected_error
        assert result_q.get() == ('failure', self.filename)
    # Verify those were the only two results on the result_q.
    # Sometimes `empty` needs a moment to register.
    # NOTE(review): nothing here waits before the check below, and the
    # manager is never shut down in this test -- confirm both are
    # intended.
    assert result_q.empty()
def test_SetQueryParametersAction(self):
    """Test options using this action are collected in query_params."""
    expected = (
        ('crawl-start-after', '2016-12-22T13:01:00'),
        ('crawl-start-before', '2016-12-22T15:11:00'),
    )
    argv = []
    for option, value in expected:
        argv.extend(['--{}'.format(option), value])
    # -c is passed as well but must not show up in query_params.
    argv.append('-c')

    args = wc._parse_args(argv)

    assert len(args.query_params) == 2
    for option, value in expected:
        assert args.query_params[option] == value
701 | """ 702 | args = wc._parse_args(['--collection', '12345', '98', '--crawl', '12']) 703 | assert len(args.query_params) == 2 704 | assert args.query_params['collection'] == ['12345', '98'] 705 | 706 | 707 | class Test_get_credentials_env: 708 | def test_get_credentials_env(self): 709 | """Test auth credentials are set from environment variables.""" 710 | with patch.dict('os.environ', {'WASAPI_USER': 'me', 'WASAPI_PASS': 'p@ss123'}): 711 | auth = wc.get_credentials_env() 712 | assert auth == ('me', 'p@ss123') 713 | 714 | def test_get_credentials_env_missing_one_env_var(self): 715 | """Test a None value for username or password causes no auth.""" 716 | with patch('os.environ.get') as mock_get: 717 | mock_get.side_effect = ['me', None] 718 | auth = wc.get_credentials_env() 719 | assert auth is None 720 | 721 | 722 | class Test_get_credentials_config: 723 | def test_get_credentials_config(self): 724 | """Test auth can be populated from a config file.""" 725 | stream = io.StringIO('[unt]\nusername = me\npassword = p@ss123') 726 | with patch('builtins.open', return_value=stream): 727 | auth = wc.get_credentials_config('unt') 728 | assert auth == ('me', 'p@ss123') 729 | 730 | def test_get_credentials_config_missing_profile(self): 731 | """Test program exits if the profile supplied doesn't exist.""" 732 | stream = io.StringIO('[unt]\nusername = me\npassword = p@ss123') 733 | with patch('builtins.open', return_value=stream), \ 734 | pytest.raises(SystemExit): 735 | wc.get_credentials_config('home') 736 | 737 | def test_get_credentials_config_missing_password(self): 738 | """Test program exits if config does not supply an expected option.""" 739 | stream = io.StringIO('[unt]\nusername = me') 740 | with patch('builtins.open', return_value=stream), \ 741 | pytest.raises(SystemExit): 742 | wc.get_credentials_config('unt') 743 | 744 | 745 | class Test_get_credentials: 746 | @patch('getpass.getpass', return_value='p@ss123') 747 | def test_get_credentials_from_getpass(self, 
mock_getpass): 748 | auth = wc.get_credentials(user='me') 749 | assert auth == ('me', 'p@ss123') 750 | mock_getpass.assert_called_once_with() 751 | 752 | @patch('wasapi_client.get_credentials_env', return_value=('me', 'p@ss123')) 753 | def test_get_credentials_from_env(self, mock_gce): 754 | auth = wc.get_credentials() 755 | assert auth == ('me', 'p@ss123') 756 | mock_gce.assert_called_once_with() 757 | 758 | @patch('wasapi_client.get_credentials_env', return_value=None) 759 | @patch('wasapi_client.get_credentials_config', return_value=('me', 'p@ss123')) 760 | def test_get_credentials_from_config(self, mock_gcc, mock_gce): 761 | auth = wc.get_credentials(profile='unt') 762 | assert auth == ('me', 'p@ss123') 763 | mock_gcc.assert_called_once_with('unt') 764 | mock_gce.assert_called_once_with() 765 | 766 | @patch('wasapi_client.get_credentials_env', return_value=None) 767 | @patch('wasapi_client.get_credentials_config') 768 | def test_get_credentials_no_credentials_provided(self, mock_gcc, mock_gce): 769 | """Test if no user/profile is provided and no valid config file exists.""" 770 | auth = wc.get_credentials() 771 | assert auth is None 772 | assert not mock_gcc.called 773 | mock_gce.assert_called_once_with() 774 | --------------------------------------------------------------------------------