├── requirements.txt
├── requirements-test.txt
├── setup.cfg
├── .gitignore
├── CHANGELOG.md
├── tox.ini
├── .travis.yml
├── setup.py
├── LICENSE
├── README.md
├── wasapi_client.py
└── tests
    └── test_wasapi_client.py


/requirements.txt:
--------------------------------------------------------------------------------
1 | requests>=2.18.1
2 | 


--------------------------------------------------------------------------------
/requirements-test.txt:
--------------------------------------------------------------------------------
1 | pytest>=4.6.4
2 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [aliases]
2 | test=pytest
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | *.swp
 3 | *.swo
 4 | *.egg
 5 | *.egg-info
 6 | *.log
 7 | *.warc.gz
 8 | manifest-*.txt
 9 | 
10 | .cache/
11 | .eggs/
12 | build/
13 | dist/
14 | .tox/*
15 | __pycache__/*
16 | 
17 | # virtualenv
18 | bin/
19 | include/
20 | lib/
21 | pip-selfcheck.json
22 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | 1.1.0
 2 | ------
 3 | 
 4 | * Add support for Webrecorder.io. [#34](https://github.com/unt-libraries/py-wasapi-client/pull/34)
 5 | * Fix tests that fail on MacOS due to `qsize`. [#36](https://github.com/unt-libraries/py-wasapi-client/pull/36)
 6 | 
 7 | 1.0.0
 8 | ------
 9 | 
10 | * Initial release.
11 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [flake8]
 2 | max-line-length = 99
 3 | 
 4 | [tox]
 5 | envlist = py34,py35,py36,py37,py37-flake8
 6 | 
 7 | [testenv]
 8 | usedevelop=True
 9 | deps = -r{toxinidir}/requirements-test.txt
10 | commands = py.test
11 | 
12 | [testenv:py37-flake8]
13 | deps = flake8
14 | commands = flake8 wasapi_client.py tests setup.py
15 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | jobs:
 2 |   include:
 3 |     - os: linux
 4 |       dist: xenial
 5 |       python: 3.4
 6 |     - os: linux
 7 |       dist: xenial
 8 |       python: 3.5
 9 |     - os: linux
10 |       dist: xenial
11 |       python: 3.6
12 |     - os: linux
13 |       dist: xenial
14 |       python: 3.7
15 |     - os: osx
16 |       osx_image: xcode11
17 |       language: shell
18 | language: python
19 | sudo: false
20 | install:
21 |   - pip3 install -r requirements-test.txt
22 |   - pip3 install flake8
23 |   - python3 setup.py install
24 | script:
25 |     - pytest
26 |     - flake8
27 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from setuptools import setup
 4 | 
 5 | 
 6 | with open('README.md', 'r') as readme_f:
 7 |     long_description = readme_f.read()
 8 | 
 9 | 
10 | setup(
11 |     name='py-wasapi-client',
12 |     version='1.1.0',
13 |     url='https://github.com/unt-libraries/py-wasapi-client',
14 |     author='University of North Texas Libraries',
15 |     author_email='lauren.ko@unt.edu',
16 |     license='BSD',
17 |     py_modules=['wasapi_client'],
18 |     scripts=['wasapi_client.py'],
19 |     description='A client for the Archive-It and Webrecorder WASAPI Data Transer API',
20 |     long_description=long_description,
21 |     long_description_content_type='text/markdown',
22 |     install_requires=['requests>=2.18.1'],
23 |     entry_points={
24 |         'console_scripts': [
25 |             'wasapi-client=wasapi_client:main'
26 |         ]
27 |     },
28 |     setup_requires=['pytest-runner'],
29 |     tests_require=['pytest'],
30 |     classifiers=[
31 |         'Intended Audience :: System Administrators',
32 |         'License :: OSI Approved :: BSD License',
33 |         'Natural Language :: English',
34 |         'Programming Language :: Python',
35 |         'Programming Language :: Python :: 3.4',
36 |         'Programming Language :: Python :: 3.5',
37 |         'Programming Language :: Python :: 3.6',
38 |         'Programming Language :: Python :: 3.7',
39 |         'Topic :: Communications :: File Sharing',
40 |     ],
41 | )
42 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright © 2017, Regents of the University of North Texas
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are
 6 | met:
 7 | 
 8 | * Redistributions of source code must retain the above copyright notice,
 9 |   this list of conditions and the following disclaimer.
10 | 
11 | * Redistributions in binary form must reproduce the above copyright
12 |   notice, this list of conditions and the following disclaimer in the
13 |   documentation and/or other materials provided with the distribution.
14 | 
15 | * Neither the name of the University of North Texas Libraries nor the
16 |   names of its contributors may be used to endorse or promote products
17 |   derived from this software without specific prior written permission.
18 | 
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
21 | BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
22 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23 | COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
26 | OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
28 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
29 | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
30 | DAMAGE.
31 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # py-wasapi-client [![Build Status](https://travis-ci.org/unt-libraries/py-wasapi-client.svg)](https://travis-ci.org/unt-libraries/py-wasapi-client)
  2 | A client for the WASAPI Data Transfer API. Initially developed according to the
  3 | [Archive-It specification](https://github.com/WASAPI-Community/data-transfer-apis/tree/master/ait-specification), the client now additionally supports [Webrecorder.io](https://webrecorder.io/).
  4 | 
  5 | ## Requirements
  6 | 
  7 | * Python 3.4-3.7
  8 | 
  9 | ## Installation
 10 | 
 11 | To run the latest code, the WASAPI client may be downloaded or cloned
 12 | from [GitHub](https://github.com/unt-libraries/py-wasapi-client). From inside the top-level of the py-wasapi-client directory,
 13 | install with:
 14 | 
 15 | ```
 16 |  $ python setup.py install
 17 | ```
 18 | 
 19 | Alternatively, the most recent release (not guaranteed to be the latest
 20 | code) may be installed from [PyPI](https://pypi.org/project/py-wasapi-client/):
 21 | 
 22 | ```
 23 |  $ pip install py-wasapi-client
 24 | ```
 25 | 
 26 | Once installed, run the client at the command line with:
 27 | 
 28 | ```
 29 |  $ wasapi-client --help
 30 | ```
 31 | 
 32 | That gives you usage instructions:
 33 | 
 34 | ```
 35 | usage: wasapi-client [-h] [-b BASE_URI] [-d DESTINATION] [-l LOG] [-n] [-v]
 36 |                      [--profile PROFILE | -u USER | -t TOKEN]
 37 |                      [-c | -m | -p PROCESSES | -s | -r]
 38 |                      [--collection COLLECTION [COLLECTION ...]]
 39 |                      [--filename FILENAME] [--crawl CRAWL]
 40 |                      [--crawl-time-after CRAWL_TIME_AFTER]
 41 |                      [--crawl-time-before CRAWL_TIME_BEFORE]
 42 |                      [--crawl-start-after CRAWL_START_AFTER]
 43 |                      [--crawl-start-before CRAWL_START_BEFORE]
 44 | 
 45 |         Download WARC files from a WASAPI access point.
 46 | 
 47 |         Acceptable date/time formats are:
 48 |          2017-01-01
 49 |          2017-01-01T12:34:56
 50 |          2017-01-01 12:34:56
 51 |          2017-01-01T12:34:56Z
 52 |          2017-01-01 12:34:56-0700
 53 |          2017
 54 |          2017-01
 55 | 
 56 | optional arguments:
 57 |   -h, --help            show this help message and exit
 58 |   -b BASE_URI, --base-uri BASE_URI
 59 |                         base URI for WASAPI access; default:
 60 |                         https://partner.archive-it.org/wasapi/v1/webdata
 61 |   -d DESTINATION, --destination DESTINATION
 62 |                         location for storing downloaded files
 63 |   -l LOG, --log LOG     file to which logging should be written
 64 |   -n, --no-manifest     do not generate checksum files (ignored when used in
 65 |                         combination with --manifest)
 66 |   -v, --verbose         log verbosely; -v is INFO, -vv is DEBUG
 67 |   --profile PROFILE     profile to use for API authentication
 68 |   -u USER, --user USER  username for API authentication
 69 |   -t TOKEN, --token TOKEN
 70 |                         token for API authentication
 71 |   -c, --count           print number of files for download and exit
 72 |   -m, --manifest        generate checksum files only and exit
 73 |   -p PROCESSES, --processes PROCESSES
 74 |                         number of WARC downloading processes
 75 |   -s, --size            print count and total size of files and exit
 76 |   -r, --urls            list URLs for downloadable files only and exit
 77 | 
 78 | query parameters:
 79 |   parameters for webdata request
 80 | 
 81 |   --collection COLLECTION [COLLECTION ...]
 82 |                         collection identifier
 83 |   --filename FILENAME   exact webdata filename to download
 84 |   --crawl CRAWL         crawl job identifier
 85 |   --crawl-time-after CRAWL_TIME_AFTER
 86 |                         request files created on or after this date/time
 87 |   --crawl-time-before CRAWL_TIME_BEFORE
 88 |                         request files created before this date/time
 89 |   --crawl-start-after CRAWL_START_AFTER
 90 |                         request files from crawl jobs starting on or after
 91 |                         this date/time
 92 |   --crawl-start-before CRAWL_START_BEFORE
 93 |                         request files from crawl jobs starting before this
 94 |                         date/time
 95 | ```
 96 | 
 97 | ## Configuration
 98 | 
 99 | When you are using the tool to query an Archive-It or Webrecorder WASAPI
100 | endpoint, you will need to supply a username and password for the API. You have
101 | three options to provide these credentials.
102 | 
103 | 1. Supply a username with `-u`, and you will be prompted for a password.
104 | 2. Set an environment variable called 'WASAPI_USER' to supply a username
105 | and a variable called 'WASAPI_PASS' to supply a password.
106 | 3. Supply a profile `--profile` defined in a configuration
107 | file. The configuration file should be at `~/.wasapi-client`.
108 | 
109 | An example profile:
110 | 
111 | ```
112 | [unt]
113 | username = exampleUser
114 | password = examplePassword
115 | ```
116 | 
117 | Order of precedence is command line, environment, config file.
118 | 
119 | ## Example Usage
120 | 
121 | The following command downloads the WARC files available from a crawl
122 | with `crawl id` 256119 and logs program output to a file named
123 | `out.log`. The program will prompt the user to enter the password for
124 | user `myusername`. Downloads are carried out by one process.
125 | 
126 | ```
127 |  $ wasapi-client -u myusername --crawl 256119 --log /tmp/out.log -p 1
128 | ```
129 | 
130 | The following command downloads similarly, but user credentials are
131 | supplied by a configuration file.
132 | 
133 | ```
134 |  $ wasapi-client --profile unt --crawl 256119 --log out.log -p 1
135 | ```
136 | 
137 | You may supply an API token instead of user credentials.
138 | 
139 | ```
140 |  $ wasapi-client --token thisistheAPItokenIwasgiven --crawl 256119 --log out.log -p 1
141 | ```
142 | 
143 | The following command downloads the WARC files available from crawls
144 | that occurred in the specified time range. Verbose logging is being
145 | written to a file named out.log. Downloads are happening via four
146 | processes and written to a directory at /tmp/wasapi_warcs/.
147 | 
148 | ```
149 |  $ wasapi-client --profile unt --crawl-start-after 2016-12-22T13:01:00 --crawl-start-before 2016-12-22T15:11:00  -vv --log out.log -p 4 -d /tmp/wasapi_warcs/
150 | 
151 | ```
152 | 
153 | The following command produces the size and file count of all content
154 | available to the user.
155 | 
156 | ```
157 |  $ wasapi-client --profile unt -s 
158 | ```
159 | 
160 | The following command gives the user the number of files available by
161 | the given query parameters.
162 | 
163 | ```
164 |  $ wasapi-client --profile unt --crawl 256119 -c 
165 | ```
166 | 
167 | The following command downloads the file called example.warc.gz to
168 | the current working directory.
169 | 
170 | ```
171 | $ wasapi-client --profile unt --filename example.warc.gz
172 | ```
173 | 
174 | By default, manifest files are generated to provide checksums for the
175 | files to be downloaded. One manifest file is generated for each hash algorithm
176 | provided by the WASAPI access point. The manifest files are written to the
177 | download destination. If you don't want manifest files, use the --no-manifest
178 | flag.
179 | 
180 | ```
181 | $ wasapi-client --profile unt --crawl 256119 --log out.log --no-manifest
182 | ```
183 | 
184 | If you want to generate manifest files for your available webdata files
185 | without actually downloading the webdata files, use the --manifest flag.
186 | 
187 | ```
188 | $ wasapi-client --profile unt --crawl 256119 --manifest
189 | ```
190 | 
191 | If you would like to produce a list of URLs where your webdata files can
192 | later be downloaded by another tool (such as wget) rather than having
193 | wasapi-client do the downloading, use the --urls flag.
194 | 
195 | ```
196 | $ wasapi-client --profile unt --crawl 256119 --urls
197 | ```
198 | 
199 | To use the client with Webrecorder (not all query parameters may be supported),
200 | supply the base URL with -b.
201 | 
202 | ```
203 | $ wasapi-client -b https://webrecorder.io/api/v1/download/webdata --profile webrecorder --collection my_collection -d warcs
204 | ```
205 | 
206 | ## Run the Tests
207 | 
208 | ```
209 | $ python setup.py test
210 | ```
211 | 
212 | or
213 | 
214 | ```
215 | $ pip install tox
216 | $ tox
217 | ```
218 | 


--------------------------------------------------------------------------------
/wasapi_client.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import argparse
  4 | import atexit
  5 | import configparser
  6 | import getpass
  7 | import hashlib
  8 | import logging
  9 | import logging.handlers
 10 | import math
 11 | import multiprocessing
 12 | import os
 13 | import requests
 14 | import re
 15 | import sys
 16 | from collections import defaultdict
 17 | try:
 18 |     from json.decoder import JSONDecodeError
 19 | except ImportError:
 20 |     class JSONDecodeError(ValueError):
 21 |         pass
 22 | from queue import Empty
 23 | from urllib.parse import urlencode
 24 | 
 25 | NAME = 'wasapi_client' if __name__ == '__main__' else __name__
 26 | 
 27 | LOGGER = logging.getLogger(NAME)
 28 | 
 29 | READ_LIMIT = 1024 * 512
 30 | 
 31 | PROFILE_PATH = os.path.join(os.path.expanduser('~'), '.wasapi-client')
 32 | 
 33 | PRE_SIGNED_REGEX = [re.compile(r'https://.*\.s3.amazonaws\.com/.*[?].*Signature=.+')]
 34 | 
 35 | 
 36 | def start_listener_logging(log_q, path=''):
 37 |     formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s')
 38 |     if path:
 39 |         handler = logging.FileHandler(filename=path)
 40 |     else:
 41 |         handler = logging.StreamHandler()
 42 |     handler.setFormatter(formatter)
 43 | 
 44 |     # Get records from the queue and send them to the handler.
 45 |     listener = logging.handlers.QueueListener(log_q, handler)
 46 |     listener.start()
 47 | 
 48 |     return listener
 49 | 
 50 | 
 51 | def configure_main_logging(log_q, log_level=logging.ERROR):
 52 |     """Put a handler on the root logger.
 53 | 
 54 |     This allows handling log records from imported modules.
 55 |     """
 56 |     root = logging.getLogger()
 57 |     root.addHandler(logging.handlers.QueueHandler(log_q))
 58 |     root.setLevel(log_level)
 59 | 
 60 | 
 61 | def configure_worker_logging(log_q, log_level=logging.ERROR):
 62 |     """Configure logging for worker processes."""
 63 |     # Remove any existing handlers.
 64 |     LOGGER.handlers = []
 65 |     # Prevent root logger duplicating messages.
 66 |     LOGGER.propagate = False
 67 |     LOGGER.addHandler(logging.handlers.QueueHandler(log_q))
 68 |     LOGGER.setLevel(log_level)
 69 | 
 70 | 
 71 | class WASAPIDownloadError(Exception):
 72 |     pass
 73 | 
 74 | 
 75 | class WASAPIManifestError(Exception):
 76 |     pass
 77 | 
 78 | 
 79 | def make_session(auth=None, headers={}):
 80 |     """Make a session that will store our auth.
 81 | 
 82 |     `auth` is a tuple of the form (user, password)
 83 |     """
 84 |     session = requests.Session()
 85 |     session.auth = auth
 86 |     session.headers.update(headers)
 87 |     return session
 88 | 
 89 | 
 90 | def get_webdata(webdata_uri, session):
 91 |     """Make a request to the WASAPI."""
 92 |     try:
 93 |         response = session.get(webdata_uri)
 94 |     except requests.exceptions.ConnectionError as err:
 95 |         sys.exit('Could not connect at {}:\n{}'.format(webdata_uri, err))
 96 |     LOGGER.info('requesting {}'.format(webdata_uri))
 97 |     if response.status_code == 403:
 98 |         sys.exit('Verify user/password for {}:\n{} {}'.format(webdata_uri,
 99 |                                                               response.status_code,
100 |                                                               response.reason))
101 |     try:
102 |         return response.json()
103 |     except (JSONDecodeError, ValueError) as err:
104 |         sys.exit('Non-JSON response from {}:\n{}'.format(webdata_uri, err))
105 | 
106 | 
def get_files_count(webdata_uri, auth=None, headers={}):
    """Ask the WASAPI endpoint how many files match the query.

    Returns the value of the response's 'count' field, or None when the
    response has no such field.
    """
    api_session = make_session(auth, headers)
    response_data = get_webdata(webdata_uri, api_session)
    api_session.close()
    return response_data.get('count')
113 | 
114 | 
def get_files_size(page_uri, auth=None, headers={}):
    """Add up the sizes of the files listed on every result page.

    Returns a (count, total) tuple: `count` is the 'count' field of the
    last page requested (0 when no page was requested) and `total` is the
    combined size in bytes.
    """
    api_session = make_session(auth, headers)
    total_bytes = 0
    last_page = None
    next_uri = page_uri
    # Follow the 'next' links until a page no longer provides one.
    while next_uri:
        last_page = get_webdata(next_uri, api_session)
        total_bytes += sum(int(entry['size']) for entry in last_page['files'])
        next_uri = last_page.get('next')
    file_count = last_page.get('count') if last_page else 0
    api_session.close()
    return file_count, total_bytes
130 | 
131 | 
def convert_bytes(size):
    """Make a human readable size.

    `size` is a number of bytes. Returns a string holding the value scaled
    by powers of 1024, rounded to two decimal places, and followed by the
    unit label, e.g. '1.5KB'.

    Sizes below 1024 (including 0, fractions, and negative numbers) are
    reported in bytes. Sizes beyond the largest known unit are reported in
    that unit instead of raising an IndexError.
    """
    labels = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')
    value = size
    unit = 0
    # Stepping through the units by division keeps the index inside the
    # label table and is exact at unit boundaries, unlike flooring a
    # floating point logarithm.
    while value >= 1024 and unit < len(labels) - 1:
        value /= 1024
        unit += 1
    return '{}{}'.format(round(float(value), 2), labels[unit])
142 | 
143 | 
class Downloads:
    """Collects information about every file returned by a webdata query.

    All result pages are requested when the object is created. When
    `download` is True, each file is also placed on `get_q` as a DataFile.
    Checksums are grouped per hash algorithm in `checksums` so manifest
    files can be written, and the first location of every file is kept in
    `urls`.
    """

    def __init__(self, page_uri, auth=None, download=True, destination='',
                 headers={}):
        self.page_uri = page_uri
        self.auth = auth
        self.download = download
        # The queue is only created when files are going to be downloaded.
        if download:
            self.get_q = multiprocessing.JoinableQueue()
        self.checksums = defaultdict(list)
        self.urls = []
        # '.' is stored as '' so joined paths carry no directory prefix.
        if destination == '.':
            destination = ''
        self.destination = destination
        self.headers = headers
        self.populate_downloads()

    def populate_downloads(self):
        """Walk the paginated results and record each file's details."""
        session = make_session(self.auth, self.headers)
        next_uri = self.page_uri
        while next_uri:
            page = get_webdata(next_uri, session)
            for entry in page['files']:
                # Only the first location of a file is listed as its URL.
                self.urls.append(entry['locations'][0])
                local_path = os.path.join(self.destination, entry['filename'])
                for algorithm, digest in entry['checksums'].items():
                    self.checksums[algorithm].append((digest, local_path))
                if not self.download:
                    continue
                queued_file = DataFile(entry['locations'], entry['filename'],
                                       entry['checksums'], entry['size'])
                self.get_q.put(queued_file)
            next_uri = page.get('next')
        session.close()

    def generate_manifests(self):
        """Write one manifest file per hash algorithm that was collected."""
        for algorithm in self.checksums:
            self.write_manifest_file(algorithm)

    def write_manifest_file(self, algorithm):
        """Write the manifest for `algorithm` into the destination.

        Each line holds a checksum, two spaces, and the file's path.
        Raises WASAPIManifestError when nothing was collected for
        `algorithm`.
        """
        if algorithm not in self.checksums:
            raise WASAPIManifestError('No values for {}'.format(algorithm))
        manifest_name = 'manifest-{}.txt'.format(algorithm)
        manifest_path = os.path.join(self.destination, manifest_name)
        entries = self.checksums[algorithm]
        with open(manifest_path, 'w') as manifest_f:
            manifest_f.writelines('{}  {}\n'.format(checksum, path)
                                  for checksum, path in entries)
197 | 
198 | 
class DataFile:
    """Description of one file that can be downloaded.

    Attributes:
        locations: list of URLs
        filename: name of the data file
        checksums: dictionary of hash algorithm/value pairs
        size: size of the file in bytes
        verified: Boolean value indicating a successful checksum verification
    """

    def __init__(self, locations, filename, checksums, size):
        # A newly described file has not had its checksum verified yet.
        self.verified = False
        self.size = size
        self.checksums = checksums
        self.filename = filename
        self.locations = locations
215 | 
216 | 
def download_file(data_file, session, output_path):
    """Download webdata file to disk.

    `data_file` describes the file to fetch, `session` is the session used
    for requests that need auth, and `output_path` is where the content is
    written.

    If a file with the expected size and checksum already exists at
    `output_path`, nothing is downloaded and `data_file.verified` is set to
    True. Otherwise the locations are tried in order until one answers 200
    and its content has been written.

    Returns `data_file`.
    Raises WASAPIDownloadError when no location could be downloaded or the
    content could not be written to disk.
    """
    if check_exists(output_path, data_file.size, data_file.checksums):
        # Don't download the file if it already exists.
        LOGGER.info('{} exists with expected size/checksum'.format(data_file.filename))
        data_file.verified = True
        return data_file
    for location in data_file.locations:
        # If the location matches any 'pre-signed' URL pattern, skip auth for
        # this location by using the plain requests API instead of the
        # session. Checking with any() makes every pattern count, not only
        # the last one in the list.
        pre_signed = any(rx.match(location) for rx in PRE_SIGNED_REGEX)
        sesh = requests if pre_signed else session

        try:
            response = sesh.get(location, stream=True)
        except requests.exceptions.RequestException as err:
            # This could be a remote disconnect, read timeout, connection timeout,
            # temporary name resolution issue...
            LOGGER.error('Error downloading {}:\n{}'.format(location, err))
            continue
        msg = '{}: {} {}'.format(location,
                                 response.status_code,
                                 response.reason)
        if response.status_code == 200:
            try:
                write_file(response, output_path)
            except OSError as err:
                # Writing failed locally; another location would not help.
                LOGGER.error('{}: {}'.format(location, str(err)))
                break
            # Successful download; don't try alternate locations.
            LOGGER.info(msg)
            return data_file
        else:
            LOGGER.error(msg)
    # We didn't download successfully; raise error.
    msg = 'FAILED to download {} from {}'.format(data_file.filename,
                                                 data_file.locations)
    raise WASAPIDownloadError(msg)
259 | 
260 | 
def check_exists(path, size, checksums):
    """Tell whether `path` already holds the expected file.

    The path must be a regular file of exactly `size` bytes; only then is
    its checksum verified against `checksums`.
    """
    size_matches = os.path.isfile(path) and os.path.getsize(path) == size
    if not size_matches:
        return False
    return verify_file(checksums, path)
268 | 
269 | 
def write_file(response, output_path=''):
    """Stream the body of `response` into the file at `output_path`."""
    chunks = response.iter_content(1024*4)
    with open(output_path, 'wb') as wtf:
        # Each chunk is written as it arrives from the response.
        wtf.writelines(chunks)
275 | 
276 | 
def verify_file(checksums, file_path):
    """Check the file at `file_path` against its expected checksums.

    `checksums` maps hash algorithm names to expected values. The first
    algorithm that can be computed decides the outcome: True on a match,
    False on a mismatch. False is also returned when none of the
    algorithms is supported.
    """
    for algorithm, expected in checksums.items():
        chunk_size = READ_LIMIT
        hasher_factory = getattr(hashlib, algorithm, None)
        if not hasher_factory and algorithm == 's3etag':
            if '-' in expected:
                # Likely a 'double-md5'
                # see: https://zihao.me/post/calculating-etag-for-aws-s3-objects/
                hasher_factory = S3DoubleMD5
                # expected chunk size for S3 md5 computation
                chunk_size = 1024 * 1024 * 8
            else:
                # An etag without a '-' is just a regular md5.
                hasher_factory = hashlib.md5

        if not hasher_factory:
            # The hash algorithm provided is not supported by hashlib.
            LOGGER.debug('{} is unsupported'.format(algorithm))
            continue

        actual = calculate_sum(hasher_factory, file_path, chunk_size)
        matched = actual == expected
        if matched:
            LOGGER.info('Checksum success at: {}'.format(file_path))
        else:
            message = 'Checksum {} mismatch for {}: expected {}, got {}'.format(
                algorithm, file_path, expected, actual)
            LOGGER.error(message)
        return matched
    # We didn't find a compatible algorithm.
    return False
315 | 
316 | 
class S3DoubleMD5:
    """Hasher for the double-md5 computation suggested by:

    https://zihao.me/post/calculating-etag-for-aws-s3-objects/

    Every buffer passed to `update` is hashed with MD5 on its own. With
    exactly one buffer the result is that MD5; otherwise it is the MD5 of
    the concatenated digests followed by '-' and the number of buffers.
    """

    def __init__(self):
        self.md5s = []

    def update(self, buff):
        """Hash `buff` as one separate chunk."""
        chunk_md5 = hashlib.md5(buff)
        self.md5s.append(chunk_md5)

    def hexdigest(self):
        """Return the checksum of the chunks seen so far as a string."""
        chunk_count = len(self.md5s)
        if chunk_count == 1:
            (only_md5,) = self.md5s
            return only_md5.hexdigest()

        # Feed every chunk digest, in order, into one final MD5.
        combined = hashlib.md5()
        for chunk_md5 in self.md5s:
            combined.update(chunk_md5.digest())
        return '{}-{}'.format(combined.hexdigest(), chunk_count)
336 | 
337 | 
def calculate_sum(hash_function, file_path, read_limit=READ_LIMIT):
    """Hash the file at `file_path` and return the hex digest.

    `hash_function` is called without arguments to create the hasher, and
    the file is fed to it in chunks of at most `read_limit` bytes.
    """
    hasher = hash_function()
    with open(file_path, 'rb') as rff:
        # The sentinel form of iter() stops at the first empty read (EOF).
        for chunk in iter(lambda: rff.read(read_limit), b''):
            hasher.update(chunk)
    return hasher.hexdigest()
347 | 
348 | 
def convert_queue(tuple_q):
    """Drain a queue of 2-element tuples into a dictionary of lists.

    The first element of each tuple is the key; the second element is
    appended to the list stored under that key.
    """
    grouped = defaultdict(list)
    try:
        # Keep taking items until the queue reports that it is empty.
        while True:
            key, value = tuple_q.get(block=False)
            grouped[key].append(value)
    except Empty:
        pass
    return grouped
363 | 
364 | 
def generate_report(result_q):
    """Build a text summary of the download results on `result_q`.

    The summary gives the number of attempted, successful, and failed
    downloads. Failed filenames are listed only when some downloads
    succeeded and some failed.
    """
    results = convert_queue(result_q)
    succeeded = len(results.get('success', []))
    failed = len(results.get('failure', []))
    attempted = succeeded + failed
    parts = [
        'Total downloads attempted: {}\n'.format(attempted),
        'Successful downloads: {}\n'.format(succeeded),
        'Failed downloads: {}\n'.format(failed),
    ]
    if 0 < failed < attempted:
        parts.append('Failed files (see log for details):\n')
        parts.extend('    {}\n'.format(filename) for filename in results['failure'])
    return ''.join(parts)
379 | 
380 | 
class Downloader(multiprocessing.Process):
    """Process that downloads queued files using one persistent session."""

    def __init__(self, get_q, result_q, log_q, log_level=logging.ERROR,
                 auth=None, destination='.', headers={}, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.get_q = get_q
        self.result_q = result_q
        self.destination = destination
        self.session = make_session(auth, headers)
        configure_worker_logging(log_q, log_level)

    def run(self):
        """Work through the download queue until it is empty.

        Each file taken off `get_q` is downloaded and verified, and a
        ('success' | 'failure', filename) tuple is put on `result_q`.
        """
        while True:
            try:
                data_file = self.get_q.get(block=False)
            except Empty:
                # Nothing left to download.
                return
            output_path = os.path.join(self.destination, data_file.filename)
            succeeded = False
            try:
                data_file = download_file(data_file, self.session, output_path)
            except WASAPIDownloadError as err:
                LOGGER.error(str(err))
            else:
                # A file already marked as verified needs no second check.
                succeeded = (data_file.verified
                             or verify_file(data_file.checksums, output_path))
            outcome = 'success' if succeeded else 'failure'
            self.result_q.put((outcome, data_file.filename))
            self.get_q.task_done()
417 | 
class SetQueryParametersAction(argparse.Action):
    """Collect query parameter options into `namespace.query_params`.

    The dict is keyed by the option string with its leading dashes
    removed and is created the first time one of the options is seen.
    """

    def __call__(self, parser, namespace, values, option_string):
        try:
            params = namespace.query_params
        except AttributeError:
            # First query parameter encountered: start the dict.
            params = {}
            namespace.query_params = params
        key = option_string.lstrip('-')
        params[key] = values
426 | 
427 | 
def _parse_args(args=None):
    """Parse the commandline arguments.

    Arguments:
    args -- list of argument strings to parse. When None (the default),
        argparse reads `sys.argv[1:]` at call time, so the value is not
        frozen at import time.

    Returns the populated `argparse.Namespace`. The `query_params`
    attribute only exists when at least one query parameter option was
    given.
    """
    description = """
        Download WARC files from a WASAPI access point.

        Acceptable date/time formats are:
         2017-01-01
         2017-01-01T12:34:56
         2017-01-01 12:34:56
         2017-01-01T12:34:56Z
         2017-01-01 12:34:56-0700
         2017
         2017-01"""
    try:
        # According to multiprocessing docs, this could fail on some platforms.
        default_processes = multiprocessing.cpu_count()
    except NotImplementedError:
        default_processes = 1
    parser = argparse.ArgumentParser(description=description,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-b',
                        '--base-uri',
                        dest='base_uri',
                        default='https://partner.archive-it.org/wasapi/v1/webdata',
                        help='base URI for WASAPI access; default: '
                             'https://partner.archive-it.org/wasapi/v1/webdata')
    parser.add_argument('-d',
                        '--destination',
                        default='.',
                        help='location for storing downloaded files')
    parser.add_argument('-l',
                        '--log',
                        help='file to which logging should be written')
    parser.add_argument('-n',
                        '--no-manifest',
                        action='store_true',
                        dest='skip_manifest',
                        help='do not generate checksum files (ignored'
                             ' when used in combination with --manifest)')
    parser.add_argument('-v',
                        '--verbose',
                        action='count',
                        default=0,
                        help='log verbosely; -v is INFO, -vv is DEBUG')

    # Only one way of supplying credentials may be chosen.
    auth_group = parser.add_mutually_exclusive_group()
    auth_group.add_argument('--profile',
                            dest='profile',
                            help='profile to use for API authentication')
    auth_group.add_argument('-u',
                            '--user',
                            dest='user',
                            help='username for API authentication')
    auth_group.add_argument('-t',
                            '--token',
                            dest='token',
                            help='token for API authentication')

    # The reporting modes and the download process count exclude each other.
    out_group = parser.add_mutually_exclusive_group()
    out_group.add_argument('-c',
                           '--count',
                           action='store_true',
                           help='print number of files for download and exit')
    out_group.add_argument('-m',
                           '--manifest',
                           action='store_true',
                           help='generate checksum files only and exit')
    out_group.add_argument('-p',
                           '--processes',
                           type=int,
                           default=default_processes,
                           help='number of WARC downloading processes')
    out_group.add_argument('-s',
                           '--size',
                           action='store_true',
                           help='print count and total size of files and exit')
    out_group.add_argument('-r',
                           '--urls',
                           action='store_true',
                           help='list URLs for downloadable files only and exit')

    # Arguments to become part of query parameter string
    param_group = parser.add_argument_group('query parameters',
                                            'parameters for webdata request')
    param_group.add_argument('--collection',
                             action=SetQueryParametersAction,
                             nargs='+',
                             help='collection identifier')
    param_group.add_argument('--filename',
                             action=SetQueryParametersAction,
                             help='exact webdata filename to download')
    param_group.add_argument('--crawl',
                             action=SetQueryParametersAction,
                             help='crawl job identifier')
    param_group.add_argument('--crawl-time-after',
                             action=SetQueryParametersAction,
                             help='request files created on or after this '
                                  'date/time')
    param_group.add_argument('--crawl-time-before',
                             action=SetQueryParametersAction,
                             help='request files created before this date/time')
    param_group.add_argument('--crawl-start-after',
                             action=SetQueryParametersAction,
                             help='request files from crawl jobs starting on '
                                  'or after this date/time')
    param_group.add_argument('--crawl-start-before',
                             action=SetQueryParametersAction,
                             help='request files from crawl jobs starting '
                                  'before this date/time')
    return parser.parse_args(args)
539 | 
540 | 
def get_credentials_env():
    """Read API credentials from WASAPI_USER and WASAPI_PASS.

    Returns a `(user, password)` tuple when both environment variables
    are set, otherwise None.
    """
    user = os.environ.get('WASAPI_USER')
    password = os.environ.get('WASAPI_PASS')
    if user is None or password is None:
        # A partial pair is treated the same as no credentials at all.
        return None
    LOGGER.debug('Using API credentials from environment variables')
    return (user, password)
550 | 
551 | 
def get_credentials_config(profile, path=PROFILE_PATH):
    """Get API credentials from a config file.

    Arguments:
    profile -- name of the config file section to read.
    path -- location of the config file.

    Returns a `(username, password)` tuple taken from the profile's
    section. Exits the program with an explanatory message when the
    file cannot be opened or the section or one of its options is
    missing.
    """
    config = configparser.ConfigParser()
    try:
        # Use a context manager so the file handle is always closed.
        with open(path) as config_file:
            config.read_file(config_file)
        auth = (config.get(profile, 'username'),
                config.get(profile, 'password'))
    except (OSError,
            configparser.NoSectionError,
            configparser.NoOptionError) as err:
        sys.exit('{}: please create config file to supply API credentials with format:\n\n'
                 '[{}]\n'
                 'username = someuser\n'
                 'password = secretpasswd\n'.format(err, profile))
    LOGGER.debug('Using API credentials from {}'.format(path))
    return auth
568 | 
569 | 
def get_credentials(user=None, profile=None):
    """Work out which username/password pair to use, if any.

    A username given on the command line wins (the password is then
    prompted for), followed by the environment, followed by the config
    file profile. Returns None when no source supplies credentials.
    """
    if user:
        # A username was supplied, so ask for the matching password.
        return (user, getpass.getpass())

    # Next preference: credentials from environment variables.
    env_auth = get_credentials_env()
    if env_auth is not None or not profile:
        return env_auth

    # Last resort: credentials from the profile in a config file.
    return get_credentials_config(profile)
585 | 
586 | 
def main():
    """Run the command-line client.

    Parses the command line, starts the logging listener, builds the
    webdata request URI and the authentication data, and then either
    reports (size, count, manifests, URLs) and exits, or downloads the
    files with worker processes and prints a summary report.
    """
    args = _parse_args()

    # With fewer than one worker nothing would consume the download
    # queue and the final get_q.join() would block forever.
    if args.processes < 1:
        msg = 'Number of processes must be at least 1: {}'.format(args.processes)
        sys.exit(msg)

    if (not os.access(args.destination, os.W_OK)
            and not args.size
            and not args.count):
        msg = 'Cannot write to destination: {}'.format(args.destination)
        sys.exit(msg)

    # Start log writing process.
    manager = multiprocessing.Manager()
    log_q = manager.Queue()
    try:
        listener = start_listener_logging(log_q, args.log)
    except OSError as err:
        print('Could not open file for logging:', err)
        sys.exit(1)

    @atexit.register
    def stop_listener_logging():
        """Stop listener when exiting program normally."""
        listener.stop()

    # Configure a logger for the main process.
    try:
        log_level = [logging.ERROR, logging.INFO, logging.DEBUG][args.verbose]
    except IndexError:
        # More than two -v flags still means DEBUG.
        log_level = logging.DEBUG
    configure_main_logging(log_q, log_level)

    # Generate query string for the webdata request.
    try:
        query = '?{}'.format(urlencode(args.query_params, safe=':', doseq=True))
    except AttributeError:
        # Use empty query if user didn't enter any query parameters.
        query = ''
    webdata_uri = '{}{}'.format(args.base_uri, query)

    # Set up authentication.
    auth = None
    headers = {}
    if args.token:
        # Set the HTTP Authentication header.
        headers['Authorization'] = 'Token {}'.format(args.token)
    else:
        # Generate authentication tuple for the API calls.
        auth = get_credentials(args.user, args.profile)

    # If user wants the size, don't download files.
    if args.size:
        count, size = get_files_size(webdata_uri, auth, headers)
        print('Number of Files: ', count)
        print('Size of Files: ', convert_bytes(size))
        sys.exit()

    # If user wants a count, don't download files.
    if args.count:
        print('Number of Files: ', get_files_count(webdata_uri, auth, headers))
        sys.exit()

    # Process webdata requests to generate checksum files.
    if args.manifest:
        downloads = Downloads(webdata_uri, auth, download=False,
                              destination=args.destination, headers=headers)
        downloads.generate_manifests()
        sys.exit()

    # Print the URLs for files that can be downloaded; don't download them.
    if args.urls:
        downloads = Downloads(webdata_uri, auth, download=False,
                              destination=args.destination, headers=headers)
        for url in downloads.urls:
            print(url)
        sys.exit()

    # Process webdata requests to fill webdata file queue.
    downloads = Downloads(webdata_uri, auth, download=True,
                          destination=args.destination, headers=headers)

    # Write manifest file(s).
    if not args.skip_manifest:
        downloads.generate_manifests()

    # Download with multiple processes.
    get_q = downloads.get_q
    result_q = manager.Queue()

    download_processes = []
    try:
        # Don't start more workers than there are files to fetch.
        num_processes = min(args.processes, get_q.qsize())
    except NotImplementedError:
        # qsize() is not available on every platform.
        num_processes = args.processes
    for _ in range(num_processes):
        dp = Downloader(get_q, result_q, log_q, log_level, auth,
                        args.destination, headers=headers)
        dp.start()
        download_processes.append(dp)
    for dp in download_processes:
        dp.join()
    get_q.join()

    print(generate_report(result_q))
689 | 
690 | 
# Run the command-line client only when this file is executed as a script.
if __name__ == '__main__':
    main()
693 | 


--------------------------------------------------------------------------------
/tests/test_wasapi_client.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import hashlib
  4 | import io
  5 | import json
  6 | import multiprocessing
  7 | import os
  8 | import sys
  9 | from collections import OrderedDict
 10 | from logging import INFO
 11 | from unittest.mock import call, mock_open, patch
 12 | 
 13 | import pytest
 14 | import requests
 15 | 
 16 | import wasapi_client as wc
 17 | 
 18 | 
# URL handed to the client functions under test.
WASAPI_URL = 'http://example.com/webdata'

# Sample JSON response listing two files. The split()/join() pair strips
# all whitespace from the literal.
WASAPI_TEXT = "".join("""{
  "count": 2,
  "files": [
    {
      "account": 1,
      "checksums": {
        "md5": "61f818912d1f39bc9dd15d4b87461110",
        "sha1": "edef6bca652d75d0587ef411d5f028335341b074"
      },
      "collection": 7967,
      "crawl": 256123,
      "crawl-start": "2016-12-22T14:07:24Z",
      "crawl-time": "2016-12-22T18:55:12Z",
      "filename": "AIT-JOB256123-00000.warc.gz",
      "filetype": "warc",
      "locations": [
        "https://warcs.example.com/webdatafile/AIT-JOB256123-00000.warc.gz",
        "https://example.com/download/AIT-JOB256123-00000.warc.gz"
      ],
      "size": 943100093
    },
    {
      "account": 1,
      "checksums": {
        "md5": "748120fd9672b22df5942bb44e9cde81",
        "sha1": "54a466421471ef7d8cb4d6bbfb85afd76022a378"
      },
      "collection": 7967,
      "crawl": 256118,
      "crawl-start": "2016-12-22T14:01:53Z",
      "crawl-time": "2016-12-22T14:01:58Z",
      "filename": "ARCHIVEIT-JOB256118-00000.warc.gz",
      "filetype": "warc",
      "locations": [
        "https://warcs.example.com/webdatafile/AIT-JOB256118-00000.warc.gz",
        "https://example.com/download/AIT-JOB256118-00000.warc.gz"
      ],
      "size": 6265488
    }
  ],
  "includes-extra": false,
  "next": null,
  "previous": null,
  "request-url": "https://example.com/wasapi/v1/webdata"
}""".split())


# Sample JSON response that lists no files.
NO_FILES = """{
  "count": 0,
  "files": [],
  "request-url": "https://example.com/wasapi/v1/webdata",
  "includes-extra": false,
  "next": null,
  "previous": null
}"""
 76 | 
 77 | 
 78 | class MockResponse200:
 79 |     """A mocked successful requests GET response from WASAPI."""
 80 | 
 81 |     def __init__(self, text=WASAPI_TEXT):
 82 |         self.status_code = 200
 83 |         self.text = text
 84 |         self.reason = 'OK'
 85 | 
 86 |     def json(self):
 87 |         return json.loads(self.text)
 88 | 
 89 | 
 90 | class MockResponse403:
 91 |     """A mocked unsuccessful requests GET response from WASAPI."""
 92 | 
 93 |     def __init__(self):
 94 |         self.status_code = 403
 95 |         self.reason = 'Forbidden'
 96 | 
 97 | 
 98 | class Test_make_session:
 99 |     def test_make_session_auth(self):
100 |         auth = ('user', 'pass')
101 |         headers = {'Authorization': 'Token lalala'}
102 |         session = wc.make_session(auth, headers)
103 |         assert session.auth == auth
104 |         assert 'Authorization' in session.headers
105 | 
106 |     def test_make_session_no_auth(self):
107 |         session = wc.make_session(None)
108 |         assert session.auth is None
109 | 
110 | 
class Test_get_webdata:
    """Tests for wc.get_webdata."""

    def test_get_webdata(self):
        """A 200 response is returned as the decoded JSON data."""
        session = requests.Session()
        ok_response = MockResponse200()
        with patch.object(session, 'get', return_value=ok_response):
            webdata = wc.get_webdata(WASAPI_URL, session)
        # Serialize again and drop whitespace before comparing.
        serialized = json.dumps(webdata, sort_keys=True)
        assert "".join(serialized.split()) == WASAPI_TEXT

    def test_get_webdata_403_forbidden(self):
        """A 403 response makes the client exit."""
        session = requests.Session()
        forbidden = MockResponse403()
        with patch.object(session, 'get', return_value=forbidden), \
                pytest.raises(SystemExit):
            wc.get_webdata(WASAPI_URL, session)

    def test_get_webdata_ConnectionError(self):
        """A failed host connection makes the client exit."""
        session = requests.Session()
        with patch.object(session, 'get',
                          side_effect=requests.exceptions.ConnectionError), \
                pytest.raises(SystemExit):
            wc.get_webdata(WASAPI_URL, session)

    def test_get_webdata_json_error(self):
        """A 200 response whose body is not JSON makes the client exit."""
        session = requests.Session()
        not_json = MockResponse200('response text is not json')
        with patch.object(session, 'get', return_value=not_json), \
                pytest.raises(SystemExit):
            wc.get_webdata(WASAPI_URL, session)
143 | 
144 | 
@patch('requests.Session')
class Test_Downloads:
    """Tests for wc.Downloads with requests.Session patched out."""

    def test_populate_downloads(self, mock_session):
        """The download queue holds a DataFile for each listed file."""
        mock_session.return_value.get.return_value = MockResponse200()
        populated = wc.Downloads(WASAPI_URL, download=True)
        file_queue = populated.get_q

        # Empty the JoinableQueue so no BrokenPipeError occurs.
        # A cleaner approach may exist...
        for _ in range(2):
            entry = file_queue.get()
            assert isinstance(entry, wc.DataFile)
            file_queue.task_done()
        # Nothing should remain once two items have been taken.
        assert file_queue.empty()

    def test_populate_downloads_multi_page(self, mock_session):
        """Files from every results page are placed on the queue."""
        # Make the first of the two pages point at a following page.
        first_page = WASAPI_TEXT.replace('"next":null', '"next":"http://test?page=2"')
        mock_session.return_value.get.side_effect = [MockResponse200(first_page),
                                                     MockResponse200()]
        populated = wc.Downloads(WASAPI_URL, download=True)
        file_queue = populated.get_q

        # Empty the JoinableQueue so no BrokenPipeError occurs.
        taken = 0
        while taken < 4:
            entry = file_queue.get()
            assert isinstance(entry, wc.DataFile)
            file_queue.task_done()
            taken += 1
        # Nothing should remain once four items have been taken.
        assert file_queue.empty()

    def test_populate_downloads_no_get_q(self, mock_session):
        """With download=False no get_q attribute is created."""
        mock_session.return_value.get.return_value = MockResponse200()
        populated = wc.Downloads(WASAPI_URL, download=False)
        with pytest.raises(AttributeError):
            getattr(populated, 'get_q')

    def test_populate_downloads_urls(self, mock_session):
        """urls holds the first location of each file."""
        mock_session.return_value.get.return_value = MockResponse200()
        populated = wc.Downloads(WASAPI_URL, download=False)
        assert len(populated.urls) == 2
        first_locations = (
            'https://warcs.example.com/webdatafile/AIT-JOB256123-00000.warc.gz',
            'https://warcs.example.com/webdatafile/AIT-JOB256118-00000.warc.gz',
        )
        for location in first_locations:
            assert location in populated.urls

    def test_populate_downloads_manifest(self, mock_session):
        """The checksums dict is filled per algorithm."""
        mock_session.return_value.get.return_value = MockResponse200()
        populated = wc.Downloads(WASAPI_URL, download=False)
        assert len(populated.checksums)
        expected_md5 = [
            ('61f818912d1f39bc9dd15d4b87461110', 'AIT-JOB256123-00000.warc.gz'),
            ('748120fd9672b22df5942bb44e9cde81', 'ARCHIVEIT-JOB256118-00000.warc.gz'),
        ]
        expected_sha1 = [
            ('edef6bca652d75d0587ef411d5f028335341b074', 'AIT-JOB256123-00000.warc.gz'),
            ('54a466421471ef7d8cb4d6bbfb85afd76022a378', 'ARCHIVEIT-JOB256118-00000.warc.gz'),
        ]
        assert populated.checksums['md5'] == expected_md5
        assert populated.checksums['sha1'] == expected_sha1

    def test_populate_downloads_manifest_destination(self, mock_session):
        """The checksums dict entries include the destination path."""
        mock_session.return_value.get.return_value = MockResponse200()
        populated = wc.Downloads(WASAPI_URL, download=False, destination=os.sep + 'tmp')
        assert len(populated.checksums)
        first_path = os.path.normpath('/tmp/AIT-JOB256123-00000.warc.gz')
        second_path = os.path.normpath('/tmp/ARCHIVEIT-JOB256118-00000.warc.gz')
        assert populated.checksums['md5'] == [
            ('61f818912d1f39bc9dd15d4b87461110', first_path),
            ('748120fd9672b22df5942bb44e9cde81', second_path),
        ]
        assert populated.checksums['sha1'] == [
            ('edef6bca652d75d0587ef411d5f028335341b074', first_path),
            ('54a466421471ef7d8cb4d6bbfb85afd76022a378', second_path),
        ]

    def test_populate_downloads_generate_manifest(self, mock_session, tmpdir):
        """A checksum file is written for every algorithm."""
        mock_session.return_value.get.return_value = MockResponse200()
        dest = tmpdir.mkdir('downloads')
        populated = wc.Downloads(WASAPI_URL, download=False, destination=str(dest))
        populated.generate_manifests()
        written = dest.listdir()
        assert len(written) == 2
        for manifest_name in ('manifest-md5.txt', 'manifest-sha1.txt'):
            assert dest.join(manifest_name) in written

    def test_write_manifest_file(self, mock_session, tmpdir):
        """A manifest file is written for the requested algorithm."""
        mock_session.return_value.get.return_value = MockResponse200()
        dest = tmpdir.mkdir('downloads')
        populated = wc.Downloads(WASAPI_URL, download=False, destination=str(dest))
        populated.write_manifest_file('sha1')
        assert len(dest.listdir()) == 1
        template = (
            'edef6bca652d75d0587ef411d5f028335341b074  {p}{s}AIT-JOB256123-00000.warc.gz\n'
            '54a466421471ef7d8cb4d6bbfb85afd76022a378  {p}{s}ARCHIVEIT-JOB256118-00000.warc.gz\n'
        )
        expected = template.format(p=dest, s=os.sep)
        assert dest.join('manifest-sha1.txt').read() == expected

    def test_write_manifest_file_wrong_algorithm(self, mock_session, tmpdir):
        """Asking for an algorithm we have no checksums for raises."""
        mock_session.return_value.get.return_value = MockResponse200()
        dest = tmpdir.mkdir('downloads')
        populated = wc.Downloads(WASAPI_URL, download=False, destination=str(dest))
        with pytest.raises(wc.WASAPIManifestError):
            populated.write_manifest_file('sha2')
261 | 
262 | 
@patch('requests.Session')
class Test_get_files_count:
    """Tests for wc.get_files_count with requests.Session patched out."""

    def test_get_files_count(self, mock_session):
        """The count from the response is returned."""
        mock_session.return_value.get.return_value = MockResponse200()
        assert wc.get_files_count(WASAPI_URL) == 2
269 | 
270 | 
@patch('requests.Session')
class Test_get_files_size:
    """Tests for wc.get_files_size with requests.Session patched out."""

    def test_get_files_size(self, mock_session):
        """Count and summed size are reported for a single page."""
        mock_session.return_value.get.return_value = MockResponse200()
        result = wc.get_files_size(WASAPI_URL)
        file_count, total_size = result
        assert file_count == 2
        assert total_size == 949365581

    def test_get_files_size_multi_page(self, mock_session):
        """Sizes are summed over every results page."""
        # Make the first of the two pages point at a following page.
        next_link = '"next":"{}?page=2"'.format(WASAPI_URL)
        first_page = WASAPI_TEXT.replace('"next":null', next_link)
        # `count` is read from the last page, although in practice it
        # should be identical on every page.
        last_page = WASAPI_TEXT.replace('"count":2', '"count":4')
        mock_session.return_value.get.side_effect = [MockResponse200(first_page),
                                                     MockResponse200(last_page)]
        file_count, total_size = wc.get_files_size(WASAPI_URL)
        assert file_count == 4
        assert total_size == 949365581 * 2

    def test_get_files_size_no_files(self, mock_session):
        """An empty listing gives a zero count and a zero size."""
        mock_session.return_value.get.return_value = MockResponse200(NO_FILES)
        file_count, total_size = wc.get_files_size(WASAPI_URL)
        assert file_count == 0
        assert total_size == 0
297 | 
298 | 
class Test_convert_bytes:
    """Tests for wc.convert_bytes."""

    @pytest.mark.parametrize('size, expected', [
        (0, '0.0B'),
        (1023, '1023.0B'),
        (1024, '1.0KB'),
        (1024000, '1000.0KB'),
        (1048576, '1.0MB'),
        (1073741824, '1.0GB'),
        (1099511628000, '1.0TB'),
    ])
    def test_convert_bytes(self, size, expected):
        """Each byte count is rendered with the expected unit suffix."""
        rendered = wc.convert_bytes(size)
        assert rendered == expected
311 | 
312 | 
class Test_download_file:
    """Tests for wc.download_file."""

    # Shared description of the file the tests try to download.
    locations = ['http://loc1/blah.warc.gz', 'http://loc2/blah.warc.gz']
    filename = 'blah.warc.gz'
    checksums = {'sha1': '33304d104f95d826da40079bad2400dc4d005403',
                 'md5': '62f87a969af0dd857ecd6c3e7fde6aed'}
    size = 12345678
    data_file = wc.DataFile(locations, filename, checksums, size)

    def test_download_file_200(self):
        """A 200 response from the first location is written to disk."""
        session = requests.Session()
        ok_response = MockResponse200('')

        with patch.object(session, 'get', return_value=ok_response) as mock_get:
            with patch('wasapi_client.write_file') as mock_write_file:
                file_data = wc.download_file(self.data_file, session, self.filename)

        # Downloading stops as soon as one location succeeds.
        mock_get.assert_called_once_with(self.locations[0], stream=True)
        mock_write_file.assert_called_once_with(ok_response, self.filename)
        assert not file_data.verified

    def test_download_file_not_200(self):
        """Failing responses from every location raise a download error."""
        session = requests.Session()
        forbidden = MockResponse403()

        with patch.object(session, 'get', return_value=forbidden) as mock_get:
            with pytest.raises(wc.WASAPIDownloadError) as err:
                wc.download_file(self.data_file, session, self.filename)
        message = err.value.args[0]
        for fragment in (str(self.locations), self.filename):
            assert fragment in message
        # Every location must have been attempted.
        expected_calls = [call(location, stream=True)
                          for location in self.locations[:2]]
        mock_get.assert_has_calls(expected_calls)

    def test_download_get_raises_some_RequestException(self, caplog):
        """A requests error on one location falls through to the next."""
        caplog.set_level(INFO)
        session = requests.Session()
        ok_response = MockResponse200('')

        with patch.object(session, 'get') as mock_get:
            with patch('wasapi_client.write_file') as mock_write_file:
                # First attempt raises a RequestException subclass; the
                # second attempt returns a successful response.
                mock_get.side_effect = [requests.exceptions.ConnectionError(),
                                        ok_response]
                wc.download_file(self.data_file, session, self.filename)

        # Every location must have been attempted.
        expected_calls = [call(self.locations[0], stream=True),
                          call(self.locations[1], stream=True)]
        mock_get.assert_has_calls(expected_calls)
        mock_write_file.assert_called_once_with(ok_response, self.filename)
        # The requests exception must have been caught and logged.
        expected_messages = ('Error downloading http://loc1/blah.warc.gz:',
                             'http://loc2/blah.warc.gz: 200 OK')
        for expected_message in expected_messages:
            assert expected_message in caplog.text

    def test_download_file_OSError(self):
        """An OSError while writing the file raises a download error."""
        session = requests.Session()
        ok_response = MockResponse200('')

        with patch.object(session, 'get', return_value=ok_response) as mock_get:
            with patch('wasapi_client.write_file') as mock_write_file:
                mock_write_file.side_effect = OSError
                with pytest.raises(wc.WASAPIDownloadError) as err:
                    wc.download_file(self.data_file, session, self.filename)

        message = err.value.args[0]
        for fragment in (str(self.locations), self.filename):
            assert fragment in message
        # Only the first location was requested before the write failed.
        mock_get.assert_called_once_with(self.locations[0], stream=True)
        mock_write_file.assert_called_once_with(ok_response, self.filename)

    def test_download_check_exists_true(self):
        """A file already present on the filesystem is not downloaded."""
        with patch('wasapi_client.check_exists', return_value=True):
            with patch('requests.Session', autospec=True) as mock_session:
                file_data = wc.download_file(self.data_file, mock_session, self.filename)
        # `verified` must have been set True on the DataFile instance.
        assert file_data.verified
        # No GET request may have been issued.
        assert not mock_session.get.called

    def test_download_uses_pre_signed_url(self):
        """An s3 URL is fetched with requests.get rather than the session."""
        locations = ['https://data.s3.amazonaws.com/warcs/blah.warc.gz?Signature=xyz',
                     'http://loc2/blah.warc.gz']
        filename = 'blah.warc.gz'
        checksums = {'md5': '72b484a2610cb54ec22e48c8104ba3bd'}
        signed_file = wc.DataFile(locations, filename, checksums, 123456)
        ok_response = MockResponse200('')

        with patch('requests.get', return_value=ok_response) as mock_get:
            with patch('wasapi_client.write_file') as mock_write_file:
                wc.download_file(signed_file, requests.Session(), filename)

        # Exactly one download went through requests.get and was written.
        mock_get.assert_called_once_with(locations[0], stream=True)
        mock_write_file.assert_called_once_with(ok_response, filename)
413 | 
414 | 
class Test_check_exists:
    """Exercise wc.check_exists with the filesystem and checksum helpers mocked."""

    @patch('wasapi_client.verify_file', return_value=True)
    @patch('os.path.getsize', return_value=123456)
    @patch('os.path.isfile', return_value=True)
    def test_check_exists_return_true(self, mock_isfile, mock_getsize, mock_verify):
        """File present, size matches, checksum verifies."""
        expected_sums = {'sha1': '33304d104f95d826da40079bad2400dc4d005403'}
        assert wc.check_exists('path', 123456, expected_sums)
        mock_verify.assert_called_once_with(expected_sums, 'path')

    def test_check_exists_no_file(self):
        """A missing file short-circuits before the size is inspected."""
        with patch('os.path.getsize') as mock_getsize, \
                patch('os.path.isfile', return_value=False) as mock_isfile:
            assert not wc.check_exists('path', 123456, {})
            mock_isfile.assert_called_once_with('path')
            assert not mock_getsize.called

    def test_check_exists_file_size_mismatch(self):
        """A size mismatch short-circuits before the checksum is verified."""
        with patch('wasapi_client.verify_file') as mock_verify, \
                patch('os.path.getsize', return_value=123456) as mock_getsize, \
                patch('os.path.isfile', return_value=True) as mock_isfile:
            assert not wc.check_exists('path', 789, {})
            mock_isfile.assert_called_once_with('path')
            mock_getsize.assert_called_once_with('path')
            assert not mock_verify.called

    @patch('wasapi_client.verify_file', return_value=False)
    @patch('os.path.getsize', return_value=123456)
    @patch('os.path.isfile', return_value=True)
    def test_check_exists_checksum_fail(self, mock_isfile, mock_getsize, mock_verify):
        """File present with matching size, but checksum verification fails."""
        assert not wc.check_exists('path', 123456, {})
        mock_verify.assert_called_once_with({}, 'path')
446 | 
447 | 
class Test_verify_file:
    """Exercise wc.verify_file with checksum calculation mocked out."""

    def test_verify_file(self):
        """Test a matching checksum returns True."""
        digest = '33304d104f95d826da40079bad2400dc4d005403'
        with patch('wasapi_client.calculate_sum', return_value=digest):
            assert wc.verify_file({'sha1': digest}, 'dummy/path')

    def test_verify_file_unsupported_algorithm(self):
        """Test all algorithms being unsupported returns False."""
        unsupported_only = {'shaq1': 'shaq1algorithmdoesnotexist'}
        verified = wc.verify_file(unsupported_only, 'dummy/path')
        assert not verified

    def test_verify_file_checksum_mismatch(self):
        """Test calculated checksum does not match the expected."""
        algorithm = 'sha1'
        path = 'dummy/path'
        digest = '33304d104f95d826da40079bad2400dc4d005403'
        template = 'Checksum {} mismatch for {}: expected {}, got {}notmatching'
        expected_msg = template.format(algorithm, path, digest, digest)
        with patch('wasapi_client.calculate_sum', return_value=digest + 'notmatching'), \
                patch('wasapi_client.LOGGER', autospec=True) as mock_logger:
            assert not wc.verify_file({algorithm: digest}, path)
        mock_logger.error.assert_called_once_with(expected_msg)

    def test_verify_file_one_supported_algorithm(self):
        """Test one unsupported/one supported algorithm returns True."""
        digest = '33304d104f95d826da40079bad2400dc4d005403'
        # Ordered so the unsupported algorithm is encountered first.
        sums = OrderedDict()
        sums['abc'] = 'algorithm_unsupported'
        sums['sha1'] = digest
        with patch('wasapi_client.calculate_sum', return_value=digest), \
                patch('wasapi_client.LOGGER', autospec=True) as mock_logger:
            assert wc.verify_file(sums, 'dummy/path')
        # The unsupported algorithm was attempted and reported before success.
        mock_logger.debug.assert_called_once_with('abc is unsupported')
        mock_logger.info.assert_called_once_with('Checksum success at: dummy/path')

    def test_verify_file_s3etag_algorithm_regular_md5(self):
        """An s3etag value without a '-' is checked with plain md5."""
        etag = '72b484a2610cb54ec22e48c8104ba3bd'
        with patch('wasapi_client.calculate_sum', return_value=etag) as mock_calc_sum:
            assert wc.verify_file({'s3etag': etag}, 'dummy/path')
        # Verify the hash_function used was md5.
        mock_calc_sum.assert_called_once_with(hashlib.md5, 'dummy/path', wc.READ_LIMIT)

    def test_verify_file_s3etag_algorithm_double_md5(self):
        """An s3etag value containing a '-' uses S3DoubleMD5 and a custom read limit."""
        etag = 'ceb8853ddc5086cc4ab9e149f8f09c88-2'
        eight_mebibytes = 1024*1024*8
        with patch('wasapi_client.calculate_sum', return_value=etag) as mock_calc_sum:
            assert wc.verify_file({'s3etag': etag}, 'dummy/path')
        mock_calc_sum.assert_called_once_with(wc.S3DoubleMD5, 'dummy/path', eight_mebibytes)
508 | 
509 | 
class Test_S3DoubleMD5:
    """Exercise the hexdigest produced by wc.S3DoubleMD5."""

    def test_S3DoubleMD5_single_md5(self):
        """A single update yields the same hexdigest as plain md5."""
        payload = b'We are updating this once.'
        plain_md5 = hashlib.md5(payload).hexdigest()
        hasher = wc.S3DoubleMD5()
        hasher.update(payload)
        # One update call means exactly one entry in hasher.md5s.
        assert len(hasher.md5s) == 1
        assert hasher.hexdigest() == plain_md5

    def test_S3DoubleMD5_double_md5(self):
        """Three updates yield a digest-of-digests suffixed with '-3'."""
        pieces = (b'We are updating this once.', b'Twice.', b'And three times.')
        hasher = wc.S3DoubleMD5()
        # Feed each piece separately so update runs three times.
        for piece in pieces:
            hasher.update(piece)
        # The expected value is the hexdigest of the three concatenated
        # digests, followed by '-3' for the number of digests combined.
        assert len(hasher.md5s) == 3
        assert hasher.hexdigest() == '8e73850eb35bebe8ebd2896dd9032e48-3'
531 | 
532 | 
class Test_calculate_sum:
    """Exercise wc.calculate_sum against an in-memory file."""

    @pytest.mark.skipif(
        sys.version_info < (3, 4, 4),
        reason='bug via mock_open https://github.com/python/cpython/commit/86b34d'
    )
    def test_calculate_sum(self):
        """The returned checksum equals the sha1 hexdigest of the file contents."""
        payload = b'data from file'
        expected = hashlib.sha1(payload).hexdigest()
        fake_open = mock_open(read_data=payload)
        with patch('builtins.open', fake_open):
            actual = wc.calculate_sum(hashlib.sha1, 'dummy/path')
        assert actual == expected
541 | 
542 | 
class Test_convert_queue:
    """Exercise wc.convert_queue with a manager-backed queue."""

    def test_convert_queue(self):
        """Queued (key, value) tuples are grouped into lists keyed by status."""
        # Using the manager as a context manager shuts its server process
        # down even when an assertion below fails; a trailing explicit
        # shutdown() call would be skipped in that case.
        with multiprocessing.Manager() as m:
            q = m.Queue()
            q.put(('success', 'name1'))
            q.put(('failure', 'name2'))
            dict_from_q = wc.convert_queue(q)
            assert dict_from_q['success'] == ['name1']
            assert dict_from_q['failure'] == ['name2']
553 | 
554 | 
class Test_generate_report:
    """Exercise wc.generate_report for the success/failure combinations."""

    @staticmethod
    def _report_for(*results):
        """Return the report generated from a queue holding `results`.

        Each item in `results` is a (status, filename) tuple. The manager
        is used as a context manager so its server process is shut down
        even if generating the report raises.
        """
        with multiprocessing.Manager() as manager:
            result_q = manager.Queue()
            for result in results:
                result_q.put(result)
            return wc.generate_report(result_q)

    def test_generate_report_all_success(self):
        """Two successes produce only the three summary lines."""
        report = self._report_for(('success', 'name1'), ('success', 'name2'))
        assert report == ('Total downloads attempted: 2\n'
                          'Successful downloads: 2\n'
                          'Failed downloads: 0\n')

    def test_generate_report_one_failure(self):
        """A mix of success and failure also lists the failed file."""
        report = self._report_for(('success', 'name1'), ('failure', 'name2'))
        assert report == ('Total downloads attempted: 2\n'
                          'Successful downloads: 1\n'
                          'Failed downloads: 1\n'
                          'Failed files (see log for details):\n'
                          '    name2\n')

    def test_generate_report_all_failure(self):
        """When every download failed, the expected report has no file listing."""
        report = self._report_for(('failure', 'name1'), ('failure', 'name2'))
        assert report == ('Total downloads attempted: 2\n'
                          'Successful downloads: 0\n'
                          'Failed downloads: 2\n')
590 | 
591 | 
class TestDownloader:
    """Tests for wc.Downloader.run using real multiprocessing queues.

    The class attributes below describe a single file; the same
    `data_file` instance is queued twice in every test.
    """
    locations = ['http://loc1/blah.warc.gz', 'http://loc2/blah.warc.gz']
    filename = 'blah.warc.gz'
    checksums = {'sha1': '33304d104f95d826da40079bad2400dc4d005403',
                 'md5': '62f87a969af0dd857ecd6c3e7fde6aed'}
    size = 12345678
    data_file = wc.DataFile(locations, filename, checksums, size)

    def test_run(self):
        """Test downloader when downloads are successful."""
        # Create a queue holding two sets of file data.
        get_q = multiprocessing.JoinableQueue()
        for _ in (1, 2):
            get_q.put(self.data_file)
        manager = multiprocessing.Manager()
        result_q = manager.Queue()
        log_q = manager.Queue()
        # Both the checksum verification and the download itself are mocked,
        # so no network or filesystem access is expected from this process.
        with patch('wasapi_client.verify_file', return_value=True), \
                patch('wasapi_client.download_file', return_value=self.data_file):
            p = wc.Downloader(get_q, result_q, log_q)
            # NOTE(review): both start() and run() are invoked; presumably
            # Downloader is a multiprocessing.Process subclass, so the queue
            # may be drained by a child process as well as in-process —
            # confirm this is intended.
            p.start()
            p.run()
        # If the join doesn't block, the queue is fully processed.
        get_q.join()
        # Verify there is nothing on the log_q.
        assert log_q.empty()
        for _ in (1, 2):
            assert result_q.get() == ('success', self.filename)
        # Verify those were the only two results on the result_q.
        assert result_q.empty()

    @patch('wasapi_client.download_file')
    def test_run_WASAPIDownloadError(self, mock_download):
        """Test downloader when downloads fail."""
        expected_error = 'WD Error'
        # Every download attempt raises, so each queued file should be
        # reported as a failure and its error message logged.
        mock_download.side_effect = wc.WASAPIDownloadError(expected_error)
        # Create a queue holding two sets of file data.
        get_q = multiprocessing.JoinableQueue()
        for _ in (1, 2):
            get_q.put(self.data_file)
        manager = multiprocessing.Manager()
        result_q = manager.Queue()
        log_q = manager.Queue()
        p = wc.Downloader(get_q, result_q, log_q)
        p.start()
        p.run()
        # If the join doesn't block, the queue is fully processed.
        get_q.join()
        for _ in (1, 2):
            # Items taken from log_q expose the logged text as `.msg`.
            assert log_q.get().msg == expected_error
            assert result_q.get() == ('failure', self.filename)
        # Verify those were the only two results on the result_q.
        # Sometimes `empty` needs a moment to register.
        # NOTE(review): no wait is performed here before checking `empty`.
        assert result_q.empty()

    def test_run_file_already_verified(self):
        """Test a downloaded file is not verified twice."""
        # A separate DataFile marked as verified stands in for what
        # download_file returns, leaving the shared class-level
        # `data_file` untouched.
        return_data_file = wc.DataFile(self.locations, self.filename, self.checksums, self.size)
        return_data_file.verified = True
        # Create a queue holding two sets of file data.
        get_q = multiprocessing.JoinableQueue()
        for _ in (1, 2):
            get_q.put(self.data_file)
        manager = multiprocessing.Manager()
        result_q = manager.Queue()
        log_q = manager.Queue()
        with patch('wasapi_client.verify_file', return_value=True) as mock_verify, \
                patch('wasapi_client.download_file', return_value=return_data_file):
            p = wc.Downloader(get_q, result_q, log_q)
            p.start()
            p.run()
        # If the join doesn't block, the queue is fully processed.
        get_q.join()
        assert log_q.empty()
        for _ in (1, 2):
            assert result_q.get() == ('success', self.filename)
        assert result_q.empty()
        # Check verify_file was not called, since the file returned by
        # `download_file` was already marked as verified.
        assert not mock_verify.called
671 | 
672 | 
class Test_parse_args:
    """Exercise wc._parse_args."""

    def test_default_processes(self):
        """Test handling of cpu_count() erroring.

        Could happen when cpu_count isn't implemented on a platform
        and --processes isn't specified by the user.
        """
        cpu_count_patch = patch('wasapi_client.multiprocessing.cpu_count',
                                side_effect=NotImplementedError)
        with cpu_count_patch:
            parsed = wc._parse_args(['--crawl', '12'])
        assert parsed.processes == 1

    def test_SetQueryParametersAction(self):
        """Test that arguments passed with this action are in query_params."""
        after = '2016-12-22T13:01:00'
        before = '2016-12-22T15:11:00'
        argv = ['--crawl-start-after', after, '--crawl-start-before', before, '-c']
        params = wc._parse_args(argv).query_params
        assert len(params) == 2
        assert params['crawl-start-after'] == after
        assert params['crawl-start-before'] == before

    def test_SetQueryParametersAction_multiple_collections(self):
        """Test multiple collections end up in query_params.

        A query can have multiple collections, so test that the
        user can supply multiple values.
        """
        argv = ['--collection', '12345', '98', '--crawl', '12']
        params = wc._parse_args(argv).query_params
        assert len(params) == 2
        assert params['collection'] == ['12345', '98']
705 | 
706 | 
class Test_get_credentials_env:
    """Exercise wc.get_credentials_env."""

    def test_get_credentials_env(self):
        """Test auth credentials are set from environment variables."""
        fake_env = {'WASAPI_USER': 'me', 'WASAPI_PASS': 'p@ss123'}
        with patch.dict('os.environ', fake_env):
            creds = wc.get_credentials_env()
        assert creds == ('me', 'p@ss123')

    def test_get_credentials_env_missing_one_env_var(self):
        """Test a None value for username or password causes no auth."""
        # Successive lookups return a username and then None.
        with patch('os.environ.get', side_effect=['me', None]):
            creds = wc.get_credentials_env()
        assert creds is None
720 | 
721 | 
class Test_get_credentials_config:
    """Exercise wc.get_credentials_config with `open` mocked to return a stream."""

    def test_get_credentials_config(self):
        """Test auth can be populated from a config file."""
        config = io.StringIO('[unt]\nusername = me\npassword = p@ss123')
        open_patch = patch('builtins.open', return_value=config)
        with open_patch:
            creds = wc.get_credentials_config('unt')
        assert creds == ('me', 'p@ss123')

    def test_get_credentials_config_missing_profile(self):
        """Test program exits if the profile supplied doesn't exist."""
        config = io.StringIO('[unt]\nusername = me\npassword = p@ss123')
        with patch('builtins.open', return_value=config):
            with pytest.raises(SystemExit):
                wc.get_credentials_config('home')

    def test_get_credentials_config_missing_password(self):
        """Test program exits if config does not supply an expected option."""
        config = io.StringIO('[unt]\nusername = me')
        with patch('builtins.open', return_value=config):
            with pytest.raises(SystemExit):
                wc.get_credentials_config('unt')
743 | 
744 | 
class Test_get_credentials:
    """Exercise wc.get_credentials and which credential source it consults."""

    def test_get_credentials_from_getpass(self):
        """Supplying a user obtains the password via getpass."""
        with patch('getpass.getpass', return_value='p@ss123') as mock_getpass:
            creds = wc.get_credentials(user='me')
        assert creds == ('me', 'p@ss123')
        mock_getpass.assert_called_once_with()

    def test_get_credentials_from_env(self):
        """With no arguments, credentials found in the environment are returned."""
        env_patch = patch('wasapi_client.get_credentials_env', return_value=('me', 'p@ss123'))
        with env_patch as mock_gce:
            creds = wc.get_credentials()
        assert creds == ('me', 'p@ss123')
        mock_gce.assert_called_once_with()

    def test_get_credentials_from_config(self):
        """With a profile and no env credentials, the config file is used."""
        config_patch = patch('wasapi_client.get_credentials_config',
                             return_value=('me', 'p@ss123'))
        env_patch = patch('wasapi_client.get_credentials_env', return_value=None)
        with config_patch as mock_gcc, env_patch as mock_gce:
            creds = wc.get_credentials(profile='unt')
        assert creds == ('me', 'p@ss123')
        mock_gcc.assert_called_once_with('unt')
        mock_gce.assert_called_once_with()

    def test_get_credentials_no_credentials_provided(self):
        """Test if no user/profile is provided and no valid config file exists."""
        config_patch = patch('wasapi_client.get_credentials_config')
        env_patch = patch('wasapi_client.get_credentials_env', return_value=None)
        with config_patch as mock_gcc, env_patch as mock_gce:
            creds = wc.get_credentials()
        assert creds is None
        assert not mock_gcc.called
        mock_gce.assert_called_once_with()
774 | 


--------------------------------------------------------------------------------