├── .gitignore ├── .travis.yml ├── MANIFEST.in ├── README.md ├── requirements.txt ├── resumable └── __init__.py ├── setup.cfg ├── setup.py ├── test ├── test_resumable.py ├── trust.pdf └── trust.pdf.partial └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | dist 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | python: 4 | - 3.4 5 | env: 6 | global: 7 | - HOME=/home/travis 8 | - WORKON_HOME=$HOME/.pew 9 | matrix: 10 | - TOXENV="py34" PYTHON=3.4.6 11 | - TOXENV="py35" PYTHON=3.5.3 12 | - TOXENV="py36" PYTHON=3.6.0 13 | 14 | install: 15 | - pip install pew[pythonz] 16 | - mkdir $WORKON_HOME 17 | - pew install $PYTHON 18 | - pew new -d venv --python=$(pew locate_python $PYTHON) -i tox -i mypy-lang 19 | script: pew in venv tox -e $TOXENV 20 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include tox.ini 2 | include test/trust* 3 | include README.md 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Resumable urlretrieve 2 | ======================== 3 | 4 | [![PyPi version](http://img.shields.io/pypi/v/resumable-urlretrieve.svg)](https://pypi.python.org/pypi/resumable-urlretrieve) 5 | [![Build Status](https://travis-ci.org/berdario/resumable-urlretrieve.png)](https://travis-ci.org/berdario/resumable-urlretrieve) 6 | 7 | This is a drop-in replacement for `urllib.request.urlretrieve` that will automatically resume a partially-downloaded file (if the remote HTTP server supports `Range` requests). 
8 | 9 | def urlretrieve(url: str, filename: Union[str, Path], reporthook=None, method='GET', 10 | sha256sum=None, filesize=None, headers=None, 11 | **kwargs) -> Dict[str, str] 12 | 13 | There are only a few differences: 14 | 15 | - The `filename` argument is not optional 16 | - It returns the headers of the HTTP response (or `None` when the file was already complete and no request was made) 17 | - It will raise `resumable.DownloadError` if needed 18 | - It relies on `requests`, and can thus accept a `headers` dictionary, or an `auth` argument 19 | 20 | The `sha256sum` and `filesize` arguments will be used (if supplied) to check the downloaded file against them, and to avoid making another HTTP request if the download has already been completed (otherwise it relies on the `Content-Length` and `Content-Range` headers returned by the server). 21 | 22 | Tested on Python >= 3.4. 23 | 24 | License 25 | ======= 26 | 27 | Permission is hereby granted, free of charge, to any person obtaining a copy 28 | of this software and associated documentation files (the "Software"), to deal 29 | in the Software without restriction, including without limitation the rights 30 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 31 | copies of the Software, and to permit persons to whom the Software is 32 | furnished to do so, subject to the following conditions: 33 | 34 | The above copyright notice and this permission notice shall be included in 35 | all copies or substantial portions of the Software. 36 | 37 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 38 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 39 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 40 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 41 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 42 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 43 | THE SOFTWARE. 
'''Resumable replacement for ``urllib.request.urlretrieve`` built on ``requests``.'''
import requests  # type: ignore
from contextlib import closing  # type: ignore
from re import fullmatch  # type: ignore
from enum import Enum
import hashlib
from typing import Union, Dict, Optional, Tuple, NamedTuple, Any
from pathlib import Path
from logging import getLogger

path = Union[str, Path]

log = getLogger(__name__)

# Outcome of comparing the local file against the expected checksum/size.
DownloadCheck = Enum('DownloadCheck', 'completed partial checksum_mismatch size_mismatch')  # type: ignore


class DownloadError(Exception):
    '''Raised for HTTP errors, malformed Content-Range headers and failed size/checksum checks.'''


# Parsed ``Content-Range: bytes <start>-<end>/<total>`` header;
# ``total`` is None when the server sent ``*`` (unknown length).
ContentRange = NamedTuple('ContentRange', [('start', int),
                                           ('end', int),
                                           ('total', Optional[int])])


def sha256(filename: path) -> str:
    '''Return the hex SHA-256 digest of the file at `filename`, read in 512 KiB chunks.'''
    sha = hashlib.sha256()
    chunksize = 524288

    with Path(filename).open('rb') as f:
        data = f.read(chunksize)
        while data:
            sha.update(data)
            data = f.read(chunksize)
    return sha.hexdigest()


def is_download_complete(filename: Path, sha256sum: Optional[str],
                         filesize: Optional[int]) -> Any:
    '''Classify the local file against the expected checksum or size.

    `sha256sum` takes precedence over `filesize` when both are given.
    Returns a `DownloadCheck` member: `completed` on a match,
    `checksum_mismatch` / `size_mismatch` on a failed comparison, and
    `partial` when nothing can be verified (no expectation supplied, or the
    file does not exist yet).
    '''
    D = DownloadCheck  # type: Any
    try:
        if sha256sum is not None:
            return D.completed if sha256(filename) == sha256sum else D.checksum_mismatch
        if filesize is not None:
            return D.completed if filename.stat().st_size == filesize else D.size_mismatch
        return D.partial
    except FileNotFoundError:
        # Only a missing file means "not downloaded yet"; other errors
        # (including programming errors) must surface.
        return D.partial


def parse_byte_range(content_range: str) -> ContentRange:
    '''Parse a ``bytes <start>-<end>/<total|*>`` Content-Range value.

    Raises `DownloadError` when the value does not have that exact shape.
    '''
    match = fullmatch(r'bytes (\d+)-(\d+)/(\d+|\*)', content_range)
    if match is None:
        raise DownloadError('Invalid Content-Range', content_range)
    start, end, total = match.groups()
    return ContentRange(int(start), int(end),
                        None if total == '*' else int(total))


def get_resource_size(headers: Dict[str, str]) -> Optional[int]:
    '''Return the full size of the remote resource, if the headers reveal it.

    `Content-Range` is preferred over `Content-Length`, because for a partial
    response the latter only describes the bytes in this response. Returns
    None when neither header gives a size. Raises `DownloadError` for a
    malformed `Content-Range`.
    '''
    cr = headers.get('Content-Range')
    if cr is not None:
        # A 416 reply carries the unsatisfied-range form ``bytes */<total>``,
        # which has no start/end and so cannot be a ContentRange.
        unsatisfied = fullmatch(r'bytes \*/(\d+)', cr)
        if unsatisfied is not None:
            return int(unsatisfied.group(1))
        return parse_byte_range(cr).total
    cl = headers.get('Content-Length')
    return int(cl) if cl else None


def starting_range(resp, filesize: Optional[int]) -> int:
    '''Find starting index from Content-Range, if any. Warn about problematic ranges.

    Returns 0 unless `resp` is a 206 response carrying a Content-Range header.
    '''
    if resp.status_code == 206 and 'Content-Range' in resp.headers:
        cr = parse_byte_range(resp.headers['Content-Range'])
        if filesize and filesize != cr.start:
            log.warning('The download is not resuming exactly where it ended')
        if cr.total and cr.end != cr.total - 1:
            log.warning("The download won't fetch the whole file,"
                        " you might want to run urlretrieve again")
        return cr.start
    return 0


def write_response(resp, filename: Path, reporthook,
                   size: Optional[int], remote_size: Optional[int]):
    '''Write the body of `resp` into `filename`, at the offset the server chose.

    Nothing is written when the local `size` already equals `remote_size`.
    `reporthook`, if given, is called after each written chunk with
    (chunk index, chunk size, total size or -1 when unknown).
    '''
    if size is not None and size == remote_size:
        return
    # 'xb' rather than 'wb': never clobber a file that appeared in the meantime.
    mode = 'r+b' if filename.exists() else 'xb'
    with filename.open(mode) as f:
        start = starting_range(resp, size)
        f.seek(start)
        if resp.status_code != 206:
            # The server sent the whole resource rather than a range: drop the
            # old content so no stale bytes survive past the new end of file.
            f.truncate()
        chunk_size = 16384
        chunks = resp.iter_content(chunk_size=chunk_size)
        for i, chunk in enumerate(chunks, start // chunk_size):
            if chunk:
                f.write(chunk)
                if reporthook:
                    reporthook(i, chunk_size, remote_size or -1)
                # Flush per chunk so an interrupted run keeps its progress on disk.
                f.flush()


def urlretrieve(url: str, filename: path, reporthook=None, method='GET',
                sha256sum=None, filesize=None, headers=None,
                **kwargs) -> Optional[Dict[str, str]]:
    '''Download `url` into `filename`, resuming a partial file when possible.

    If `sha256sum` or `filesize` already match the local file, no request is
    made and None is returned. Otherwise a `Range` request starting at the
    current file size is issued; a 416 reply is taken to mean the file is
    already complete. Extra `kwargs` are passed through to `requests.request`.

    Returns the response headers. Raises `DownloadError` on an HTTP error
    status, on a malformed Content-Range header, or when the file fails the
    `sha256sum` / `filesize` check after the transfer.
    '''
    D = DownloadCheck  # type: Any
    filename = Path(filename)
    if is_download_complete(filename, sha256sum, filesize) == D.completed:
        return None

    size = filename.stat().st_size if filename.exists() else None
    # Copy, so the caller's dict is not polluted with our Range header.
    request_headers = dict(headers or {})
    if size is not None:
        request_headers['Range'] = 'bytes=%s-' % size

    with closing(requests.request(method, url, stream=True,
                                  headers=request_headers, **kwargs)) as resp:
        remote_size = get_resource_size(resp.headers)
        already_completed = resp.status_code == 416
        if not already_completed:
            try:
                resp.raise_for_status()
            except requests.exceptions.HTTPError as e:
                raise DownloadError(e) from e
            write_response(resp, filename, reporthook, size, remote_size)
        check = is_download_complete(filename, sha256sum, filesize)
        if check not in (D.completed, D.partial):
            raise DownloadError(check)
        return resp.headers
import pytest  # type: ignore

import errno
import os
import random
from threading import Thread
from http.server import HTTPServer, SimpleHTTPRequestHandler  # type: ignore
from tempfile import NamedTemporaryFile
from unittest.mock import patch  # type: ignore
from functools import partial  # type: ignore
from typing import NamedTuple

from RangeHTTPServer import RangeRequestHandler  # type: ignore

from resumable import urlretrieve, sha256, DownloadError

FileStats = NamedTuple('FileStats', [('sha256sum', str), ('size', int)])


def get_port():
    '''Pick a random unprivileged TCP port.'''
    # randint is inclusive on both ends and 65535 is the highest valid port;
    # 65536 would make bind() raise OverflowError, which get_httpd does not retry.
    return random.randint(1024, 65535)


def get_httpd(request_handler):
    '''Return an HTTPServer bound to a random free port, retrying while the port is taken.'''
    while True:
        try:
            port = get_port()
            return HTTPServer(('', port), request_handler)

        except OSError as e:
            if e.errno != errno.EADDRINUSE:
                raise


@pytest.fixture(scope='module')
def httpd():
    '''Server with Range support, serving the working directory from a daemon thread.'''
    httpd = get_httpd(RangeRequestHandler)
    Thread(target=httpd.serve_forever, daemon=True).start()  # type: ignore
    return httpd


@pytest.fixture(scope='module')
def simple_httpd():
    '''Server using the plain stdlib handler, used for the no-Range scenario.'''
    httpd = get_httpd(SimpleHTTPRequestHandler)
    Thread(target=httpd.serve_forever, daemon=True).start()  # type: ignore
    return httpd


@pytest.fixture(scope='module')
def testfile_stats():
    '''Checksum and size of the complete reference file.'''
    fname = 'test/trust.pdf'
    return FileStats(sha256(fname), os.stat(fname).st_size)


# yield_fixture (not fixture) because requirements.txt pins pytest 2.7.3.
@pytest.yield_fixture()
def partial_download(httpd):
    '''Temporary file pre-filled with the truncated copy of the reference file.'''
    port = httpd.server_port

    with NamedTemporaryFile() as tempfile:
        urlretrieve('http://localhost:%s/test/trust.pdf.partial' % port, tempfile.name)
        yield tempfile


def test_urlretrieve(httpd, partial_download, testfile_stats):
    port = httpd.server_port
    complete_downloader = partial(urlretrieve, 'http://localhost:%s/test/trust.pdf' % port, partial_download.name)
    headers = complete_downloader()
    # Only the missing tail should have been transferred.
    assert int(headers['Content-Length']) < testfile_stats.size
    assert testfile_stats.sha256sum == sha256(partial_download.name)
    last_touched = os.stat(partial_download.name).st_mtime
    complete_downloader()
    assert last_touched == os.stat(partial_download.name).st_mtime
    # With sockets unusable, a verified-complete file must not trigger a request.
    with patch('socket.socket.__new__', side_effect=Exception):
        complete_downloader(sha256sum=testfile_stats.sha256sum)
        assert last_touched == os.stat(partial_download.name).st_mtime
        complete_downloader(filesize=testfile_stats.size)
        assert last_touched == os.stat(partial_download.name).st_mtime


def test_wrongsize(httpd, partial_download, testfile_stats):
    port = httpd.server_port

    with pytest.raises(DownloadError):
        urlretrieve('http://localhost:%s/test/trust.pdf' % port,
                    partial_download.name,
                    filesize=testfile_stats.size-1)


def test_wronghash(httpd, partial_download):
    port = httpd.server_port

    with pytest.raises(DownloadError):
        urlretrieve('http://localhost:%s/test/trust.pdf' % port,
                    partial_download.name,
                    sha256sum='')


def test_norange(simple_httpd, partial_download, testfile_stats):
    urlretrieve('http://localhost:%s/test/trust.pdf' % simple_httpd.server_port, partial_download.name)
    assert testfile_stats.sha256sum == sha256(partial_download.name)