├── .gitignore ├── pyremotezip ├── __init__.py └── remotezip.py ├── README.md ├── setup.py └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | dist 3 | *.pyc 4 | *.egg-info 5 | -------------------------------------------------------------------------------- /pyremotezip/__init__.py: -------------------------------------------------------------------------------- 1 | from .remotezip import RemoteZip # noqa 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Stories in Ready](https://badge.waffle.io/fcvarela/pyremotezip.png?label=ready&title=Ready)](https://waffle.io/fcvarela/pyremotezip) 2 | # PyRemoteZip 3 | 4 | PyRemoteZip is a pure Python module to extract files from remote zip archives without downloading the whole zip archive. 5 | 6 | ### Usage 7 | 8 | from pyremotezip import RemoteZip 9 | rz = RemoteZip('http://example.com/archive.zip') 10 | toc = rz.getTableOfContents() 11 | 12 | # want file at pos 2 13 | output = rz.extractFile(toc[2]['filename']) 14 | 15 | ### Contributing 16 | 17 | Have you forked and improved this? Please submit your pull requests and raise issues here!
# -*- coding: utf-8 -*-
"""Packaging script for the pyremotezip distribution."""

from setuptools import find_packages, setup

version = '0.5'

# Distribution metadata, collected first and handed to setup() in one go.
# Blank/empty fields are kept exactly as the project declares them.
_metadata = {
    'name': 'pyremotezip',
    'version': version,
    'description': "Extract files from remote ZIP archives",
    'long_description': """ """,
    'classifiers': [],
    'keywords': "",
    'author': "Filipe Varela",
    'author_email': "fcvarela@gmail.com",
    'url': "https://github.com/fcvarela/pyremotezip/",
    'license': "BSD",
    # Packages live directly in the repository root.
    'package_dir': {'': '.'},
    'packages': find_packages(where='.'),
    'include_package_data': True,
    'zip_safe': False,
    'install_requires': ['setuptools', ],
    'entry_points': """
    # None whatsoever
    """,
}

setup(**_metadata)
try:  # Python 2
    import urllib2
    from urllib2 import HTTPError
except ImportError:  # Python 3 exposes the same Request/urlopen API here
    import urllib.request as urllib2
    from urllib.error import HTTPError
import zlib

from struct import unpack


# Record signatures and fixed record sizes from the ZIP file format.
_EOCD_SIGNATURE = b"\x50\x4b\x05\x06"
_CENTRAL_SIGNATURE = b"\x50\x4b\x01\x02"
_LOCAL_SIGNATURE = b"\x50\x4b\x03\x04"
_EOCD_SIZE = 22
_CENTRAL_HEADER_SIZE = 46
_LOCAL_HEADER_SIZE = 30

# How many trailing bytes are fetched when looking for the end-of-directory
# record (the record is 22 bytes plus a comment of at most 65535 bytes).
_TAIL_WINDOW = 65536

_STORED = 0
_DEFLATED = 8


class RemoteZip(object):
    """
    This class extracts single files from a remote ZIP file by using HTTP ranged requests
    """
    def __init__(self, zipURI):
        """
        zipURI should be an HTTP URL hosted on a server that supports ranged requests.
        No request is made here; the archive is first contacted by
        getDirectorySize() / getTableOfContents() / extractFile().
        """
        self.filesize = None
        self.zipURI = zipURI
        self.tableOfContents = None
        self.request = None
        self.start = None
        self.end = None
        self.directory_end = None
        self.raw_bytes = None
        self.directory_size = None

    def __file_exists(self):
        """
        Issue a HEAD request and record the archive size in self.filesize.

        Returns True on success and False when the server answers with an
        HTTP error (the error is printed, as before). Raises Exception when
        the server does not report a Content-Length.
        """
        headRequest = urllib2.Request(self.zipURI)
        headRequest.get_method = lambda: 'HEAD'
        try:
            response = urllib2.urlopen(headRequest)
        except HTTPError as e:
            print('%s' % e)
            return False

        try:
            length = response.headers.get('Content-Length')
        finally:
            response.close()

        if length is None:
            raise Exception("Server did not report a Content-Length for this URI")
        self.filesize = int(length)
        return True

    def _fetch_range(self, start, end):
        """
        Return bytes start..end (both inclusive) of the archive.

        The request object is kept in self.request. Raises Exception when the
        server does not answer with exactly the requested Content-Range.
        """
        self.request = urllib2.Request(self.zipURI)
        self.request.add_header('Range', "bytes=%s-%s" % (start, end))
        handle = urllib2.urlopen(self.request)
        try:
            # make sure the response is ranged
            return_range = handle.headers.get('Content-Range')
            if return_range != "bytes %d-%d/%s" % (start, end, self.filesize):
                raise Exception("Ranged requests are not supported for this URI")
            return handle.read()
        finally:
            handle.close()

    @staticmethod
    def _find_end_of_directory(raw_bytes):
        """
        Return the index of the end-of-central-directory record in raw_bytes.

        Candidates are scanned from the back. A candidate whose comment
        length makes the record end exactly at the end of raw_bytes wins, so
        signature bytes that merely occur inside the archive comment or the
        file data are not mistaken for the record. If no candidate fits
        exactly, the last complete one is used. Raises Exception when no
        complete record is present.
        """
        fallback = -1
        position = raw_bytes.rfind(_EOCD_SIGNATURE)
        while position >= 0:
            if position + _EOCD_SIZE <= len(raw_bytes):
                comment_len = unpack("<H", raw_bytes[position + 20:position + 22])[0]
                if position + _EOCD_SIZE + comment_len == len(raw_bytes):
                    return position
                if fallback < 0:
                    fallback = position
            position = raw_bytes.rfind(_EOCD_SIGNATURE, 0, position)

        if fallback < 0:
            raise Exception("Could not find end of directory")
        return fallback

    @staticmethod
    def _parse_directory(raw_bytes, offset, limit=None):
        """
        Parse consecutive central-directory file headers.

        Parsing starts at raw_bytes[offset] and stops at `limit` (default:
        end of raw_bytes), at the first position that does not carry the
        central-directory signature, or at a truncated entry. Returns a list
        of dicts with the keys 'filename', 'compressedsize',
        'uncompressedsize' and 'filestart'.
        """
        if limit is None:
            limit = len(raw_bytes)

        entries = []
        while (offset >= 0
               and offset + _CENTRAL_HEADER_SIZE <= limit
               and raw_bytes[offset:offset + 4] == _CENTRAL_SIGNATURE):
            # sizes, then file name length (n), extra length (m), comment length (k)
            compressedsize, uncompressedsize, zip_n, zip_m, zip_k = unpack(
                "<IIHHH", raw_bytes[offset + 20:offset + 34])
            filestart = unpack("<I", raw_bytes[offset + 42:offset + 46])[0]

            name_start = offset + _CENTRAL_HEADER_SIZE
            name_end = name_start + zip_n
            if name_end > limit:
                break  # truncated entry, nothing trustworthy beyond this point

            entries.append({
                'filename': raw_bytes[name_start:name_end],
                'compressedsize': compressedsize,
                'uncompressedsize': uncompressedsize,
                'filestart': filestart,
            })
            offset = name_end + zip_m + zip_k
        return entries

    @staticmethod
    def _locate_member_data(filedata, fileRecord):
        """
        Read the local file header at the start of filedata.

        Returns (data_offset, comp_size, compression_method) where
        data_offset is relative to the start of filedata. Raises Exception
        when the header is missing or disagrees with the central directory
        about the compressed size.
        """
        if (len(filedata) < _LOCAL_HEADER_SIZE
                or filedata[0:4] != _LOCAL_SIGNATURE):
            raise Exception("Could not find the local file header")

        flags, compression_method = unpack("<HH", filedata[6:10])
        comp_size = unpack("<I", filedata[18:22])[0]
        zip_n, zip_m = unpack("<HH", filedata[26:30])

        has_data_descriptor = bool(flags & 8)
        if comp_size == 0 and has_data_descriptor:
            # assume compressed size in the Central Directory is correct
            comp_size = fileRecord['compressedsize']
        elif comp_size != fileRecord['compressedsize']:
            raise Exception("Something went wrong. Directory and file header disagree of compressed file size")

        return _LOCAL_HEADER_SIZE + zip_n + zip_m, comp_size, compression_method

    @staticmethod
    def _inflate(raw_zip_data, compression_method):
        """
        Return the uncompressed content of one member.

        Stored members are returned as-is; deflated members are inflated in
        one call. Any other compression method raises Exception.
        """
        if compression_method == _STORED:
            return raw_zip_data
        if compression_method != _DEFLATED:
            raise Exception("Unsupported compression method %d" % compression_method)

        dec = zlib.decompressobj(-zlib.MAX_WBITS)
        return dec.decompress(raw_zip_data) + dec.flush()

    def _directory_offset(self):
        """
        Return the absolute offset of the central directory as recorded in
        the end-of-directory record currently held in self.raw_bytes.
        """
        position = self.directory_end
        offset = unpack("<I", self.raw_bytes[position + 16:position + 20])[0]
        if offset >= self.filesize:
            # 0xFFFFFFFF is the ZIP64 placeholder; anything past EOF is corrupt
            raise Exception("Central directory offset is outside the archive "
                            "(ZIP64 archives are not supported)")
        return offset

    def getDirectorySize(self):
        """
        Fetch the tail of the archive, locate the end-of-directory record
        and return the size in bytes of the central directory.

        Raises FileNotFoundException when the HEAD request fails and
        Exception when ranged requests are unsupported or no end-of-directory
        record can be found.
        """
        if not self.__file_exists():
            raise FileNotFoundException()
        if self.filesize < _EOCD_SIZE:
            raise Exception("Could not find end of directory")

        # request the last 64kb, or the whole file when it is smaller than that
        self.start = max(0, self.filesize - _TAIL_WINDOW)
        self.end = self.filesize - 1
        self.raw_bytes = self._fetch_range(self.start, self.end)

        # now find the end-of-directory: 06054b50 (stored little endian)
        self.directory_end = self._find_end_of_directory(self.raw_bytes)

        # now find the size of the directory: offset 12, 4 bytes
        self.directory_size = unpack(
            "<I", self.raw_bytes[self.directory_end + 12:self.directory_end + 16])[0]

        return self.directory_size

    def requestContentDirectory(self):
        """
        Re-fetch the archive from the start of the central directory to the
        end of the file, for directories that do not fit in the tail window.

        getDirectorySize() must have been called first. Updates self.start,
        self.end, self.raw_bytes and self.directory_end.
        """
        self.start = self._directory_offset()
        self.end = self.filesize - 1
        self.raw_bytes = self._fetch_range(self.start, self.end)
        self.directory_end = self._find_end_of_directory(self.raw_bytes)

    def getTableOfContents(self):
        """
        This function populates the internal tableOfContents list with the contents
        of the zip file TOC. If the server does not support ranged requests, this will raise
        an exception. It will also throw an exception if the TOC cannot be found.

        Returns the list; each item is a dict with the keys 'filename',
        'compressedsize', 'uncompressedsize' and 'filestart'.
        """
        self.directory_size = self.getDirectorySize()

        # the directory may begin before the window fetched so far
        directory_start = self._directory_offset()
        if directory_start < self.start:
            self.requestContentDirectory()

        self.tableOfContents = self._parse_directory(
            self.raw_bytes, directory_start - self.start, self.directory_end)
        return self.tableOfContents

    def extractFile(self, filename):
        """
        This function will extract a single file from the remote zip without downloading
        the entire zip file. The filename argument should match whatever is in the 'filename'
        key of the tableOfContents. The table of contents is fetched first if
        getTableOfContents() has not been called yet.

        Raises FileNotFoundException when no entry has that name.
        """
        if self.tableOfContents is None:
            self.getTableOfContents()

        # entries hold the raw name bytes from the archive
        if not isinstance(filename, bytes):
            filename = filename.encode('utf-8')

        fileRecord = next(
            (x for x in self.tableOfContents if x['filename'] == filename), None)
        if fileRecord is None:
            raise FileNotFoundException()

        # header length is unknown until fetched, so ask for some headroom,
        # but never past the end of the archive
        metaheadroom = 1024
        start = fileRecord['filestart']
        end = min(start + fileRecord['compressedsize'] + metaheadroom,
                  self.filesize - 1)
        filedata = self._fetch_range(start, end)

        data_offset, comp_size, compression_method = self._locate_member_data(
            filedata, fileRecord)
        data_end = data_offset + comp_size

        if comp_size and data_end > len(filedata):
            # long name/extra fields used up the headroom: fetch the payload exactly
            payload_start = start + data_offset
            payload_end = payload_start + comp_size - 1
            if payload_end > self.filesize - 1:
                raise Exception("File data extends past the end of the archive")
            raw_zip_data = self._fetch_range(payload_start, payload_end)
        else:
            raw_zip_data = filedata[data_offset:data_end]

        return self._inflate(raw_zip_data, compression_method)


class FileNotFoundException(Exception):
    """Raised when the archive URL or a requested member cannot be found."""
    pass