├── .gitignore
├── pyremotezip
    ├── __init__.py
    └── remotezip.py
├── README.md
├── setup.py
└── LICENSE


/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | dist
3 | *.pyc
4 | *.egg-info
5 | 


--------------------------------------------------------------------------------
/pyremotezip/__init__.py:
--------------------------------------------------------------------------------
1 | from .remotezip import RemoteZip  # noqa
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![Stories in Ready](https://badge.waffle.io/fcvarela/pyremotezip.png?label=ready&title=Ready)](https://waffle.io/fcvarela/pyremotezip)
 2 | # PyRemoteZip
 3 | 
 4 | PyRemoteZip is a pure Python module that extracts individual files from remote ZIP archives using HTTP range requests, without downloading the whole archive.
 5 | 
 6 | ### Usage
 7 | 
 8 |         from pyremotezip import RemoteZip
 9 |         rz = RemoteZip(<some_url_here>)
10 |         toc = rz.getTableOfContents()
11 |         
12 |         # want file at pos 2
13 |         output = rz.extractFile(toc[2]['filename'])
14 | 
15 | ### Contributing
16 | 
17 | Have you forked and improved this? Please submit your pull requests and raise issues here!
18 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from setuptools import setup, find_packages
 4 | 
 5 | version = '0.5'
 6 | 
 7 | setup(
 8 |     name='pyremotezip',
 9 |     version=version,
10 |     description="Extract files from remote ZIP archives",
11 |     long_description=""" """,
12 |     classifiers=[],
13 |     keywords="",
14 |     author="Filipe Varela",
15 |     author_email="fcvarela@gmail.com",
16 |     url="https://github.com/fcvarela/pyremotezip/",
17 |     license="BSD",
18 |     package_dir={'': '.'},
19 |     packages=find_packages(where='.'),
20 |     include_package_data=True,
21 |     zip_safe=False,
22 |     install_requires=['setuptools', ],
23 |     entry_points="""
24 |     # None whatsoever
25 |     """,
26 | )
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2012-2014, Filipe Varela
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 | 


--------------------------------------------------------------------------------
/pyremotezip/remotezip.py:
--------------------------------------------------------------------------------
import zlib
from struct import unpack

try:  # Python 2
    import urllib2
    from urllib2 import HTTPError
except ImportError:  # Python 3: urllib.request provides the same Request/urlopen API
    import urllib.request as urllib2
    from urllib.error import HTTPError
  6 | 
  7 | 
  8 | class RemoteZip(object):
  9 |     """
 10 |     This class extracts single files from a remote ZIP file by using HTTP ranged requests
 11 |     """
 12 |     def __init__(self, zipURI):
 13 |         """
 14 |         zipURI should be an HTTP URL hosted on a server that supports ranged requests.
 15 |         The init function will determine if the file exists and raise a urllib2 exception if not.
 16 |         """
 17 |         self.filesize = None
 18 |         self.zipURI = zipURI
 19 |         self.tableOfContents = None
 20 |         self.request = None
 21 |         self.start = None
 22 |         self.end = None
 23 |         self.directory_end = None
 24 |         self.raw_bytes = None
 25 |         self.directory_size = None
 26 | 
 27 | 
 28 |     def __file_exists(self):
 29 |         # check if file exists
 30 |         headRequest = urllib2.Request(self.zipURI)
 31 |         headRequest.get_method = lambda: 'HEAD'
 32 |         try:
 33 |             response = urllib2.urlopen(headRequest)
 34 |             self.filesize = int(response.info().getheader('Content-Length'))
 35 |             return True
 36 |         except HTTPError as e:
 37 |             print '%s' % e
 38 |             return False
 39 | 
 40 |     def getDirectorySize(self):
 41 |         if not self.__file_exists():
 42 |             raise FileNotFoundException()
 43 | 
 44 |         # now request bytes from that size minus a 64kb max zip directory length
 45 |         self.request = urllib2.Request(self.zipURI)
 46 |         self.start = self.filesize - (65536)
 47 |         self.end = self.filesize - 1
 48 |         self.request.headers['Range'] = "bytes=%s-%s" % (self.start, self.end)
 49 |         handle = urllib2.urlopen(self.request)
 50 | 
 51 |         # make sure the response is ranged
 52 |         return_range = handle.headers.get('Content-Range')
 53 |         if return_range != "bytes %d-%d/%s" % (self.start, self.end, self.filesize):
 54 |             raise Exception("Ranged requests are not supported for this URI")
 55 | 
 56 |         # got here? we're fine, read the contents
 57 |         self.raw_bytes = handle.read()
 58 | 
 59 |         # now find the end-of-directory: 06054b50
 60 |         # we're on little endian maybe
 61 |         self.directory_end = self.raw_bytes.find("\x50\x4b\x05\x06")
 62 |         if self.directory_end < 0:
 63 |             raise Exception("Could not find end of directory")
 64 | 
 65 |         # now find the size of the directory: offset 12, 4 bytes
 66 |         self.directory_size = unpack("i", self.raw_bytes[self.directory_end+12:self.directory_end+16])[0]
 67 | 
 68 |         return self.directory_size
 69 | 
 70 |     def requestContentDirectory(self):
 71 |         self.start = self.filesize - self.directory_size
 72 |         self.end = self.filesize - 1
 73 |         self.request.headers['Range'] = "bytes=%s-%s" % (self.start, self.end)
 74 |         handle = urllib2.urlopen(self.request)
 75 | 
 76 |         # make sure the response is ranged
 77 |         return_range = handle.headers.get('Content-Range')
 78 |         if return_range != "bytes %d-%d/%s" % (self.start, self.end, self.filesize):
 79 |             raise Exception("Ranged requests are not supported for this URI")
 80 | 
 81 |         # got here? we're fine, read the contents
 82 |         self.raw_bytes = handle.read()
 83 |         self.directory_end = self.raw_bytes.find("\x50\x4b\x05\x06")
 84 | 
 85 | 
 86 |     def getTableOfContents(self):
 87 |         """
 88 |         This function populates the internal tableOfContents list with the contents
 89 |         of the zip file TOC. If the server does not support ranged requests, this will raise
 90 |         and exception. It will also throw an exception if the TOC cannot be found.
 91 |         """
 92 | 
 93 |         self.directory_size = self.getDirectorySize()
 94 |         if self.directory_size > 65536:
 95 |             self.directory_size += 2
 96 |             self.requestContentDirectory()
 97 | 
 98 | 
 99 |         # and find the offset from start of file where it can be found
100 |         directory_start = unpack("i", self.raw_bytes[self.directory_end + 16: self.directory_end + 20])[0]
101 | 
102 |         # find the data in the raw_bytes
103 |         self.raw_bytes = self.raw_bytes
104 |         current_start = directory_start - self.start
105 |         filestart = 0
106 |         compressedsize = 0
107 |         tableOfContents = []
108 | 
109 |         try:
110 |             while True:
111 |                 # get file name size (n), extra len (m) and comm len (k)
112 |                 zip_n = unpack("H", self.raw_bytes[current_start + 28: current_start + 28 + 2])[0]
113 |                 zip_m = unpack("H", self.raw_bytes[current_start + 30: current_start + 30 + 2])[0]
114 |                 zip_k = unpack("H", self.raw_bytes[current_start + 32: current_start + 32 + 2])[0]
115 | 
116 |                 filename = self.raw_bytes[current_start + 46: current_start + 46 + zip_n]
117 | 
118 |                 # check if this is the index file
119 |                 filestart = unpack("I", self.raw_bytes[current_start + 42: current_start + 42 + 4])[0]
120 |                 compressedsize = unpack("I", self.raw_bytes[current_start + 20: current_start + 20 + 4])[0]
121 |                 uncompressedsize = unpack("I", self.raw_bytes[current_start + 24: current_start + 24 + 4])[0]
122 |                 tableItem = {
123 |                     'filename': filename,
124 |                     'compressedsize': compressedsize,
125 |                     'uncompressedsize': uncompressedsize,
126 |                     'filestart': filestart
127 |                 }
128 |                 tableOfContents.append(tableItem)
129 | 
130 |                 # not this file, move along
131 |                 current_start = current_start + 46 + zip_n + zip_m + zip_k
132 |         except:
133 |             pass
134 | 
135 |         self.tableOfContents = tableOfContents
136 |         return tableOfContents
137 | 
138 |     def extractFile(self, filename):
139 |         """
140 |         This function will extract a single file from the remote zip without downloading
141 |         the entire zip file. The filename argument should match whatever is in the 'filename'
142 |         key of the tableOfContents.
143 |         """
144 |         files = [x for x in self.tableOfContents if x['filename'] == filename]
145 |         if len(files) == 0:
146 |             raise FileNotFoundException()
147 | 
148 |         fileRecord = files[0]
149 | 
150 |         # got here? need to fetch the file size
151 |         metaheadroom = 1024  # should be enough
152 |         request = urllib2.Request(self.zipURI)
153 |         start = fileRecord['filestart']
154 |         end = fileRecord['filestart'] + fileRecord['compressedsize'] + metaheadroom
155 |         request.headers['Range'] = "bytes=%s-%s" % (start, end)
156 |         handle = urllib2.urlopen(request)
157 | 
158 |         # make sure the response is ranged
159 |         return_range = handle.headers.get('Content-Range')
160 |         if return_range != "bytes %d-%d/%s" % (start, end, self.filesize):
161 |             raise Exception("Ranged requests are not supported for this URI")
162 | 
163 |         filedata = handle.read()
164 | 
165 |         # find start of raw file data
166 |         zip_n = unpack("H", filedata[26:28])[0]
167 |         zip_m = unpack("H", filedata[28:30])[0]
168 | 
169 |         # check compressed size
170 |         has_data_descriptor = bool(unpack("H", filedata[6:8])[0] & 8)
171 |         comp_size = unpack("I", filedata[18:22])[0]
172 |         if comp_size == 0 and has_data_descriptor:
173 |             # assume compressed size in the Central Directory is correct
174 |             comp_size = fileRecord['compressedsize']
175 |         elif comp_size != fileRecord['compressedsize']:
176 |             raise Exception("Something went wrong. Directory and file header disagree of compressed file size")
177 | 
178 |         raw_zip_data = filedata[30 + zip_n + zip_m: 30 + zip_n + zip_m + comp_size]
179 |         uncompressed_data = ""
180 |         
181 |         # can't decompress if stored without compression
182 |         compression_method = unpack("H", filedata[8:10])[0]
183 |         if compression_method == 0:
184 |           return raw_zip_data
185 | 
186 |         dec = zlib.decompressobj(-zlib.MAX_WBITS)
187 |         for chunk in raw_zip_data:
188 |             rv = dec.decompress(chunk)
189 |             if rv:
190 |                 uncompressed_data = uncompressed_data + rv
191 | 
192 |         return uncompressed_data
193 | 
194 | 
class FileNotFoundException(Exception):
    """Raised when the remote archive, or a requested file inside it, cannot be found."""


--------------------------------------------------------------------------------