├── sozipfile
│   ├── __init__.py
│   └── sozipfile.py
├── pyproject.toml
├── README.md
├── tests
│   ├── test_sozipfile.py
│   └── test_reading.py
└── LICENSE

/sozipfile/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "sozipfile"
3 | version = "0.3.2"
4 | authors = [
5 |   { name="Even Rouault (using code from Python contributors)", email="even.rouault@spatialys.com" },
6 | ]
7 | description = "Fork of Python zipfile module, adding generation of sozip optimization"
8 | readme = "README.md"
9 | requires-python = ">=3.8"
10 | classifiers = [
11 |     "Programming Language :: Python :: 3",
12 |     "License :: OSI Approved :: Python Software Foundation License",
13 |     "Operating System :: OS Independent",
14 | ]
15 | 
16 | [project.urls]
17 | "Homepage" = "https://github.com/sozip/sozipfile"
18 | "Bug Tracker" = "https://github.com/sozip/sozipfile/issues"
19 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # sozipfile
2 | 
3 | sozipfile is a fork of the Python [zipfile](https://docs.python.org/3/library/zipfile.html)
4 | module, based on its implementation in CPython 3.11, which implements the
5 | [SOZip](https://sozip.org) optimization
6 | when writing deflate-compressed files whose size exceeds the chunk size
7 | (32768 bytes by default). It also implements efficient seeking within a SOZip file.
8 | 
9 | Example to generate a SOZip-optimized file:
10 | 
11 | ```python
12 | import sozipfile.sozipfile as zipfile
13 | with zipfile.ZipFile('my.zip', 'w',
14 |                      compression=zipfile.ZIP_DEFLATED,
15 |                      chunk_size=zipfile.SOZIP_DEFAULT_CHUNK_SIZE) as myzip:
16 |     myzip.write('my.file')
17 | ```
18 | 
19 | Example to check if a file within a ZIP is SOZip-optimized:
20 | 
21 | ```python
22 | import sozipfile.sozipfile as zipfile
23 | with zipfile.ZipFile('my.zip', 'r') as myzip:
24 |     if myzip.getinfo('my.gpkg').is_sozip_optimized(myzip):
25 |         print('SOZip optimized!')
26 | ```
27 | 
28 | Available on [PyPI](https://pypi.org/project/sozipfile):
29 | ```shell
30 | pip install sozipfile
31 | ```
32 | 
--------------------------------------------------------------------------------
/tests/test_sozipfile.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.append("../sozipfile")
4 | import sozipfile.sozipfile as zipfile
5 | 
6 | this_dir = os.path.dirname(__file__)
7 | 
8 | def test_basic():
9 |     out_zip = this_dir + '/test.zip'
10 |     try:
11 |         with zipfile.ZipFile(out_zip, 'w', compression=zipfile.ZIP_DEFLATED, chunk_size=128) as myzip:
12 |             myzip.write(this_dir + '/test_sozipfile.py', arcname='foo.py')
13 |             myzip.writestr('baz.py', "foo")
14 |             myzip.write(this_dir + '/test_sozipfile.py', arcname='subdir/bar.py')
15 | 
16 |         # Very basic check...
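        # (Descriptive note: sozipfile stores the SOZip index of each
        # deflate-compressed member as a hidden ".<name>.sozip.idx" member
        # written right after it, with no central directory entry, so its
        # presence can be detected by scanning the raw archive bytes below.)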
17 | zip_raw = open(out_zip, 'rb').read() 18 | assert b'.foo.py.sozip.idx' in zip_raw 19 | assert b'.baz.py.sozip.idx' not in zip_raw 20 | assert b'subdir/.bar.py.sozip.idx' in zip_raw 21 | 22 | with zipfile.ZipFile(out_zip, 'r') as myzip: 23 | assert myzip.namelist() == ['foo.py', 'baz.py', 'subdir/bar.py'] 24 | assert myzip.read('foo.py') == open(this_dir + '/test_sozipfile.py', 'rb').read() 25 | assert myzip.read('baz.py') == b"foo" 26 | assert myzip.read('subdir/bar.py') == open(this_dir + '/test_sozipfile.py', 'rb').read() 27 | 28 | assert myzip.getinfo('foo.py').is_sozip_optimized(myzip) 29 | assert not myzip.getinfo('baz.py').is_sozip_optimized(myzip) 30 | assert myzip.getinfo('subdir/bar.py').is_sozip_optimized(myzip) 31 | 32 | finally: 33 | if os.path.exists(out_zip): 34 | os.unlink(out_zip) 35 | -------------------------------------------------------------------------------- /tests/test_reading.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import unittest 3 | import pathlib 4 | import zipfile 5 | 6 | try: 7 | from sozipfile import sozipfile 8 | except ImportError: 9 | import sys 10 | 11 | sys.path.append("../sozipfile") 12 | from sozipfile import sozipfile 13 | 14 | 15 | class TestReading(unittest.TestCase): 16 | @classmethod 17 | def setUpClass(cls): 18 | # create temporary directory 19 | cls._tempdir = tempfile.TemporaryDirectory(prefix="test-sozipfile-") 20 | cls.addClassCleanup(cls._tempdir.cleanup) 21 | cls.temp_path = pathlib.Path(cls._tempdir.name) 22 | # create archives with sozipfile and zipfile 23 | cls.sozip_path = cls.temp_path / "soarchive.zip" 24 | cls.content = "\n".join(f"{number:04d}" for number in range(5000)).encode() 25 | cls.chunk_size = 1024 26 | with sozipfile.ZipFile( 27 | cls.sozip_path, 28 | "w", 29 | compression=sozipfile.ZIP_DEFLATED, 30 | chunk_size=cls.chunk_size, 31 | ) as myzip: 32 | myzip.writestr("numbers.txt", cls.content) 33 | # write an uncompressed file 34 | myzip.writestr( 35 | "hello.txt", "Hello World!", compress_type=sozipfile.ZIP_STORED 36 | ) 37 | # create conventional zip archive 38 | cls.zip_path = cls.temp_path / "archive.zip" 39 | with zipfile.ZipFile( 40 | cls.zip_path, "w", compression=zipfile.ZIP_DEFLATED 41 | ) as myzip: 42 | myzip.writestr("numbers.txt", cls.content) 43 | 44 | def test_read_conventional_zip_with_sozipfile(self): 45 | with sozipfile.ZipFile(self.zip_path, mode="r") as myzip: 46 | zinfo = myzip.getinfo("numbers.txt") 47 | self.assertIsNone(zinfo.sozip_index) 48 | with myzip.open(zinfo) as file: 49 | content = file.read() 50 | self.assertEqual(content, self.content) 51 | 52 | def test_read_seek_optimized_zip_with_sozipfile(self): 53 | with sozipfile.ZipFile(self.sozip_path, mode="r") as myzip: 54 | # read un-indexed file 55 | zinfo = myzip.getinfo("hello.txt") 56 | self.assertIsNone(zinfo.sozip_index) 57 | with myzip.open(zinfo) as file: 58 | hello = file.read() 59 | self.assertEqual(hello, b"Hello World!") 60 | # read indexed file 61 | zinfo = myzip.getinfo("numbers.txt") 62 | self.assertIsNotNone(zinfo.sozip_index) 63 | with myzip.open(zinfo) as file: 64 | content = file.read() 65 | self.assertEqual(content, self.content) 66 | 67 | def test_seek_and_read(self): 68 | with sozipfile.ZipFile(self.sozip_path, mode="r") as myzip: 69 | with myzip.open("numbers.txt") as file: 70 | file.seek(2000) 71 | content = file.read(1500) 72 | self.assertEqual(content, self.content[2000:3500]) 73 | 74 | def test_seek_before_file(self): 75 | with 
sozipfile.ZipFile(self.sozip_path, mode="r") as myzip: 76 | with myzip.open("numbers.txt") as file: 77 | file.seek(-2 * self.chunk_size) 78 | content = file.read(100) 79 | self.assertEqual(content, self.content[:100]) 80 | # check that zipfile behaves the same 81 | with zipfile.ZipFile(self.sozip_path, mode="r") as myzip: 82 | with myzip.open("numbers.txt") as file: 83 | file.seek(-2 * self.chunk_size) 84 | content = file.read(100) 85 | self.assertEqual(content, self.content[:100]) 86 | 87 | def test_seek_beyond_filesize(self): 88 | with sozipfile.ZipFile(self.sozip_path, mode="r") as myzip: 89 | with myzip.open("numbers.txt") as file: 90 | file.seek(len(self.content) + 500) 91 | content = file.read(100) 92 | self.assertEqual(content, b"") 93 | # check that zipfile behaves the same 94 | with zipfile.ZipFile(self.sozip_path, mode="r") as myzip: 95 | with myzip.open("numbers.txt") as file: 96 | file.seek(len(self.content) + 500) 97 | content = file.read(100) 98 | self.assertEqual(content, b"") 99 | 100 | def test_reactivate_crc_check(self): 101 | with sozipfile.ZipFile(self.sozip_path, mode="r") as myzip: 102 | zinfo = myzip.getinfo("numbers.txt") 103 | zinfo.CRC += 1 104 | with myzip.open("numbers.txt") as file: 105 | file.seek(-500, 2) 106 | file.read() # no exception 107 | file.seek(0) 108 | with self.assertRaisesRegex( 109 | sozipfile.BadZipFile, r"Bad CRC-32 for file .*" 110 | ): 111 | file.read() 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | A. HISTORY OF THE SOFTWARE 2 | ========================== 3 | 4 | Python was created in the early 1990s by Guido van Rossum at Stichting 5 | Mathematisch Centrum (CWI, see https://www.cwi.nl) in the Netherlands 6 | as a successor of a language called ABC. Guido remains Python's 7 | principal author, although it includes many contributions from others. 8 | 9 | In 1995, Guido continued his work on Python at the Corporation for 10 | National Research Initiatives (CNRI, see https://www.cnri.reston.va.us) 11 | in Reston, Virginia where he released several versions of the 12 | software. 13 | 14 | In May 2000, Guido and the Python core development team moved to 15 | BeOpen.com to form the BeOpen PythonLabs team. In October of the same 16 | year, the PythonLabs team moved to Digital Creations, which became 17 | Zope Corporation. In 2001, the Python Software Foundation (PSF, see 18 | https://www.python.org/psf/) was formed, a non-profit organization 19 | created specifically to own Python-related Intellectual Property. 20 | Zope Corporation was a sponsoring member of the PSF. 21 | 22 | All Python releases are Open Source (see https://opensource.org for 23 | the Open Source Definition). Historically, most, but not all, Python 24 | releases have also been GPL-compatible; the table below summarizes 25 | the various releases. 26 | 27 | Release Derived Year Owner GPL- 28 | from compatible? (1) 29 | 30 | 0.9.0 thru 1.2 1991-1995 CWI yes 31 | 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes 32 | 1.6 1.5.2 2000 CNRI no 33 | 2.0 1.6 2000 BeOpen.com no 34 | 1.6.1 1.6 2001 CNRI yes (2) 35 | 2.1 2.0+1.6.1 2001 PSF no 36 | 2.0.1 2.0+1.6.1 2001 PSF yes 37 | 2.1.1 2.1+2.0.1 2001 PSF yes 38 | 2.1.2 2.1.1 2002 PSF yes 39 | 2.1.3 2.1.2 2002 PSF yes 40 | 2.2 and above 2.1.1 2001-now PSF yes 41 | 42 | Footnotes: 43 | 44 | (1) GPL-compatible doesn't mean that we're distributing Python under 45 | the GPL. 
All Python licenses, unlike the GPL, let you distribute 46 | a modified version without making your changes open source. The 47 | GPL-compatible licenses make it possible to combine Python with 48 | other software that is released under the GPL; the others don't. 49 | 50 | (2) According to Richard Stallman, 1.6.1 is not GPL-compatible, 51 | because its license has a choice of law clause. According to 52 | CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1 53 | is "not incompatible" with the GPL. 54 | 55 | Thanks to the many outside volunteers who have worked under Guido's 56 | direction to make these releases possible. 57 | 58 | 59 | B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON 60 | =============================================================== 61 | 62 | Python software and documentation are licensed under the 63 | Python Software Foundation License Version 2. 64 | 65 | Starting with Python 3.8.6, examples, recipes, and other code in 66 | the documentation are dual licensed under the PSF License Version 2 67 | and the Zero-Clause BSD license. 68 | 69 | Some software incorporated into Python is under different licenses. 70 | The licenses are listed with code falling under that license. 71 | 72 | 73 | PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 74 | -------------------------------------------- 75 | 76 | 1. This LICENSE AGREEMENT is between the Python Software Foundation 77 | ("PSF"), and the Individual or Organization ("Licensee") accessing and 78 | otherwise using this software ("Python") in source or binary form and 79 | its associated documentation. 80 | 81 | 2. Subject to the terms and conditions of this License Agreement, PSF hereby 82 | grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, 83 | analyze, test, perform and/or display publicly, prepare derivative works, 84 | distribute, and otherwise use Python alone or in any derivative version, 85 | provided, however, that PSF's License Agreement and PSF's notice of copyright, 86 | i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 87 | 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022 Python Software Foundation; 88 | All Rights Reserved" are retained in Python alone or in any derivative version 89 | prepared by Licensee. 90 | 91 | 3. In the event Licensee prepares a derivative work that is based on 92 | or incorporates Python or any part thereof, and wants to make 93 | the derivative work available to others as provided herein, then 94 | Licensee hereby agrees to include in any such work a brief summary of 95 | the changes made to Python. 96 | 97 | 4. PSF is making Python available to Licensee on an "AS IS" 98 | basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR 99 | IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND 100 | DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS 101 | FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT 102 | INFRINGE ANY THIRD PARTY RIGHTS. 103 | 104 | 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON 105 | FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS 106 | A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, 107 | OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. 108 | 109 | 6. This License Agreement will automatically terminate upon a material 110 | breach of its terms and conditions. 111 | 112 | 7. 
Nothing in this License Agreement shall be deemed to create any 113 | relationship of agency, partnership, or joint venture between PSF and 114 | Licensee. This License Agreement does not grant permission to use PSF 115 | trademarks or trade name in a trademark sense to endorse or promote 116 | products or services of Licensee, or any third party. 117 | 118 | 8. By copying, installing or otherwise using Python, Licensee 119 | agrees to be bound by the terms and conditions of this License 120 | Agreement. 121 | 122 | 123 | BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0 124 | ------------------------------------------- 125 | 126 | BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1 127 | 128 | 1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an 129 | office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the 130 | Individual or Organization ("Licensee") accessing and otherwise using 131 | this software in source or binary form and its associated 132 | documentation ("the Software"). 133 | 134 | 2. Subject to the terms and conditions of this BeOpen Python License 135 | Agreement, BeOpen hereby grants Licensee a non-exclusive, 136 | royalty-free, world-wide license to reproduce, analyze, test, perform 137 | and/or display publicly, prepare derivative works, distribute, and 138 | otherwise use the Software alone or in any derivative version, 139 | provided, however, that the BeOpen Python License is retained in the 140 | Software, alone or in any derivative version prepared by Licensee. 141 | 142 | 3. BeOpen is making the Software available to Licensee on an "AS IS" 143 | basis. BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR 144 | IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND 145 | DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS 146 | FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT 147 | INFRINGE ANY THIRD PARTY RIGHTS. 148 | 149 | 4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE 150 | SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS 151 | AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY 152 | DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. 153 | 154 | 5. This License Agreement will automatically terminate upon a material 155 | breach of its terms and conditions. 156 | 157 | 6. This License Agreement shall be governed by and interpreted in all 158 | respects by the law of the State of California, excluding conflict of 159 | law provisions. Nothing in this License Agreement shall be deemed to 160 | create any relationship of agency, partnership, or joint venture 161 | between BeOpen and Licensee. This License Agreement does not grant 162 | permission to use BeOpen trademarks or trade names in a trademark 163 | sense to endorse or promote products or services of Licensee, or any 164 | third party. As an exception, the "BeOpen Python" logos available at 165 | http://www.pythonlabs.com/logos.html may be used according to the 166 | permissions granted on that web page. 167 | 168 | 7. By copying, installing or otherwise using the software, Licensee 169 | agrees to be bound by the terms and conditions of this License 170 | Agreement. 171 | 172 | 173 | CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1 174 | --------------------------------------- 175 | 176 | 1. 
This LICENSE AGREEMENT is between the Corporation for National 177 | Research Initiatives, having an office at 1895 Preston White Drive, 178 | Reston, VA 20191 ("CNRI"), and the Individual or Organization 179 | ("Licensee") accessing and otherwise using Python 1.6.1 software in 180 | source or binary form and its associated documentation. 181 | 182 | 2. Subject to the terms and conditions of this License Agreement, CNRI 183 | hereby grants Licensee a nonexclusive, royalty-free, world-wide 184 | license to reproduce, analyze, test, perform and/or display publicly, 185 | prepare derivative works, distribute, and otherwise use Python 1.6.1 186 | alone or in any derivative version, provided, however, that CNRI's 187 | License Agreement and CNRI's notice of copyright, i.e., "Copyright (c) 188 | 1995-2001 Corporation for National Research Initiatives; All Rights 189 | Reserved" are retained in Python 1.6.1 alone or in any derivative 190 | version prepared by Licensee. Alternately, in lieu of CNRI's License 191 | Agreement, Licensee may substitute the following text (omitting the 192 | quotes): "Python 1.6.1 is made available subject to the terms and 193 | conditions in CNRI's License Agreement. This Agreement together with 194 | Python 1.6.1 may be located on the internet using the following 195 | unique, persistent identifier (known as a handle): 1895.22/1013. This 196 | Agreement may also be obtained from a proxy server on the internet 197 | using the following URL: http://hdl.handle.net/1895.22/1013". 198 | 199 | 3. In the event Licensee prepares a derivative work that is based on 200 | or incorporates Python 1.6.1 or any part thereof, and wants to make 201 | the derivative work available to others as provided herein, then 202 | Licensee hereby agrees to include in any such work a brief summary of 203 | the changes made to Python 1.6.1. 204 | 205 | 4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS" 206 | basis. CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR 207 | IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND 208 | DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS 209 | FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT 210 | INFRINGE ANY THIRD PARTY RIGHTS. 211 | 212 | 5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON 213 | 1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS 214 | A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1, 215 | OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. 216 | 217 | 6. This License Agreement will automatically terminate upon a material 218 | breach of its terms and conditions. 219 | 220 | 7. This License Agreement shall be governed by the federal 221 | intellectual property law of the United States, including without 222 | limitation the federal copyright law, and, to the extent such 223 | U.S. federal law does not apply, by the law of the Commonwealth of 224 | Virginia, excluding Virginia's conflict of law provisions. 225 | Notwithstanding the foregoing, with regard to derivative works based 226 | on Python 1.6.1 that incorporate non-separable material that was 227 | previously distributed under the GNU General Public License (GPL), the 228 | law of the Commonwealth of Virginia shall govern this License 229 | Agreement only as to issues arising under or with respect to 230 | Paragraphs 4, 5, and 7 of this License Agreement. 
Nothing in this 231 | License Agreement shall be deemed to create any relationship of 232 | agency, partnership, or joint venture between CNRI and Licensee. This 233 | License Agreement does not grant permission to use CNRI trademarks or 234 | trade name in a trademark sense to endorse or promote products or 235 | services of Licensee, or any third party. 236 | 237 | 8. By clicking on the "ACCEPT" button where indicated, or by copying, 238 | installing or otherwise using Python 1.6.1, Licensee agrees to be 239 | bound by the terms and conditions of this License Agreement. 240 | 241 | ACCEPT 242 | 243 | 244 | CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2 245 | -------------------------------------------------- 246 | 247 | Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam, 248 | The Netherlands. All rights reserved. 249 | 250 | Permission to use, copy, modify, and distribute this software and its 251 | documentation for any purpose and without fee is hereby granted, 252 | provided that the above copyright notice appear in all copies and that 253 | both that copyright notice and this permission notice appear in 254 | supporting documentation, and that the name of Stichting Mathematisch 255 | Centrum or CWI not be used in advertising or publicity pertaining to 256 | distribution of the software without specific, written prior 257 | permission. 258 | 259 | STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO 260 | THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 261 | FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE 262 | FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 263 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 264 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 265 | OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 266 | 267 | ZERO-CLAUSE BSD LICENSE FOR CODE IN THE PYTHON DOCUMENTATION 268 | ---------------------------------------------------------------------- 269 | 270 | Permission to use, copy, modify, and/or distribute this software for any 271 | purpose with or without fee is hereby granted. 272 | 273 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 274 | REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 275 | AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 276 | INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 277 | LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 278 | OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 279 | PERFORMANCE OF THIS SOFTWARE. 280 | -------------------------------------------------------------------------------- /sozipfile/sozipfile.py: -------------------------------------------------------------------------------- 1 | """ 2 | Read and write ZIP files. 3 | 4 | XXX references to utf-8 need further investigation. 
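
This is the sozipfile fork of the CPython 3.11 zipfile module: in addition to
the stock behaviour, it writes a SOZip (https://sozip.org) index for
deflate-compressed members and uses such indexes to seek efficiently within
compressed members when reading.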
5 | """ 6 | import binascii 7 | import importlib.util 8 | import io 9 | import itertools 10 | import os 11 | import posixpath 12 | import shutil 13 | import stat 14 | import struct 15 | import sys 16 | import threading 17 | import time 18 | import contextlib 19 | import pathlib 20 | 21 | try: 22 | import zlib # We may need its compression method 23 | crc32 = zlib.crc32 24 | except ImportError: 25 | zlib = None 26 | crc32 = binascii.crc32 27 | 28 | try: 29 | import bz2 # We may need its compression method 30 | except ImportError: 31 | bz2 = None 32 | 33 | try: 34 | import lzma # We may need its compression method 35 | except ImportError: 36 | lzma = None 37 | 38 | __all__ = ["BadZipFile", "BadZipfile", "error", 39 | "ZIP_STORED", "ZIP_DEFLATED", "ZIP_BZIP2", "ZIP_LZMA", 40 | "is_zipfile", "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile", 41 | "Path"] 42 | 43 | class BadZipFile(Exception): 44 | pass 45 | 46 | 47 | class LargeZipFile(Exception): 48 | """ 49 | Raised when writing a zipfile, the zipfile requires ZIP64 extensions 50 | and those extensions are disabled. 51 | """ 52 | 53 | error = BadZipfile = BadZipFile # Pre-3.2 compatibility names 54 | 55 | SOZIP_DEFAULT_CHUNK_SIZE = 32768 56 | 57 | ZIP64_LIMIT = (1 << 31) - 1 58 | ZIP_FILECOUNT_LIMIT = (1 << 16) - 1 59 | ZIP_MAX_COMMENT = (1 << 16) - 1 60 | 61 | # constants for Zip file compression methods 62 | ZIP_STORED = 0 63 | ZIP_DEFLATED = 8 64 | ZIP_BZIP2 = 12 65 | ZIP_LZMA = 14 66 | # Other ZIP compression methods not supported 67 | 68 | DEFAULT_VERSION = 20 69 | ZIP64_VERSION = 45 70 | BZIP2_VERSION = 46 71 | LZMA_VERSION = 63 72 | # we recognize (but not necessarily support) all features up to that version 73 | MAX_EXTRACT_VERSION = 63 74 | 75 | # Below are some formats and associated data for reading/writing headers using 76 | # the struct module. 
The names and structures of headers/records are those used 77 | # in the PKWARE description of the ZIP file format: 78 | # http://www.pkware.com/documents/casestudies/APPNOTE.TXT 79 | # (URL valid as of January 2008) 80 | 81 | # The "end of central directory" structure, magic number, size, and indices 82 | # (section V.I in the format document) 83 | structEndArchive = b"<4s4H2LH" 84 | stringEndArchive = b"PK\005\006" 85 | sizeEndCentDir = struct.calcsize(structEndArchive) 86 | 87 | _ECD_SIGNATURE = 0 88 | _ECD_DISK_NUMBER = 1 89 | _ECD_DISK_START = 2 90 | _ECD_ENTRIES_THIS_DISK = 3 91 | _ECD_ENTRIES_TOTAL = 4 92 | _ECD_SIZE = 5 93 | _ECD_OFFSET = 6 94 | _ECD_COMMENT_SIZE = 7 95 | # These last two indices are not part of the structure as defined in the 96 | # spec, but they are used internally by this module as a convenience 97 | _ECD_COMMENT = 8 98 | _ECD_LOCATION = 9 99 | 100 | # The "central directory" structure, magic number, size, and indices 101 | # of entries in the structure (section V.F in the format document) 102 | structCentralDir = "<4s4B4HL2L5H2L" 103 | stringCentralDir = b"PK\001\002" 104 | sizeCentralDir = struct.calcsize(structCentralDir) 105 | 106 | # indexes of entries in the central directory structure 107 | _CD_SIGNATURE = 0 108 | _CD_CREATE_VERSION = 1 109 | _CD_CREATE_SYSTEM = 2 110 | _CD_EXTRACT_VERSION = 3 111 | _CD_EXTRACT_SYSTEM = 4 112 | _CD_FLAG_BITS = 5 113 | _CD_COMPRESS_TYPE = 6 114 | _CD_TIME = 7 115 | _CD_DATE = 8 116 | _CD_CRC = 9 117 | _CD_COMPRESSED_SIZE = 10 118 | _CD_UNCOMPRESSED_SIZE = 11 119 | _CD_FILENAME_LENGTH = 12 120 | _CD_EXTRA_FIELD_LENGTH = 13 121 | _CD_COMMENT_LENGTH = 14 122 | _CD_DISK_NUMBER_START = 15 123 | _CD_INTERNAL_FILE_ATTRIBUTES = 16 124 | _CD_EXTERNAL_FILE_ATTRIBUTES = 17 125 | _CD_LOCAL_HEADER_OFFSET = 18 126 | 127 | # General purpose bit flags 128 | # Zip Appnote: 4.4.4 general purpose bit flag: (2 bytes) 129 | _MASK_ENCRYPTED = 1 << 0 130 | # Bits 1 and 2 have different meanings depending on the compression used. 131 | _MASK_COMPRESS_OPTION_1 = 1 << 1 132 | # _MASK_COMPRESS_OPTION_2 = 1 << 2 133 | # _MASK_USE_DATA_DESCRIPTOR: If set, crc-32, compressed size and uncompressed 134 | # size are zero in the local header and the real values are written in the data 135 | # descriptor immediately following the compressed data. 136 | _MASK_USE_DATA_DESCRIPTOR = 1 << 3 137 | # Bit 4: Reserved for use with compression method 8, for enhanced deflating. 138 | # _MASK_RESERVED_BIT_4 = 1 << 4 139 | _MASK_COMPRESSED_PATCH = 1 << 5 140 | _MASK_STRONG_ENCRYPTION = 1 << 6 141 | # _MASK_UNUSED_BIT_7 = 1 << 7 142 | # _MASK_UNUSED_BIT_8 = 1 << 8 143 | # _MASK_UNUSED_BIT_9 = 1 << 9 144 | # _MASK_UNUSED_BIT_10 = 1 << 10 145 | _MASK_UTF_FILENAME = 1 << 11 146 | # Bit 12: Reserved by PKWARE for enhanced compression. 
147 | # _MASK_RESERVED_BIT_12 = 1 << 12 148 | # _MASK_ENCRYPTED_CENTRAL_DIR = 1 << 13 149 | # Bit 14, 15: Reserved by PKWARE 150 | # _MASK_RESERVED_BIT_14 = 1 << 14 151 | # _MASK_RESERVED_BIT_15 = 1 << 15 152 | 153 | # The "local file header" structure, magic number, size, and indices 154 | # (section V.A in the format document) 155 | structFileHeader = "<4s2B4HL2L2H" 156 | stringFileHeader = b"PK\003\004" 157 | sizeFileHeader = struct.calcsize(structFileHeader) 158 | 159 | _FH_SIGNATURE = 0 160 | _FH_EXTRACT_VERSION = 1 161 | _FH_EXTRACT_SYSTEM = 2 162 | _FH_GENERAL_PURPOSE_FLAG_BITS = 3 163 | _FH_COMPRESSION_METHOD = 4 164 | _FH_LAST_MOD_TIME = 5 165 | _FH_LAST_MOD_DATE = 6 166 | _FH_CRC = 7 167 | _FH_COMPRESSED_SIZE = 8 168 | _FH_UNCOMPRESSED_SIZE = 9 169 | _FH_FILENAME_LENGTH = 10 170 | _FH_EXTRA_FIELD_LENGTH = 11 171 | 172 | # The "Zip64 end of central directory locator" structure, magic number, and size 173 | structEndArchive64Locator = "<4sLQL" 174 | stringEndArchive64Locator = b"PK\x06\x07" 175 | sizeEndCentDir64Locator = struct.calcsize(structEndArchive64Locator) 176 | 177 | # The "Zip64 end of central directory" record, magic number, size, and indices 178 | # (section V.G in the format document) 179 | structEndArchive64 = "<4sQ2H2L4Q" 180 | stringEndArchive64 = b"PK\x06\x06" 181 | sizeEndCentDir64 = struct.calcsize(structEndArchive64) 182 | 183 | _CD64_SIGNATURE = 0 184 | _CD64_DIRECTORY_RECSIZE = 1 185 | _CD64_CREATE_VERSION = 2 186 | _CD64_EXTRACT_VERSION = 3 187 | _CD64_DISK_NUMBER = 4 188 | _CD64_DISK_NUMBER_START = 5 189 | _CD64_NUMBER_ENTRIES_THIS_DISK = 6 190 | _CD64_NUMBER_ENTRIES_TOTAL = 7 191 | _CD64_DIRECTORY_SIZE = 8 192 | _CD64_OFFSET_START_CENTDIR = 9 193 | 194 | _DD_SIGNATURE = 0x08074b50 195 | 196 | _EXTRA_FIELD_STRUCT = struct.Struct(' 1: 260 | raise BadZipFile("zipfiles that span multiple disks are not supported") 261 | 262 | # Assume no 'zip64 extensible data' 263 | fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2) 264 | data = fpin.read(sizeEndCentDir64) 265 | if len(data) != sizeEndCentDir64: 266 | return endrec 267 | sig, sz, create_version, read_version, disk_num, disk_dir, \ 268 | dircount, dircount2, dirsize, diroffset = \ 269 | struct.unpack(structEndArchive64, data) 270 | if sig != stringEndArchive64: 271 | return endrec 272 | 273 | # Update the original endrec using data from the ZIP64 record 274 | endrec[_ECD_SIGNATURE] = sig 275 | endrec[_ECD_DISK_NUMBER] = disk_num 276 | endrec[_ECD_DISK_START] = disk_dir 277 | endrec[_ECD_ENTRIES_THIS_DISK] = dircount 278 | endrec[_ECD_ENTRIES_TOTAL] = dircount2 279 | endrec[_ECD_SIZE] = dirsize 280 | endrec[_ECD_OFFSET] = diroffset 281 | return endrec 282 | 283 | 284 | def _EndRecData(fpin): 285 | """Return data from the "End of Central Directory" record, or None. 286 | 287 | The data is a list of the nine items in the ZIP "End of central dir" 288 | record followed by a tenth item, the file seek offset of this record.""" 289 | 290 | # Determine file size 291 | fpin.seek(0, 2) 292 | filesize = fpin.tell() 293 | 294 | # Check to see if this is ZIP file with no archive comment (the 295 | # "end of central directory" structure should be the last item in the 296 | # file if this is the case). 
297 | try: 298 | fpin.seek(-sizeEndCentDir, 2) 299 | except OSError: 300 | return None 301 | data = fpin.read() 302 | if (len(data) == sizeEndCentDir and 303 | data[0:4] == stringEndArchive and 304 | data[-2:] == b"\000\000"): 305 | # the signature is correct and there's no comment, unpack structure 306 | endrec = struct.unpack(structEndArchive, data) 307 | endrec=list(endrec) 308 | 309 | # Append a blank comment and record start offset 310 | endrec.append(b"") 311 | endrec.append(filesize - sizeEndCentDir) 312 | 313 | # Try to read the "Zip64 end of central directory" structure 314 | return _EndRecData64(fpin, -sizeEndCentDir, endrec) 315 | 316 | # Either this is not a ZIP file, or it is a ZIP file with an archive 317 | # comment. Search the end of the file for the "end of central directory" 318 | # record signature. The comment is the last item in the ZIP file and may be 319 | # up to 64K long. It is assumed that the "end of central directory" magic 320 | # number does not appear in the comment. 321 | maxCommentStart = max(filesize - (1 << 16) - sizeEndCentDir, 0) 322 | fpin.seek(maxCommentStart, 0) 323 | data = fpin.read() 324 | start = data.rfind(stringEndArchive) 325 | if start >= 0: 326 | # found the magic number; attempt to unpack and interpret 327 | recData = data[start:start+sizeEndCentDir] 328 | if len(recData) != sizeEndCentDir: 329 | # Zip file is corrupted. 330 | return None 331 | endrec = list(struct.unpack(structEndArchive, recData)) 332 | commentSize = endrec[_ECD_COMMENT_SIZE] #as claimed by the zip file 333 | comment = data[start+sizeEndCentDir:start+sizeEndCentDir+commentSize] 334 | endrec.append(comment) 335 | endrec.append(maxCommentStart + start) 336 | 337 | # Try to read the "Zip64 end of central directory" structure 338 | return _EndRecData64(fpin, maxCommentStart + start - filesize, 339 | endrec) 340 | 341 | # Unable to find a valid end of central directory structure 342 | return None 343 | 344 | 345 | class ZipInfo (object): 346 | """Class with attributes describing each file in the ZIP archive.""" 347 | 348 | __slots__ = ( 349 | 'orig_filename', 350 | 'filename', 351 | 'date_time', 352 | 'compress_type', 353 | '_compresslevel', 354 | 'comment', 355 | 'extra', 356 | 'create_system', 357 | 'create_version', 358 | 'extract_version', 359 | 'reserved', 360 | 'flag_bits', 361 | 'volume', 362 | 'internal_attr', 363 | 'external_attr', 364 | 'header_offset', 365 | 'CRC', 366 | 'compress_size', 367 | 'file_size', 368 | '_raw_time', 369 | 'sozip_index', 370 | 'chunk_size', 371 | ) 372 | 373 | def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)): 374 | self.orig_filename = filename # Original file name in archive 375 | 376 | # Terminate the file name at the first null byte. Null bytes in file 377 | # names are used as tricks by viruses in archives. 378 | null_byte = filename.find(chr(0)) 379 | if null_byte >= 0: 380 | filename = filename[0:null_byte] 381 | # This is used to ensure paths in generated ZIP files always use 382 | # forward slashes as the directory separator, as required by the 383 | # ZIP format specification. 
384 | if os.sep != "/" and os.sep in filename: 385 | filename = filename.replace(os.sep, "/") 386 | 387 | self.filename = filename # Normalized file name 388 | self.date_time = date_time # year, month, day, hour, min, sec 389 | 390 | if date_time[0] < 1980: 391 | raise ValueError('ZIP does not support timestamps before 1980') 392 | 393 | # Standard values: 394 | self.compress_type = ZIP_STORED # Type of compression for the file 395 | self._compresslevel = None # Level for the compressor 396 | self.comment = b"" # Comment for each file 397 | self.extra = b"" # ZIP extra data 398 | if sys.platform == 'win32': 399 | self.create_system = 0 # System which created ZIP archive 400 | else: 401 | # Assume everything else is unix-y 402 | self.create_system = 3 # System which created ZIP archive 403 | self.create_version = DEFAULT_VERSION # Version which created ZIP archive 404 | self.extract_version = DEFAULT_VERSION # Version needed to extract archive 405 | self.reserved = 0 # Must be zero 406 | self.flag_bits = 0 # ZIP flag bits 407 | self.volume = 0 # Volume number of file header 408 | self.internal_attr = 0 # Internal attributes 409 | self.external_attr = 0 # External file attributes 410 | self.compress_size = 0 # Size of the compressed file 411 | self.file_size = 0 # Size of the uncompressed file 412 | # Other attributes are set by class ZipFile: 413 | # header_offset Byte offset to the file header 414 | # CRC CRC-32 of the uncompressed file 415 | self.sozip_index = None 416 | self.chunk_size = None 417 | 418 | def __repr__(self): 419 | result = ['<%s filename=%r' % (self.__class__.__name__, self.filename)] 420 | if self.compress_type != ZIP_STORED: 421 | result.append(' compress_type=%s' % 422 | compressor_names.get(self.compress_type, 423 | self.compress_type)) 424 | hi = self.external_attr >> 16 425 | lo = self.external_attr & 0xFFFF 426 | if hi: 427 | result.append(' filemode=%r' % stat.filemode(hi)) 428 | if lo: 429 | result.append(' external_attr=%#x' % lo) 430 | isdir = self.is_dir() 431 | if not isdir or self.file_size: 432 | result.append(' file_size=%r' % self.file_size) 433 | if ((not isdir or self.compress_size) and 434 | (self.compress_type != ZIP_STORED or 435 | self.file_size != self.compress_size)): 436 | result.append(' compress_size=%r' % self.compress_size) 437 | result.append('>') 438 | return ''.join(result) 439 | 440 | def FileHeader(self, zip64=None): 441 | """Return the per-file header as a bytes object.""" 442 | dt = self.date_time 443 | dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] 444 | dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) 445 | if self.flag_bits & _MASK_USE_DATA_DESCRIPTOR: 446 | # Set these to zero because we write them after the file data 447 | CRC = compress_size = file_size = 0 448 | else: 449 | CRC = self.CRC 450 | compress_size = self.compress_size 451 | file_size = self.file_size 452 | 453 | extra = self.extra 454 | 455 | min_version = 0 456 | if zip64 is None: 457 | zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT 458 | if zip64: 459 | fmt = ' ZIP64_LIMIT or compress_size > ZIP64_LIMIT: 463 | if not zip64: 464 | raise LargeZipFile("Filesize would require ZIP64 extensions") 465 | # File is larger than what fits into a 4 byte integer, 466 | # fall back to the ZIP64 extension 467 | file_size = 0xffffffff 468 | compress_size = 0xffffffff 469 | min_version = ZIP64_VERSION 470 | 471 | if self.compress_type == ZIP_BZIP2: 472 | min_version = max(BZIP2_VERSION, min_version) 473 | elif self.compress_type == ZIP_LZMA: 474 | min_version = 
max(LZMA_VERSION, min_version) 475 | 476 | self.extract_version = max(min_version, self.extract_version) 477 | self.create_version = max(min_version, self.create_version) 478 | filename, flag_bits = self._encodeFilenameFlags() 479 | header = struct.pack(structFileHeader, stringFileHeader, 480 | self.extract_version, self.reserved, flag_bits, 481 | self.compress_type, dostime, dosdate, CRC, 482 | compress_size, file_size, 483 | len(filename), len(extra)) 484 | return header + filename + extra 485 | 486 | def _encodeFilenameFlags(self): 487 | try: 488 | return self.filename.encode('ascii'), self.flag_bits 489 | except UnicodeEncodeError: 490 | return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME 491 | 492 | def _decodeExtra(self): 493 | # Try to decode the extra field. 494 | extra = self.extra 495 | unpack = struct.unpack 496 | while len(extra) >= 4: 497 | tp, ln = unpack(' len(extra): 499 | raise BadZipFile("Corrupt extra field %04x (size=%d)" % (tp, ln)) 500 | if tp == 0x0001: 501 | data = extra[4:ln+4] 502 | # ZIP64 extension (large files and/or large archives) 503 | try: 504 | if self.file_size in (0xFFFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF): 505 | field = "File size" 506 | self.file_size, = unpack(' 2107: 540 | date_time = (2107, 12, 31, 23, 59, 59) 541 | # Create ZipInfo instance to store file information 542 | if arcname is None: 543 | arcname = filename 544 | arcname = os.path.normpath(os.path.splitdrive(arcname)[1]) 545 | while arcname[0] in (os.sep, os.altsep): 546 | arcname = arcname[1:] 547 | if isdir: 548 | arcname += '/' 549 | zinfo = cls(arcname, date_time) 550 | zinfo.external_attr = (st.st_mode & 0xFFFF) << 16 # Unix attributes 551 | if isdir: 552 | zinfo.file_size = 0 553 | zinfo.external_attr |= 0x10 # MS-DOS directory flag 554 | else: 555 | zinfo.file_size = st.st_size 556 | 557 | return zinfo 558 | 559 | def is_dir(self): 560 | """Return True if this archive member is a directory.""" 561 | return self.filename[-1] == '/' 562 | 563 | def is_sozip_optimized(self, zipfile): 564 | """Return True if this file has a SOZip index.""" 565 | fp = zipfile.fp 566 | cur_pos = fp.tell() 567 | fp.seek(self.header_offset + sizeFileHeader + len(self.orig_filename) + len(self.extra) + self.compress_size) 568 | 569 | fheader = fp.read(sizeFileHeader) 570 | if len(fheader) != sizeFileHeader: 571 | fp.seek(cur_pos) 572 | return False 573 | fheader = struct.unpack(structFileHeader, fheader) 574 | if fheader[_FH_SIGNATURE] != stringFileHeader or \ 575 | fheader[_FH_COMPRESSION_METHOD] != ZIP_STORED: 576 | fp.seek(cur_pos) 577 | return False 578 | fname = fp.read(fheader[_FH_FILENAME_LENGTH]) 579 | if fheader[_FH_EXTRA_FIELD_LENGTH]: 580 | fp.read(fheader[_FH_EXTRA_FIELD_LENGTH]) 581 | 582 | if fheader[_FH_GENERAL_PURPOSE_FLAG_BITS] & _MASK_UTF_FILENAME: 583 | # UTF-8 filename 584 | idx_fname_str = fname.decode("utf-8") 585 | else: 586 | idx_fname_str = fname.decode(zipfile.metadata_encoding or "cp437") 587 | 588 | filename_parts = self.filename.split('/') 589 | filename_parts[-1] = '.' + filename_parts[-1] + '.sozip.idx' 590 | expected_idx_fname_str = '/'.join(filename_parts) 591 | if idx_fname_str != expected_idx_fname_str: 592 | fp.seek(cur_pos) 593 | return False 594 | 595 | version = struct.unpack('> 1) ^ 0xEDB88320 638 | else: 639 | crc >>= 1 640 | return crc 641 | 642 | # ZIP supports a password-based form of encryption. Even though known 643 | # plaintext attacks have been found against it, it is still useful 644 | # to be able to get data out of such a file. 
645 | # 646 | # Usage: 647 | # zd = _ZipDecrypter(mypwd) 648 | # plain_bytes = zd(cypher_bytes) 649 | 650 | def _ZipDecrypter(pwd): 651 | key0 = 305419896 652 | key1 = 591751049 653 | key2 = 878082192 654 | 655 | global _crctable 656 | if _crctable is None: 657 | _crctable = list(map(_gen_crc, range(256))) 658 | crctable = _crctable 659 | 660 | def crc32(ch, crc): 661 | """Compute the CRC32 primitive on one byte.""" 662 | return (crc >> 8) ^ crctable[(crc ^ ch) & 0xFF] 663 | 664 | def update_keys(c): 665 | nonlocal key0, key1, key2 666 | key0 = crc32(c, key0) 667 | key1 = (key1 + (key0 & 0xFF)) & 0xFFFFFFFF 668 | key1 = (key1 * 134775813 + 1) & 0xFFFFFFFF 669 | key2 = crc32(key1 >> 24, key2) 670 | 671 | for p in pwd: 672 | update_keys(p) 673 | 674 | def decrypter(data): 675 | """Decrypt a bytes object.""" 676 | result = bytearray() 677 | append = result.append 678 | for c in data: 679 | k = key2 | 2 680 | c ^= ((k * (k^1)) >> 8) & 0xFF 681 | update_keys(c) 682 | append(c) 683 | return bytes(result) 684 | 685 | return decrypter 686 | 687 | 688 | class LZMACompressor: 689 | 690 | def __init__(self): 691 | self._comp = None 692 | 693 | def _init(self): 694 | props = lzma._encode_filter_properties({'id': lzma.FILTER_LZMA1}) 695 | self._comp = lzma.LZMACompressor(lzma.FORMAT_RAW, filters=[ 696 | lzma._decode_filter_properties(lzma.FILTER_LZMA1, props) 697 | ]) 698 | return struct.pack('> 8) & 0xff 933 | else: 934 | # compare against the CRC otherwise 935 | check_byte = (zipinfo.CRC >> 24) & 0xff 936 | h = self._init_decrypter() 937 | if h != check_byte: 938 | raise RuntimeError("Bad password for file %r" % zipinfo.orig_filename) 939 | 940 | 941 | def _init_decrypter(self): 942 | self._decrypter = _ZipDecrypter(self._pwd) 943 | # The first 12 bytes in the cypher stream is an encryption header 944 | # used to strengthen the algorithm. The first 11 bytes are 945 | # completely random, while the 12th contains the MSB of the CRC, 946 | # or the MSB of the file time depending on the header type 947 | # and is used to check the correctness of the password. 948 | header = self._fileobj.read(12) 949 | self._compress_left -= 12 950 | return self._decrypter(header)[11] 951 | 952 | def __repr__(self): 953 | result = ['<%s.%s' % (self.__class__.__module__, 954 | self.__class__.__qualname__)] 955 | if not self.closed: 956 | result.append(' name=%r mode=%r' % (self.name, self.mode)) 957 | if self._compress_type != ZIP_STORED: 958 | result.append(' compress_type=%s' % 959 | compressor_names.get(self._compress_type, 960 | self._compress_type)) 961 | else: 962 | result.append(' [closed]') 963 | result.append('>') 964 | return ''.join(result) 965 | 966 | def readline(self, limit=-1): 967 | """Read and return a line from the stream. 968 | 969 | If limit is specified, at most limit bytes will be read. 970 | """ 971 | 972 | if limit < 0: 973 | # Shortcut common case - newline found in buffer. 
974 | i = self._readbuffer.find(b'\n', self._offset) + 1 975 | if i > 0: 976 | line = self._readbuffer[self._offset: i] 977 | self._offset = i 978 | return line 979 | 980 | return io.BufferedIOBase.readline(self, limit) 981 | 982 | def peek(self, n=1): 983 | """Returns buffered bytes without advancing the position.""" 984 | if n > len(self._readbuffer) - self._offset: 985 | chunk = self.read(n) 986 | if len(chunk) > self._offset: 987 | self._readbuffer = chunk + self._readbuffer[self._offset:] 988 | self._offset = 0 989 | else: 990 | self._offset -= len(chunk) 991 | 992 | # Return up to 512 bytes to reduce allocation overhead for tight loops. 993 | return self._readbuffer[self._offset: self._offset + 512] 994 | 995 | def readable(self): 996 | if self.closed: 997 | raise ValueError("I/O operation on closed file.") 998 | return True 999 | 1000 | def read(self, n=-1): 1001 | """Read and return up to n bytes. 1002 | If the argument is omitted, None, or negative, data is read and returned until EOF is reached. 1003 | """ 1004 | if self.closed: 1005 | raise ValueError("read from closed file.") 1006 | if n is None or n < 0: 1007 | buf = self._readbuffer[self._offset:] 1008 | self._readbuffer = b'' 1009 | self._offset = 0 1010 | while not self._eof: 1011 | buf += self._read1(self.MAX_N) 1012 | return buf 1013 | 1014 | end = n + self._offset 1015 | if end < len(self._readbuffer): 1016 | buf = self._readbuffer[self._offset:end] 1017 | self._offset = end 1018 | return buf 1019 | 1020 | n = end - len(self._readbuffer) 1021 | buf = self._readbuffer[self._offset:] 1022 | self._readbuffer = b'' 1023 | self._offset = 0 1024 | while n > 0 and not self._eof: 1025 | data = self._read1(n) 1026 | if n < len(data): 1027 | self._readbuffer = data 1028 | self._offset = n 1029 | buf += data[:n] 1030 | break 1031 | buf += data 1032 | n -= len(data) 1033 | return buf 1034 | 1035 | def _update_crc(self, newdata): 1036 | # Update the CRC using the given data. 1037 | if self._expected_crc is None or self._ignore_crc: 1038 | # No need to compute the CRC if we don't have a reference value 1039 | return 1040 | self._running_crc = crc32(newdata, self._running_crc) 1041 | # Check the CRC if we're at the end of the file 1042 | if self._eof and self._running_crc != self._expected_crc: 1043 | raise BadZipFile("Bad CRC-32 for file %r" % self.name) 1044 | 1045 | def read1(self, n): 1046 | """Read up to n bytes with at most one read() system call.""" 1047 | 1048 | if n is None or n < 0: 1049 | buf = self._readbuffer[self._offset:] 1050 | self._readbuffer = b'' 1051 | self._offset = 0 1052 | while not self._eof: 1053 | data = self._read1(self.MAX_N) 1054 | if data: 1055 | buf += data 1056 | break 1057 | return buf 1058 | 1059 | end = n + self._offset 1060 | if end < len(self._readbuffer): 1061 | buf = self._readbuffer[self._offset:end] 1062 | self._offset = end 1063 | return buf 1064 | 1065 | n = end - len(self._readbuffer) 1066 | buf = self._readbuffer[self._offset:] 1067 | self._readbuffer = b'' 1068 | self._offset = 0 1069 | if n > 0: 1070 | while not self._eof: 1071 | data = self._read1(n) 1072 | if n < len(data): 1073 | self._readbuffer = data 1074 | self._offset = n 1075 | buf += data[:n] 1076 | break 1077 | if data: 1078 | buf += data 1079 | break 1080 | return buf 1081 | 1082 | def _read1(self, n): 1083 | # Read up to n compressed bytes with at most one read() system call, 1084 | # decrypt and decompress them. 1085 | if self._eof or n <= 0: 1086 | return b'' 1087 | 1088 | # Read from file. 
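        # For deflate, any data the decompressor did not consume on the
        # previous call (unconsumed_tail) is drained before asking the
        # underlying file object for more bytes.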
1089 | if self._compress_type == ZIP_DEFLATED: 1090 | ## Handle unconsumed data. 1091 | data = self._decompressor.unconsumed_tail 1092 | if n > len(data): 1093 | data += self._read2(n - len(data)) 1094 | else: 1095 | data = self._read2(n) 1096 | 1097 | if self._compress_type == ZIP_STORED: 1098 | self._eof = self._compress_left <= 0 1099 | elif self._compress_type == ZIP_DEFLATED: 1100 | n = max(n, self.MIN_READ_SIZE) 1101 | data = self._decompressor.decompress(data, n) 1102 | self._eof = (self._decompressor.eof or 1103 | self._compress_left <= 0 and 1104 | not self._decompressor.unconsumed_tail) 1105 | if self._eof: 1106 | data += self._decompressor.flush() 1107 | else: 1108 | data = self._decompressor.decompress(data) 1109 | self._eof = self._decompressor.eof or self._compress_left <= 0 1110 | 1111 | data = data[:self._left] 1112 | self._left -= len(data) 1113 | if self._left <= 0: 1114 | self._eof = True 1115 | self._update_crc(data) 1116 | return data 1117 | 1118 | def _read2(self, n): 1119 | if self._compress_left <= 0: 1120 | return b'' 1121 | 1122 | n = max(n, self.MIN_READ_SIZE) 1123 | n = min(n, self._compress_left) 1124 | 1125 | data = self._fileobj.read(n) 1126 | self._compress_left -= len(data) 1127 | if not data: 1128 | raise EOFError 1129 | 1130 | if self._decrypter is not None: 1131 | data = self._decrypter(data) 1132 | return data 1133 | 1134 | def close(self): 1135 | try: 1136 | if self._close_fileobj: 1137 | self._fileobj.close() 1138 | finally: 1139 | super().close() 1140 | 1141 | def seekable(self): 1142 | if self.closed: 1143 | raise ValueError("I/O operation on closed file.") 1144 | return self._seekable 1145 | 1146 | def seek(self, offset, whence=0): 1147 | if self.closed: 1148 | raise ValueError("seek on closed file.") 1149 | if not self._seekable: 1150 | raise io.UnsupportedOperation("underlying stream is not seekable") 1151 | curr_pos = self.tell() 1152 | if whence == 0: # Seek from start of file 1153 | new_pos = offset 1154 | elif whence == 1: # Seek from current position 1155 | new_pos = curr_pos + offset 1156 | elif whence == 2: # Seek from EOF 1157 | new_pos = self._orig_file_size + offset 1158 | else: 1159 | raise ValueError("whence must be os.SEEK_SET (0), " 1160 | "os.SEEK_CUR (1), or os.SEEK_END (2)") 1161 | 1162 | if new_pos > self._orig_file_size: 1163 | new_pos = self._orig_file_size 1164 | 1165 | if new_pos < 0: 1166 | new_pos = 0 1167 | 1168 | read_offset = new_pos - curr_pos 1169 | buff_offset = read_offset + self._offset 1170 | 1171 | if buff_offset >= 0 and buff_offset < len(self._readbuffer): 1172 | # Just move the _offset index if the new position is in the _readbuffer 1173 | self._offset = buff_offset 1174 | read_offset = 0 1175 | elif self._sozip_index is not None and self._decrypter is None: 1176 | # Determine chunk index and distance to chunk 1177 | chunk_index, read_offset = divmod(new_pos, self._chunk_size) 1178 | if chunk_index > len(self._sozip_index): 1179 | chunk_index = len(self._sozip_index) 1180 | read_offset = self._chunk_size 1181 | # The offset of the first chunk (=0) is not part of the sozip index 1182 | if chunk_index > 0: 1183 | chunk_offset = self._sozip_index[chunk_index - 1] 1184 | # When seeked, do not compute CRC 1185 | self._ignore_crc = True 1186 | else: 1187 | chunk_offset = 0 1188 | self._ignore_crc = False 1189 | self._fileobj.seek(self._orig_compress_start + chunk_offset) 1190 | self._running_crc = self._orig_start_crc 1191 | self._compress_left = self._orig_compress_size - chunk_offset 1192 | self._left = 
self._orig_file_size - chunk_index*self._chunk_size 1193 | self._readbuffer = b'' 1194 | self._offset = 0 1195 | self._decompressor = _get_decompressor(self._compress_type) 1196 | self._eof = False 1197 | elif read_offset < 0: 1198 | # Position is before the current position. Reset the ZipExtFile 1199 | self._fileobj.seek(self._orig_compress_start) 1200 | self._running_crc = self._orig_start_crc 1201 | self._compress_left = self._orig_compress_size 1202 | self._left = self._orig_file_size 1203 | self._readbuffer = b'' 1204 | self._offset = 0 1205 | self._decompressor = _get_decompressor(self._compress_type) 1206 | self._eof = False 1207 | read_offset = new_pos 1208 | if self._decrypter is not None: 1209 | self._init_decrypter() 1210 | 1211 | while read_offset > 0: 1212 | read_len = min(self.MAX_SEEK_READ, read_offset) 1213 | self.read(read_len) 1214 | read_offset -= read_len 1215 | 1216 | return self.tell() 1217 | 1218 | def tell(self): 1219 | if self.closed: 1220 | raise ValueError("tell on closed file.") 1221 | if not self._seekable: 1222 | raise io.UnsupportedOperation("underlying stream is not seekable") 1223 | filepos = self._orig_file_size - self._left - len(self._readbuffer) + self._offset 1224 | return filepos 1225 | 1226 | 1227 | class _ZipWriteFile(io.BufferedIOBase): 1228 | def __init__(self, zf, zinfo, zip64, chunk_size=SOZIP_DEFAULT_CHUNK_SIZE): 1229 | self._zinfo = zinfo 1230 | self._zip64 = zip64 1231 | self._zipfile = zf 1232 | self._compressor = _get_compressor(zinfo.compress_type, 1233 | zinfo._compresslevel) 1234 | self._file_size = 0 1235 | self._compress_size = 0 1236 | self._crc = 0 1237 | 1238 | # Start of SOZip specific changes 1239 | self._input_data_buffer = b"" 1240 | self._chunk_size = chunk_size 1241 | self._offsets_in_compressed_stream = [] 1242 | # End of SOZip specific changes 1243 | 1244 | @property 1245 | def _fileobj(self): 1246 | return self._zipfile.fp 1247 | 1248 | def writable(self): 1249 | return True 1250 | 1251 | def write(self, data): 1252 | if self.closed: 1253 | raise ValueError('I/O operation on closed file.') 1254 | 1255 | # Accept any data that supports the buffer protocol 1256 | if isinstance(data, (bytes, bytearray)): 1257 | nbytes = len(data) 1258 | else: 1259 | data = memoryview(data) 1260 | nbytes = data.nbytes 1261 | self._file_size += nbytes 1262 | 1263 | self._crc = crc32(data, self._crc) 1264 | 1265 | # Start of SOZip specific changes 1266 | if self._zinfo.compress_type == ZIP_DEFLATED: 1267 | offset_in_data = 0 1268 | len_data = len(data) 1269 | 1270 | # Accumulate input data into self._input_data_buffer until it 1271 | # reaches self.chunk_size 1272 | while len(self._input_data_buffer) + len_data > self._chunk_size: 1273 | len_to_append = self._chunk_size - len(self._input_data_buffer) 1274 | self._input_data_buffer += data[offset_in_data:offset_in_data + len_to_append] 1275 | if self._compress_size > 0: 1276 | # Store the ofset of the start of a new compressed chunk 1277 | # (except for the first chunk) 1278 | self._offsets_in_compressed_stream.append(self._compress_size) 1279 | compressed_data = self._compressor.compress(self._input_data_buffer) 1280 | compressed_data += self._compressor.flush(zlib.Z_SYNC_FLUSH) 1281 | compressed_data += self._compressor.flush(zlib.Z_FULL_FLUSH) 1282 | self._compress_size += len(compressed_data) 1283 | self._fileobj.write(compressed_data) 1284 | self._input_data_buffer = b"" 1285 | offset_in_data += len_to_append 1286 | len_data -= len_to_append 1287 | 1288 | if len_data > 0: 1289 | # Keep track 
of remaining data 1290 | self._input_data_buffer += data[offset_in_data:] 1291 | 1292 | return nbytes 1293 | # End of SOZip specific changes 1294 | 1295 | if self._compressor: 1296 | data = self._compressor.compress(data) 1297 | self._compress_size += len(data) 1298 | self._fileobj.write(data) 1299 | return nbytes 1300 | 1301 | def close(self): 1302 | if self.closed: 1303 | return 1304 | try: 1305 | super().close() 1306 | # Flush any data from the compressor, and update header info 1307 | if self._compressor: 1308 | # Start of SOZip specific changes 1309 | if self._zinfo.compress_type == ZIP_DEFLATED: 1310 | if self._compress_size > 0: 1311 | # Store the ofset of the start of a new compressed chunk 1312 | # (except for the first chunk) 1313 | self._offsets_in_compressed_stream.append(self._compress_size) 1314 | if self._input_data_buffer: 1315 | buf = self._compressor.compress(self._input_data_buffer) 1316 | buf += self._compressor.flush() 1317 | else: 1318 | buf = self._compressor.flush() 1319 | # End of SOZip specific changes 1320 | else: 1321 | buf = self._compressor.flush() 1322 | self._compress_size += len(buf) 1323 | self._fileobj.write(buf) 1324 | self._zinfo.compress_size = self._compress_size 1325 | else: 1326 | self._zinfo.compress_size = self._file_size 1327 | self._zinfo.CRC = self._crc 1328 | self._zinfo.file_size = self._file_size 1329 | 1330 | # Write updated header info 1331 | if self._zinfo.flag_bits & _MASK_USE_DATA_DESCRIPTOR: 1332 | # Write CRC and file sizes after the file data 1333 | fmt = ' ZIP64_LIMIT: 1340 | raise RuntimeError( 1341 | 'File size unexpectedly exceeded ZIP64 limit') 1342 | if self._compress_size > ZIP64_LIMIT: 1343 | raise RuntimeError( 1344 | 'Compressed size unexpectedly exceeded ZIP64 limit') 1345 | # Seek backwards and write file header (which will now include 1346 | # correct CRC and file sizes) 1347 | 1348 | # Preserve current position in file 1349 | self._zipfile.start_dir = self._fileobj.tell() 1350 | self._fileobj.seek(self._zinfo.header_offset) 1351 | self._fileobj.write(self._zinfo.FileHeader(self._zip64)) 1352 | self._fileobj.seek(self._zipfile.start_dir) 1353 | 1354 | # Start of SOZip specific changes 1355 | if self._offsets_in_compressed_stream: 1356 | # Generates a .sozip.idx file, that has only a local file 1357 | # header, but no corresponding central file record. 
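                # The index payload is a small fixed-size header describing the
                # index (format version, chunk size, ...) followed by one
                # little-endian offset into the compressed stream per chunk
                # after the first, taken from self._offsets_in_compressed_stream.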
1358 | 1359 | # Create payload of index file 1360 | data = struct.pack('') 1537 | return ''.join(result) 1538 | 1539 | def _RealGetContents(self): 1540 | """Read in the table of contents for the ZIP file.""" 1541 | fp = self.fp 1542 | try: 1543 | endrec = _EndRecData(fp) 1544 | except OSError: 1545 | raise BadZipFile("File is not a zip file") 1546 | if not endrec: 1547 | raise BadZipFile("File is not a zip file") 1548 | if self.debug > 1: 1549 | print(endrec) 1550 | size_cd = endrec[_ECD_SIZE] # bytes in central directory 1551 | offset_cd = endrec[_ECD_OFFSET] # offset of central directory 1552 | self._comment = endrec[_ECD_COMMENT] # archive comment 1553 | 1554 | # "concat" is zero, unless zip was concatenated to another file 1555 | concat = endrec[_ECD_LOCATION] - size_cd - offset_cd 1556 | if endrec[_ECD_SIGNATURE] == stringEndArchive64: 1557 | # If Zip64 extension structures are present, account for them 1558 | concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator) 1559 | 1560 | if self.debug > 2: 1561 | inferred = concat + offset_cd 1562 | print("given, inferred, offset", offset_cd, inferred, concat) 1563 | # self.start_dir: Position of start of central directory 1564 | self.start_dir = offset_cd + concat 1565 | if self.start_dir < 0: 1566 | raise BadZipFile("Bad offset for central directory") 1567 | fp.seek(self.start_dir, 0) 1568 | data = fp.read(size_cd) 1569 | fp = io.BytesIO(data) 1570 | total = 0 1571 | while total < size_cd: 1572 | centdir = fp.read(sizeCentralDir) 1573 | if len(centdir) != sizeCentralDir: 1574 | raise BadZipFile("Truncated central directory") 1575 | centdir = struct.unpack(structCentralDir, centdir) 1576 | if centdir[_CD_SIGNATURE] != stringCentralDir: 1577 | raise BadZipFile("Bad magic number for central directory") 1578 | if self.debug > 2: 1579 | print(centdir) 1580 | filename = fp.read(centdir[_CD_FILENAME_LENGTH]) 1581 | flags = centdir[_CD_FLAG_BITS] 1582 | if flags & _MASK_UTF_FILENAME: 1583 | # UTF-8 file names extension 1584 | filename = filename.decode('utf-8') 1585 | else: 1586 | # Historical ZIP filename encoding 1587 | filename = filename.decode(self.metadata_encoding or 'cp437') 1588 | # Create ZipInfo instance to store file information 1589 | x = ZipInfo(filename) 1590 | x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH]) 1591 | x.comment = fp.read(centdir[_CD_COMMENT_LENGTH]) 1592 | x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] 1593 | (x.create_version, x.create_system, x.extract_version, x.reserved, 1594 | x.flag_bits, x.compress_type, t, d, 1595 | x.CRC, x.compress_size, x.file_size) = centdir[1:12] 1596 | if x.extract_version > MAX_EXTRACT_VERSION: 1597 | raise NotImplementedError("zip file version %.1f" % 1598 | (x.extract_version / 10)) 1599 | x.volume, x.internal_attr, x.external_attr = centdir[15:18] 1600 | # Convert date/time code to (year, month, day, hour, min, sec) 1601 | x._raw_time = t 1602 | x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F, 1603 | t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) 1604 | 1605 | x._decodeExtra() 1606 | x.header_offset = x.header_offset + concat 1607 | self.filelist.append(x) 1608 | self.NameToInfo[x.filename] = x 1609 | 1610 | # update total bytes read from central directory 1611 | total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH] 1612 | + centdir[_CD_EXTRA_FIELD_LENGTH] 1613 | + centdir[_CD_COMMENT_LENGTH]) 1614 | 1615 | if self.debug > 2: 1616 | print("total", total) 1617 | 1618 | 1619 | def namelist(self): 1620 | """Return a list of file names in the archive.""" 1621 | return [data.filename for 
data in self.filelist] 1622 | 1623 | def infolist(self): 1624 | """Return a list of class ZipInfo instances for files in the 1625 | archive.""" 1626 | return self.filelist 1627 | 1628 | def printdir(self, file=None): 1629 | """Print a table of contents for the zip file.""" 1630 | print("%-46s %19s %12s %s" % ("File Name", "Modified ", "Size", "SOZip optimized ?"), 1631 | file=file) 1632 | for zinfo in self.filelist: 1633 | date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6] 1634 | print("%-46s %s %12d %s" % (zinfo.filename, date, zinfo.file_size, \ 1635 | "Yes" if zinfo.is_sozip_optimized(self) else ""), 1636 | file=file) 1637 | 1638 | def testzip(self): 1639 | """Read all the files and check the CRC.""" 1640 | chunk_size = 2 ** 20 1641 | for zinfo in self.filelist: 1642 | try: 1643 | # Read by chunks, to avoid an OverflowError or a 1644 | # MemoryError with very large embedded files. 1645 | with self.open(zinfo.filename, "r") as f: 1646 | while f.read(chunk_size): # Check CRC-32 1647 | pass 1648 | except BadZipFile: 1649 | return zinfo.filename 1650 | 1651 | def getinfo(self, name): 1652 | """Return the instance of ZipInfo given 'name'.""" 1653 | info = self.NameToInfo.get(name) 1654 | if info is None: 1655 | raise KeyError( 1656 | 'There is no item named %r in the archive' % name) 1657 | if info.compress_type != ZIP_STORED and info.sozip_index is None: 1658 | # If compressed, set sozip attributes if available 1659 | info.is_sozip_optimized(self) 1660 | return info 1661 | 1662 | def setpassword(self, pwd): 1663 | """Set default password for encrypted files.""" 1664 | if pwd and not isinstance(pwd, bytes): 1665 | raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__) 1666 | if pwd: 1667 | self.pwd = pwd 1668 | else: 1669 | self.pwd = None 1670 | 1671 | @property 1672 | def comment(self): 1673 | """The comment text associated with the ZIP file.""" 1674 | return self._comment 1675 | 1676 | @comment.setter 1677 | def comment(self, comment): 1678 | if not isinstance(comment, bytes): 1679 | raise TypeError("comment: expected bytes, got %s" % type(comment).__name__) 1680 | # check for valid comment length 1681 | if len(comment) > ZIP_MAX_COMMENT: 1682 | import warnings 1683 | warnings.warn('Archive comment is too long; truncating to %d bytes' 1684 | % ZIP_MAX_COMMENT, stacklevel=2) 1685 | comment = comment[:ZIP_MAX_COMMENT] 1686 | self._comment = comment 1687 | self._didModify = True 1688 | 1689 | def read(self, name, pwd=None): 1690 | """Return file bytes for name.""" 1691 | with self.open(name, "r", pwd) as fp: 1692 | return fp.read() 1693 | 1694 | def open(self, name, mode="r", pwd=None, *, force_zip64=False): 1695 | """Return file-like object for 'name'. 1696 | 1697 | name is a string for the file name within the ZIP file, or a ZipInfo 1698 | object. 1699 | 1700 | mode should be 'r' to read a file already in the ZIP file, or 'w' to 1701 | write to a file newly added to the archive. 1702 | 1703 | pwd is the password to decrypt files (only used for reading). 1704 | 1705 | When writing, if the file size is not known in advance but may exceed 1706 | 2 GiB, pass force_zip64 to use the ZIP64 format, which can handle large 1707 | files. If the size is known in advance, it is best to pass a ZipInfo 1708 | instance for name, with zinfo.file_size set. 
1709 | """ 1710 | if mode not in {"r", "w"}: 1711 | raise ValueError('open() requires mode "r" or "w"') 1712 | if pwd and (mode == "w"): 1713 | raise ValueError("pwd is only supported for reading files") 1714 | if not self.fp: 1715 | raise ValueError( 1716 | "Attempt to use ZIP archive that was already closed") 1717 | 1718 | # Make sure we have an info object 1719 | if isinstance(name, ZipInfo): 1720 | # 'name' is already an info object 1721 | zinfo = name 1722 | elif mode == 'w': 1723 | zinfo = ZipInfo(name) 1724 | zinfo.compress_type = self.compression 1725 | zinfo._compresslevel = self.compresslevel 1726 | else: 1727 | # Get info object for name 1728 | zinfo = self.getinfo(name) 1729 | 1730 | if mode == 'w': 1731 | return self._open_to_write(zinfo, force_zip64=force_zip64, 1732 | chunk_size=self._chunk_size) 1733 | 1734 | if self._writing: 1735 | raise ValueError("Can't read from the ZIP file while there " 1736 | "is an open writing handle on it. " 1737 | "Close the writing handle before trying to read.") 1738 | 1739 | # Open for reading: 1740 | self._fileRefCnt += 1 1741 | zef_file = _SharedFile(self.fp, zinfo.header_offset, 1742 | self._fpclose, self._lock, lambda: self._writing) 1743 | try: 1744 | # Skip the file header: 1745 | fheader = zef_file.read(sizeFileHeader) 1746 | if len(fheader) != sizeFileHeader: 1747 | raise BadZipFile("Truncated file header") 1748 | fheader = struct.unpack(structFileHeader, fheader) 1749 | if fheader[_FH_SIGNATURE] != stringFileHeader: 1750 | raise BadZipFile("Bad magic number for file header") 1751 | 1752 | fname = zef_file.read(fheader[_FH_FILENAME_LENGTH]) 1753 | if fheader[_FH_EXTRA_FIELD_LENGTH]: 1754 | zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH]) 1755 | 1756 | if zinfo.flag_bits & _MASK_COMPRESSED_PATCH: 1757 | # Zip 2.7: compressed patched data 1758 | raise NotImplementedError("compressed patched data (flag bit 5)") 1759 | 1760 | if zinfo.flag_bits & _MASK_STRONG_ENCRYPTION: 1761 | # strong encryption 1762 | raise NotImplementedError("strong encryption (flag bit 6)") 1763 | 1764 | if fheader[_FH_GENERAL_PURPOSE_FLAG_BITS] & _MASK_UTF_FILENAME: 1765 | # UTF-8 filename 1766 | fname_str = fname.decode("utf-8") 1767 | else: 1768 | fname_str = fname.decode(self.metadata_encoding or "cp437") 1769 | 1770 | if fname_str != zinfo.orig_filename: 1771 | raise BadZipFile( 1772 | 'File name in directory %r and header %r differ.' 1773 | % (zinfo.orig_filename, fname)) 1774 | 1775 | # check for encrypted flag & handle password 1776 | is_encrypted = zinfo.flag_bits & _MASK_ENCRYPTED 1777 | if is_encrypted: 1778 | if not pwd: 1779 | pwd = self.pwd 1780 | if pwd and not isinstance(pwd, bytes): 1781 | raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__) 1782 | if not pwd: 1783 | raise RuntimeError("File %r is encrypted, password " 1784 | "required for extraction" % name) 1785 | else: 1786 | pwd = None 1787 | 1788 | return ZipExtFile(zef_file, mode, zinfo, pwd, True) 1789 | except: 1790 | zef_file.close() 1791 | raise 1792 | 1793 | def _open_to_write(self, zinfo, force_zip64=False, chunk_size=SOZIP_DEFAULT_CHUNK_SIZE): 1794 | if force_zip64 and not self._allowZip64: 1795 | raise ValueError( 1796 | "force_zip64 is True, but allowZip64 was False when opening " 1797 | "the ZIP file." 1798 | ) 1799 | if self._writing: 1800 | raise ValueError("Can't write to the ZIP file while there is " 1801 | "another write handle open on it. 
" 1802 | "Close the first handle before opening another.") 1803 | 1804 | # Size and CRC are overwritten with correct data after processing the file 1805 | zinfo.compress_size = 0 1806 | zinfo.CRC = 0 1807 | 1808 | zinfo.flag_bits = 0x00 1809 | if zinfo.compress_type == ZIP_LZMA: 1810 | # Compressed data includes an end-of-stream (EOS) marker 1811 | zinfo.flag_bits |= _MASK_COMPRESS_OPTION_1 1812 | if not self._seekable: 1813 | zinfo.flag_bits |= _MASK_USE_DATA_DESCRIPTOR 1814 | 1815 | if not zinfo.external_attr: 1816 | zinfo.external_attr = 0o600 << 16 # permissions: ?rw------- 1817 | 1818 | # Compressed size can be larger than uncompressed size 1819 | zip64 = self._allowZip64 and \ 1820 | (force_zip64 or zinfo.file_size * 1.05 > ZIP64_LIMIT) 1821 | 1822 | if self._seekable: 1823 | self.fp.seek(self.start_dir) 1824 | zinfo.header_offset = self.fp.tell() 1825 | 1826 | self._writecheck(zinfo) 1827 | self._didModify = True 1828 | 1829 | self.fp.write(zinfo.FileHeader(zip64)) 1830 | 1831 | self._writing = True 1832 | return _ZipWriteFile(self, zinfo, zip64, chunk_size) 1833 | 1834 | def extract(self, member, path=None, pwd=None): 1835 | """Extract a member from the archive to the current working directory, 1836 | using its full name. Its file information is extracted as accurately 1837 | as possible. `member' may be a filename or a ZipInfo object. You can 1838 | specify a different directory using `path'. 1839 | """ 1840 | if path is None: 1841 | path = os.getcwd() 1842 | else: 1843 | path = os.fspath(path) 1844 | 1845 | return self._extract_member(member, path, pwd) 1846 | 1847 | def extractall(self, path=None, members=None, pwd=None): 1848 | """Extract all members from the archive to the current working 1849 | directory. `path' specifies a different directory to extract to. 1850 | `members' is optional and must be a subset of the list returned 1851 | by namelist(). 1852 | """ 1853 | if members is None: 1854 | members = self.namelist() 1855 | 1856 | if path is None: 1857 | path = os.getcwd() 1858 | else: 1859 | path = os.fspath(path) 1860 | 1861 | for zipinfo in members: 1862 | self._extract_member(zipinfo, path, pwd) 1863 | 1864 | @classmethod 1865 | def _sanitize_windows_name(cls, arcname, pathsep): 1866 | """Replace bad characters and remove trailing dots from parts.""" 1867 | table = cls._windows_illegal_name_trans_table 1868 | if not table: 1869 | illegal = ':<>|"?*' 1870 | table = str.maketrans(illegal, '_' * len(illegal)) 1871 | cls._windows_illegal_name_trans_table = table 1872 | arcname = arcname.translate(table) 1873 | # remove trailing dots 1874 | arcname = (x.rstrip('.') for x in arcname.split(pathsep)) 1875 | # rejoin, removing empty parts. 1876 | arcname = pathsep.join(x for x in arcname if x) 1877 | return arcname 1878 | 1879 | def _extract_member(self, member, targetpath, pwd): 1880 | """Extract the ZipInfo object 'member' to a physical 1881 | file on the path targetpath. 1882 | """ 1883 | if not isinstance(member, ZipInfo): 1884 | member = self.getinfo(member) 1885 | 1886 | # build the destination pathname, replacing 1887 | # forward slashes to platform specific separators. 1888 | arcname = member.filename.replace('/', os.path.sep) 1889 | 1890 | if os.path.altsep: 1891 | arcname = arcname.replace(os.path.altsep, os.path.sep) 1892 | # interpret absolute pathname as relative, remove drive letter or 1893 | # UNC path, redundant separators, "." and ".." components. 
1894 | arcname = os.path.splitdrive(arcname)[1] 1895 | invalid_path_parts = ('', os.path.curdir, os.path.pardir) 1896 | arcname = os.path.sep.join(x for x in arcname.split(os.path.sep) 1897 | if x not in invalid_path_parts) 1898 | if os.path.sep == '\\': 1899 | # filter illegal characters on Windows 1900 | arcname = self._sanitize_windows_name(arcname, os.path.sep) 1901 | 1902 | targetpath = os.path.join(targetpath, arcname) 1903 | targetpath = os.path.normpath(targetpath) 1904 | 1905 | # Create all upper directories if necessary. 1906 | upperdirs = os.path.dirname(targetpath) 1907 | if upperdirs and not os.path.exists(upperdirs): 1908 | os.makedirs(upperdirs) 1909 | 1910 | if member.is_dir(): 1911 | if not os.path.isdir(targetpath): 1912 | os.mkdir(targetpath) 1913 | return targetpath 1914 | 1915 | with self.open(member, pwd=pwd) as source, \ 1916 | open(targetpath, "wb") as target: 1917 | shutil.copyfileobj(source, target) 1918 | 1919 | return targetpath 1920 | 1921 | def _writecheck(self, zinfo): 1922 | """Check for errors before writing a file to the archive.""" 1923 | if zinfo.filename in self.NameToInfo: 1924 | import warnings 1925 | warnings.warn('Duplicate name: %r' % zinfo.filename, stacklevel=3) 1926 | if self.mode not in ('w', 'x', 'a'): 1927 | raise ValueError("write() requires mode 'w', 'x', or 'a'") 1928 | if not self.fp: 1929 | raise ValueError( 1930 | "Attempt to write ZIP archive that was already closed") 1931 | _check_compression(zinfo.compress_type) 1932 | if not self._allowZip64: 1933 | requires_zip64 = None 1934 | if len(self.filelist) >= ZIP_FILECOUNT_LIMIT: 1935 | requires_zip64 = "Files count" 1936 | elif zinfo.file_size > ZIP64_LIMIT: 1937 | requires_zip64 = "Filesize" 1938 | elif zinfo.header_offset > ZIP64_LIMIT: 1939 | requires_zip64 = "Zipfile size" 1940 | if requires_zip64: 1941 | raise LargeZipFile(requires_zip64 + 1942 | " would require ZIP64 extensions") 1943 | 1944 | def write(self, filename, arcname=None, 1945 | compress_type=None, compresslevel=None): 1946 | """Put the bytes from filename into the archive under the name 1947 | arcname.""" 1948 | if not self.fp: 1949 | raise ValueError( 1950 | "Attempt to write to ZIP archive that was already closed") 1951 | if self._writing: 1952 | raise ValueError( 1953 | "Can't write to ZIP archive while an open writing handle exists" 1954 | ) 1955 | 1956 | zinfo = ZipInfo.from_file(filename, arcname, 1957 | strict_timestamps=self._strict_timestamps) 1958 | 1959 | if zinfo.is_dir(): 1960 | zinfo.compress_size = 0 1961 | zinfo.CRC = 0 1962 | self.mkdir(zinfo) 1963 | else: 1964 | if compress_type is not None: 1965 | zinfo.compress_type = compress_type 1966 | else: 1967 | zinfo.compress_type = self.compression 1968 | 1969 | if compresslevel is not None: 1970 | zinfo._compresslevel = compresslevel 1971 | else: 1972 | zinfo._compresslevel = self.compresslevel 1973 | 1974 | with open(filename, "rb") as src, self.open(zinfo, 'w') as dest: 1975 | shutil.copyfileobj(src, dest, 1024*8) 1976 | 1977 | def writestr(self, zinfo_or_arcname, data, 1978 | compress_type=None, compresslevel=None): 1979 | """Write a file into the archive. The contents is 'data', which 1980 | may be either a 'str' or a 'bytes' instance; if it is a 'str', 1981 | it is encoded as UTF-8 first. 
1982 | 'zinfo_or_arcname' is either a ZipInfo instance or 1983 | the name of the file in the archive.""" 1984 | if isinstance(data, str): 1985 | data = data.encode("utf-8") 1986 | if not isinstance(zinfo_or_arcname, ZipInfo): 1987 | zinfo = ZipInfo(filename=zinfo_or_arcname, 1988 | date_time=time.localtime(time.time())[:6]) 1989 | zinfo.compress_type = self.compression 1990 | zinfo._compresslevel = self.compresslevel 1991 | if zinfo.filename[-1] == '/': 1992 | zinfo.external_attr = 0o40775 << 16 # drwxrwxr-x 1993 | zinfo.external_attr |= 0x10 # MS-DOS directory flag 1994 | else: 1995 | zinfo.external_attr = 0o600 << 16 # ?rw------- 1996 | else: 1997 | zinfo = zinfo_or_arcname 1998 | 1999 | if not self.fp: 2000 | raise ValueError( 2001 | "Attempt to write to ZIP archive that was already closed") 2002 | if self._writing: 2003 | raise ValueError( 2004 | "Can't write to ZIP archive while an open writing handle exists." 2005 | ) 2006 | 2007 | if compress_type is not None: 2008 | zinfo.compress_type = compress_type 2009 | 2010 | if compresslevel is not None: 2011 | zinfo._compresslevel = compresslevel 2012 | 2013 | zinfo.file_size = len(data) # Uncompressed size 2014 | with self._lock: 2015 | with self.open(zinfo, mode='w') as dest: 2016 | dest.write(data) 2017 | 2018 | def mkdir(self, zinfo_or_directory_name, mode=511): 2019 | """Creates a directory inside the zip archive.""" 2020 | if isinstance(zinfo_or_directory_name, ZipInfo): 2021 | zinfo = zinfo_or_directory_name 2022 | if not zinfo.is_dir(): 2023 | raise ValueError("The given ZipInfo does not describe a directory") 2024 | elif isinstance(zinfo_or_directory_name, str): 2025 | directory_name = zinfo_or_directory_name 2026 | if not directory_name.endswith("/"): 2027 | directory_name += "/" 2028 | zinfo = ZipInfo(directory_name) 2029 | zinfo.compress_size = 0 2030 | zinfo.CRC = 0 2031 | zinfo.external_attr = ((0o40000 | mode) & 0xFFFF) << 16 2032 | zinfo.file_size = 0 2033 | zinfo.external_attr |= 0x10 2034 | else: 2035 | raise TypeError("Expected type str or ZipInfo") 2036 | 2037 | with self._lock: 2038 | if self._seekable: 2039 | self.fp.seek(self.start_dir) 2040 | zinfo.header_offset = self.fp.tell() # Start of header bytes 2041 | if zinfo.compress_type == ZIP_LZMA: 2042 | # Compressed data includes an end-of-stream (EOS) marker 2043 | zinfo.flag_bits |= _MASK_COMPRESS_OPTION_1 2044 | 2045 | self._writecheck(zinfo) 2046 | self._didModify = True 2047 | 2048 | self.filelist.append(zinfo) 2049 | self.NameToInfo[zinfo.filename] = zinfo 2050 | self.fp.write(zinfo.FileHeader(False)) 2051 | self.start_dir = self.fp.tell() 2052 | 2053 | def __del__(self): 2054 | """Call the "close()" method in case the user forgot.""" 2055 | self.close() 2056 | 2057 | def close(self): 2058 | """Close the file, and for mode 'w', 'x' and 'a' write the ending 2059 | records.""" 2060 | if self.fp is None: 2061 | return 2062 | 2063 | if self._writing: 2064 | raise ValueError("Can't close the ZIP file while there is " 2065 | "an open writing handle on it. 
" 2066 | "Close the writing handle before closing the zip.") 2067 | 2068 | try: 2069 | if self.mode in ('w', 'x', 'a') and self._didModify: # write ending records 2070 | with self._lock: 2071 | if self._seekable: 2072 | self.fp.seek(self.start_dir) 2073 | self._write_end_record() 2074 | finally: 2075 | fp = self.fp 2076 | self.fp = None 2077 | self._fpclose(fp) 2078 | 2079 | def _write_end_record(self): 2080 | for zinfo in self.filelist: # write central directory 2081 | dt = zinfo.date_time 2082 | dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] 2083 | dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) 2084 | extra = [] 2085 | if zinfo.file_size > ZIP64_LIMIT \ 2086 | or zinfo.compress_size > ZIP64_LIMIT: 2087 | extra.append(zinfo.file_size) 2088 | extra.append(zinfo.compress_size) 2089 | file_size = 0xffffffff 2090 | compress_size = 0xffffffff 2091 | else: 2092 | file_size = zinfo.file_size 2093 | compress_size = zinfo.compress_size 2094 | 2095 | if zinfo.header_offset > ZIP64_LIMIT: 2096 | extra.append(zinfo.header_offset) 2097 | header_offset = 0xffffffff 2098 | else: 2099 | header_offset = zinfo.header_offset 2100 | 2101 | extra_data = zinfo.extra 2102 | min_version = 0 2103 | if extra: 2104 | # Append a ZIP64 field to the extra's 2105 | extra_data = _strip_extra(extra_data, (1,)) 2106 | extra_data = struct.pack( 2107 | ' ZIP_FILECOUNT_LIMIT: 2140 | requires_zip64 = "Files count" 2141 | elif centDirOffset > ZIP64_LIMIT: 2142 | requires_zip64 = "Central directory offset" 2143 | elif centDirSize > ZIP64_LIMIT: 2144 | requires_zip64 = "Central directory size" 2145 | if requires_zip64: 2146 | # Need to write the ZIP64 end-of-archive records 2147 | if not self._allowZip64: 2148 | raise LargeZipFile(requires_zip64 + 2149 | " would require ZIP64 extensions") 2150 | zip64endrec = struct.pack( 2151 | structEndArchive64, stringEndArchive64, 2152 | 44, 45, 45, 0, 0, centDirCount, centDirCount, 2153 | centDirSize, centDirOffset) 2154 | self.fp.write(zip64endrec) 2155 | 2156 | zip64locrec = struct.pack( 2157 | structEndArchive64Locator, 2158 | stringEndArchive64Locator, 0, pos2, 1) 2159 | self.fp.write(zip64locrec) 2160 | centDirCount = min(centDirCount, 0xFFFF) 2161 | centDirSize = min(centDirSize, 0xFFFFFFFF) 2162 | centDirOffset = min(centDirOffset, 0xFFFFFFFF) 2163 | 2164 | endrec = struct.pack(structEndArchive, stringEndArchive, 2165 | 0, 0, centDirCount, centDirCount, 2166 | centDirSize, centDirOffset, len(self._comment)) 2167 | self.fp.write(endrec) 2168 | self.fp.write(self._comment) 2169 | if self.mode == "a": 2170 | self.fp.truncate() 2171 | self.fp.flush() 2172 | 2173 | def _fpclose(self, fp): 2174 | assert self._fileRefCnt > 0 2175 | self._fileRefCnt -= 1 2176 | if not self._fileRefCnt and not self._filePassed: 2177 | fp.close() 2178 | 2179 | 2180 | class PyZipFile(ZipFile): 2181 | """Class to create ZIP archives with Python library files and packages.""" 2182 | 2183 | def __init__(self, file, mode="r", compression=ZIP_STORED, 2184 | allowZip64=True, optimize=-1): 2185 | ZipFile.__init__(self, file, mode=mode, compression=compression, 2186 | allowZip64=allowZip64) 2187 | self._optimize = optimize 2188 | 2189 | def writepy(self, pathname, basename="", filterfunc=None): 2190 | """Add all files from "pathname" to the ZIP archive. 2191 | 2192 | If pathname is a package directory, search the directory and 2193 | all package subdirectories recursively for all *.py and enter 2194 | the modules into the archive. If pathname is a plain 2195 | directory, listdir *.py and enter all modules. 
Else, pathname 2196 | must be a Python *.py file and the module will be put into the 2197 | archive. Added modules are always module.pyc. 2198 | This method will compile the module.py into module.pyc if 2199 | necessary. 2200 | If filterfunc(pathname) is given, it is called with every argument. 2201 | When it is False, the file or directory is skipped. 2202 | """ 2203 | pathname = os.fspath(pathname) 2204 | if filterfunc and not filterfunc(pathname): 2205 | if self.debug: 2206 | label = 'path' if os.path.isdir(pathname) else 'file' 2207 | print('%s %r skipped by filterfunc' % (label, pathname)) 2208 | return 2209 | dir, name = os.path.split(pathname) 2210 | if os.path.isdir(pathname): 2211 | initname = os.path.join(pathname, "__init__.py") 2212 | if os.path.isfile(initname): 2213 | # This is a package directory, add it 2214 | if basename: 2215 | basename = "%s/%s" % (basename, name) 2216 | else: 2217 | basename = name 2218 | if self.debug: 2219 | print("Adding package in", pathname, "as", basename) 2220 | fname, arcname = self._get_codename(initname[0:-3], basename) 2221 | if self.debug: 2222 | print("Adding", arcname) 2223 | self.write(fname, arcname) 2224 | dirlist = sorted(os.listdir(pathname)) 2225 | dirlist.remove("__init__.py") 2226 | # Add all *.py files and package subdirectories 2227 | for filename in dirlist: 2228 | path = os.path.join(pathname, filename) 2229 | root, ext = os.path.splitext(filename) 2230 | if os.path.isdir(path): 2231 | if os.path.isfile(os.path.join(path, "__init__.py")): 2232 | # This is a package directory, add it 2233 | self.writepy(path, basename, 2234 | filterfunc=filterfunc) # Recursive call 2235 | elif ext == ".py": 2236 | if filterfunc and not filterfunc(path): 2237 | if self.debug: 2238 | print('file %r skipped by filterfunc' % path) 2239 | continue 2240 | fname, arcname = self._get_codename(path[0:-3], 2241 | basename) 2242 | if self.debug: 2243 | print("Adding", arcname) 2244 | self.write(fname, arcname) 2245 | else: 2246 | # This is NOT a package directory, add its files at top level 2247 | if self.debug: 2248 | print("Adding files from directory", pathname) 2249 | for filename in sorted(os.listdir(pathname)): 2250 | path = os.path.join(pathname, filename) 2251 | root, ext = os.path.splitext(filename) 2252 | if ext == ".py": 2253 | if filterfunc and not filterfunc(path): 2254 | if self.debug: 2255 | print('file %r skipped by filterfunc' % path) 2256 | continue 2257 | fname, arcname = self._get_codename(path[0:-3], 2258 | basename) 2259 | if self.debug: 2260 | print("Adding", arcname) 2261 | self.write(fname, arcname) 2262 | else: 2263 | if pathname[-3:] != ".py": 2264 | raise RuntimeError( 2265 | 'Files added with writepy() must end with ".py"') 2266 | fname, arcname = self._get_codename(pathname[0:-3], basename) 2267 | if self.debug: 2268 | print("Adding file", arcname) 2269 | self.write(fname, arcname) 2270 | 2271 | def _get_codename(self, pathname, basename): 2272 | """Return (filename, archivename) for the path. 2273 | 2274 | Given a module name path, return the correct file path and 2275 | archive name, compiling if necessary. For example, given 2276 | /python/lib/string, return (/python/lib/string.pyc, string). 
2277 | """ 2278 | def _compile(file, optimize=-1): 2279 | import py_compile 2280 | if self.debug: 2281 | print("Compiling", file) 2282 | try: 2283 | py_compile.compile(file, doraise=True, optimize=optimize) 2284 | except py_compile.PyCompileError as err: 2285 | print(err.msg) 2286 | return False 2287 | return True 2288 | 2289 | file_py = pathname + ".py" 2290 | file_pyc = pathname + ".pyc" 2291 | pycache_opt0 = importlib.util.cache_from_source(file_py, optimization='') 2292 | pycache_opt1 = importlib.util.cache_from_source(file_py, optimization=1) 2293 | pycache_opt2 = importlib.util.cache_from_source(file_py, optimization=2) 2294 | if self._optimize == -1: 2295 | # legacy mode: use whatever file is present 2296 | if (os.path.isfile(file_pyc) and 2297 | os.stat(file_pyc).st_mtime >= os.stat(file_py).st_mtime): 2298 | # Use .pyc file. 2299 | arcname = fname = file_pyc 2300 | elif (os.path.isfile(pycache_opt0) and 2301 | os.stat(pycache_opt0).st_mtime >= os.stat(file_py).st_mtime): 2302 | # Use the __pycache__/*.pyc file, but write it to the legacy pyc 2303 | # file name in the archive. 2304 | fname = pycache_opt0 2305 | arcname = file_pyc 2306 | elif (os.path.isfile(pycache_opt1) and 2307 | os.stat(pycache_opt1).st_mtime >= os.stat(file_py).st_mtime): 2308 | # Use the __pycache__/*.pyc file, but write it to the legacy pyc 2309 | # file name in the archive. 2310 | fname = pycache_opt1 2311 | arcname = file_pyc 2312 | elif (os.path.isfile(pycache_opt2) and 2313 | os.stat(pycache_opt2).st_mtime >= os.stat(file_py).st_mtime): 2314 | # Use the __pycache__/*.pyc file, but write it to the legacy pyc 2315 | # file name in the archive. 2316 | fname = pycache_opt2 2317 | arcname = file_pyc 2318 | else: 2319 | # Compile py into PEP 3147 pyc file. 2320 | if _compile(file_py): 2321 | if sys.flags.optimize == 0: 2322 | fname = pycache_opt0 2323 | elif sys.flags.optimize == 1: 2324 | fname = pycache_opt1 2325 | else: 2326 | fname = pycache_opt2 2327 | arcname = file_pyc 2328 | else: 2329 | fname = arcname = file_py 2330 | else: 2331 | # new mode: use given optimization level 2332 | if self._optimize == 0: 2333 | fname = pycache_opt0 2334 | arcname = file_pyc 2335 | else: 2336 | arcname = file_pyc 2337 | if self._optimize == 1: 2338 | fname = pycache_opt1 2339 | elif self._optimize == 2: 2340 | fname = pycache_opt2 2341 | else: 2342 | msg = "invalid value for 'optimize': {!r}".format(self._optimize) 2343 | raise ValueError(msg) 2344 | if not (os.path.isfile(fname) and 2345 | os.stat(fname).st_mtime >= os.stat(file_py).st_mtime): 2346 | if not _compile(file_py, optimize=self._optimize): 2347 | fname = arcname = file_py 2348 | archivename = os.path.split(arcname)[1] 2349 | if basename: 2350 | archivename = "%s/%s" % (basename, archivename) 2351 | return (fname, archivename) 2352 | 2353 | 2354 | def _parents(path): 2355 | """ 2356 | Given a path with elements separated by 2357 | posixpath.sep, generate all parents of that path. 
2358 | 2359 | >>> list(_parents('b/d')) 2360 | ['b'] 2361 | >>> list(_parents('/b/d/')) 2362 | ['/b'] 2363 | >>> list(_parents('b/d/f/')) 2364 | ['b/d', 'b'] 2365 | >>> list(_parents('b')) 2366 | [] 2367 | >>> list(_parents('')) 2368 | [] 2369 | """ 2370 | return itertools.islice(_ancestry(path), 1, None) 2371 | 2372 | 2373 | def _ancestry(path): 2374 | """ 2375 | Given a path with elements separated by 2376 | posixpath.sep, generate all elements of that path 2377 | 2378 | >>> list(_ancestry('b/d')) 2379 | ['b/d', 'b'] 2380 | >>> list(_ancestry('/b/d/')) 2381 | ['/b/d', '/b'] 2382 | >>> list(_ancestry('b/d/f/')) 2383 | ['b/d/f', 'b/d', 'b'] 2384 | >>> list(_ancestry('b')) 2385 | ['b'] 2386 | >>> list(_ancestry('')) 2387 | [] 2388 | """ 2389 | path = path.rstrip(posixpath.sep) 2390 | while path and path != posixpath.sep: 2391 | yield path 2392 | path, tail = posixpath.split(path) 2393 | 2394 | 2395 | _dedupe = dict.fromkeys 2396 | """Deduplicate an iterable in original order""" 2397 | 2398 | 2399 | def _difference(minuend, subtrahend): 2400 | """ 2401 | Return items in minuend not in subtrahend, retaining order 2402 | with O(1) lookup. 2403 | """ 2404 | return itertools.filterfalse(set(subtrahend).__contains__, minuend) 2405 | 2406 | 2407 | class CompleteDirs(ZipFile): 2408 | """ 2409 | A ZipFile subclass that ensures that implied directories 2410 | are always included in the namelist. 2411 | """ 2412 | 2413 | @staticmethod 2414 | def _implied_dirs(names): 2415 | parents = itertools.chain.from_iterable(map(_parents, names)) 2416 | as_dirs = (p + posixpath.sep for p in parents) 2417 | return _dedupe(_difference(as_dirs, names)) 2418 | 2419 | def namelist(self): 2420 | names = super(CompleteDirs, self).namelist() 2421 | return names + list(self._implied_dirs(names)) 2422 | 2423 | def _name_set(self): 2424 | return set(self.namelist()) 2425 | 2426 | def resolve_dir(self, name): 2427 | """ 2428 | If the name represents a directory, return that name 2429 | as a directory (with the trailing slash). 2430 | """ 2431 | names = self._name_set() 2432 | dirname = name + '/' 2433 | dir_match = name not in names and dirname in names 2434 | return dirname if dir_match else name 2435 | 2436 | @classmethod 2437 | def make(cls, source): 2438 | """ 2439 | Given a source (filename or zipfile), return an 2440 | appropriate CompleteDirs subclass. 2441 | """ 2442 | if isinstance(source, CompleteDirs): 2443 | return source 2444 | 2445 | if not isinstance(source, ZipFile): 2446 | return cls(source) 2447 | 2448 | # Only allow for FastLookup when supplied zipfile is read-only 2449 | if 'r' not in source.mode: 2450 | cls = CompleteDirs 2451 | 2452 | source.__class__ = cls 2453 | return source 2454 | 2455 | 2456 | class FastLookup(CompleteDirs): 2457 | """ 2458 | ZipFile subclass to ensure implicit 2459 | dirs exist and are resolved rapidly. 2460 | """ 2461 | 2462 | def namelist(self): 2463 | with contextlib.suppress(AttributeError): 2464 | return self.__names 2465 | self.__names = super(FastLookup, self).namelist() 2466 | return self.__names 2467 | 2468 | def _name_set(self): 2469 | with contextlib.suppress(AttributeError): 2470 | return self.__lookup 2471 | self.__lookup = super(FastLookup, self)._name_set() 2472 | return self.__lookup 2473 | 2474 | 2475 | class Path: 2476 | """ 2477 | A pathlib-compatible interface for zip files. 2478 | 2479 | Consider a zip file with this structure:: 2480 | 2481 | . 
2482 | ├── a.txt 2483 | └── b 2484 | ├── c.txt 2485 | └── d 2486 | └── e.txt 2487 | 2488 | >>> data = io.BytesIO() 2489 | >>> zf = ZipFile(data, 'w') 2490 | >>> zf.writestr('a.txt', 'content of a') 2491 | >>> zf.writestr('b/c.txt', 'content of c') 2492 | >>> zf.writestr('b/d/e.txt', 'content of e') 2493 | >>> zf.filename = 'mem/abcde.zip' 2494 | 2495 | Path accepts the zipfile object itself or a filename 2496 | 2497 | >>> root = Path(zf) 2498 | 2499 | From there, several path operations are available. 2500 | 2501 | Directory iteration (including the zip file itself): 2502 | 2503 | >>> a, b = root.iterdir() 2504 | >>> a 2505 | Path('mem/abcde.zip', 'a.txt') 2506 | >>> b 2507 | Path('mem/abcde.zip', 'b/') 2508 | 2509 | name property: 2510 | 2511 | >>> b.name 2512 | 'b' 2513 | 2514 | join with divide operator: 2515 | 2516 | >>> c = b / 'c.txt' 2517 | >>> c 2518 | Path('mem/abcde.zip', 'b/c.txt') 2519 | >>> c.name 2520 | 'c.txt' 2521 | 2522 | Read text: 2523 | 2524 | >>> c.read_text() 2525 | 'content of c' 2526 | 2527 | existence: 2528 | 2529 | >>> c.exists() 2530 | True 2531 | >>> (b / 'missing.txt').exists() 2532 | False 2533 | 2534 | Coercion to string: 2535 | 2536 | >>> import os 2537 | >>> str(c).replace(os.sep, posixpath.sep) 2538 | 'mem/abcde.zip/b/c.txt' 2539 | 2540 | At the root, ``name``, ``filename``, and ``parent`` 2541 | resolve to the zipfile. Note these attributes are not 2542 | valid and will raise a ``ValueError`` if the zipfile 2543 | has no filename. 2544 | 2545 | >>> root.name 2546 | 'abcde.zip' 2547 | >>> str(root.filename).replace(os.sep, posixpath.sep) 2548 | 'mem/abcde.zip' 2549 | >>> str(root.parent) 2550 | 'mem' 2551 | """ 2552 | 2553 | __repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})" 2554 | 2555 | def __init__(self, root, at=""): 2556 | """ 2557 | Construct a Path from a ZipFile or filename. 2558 | 2559 | Note: When the source is an existing ZipFile object, 2560 | its type (__class__) will be mutated to a 2561 | specialized type. If the caller wishes to retain the 2562 | original type, the caller should either create a 2563 | separate ZipFile object or pass a filename. 2564 | """ 2565 | self.root = FastLookup.make(root) 2566 | self.at = at 2567 | 2568 | def open(self, mode='r', *args, pwd=None, **kwargs): 2569 | """ 2570 | Open this entry as text or binary following the semantics 2571 | of ``pathlib.Path.open()`` by passing arguments through 2572 | to io.TextIOWrapper(). 
2573 | """ 2574 | if self.is_dir(): 2575 | raise IsADirectoryError(self) 2576 | zip_mode = mode[0] 2577 | if not self.exists() and zip_mode == 'r': 2578 | raise FileNotFoundError(self) 2579 | stream = self.root.open(self.at, zip_mode, pwd=pwd) 2580 | if 'b' in mode: 2581 | if args or kwargs: 2582 | raise ValueError("encoding args invalid for binary operation") 2583 | return stream 2584 | else: 2585 | kwargs["encoding"] = io.text_encoding(kwargs.get("encoding")) 2586 | return io.TextIOWrapper(stream, *args, **kwargs) 2587 | 2588 | @property 2589 | def name(self): 2590 | return pathlib.Path(self.at).name or self.filename.name 2591 | 2592 | @property 2593 | def suffix(self): 2594 | return pathlib.Path(self.at).suffix or self.filename.suffix 2595 | 2596 | @property 2597 | def suffixes(self): 2598 | return pathlib.Path(self.at).suffixes or self.filename.suffixes 2599 | 2600 | @property 2601 | def stem(self): 2602 | return pathlib.Path(self.at).stem or self.filename.stem 2603 | 2604 | @property 2605 | def filename(self): 2606 | return pathlib.Path(self.root.filename).joinpath(self.at) 2607 | 2608 | def read_text(self, *args, **kwargs): 2609 | kwargs["encoding"] = io.text_encoding(kwargs.get("encoding")) 2610 | with self.open('r', *args, **kwargs) as strm: 2611 | return strm.read() 2612 | 2613 | def read_bytes(self): 2614 | with self.open('rb') as strm: 2615 | return strm.read() 2616 | 2617 | def _is_child(self, path): 2618 | return posixpath.dirname(path.at.rstrip("/")) == self.at.rstrip("/") 2619 | 2620 | def _next(self, at): 2621 | return self.__class__(self.root, at) 2622 | 2623 | def is_dir(self): 2624 | return not self.at or self.at.endswith("/") 2625 | 2626 | def is_file(self): 2627 | return self.exists() and not self.is_dir() 2628 | 2629 | def exists(self): 2630 | return self.at in self.root._name_set() 2631 | 2632 | def iterdir(self): 2633 | if not self.is_dir(): 2634 | raise ValueError("Can't listdir a file") 2635 | subs = map(self._next, self.root.namelist()) 2636 | return filter(self._is_child, subs) 2637 | 2638 | def __str__(self): 2639 | return posixpath.join(self.root.filename, self.at) 2640 | 2641 | def __repr__(self): 2642 | return self.__repr.format(self=self) 2643 | 2644 | def joinpath(self, *other): 2645 | next = posixpath.join(self.at, *other) 2646 | return self._next(self.root.resolve_dir(next)) 2647 | 2648 | __truediv__ = joinpath 2649 | 2650 | @property 2651 | def parent(self): 2652 | if not self.at: 2653 | return self.filename.parent 2654 | parent_at = posixpath.dirname(self.at.rstrip('/')) 2655 | if parent_at: 2656 | parent_at += '/' 2657 | return self._next(parent_at) 2658 | 2659 | 2660 | def main(args=None): 2661 | import argparse 2662 | 2663 | description = 'A simple command-line interface for zipfile module.' 
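    # Example invocations, assuming the package is installed and the module is
    # run with "python -m sozipfile.sozipfile":
    #   python -m sozipfile.sozipfile -c archive.zip data.gpkg   # create; members are deflated
    #   python -m sozipfile.sozipfile -l archive.zip             # list, with "SOZip optimized ?" column
    #   python -m sozipfile.sozipfile -t archive.zip             # CRC-check every member
    #   python -m sozipfile.sozipfile -e archive.zip out_dir     # extract into out_dir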
2664 | parser = argparse.ArgumentParser(description=description) 2665 | group = parser.add_mutually_exclusive_group(required=True) 2666 | group.add_argument('-l', '--list', metavar='', 2667 | help='Show listing of a zipfile') 2668 | group.add_argument('-e', '--extract', nargs=2, 2669 | metavar=('', ''), 2670 | help='Extract zipfile into target dir') 2671 | group.add_argument('-c', '--create', nargs='+', 2672 | metavar=('', ''), 2673 | help='Create zipfile from sources') 2674 | group.add_argument('-t', '--test', metavar='', 2675 | help='Test if a zipfile is valid') 2676 | parser.add_argument('--metadata-encoding', metavar='', 2677 | help='Specify encoding of member names for -l, -e and -t') 2678 | args = parser.parse_args(args) 2679 | 2680 | encoding = args.metadata_encoding 2681 | 2682 | if args.test is not None: 2683 | src = args.test 2684 | with ZipFile(src, 'r', metadata_encoding=encoding) as zf: 2685 | badfile = zf.testzip() 2686 | if badfile: 2687 | print("The following enclosed file is corrupted: {!r}".format(badfile)) 2688 | print("Done testing") 2689 | 2690 | elif args.list is not None: 2691 | src = args.list 2692 | with ZipFile(src, 'r', metadata_encoding=encoding) as zf: 2693 | zf.printdir() 2694 | 2695 | elif args.extract is not None: 2696 | src, curdir = args.extract 2697 | with ZipFile(src, 'r', metadata_encoding=encoding) as zf: 2698 | zf.extractall(curdir) 2699 | 2700 | elif args.create is not None: 2701 | if encoding: 2702 | print("Non-conforming encodings not supported with -c.", 2703 | file=sys.stderr) 2704 | sys.exit(1) 2705 | 2706 | zip_name = args.create.pop(0) 2707 | files = args.create 2708 | 2709 | def addToZip(zf, path, zippath): 2710 | if os.path.isfile(path): 2711 | zf.write(path, zippath, ZIP_DEFLATED) 2712 | elif os.path.isdir(path): 2713 | if zippath: 2714 | zf.write(path, zippath) 2715 | for nm in sorted(os.listdir(path)): 2716 | addToZip(zf, 2717 | os.path.join(path, nm), os.path.join(zippath, nm)) 2718 | # else: ignore 2719 | 2720 | with ZipFile(zip_name, 'w') as zf: 2721 | for path in files: 2722 | zippath = os.path.basename(path) 2723 | if not zippath: 2724 | zippath = os.path.basename(os.path.dirname(path)) 2725 | if zippath in ('', os.curdir, os.pardir): 2726 | zippath = '' 2727 | addToZip(zf, path, zippath) 2728 | 2729 | 2730 | if __name__ == "__main__": 2731 | main() 2732 | --------------------------------------------------------------------------------
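Since the module also keeps zipfile's pathlib-style `Path` class (defined near the end of `sozipfile/sozipfile.py`), the same traversal idioms work on a SOZip archive. A minimal sketch, assuming an existing `my.zip`:

```python
import sozipfile.sozipfile as zipfile

root = zipfile.Path('my.zip')        # accepts a filename or an open ZipFile
for entry in root.iterdir():         # direct children; directory entries end with '/'
    if entry.is_file():
        print(entry.name, len(entry.read_bytes()), 'bytes')
```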