├── CompressionBenchmark.png ├── DecompressionBenchmark.png ├── mgzip ├── __init__.py ├── __main__.py └── multiProcGzip.py ├── .vscode └── launch.json ├── LICENSE ├── setup.py ├── .gitignore ├── README.md ├── test.py └── gzipFormat.txt /CompressionBenchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vinlyx/mgzip/HEAD/CompressionBenchmark.png -------------------------------------------------------------------------------- /DecompressionBenchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vinlyx/mgzip/HEAD/DecompressionBenchmark.png -------------------------------------------------------------------------------- /mgzip/__init__.py: -------------------------------------------------------------------------------- 1 | """This module provide a simple replacement of Python internal gzip module 2 | to provide a multiprocessing solution for gzip compression/decompression. 3 | 4 | License: MIT LICENSE 5 | Copyright (c) 2019 Vincent Li 6 | 7 | """ 8 | 9 | from .multiProcGzip import MultiGzipFile, open, compress, decompress, __version__ 10 | 11 | __all__ = ["GzipFile", "open", "compress", "decompress"] 12 | 13 | GzipFile = MultiGzipFile -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // 使用 IntelliSense 了解相关属性。 3 | // 悬停以查看现有属性的描述。 4 | // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Multithread compression", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | "args": ["zipTest.log"] 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /mgzip/__main__.py: -------------------------------------------------------------------------------- 1 | """This module provide a simple replacement of Python internal gzip module 2 | to provide a multiprocessing solution for gzip compression/decompression. 3 | 4 | License: MIT LICENSE 5 | Copyright (c) 2019 Vincent Li 6 | 7 | """ 8 | 9 | import mgzip 10 | 11 | def main(argv): 12 | decompress = False 13 | if argv and argv[0]=='-d': 14 | decompress = True 15 | argv=argv[1:] 16 | if decompress: 17 | f=mgzip.GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer, thread=4, blocksize=10**6) 18 | g=sys.stdout.buffer 19 | else: 20 | f=sys.stdin.buffer 21 | g=mgzip.GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer, thread=4, blocksize=10**6) 22 | while True: 23 | chunk = f.read(1024) 24 | if not chunk: 25 | break 26 | g.write(chunk) 27 | if g is not sys.stdout: 28 | g.close() 29 | if f is not sys.stdin: 30 | f.close() 31 | 32 | if __name__=='__main__': 33 | import sys 34 | main(sys.argv[1:]) 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Vincent Li 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from mgzip import __version__ 3 | 4 | with open('README.md') as fh: 5 | longDesc = fh.read().replace("CompressionBenchmark.png", "https://raw.githubusercontent.com/vinlyx/mgzip/master/CompressionBenchmark.png").replace("DecompressionBenchmark.png", "https://raw.githubusercontent.com/vinlyx/mgzip/master/DecompressionBenchmark.png") 6 | 7 | setup( 8 | name='mgzip', 9 | version=__version__, 10 | author='Vincent Li', 11 | author_email='vincentliyx@gmail.com', 12 | 13 | description='A multi-threading implementation of Python gzip module', 14 | long_description=longDesc, 15 | long_description_content_type="text/markdown", 16 | url='https://github.com/vinlyx/mgzip', 17 | license='MIT', 18 | packages=find_packages(), 19 | classifiers=[ 20 | 'Development Status :: 4 - Beta', 21 | 'License :: OSI Approved :: MIT License', 22 | 'Operating System :: OS Independent', 23 | 'Programming Language :: Python :: 3', 24 | 'Programming Language :: Python :: Implementation :: CPython', 25 | 'Topic :: Software Development :: Libraries :: Python Modules', 26 | 'Intended Audience :: Developers', 27 | ], 28 | python_requires=">=3.6" 29 | ) 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # gz test file 7 | *.gz 8 | zipTest* 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Environments 89 | .env 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | /site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mgzip 2 | A multi-threading implement of Python gzip module 3 | 4 | Using a block indexed GZIP file format to enable compress and decompress in parallel. This implement use 'FEXTRA' to record the index of compressed member, which is defined in offical GZIP file format specification version 4.3, so it is fully compatible with normal GZIP implement. 5 | 6 | This module is **~25X** faster for compression and **~7X** faster for decompression (limited by IO and Python implementation) with a *24 CPUs* computer. 7 | 8 | ***In theoretical, compression and decompression acceleration should be linear according to the CPU cores. In fact, the performance is limited by IO and program language implementation.*** 9 | 10 | ## Usage 11 | Use same method as gzip module 12 | ```python 13 | import mgzip 14 | 15 | s = "a big string..." 16 | 17 | ## Use 8 threads to compress. 18 | ## None or 0 means using all CPUs (default) 19 | ## Compression block size is set to 200MB 20 | with mgzip.open("test.txt.gz", "wt", thread=8, blocksize=2*10**8) as fw: 21 | fw.write(s) 22 | 23 | with mgzip.open("test.txt.gz", "rt", thread=8) as fr: 24 | assert fr.read(len(s)) == s 25 | ``` 26 | 27 | ## Performance 28 | ### Compression: 29 | ![Compression Performance](CompressionBenchmark.png) 30 | 31 | ### Decompression: 32 | ![Decompression Performance](DecompressionBenchmark.png) 33 | 34 | *Brenchmarked on a 24 cores, 48 threads server (Xeon(R) CPU E5-2650 v4 @ 2.20GHz) with 8.0GB FASTQ text file.* 35 | 36 | *Using parameters thread=42 and blocksize=200000000* 37 | 38 | ## Warning 39 | **This package only replace the 'GzipFile' class and 'open', 'compress', 'decompress' functions of standard gzip module. It is not well tested for other class and function.** 40 | 41 | **As the first release version, some features are not yet supported, such as seek() and tell(). Any contribution or improvement is appreciated.** 42 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import mgzip 2 | # import gzip as mgzip 3 | import time 4 | 5 | def _test(): 6 | import sys 7 | import os 8 | # Act like gzip; with -d, act like gunzip. 9 | # The input file is not deleted, however, nor are any other gzip 10 | # options or features supported. 11 | args = sys.argv[1:] 12 | decompress = args and args[0] == "-d" 13 | if decompress: 14 | arg = args[1] 15 | else: 16 | arg = args[0] 17 | # if not args: 18 | # args = ["-"] 19 | if decompress: 20 | tsize = 0 21 | if arg != "-": 22 | # outf = arg + ".dcp" 23 | outf = "/dev/null" 24 | fh = open(outf, "wb") 25 | gh = mgzip.open(arg, "rb") 26 | t0 = time.time() 27 | # gh.show_index() 28 | # data = b"AAA" 29 | chunk_size = 10**7 30 | while True: 31 | data = gh.read(chunk_size) 32 | # data = gh.readline() 33 | if not data: 34 | break 35 | fh.write(data) 36 | tsize += len(data) 37 | # data = gh.readline() 38 | t1 = time.time() 39 | fh.close() 40 | gh.close() 41 | size = tsize/(1024**2) 42 | seconds = t1 - t0 43 | speed = size/seconds 44 | nsize = os.stat(arg).st_size 45 | print("Decompressed {:.2f} MB data in {:.2f} S, Speed: {:.2f} MB/s, Rate: {:.2f} %".format(size, seconds, speed, nsize/tsize*100)) 46 | else: 47 | if arg != "-": 48 | outf = arg + ".gz" 49 | fh = open(arg, "rb") 50 | gh = mgzip.open(outf, "wb", compresslevel=6) 51 | data = fh.read() 52 | t0 = time.time() 53 | gh.write(data) 54 | gh.close() 55 | t1 = time.time() 56 | size = len(data)/(1024**2) 57 | seconds = t1 - t0 58 | speed = size/seconds 59 | nsize = os.stat(outf).st_size 60 | print("Compressed {:.2f} MB data in {:.2f} S, Speed: {:.2f} MB/s, Rate: {:.2f} %".format(size, seconds, speed, nsize/len(data)*100)) 61 | 62 | if __name__ == '__main__': 63 | _test() -------------------------------------------------------------------------------- /gzipFormat.txt: -------------------------------------------------------------------------------- 1 | GZIP file format specification version 4.3 2 | Status of This Memo 3 | This memo provides information for the Internet community. This memo does not specify an Internet standard of any kind. Distribution of this memo is unlimited. 4 | IESG Note: 5 | The IESG takes no position on the validity of any Intellectual Property Rights statements contained in this document. 6 | Notices 7 | Copyright © 1996 L. Peter Deutsch 8 | Permission is granted to copy and distribute this document for any purpose and without charge, including translations into other languages and incorporation into compilations, provided that the copyright notice and this notice are preserved, and that any substantive changes or deletions from the original are clearly marked. 9 | 10 | A pointer to the latest version of this and related documentation in HTML format can be found at the URL . 11 | 12 | Abstract 13 | This specification defines a lossless compressed data format that is compatible with the widely used GZIP utility. The format includes a cyclic redundancy check value for detecting data corruption. The format presently uses the DEFLATE method of compression but can be easily extended to use other compression methods. The format can be implemented readily in a manner not covered by patents. 14 | Table of Contents 15 | 1. Introduction 16 | 1.1. Purpose 17 | 1.2. Intended audience 18 | 1.3. Scope 19 | 1.4. Compliance 20 | 1.5. Definitions of terms and conventions used 21 | 1.6. Changes from previous versions 22 | 2. Detailed specification 23 | 2.1. Overall conventions 24 | 2.2. File format 25 | 2.3. Member format 26 | 2.3.1. Member header and trailer 27 | 2.3.1.1. Extra field 28 | 2.3.1.2. Compliance 29 | 3. References 30 | 4. Security Considerations 31 | 5. Acknowledgements 32 | 6. Author's Address 33 | 7. Appendix: Jean-Loup Gailly's gzip utility 34 | 8. Appendix: Sample CRC Code 35 | 1. Introduction 36 | Purpose 37 | The purpose of this specification is to define a lossless compressed data format that: 38 | 39 | Is independent of CPU type, operating system, file system, and character set, and hence can be used for interchange; 40 | Can compress or decompress a data stream (as opposed to a randomly accessible file) to produce another data stream, using only an a priori bounded amount of intermediate storage, and hence can be used in data communications or similar structures such as Unix filters; 41 | Compresses data with efficiency comparable to the best currently available general-purpose compression methods, and in particular considerably better than the "compress" program; 42 | Can be implemented readily in a manner not covered by patents, and hence can be practiced freely; 43 | Is compatible with the file format produced by the current widely used gzip utility, in that conforming decompressors will be able to read data produced by the existing gzip compressor. 44 | The data format defined by this specification does not attempt to: 45 | Provide random access to compressed data; 46 | Compress specialized data (e.g., raster graphics) as well as the best currently available specialized algorithms. 47 | Intended audience 48 | This specification is intended for use by implementors of software to compress data into gzip format and/or decompress data from gzip format. 49 | 50 | The text of the specification assumes a basic background in programming at the level of bits and other primitive data representations. 51 | 52 | Scope 53 | The specification specifies a compression method and a file format (the latter assuming only that a file can store a sequence of arbitrary bytes). It does not specify any particular interface to a file system or anything about character sets or encodings (except for file names and comments, which are optional). 54 | 55 | Compliance 56 | Unless otherwise indicated below, a compliant decompressor must be able to accept and decompress any file that conforms to all the specifications presented here; a compliant compressor must produce files that conform to all the specifications presented here. The material in the appendices is not part of the specification per se and is not relevant to compliance. 57 | 58 | Definitions of terms and conventions used 59 | byte: 8 bits stored or transmitted as a unit (same as an octet). (For this specification, a byte is exactly 8 bits, even on machines which store a character on a number of bits different from 8.) See below for the numbering of bits within a byte. 60 | 61 | 1.6. Changes from previous versions 62 | There have been no technical changes to the gzip format since version 4.1 of this specification. In version 4.2, some terminology was changed, and the sample CRC code was rewritten for clarity and to eliminate the requirement for the caller to do pre- and post-conditioning. Version 4.3 is a conversion of the specification to RFC style. 63 | 64 | 2. Detailed specification 65 | Overall conventions 66 | In the diagrams below, a box like this: 67 | +---+ 68 | | | <-- the vertical bars might be missing 69 | +---+ 70 | represents one byte; a box like this: 71 | +==============+ 72 | | | 73 | +==============+ 74 | represents a variable number of bytes. 75 | Bytes stored within a computer do not have a "bit order", since they are always treated as a unit. However, a byte considered as an integer between 0 and 255 does have a most- and least-significant bit, and since we write numbers with the most-significant digit on the left, we also write bytes with the most-significant bit on the left. In the diagrams below, we number the bits of a byte so that bit 0 is the least-significant bit, i.e., the bits are numbered: 76 | 77 | +--------+ 78 | |76543210| 79 | +--------+ 80 | This document does not address the issue of the order in which bits of a byte are transmitted on a bit-sequential medium, since the data format described here is byte- rather than bit-oriented. 81 | 82 | Within a computer, a number may occupy multiple bytes. All multi-byte numbers in the format described here are stored with the least-significant byte first (at the lower memory address). For example, the decimal number 520 is stored as: 83 | 84 | 0 1 85 | +--------+--------+ 86 | |00001000|00000010| 87 | +--------+--------+ 88 | ^ ^ 89 | | | 90 | | + more significant byte = 2 x 256 91 | + less significant byte = 8 92 | File format 93 | A gzip file consists of a series of "members" (compressed data sets). The format of each member is specified in the following section. The members simply appear one after another in the file, with no additional information before, between, or after them. 94 | 95 | Member format 96 | Each member has the following structure: 97 | +---+---+---+---+---+---+---+---+---+---+ 98 | |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->) 99 | +---+---+---+---+---+---+---+---+---+---+ 100 | (if FLG.FEXTRA set) 101 | +---+---+=================================+ 102 | | XLEN |...XLEN bytes of "extra field"...| (more-->) 103 | +---+---+=================================+ 104 | (if FLG.FNAME set) 105 | +=========================================+ 106 | |...original file name, zero-terminated...| (more-->) 107 | +=========================================+ 108 | (if FLG.FCOMMENT set) 109 | +===================================+ 110 | |...file comment, zero-terminated...| (more-->) 111 | +===================================+ 112 | (if FLG.FHCRC set) 113 | +---+---+ 114 | | CRC16 | 115 | +---+---+ 116 | +=======================+ 117 | |...compressed blocks...| (more-->) 118 | +=======================+ 119 | 0 1 2 3 4 5 6 7 120 | +---+---+---+---+---+---+---+---+ 121 | | CRC32 | ISIZE | 122 | +---+---+---+---+---+---+---+---+ 123 | Member header and trailer 124 | ID1 (IDentification 1) 125 | ID2 (IDentification 2) 126 | These have the fixed values ID1 = 31 (0x1f, \037), ID2 = 139 (0x8b, \213), to identify the file as being in gzip format. 127 | CM (Compression Method) 128 | This identifies the compression method used in the file. CM = 0-7 are reserved. CM = 8 denotes the "deflate" compression method, which is the one customarily used by gzip and which is documented elsewhere. 129 | FLG (FLaGs) 130 | This flag byte is divided into individual bits as follows: 131 | bit 0 FTEXT 132 | bit 1 FHCRC 133 | bit 2 FEXTRA 134 | bit 3 FNAME 135 | bit 4 FCOMMENT 136 | bit 5 reserved 137 | bit 6 reserved 138 | bit 7 reserved 139 | If FTEXT is set, the file is probably ASCII text. This is an optional indication, which the compressor may set by checking a small amount of the input data to see whether any non-ASCII characters are present. In case of doubt, FTEXT is cleared, indicating binary data. For systems which have different file formats for ascii text and binary data, the decompressor can use FTEXT to choose the appropriate format. We deliberately do not specify the algorithm used to set this bit, since a compressor always has the option of leaving it cleared and a decompressor always has the option of ignoring it and letting some other program handle issues of data conversion. 140 | 141 | If FHCRC is set, a CRC16 for the gzip header is present, immediately before the compressed data. The CRC16 consists of the two least significant bytes of the CRC32 for all bytes of the gzip header up to and not including the CRC16. [The FHCRC bit was never set by versions of gzip up to 1.2.4, even though it was documented with a different meaning in gzip 1.2.4.] 142 | 143 | If FEXTRA is set, optional extra fields are present, as described in a following section. 144 | 145 | If FNAME is set, an original file name is present, terminated by a zero byte. The name must consist of ISO 8859-1 (LATIN-1) characters; on operating systems using EBCDIC or any other character set for file names, the name must be translated to the ISO LATIN-1 character set. This is the original name of the file being compressed, with any directory components removed, and, if the file being compressed is on a file system with case insensitive names, forced to lower case. There is no original file name if the data was compressed from a source other than a named file; for example, if the source was stdin on a Unix system, there is no file name. 146 | 147 | If FCOMMENT is set, a zero-terminated file comment is present. This comment is not interpreted; it is only intended for human consumption. The comment must consist of ISO 8859-1 (LATIN-1) characters. Line breaks should be denoted by a single line feed character (10 decimal). 148 | 149 | Reserved FLG bits must be zero. 150 | 151 | MTIME (Modification TIME) 152 | This gives the most recent modification time of the original file being compressed. The time is in Unix format, i.e., seconds since 00:00:00 GMT, Jan. 1, 1970. (Note that this may cause problems for MS-DOS and other systems that use local rather than Universal time.) If the compressed data did not come from a file, MTIME is set to the time at which compression started. MTIME = 0 means no time stamp is available. 153 | XFL (eXtra FLags) 154 | These flags are available for use by specific compression methods. The "deflate" method (CM = 8) sets these flags as follows: 155 | XFL = 2 - compressor used maximum compression, 156 | slowest algorithm 157 | XFL = 4 - compressor used fastest algorithm 158 | OS (Operating System) 159 | This identifies the type of file system on which compression took place. This may be useful in determining end-of-line convention for text files. The currently defined values are as follows: 160 | 0 - FAT filesystem (MS-DOS, OS/2, NT/Win32) 161 | 1 - Amiga 162 | 2 - VMS (or OpenVMS) 163 | 3 - Unix 164 | 4 - VM/CMS 165 | 5 - Atari TOS 166 | 6 - HPFS filesystem (OS/2, NT) 167 | 7 - Macintosh 168 | 8 - Z-System 169 | 9 - CP/M 170 | 10 - TOPS-20 171 | 11 - NTFS filesystem (NT) 172 | 12 - QDOS 173 | 13 - Acorn RISCOS 174 | 255 - unknown 175 | XLEN (eXtra LENgth) 176 | If FLG.FEXTRA is set, this gives the length of the optional extra field. See below for details. 177 | CRC32 (CRC-32) 178 | This contains a Cyclic Redundancy Check value of the uncompressed data computed according to CRC-32 algorithm used in the ISO 3309 standard and in section 8.1.1.6.2 of ITU-T recommendation V.42. (See http://www.iso.ch for ordering ISO documents. See gopher://info.itu.ch for an online version of ITU-T V.42.) 179 | ISIZE (Input SIZE) 180 | This contains the size of the original (uncompressed) input data modulo 2^32. 181 | Extra field 182 | If the FLG.FEXTRA bit is set, an "extra field" is present in the header, with total length XLEN bytes. It consists of a series of subfields, each of the form: 183 | 184 | +---+---+---+---+==================================+ 185 | |SI1|SI2| LEN |... LEN bytes of subfield data ...| 186 | +---+---+---+---+==================================+ 187 | SI1 and SI2 provide a subfield ID, typically two ASCII letters with some mnemonic value. Jean-Loup Gailly is maintaining a registry of subfield IDs; please send him any subfield ID you wish to use. Subfield IDs with SI2 = 0 are reserved for future use. The following IDs are currently defined: 188 | SI1 SI2 Data 189 | ---------- ---------- ---- 190 | 0x41 ('A') 0x70 ('P') Apollo file type information 191 | LEN gives the length of the subfield data, excluding the 4 initial bytes. 192 | Compliance 193 | A compliant compressor must produce files with correct ID1, ID2, CM, CRC32, and ISIZE, but may set all the other fields in the fixed-length part of the header to default values (255 for OS, 0 for all others). The compressor must set all reserved bits to zero. 194 | 195 | A compliant decompressor must check ID1, ID2, and CM, and provide an error indication if any of these have incorrect values. It must examine FEXTRA/XLEN, FNAME, FCOMMENT and FHCRC at least so it can skip over the optional fields if they are present. It need not examine any other part of the header or trailer; in particular, a decompressor may ignore FTEXT and OS and always produce binary output, and still be compliant. A compliant decompressor must give an error indication if any reserved bit is non-zero, since such a bit could indicate the presence of a new field that would cause subsequent data to be interpreted incorrectly. 196 | 197 | 3. References 198 | [1] "Information Processing - 8-bit single-byte coded graphic character sets - Part 1: Latin alphabet No.1" (ISO 8859-1:1987). The ISO 8859-1 (Latin-1) character set is a superset of 7-bit ASCII. Files defining this character set are available as iso_8859-1.* in ftp://ftp.uu.net/graphics/png/documents/ 199 | 200 | [2] ISO 3309 201 | 202 | [3] ITU-T recommendation V.42 203 | 204 | [4] Deutsch, L.P.,"DEFLATE Compressed Data Format Specification", available in ftp://ftp.uu.net/pub/archiving/zip/doc/ 205 | 206 | [5] Gailly, J.-L., GZIP documentation, available as gzip-*.tar in ftp://prep.ai.mit.edu/pub/gnu/ 207 | 208 | [6] Sarwate, D.V., "Computation of Cyclic Redundancy Checks via Table Look-Up", Communications of the ACM, 31(8), pp.1008-1013. 209 | 210 | [7] Schwaderer, W.D., "CRC Calculation", April 85 PC Tech Journal, pp.118-133. 211 | 212 | [8] ftp://ftp.rocksoft.com/papers/crc_v3.txt, describing the CRC concept. 213 | 214 | 4. Security Considerations 215 | Any data compression method involves the reduction of redundancy in the data. Consequently, any corruption of the data is likely to have severe effects and be difficult to correct. Uncompressed text, on the other hand, will probably still be readable despite the presence of some corrupted bytes. It is recommended that systems using this data format provide some means of validating the integrity of the compressed data, such as by setting and checking the CRC-32 check value. 216 | 5. Acknowledgements 217 | Trademarks cited in this document are the property of their respective owners. 218 | Jean-Loup Gailly designed the gzip format and wrote, with Mark Adler, the related software described in this specification. Glenn Randers-Pehrson converted this document to RFC and HTML format. 219 | 220 | 6. Author's Address 221 | L. Peter Deutsch 222 | Aladdin Enterprises 223 | 203 Santa Margarita Ave. 224 | Menlo Park, CA 94025 225 | 226 | Phone: (415) 322-0103 (AM only) 227 | FAX: (415) 322-1734 228 | EMail: 229 | Questions about the technical content of this specification can be sent by email to: 230 | Jean-Loup Gailly and 231 | Mark Adler 232 | Editorial comments on this specification can be sent by email to: 233 | L. Peter Deutsch and 234 | Glenn Randers-Pehrson 235 | 7. Appendix: Jean-Loup Gailly's gzip utility 236 | The most widely used implementation of gzip compression, and the original documentation on which this specification is based, were created by Jean-Loup Gailly . Since this implementation is a de facto standard, we mention some more of its features here. Again, the material in this section is not part of the specification per se, and implementations need not follow it to be compliant. 237 | When compressing or decompressing a file, gzip preserves the protection, ownership, and modification time attributes on the local file system, since there is no provision for representing protection attributes in the gzip file format itself. Since the file format includes a modification time, the gzip decompressor provides a command line switch that assigns the modification time from the file, rather than the local modification time of the compressed input, to the decompressed output. 238 | 239 | 8. Appendix: Sample CRC Code 240 | The following sample code represents a practical implementation of the CRC (Cyclic Redundancy Check). (See also ISO 3309 and ITU-T V.42 for a formal specification.) 241 | The sample code is in the ANSI C programming language. Non C users may find it easier to read with these hints: 242 | 243 | & Bitwise AND operator. 244 | ^ Bitwise exclusive-OR operator. 245 | >> Bitwise right shift operator. When applied to an 246 | unsigned quantity, as here, right shift inserts zero 247 | bit(s) at the left. 248 | ! Logical NOT operator. 249 | ++ "n++" increments the variable n. 250 | 0xNNN 0x introduces a hexadecimal (base 16) constant. 251 | Suffix L indicates a long value (at least 32 bits). 252 | 253 | /* Table of CRCs of all 8-bit messages. */ 254 | unsigned long crc_table[256]; 255 | 256 | /* Flag: has the table been computed? Initially false. */ 257 | int crc_table_computed = 0; 258 | 259 | /* Make the table for a fast CRC. */ 260 | void make_crc_table(void) 261 | { 262 | unsigned long c; 263 | int n, k; 264 | 265 | for (n = 0; n < 256; n++) { 266 | c = (unsigned long) n; 267 | for (k = 0; k < 8; k++) { 268 | if (c & 1) { 269 | c = 0xedb88320L ^ (c >> 1); 270 | } else { 271 | c = c >> 1; 272 | } 273 | } 274 | crc_table[n] = c; 275 | } 276 | crc_table_computed = 1; 277 | } 278 | 279 | /* 280 | Update a running crc with the bytes buf[0..len-1] and return 281 | the updated crc. The crc should be initialized to zero. Pre- and 282 | post-conditioning (one's complement) is performed within this 283 | function so it shouldn't be done by the caller. Usage example: 284 | 285 | unsigned long crc = 0L; 286 | 287 | while (read_buffer(buffer, length) != EOF) { 288 | crc = update_crc(crc, buffer, length); 289 | } 290 | if (crc != original_crc) error(); 291 | */ 292 | unsigned long update_crc(unsigned long crc, 293 | unsigned char *buf, int len) 294 | { 295 | unsigned long c = crc ^ 0xffffffffL; 296 | int n; 297 | 298 | if (!crc_table_computed) 299 | make_crc_table(); 300 | for (n = 0; n < len; n++) { 301 | c = crc_table[(c ^ buf[n]) & 0xff] ^ (c >> 8); 302 | } 303 | return c ^ 0xffffffffL; 304 | } 305 | 306 | /* Return the CRC of the bytes buf[0..len-1]. */ 307 | unsigned long crc(unsigned char *buf, int len) 308 | { 309 | return update_crc(0L, buf, len); 310 | } -------------------------------------------------------------------------------- /mgzip/multiProcGzip.py: -------------------------------------------------------------------------------- 1 | """This module provide a simple replacement of Python internal gzip module 2 | to provide a multiprocessing solution for gzip compression/decompression. 3 | 4 | License: MIT LICENSE 5 | Copyright (c) 2019 Vincent Li 6 | 7 | """ 8 | 9 | import os, time 10 | import builtins 11 | import struct 12 | import zlib 13 | import io 14 | from gzip import GzipFile, write32u, _GzipReader, _PaddedFile, READ, WRITE, FEXTRA, FNAME, FCOMMENT, FHCRC 15 | from multiprocessing.dummy import Pool 16 | 17 | __version__ = "0.2.1" 18 | 19 | SID = b'IG' # Subfield ID of indexed gzip file 20 | 21 | def open(filename, mode="rb", compresslevel=9, 22 | encoding=None, errors=None, newline=None, 23 | thread=None, blocksize=10**8): 24 | """Open a gzip-compressed file in binary or text mode. 25 | 26 | The filename argument can be an actual filename (a str or bytes object), or 27 | an existing file object to read from or write to. 28 | 29 | The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for 30 | binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is 31 | "rb", and the default compresslevel is 9. 32 | 33 | For binary mode, this function is equivalent to the GzipFile constructor: 34 | GzipFile(filename, mode, compresslevel). In this case, the encoding, errors 35 | and newline arguments must not be provided. 36 | 37 | For text mode, a GzipFile object is created, and wrapped in an 38 | io.TextIOWrapper instance with the specified encoding, error handling 39 | behavior, and line ending(s). 40 | 41 | """ 42 | if "t" in mode: 43 | if "b" in mode: 44 | raise ValueError("Invalid mode: %r" % (mode,)) 45 | else: 46 | if encoding is not None: 47 | raise ValueError("Argument 'encoding' not supported in binary mode") 48 | if errors is not None: 49 | raise ValueError("Argument 'errors' not supported in binary mode") 50 | if newline is not None: 51 | raise ValueError("Argument 'newline' not supported in binary mode") 52 | 53 | gz_mode = mode.replace("t", "") 54 | if isinstance(filename, (str, bytes)): 55 | binary_file = MultiGzipFile(filename, gz_mode, compresslevel, thread=thread, blocksize=blocksize) 56 | elif hasattr(filename, "read") or hasattr(filename, "write"): 57 | binary_file = MultiGzipFile(None, gz_mode, compresslevel, filename, thread=thread, blocksize=blocksize) 58 | else: 59 | raise TypeError("filename must be a str or bytes object, or a file") 60 | 61 | if "t" in mode: 62 | return io.TextIOWrapper(binary_file, encoding, errors, newline) 63 | else: 64 | return binary_file 65 | 66 | def compress(data, compresslevel=9, thread=None, blocksize=10**8): 67 | """Compress data in one shot and return the compressed string. 68 | Optional argument is the compression level, in range of 0-9. 69 | """ 70 | buf = io.BytesIO() 71 | with MultiGzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel, 72 | thread=thread, blocksize=blocksize) as f: 73 | f.write(data) 74 | return buf.getvalue() 75 | 76 | def decompress(data, thread=None, blocksize=10**8): 77 | """Decompress a gzip compressed string in one shot. 78 | Return the decompressed string. 79 | """ 80 | with MultiGzipFile(fileobj=io.BytesIO(data), thread=thread, 81 | blocksize=blocksize) as f: 82 | return f.read() 83 | 84 | def padded_file_seek(self, off, whence=0): 85 | """ 86 | Provide a whence of seek method in gzip 87 | to allow seek to the end of file. 88 | * FIXME: This method may have some problem 89 | is stream mode since it is unable to seek 90 | to the end of stream object. 91 | """ 92 | self._read = None 93 | self._buffer = None 94 | return self.file.seek(off, whence) 95 | _PaddedFile.seek = padded_file_seek # override the seek method to provide whence parameter 96 | 97 | class MultiGzipFile(GzipFile): 98 | """ docstring of MultiGzipFile """ 99 | 100 | def __init__(self, filename=None, mode=None, 101 | compresslevel=9, fileobj=None, mtime=None, 102 | thread=None, blocksize=10**8): 103 | """Constructor for the GzipFile class. 104 | 105 | At least one of fileobj and filename must be given a 106 | non-trivial value. 107 | 108 | The new class instance is based on fileobj, which can be a regular 109 | file, an io.BytesIO object, or any other object which simulates a file. 110 | It defaults to None, in which case filename is opened to provide 111 | a file object. 112 | 113 | When fileobj is not None, the filename argument is only used to be 114 | included in the gzip file header, which may include the original 115 | filename of the uncompressed file. It defaults to the filename of 116 | fileobj, if discernible; otherwise, it defaults to the empty string, 117 | and in this case the original filename is not included in the header. 118 | 119 | The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or 120 | 'xb' depending on whether the file will be read or written. The default 121 | is the mode of fileobj if discernible; otherwise, the default is 'rb'. 122 | A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and 123 | 'wb', 'a' and 'ab', and 'x' and 'xb'. 124 | 125 | The compresslevel argument is an integer from 0 to 9 controlling the 126 | level of compression; 1 is fastest and produces the least compression, 127 | and 9 is slowest and produces the most compression. 0 is no compression 128 | at all. The default is 9. 129 | 130 | The mtime argument is an optional numeric timestamp to be written 131 | to the last modification time field in the stream when compressing. 132 | If omitted or None, the current time is used. 133 | 134 | """ 135 | 136 | self.thread = thread 137 | self.read_blocks = None 138 | if mode and ('t' in mode or 'U' in mode): 139 | raise ValueError("Invalid mode: {!r}".format(mode)) 140 | if mode and 'b' not in mode: 141 | mode += 'b' 142 | if fileobj is None: 143 | fileobj = self.myfileobj = builtins.open(filename, mode or 'rb', blocksize) 144 | if filename is None: 145 | filename = getattr(fileobj, 'name', '') 146 | if not isinstance(filename, (str, bytes)): 147 | filename = '' 148 | if mode is None: 149 | mode = getattr(fileobj, 'mode', 'rb') 150 | 151 | if mode.startswith('r'): 152 | self.mode = READ 153 | if not self.thread: 154 | self.thread = os.cpu_count() // 2 # cores number 155 | self.raw = _MulitGzipReader(fileobj, thread=self.thread, max_block_size=blocksize) 156 | self._buffer = io.BufferedReader(self.raw, blocksize) 157 | self.name = filename 158 | self.index = [] 159 | 160 | elif mode.startswith(('w', 'a', 'x')): 161 | self.mode = WRITE 162 | if not self.thread: 163 | # thread is None or 0, use all available CPUs 164 | self.thread = os.cpu_count() 165 | self._init_write(filename) 166 | self.compress = zlib.compressobj(compresslevel, 167 | zlib.DEFLATED, 168 | -zlib.MAX_WBITS, 169 | zlib.DEF_MEM_LEVEL, 170 | 0) 171 | self._write_mtime = mtime 172 | self.compresslevel = compresslevel 173 | self.blocksize = blocksize # use 20M blocksize as default 174 | self.pool = Pool(self.thread) 175 | self.pool_result = [] 176 | self.small_buf = io.BytesIO() 177 | else: 178 | raise ValueError("Invalid mode: {!r}".format(mode)) 179 | 180 | self.fileobj = fileobj 181 | 182 | def __repr__(self): 183 | s = repr(self.fileobj) 184 | return '' 185 | 186 | def _write_gzip_header(self): 187 | ## ignored to write original header 188 | pass 189 | 190 | def _compress_func(self, data, pdata=None): 191 | """ 192 | Compress data with zlib deflate algorithm. 193 | Input: 194 | data: btyes object of input data 195 | pdata: exists small buffer data 196 | Return: 197 | tuple of (Buffered compressed data, 198 | Major compressed data, 199 | Rest data after flush buffer, 200 | CRC32, 201 | Original size) 202 | """ 203 | cpr = zlib.compressobj(self.compresslevel, 204 | zlib.DEFLATED, 205 | -zlib.MAX_WBITS, 206 | 9, # use memory level 9 > zlib.DEF_MEM_LEVEL (8) for better performance 207 | 0) 208 | if pdata: 209 | prefix_bytes = cpr.compress(pdata) 210 | body_bytes = cpr.compress(data) 211 | rest_bytes = cpr.flush() 212 | if pdata: 213 | crc = zlib.crc32(data, zlib.crc32(pdata)) 214 | return (prefix_bytes, body_bytes, rest_bytes, crc, pdata.nbytes + data.nbytes) 215 | else: 216 | crc = zlib.crc32(data) 217 | return (b'', body_bytes, rest_bytes, crc, data.nbytes) 218 | 219 | def write(self, data): 220 | self._check_not_closed() 221 | if self.mode != WRITE: 222 | import errno 223 | raise OSError(errno.EBADF, "write() on read-only GzipFile object") 224 | 225 | if self.fileobj is None: 226 | raise ValueError("write() on closed GzipFile object") 227 | 228 | data = memoryview(data) 229 | length = data.nbytes 230 | 231 | if length == 0: 232 | return length 233 | elif length >= self.blocksize: 234 | if length < 2 * self.blocksize: 235 | # use sigle thread 236 | self._compress_block_async(data) 237 | else: 238 | for st in range(0, length, self.blocksize): 239 | self._compress_block_async(data[st: st+self.blocksize]) 240 | self._flush_pool() 241 | elif length < self.blocksize: 242 | self.small_buf.write(data) 243 | if self.small_buf.tell() >= self.blocksize: 244 | self._compress_async(self.small_buf.getbuffer()) 245 | self.small_buf = io.BytesIO() 246 | self._flush_pool() 247 | return length 248 | 249 | def _compress_async(self, data, pdata=None): 250 | return self.pool_result.append(self.pool.apply_async(self._compress_func, args=(data, pdata))) 251 | 252 | def _compress_block_async(self, data): 253 | if self.small_buf.tell() != 0: 254 | self._compress_async(data, self.small_buf.getbuffer()) 255 | self.small_buf = io.BytesIO() 256 | else: 257 | self._compress_async(data) 258 | 259 | def _flush_pool(self, force=False): 260 | if len(self.pool_result) <= self.thread and not force: 261 | return 0 262 | length = 0 263 | if force: 264 | flushSize = len(self.pool_result) 265 | else: 266 | flushSize = len(self.pool_result) - self.thread 267 | for i in range(flushSize): 268 | cdata = self.pool_result.pop(0).get() 269 | length += self._write_member(cdata) 270 | # (bodyBytes, resBytes, crc, oriSize) = rlt.get() 271 | # compressRlt = rlt.get() 272 | return length 273 | 274 | def _write_member(self, cdata): 275 | """ 276 | Write a compressed data as a complete gzip member 277 | Input: 278 | cdata: 279 | compressed data, a tuple of compressed result returned by _compress_func() 280 | Return: 281 | size of member 282 | """ 283 | size = self._write_member_header(len(cdata[0]) + len(cdata[1]) + len(cdata[2]), cdata[4]) 284 | self.fileobj.write(cdata[0]) # buffer data 285 | self.fileobj.write(cdata[1]) # body data 286 | self.fileobj.write(cdata[2]) # rest data 287 | write32u(self.fileobj, cdata[3]) # CRC32 288 | write32u(self.fileobj, cdata[4] & 0xffffffff) # raw data size in 32bits 289 | return size 290 | 291 | def _write_member_header(self, compressed_size, raw_size): 292 | self.fileobj.write(b'\037\213') # magic header, 2 bytes 293 | self.fileobj.write(b'\010') # compression method, 1 byte 294 | try: 295 | # RFC 1952 requires the FNAME field to be Latin-1. Do not 296 | # include filenames that cannot be represented that way. 297 | fname = os.path.basename(self.name) 298 | if not isinstance(fname, bytes): 299 | fname = fname.encode('latin-1') 300 | if fname.endswith(b'.gz'): 301 | fname = fname[:-3] 302 | except UnicodeEncodeError: 303 | fname = b'' 304 | flags = FEXTRA 305 | if fname: 306 | flags |= FNAME 307 | self.fileobj.write(chr(flags).encode('latin-1')) # flags, 1 byte 308 | mtime = self._write_mtime 309 | if mtime is None: 310 | mtime = time.time() 311 | write32u(self.fileobj, int(mtime)) # modified time, 4 bytes 312 | self.fileobj.write(b'\002') # fixed flag (maximum compression), 1 byte 313 | self.fileobj.write(b'\377') # OS (unknown), 1 byte 314 | 315 | # write extra flag for indexing 316 | # XLEN, 8 bytes 317 | self.fileobj.write(b'\x08\x00') # extra flag len, 2 bytes 318 | # EXTRA FLAG FORMAT: 319 | # +---+---+---+---+---+---+---+---+ 320 | # |SI1|SI2| LEN | MEMBER SIZE | 321 | # +---+---+---+---+---+---+---+---+ 322 | # SI1, SI2: Subfield ID, 'IG' (Indexed Gzip file) 323 | # LEN: Length of subfield body, always 4 (bytes) 324 | # MEMBER SIZE: The size of current member 325 | self.fileobj.write(SID) # subfield ID (IG), 2 bytes 326 | # LEN: 4 bytes 327 | self.fileobj.write(b'\x04\x00') # subfield len (4), 2 bytes 328 | # compressed data size: 16 + 4 + len(fname) + 1 + data + 8 329 | # header + member size + filename with zero end + data block + CRC32 and ISIZE 330 | member_size = 20 + len(fname) + 1 + compressed_size + 8 331 | if not fname: 332 | member_size -= 1 333 | self.fileobj.write(struct.pack(" 4 GB, rsize is just the mod of 4G 495 | ## not a good idea to read all of them in memory 496 | body_bytes = dpr.decompress(data, rsize) 497 | crc = zlib.crc32(body_bytes) 498 | if dpr.unconsumed_tail != b"": 499 | body_bytes += dpr.unconsumed_tail 500 | crc = zlib.crc32(dpr.unconsumed_tail, crc) 501 | return (body_bytes, rsize, crc, rcrc) 502 | 503 | def _decompress_async(self, data, rcrc, rsize): 504 | self._read_pool.append(self._pool.apply_async(self._decompress_func, args=(data, rcrc, rsize))) 505 | 506 | def _read_gzip_header(self): 507 | magic = self._fp.read(2) 508 | if magic == b'': 509 | return False 510 | 511 | if magic != b'\037\213': 512 | raise OSError('Not a gzipped file (%r)' % magic) 513 | 514 | (method, flag, 515 | self._last_mtime) = struct.unpack("= self._block_buff_size: 601 | self._block_buff_pos = self._block_buff_size 602 | return self._block_buff[st_pos:self._block_buff_pos] 603 | elif self._read_pool: 604 | block_read_rlt = self._read_pool.pop(0).get() 605 | self.thread += 1 606 | # check decompressed data size 607 | if len(block_read_rlt[0]) != block_read_rlt[1]: 608 | raise OSError("Incorrect length of data produced") 609 | # check raw crc32 == decompressed crc32 610 | if block_read_rlt[2] != block_read_rlt[3]: 611 | raise OSError("CRC check failed {:s} != {:s}".format( 612 | block_read_rlt[3], block_read_rlt[2] 613 | )) 614 | self._block_buff = self._block_buff[self._block_buff_pos:] + block_read_rlt[0] 615 | self._block_buff_size = len(self._block_buff) 616 | self._block_buff_pos = min(size, self._block_buff_size) 617 | return self._block_buff[:size] # FIXME: fix issue when size > len(self._block_buff) 618 | elif self._block_buff_pos != self._block_buff_size: 619 | # still something in self._block_buff 620 | st_pos = self._block_buff_pos 621 | self._block_buff_pos = self._block_buff_size 622 | return self._block_buff[st_pos:] 623 | elif self._is_eof: 624 | return b"" 625 | 626 | # Read a chunk of data from the file 627 | buf = self._fp.read(io.DEFAULT_BUFFER_SIZE) 628 | 629 | uncompress = self._decompressor.decompress(buf, size) 630 | if self._decompressor.unconsumed_tail != b"": 631 | self._fp.prepend(self._decompressor.unconsumed_tail) 632 | elif self._decompressor.unused_data != b"": 633 | # Prepend the already read bytes to the fileobj so they can 634 | # be seen by _read_eof() and _read_gzip_header() 635 | self._fp.prepend(self._decompressor.unused_data) 636 | 637 | if uncompress != b"": 638 | break 639 | if buf == b"": 640 | raise EOFError("Compressed file ended before the " 641 | "end-of-stream marker was reached") 642 | 643 | self._add_read_data( uncompress ) 644 | self._pos += len(uncompress) 645 | return uncompress 646 | 647 | def _read_eof_crc(self): 648 | """ 649 | Get crc32 and isize without checking 650 | """ 651 | crc32, isize = struct.unpack("