├── CompressionBenchmark.png
├── DecompressionBenchmark.png
├── mgzip
    ├── __init__.py
    ├── __main__.py
    └── multiProcGzip.py
├── .vscode
    └── launch.json
├── LICENSE
├── setup.py
├── .gitignore
├── README.md
├── test.py
└── gzipFormat.txt


/CompressionBenchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vinlyx/mgzip/HEAD/CompressionBenchmark.png


--------------------------------------------------------------------------------
/DecompressionBenchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vinlyx/mgzip/HEAD/DecompressionBenchmark.png


--------------------------------------------------------------------------------
/mgzip/__init__.py:
--------------------------------------------------------------------------------
 1 | """This module provides a simple replacement for Python's built-in gzip module,
 2 | offering a multi-threaded solution for gzip compression/decompression.
 3 | 
 4 | License: MIT LICENSE
 5 | Copyright (c) 2019 Vincent Li
 6 | 
 7 | """
 8 | 
 9 | from .multiProcGzip import MultiGzipFile, open, compress, decompress, __version__
10 | 
11 | __all__ = ["GzipFile", "open", "compress", "decompress"]  # public API, using the same names as the standard gzip module
12 | 
13 | GzipFile = MultiGzipFile  # expose MultiGzipFile under the name listed in __all__


--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     // Use IntelliSense to learn about possible attributes.
 3 |     // Hover to view descriptions of existing attributes.
 4 |     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
 5 |     "version": "0.2.0",
 6 |     "configurations": [
 7 |         {
 8 |             "name": "Multithread compression",
 9 |             "type": "python",
10 |             "request": "launch",
11 |             "program": "${file}",
12 |             "console": "integratedTerminal",
13 |             "args": ["zipTest.log"]
14 |         }
15 |     ]
16 | }


--------------------------------------------------------------------------------
/mgzip/__main__.py:
--------------------------------------------------------------------------------
 1 | """This module provides a simple replacement for Python's built-in gzip module,
 2 | offering a multi-threaded solution for gzip compression/decompression.
 3 | 
 4 | License: MIT LICENSE
 5 | Copyright (c) 2019 Vincent Li
 6 | 
 7 | """
 8 | 
 9 | import mgzip
10 | 
11 | def main(argv):
12 | 	decompress = False
13 | 	if argv and argv[0]=='-d':
14 | 		decompress = True
15 | 		argv=argv[1:]
16 | 	if decompress:
17 | 		f=mgzip.GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer, thread=4, blocksize=10**6)
18 | 		g=sys.stdout.buffer
19 | 	else:
20 | 		f=sys.stdin.buffer
21 | 		g=mgzip.GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer, thread=4, blocksize=10**6)
22 | 	while True:
23 | 		chunk = f.read(1024)
24 | 		if not chunk:
25 | 			break
26 | 		g.write(chunk)
27 | 	if g is not sys.stdout:
28 | 		g.close()
29 | 	if f is not sys.stdin:
30 | 		f.close()
31 | 
32 | if __name__=='__main__':
33 | 	import sys
34 | 	main(sys.argv[1:])
35 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Vincent Li
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | from mgzip import __version__
 3 | 
 4 | with open('README.md') as fh:
 5 |     longDesc = fh.read().replace("CompressionBenchmark.png", "https://raw.githubusercontent.com/vinlyx/mgzip/master/CompressionBenchmark.png").replace("DecompressionBenchmark.png", "https://raw.githubusercontent.com/vinlyx/mgzip/master/DecompressionBenchmark.png")
 6 | 
 7 | setup(
 8 |     name='mgzip',
 9 |     version=__version__,
10 |     author='Vincent Li',
11 |     author_email='vincentliyx@gmail.com',
12 | 
13 |     description='A multi-threading implementation of Python gzip module',
14 |     long_description=longDesc,
15 |     long_description_content_type="text/markdown",
16 |     url='https://github.com/vinlyx/mgzip',
17 |     license='MIT',
18 |     packages=find_packages(),
19 |     classifiers=[
20 |         'Development Status :: 4 - Beta',
21 |         'License :: OSI Approved :: MIT License',
22 |         'Operating System :: OS Independent',
23 |         'Programming Language :: Python :: 3',
24 |         'Programming Language :: Python :: Implementation :: CPython',
25 |         'Topic :: Software Development :: Libraries :: Python Modules',
26 |         'Intended Audience :: Developers',
27 |     ],
28 |     python_requires=">=3.6"
29 | )
30 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # gz test file
  7 | *.gz
  8 | zipTest*
  9 | 
 10 | # C extensions
 11 | *.so
 12 | 
 13 | # Distribution / packaging
 14 | .Python
 15 | build/
 16 | develop-eggs/
 17 | dist/
 18 | downloads/
 19 | eggs/
 20 | .eggs/
 21 | lib/
 22 | lib64/
 23 | parts/
 24 | sdist/
 25 | var/
 26 | wheels/
 27 | *.egg-info/
 28 | .installed.cfg
 29 | *.egg
 30 | MANIFEST
 31 | 
 32 | # PyInstaller
 33 | #  Usually these files are written by a python script from a template
 34 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 35 | *.manifest
 36 | *.spec
 37 | 
 38 | # Installer logs
 39 | pip-log.txt
 40 | pip-delete-this-directory.txt
 41 | 
 42 | # Unit test / coverage reports
 43 | htmlcov/
 44 | .tox/
 45 | .coverage
 46 | .coverage.*
 47 | .cache
 48 | nosetests.xml
 49 | coverage.xml
 50 | *.cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | 
 63 | # Flask stuff:
 64 | instance/
 65 | .webassets-cache
 66 | 
 67 | # Scrapy stuff:
 68 | .scrapy
 69 | 
 70 | # Sphinx documentation
 71 | docs/_build/
 72 | 
 73 | # PyBuilder
 74 | target/
 75 | 
 76 | # Jupyter Notebook
 77 | .ipynb_checkpoints
 78 | 
 79 | # pyenv
 80 | .python-version
 81 | 
 82 | # celery beat schedule file
 83 | celerybeat-schedule
 84 | 
 85 | # SageMath parsed files
 86 | *.sage.py
 87 | 
 88 | # Environments
 89 | .env
 90 | .venv
 91 | env/
 92 | venv/
 93 | ENV/
 94 | env.bak/
 95 | venv.bak/
 96 | 
 97 | # Spyder project settings
 98 | .spyderproject
 99 | .spyproject
100 | 
101 | # Rope project settings
102 | .ropeproject
103 | 
104 | # mkdocs documentation
105 | /site
106 | 
107 | # mypy
108 | .mypy_cache/
109 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # mgzip
 2 | A multi-threaded implementation of Python's gzip module
 3 | 
 4 | Uses a block-indexed GZIP file format to enable parallel compression and decompression. This implementation uses the 'FEXTRA' field to record the index of compressed members, which is defined in the official GZIP file format specification version 4.3, so the output is fully compatible with standard GZIP implementations.
 5 | 
 6 | This module is **~25X** faster for compression and **~7X** faster for decompression (limited by IO and the Python implementation) on a computer with *24 CPUs*.
 7 | 
 8 | ***In theory, the compression and decompression speedup should scale linearly with the number of CPU cores. In practice, performance is limited by IO and the programming language implementation.***
 9 | 
10 | ## Usage
11 | Use the same methods as the gzip module
12 | ```python
13 | import mgzip
14 | 
15 | s = "a big string..."
16 | 
17 | ## Use 8 threads to compress.
18 | ## None or 0 means using all CPUs (default)
19 | ## Compression block size is set to 200MB
20 | with mgzip.open("test.txt.gz", "wt", thread=8, blocksize=2*10**8) as fw:
21 |     fw.write(s)
22 | 
23 | with mgzip.open("test.txt.gz", "rt", thread=8) as fr:
24 |     assert fr.read(len(s)) == s
25 | ```
26 | 
27 | ## Performance
28 | ### Compression:
29 | ![Compression Performance](CompressionBenchmark.png)
30 | 
31 | ### Decompression:
32 | ![Decompression Performance](DecompressionBenchmark.png)
33 | 
34 | *Benchmarked on a 24-core, 48-thread server (Xeon(R) CPU E5-2650 v4 @ 2.20GHz) with an 8.0GB FASTQ text file.*
35 | 
36 | *Using parameters thread=42 and blocksize=200000000*
37 | 
38 | ## Warning
39 | **This package only replaces the 'GzipFile' class and the 'open', 'compress' and 'decompress' functions of the standard gzip module. It is not well tested with other classes and functions.**
40 | 
41 | **As this is the first release version, some features are not yet supported, such as seek() and tell(). Any contribution or improvement is appreciated.**
42 | 


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
 1 | import mgzip
 2 | # import gzip as mgzip
 3 | import time
 4 | 
 5 | def _test():
 6 |     import sys
 7 |     import os
 8 |     # Act like gzip; with -d, act like gunzip.
 9 |     # The input file is not deleted, however, nor are any other gzip
10 |     # options or features supported.
11 |     args = sys.argv[1:]
12 |     decompress = args and args[0] == "-d"
13 |     if decompress:
14 |         arg = args[1]
15 |     else:
16 |         arg = args[0]
17 |     # if not args:
18 |     #     args = ["-"]
19 |     if decompress:
20 |         tsize = 0
21 |         if arg != "-":
22 |             # outf = arg + ".dcp"
23 |             outf = "/dev/null"
24 |             fh = open(outf, "wb")
25 |             gh = mgzip.open(arg, "rb")
26 |             t0 = time.time()
27 |             # gh.show_index()
28 |             # data = b"AAA"
29 |             chunk_size = 10**7
30 |             while True:
31 |                 data = gh.read(chunk_size)
32 |                 # data = gh.readline()
33 |                 if not data:
34 |                     break
35 |                 fh.write(data)
36 |                 tsize += len(data)
37 |             # data = gh.readline()
38 |             t1 = time.time()
39 |             fh.close()
40 |             gh.close()
41 |             size = tsize/(1024**2)
42 |             seconds = t1 - t0
43 |             speed = size/seconds
44 |             nsize = os.stat(arg).st_size
45 |             print("Decompressed {:.2f} MB data in {:.2f} S, Speed: {:.2f} MB/s, Rate: {:.2f} %".format(size, seconds, speed, nsize/tsize*100))
46 |     else:
47 |         if arg != "-":
48 |             outf = arg + ".gz"
49 |             fh = open(arg, "rb")
50 |             gh = mgzip.open(outf, "wb", compresslevel=6)
51 |             data = fh.read()
52 |             t0 = time.time()
53 |             gh.write(data)
54 |             gh.close()
55 |             t1 = time.time()
56 |             size = len(data)/(1024**2)
57 |             seconds = t1 - t0
58 |             speed = size/seconds
59 |             nsize = os.stat(outf).st_size
60 |             print("Compressed {:.2f} MB data in {:.2f} S, Speed: {:.2f} MB/s, Rate: {:.2f} %".format(size, seconds, speed, nsize/len(data)*100))
61 | 
62 | if __name__ == '__main__':
63 |     _test()


--------------------------------------------------------------------------------
/gzipFormat.txt:
--------------------------------------------------------------------------------
  1 | GZIP file format specification version 4.3
  2 | Status of This Memo
  3 | This memo provides information for the Internet community. This memo does not specify an Internet standard of any kind. Distribution of this memo is unlimited.
  4 | IESG Note:
  5 | The IESG takes no position on the validity of any Intellectual Property Rights statements contained in this document.
  6 | Notices
  7 | Copyright © 1996 L. Peter Deutsch
  8 | Permission is granted to copy and distribute this document for any purpose and without charge, including translations into other languages and incorporation into compilations, provided that the copyright notice and this notice are preserved, and that any substantive changes or deletions from the original are clearly marked.
  9 | 
 10 | A pointer to the latest version of this and related documentation in HTML format can be found at the URL <ftp://ftp.uu.net/graphics/png/documents/zlib/zdoc-index.html>.
 11 | 
 12 | Abstract
 13 | This specification defines a lossless compressed data format that is compatible with the widely used GZIP utility. The format includes a cyclic redundancy check value for detecting data corruption. The format presently uses the DEFLATE method of compression but can be easily extended to use other compression methods. The format can be implemented readily in a manner not covered by patents.
 14 | Table of Contents
 15 | 1. Introduction
 16 | 1.1. Purpose
 17 | 1.2. Intended audience
 18 | 1.3. Scope
 19 | 1.4. Compliance
 20 | 1.5. Definitions of terms and conventions used
 21 | 1.6. Changes from previous versions
 22 | 2. Detailed specification
 23 | 2.1. Overall conventions
 24 | 2.2. File format
 25 | 2.3. Member format
 26 | 2.3.1. Member header and trailer
 27 | 2.3.1.1. Extra field
 28 | 2.3.1.2. Compliance
 29 | 3. References
 30 | 4. Security Considerations
 31 | 5. Acknowledgements
 32 | 6. Author's Address
 33 | 7. Appendix: Jean-Loup Gailly's gzip utility
 34 | 8. Appendix: Sample CRC Code
 35 | 1. Introduction
 36 | Purpose
 37 | The purpose of this specification is to define a lossless compressed data format that:
 38 | 
 39 | Is independent of CPU type, operating system, file system, and character set, and hence can be used for interchange;
 40 | Can compress or decompress a data stream (as opposed to a randomly accessible file) to produce another data stream, using only an a priori bounded amount of intermediate storage, and hence can be used in data communications or similar structures such as Unix filters;
 41 | Compresses data with efficiency comparable to the best currently available general-purpose compression methods, and in particular considerably better than the "compress" program;
 42 | Can be implemented readily in a manner not covered by patents, and hence can be practiced freely;
 43 | Is compatible with the file format produced by the current widely used gzip utility, in that conforming decompressors will be able to read data produced by the existing gzip compressor.
 44 | The data format defined by this specification does not attempt to:
 45 | Provide random access to compressed data;
 46 | Compress specialized data (e.g., raster graphics) as well as the best currently available specialized algorithms.
 47 | Intended audience
 48 | This specification is intended for use by implementors of software to compress data into gzip format and/or decompress data from gzip format.
 49 | 
 50 | The text of the specification assumes a basic background in programming at the level of bits and other primitive data representations.
 51 | 
 52 | Scope
 53 | The specification specifies a compression method and a file format (the latter assuming only that a file can store a sequence of arbitrary bytes). It does not specify any particular interface to a file system or anything about character sets or encodings (except for file names and comments, which are optional).
 54 | 
 55 | Compliance
 56 | Unless otherwise indicated below, a compliant decompressor must be able to accept and decompress any file that conforms to all the specifications presented here; a compliant compressor must produce files that conform to all the specifications presented here. The material in the appendices is not part of the specification per se and is not relevant to compliance.
 57 | 
 58 | Definitions of terms and conventions used
 59 | byte: 8 bits stored or transmitted as a unit (same as an octet). (For this specification, a byte is exactly 8 bits, even on machines which store a character on a number of bits different from 8.) See below for the numbering of bits within a byte.
 60 | 
 61 | 1.6. Changes from previous versions
 62 | There have been no technical changes to the gzip format since version 4.1 of this specification. In version 4.2, some terminology was changed, and the sample CRC code was rewritten for clarity and to eliminate the requirement for the caller to do pre- and post-conditioning. Version 4.3 is a conversion of the specification to RFC style.
 63 | 
 64 | 2. Detailed specification
 65 | Overall conventions
 66 | In the diagrams below, a box like this:
 67 | +---+
 68 | |   | <-- the vertical bars might be missing
 69 | +---+
 70 | represents one byte; a box like this:
 71 | +==============+
 72 | |              |
 73 | +==============+
 74 | represents a variable number of bytes.
 75 | Bytes stored within a computer do not have a "bit order", since they are always treated as a unit. However, a byte considered as an integer between 0 and 255 does have a most- and least-significant bit, and since we write numbers with the most-significant digit on the left, we also write bytes with the most-significant bit on the left. In the diagrams below, we number the bits of a byte so that bit 0 is the least-significant bit, i.e., the bits are numbered:
 76 | 
 77 | +--------+
 78 | |76543210|
 79 | +--------+
 80 | This document does not address the issue of the order in which bits of a byte are transmitted on a bit-sequential medium, since the data format described here is byte- rather than bit-oriented.
 81 | 
 82 | Within a computer, a number may occupy multiple bytes. All multi-byte numbers in the format described here are stored with the least-significant byte first (at the lower memory address). For example, the decimal number 520 is stored as:
 83 | 
 84 |     0        1
 85 | +--------+--------+
 86 | |00001000|00000010|
 87 | +--------+--------+
 88 |  ^        ^
 89 |  |        |
 90 |  |        + more significant byte = 2 x 256
 91 |  + less significant byte = 8
 92 | File format
 93 | A gzip file consists of a series of "members" (compressed data sets). The format of each member is specified in the following section. The members simply appear one after another in the file, with no additional information before, between, or after them.
 94 | 
 95 | Member format
 96 | Each member has the following structure:
 97 | +---+---+---+---+---+---+---+---+---+---+
 98 | |ID1|ID2|CM |FLG|     MTIME     |XFL|OS | (more-->)
 99 | +---+---+---+---+---+---+---+---+---+---+
100 | (if FLG.FEXTRA set)
101 | +---+---+=================================+
102 | | XLEN  |...XLEN bytes of "extra field"...| (more-->)
103 | +---+---+=================================+
104 | (if FLG.FNAME set)
105 | +=========================================+
106 | |...original file name, zero-terminated...| (more-->)
107 | +=========================================+
108 | (if FLG.FCOMMENT set)
109 | +===================================+
110 | |...file comment, zero-terminated...| (more-->)
111 | +===================================+
112 | (if FLG.FHCRC set)
113 | +---+---+
114 | | CRC16 |
115 | +---+---+
116 | +=======================+
117 | |...compressed blocks...| (more-->)
118 | +=======================+
119 |   0   1   2   3   4   5   6   7
120 | +---+---+---+---+---+---+---+---+
121 | |     CRC32     |     ISIZE     |
122 | +---+---+---+---+---+---+---+---+
123 | Member header and trailer
124 | ID1 (IDentification 1)
125 | ID2 (IDentification 2)
126 | These have the fixed values ID1 = 31 (0x1f, \037), ID2 = 139 (0x8b, \213), to identify the file as being in gzip format.
127 | CM (Compression Method)
128 | This identifies the compression method used in the file. CM = 0-7 are reserved. CM = 8 denotes the "deflate" compression method, which is the one customarily used by gzip and which is documented elsewhere.
129 | FLG (FLaGs)
130 | This flag byte is divided into individual bits as follows:
131 | bit 0   FTEXT
132 | bit 1   FHCRC
133 | bit 2   FEXTRA
134 | bit 3   FNAME
135 | bit 4   FCOMMENT
136 | bit 5   reserved
137 | bit 6   reserved
138 | bit 7   reserved
139 | If FTEXT is set, the file is probably ASCII text. This is an optional indication, which the compressor may set by checking a small amount of the input data to see whether any non-ASCII characters are present. In case of doubt, FTEXT is cleared, indicating binary data. For systems which have different file formats for ascii text and binary data, the decompressor can use FTEXT to choose the appropriate format. We deliberately do not specify the algorithm used to set this bit, since a compressor always has the option of leaving it cleared and a decompressor always has the option of ignoring it and letting some other program handle issues of data conversion.
140 | 
141 | If FHCRC is set, a CRC16 for the gzip header is present, immediately before the compressed data. The CRC16 consists of the two least significant bytes of the CRC32 for all bytes of the gzip header up to and not including the CRC16. [The FHCRC bit was never set by versions of gzip up to 1.2.4, even though it was documented with a different meaning in gzip 1.2.4.]
142 | 
143 | If FEXTRA is set, optional extra fields are present, as described in a following section.
144 | 
145 | If FNAME is set, an original file name is present, terminated by a zero byte. The name must consist of ISO 8859-1 (LATIN-1) characters; on operating systems using EBCDIC or any other character set for file names, the name must be translated to the ISO LATIN-1 character set. This is the original name of the file being compressed, with any directory components removed, and, if the file being compressed is on a file system with case insensitive names, forced to lower case. There is no original file name if the data was compressed from a source other than a named file; for example, if the source was stdin on a Unix system, there is no file name.
146 | 
147 | If FCOMMENT is set, a zero-terminated file comment is present. This comment is not interpreted; it is only intended for human consumption. The comment must consist of ISO 8859-1 (LATIN-1) characters. Line breaks should be denoted by a single line feed character (10 decimal).
148 | 
149 | Reserved FLG bits must be zero.
150 | 
151 | MTIME (Modification TIME)
152 | This gives the most recent modification time of the original file being compressed. The time is in Unix format, i.e., seconds since 00:00:00 GMT, Jan. 1, 1970. (Note that this may cause problems for MS-DOS and other systems that use local rather than Universal time.) If the compressed data did not come from a file, MTIME is set to the time at which compression started. MTIME = 0 means no time stamp is available.
153 | XFL (eXtra FLags)
154 | These flags are available for use by specific compression methods. The "deflate" method (CM = 8) sets these flags as follows:
155 | XFL = 2 - compressor used maximum compression,
156 |           slowest algorithm
157 | XFL = 4 - compressor used fastest algorithm
158 | OS (Operating System)
159 | This identifies the type of file system on which compression took place. This may be useful in determining end-of-line convention for text files. The currently defined values are as follows:
160 |   0 - FAT filesystem (MS-DOS, OS/2, NT/Win32)
161 |   1 - Amiga
162 |   2 - VMS (or OpenVMS)
163 |   3 - Unix
164 |   4 - VM/CMS
165 |   5 - Atari TOS
166 |   6 - HPFS filesystem (OS/2, NT)
167 |   7 - Macintosh
168 |   8 - Z-System
169 |   9 - CP/M
170 |  10 - TOPS-20
171 |  11 - NTFS filesystem (NT)
172 |  12 - QDOS
173 |  13 - Acorn RISCOS
174 | 255 - unknown
175 | XLEN (eXtra LENgth)
176 | If FLG.FEXTRA is set, this gives the length of the optional extra field. See below for details.
177 | CRC32 (CRC-32)
178 | This contains a Cyclic Redundancy Check value of the uncompressed data computed according to CRC-32 algorithm used in the ISO 3309 standard and in section 8.1.1.6.2 of ITU-T recommendation V.42. (See http://www.iso.ch for ordering ISO documents. See gopher://info.itu.ch for an online version of ITU-T V.42.)
179 | ISIZE (Input SIZE)
180 | This contains the size of the original (uncompressed) input data modulo 2^32.
181 | Extra field
182 | If the FLG.FEXTRA bit is set, an "extra field" is present in the header, with total length XLEN bytes. It consists of a series of subfields, each of the form:
183 | 
184 | +---+---+---+---+==================================+
185 | |SI1|SI2|  LEN  |... LEN bytes of subfield data ...|
186 | +---+---+---+---+==================================+
187 | SI1 and SI2 provide a subfield ID, typically two ASCII letters with some mnemonic value. Jean-Loup Gailly <jloup@gzip.org> is maintaining a registry of subfield IDs; please send him any subfield ID you wish to use. Subfield IDs with SI2 = 0 are reserved for future use. The following IDs are currently defined:
188 | SI1         SI2         Data
189 | ----------  ----------  ----
190 | 0x41 ('A')  0x70 ('P')  Apollo file type information
191 | LEN gives the length of the subfield data, excluding the 4 initial bytes.
192 | Compliance
193 | A compliant compressor must produce files with correct ID1, ID2, CM, CRC32, and ISIZE, but may set all the other fields in the fixed-length part of the header to default values (255 for OS, 0 for all others). The compressor must set all reserved bits to zero.
194 | 
195 | A compliant decompressor must check ID1, ID2, and CM, and provide an error indication if any of these have incorrect values. It must examine FEXTRA/XLEN, FNAME, FCOMMENT and FHCRC at least so it can skip over the optional fields if they are present. It need not examine any other part of the header or trailer; in particular, a decompressor may ignore FTEXT and OS and always produce binary output, and still be compliant. A compliant decompressor must give an error indication if any reserved bit is non-zero, since such a bit could indicate the presence of a new field that would cause subsequent data to be interpreted incorrectly.
196 | 
197 | 3. References
198 | [1] "Information Processing - 8-bit single-byte coded graphic character sets - Part 1: Latin alphabet No.1" (ISO 8859-1:1987). The ISO 8859-1 (Latin-1) character set is a superset of 7-bit ASCII. Files defining this character set are available as iso_8859-1.* in ftp://ftp.uu.net/graphics/png/documents/
199 | 
200 | [2] ISO 3309
201 | 
202 | [3] ITU-T recommendation V.42
203 | 
204 | [4] Deutsch, L.P.,"DEFLATE Compressed Data Format Specification", available in ftp://ftp.uu.net/pub/archiving/zip/doc/
205 | 
206 | [5] Gailly, J.-L., GZIP documentation, available as gzip-*.tar in ftp://prep.ai.mit.edu/pub/gnu/
207 | 
208 | [6] Sarwate, D.V., "Computation of Cyclic Redundancy Checks via Table Look-Up", Communications of the ACM, 31(8), pp.1008-1013.
209 | 
210 | [7] Schwaderer, W.D., "CRC Calculation", April 85 PC Tech Journal, pp.118-133.
211 | 
212 | [8] ftp://ftp.rocksoft.com/papers/crc_v3.txt, describing the CRC concept.
213 | 
214 | 4. Security Considerations
215 | Any data compression method involves the reduction of redundancy in the data. Consequently, any corruption of the data is likely to have severe effects and be difficult to correct. Uncompressed text, on the other hand, will probably still be readable despite the presence of some corrupted bytes. It is recommended that systems using this data format provide some means of validating the integrity of the compressed data, such as by setting and checking the CRC-32 check value.
216 | 5. Acknowledgements
217 | Trademarks cited in this document are the property of their respective owners.
218 | Jean-Loup Gailly designed the gzip format and wrote, with Mark Adler, the related software described in this specification. Glenn Randers-Pehrson converted this document to RFC and HTML format.
219 | 
220 | 6. Author's Address
221 | L. Peter Deutsch
222 | Aladdin Enterprises
223 | 203 Santa Margarita Ave.
224 | Menlo Park, CA 94025
225 | 
226 | Phone: (415) 322-0103 (AM only)
227 | FAX:   (415) 322-1734
228 | EMail: <ghost@aladdin.com>
229 | Questions about the technical content of this specification can be sent by email to:
230 | Jean-Loup Gailly <jloup@gzip.org> and
231 | Mark Adler <madler@alumni.caltech.edu>
232 | Editorial comments on this specification can be sent by email to:
233 | L. Peter Deutsch <ghost@aladdin.com> and
234 | Glenn Randers-Pehrson <randeg@alumni.rpi.edu>
235 | 7. Appendix: Jean-Loup Gailly's gzip utility
236 | The most widely used implementation of gzip compression, and the original documentation on which this specification is based, were created by Jean-Loup Gailly <jloup@gzip.org>. Since this implementation is a de facto standard, we mention some more of its features here. Again, the material in this section is not part of the specification per se, and implementations need not follow it to be compliant.
237 | When compressing or decompressing a file, gzip preserves the protection, ownership, and modification time attributes on the local file system, since there is no provision for representing protection attributes in the gzip file format itself. Since the file format includes a modification time, the gzip decompressor provides a command line switch that assigns the modification time from the file, rather than the local modification time of the compressed input, to the decompressed output.
238 | 
239 | 8. Appendix: Sample CRC Code
240 | The following sample code represents a practical implementation of the CRC (Cyclic Redundancy Check). (See also ISO 3309 and ITU-T V.42 for a formal specification.)
241 | The sample code is in the ANSI C programming language. Non C users may find it easier to read with these hints:
242 | 
243 | &      Bitwise AND operator.
244 | ^      Bitwise exclusive-OR operator.
245 | >>     Bitwise right shift operator. When applied to an
246 |        unsigned quantity, as here, right shift inserts zero
247 |        bit(s) at the left.
248 | !      Logical NOT operator.
249 | ++     "n++" increments the variable n.
250 | 0xNNN  0x introduces a hexadecimal (base 16) constant.
251 |        Suffix L indicates a long value (at least 32 bits).
252 | 
253 | /* Table of CRCs of all 8-bit messages. */
254 | unsigned long crc_table[256];
255 | 
256 | /* Flag: has the table been computed? Initially false. */
257 | int crc_table_computed = 0;
258 | 
259 | /* Make the table for a fast CRC. */
260 | void make_crc_table(void)
261 | {
262 |   unsigned long c;
263 |   int n, k;
264 | 
265 |   for (n = 0; n < 256; n++) {
266 |     c = (unsigned long) n;
267 |     for (k = 0; k < 8; k++) {
268 |       if (c & 1) {
269 |         c = 0xedb88320L ^ (c >> 1);
270 |       } else {
271 |         c = c >> 1;
272 |       }
273 |     }
274 |     crc_table[n] = c;
275 |   }
276 |   crc_table_computed = 1;
277 | }
278 | 
279 | /*
280 |    Update a running crc with the bytes buf[0..len-1] and return
281 |  the updated crc. The crc should be initialized to zero. Pre- and
282 |  post-conditioning (one's complement) is performed within this
283 |  function so it shouldn't be done by the caller. Usage example:
284 | 
285 |    unsigned long crc = 0L;
286 | 
287 |    while (read_buffer(buffer, length) != EOF) {
288 |      crc = update_crc(crc, buffer, length);
289 |    }
290 |    if (crc != original_crc) error();
291 | */
292 | unsigned long update_crc(unsigned long crc,
293 |                 unsigned char *buf, int len)
294 | {
295 |   unsigned long c = crc ^ 0xffffffffL;
296 |   int n;
297 | 
298 |   if (!crc_table_computed)
299 |     make_crc_table();
300 |   for (n = 0; n < len; n++) {
301 |     c = crc_table[(c ^ buf[n]) & 0xff] ^ (c >> 8);
302 |   }
303 |   return c ^ 0xffffffffL;
304 | }
305 | 
/* One-shot helper: return the CRC of the bytes buf[0..len-1], i.e.
   update_crc() started from a zero crc. */
unsigned long crc(unsigned char *buf, int len)
{
  unsigned long start = 0L;

  return update_crc(start, buf, len);
}


--------------------------------------------------------------------------------
/mgzip/multiProcGzip.py:
--------------------------------------------------------------------------------
  1 | """This module provide a simple replacement of Python internal gzip module
  2 | to provide a multiprocessing solution for gzip compression/decompression.
  3 | 
  4 | License: MIT LICENSE
  5 | Copyright (c) 2019 Vincent Li
  6 | 
  7 | """
  8 | 
  9 | import os, time
 10 | import builtins
 11 | import struct
 12 | import zlib
 13 | import io
 14 | from gzip import GzipFile, write32u, _GzipReader, _PaddedFile, READ, WRITE, FEXTRA, FNAME, FCOMMENT, FHCRC
 15 | from multiprocessing.dummy import Pool
 16 | 
# Package version string; mgzip/__init__.py imports this name from here.
__version__ = "0.2.1"

# Two-byte tag of the FEXTRA subfield in which each member's total size is
# stored when writing, and which is looked for again when reading.
SID = b'IG' # Subfield ID of indexed gzip file
 20 | 
 21 | def open(filename, mode="rb", compresslevel=9,
 22 |          encoding=None, errors=None, newline=None,
 23 |          thread=None, blocksize=10**8):
 24 |     """Open a gzip-compressed file in binary or text mode.
 25 | 
 26 |     The filename argument can be an actual filename (a str or bytes object), or
 27 |     an existing file object to read from or write to.
 28 | 
 29 |     The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for
 30 |     binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is
 31 |     "rb", and the default compresslevel is 9.
 32 | 
 33 |     For binary mode, this function is equivalent to the GzipFile constructor:
 34 |     GzipFile(filename, mode, compresslevel). In this case, the encoding, errors
 35 |     and newline arguments must not be provided.
 36 | 
 37 |     For text mode, a GzipFile object is created, and wrapped in an
 38 |     io.TextIOWrapper instance with the specified encoding, error handling
 39 |     behavior, and line ending(s).
 40 | 
 41 |     """
 42 |     if "t" in mode:
 43 |         if "b" in mode:
 44 |             raise ValueError("Invalid mode: %r" % (mode,))
 45 |     else:
 46 |         if encoding is not None:
 47 |             raise ValueError("Argument 'encoding' not supported in binary mode")
 48 |         if errors is not None:
 49 |             raise ValueError("Argument 'errors' not supported in binary mode")
 50 |         if newline is not None:
 51 |             raise ValueError("Argument 'newline' not supported in binary mode")
 52 | 
 53 |     gz_mode = mode.replace("t", "")
 54 |     if isinstance(filename, (str, bytes)):
 55 |         binary_file = MultiGzipFile(filename, gz_mode, compresslevel, thread=thread, blocksize=blocksize)
 56 |     elif hasattr(filename, "read") or hasattr(filename, "write"):
 57 |         binary_file = MultiGzipFile(None, gz_mode, compresslevel, filename, thread=thread, blocksize=blocksize)
 58 |     else:
 59 |         raise TypeError("filename must be a str or bytes object, or a file")
 60 | 
 61 |     if "t" in mode:
 62 |         return io.TextIOWrapper(binary_file, encoding, errors, newline)
 63 |     else:
 64 |         return binary_file
 65 | 
 66 | def compress(data, compresslevel=9, thread=None, blocksize=10**8):
 67 |     """Compress data in one shot and return the compressed string.
 68 |     Optional argument is the compression level, in range of 0-9.
 69 |     """
 70 |     buf = io.BytesIO()
 71 |     with MultiGzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel,
 72 |                        thread=thread, blocksize=blocksize) as f:
 73 |         f.write(data)
 74 |     return buf.getvalue()
 75 | 
 76 | def decompress(data, thread=None, blocksize=10**8):
 77 |     """Decompress a gzip compressed string in one shot.
 78 |     Return the decompressed string.
 79 |     """
 80 |     with MultiGzipFile(fileobj=io.BytesIO(data), thread=thread,
 81 |                        blocksize=blocksize) as f:
 82 |         return f.read()
 83 | 
 84 | def padded_file_seek(self, off, whence=0):
 85 |     """
 86 |         Provide a whence of seek method in gzip
 87 |         to allow seek to the end of file.
 88 |         * FIXME: This method may have some problem
 89 |          is stream mode since it is unable to seek
 90 |          to the end of stream object.
 91 |     """
 92 |     self._read = None
 93 |     self._buffer = None
 94 |     return self.file.seek(off, whence)
 95 | _PaddedFile.seek = padded_file_seek # override the seek method to provide whence parameter
 96 | 
 97 | class MultiGzipFile(GzipFile):
 98 |     """ docstring of MultiGzipFile """
 99 | 
100 |     def __init__(self, filename=None, mode=None,
101 |                  compresslevel=9, fileobj=None, mtime=None,
102 |                  thread=None, blocksize=10**8):
103 |         """Constructor for the GzipFile class.
104 | 
105 |         At least one of fileobj and filename must be given a
106 |         non-trivial value.
107 | 
108 |         The new class instance is based on fileobj, which can be a regular
109 |         file, an io.BytesIO object, or any other object which simulates a file.
110 |         It defaults to None, in which case filename is opened to provide
111 |         a file object.
112 | 
113 |         When fileobj is not None, the filename argument is only used to be
114 |         included in the gzip file header, which may include the original
115 |         filename of the uncompressed file.  It defaults to the filename of
116 |         fileobj, if discernible; otherwise, it defaults to the empty string,
117 |         and in this case the original filename is not included in the header.
118 | 
119 |         The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
120 |         'xb' depending on whether the file will be read or written.  The default
121 |         is the mode of fileobj if discernible; otherwise, the default is 'rb'.
122 |         A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
123 |         'wb', 'a' and 'ab', and 'x' and 'xb'.
124 | 
125 |         The compresslevel argument is an integer from 0 to 9 controlling the
126 |         level of compression; 1 is fastest and produces the least compression,
127 |         and 9 is slowest and produces the most compression. 0 is no compression
128 |         at all. The default is 9.
129 | 
130 |         The mtime argument is an optional numeric timestamp to be written
131 |         to the last modification time field in the stream when compressing.
132 |         If omitted or None, the current time is used.
133 | 
134 |         """
135 | 
136 |         self.thread = thread
137 |         self.read_blocks = None
138 |         if mode and ('t' in mode or 'U' in mode):
139 |             raise ValueError("Invalid mode: {!r}".format(mode))
140 |         if mode and 'b' not in mode:
141 |             mode += 'b'
142 |         if fileobj is None:
143 |             fileobj = self.myfileobj = builtins.open(filename, mode or 'rb', blocksize)
144 |         if filename is None:
145 |             filename = getattr(fileobj, 'name', '')
146 |             if not isinstance(filename, (str, bytes)):
147 |                 filename = ''
148 |         if mode is None:
149 |             mode = getattr(fileobj, 'mode', 'rb')
150 | 
151 |         if mode.startswith('r'):
152 |             self.mode = READ
153 |             if not self.thread:
154 |                 self.thread = os.cpu_count() // 2 # cores number
155 |             self.raw = _MulitGzipReader(fileobj, thread=self.thread, max_block_size=blocksize)
156 |             self._buffer = io.BufferedReader(self.raw, blocksize)
157 |             self.name = filename
158 |             self.index = []
159 | 
160 |         elif mode.startswith(('w', 'a', 'x')):
161 |             self.mode = WRITE
162 |             if not self.thread:
163 |                 # thread is None or 0, use all available CPUs
164 |                 self.thread = os.cpu_count()
165 |             self._init_write(filename)
166 |             self.compress = zlib.compressobj(compresslevel,
167 |                                              zlib.DEFLATED,
168 |                                              -zlib.MAX_WBITS,
169 |                                              zlib.DEF_MEM_LEVEL,
170 |                                              0)
171 |             self._write_mtime = mtime
172 |             self.compresslevel = compresslevel
173 |             self.blocksize = blocksize # use 20M blocksize as default
174 |             self.pool = Pool(self.thread)
175 |             self.pool_result = []
176 |             self.small_buf = io.BytesIO()
177 |         else:
178 |             raise ValueError("Invalid mode: {!r}".format(mode))
179 | 
180 |         self.fileobj = fileobj
181 | 
182 |     def __repr__(self):
183 |         s = repr(self.fileobj)
184 |         return '<mgzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
185 | 
186 |     def _write_gzip_header(self):
187 |         ## ignored to write original header
188 |         pass
189 | 
190 |     def _compress_func(self, data, pdata=None):
191 |         """
192 |             Compress data with zlib deflate algorithm.
193 |             Input:
194 |                 data: btyes object of input data
195 |                 pdata: exists small buffer data
196 |             Return:
197 |                 tuple of (Buffered compressed data,
198 |                           Major compressed data,
199 |                           Rest data after flush buffer,
200 |                           CRC32,
201 |                           Original size)
202 |         """
203 |         cpr = zlib.compressobj(self.compresslevel,
204 |                                zlib.DEFLATED,
205 |                                -zlib.MAX_WBITS,
206 |                                9, # use memory level 9 > zlib.DEF_MEM_LEVEL (8) for better performance
207 |                                0)
208 |         if pdata:
209 |             prefix_bytes = cpr.compress(pdata)
210 |         body_bytes = cpr.compress(data)
211 |         rest_bytes = cpr.flush()
212 |         if pdata:
213 |             crc = zlib.crc32(data, zlib.crc32(pdata))
214 |             return (prefix_bytes, body_bytes, rest_bytes, crc, pdata.nbytes + data.nbytes)
215 |         else:
216 |             crc = zlib.crc32(data)
217 |             return (b'', body_bytes, rest_bytes, crc, data.nbytes)
218 | 
219 |     def write(self, data):
220 |         self._check_not_closed()
221 |         if self.mode != WRITE:
222 |             import errno
223 |             raise OSError(errno.EBADF, "write() on read-only GzipFile object")
224 | 
225 |         if self.fileobj is None:
226 |             raise ValueError("write() on closed GzipFile object")
227 | 
228 |         data = memoryview(data)
229 |         length = data.nbytes
230 | 
231 |         if length == 0:
232 |             return length
233 |         elif length >= self.blocksize:
234 |             if length < 2 * self.blocksize:
235 |                 # use sigle thread
236 |                 self._compress_block_async(data)
237 |             else:
238 |                 for st in range(0, length, self.blocksize):
239 |                     self._compress_block_async(data[st: st+self.blocksize])
240 |                     self._flush_pool()
241 |         elif length < self.blocksize:
242 |             self.small_buf.write(data)
243 |             if self.small_buf.tell() >= self.blocksize:
244 |                 self._compress_async(self.small_buf.getbuffer())
245 |                 self.small_buf = io.BytesIO()
246 |         self._flush_pool()
247 |         return length
248 | 
249 |     def _compress_async(self, data, pdata=None):
250 |         return self.pool_result.append(self.pool.apply_async(self._compress_func, args=(data, pdata)))
251 | 
252 |     def _compress_block_async(self, data):
253 |         if self.small_buf.tell() != 0:
254 |             self._compress_async(data, self.small_buf.getbuffer())
255 |             self.small_buf = io.BytesIO()
256 |         else:
257 |             self._compress_async(data)
258 | 
259 |     def _flush_pool(self, force=False):
260 |         if len(self.pool_result) <= self.thread and not force:
261 |             return 0
262 |         length = 0
263 |         if force:
264 |             flushSize = len(self.pool_result)
265 |         else:
266 |             flushSize = len(self.pool_result) - self.thread
267 |         for i in range(flushSize):
268 |             cdata = self.pool_result.pop(0).get()
269 |             length += self._write_member(cdata)
270 |             # (bodyBytes, resBytes, crc, oriSize) = rlt.get()
271 |             # compressRlt = rlt.get()
272 |         return length
273 | 
274 |     def _write_member(self, cdata):
275 |         """
276 |             Write a compressed data as a complete gzip member
277 |             Input:
278 |                 cdata:
279 |                     compressed data, a tuple of compressed result returned by _compress_func()
280 |             Return:
281 |                 size of member
282 |         """
283 |         size = self._write_member_header(len(cdata[0]) + len(cdata[1]) + len(cdata[2]), cdata[4])
284 |         self.fileobj.write(cdata[0])                   # buffer data
285 |         self.fileobj.write(cdata[1])                   # body data
286 |         self.fileobj.write(cdata[2])                   # rest data
287 |         write32u(self.fileobj, cdata[3])               # CRC32
288 |         write32u(self.fileobj, cdata[4] & 0xffffffff)  # raw data size in 32bits
289 |         return size
290 | 
291 |     def _write_member_header(self, compressed_size, raw_size):
292 |         self.fileobj.write(b'\037\213')             # magic header, 2 bytes
293 |         self.fileobj.write(b'\010')                 # compression method, 1 byte
294 |         try:
295 |             # RFC 1952 requires the FNAME field to be Latin-1. Do not
296 |             # include filenames that cannot be represented that way.
297 |             fname = os.path.basename(self.name)
298 |             if not isinstance(fname, bytes):
299 |                 fname = fname.encode('latin-1')
300 |             if fname.endswith(b'.gz'):
301 |                 fname = fname[:-3]
302 |         except UnicodeEncodeError:
303 |             fname = b''
304 |         flags = FEXTRA
305 |         if fname:
306 |             flags |= FNAME
307 |         self.fileobj.write(chr(flags).encode('latin-1'))  # flags, 1 byte
308 |         mtime = self._write_mtime
309 |         if mtime is None:
310 |             mtime = time.time()
311 |         write32u(self.fileobj, int(mtime))          # modified time, 4 bytes
312 |         self.fileobj.write(b'\002')                 # fixed flag (maximum compression), 1 byte
313 |         self.fileobj.write(b'\377')                 # OS (unknown), 1 byte
314 | 
315 |         # write extra flag for indexing
316 |         # XLEN, 8 bytes
317 |         self.fileobj.write(b'\x08\x00')             # extra flag len, 2 bytes
318 |         # EXTRA FLAG FORMAT:
319 |         # +---+---+---+---+---+---+---+---+
320 |         # |SI1|SI2|  LEN  |  MEMBER SIZE  |
321 |         # +---+---+---+---+---+---+---+---+
322 |         # SI1, SI2:      Subfield ID, 'IG' (Indexed Gzip file)
323 |         # LEN:           Length of subfield body, always 4 (bytes)
324 |         # MEMBER SIZE:   The size of current member
325 |         self.fileobj.write(SID)                   # subfield ID (IG), 2 bytes
326 |         # LEN: 4 bytes
327 |         self.fileobj.write(b'\x04\x00')             # subfield len (4), 2 bytes
328 |         # compressed data size: 16 + 4 + len(fname) + 1 + data + 8
329 |         #                       header + member size + filename with zero end + data block + CRC32 and ISIZE
330 |         member_size = 20 + len(fname) + 1 + compressed_size + 8
331 |         if not fname:
332 |             member_size -= 1
333 |         self.fileobj.write(struct.pack("<I", member_size)) # member size, 4 bytes
334 |         if fname:
335 |             self.fileobj.write(fname + b'\000')
336 |         return member_size
337 | 
338 |     def get_index(self):
339 |         """
340 |             Index Format:
341 |                 0: Start offset
342 |                 1: Block size
343 |                 2: Raw size
344 |                 3: Comment (If exists)
345 |         """
346 |         if self.mode != READ:
347 |             raise OSError("READ mode is required for get_index.")
348 |         self.index = []
349 |         raw_pos = self.myfileobj.tell()
350 |         self.myfileobj.seek(0)
351 |         while True:
352 |             commentByte = b""
353 |             self.myfileobj.seek(3, 1)
354 |             fByte = self.myfileobj.read(1)
355 |             if not fByte:
356 |                 break
357 |             flag, = struct.unpack("<B", fByte)
358 |             self.myfileobj.seek(8, 1)
359 |             extra_flag = self.myfileobj.read(8)
360 |             if not extra_flag:
361 |                 break
362 |             sid, _, msize = struct.unpack("<2sHI", extra_flag)
363 |             if sid != SID:
364 |                 raise OSError("Invaild Indexed GZIP format")
365 |             if flag & FNAME:
366 |                 while True:
367 |                     s = self.myfileobj.read(1)
368 |                     if not s or s==b'\000':
369 |                         break
370 |             if flag & FCOMMENT:
371 |                 while True:
372 |                     s = self.myfileobj.read(1)
373 |                     if not s or s==b'\000':
374 |                         break
375 |                     commentByte += s
376 |             if not self.index:
377 |                 self.index.append([0, msize, 0, commentByte.decode()])
378 |             else:
379 |                 self.index.append([self.index[-1][0] + self.index[-1][1], msize, 0, commentByte.decode()])
380 |             self.myfileobj.seek(self.index[-1][0] + self.index[-1][1] - 4)
381 |             isize, = struct.unpack("<I", self.myfileobj.read(4))
382 |             self.index[-1][2] = isize
383 |         self.myfileobj.seek(raw_pos)
384 |         return self.index
385 | 
386 |     def show_index(self):
387 |         if not self.index:
388 |             self.get_index()
389 |         block_id = 0
390 |         print("#ID\tStart\tBlockSize\tRawSize\tComment")
391 |         for e in self.index:
392 |             print(block_id, *e, sep="\t")
393 |             block_id += 1
394 | 
395 |     def build_index(self, idx_file=None):
396 |         if not idx_file:
397 |             idx_file = self.name + ".idx"
398 |         if not self.index:
399 |             self.get_index()
400 |         block_id = 0
401 |         with builtins.open(idx_file, 'w') as fh:
402 |             print("#ID\tStart\tBlockSize\tRawSize\tComment", file=fh)
403 |             for e in self.index:
404 |                 if not e[2]:
405 |                     continue
406 |                 print(block_id, *e, sep="\t", file=fh)
407 |                 block_id += 1
408 |         return self.index
409 | 
410 |     def load_index(self, idx_file):
411 |         self.index = []
412 |         with builtins.open(idx_file, 'r') as fh:
413 |             for line in fh:
414 |                 info = line.split()
415 |                 if not info or info[0].startswith('#'):
416 |                     continue
417 |                 self.index.append([int(info[1]), int(info[2]), int(info[3]), info[4]])
418 |         return self.index
419 | 
420 |     def set_read_blocks(self, block_ids):
421 |         self.raw.set_block_iter([self.index[x][0] for x in block_ids])
422 | 
423 |     def set_read_blocks_by_name(self, block_names):
424 |         """
425 |             If file use comment to record the name of blocks,
426 |             set read blocks to given list of block name.
427 | 
428 |             * The order of reading will follow the block
429 |               order in compressed file instead of input block_names
430 |         """
431 |         block_name_set = set(block_names)
432 |         self.raw.set_block_iter([x[0] for x in self.index if x[3] in block_name_set])
433 | 
434 |     def clear_read_blocks(self):
435 |         self.raw.clear_block_iter()
436 | 
437 |     def close(self):
438 |         fileobj = self.fileobj
439 |         if fileobj is None:
440 |             return
441 |         try:
442 |             if self.mode == WRITE:
443 |                 if self.small_buf.tell() != 0:
444 |                     self._compress_async(self.small_buf.getbuffer())
445 |                     self.small_buf = io.BytesIO()
446 |                 self._flush_pool(force=True)
447 |             elif self.mode == READ:
448 |                 self._buffer.close()
449 |         finally:
450 |             self.fileobj = None
451 |             myfileobj = self.myfileobj
452 |             if myfileobj:
453 |                 self.myfileobj = None
454 |                 myfileobj.close()
455 | 
456 |     def flush(self):
457 |         self._check_not_closed()
458 |         if self.mode == WRITE:
459 |             self._flush_pool(force=True)
460 |             self.fileobj.flush()
461 | 
462 | class _MulitGzipReader(_GzipReader):
463 |     def __init__(self, fp, thread=4, max_block_size=5*10**8):
464 |         super().__init__(fp)
465 | 
466 |         self.memberidx = [] # list of tuple (memberSize, rawTxtSize)
467 |         self._is_IG_member = False
468 |         self._header_size = 0
469 |         self.max_block_size = max_block_size
470 |         self.thread = thread
471 |         self._read_pool = []
472 |         self._pool = Pool(self.thread)
473 |         self._block_buff = b""
474 |         self._block_buff_pos = 0
475 |         self._block_buff_size = 0
476 |         self._is_eof = False
477 |         self._raw_fp = fp
478 |         self.block_start_iter = None
479 | 
480 |     def _decompress_func(self, data, rcrc, rsize):
481 |         """
482 |             Decompress data and return exact bytes of plain text
483 |             Input:
484 |                 data: compressed data
485 |                 rcrc: raw crc32
486 |                 rsize: raw data size
487 |             Return:
488 |                 body_bytes: bytes object of decompressed data
489 |                 rsize: raw data size
490 |                 crc: crc32 calculated by decompressed data
491 |                 rcrc: raw crc32 in compressed file
492 |         """
493 |         dpr = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
494 |         ## FIXME: case when raw data size > 4 GB, rsize is just the mod of 4G
495 |         ## not a good idea to read all of them in memory
496 |         body_bytes = dpr.decompress(data, rsize)
497 |         crc = zlib.crc32(body_bytes)
498 |         if dpr.unconsumed_tail != b"":
499 |             body_bytes += dpr.unconsumed_tail
500 |             crc = zlib.crc32(dpr.unconsumed_tail, crc)
501 |         return (body_bytes, rsize, crc, rcrc)
502 | 
503 |     def _decompress_async(self, data, rcrc, rsize):
504 |         self._read_pool.append(self._pool.apply_async(self._decompress_func, args=(data, rcrc, rsize)))
505 | 
506 |     def _read_gzip_header(self):
507 |         magic = self._fp.read(2)
508 |         if magic == b'':
509 |             return False
510 | 
511 |         if magic != b'\037\213':
512 |             raise OSError('Not a gzipped file (%r)' % magic)
513 | 
514 |         (method, flag,
515 |          self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
516 |         if method != 8:
517 |             raise OSError('Unknown compression method')
518 | 
519 |         if flag & FEXTRA:
520 |             # Read & discard the extra field, if present
521 |             extra_len, sid = struct.unpack("<H2s", self._read_exact(4))
522 |             if sid == SID:
523 |                 _, msize = struct.unpack("<HI" ,self._read_exact(extra_len - 2))
524 |                 self.memberidx.append(msize)
525 |                 self._is_IG_member = True
526 |                 # print("block", len(self.memberidx), msize, rsize)
527 |                 self._header_size = 20 # fixed header + FEXTRA
528 |             else:
529 |                 self._is_IG_member = False
530 | 
531 |         if flag & FNAME:
532 |             # Read and discard a null-terminated string containing the filename
533 |             while True:
534 |                 s = self._fp.read(1)
535 |                 self._header_size += 1
536 |                 if not s or s==b'\000':
537 |                     break
538 |         if flag & FCOMMENT:
539 |             # Read and discard a null-terminated string containing a comment
540 |             while True:
541 |                 s = self._fp.read(1)
542 |                 self._header_size += 1
543 |                 if not s or s==b'\000':
544 |                     break
545 |         if flag & FHCRC:
546 |             self._read_exact(2)     # Read & discard the 16-bit header CRC
547 |             self._header_size += 2
548 |         return True
549 | 
550 |     def read(self, size=-1):
551 |         if size < 0:
552 |             return self.readall()
553 |         # size=0 is special because decompress(max_length=0) is not supported
554 |         if not size:
555 |             return b""
556 | 
557 |         # For certain input data, a single
558 |         # call to decompress() may not return
559 |         # any data. In this case, retry until we get some data or reach EOF.
560 |         while True:
561 |             if self.block_start_iter and self.thread:
562 |                 try:
563 |                     self._fp.seek(next(self.block_start_iter))
564 |                 except Exception:
565 |                     self.clear_block_iter()
566 |                     self._fp.seek(0, 2)
567 |                     continue
568 |             if self._decompressor.eof:
569 |                 # Ending case: we've come to the end of a member in the file,
570 |                 # so finish up this member, and read a new gzip header.
571 |                 # Check the CRC and file size, and set the flag so we read
572 |                 # a new member
573 |                 self._read_eof()
574 |                 self._new_member = True
575 |                 self._decompressor = self._decomp_factory(
576 |                     **self._decomp_args)
577 | 
578 |             if self._new_member and self.thread:
579 |                 # If the _new_member flag is set, we have to
580 |                 # jump to the next member, if there is one.
581 |                 self._init_read()
582 |                 if not self._read_gzip_header():
583 |                     self._size = self._pos
584 |                     self._is_eof = True
585 |                 else:
586 |                     self._new_member = False
587 | 
588 |                     if self._is_IG_member:
589 |                         # 8 bytes for crc32 and isize
590 |                         cpr_size = self.memberidx[-1] - self._header_size - 8
591 |                         self._decompress_async(self._fp.read(cpr_size),
592 |                                                *self._read_eof_crc())
593 |                         self.thread -= 1
594 |                         self._new_member = True
595 |                         continue
596 | 
597 |             if self._block_buff_pos + size <= self._block_buff_size:
598 |                 st_pos = self._block_buff_pos
599 |                 self._block_buff_pos += size
600 |                 if self._block_buff_pos >= self._block_buff_size:
601 |                     self._block_buff_pos = self._block_buff_size
602 |                 return self._block_buff[st_pos:self._block_buff_pos]
603 |             elif self._read_pool:
604 |                 block_read_rlt = self._read_pool.pop(0).get()
605 |                 self.thread += 1
606 |                 # check decompressed data size
607 |                 if len(block_read_rlt[0]) != block_read_rlt[1]:
608 |                     raise OSError("Incorrect length of data produced")
609 |                 # check raw crc32 == decompressed crc32
610 |                 if block_read_rlt[2] != block_read_rlt[3]:
611 |                     raise OSError("CRC check failed {:s} != {:s}".format(
612 |                         block_read_rlt[3], block_read_rlt[2]
613 |                     ))
614 |                 self._block_buff = self._block_buff[self._block_buff_pos:] + block_read_rlt[0]
615 |                 self._block_buff_size = len(self._block_buff)
616 |                 self._block_buff_pos = min(size, self._block_buff_size)
617 |                 return self._block_buff[:size] # FIXME: fix issue when size > len(self._block_buff)
618 |             elif self._block_buff_pos != self._block_buff_size:
619 |                 # still something in self._block_buff
620 |                 st_pos = self._block_buff_pos
621 |                 self._block_buff_pos = self._block_buff_size
622 |                 return self._block_buff[st_pos:]
623 |             elif self._is_eof:
624 |                 return b""
625 | 
626 |             # Read a chunk of data from the file
627 |             buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)
628 | 
629 |             uncompress = self._decompressor.decompress(buf, size)
630 |             if self._decompressor.unconsumed_tail != b"":
631 |                 self._fp.prepend(self._decompressor.unconsumed_tail)
632 |             elif self._decompressor.unused_data != b"":
633 |                 # Prepend the already read bytes to the fileobj so they can
634 |                 # be seen by _read_eof() and _read_gzip_header()
635 |                 self._fp.prepend(self._decompressor.unused_data)
636 | 
637 |             if uncompress != b"":
638 |                 break
639 |             if buf == b"":
640 |                 raise EOFError("Compressed file ended before the "
641 |                                "end-of-stream marker was reached")
642 | 
643 |         self._add_read_data( uncompress )
644 |         self._pos += len(uncompress)
645 |         return uncompress
646 | 
647 |     def _read_eof_crc(self):
648 |         """
649 |             Get crc32 and isize without checking
650 |         """
651 |         crc32, isize = struct.unpack("<II", self._read_exact(8))
652 | 
653 |         # Gzip files can be padded with zeroes and still have archives.
654 |         # Consume all zero bytes and set the file position to the first
655 |         # non-zero byte. See http://www.gzip.org/#faq8
656 |         c = b"\x00"
657 |         while c == b"\x00":
658 |             c = self._fp.read(1)
659 |         if c:
660 |             self._fp.prepend(c)
661 |         return (crc32, isize)
662 | 
663 |     def set_block_iter(self, block_start_list):
664 |         self.block_start_iter = iter(block_start_list)
665 | 
666 |     def clear_block_iter(self):
667 |         self.block_start_iter = None


--------------------------------------------------------------------------------