├── .codecov.yml
├── .editorconfig
├── .gitattributes
├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.rst
├── pyproject.toml
├── src
    └── xopen
    │   ├── __init__.py
    │   ├── _version.pyi
    │   └── py.typed
├── tests
    ├── conftest.py
    ├── file.txt
    ├── file.txt.bz2
    ├── file.txt.gz
    ├── file.txt.xz
    ├── file.txt.zst
    ├── hello.gz
    ├── only_zeroes.zst
    ├── test_piped.py
    └── test_xopen.py
└── tox.ini


/.codecov.yml:
--------------------------------------------------------------------------------
 1 | comment: off
 2 | 
 3 | codecov:
 4 |   require_ci_to_pass: no
 5 | 
 6 | coverage:
 7 |   precision: 1
 8 |   round: down
 9 |   range: "70...100"
10 | 
11 |   status:
12 |     project: yes
13 |     patch: no
14 |     changes: no
15 | 
16 | comment: off
17 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | [*.py]
2 | charset=utf-8
3 | end_of_line=lf
4 | insert_final_newline=true
5 | indent_style=space
6 | indent_size=4
7 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | tests/file.txt eol=lf
2 | tests/file.txt.test eol=lf
3 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
  1 | name: CI
  2 | 
  3 | on: [push, pull_request]
  4 | 
  5 | jobs:
  6 |   lint:
  7 |     # Run for PRs only if they come from a forked repo (avoids duplicate runs)
  8 |     if: >-
  9 |       github.event_name != 'pull_request' ||
 10 |       github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name
 11 |     timeout-minutes: 10
 12 |     runs-on: ubuntu-latest
 13 |     strategy:
 14 |       matrix:
 15 |         python-version: ["3.10"]
 16 |         toxenv: [black, flake8, mypy]
 17 |     steps:
 18 |     - uses: actions/checkout@v4
 19 |     - name: Set up Python ${{ matrix.python-version }}
 20 |       uses: actions/setup-python@v4
 21 |       with:
 22 |         python-version: ${{ matrix.python-version }}
 23 |     - name: Install dependencies
 24 |       run: python -m pip install tox
 25 |     - name: Run tox ${{ matrix.toxenv }}
 26 |       run: tox -e ${{ matrix.toxenv }}
 27 | 
 28 |   test:
 29 |     if: >-
 30 |       github.event_name != 'pull_request' ||
 31 |       github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name
 32 |     timeout-minutes: 10
 33 |     runs-on: ${{ matrix.os }}
 34 |     strategy:
 35 |       matrix:
 36 |         os: [ubuntu-latest]
 37 |         python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "pypy-3.9"]
 38 |         optional-deps: [true]
 39 |         with-libs: [true]
 40 |         include:
 41 |         - os: macos-latest
 42 |           python-version: "3.10"
 43 |           optional-deps: true
 44 |         - os: ubuntu-latest
 45 |           python-version: "3.10"
 46 |           with-libs: false
 47 |           optional-deps: false
 48 |         - os: ubuntu-latest
 49 |           python-version: "3.10"
 50 |           with-libs: false
 51 |           optional-deps: true
 52 |         - os: ubuntu-latest
 53 |           python-version: "3.10"
 54 |           optional-deps: false
 55 |           with-libs: false
 56 |           with-zstandard: true
 57 |         - os: windows-latest
 58 |           python-version: "3.10"
 59 |     steps:
 60 |     - name: Install optional tools macOS
 61 |       if: runner.os == 'macOS' && matrix.optional-deps
 62 |       run: brew install pigz pbzip2 isa-l zstd
 63 |     - name: Install optional tools Linux
 64 |       if: runner.os == 'Linux' && matrix.optional-deps
 65 |       run: sudo apt-get install pigz pbzip2 isal zstd
 66 |     - name: Remove xz
 67 |       if: runner.os == 'Linux' && !matrix.optional-deps
 68 |       run: while which xz; do sudo rm $(which xz); done
 69 |     - uses: actions/checkout@v4
 70 |       with:
 71 |         fetch-depth: 0
 72 |     - name: Set up Python ${{ matrix.python-version }}
 73 |       uses: actions/setup-python@v4
 74 |       with:
 75 |         python-version: ${{ matrix.python-version }}
 76 |     - name: Install dependencies
 77 |       run: python -m pip install tox
 78 |     - name: Test
 79 |       run: tox -e py
 80 |       if: matrix.with-libs
 81 |     - name: Test without python-isal and python-zlib-ng
 82 |       run: tox -e no-libs
 83 |       if: true && !matrix.with-libs
 84 |     - name: Test with zstandard
 85 |       if: matrix.with-zstandard
 86 |       run: tox -e zstd
 87 |     - name: Upload coverage report
 88 |       uses: codecov/codecov-action@v3
 89 | 
 90 |   deploy:
 91 |     timeout-minutes: 10
 92 |     runs-on: ubuntu-latest
 93 |     needs: [lint, test]
 94 |     if: startsWith(github.ref, 'refs/tags')
 95 |     steps:
 96 |     - uses: actions/checkout@v4
 97 |       with:
 98 |         fetch-depth: 0  # required for setuptools_scm
 99 |     - name: Set up Python
100 |       uses: actions/setup-python@v4
101 |       with:
102 |         python-version: "3.10"
103 |     - name: Make distributions
104 |       run: |
105 |         python -m pip install build
106 |         python -m build
107 |         ls -l dist/
108 |     - name: Publish to PyPI
109 |       uses: pypa/gh-action-pypi-publish@release/v1
110 |       with:
111 |         user: __token__
112 |         password: ${{ secrets.pypi_password }}
113 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | *.pyc
3 | *.egg-info
4 | *~
5 | .tox
6 | venv/
7 | src/xopen/_version.py
8 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 | -   repo: https://github.com/pre-commit/pre-commit-hooks
 3 |     rev: v2.3.0
 4 |     hooks:
 5 |     -   id: end-of-file-fixer
 6 |     -   id: trailing-whitespace
 7 | -   repo: https://github.com/psf/black
 8 |     rev: 22.3.0
 9 |     hooks:
10 |     -   id: black
11 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2010 The xopen developers
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | .. image:: https://github.com/pycompression/xopen/workflows/CI/badge.svg
  2 |   :target: https://github.com/pycompression/xopen
  3 |   :alt:
  4 | 
  5 | .. image:: https://img.shields.io/pypi/v/xopen.svg?branch=main
  6 |   :target: https://pypi.python.org/pypi/xopen
  7 | 
  8 | .. image:: https://img.shields.io/conda/v/conda-forge/xopen.svg
  9 |   :target: https://anaconda.org/conda-forge/xopen
 10 |   :alt:
 11 | 
 12 | .. image:: https://codecov.io/gh/pycompression/xopen/branch/main/graph/badge.svg
 13 |   :target: https://codecov.io/gh/pycompression/xopen
 14 |   :alt:
 15 | 
 16 | =====
 17 | xopen
 18 | =====
 19 | 
 20 | This Python module provides an ``xopen`` function that works like Python’s
 21 | built-in ``open`` function but also transparently deals with compressed files.
 22 | ``xopen`` selects the most efficient method for reading or writing a compressed file.
 23 | 
 24 | Supported compression formats are:
 25 | 
 26 | - gzip (``.gz``)
 27 | - bzip2 (``.bz2``)
 28 | - xz (``.xz``)
 29 | - Zstandard (``.zst``) (optional)
 30 | 
 31 | 
 32 | Example usage
 33 | -------------
 34 | 
 35 | Open a file for reading::
 36 | 
 37 |     from xopen import xopen
 38 | 
 39 |     with xopen("file.txt.gz") as f:
 40 |         content = f.read()
 41 | 
 42 | Write to a file in binary mode,
 43 | set the compression level
 44 | and avoid using an external process::
 45 | 
 46 |     from xopen import xopen
 47 | 
 48 |     with xopen("file.txt.xz", mode="wb", threads=0, compresslevel=3) as f:
 49 |         f.write(b"Hello")
 50 | 
 51 | 
 52 | The ``xopen`` function
 53 | ----------------------
 54 | 
 55 | The ``xopen`` module offers a single function named ``xopen`` with the following
 56 | signature::
 57 | 
 58 |   xopen(
 59 |     filename: str | bytes | os.PathLike,
 60 |     mode: Literal["r", "w", "a", "rt", "rb", "wt", "wb", "at", "ab"] = "r",
 61 |     compresslevel: Optional[int] = None,
 62 |     threads: Optional[int] = None,
 63 |     *,
 64 |     encoding: str = "utf-8",
 65 |     errors: Optional[str] = None,
 66 |     newline: Optional[str] = None,
 67 |     format: Optional[str] = None,
 68 |   ) -> IO
 69 | 
 70 | The function opens the file using a function suitable for the detected
 71 | file format and returns an open file-like object.
 72 | 
 73 | When writing, the file format is chosen based on the file name extension:
 74 | ``.gz``, ``.bz2``, ``.xz``, ``.zst``. This can be overridden with ``format``.
 75 | If the extension is not recognized, no compression is used.
 76 | 
 77 | When reading and a file name extension is available, the format is detected
 78 | from the extension.
 79 | When reading and no file name extension is available,
 80 | the format is detected from the
 81 | `file signature <https://en.wikipedia.org/wiki/File_format#Magic_number>`_.
 82 | 
 83 | Parameters
 84 | ~~~~~~~~~~
 85 | 
 86 | **filename** (str, bytes, or `os.PathLike <https://docs.python.org/3/library/os.html#os.PathLike>`_):
 87 | Name of the file to open.
 88 | 
 89 | If set to ``"-"``, standard output (in mode ``"w"``) or
 90 | standard input (in mode ``"r"``) is returned.
 91 | 
 92 | **mode**, **encoding**, **errors**, **newline**:
 93 | These parameters have the same meaning as in Python’s built-in
 94 | `open function <https://docs.python.org/3/library/functions.html#open>`_
 95 | except that the default encoding is always UTF-8 instead of the
 96 | preferred locale encoding.
 97 | ``encoding``, ``errors`` and ``newline`` are only used when opening a file in text mode.
 98 | 
 99 | **compresslevel**:
100 | The compression level for writing to gzip, xz and Zstandard files.
101 | If set to None, a default depending on the format is used:
102 | gzip: 1, xz: 6, Zstandard: 3.
103 | 
104 | This parameter is ignored for other compression formats.
105 | 
106 | **format**:
107 | Override the autodetection of the input or output format.
108 | Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``.
109 | 
110 | **threads**:
111 | Set the number of additional threads spawned for compression or decompression.
112 | May be ignored if the backend does not support threads.
113 | 
114 | If *threads* is None (the default), as many threads as available CPU cores are
115 | used, but not more than four.
116 | 
117 | xopen tries to offload the (de)compression to other threads
118 | to free up the main Python thread for the application.
119 | This can either be done by using a subprocess to an external application or
120 | using a library that supports threads.
121 | 
122 | Set threads to 0 to force xopen to use only the main Python thread.
123 | 
124 | 
125 | Backends
126 | --------
127 | 
128 | Opening of gzip files is delegated to one of these programs or libraries:
129 | 
130 | * `python-isal <https://github.com/pycompression/python-isal>`_.
131 |   Supports multiple threads and compression levels up to 3.
132 | * `python-zlib-ng <https://github.com/pycompression/python-zlib-ng>`_
133 | * `pigz <https://zlib.net/pigz/>`_ (a parallel version of ``gzip``)
134 | * `gzip <https://www.gnu.org/software/gzip/>`_
135 | 
136 | For xz files, a pipe to the ``xz`` program is used because it has
137 | built-in support for multithreaded compression.
138 | 
139 | For bz2 files, `pbzip2 (parallel bzip2) <http://compression.great-site.net/pbzip2/>`_ is used.
140 | 
141 | ``xopen`` falls back to Python’s built-in functions
142 | (``gzip.open``, ``lzma.open``, ``bz2.open``)
143 | if none of the other methods can be used.
144 | 
145 | 
146 | Reproducibility
147 | ---------------
148 | 
149 | xopen writes gzip files in a reproducible manner.
150 | 
151 | Normally, gzip files contain a timestamp in the file header,
152 | which means that compressing the same data at different times results in different output files.
153 | xopen disables this for all of the supported gzip compression backends.
154 | For example, when using an external process, it sets the command-line option
155 | ``--no-name`` (same as ``-n``).
156 | 
157 | Note that different gzip compression backends typically do not produce
158 | identical output, so reproducibility may no longer be given when the execution environment changes
159 | from one ``xopen()`` invocation to the next.
160 | This includes the CPU architecture as `igzip adjusts its algorithm
161 | depending on it <https://github.com/intel/isa-l/issues/140#issuecomment-634877966>`_.
162 | 
163 | bzip2 and xz compression methods do not store timestamps in the file headers,
164 | so output from them is also reproducible.
165 | 
166 | 
167 | Optional Zstandard support
168 | --------------------------
169 | 
170 | For reading and writing Zstandard (``.zst``) files, either the ``zstd`` command-line
171 | program or the Python ``zstandard`` package needs to be installed.
172 | 
173 | * If the ``threads`` parameter to ``xopen()`` is ``None`` (the default) or any value greater than 0,
174 |   ``xopen`` uses an external ``zstd`` process.
175 | * If the above fails (because no ``zstd`` program is available) or if ``threads`` is 0,
176 |   the ``zstandard`` package is used.
177 | 
178 | To ensure that you get the correct ``zstandard`` version, you can specify the ``zstd`` extra for
179 | ``xopen``, that is, install it using ``pip install xopen[zstd]``.
180 | 
181 | 
182 | Changelog
183 | ---------
184 | 
185 | development version
186 | ~~~~~~~~~~~~~~~~~~~
187 | 
188 | * Dropped support for Python 3.8
189 | * Started supporting Python 3.13
190 | 
191 | v2.0.2 (2024-06-12)
192 | ~~~~~~~~~~~~~~~~~~~
193 | * #161: Fix a bug that was triggered when reading large compressed files with
194 |   an external program.
195 | 
196 | v2.0.1 (2024-03-28)
197 | ~~~~~~~~~~~~~~~~~~~
198 | + #158: Fixed a bug where reading from stdin and other pipes would discard the
199 |   first bytes from the input.
200 | + #156: Zstd files compressed with the ``--long=31`` option can now be opened
201 |   without throwing errors.
202 | 
203 | v2.0.0 (2024-03-26)
204 | ~~~~~~~~~~~~~~~~~~~
205 | 
206 | * #154: Support for gzip levels has been made more consistent. Levels 0-9
207 |   are supported. Level 11 which was only available when the ``pigz`` backend was
208 |   present is not supported anymore. Level 0, gzip format without compression,
209 |   led to crashes when the ``gzip`` application backend was used as this does
210 |   not have a ``-0`` flag. ``xopen()`` now defers to other backends in that case.
211 | * #152: ``xopen()`` now accepts `file-like objects
212 |   <https://docs.python.org/3/glossary.html#term-file-object>`_ for its filename
213 |   argument.
214 | * #146, #147, #148: Various refactors for better code size and readability:
215 | 
216 |     * PipedCompressionReader/Writer are now combined into the
217 |       _PipedCompressionProgram class.
218 |     * _PipedCompressionProgram is binary-only. For text reading and writing
219 |       it is wrapped in an ``io.TextIOWrapper`` in the ``xopen()`` function.
220 |     * Classes that derive from PipedCompressionReader/Writer have been removed.
221 | * #148: xopen's classes, variables and functions pertaining to piped reading
222 |   and writing are all made private by prefixing them with an underscore.
223 |   These are not part of the API and may change between releases.
224 | 
225 | v1.9.0 (2024-01-31)
226 | ~~~~~~~~~~~~~~~~~~~
227 | * #142: The python-isal compression backend is now only used for compression
228 |   levels 1 and 2. Contrary to other backends, python-isal level 0 gave
229 |   compressed rather than uncompressed data in gzip format. Level 3 on
230 |   python-isal did not provide better compression than level 2.
231 | * #140: PipedCompressionReader/Writer now derive from the `io.IOBase
232 |   <https://docs.python.org/3/library/io.html#io.IOBase>`_ abstract class.
233 | * #138: The gzip default compression level is now 1 when no value is provided
234 |   by the calling function. The default used to be determined by the backend.
235 | * #135: xopen now uses zlib-ng when available and applicable.
236 | * #133: Piped ``igzip`` is no longer used as a (de)compression backend as
237 |   python-isal's threaded mode is a better choice in all use cases.
238 | 
239 | v1.8.0 (2023-11-03)
240 | ~~~~~~~~~~~~~~~~~~~
241 | * #131: xopen now defers to the ``isal.igzip_threaded`` module rather than
242 |   piping to external programs in applicable cases. This makes reading and
243 |   writing to gzip files using threads more efficient.
244 | * Support for Python 3.7 is dropped and support for Python 3.12 is added.
245 | 
246 | v1.7.0 (2022-11-03)
247 | ~~~~~~~~~~~~~~~~~~~
248 | 
249 | * #91: Added optional support for Zstandard (``.zst``) files.
250 |   This requires that the Python ``zstandard`` package is installed
251 |   or that the ``zstd`` command-line program is available.
252 | 
253 | v1.6.0 (2022-08-10)
254 | ~~~~~~~~~~~~~~~~~~~
255 | 
256 | * #94: When writing gzip files, the timestamp and name of the original
257 |   file is omitted (equivalent to using ``gzip --no-name`` (or ``-n``) on the
258 |   command line). This allows files to be written in a reproducible manner.
259 | 
260 | v1.5.0 (2022-03-23)
261 | ~~~~~~~~~~~~~~~~~~~
262 | 
263 | * #100: Dropped Python 3.6 support
264 | * #101: Added support for piping into and from an external ``xz`` process. Contributed by @fanninpm.
265 | * #102: Support setting the xz compression level. Contributed by @tsibley.
266 | 
267 | v1.4.0 (2022-01-14)
268 | ~~~~~~~~~~~~~~~~~~~
269 | 
270 | * Add ``seek()`` and ``tell()`` to the ``PipedCompressionReader`` classes
271 |   (for Windows compatibility)
272 | 
273 | v1.3.0 (2022-01-10)
274 | ~~~~~~~~~~~~~~~~~~~
275 | 
276 | * xopen is now available on Windows (in addition to Linux and macOS).
277 | * For greater compatibility with `the built-in open()
278 |   function <https://docs.python.org/3/library/functions.html#open>`_,
279 |   ``xopen()`` has gained the parameters *encoding*, *errors* and *newline*
280 |   with the same meaning as in ``open()``. Unlike built-in ``open()``, though,
281 |   encoding is UTF-8 by default.
282 | * A parameter *format* has been added that allows forcing the compression
283 |   file format.
284 | 
285 | v1.2.0 (2021-09-21)
286 | ~~~~~~~~~~~~~~~~~~~
287 | 
288 | * `pbzip2 <http://compression.great-site.net/pbzip2/>`_ is now used to open ``.bz2`` files if
289 |   ``threads`` is greater than zero (contributed by @DriesSchaumont).
290 | 
291 | v1.1.0 (2021-01-20)
292 | ~~~~~~~~~~~~~~~~~~~
293 | 
294 | * Python 3.5 support is dropped.
295 | * On Linux systems, `python-isal <https://github.com/pycompression/python-isal>`_
296 |   is now added as a requirement. This will speed up the reading of gzip files
297 |   significantly when no external processes are used.
298 | 
299 | v1.0.0 (2020-11-05)
300 | ~~~~~~~~~~~~~~~~~~~
301 | 
302 | * If installed, the ``igzip`` program (part of
303 |   `Intel ISA-L <https://github.com/intel/isa-l/>`_) is now used for reading
304 |   and writing gzip-compressed files at compression levels 1-3, which results
305 |   in a significant speedup.
306 | 
307 | v0.9.0 (2020-04-02)
308 | ~~~~~~~~~~~~~~~~~~~
309 | 
310 | * #80: When the file name extension of a file to be opened for reading is not
311 |   available, the content is inspected (if possible) and used to determine
312 |   which compression format applies (contributed by @bvaisvil).
313 | * This release drops Python 2.7 and 3.4 support. Python 3.5 or later is
314 |   now required.
315 | 
316 | v0.8.4 (2019-10-24)
317 | ~~~~~~~~~~~~~~~~~~~
318 | 
319 | * When reading gzipped files, force ``pigz`` to use only a single process.
320 |   ``pigz`` cannot use multiple cores anyway when decompressing. By default,
321 |   it would use extra I/O processes, which slightly reduces wall-clock time,
322 |   but increases CPU time. Single-core decompression with ``pigz`` is still
323 |   about twice as fast as regular ``gzip``.
324 | * Allow ``threads=0`` for specifying that no external ``pigz``/``gzip``
325 |   process should be used (then regular ``gzip.open()`` is used instead).
326 | 
327 | v0.8.3 (2019-10-18)
328 | ~~~~~~~~~~~~~~~~~~~
329 | 
330 | * #20: When reading gzipped files, let ``pigz`` use at most four threads by default.
331 |   This limit previously only applied when writing to a file. Contributed by @bernt-matthias.
332 | * Support Python 3.8
333 | 
334 | v0.8.0 (2019-08-14)
335 | ~~~~~~~~~~~~~~~~~~~
336 | 
337 | * #14: Speed improvements when iterating over gzipped files.
338 | 
339 | v0.6.0 (2019-05-23)
340 | ~~~~~~~~~~~~~~~~~~~
341 | 
342 | * For reading from gzipped files, xopen will now use a ``pigz`` subprocess.
343 |   This is faster than using ``gzip.open``.
344 | * Python 2 support will be dropped in one of the next releases.
345 | 
346 | v0.5.0 (2019-01-30)
347 | ~~~~~~~~~~~~~~~~~~~
348 | 
349 | * By default, pigz is now only allowed to use at most four threads. This hopefully reduces
350 |   problems some users had with too many threads when opening many files at the same time.
351 | * xopen now accepts pathlib.Path objects.
352 | 
353 | v0.4.0 (2019-01-07)
354 | ~~~~~~~~~~~~~~~~~~~
355 | 
356 | * Drop Python 3.3 support
357 | * Add a ``threads`` parameter (passed on to ``pigz``)
358 | 
359 | v0.3.2 (2017-11-22)
360 | ~~~~~~~~~~~~~~~~~~~
361 | 
362 | * #6: Make multi-block bz2 work on Python 2 by using external bz2file library.
363 | 
364 | v0.3.1 (2017-11-22)
365 | ~~~~~~~~~~~~~~~~~~~
366 | 
367 | * Drop Python 2.6 support
368 | * #5: Fix PipedGzipReader.read() not returning anything
369 | 
370 | v0.3.0 (2017-11-15)
371 | ~~~~~~~~~~~~~~~~~~~
372 | 
373 | * Add gzip compression parameter
374 | 
375 | v0.2.1 (2017-05-31)
376 | ~~~~~~~~~~~~~~~~~~~
377 | 
378 | * #3: Allow appending to bz2 and lzma files where possible
379 | 
380 | v0.1.1 (2016-12-02)
381 | ~~~~~~~~~~~~~~~~~~~
382 | 
383 | * Fix a deadlock
384 | 
385 | v0.1.0 (2016-09-09)
386 | ~~~~~~~~~~~~~~~~~~~
387 | 
388 | * Initial release
389 | 
390 | Credits
391 | -------
392 | 
393 | The name ``xopen`` was taken from the C function of the same name in the
394 | `utils.h file that is part of
395 | BWA <https://github.com/lh3/bwa/blob/83662032a2192d5712996f36069ab02db82acf67/utils.h>`_.
396 | 
397 | Some ideas were taken from the `canopener project <https://github.com/selassid/canopener>`_.
398 | If you also want to open S3 files, you may want to use that module instead.
399 | 
400 | @kyleabeauchamp contributed support for appending to files before this repository was created.
401 | 
402 | 
403 | Maintainers
404 | -----------
405 | 
406 | * Marcel Martin
407 | * Ruben Vorderman
408 | * See also the `full list of contributors <https://github.com/pycompression/xopen/graphs/contributors>`_.
409 | 
410 | 
411 | Links
412 | -----
413 | 
414 | * `Source code <https://github.com/pycompression/xopen/>`_
415 | * `Report an issue <https://github.com/pycompression/xopen/issues>`_
416 | * `Project page on PyPI (Python package index) <https://pypi.python.org/pypi/xopen/>`_
417 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools >= 77.0.3", "setuptools_scm[toml]>=6.2"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "xopen"
 7 | authors = [
 8 |     {name = "Marcel Martin", email = "marcel.martin@scilifelab.se"},
 9 |     {name = "Ruben Vorderman", email = "r.h.p.vorderman@lumc.nl"}
10 | ]
11 | description = "Open compressed files transparently"
12 | readme = "README.rst"
13 | license = "MIT"
14 | classifiers = [
15 |     "Development Status :: 5 - Production/Stable",
16 |     "Programming Language :: Python :: 3"
17 | ]
18 | requires-python = ">=3.9"
19 | dynamic = ["version"]
20 | dependencies = [
21 |     'isal>=1.6.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"',
22 |     'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"'
23 | ]
24 | 
25 | [project.urls]
26 | homepage = "https://github.com/pycompression/xopen/"
27 | 
28 | [project.optional-dependencies]
29 | dev = ["pytest"]
30 | zstd = ["zstandard<1"]
31 | 
32 | [tool.setuptools_scm]
33 | write_to = "src/xopen/_version.py"
34 | 
35 | [tool.pytest.ini_options]
36 | addopts = "--strict-markers"
37 | 


--------------------------------------------------------------------------------
/src/xopen/__init__.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Open compressed files transparently.
  3 | """
  4 | 
# Names exported by ``from xopen import *``.
# _PipedCompressionProgram is listed explicitly: without an entry here, a star
# import would skip it because its name starts with an underscore.
__all__ = [
    "xopen",
    "_PipedCompressionProgram",
    "__version__",
]
 10 | 
 11 | import dataclasses
 12 | import gzip
 13 | import stat
 14 | import sys
 15 | import io
 16 | import os
 17 | import bz2
 18 | import lzma
 19 | import signal
 20 | import pathlib
 21 | import subprocess
 22 | import tempfile
 23 | import threading
 24 | import time
 25 | from typing import (
 26 |     Dict,
 27 |     Optional,
 28 |     Union,
 29 |     IO,
 30 |     overload,
 31 |     BinaryIO,
 32 |     Literal,
 33 |     Tuple,
 34 | )
 35 | from types import ModuleType
 36 | 
 37 | from ._version import version as __version__
 38 | 
# 128K buffer size also used by cat, pigz etc. It is faster than the 8K default.
# max() keeps the interpreter's own default if that is ever larger than 128K.
BUFFER_SIZE = max(io.DEFAULT_BUFFER_SIZE, 128 * 1024)

# Default compression level for each supported format.
# NOTE(review): presumably applied when the caller passes compresslevel=None;
# the code that consumes these constants is not visible in this part of the file.
XOPEN_DEFAULT_GZIP_COMPRESSION = 1
XOPEN_DEFAULT_BZ2_COMPRESSION = 9
XOPEN_DEFAULT_XZ_COMPRESSION = 6
XOPEN_DEFAULT_ZST_COMPRESSION = 3
 46 | 
# Optional third-party backends. Each name is declared as Optional[ModuleType]
# up front so that assigning None in the ImportError branches below type-checks.
igzip: Optional[ModuleType]
isal_zlib: Optional[ModuleType]
igzip_threaded: Optional[ModuleType]
zlib_ng: Optional[ModuleType]
gzip_ng: Optional[ModuleType]
gzip_ng_threaded: Optional[ModuleType]

# python-isal: all three names are set to None when the package is missing.
try:
    from isal import igzip, igzip_threaded, isal_zlib
except ImportError:
    igzip = None
    isal_zlib = None
    igzip_threaded = None

# python-zlib-ng: all three names are set to None when the package is missing.
try:
    from zlib_ng import gzip_ng, gzip_ng_threaded, zlib_ng
except ImportError:
    gzip_ng = None
    gzip_ng_threaded = None
    zlib_ng = None

# zstandard: None when the package is missing.
try:
    import zstandard  # type: ignore
except ImportError:
    zstandard = None  # type: ignore

# fcntl cannot be imported on every platform; it is set to None in that case.
try:
    import fcntl

    # fcntl.F_SETPIPE_SZ will be available in python 3.10.
    # https://github.com/python/cpython/pull/21921
    # If not available: set it to the correct value for known platforms.
    if not hasattr(fcntl, "F_SETPIPE_SZ") and sys.platform == "linux":
        setattr(fcntl, "F_SETPIPE_SZ", 1031)
except ImportError:
    fcntl = None  # type: ignore
 83 | 
 84 | _MAX_PIPE_SIZE_PATH = pathlib.Path("/proc/sys/fs/pipe-max-size")
 85 | try:
 86 |     _MAX_PIPE_SIZE = int(
 87 |         _MAX_PIPE_SIZE_PATH.read_text(encoding="ascii")
 88 |     )  # type: Optional[int]
 89 | except (
 90 |     OSError
 91 | ):  # Catches file not found and permission errors. Possible other errors too.
 92 |     _MAX_PIPE_SIZE = None
 93 | 
 94 | 
# A file location given as text, bytes or a path-like object.
FilePath = Union[str, bytes, os.PathLike]
# Either a FilePath or an IO object.
FileOrPath = Union[FilePath, IO]
 97 | 
 98 | 
 99 | @dataclasses.dataclass
100 | class _ProgramSettings:
101 |     program_args: Tuple[str, ...]
102 |     acceptable_compression_levels: Tuple[int, ...] = tuple(range(1, 10))
103 |     threads_flag: Optional[str] = None
104 |     # This exit code is not interpreted as an error when terminating the process
105 |     allowed_exit_code: Optional[int] = -signal.SIGTERM
106 |     # If this message is printed on stderr on terminating the process,
107 |     # it is not interpreted as an error
108 |     allowed_exit_message: Optional[bytes] = None
109 | 
110 | 
# Settings for every external program that can be piped to, keyed by the
# program's name.
_PROGRAM_SETTINGS: Dict[str, _ProgramSettings] = {
    # pbzip2 is the only entry that allows a stderr message instead of an
    # exit code.
    "pbzip2": _ProgramSettings(
        program_args=("pbzip2",),
        acceptable_compression_levels=tuple(range(1, 10)),
        threads_flag="-p",
        allowed_exit_code=None,
        allowed_exit_message=b"\n *Control-C or similar caught [sig=15], quitting...",
    ),
    "xz": _ProgramSettings(
        program_args=("xz",),
        acceptable_compression_levels=tuple(range(0, 10)),
        threads_flag="-T",
    ),
    "zstd": _ProgramSettings(
        program_args=("zstd",),
        acceptable_compression_levels=tuple(range(1, 20)),
        threads_flag="-T",
    ),
    # pigz accepts level 11 in addition to 0-9.
    "pigz": _ProgramSettings(
        program_args=("pigz", "--no-name"),
        acceptable_compression_levels=(*range(0, 10), 11),
        threads_flag="-p",
    ),
    # gzip has no threads flag.
    "gzip": _ProgramSettings(
        program_args=("gzip", "--no-name"),
        acceptable_compression_levels=tuple(range(1, 10)),
    ),
}
124 | 
125 | 
126 | def _available_cpu_count() -> int:
127 |     """
128 |     Number of available virtual or physical CPUs on this system
129 |     Adapted from http://stackoverflow.com/a/1006301/715090
130 |     """
131 |     try:
132 |         return len(os.sched_getaffinity(0))
133 |     except AttributeError:
134 |         pass
135 |     import re
136 | 
137 |     try:
138 |         with open("/proc/self/status") as f:
139 |             status = f.read()
140 |         m = re.search(r"(?m)^Cpus_allowed:\s*(.*)$", status)
141 |         if m:
142 |             res = bin(int(m.group(1).replace(",", ""), 16)).count("1")
143 |             if res > 0:
144 |                 return res
145 |     except OSError:
146 |         pass
147 |     count = os.cpu_count()
148 |     return 1 if count is None else count
149 | 
150 | 
def _set_pipe_size_to_max(fd: int) -> None:
    """
    Set pipe size to maximum on platforms that support it.

    Nothing happens when ``fcntl`` has no ``F_SETPIPE_SZ`` attribute or when
    no maximum pipe size is known. An OSError raised by the resize itself is
    ignored, so the call is best-effort.

    :param fd: The file descriptor to increase the pipe size for.
    """
    if hasattr(fcntl, "F_SETPIPE_SZ") and _MAX_PIPE_SIZE:
        try:
            fcntl.fcntl(fd, fcntl.F_SETPIPE_SZ, _MAX_PIPE_SIZE)  # type: ignore
        except OSError:
            pass
162 | 
163 | 
class _PipedCompressionProgram(io.IOBase):
    """
    Read and write compressed files by running an external process and piping into it.

    When reading, the source file object is fed to the stdin of the program
    by a helper thread and the decompressed data is read from its stdout.
    When writing, data is written to the stdin of the program, whose stdout
    is connected directly to the destination file object.
    """

    def __init__(
        self,
        filename: FileOrPath,
        mode="rb",
        compresslevel: Optional[int] = None,
        threads: Optional[int] = None,
        program_settings: _ProgramSettings = _ProgramSettings(("gzip", "--no-name")),
    ):
        """
        filename -- path or file object that holds (or will hold) the compressed data
        mode -- one of 'r', 'rb', 'w', 'wb', 'a', 'ab'. The file is always handled
            in binary mode; a missing 'b' is appended.
        compresslevel -- compression level. Must be None or one of
            program_settings.acceptable_compression_levels. It is only passed to
            the program when writing.
        threads (int) -- number of threads. If this is set to None, a reasonable default is
            used. At the moment, this means one thread for reading and, for writing,
            the number of available CPU cores capped at four to avoid creating too
            many threads. With 0, no threads flag is passed to the program.
        program_settings -- the program arguments, the acceptable compression levels,
            the flag used to denote the number of threads (if None, the program is
            called without a threads flag) and the exit code/message that are
            tolerated when the process is terminated on close.

        Raises ValueError for an unsupported mode or compression level and OSError
        if the program cannot be started or exits with an error right away.
        """
        self._error_raised = False
        # Copy the arguments: _open_process extends this list in place.
        self._program_args = list(program_settings.program_args)
        self._allowed_exit_code = program_settings.allowed_exit_code
        self._allowed_exit_message = program_settings.allowed_exit_message
        if mode not in ("r", "rb", "w", "wb", "a", "ab"):
            raise ValueError(
                f"Mode is '{mode}', but it must be 'r', 'rb', 'w', 'wb', 'a', or 'ab'"
            )
        if "b" not in mode:
            mode += "b"
        if (
            compresslevel is not None
            and compresslevel not in program_settings.acceptable_compression_levels
        ):
            raise ValueError(
                f"compresslevel must be in {program_settings.acceptable_compression_levels}."
            )
        self._compresslevel = compresslevel
        self.fileobj, self.closefd = _file_or_path_to_binary_stream(filename, mode)
        self._path = _filepath_from_path_or_filelike(filename)
        self.name: str = str(self._path)
        self._mode: str = mode
        # stderr of the program is collected in a temporary file so that it
        # can be included in the error message later on.
        self._stderr = tempfile.TemporaryFile("w+b")
        self._threads_flag: Optional[str] = program_settings.threads_flag

        if threads is None:
            if "r" in mode:
                # Reading occurs single threaded by default. This has the least
                # amount of overhead and is fast enough for most use cases.
                threads = 1
            else:
                threads = min(_available_cpu_count(), 4)
        self._threads = threads

        self._open_process()

    def _open_process(self):
        """
        Assemble the command line, start the program and connect its pipes.

        Raises OSError if the program cannot be started or, when reading, if
        it has already exited with a nonzero exit code.
        """
        if self._threads != 0 and self._threads_flag is not None:
            self._program_args += [f"{self._threads_flag}{self._threads}"]

        # Setting close_fds to True in the Popen arguments is necessary due to
        # <http://bugs.python.org/issue12786>.
        # However, close_fds is not supported on Windows. See
        # <https://github.com/marcelm/cutadapt/issues/315>.
        close_fds = False
        if sys.platform != "win32":
            close_fds = True

        self.in_pipe = None
        self.in_thread = None
        self._feeding = True
        # Must exist before the feeder thread is started, because _feed_pipe
        # reads it when it encounters a BrokenPipeError.
        self._process_explicitly_terminated = False
        if "r" in self._mode:
            self._program_args += ["-c", "-d"]  # type: ignore
            stdout = subprocess.PIPE
        else:
            if self._compresslevel is not None:
                self._program_args += ["-" + str(self._compresslevel)]
            stdout = self.fileobj  # type: ignore
        try:
            self.process = subprocess.Popen(
                self._program_args,
                stderr=self._stderr,
                stdout=stdout,
                stdin=subprocess.PIPE,
                close_fds=close_fds,
            )  # type: ignore
        except OSError:
            if self.closefd:
                self.fileobj.close()
            raise
        assert self.process.stdin is not None
        if "r" in self._mode:
            self.in_pipe = self.process.stdin
            # A python subprocess can read and write from pipes, but not from
            # Python in-memory objects. In order for a program to read from an
            # in-memory object, a pipe must be created. This pipe must be fed
            # data from the in-memory object. This must be done in a separate
            # thread, because IO operations will block when the pipe is full
            # when writing, or empty when reading. Since the quantity of output
            # data generated by a certain amount of input data is unknown, the
            # only way to prevent a blocking application is to write
            # data continuously to the process stdin on another thread.
            self.in_thread = threading.Thread(target=self._feed_pipe)
            self.in_thread.start()
            self._file: BinaryIO = self.process.stdout  # type: ignore
            self._wait_for_output_or_process_exit()
            self._raise_if_error()
        else:
            self._file = self.process.stdin  # type: ignore

        _set_pipe_size_to_max(self._file.fileno())

    def __repr__(self):
        return (
            f"{self.__class__.__name__}"
            f"('{self.name}', mode='{self._mode}', "
            f"program='{' '.join(self._program_args)}', "
            f"threads={self._threads})"
        )

    def _feed_pipe(self):
        """
        Thread target: copy the source file object to the stdin of the program
        in chunks of BUFFER_SIZE until EOF or until feeding is switched off.
        """
        try:
            while self._feeding:
                chunk = self.fileobj.read(BUFFER_SIZE)
                if chunk == b"":
                    self.in_pipe.close()
                    return
                try:
                    self.in_pipe.write(chunk)
                except BrokenPipeError:
                    # A broken pipe is expected once close() has terminated
                    # the process; in any other case it is a real error.
                    if not self._process_explicitly_terminated:
                        raise
        finally:
            self.in_pipe.close()

    def write(self, arg: bytes) -> int:
        return self._file.write(arg)

    def read(self, *args) -> bytes:
        return self._file.read(*args)

    def readinto(self, *args):
        return self._file.readinto(*args)

    def readline(self, *args) -> bytes:
        return self._file.readline(*args)

    def seekable(self) -> bool:
        return self._file.seekable()

    def tell(self) -> int:
        return self._file.tell()

    def peek(self, n: Optional[int] = None):
        """
        Forward to the peek method of the underlying stream. With n=None the
        size argument is left out, so that None is never handed to a peek
        implementation that expects an integer.
        """
        if n is None:
            return self._file.peek()  # type: ignore
        return self._file.peek(n)  # type: ignore

    def seek(self, offset, whence=0) -> int:
        return self._file.seek(offset, whence)

    def close(self) -> None:
        """
        Shut down the process and release all resources.

        When reading, a process that is still running is terminated. Raises
        OSError if the process ended with an error that was not reported
        earlier and that is not covered by the allowed exit code/message.
        """
        if self.closed:
            return
        super().close()
        if not hasattr(self, "process"):
            # Exception was raised during __init__
            if hasattr(self, "_stderr"):
                self._stderr.close()
            return
        check_allowed_code_and_message = False
        if "r" in self._mode:
            retcode = self.process.poll()
            if retcode is None:
                # still running
                self._process_explicitly_terminated = True
                self.process.terminate()
                check_allowed_code_and_message = True
                self.process.wait()
            # Stop the feeder thread, consume what is left on stdout and wait
            # for the thread before closing the pipe.
            self._feeding = False
            self._file.read()
            if self.in_thread:
                self.in_thread.join()
            self._file.close()
        else:
            # Closing stdin signals end of input; then wait for the program
            # to finish writing its output.
            self._file.close()
            self.process.wait()
        if self.closefd:
            self.fileobj.close()
        stderr_message = self._read_error_message()
        self._stderr.close()
        if not self._error_raised:
            # Only check for errors if none have been found earlier.
            self._raise_if_error(check_allowed_code_and_message, stderr_message)

    def _wait_for_output_or_process_exit(self):
        """
        Wait for the process to produce at least some output, or has exited.
        """
        # The program may crash due to a non-existing file, internal error etc.
        # In that case we need to check. However the 'time-to-crash' differs
        # between programs. Some crash faster than others.
        # Therefore we peek the first character(s) of stdout. Peek will return at
        # least one byte of data, unless the buffer is empty or at EOF. If at EOF,
        # we should wait for the program to exit. This way we ensure the program
        # has at least decompressed some output, or stopped before we continue.

        # stdout is io.BufferedReader if set to PIPE
        while True:
            first_output = self.process.stdout.peek(1)
            exit_code = self.process.poll()
            if first_output or exit_code is not None:
                break
            time.sleep(0.01)

    def _raise_if_error(
        self, check_allowed_code_and_message: bool = False, stderr_message: bytes = b""
    ) -> None:
        """
        Raise OSError if process is not running anymore and the exit code is
        nonzero. If check_allowed_code_and_message is set, OSError is not raised when
        (1) the exit value of the process is equal to the value of the allowed_exit_code
        attribute or (2) the allowed_exit_message attribute is set and it matches with
        stderr_message.
        """
        retcode = self.process.poll()

        if sys.platform == "win32" and retcode == 1 and stderr_message == b"":
            # Special case for Windows. Winapi terminates processes with exit code 1
            # and an empty error message.
            return

        if retcode is None:
            # process still running
            return
        if retcode == 0:
            # process terminated successfully
            return

        if check_allowed_code_and_message:
            if retcode == self._allowed_exit_code:
                # terminated with allowed exit code
                return
            if self._allowed_exit_message and stderr_message.startswith(
                self._allowed_exit_message
            ):
                # terminated with another exit code, but message is allowed
                return

        if not stderr_message:
            stderr_message = self._read_error_message()

        self._file.close()
        # Remember that the error was reported so that close() does not
        # raise it a second time.
        self._error_raised = True
        raise OSError(f"{stderr_message!r} (exit code {retcode})")

    def _read_error_message(self):
        """
        Return everything the program wrote to stderr so far, or b"" if the
        temporary stderr file is already closed.
        """
        if self._stderr.closed:
            return b""
        self._stderr.flush()
        self._stderr.seek(0)
        return self._stderr.read()

    def __iter__(self):
        return self

    def __next__(self) -> bytes:
        return self._file.__next__()

    def readable(self):
        return self._file.readable()

    def writable(self):
        return self._file.writable()

    def flush(self) -> None:
        # Deliberately does nothing and returns None.
        return None
443 | 
444 | def _open_stdin_or_out(mode: str) -> BinaryIO:
445 |     assert mode in ("rb", "ab", "wb")
446 |     std = sys.stdin if mode == "rb" else sys.stdout
447 |     return open(std.fileno(), mode=mode, closefd=False)  # type: ignore
448 | 
449 | 
def _open_bz2(
    filename: FileOrPath,
    mode: str,
    compresslevel: Optional[int],
    threads: Optional[int],
):
    """
    Open a bzip2 file in binary mode ("rb", "ab" or "wb").

    Unless threads is 0, pbzip2 is tried first through
    _PipedCompressionProgram. If that raises OSError, or if threads is 0,
    the bz2 module is used instead. A compresslevel of None is replaced by
    XOPEN_DEFAULT_BZ2_COMPRESSION.
    """
    assert mode in ("rb", "ab", "wb")
    level = XOPEN_DEFAULT_BZ2_COMPRESSION if compresslevel is None else compresslevel

    use_external_program = threads != 0
    if use_external_program:
        # pbzip2 can compress using multiple cores.
        try:
            return _PipedCompressionProgram(
                filename,
                mode,
                level,
                threads=threads,
                program_settings=_PROGRAM_SETTINGS["pbzip2"],
            )
        except OSError:
            # Fall through to the in-process implementation.
            pass

    bz2_file = bz2.open(filename, mode, level)
    if "r" not in mode:
        # Buffer writes on bz2.open to mitigate overhead of small writes
        return io.BufferedWriter(bz2_file)  # type: ignore
    return bz2_file
477 | 
478 | 
def _open_xz(
    filename: FileOrPath,
    mode: str,
    compresslevel: Optional[int],
    threads: Optional[int],
):
    """
    Open an xz file in binary mode ("rb", "ab" or "wb").

    Unless threads is 0, the xz program is tried first through
    _PipedCompressionProgram. If that raises OSError, or if threads is 0,
    the lzma module is used instead. A compresslevel of None is replaced by
    XOPEN_DEFAULT_XZ_COMPRESSION.
    """
    assert mode in ("rb", "ab", "wb")
    level = XOPEN_DEFAULT_XZ_COMPRESSION if compresslevel is None else compresslevel

    use_external_program = threads != 0
    if use_external_program:
        # xz can compress using multiple cores.
        try:
            return _PipedCompressionProgram(
                filename, mode, level, threads, _PROGRAM_SETTINGS["xz"]
            )
        except OSError:
            # Fall through to the in-process implementation.
            pass

    if "r" not in mode:
        # Buffer writes on lzma.open to mitigate overhead of small writes
        xz_file = lzma.open(filename, mode, preset=level)
        return io.BufferedWriter(xz_file)  # type: ignore
    return lzma.open(filename, mode)
506 | 
507 | 
def _open_zst(
    filename: FileOrPath,
    mode: str,
    compresslevel: Optional[int],
    threads: Optional[int],
):
    """
    Open a zstandard file in binary mode ("rb", "ab" or "wb").

    Unless threads is 0, the zstd program is tried first through
    _PipedCompressionProgram. If that raises OSError, the zstandard module
    is used when it is available; otherwise the OSError is re-raised. With
    threads set to 0 and no zstandard module, ImportError is raised.
    A compresslevel of None is replaced by XOPEN_DEFAULT_ZST_COMPRESSION.
    """
    assert mode in ("rb", "ab", "wb")
    assert compresslevel != 0
    level = XOPEN_DEFAULT_ZST_COMPRESSION if compresslevel is None else compresslevel
    max_window_bits = zstandard.WINDOWLOG_MAX if zstandard else 31

    if threads != 0:
        # zstd can compress using multiple cores
        zstd_command: Tuple[str, ...] = ("zstd",)
        if "r" in mode:
            # Only use --long=31 for decompression. Using it for
            # compression overrides level settings for window size and
            # forces other zstd users to use `--long=31` to decompress any
            # archive that has been compressed by xopen.
            zstd_command = zstd_command + (f"--long={max_window_bits}",)
        try:
            settings = _ProgramSettings(zstd_command, tuple(range(1, 20)), "-T")
            return _PipedCompressionProgram(filename, mode, level, threads, settings)
        except OSError:
            if zstandard is None:
                # No fallback available
                raise

    if zstandard is None:
        raise ImportError("zstandard module (python-zstandard) not available")
    decompressor = zstandard.ZstdDecompressor(max_window_size=2**max_window_bits)
    compressor = zstandard.ZstdCompressor(level=level)
    raw = zstandard.open(filename, mode, cctx=compressor, dctx=decompressor)  # type: ignore
    # "rb" gets a buffered reader; "ab" and "wb" get a buffered writer.
    buffered_class = io.BufferedReader if mode == "rb" else io.BufferedWriter
    return buffered_class(raw)
552 | 
553 | 
def _open_gz(
    filename: FileOrPath,
    mode: str,
    compresslevel: Optional[int],
    threads: Optional[int],
):
    """
    Open a gzip file. The ISA-L library is preferred when applicable because
    it is the fastest. Then zlib-ng which is not as fast, but supports all
    compression levels. After that comes pigz, which can utilize multiple
    threads and is more efficient than gzip, even on one core. gzip is chosen
    when none of the alternatives are available. Despite it being able to use
    only one core, it still finishes faster than using the builtin gzip library
    as the (de)compression is moved to another thread.

    mode must be "rb", "ab" or "wb". A compresslevel of None is replaced by
    XOPEN_DEFAULT_GZIP_COMPRESSION; any level outside 0-9 raises ValueError.
    With threads set to 0, or when no threaded backend or external program
    can be used, the file is opened with _open_reproducible_gzip.
    """
    assert mode in ("rb", "ab", "wb")
    if compresslevel is None:
        # Force the same compression level on every tool regardless of
        # library defaults
        compresslevel = XOPEN_DEFAULT_GZIP_COMPRESSION
    if compresslevel not in range(10):
        # Level 0-9 are supported regardless of backend support
        # (zlib_ng supports -1, pigz supports 11 etc.)
        raise ValueError(
            f"gzip compresslevel must be in range 0-9, got {compresslevel}."
        )

    if threads != 0:
        # Igzip level 0 does not output uncompressed deflate blocks as zlib does
        # and level 3 is slower but does not compress better than level 1 and 2.
        if igzip_threaded and (compresslevel in (1, 2) or "r" in mode):
            return igzip_threaded.open(  # type: ignore
                filename,
                mode,
                compresslevel,
                threads=1,
            )
        if gzip_ng_threaded and zlib_ng:
            return gzip_ng_threaded.open(
                filename,
                mode,
                # zlib-ng level 1 is 50% bigger than zlib level 1. Level
                # 2 gives a size close to expectations.
                compresslevel=2 if compresslevel == 1 else compresslevel,
                # Default to the available cores capped at four, the same
                # default _PipedCompressionProgram uses for writing. (This
                # used max(), which meant at least four threads and no cap.)
                threads=threads or min(_available_cpu_count(), 4),
            )

        for program in ("pigz", "gzip"):
            try:
                return _PipedCompressionProgram(
                    filename,
                    mode,
                    compresslevel,
                    threads,
                    _PROGRAM_SETTINGS[program],
                )
            # ValueError when compresslevel is not supported. i.e. gzip and level 0
            except (OSError, ValueError):
                pass  # Try the next program, then fall back to no subprocess.
    return _open_reproducible_gzip(filename, mode=mode, compresslevel=compresslevel)
614 | 
615 | 
def _open_reproducible_gzip(filename, mode: str, compresslevel: int):
    """
    Open a gzip file in binary mode ("rb", "wb" or "ab") without external
    processes. The gzip file object is created with an empty file name and
    an mtime of 0 (equivalent to gzip --no-name).
    """
    assert mode in ("rb", "wb", "ab")
    assert compresslevel is not None
    fileobj, closefd = _file_or_path_to_binary_stream(filename, mode)
    reading = "r" in mode

    # Neither gzip.open nor igzip.open have an mtime option, and they will
    # always write the file name, so we need to open the file separately
    # and pass it to gzip.GzipFile/igzip.IGzipFile.
    common_args = {
        "fileobj": fileobj,
        "filename": "",
        "mode": mode,
        "mtime": 0,
    }

    # Igzip level 0 does not output uncompressed deflate blocks as zlib does
    # and level 3 is slower but does not compress better than level 1 and 2.
    if igzip is not None and (reading or compresslevel in (1, 2)):
        gzip_class = igzip.IGzipFile
        level = compresslevel
    elif gzip_ng is not None:
        # Zlib-ng level 1 creates much bigger files than zlib level 1.
        gzip_class = gzip_ng.GzipNGFile
        level = 2 if compresslevel == 1 else compresslevel
    else:
        gzip_class = gzip.GzipFile  # type: ignore
        level = compresslevel
    gzip_file = gzip_class(**common_args, compresslevel=level)

    # When (I)GzipFile is created with a fileobj instead of a filename,
    # the passed file object is not closed when (I)GzipFile.close()
    # is called. This forces it to be closed.
    if closefd:
        gzip_file.myfileobj = fileobj

    # From version 3.12 onwards, gzip is properly internally buffered for writing.
    needs_write_buffer = sys.version_info < (3, 12) and not reading
    if needs_write_buffer:
        return io.BufferedWriter(gzip_file)  # type: ignore
    return gzip_file
654 | 
655 | 
def _detect_format_from_content(filename: FileOrPath) -> Optional[str]:
    """
    Try to recognise the compression format from the first bytes of the
    content. Up to 6 bytes are requested, via peek() when the stream has it
    and otherwise via read() followed by a seek back to the old position.

    Returns "gz", "bz2", "xz" or "zst", or None when the stream is not
    readable, can neither be peeked nor seeked, or matches no signature.
    """
    # Signature references:
    # gz:  https://tools.ietf.org/html/rfc1952#page-6
    # bz2: https://en.wikipedia.org/wiki/List_of_file_signatures
    # xz:  https://tukaani.org/xz/xz-file-format.txt
    # zst: https://datatracker.ietf.org/doc/html/rfc8478#section-3.1.1
    signatures = (
        (b"\x1f\x8b", "gz"),
        (b"\x42\x5a\x68", "bz2"),
        (b"\xfd\x37\x7a\x58\x5a\x00", "xz"),
        (b"\x28\xb5\x2f\xfd", "zst"),
    )
    fileobj, closefd = _file_or_path_to_binary_stream(filename, "rb")
    try:
        if not fileobj.readable():
            return None
        if hasattr(fileobj, "peek"):
            head = fileobj.peek(6)
        elif hasattr(fileobj, "seekable") and fileobj.seekable():
            start = fileobj.tell()
            head = fileobj.read(6)
            fileobj.seek(start)
        else:
            return None

        for magic, format_name in signatures:
            if head[: len(magic)] == magic:
                return format_name
        return None
    finally:
        # Only close what was opened here.
        if closefd:
            fileobj.close()
690 | 
691 | 
692 | def _detect_format_from_extension(filename: Union[str, bytes]) -> Optional[str]:
693 |     """
694 |     Attempt to detect file format from the filename extension.
695 |     Return None if no format could be detected.
696 |     """
697 |     for ext in ("bz2", "xz", "gz", "zst"):
698 |         if isinstance(filename, bytes):
699 |             if filename.endswith(b"." + ext.encode()):
700 |                 return ext
701 |         else:
702 |             if filename.endswith("." + ext):
703 |                 return ext
704 |     return None
705 | 
706 | 
def _file_or_path_to_binary_stream(
    file_or_path: FileOrPath, binary_mode: str
) -> Tuple[BinaryIO, bool]:
    """
    Turn a path or a file-like object into a binary stream.

    Returns a pair (stream, closefd). closefd is True only when the stream
    was opened here from a path (str, bytes or an object with __fspath__).
    For a TextIOWrapper its underlying buffer is returned; any other object
    that has a readinto or write attribute is returned unchanged.
    Raises TypeError for everything else.
    """
    assert binary_mode in ("rb", "wb", "ab")

    is_path = isinstance(file_or_path, (str, bytes)) or hasattr(
        file_or_path, "__fspath__"
    )
    if is_path:
        path = os.fspath(file_or_path)  # type: ignore
        return open(path, binary_mode), True  # type: ignore

    if isinstance(file_or_path, io.TextIOWrapper):
        return file_or_path.buffer, False

    looks_file_like = any(
        hasattr(file_or_path, attribute) for attribute in ("readinto", "write")
    )
    if not looks_file_like:
        raise TypeError(
            f"Unsupported type for {file_or_path}, " f"{file_or_path.__class__.__name__}."
        )
    # Very lenient fallback for all filelike objects. If the filelike
    # object is not binary, this will crash at a later point.
    return file_or_path, False  # type: ignore
722 | 
723 | 
def _filepath_from_path_or_filelike(fileorpath: FileOrPath) -> str:
    """
    Return the file system path of *fileorpath*.

    If os.fspath() does not accept the object, its ``name`` attribute is
    used when that is a str or bytes (bytes are decoded). An empty string
    is returned when no path can be determined.
    """
    try:
        return os.fspath(fileorpath)  # type: ignore
    except TypeError:
        # Not path-like; look for a name on the file object instead.
        name = getattr(fileorpath, "name", None)
    if isinstance(name, str):
        return name
    if isinstance(name, bytes):
        return name.decode()
    return ""
736 | 
737 | 
738 | def _file_is_a_socket_or_pipe(filepath):
739 |     try:
740 |         mode = os.stat(filepath).st_mode
741 |         # Treat anything that is not a regular file as special
742 |         return not stat.S_ISREG(mode)
743 |     except (OSError, TypeError):  # Type error for unexpected types in stat.
744 |         return False
745 | 
746 | 
# Typing overload: text modes return an io.TextIOWrapper.
@overload
def xopen(
    filename: FileOrPath,
    mode: Literal["r", "w", "a", "rt", "wt", "at"] = ...,
    compresslevel: Optional[int] = ...,
    threads: Optional[int] = ...,
    *,
    encoding: str = ...,
    errors: Optional[str] = ...,
    newline: Optional[str] = ...,
    format: Optional[str] = ...,
) -> io.TextIOWrapper:
    ...


# Typing overload: binary modes return a binary stream.
@overload
def xopen(
    filename: FileOrPath,
    mode: Literal["rb", "wb", "ab"],
    compresslevel: Optional[int] = ...,
    threads: Optional[int] = ...,
    *,
    encoding: str = ...,
    errors: None = ...,
    newline: None = ...,
    format: Optional[str] = ...,
) -> BinaryIO:
    ...


def xopen(  # noqa: C901
    filename: FileOrPath,
    mode: Literal["r", "w", "a", "rt", "rb", "wt", "wb", "at", "ab"] = "r",
    compresslevel: Optional[int] = None,
    threads: Optional[int] = None,
    *,
    encoding: str = "utf-8",
    errors: Optional[str] = None,
    newline: Optional[str] = None,
    format: Optional[str] = None,
) -> IO:
    """
    A replacement for the "open" function that can also read and write
    compressed files transparently. The supported compression formats are gzip,
    bzip2, xz and zstandard. If the filename is '-', standard output (modes 'w'
    and 'a') or standard input (mode 'r') is returned. Filename can be a string
    or a file object. (See https://docs.python.org/3/glossary.html#term-file-object.)

    Unless *format* is given, the file format is chosen based on the file name
    extension:
    - .gz uses gzip compression
    - .bz2 uses bzip2 compression
    - .xz uses xz/lzma compression
    - .zst uses zstandard compression
    - otherwise, no compression is used

    When reading, if the extension does not identify a format, the format is
    detected from the contents.

    mode can be: 'rt', 'rb', 'at', 'ab', 'wt', or 'wb'. Also, the 't' can be omitted,
    so instead of 'rt', 'wt' and 'at', the abbreviations 'r', 'w' and 'a' can be used.

    compresslevel is the compression level for writing to gzip, bzip2, xz and
    zst files. It is not used for uncompressed files.
    If set to None, a default depending on the format is used
    (gzip: 6, xz: 6, zstd: 3; for bzip2 the value of XOPEN_DEFAULT_BZ2_COMPRESSION).

    When threads is None (the default), compressed file formats are read or
    written using a threaded Python library or a pipe to a subprocess running an
    external tool such as ``pigz``, ``pbzip2``, ``xz`` or ``zstd``.
    If the backend supports multiple threads, *threads* can be set to an int
    specifying the number of threads to use.
    If no such backend is available for the compression format, the file is
    opened calling the appropriate Python function
    (that is, no subprocess is spawned).

    Set threads to 0 to force opening the file without using a subprocess.

    encoding, errors and newline are used when opening a file in text mode.
    The parameters have the same meaning as in the built-in open function,
    except that the default encoding is always UTF-8 instead of the
    preferred locale encoding.

    format overrides the autodetection of input and output formats. This can be
    useful when compressed output needs to be written to a file without an
    extension. Possible values are "gz", "xz", "bz2", "zst".

    Raises ValueError for an unsupported mode or format.
    """
    if mode in ("r", "w", "a"):
        mode += "t"  # type: ignore
    if mode not in ("rt", "rb", "wt", "wb", "at", "ab"):
        raise ValueError("Mode '{}' not supported".format(mode))
    # Validate all arguments before opening anything, so that an invalid
    # format neither leaves an opened handle behind nor blocks on a FIFO.
    if format not in (None, "gz", "xz", "bz2", "zst"):
        raise ValueError(
            f"Format not supported: {format}. "
            f"Choose one of: 'gz', 'xz', 'bz2', 'zst'"
        )
    binary_mode = mode[0] + "b"
    filepath = _filepath_from_path_or_filelike(filename)

    # Open non-regular files such as pipes and sockets here to force opening
    # them once.
    if filename == "-":
        filename = _open_stdin_or_out(binary_mode)
    elif _file_is_a_socket_or_pipe(filename):
        filename = open(filename, binary_mode)  # type: ignore

    # An explicit format wins; otherwise use the extension and, for reading
    # only, fall back to inspecting the content.
    detected_format = format or _detect_format_from_extension(filepath)
    if detected_format is None and "r" in mode:
        detected_format = _detect_format_from_content(filename)

    if detected_format == "gz":
        opened_file = _open_gz(filename, binary_mode, compresslevel, threads)
    elif detected_format == "xz":
        opened_file = _open_xz(filename, binary_mode, compresslevel, threads)
    elif detected_format == "bz2":
        opened_file = _open_bz2(filename, binary_mode, compresslevel, threads)
    elif detected_format == "zst":
        opened_file = _open_zst(filename, binary_mode, compresslevel, threads)
    else:
        opened_file, _ = _file_or_path_to_binary_stream(filename, binary_mode)

    if "t" in mode:
        return io.TextIOWrapper(opened_file, encoding, errors, newline)
    return opened_file
870 | 


--------------------------------------------------------------------------------
/src/xopen/_version.pyi:
--------------------------------------------------------------------------------
1 | # The _version.py file is generated on installation. By including this stub,
2 | # we can run mypy without having to install the package.
3 | 
4 | version: str
5 | 


--------------------------------------------------------------------------------
/src/xopen/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pycompression/xopen/d1931cb5485f1f5a055edb35eec162d6a349722c/src/xopen/py.typed


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import random
 3 | import string
 4 | import pytest
 5 | 
 6 | from xopen import xopen
 7 | 
 8 | 
 9 | @pytest.fixture
10 | def create_large_file(tmp_path):
11 |     def _create_large_file(extension):
12 |         path = tmp_path / f"large{extension}"
13 |         random.seed(0)
14 |         chars = string.ascii_lowercase + "\n"
15 |         # Do not decrease this length. The generated file needs to have
16 |         # a certain length after compression to trigger some bugs
17 |         # (in particular, 512 kB is not sufficient).
18 |         random_text = "".join(random.choices(chars, k=1024 * 1024))
19 |         with xopen(path, "w") as f:
20 |             f.write(random_text)
21 |         return path
22 | 
23 |     return _create_large_file
24 | 
25 | 
@pytest.fixture
def create_truncated_file(create_large_file):
    """Return a factory producing a large file with its last 10 bytes cut off.

    The factory takes a file extension, creates the file through
    ``create_large_file`` and returns the path of the truncated file.
    """

    def _create_truncated_file(extension):
        large_file = create_large_file(extension)
        # The file holds (possibly compressed) binary data, so shorten it at
        # the byte level instead of opening it as ASCII text.
        os.truncate(large_file, os.stat(large_file).st_size - 10)
        return large_file

    return _create_truncated_file
35 | 


--------------------------------------------------------------------------------
/tests/file.txt:
--------------------------------------------------------------------------------
1 | Testing, testing ...
2 | The second line.
3 | 


--------------------------------------------------------------------------------
/tests/file.txt.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pycompression/xopen/d1931cb5485f1f5a055edb35eec162d6a349722c/tests/file.txt.bz2


--------------------------------------------------------------------------------
/tests/file.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pycompression/xopen/d1931cb5485f1f5a055edb35eec162d6a349722c/tests/file.txt.gz


--------------------------------------------------------------------------------
/tests/file.txt.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pycompression/xopen/d1931cb5485f1f5a055edb35eec162d6a349722c/tests/file.txt.xz


--------------------------------------------------------------------------------
/tests/file.txt.zst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pycompression/xopen/d1931cb5485f1f5a055edb35eec162d6a349722c/tests/file.txt.zst


--------------------------------------------------------------------------------
/tests/hello.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pycompression/xopen/d1931cb5485f1f5a055edb35eec162d6a349722c/tests/hello.gz


--------------------------------------------------------------------------------
/tests/only_zeroes.zst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pycompression/xopen/d1931cb5485f1f5a055edb35eec162d6a349722c/tests/only_zeroes.zst


--------------------------------------------------------------------------------
/tests/test_piped.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Tests for the PipedCompression classes
  3 | """
  4 | import gzip
  5 | import io
  6 | import os
  7 | import shutil
  8 | import sys
  9 | import pytest
 10 | from pathlib import Path
 11 | from itertools import cycle
 12 | 
 13 | from xopen import (
 14 |     xopen,
 15 |     _PipedCompressionProgram,
 16 |     _MAX_PIPE_SIZE,
 17 |     _PROGRAM_SETTINGS,
 18 |     _ProgramSettings,
 19 | )
 20 | 
 21 | extensions = ["", ".gz", ".bz2", ".xz", ".zst"]
 22 | 
 23 | try:
 24 |     import fcntl
 25 | 
 26 |     if not hasattr(fcntl, "F_GETPIPE_SZ") and sys.platform == "linux":
 27 |         setattr(fcntl, "F_GETPIPE_SZ", 1032)
 28 | except ImportError:
 29 |     fcntl = None
 30 | 
 31 | base = os.path.join(os.path.dirname(__file__), "file.txt")
 32 | files = [base + ext for ext in extensions]
 33 | TEST_DIR = Path(__file__).parent
 34 | CONTENT_LINES = [b"Testing, testing ...\n", b"The second line.\n"]
 35 | CONTENT = b"".join(CONTENT_LINES)
 36 | 
 37 | 
 38 | def available_gzip_programs():
 39 |     return [_PROGRAM_SETTINGS[prog] for prog in ("gzip", "pigz") if shutil.which(prog)]
 40 | 
 41 | 
 42 | def available_bzip2_programs():
 43 |     if shutil.which("pbzip2"):
 44 |         return [_PROGRAM_SETTINGS["pbzip2"]]
 45 |     return []
 46 | 
 47 | 
 48 | def available_xz_programs():
 49 |     if shutil.which("xz"):
 50 |         return [_PROGRAM_SETTINGS["xz"]]
 51 |     return []
 52 | 
 53 | 
 54 | def available_zstd_programs():
 55 |     if shutil.which("zstd"):
 56 |         return [_PROGRAM_SETTINGS["zstd"]]
 57 |     return []
 58 | 
 59 | 
 60 | PIPED_GZIP_PROGRAMS = available_gzip_programs()
 61 | PIPED_BZIP2_PROGRAMS = available_bzip2_programs()
 62 | PIPED_XZ_PROGRAMS = available_xz_programs()
 63 | PIPED_ZST_PROGRAMS = available_zstd_programs()
 64 | 
 65 | ALL_PROGRAMS_WITH_EXTENSION = (
 66 |     list(zip(PIPED_GZIP_PROGRAMS, cycle([".gz"])))
 67 |     + list(zip(PIPED_BZIP2_PROGRAMS, cycle([".bz2"])))
 68 |     + list(zip(PIPED_XZ_PROGRAMS, cycle([".xz"])))
 69 |     + list(zip(PIPED_ZST_PROGRAMS, cycle([".zst"])))
 70 | )
 71 | 
 72 | 
 73 | THREADED_PROGRAMS = [
 74 |     settings
 75 |     for settings in ALL_PROGRAMS_WITH_EXTENSION
 76 |     if "pbzip2" in settings[0].program_args or "pigz" in settings[0].program_args
 77 | ]
 78 | 
 79 | 
 80 | @pytest.fixture(params=PIPED_GZIP_PROGRAMS)
 81 | def gzip_writer(request):
 82 |     return request.param
 83 | 
 84 | 
 85 | @pytest.fixture(params=ALL_PROGRAMS_WITH_EXTENSION)
 86 | def reader(request):
 87 |     return request.param
 88 | 
 89 | 
 90 | @pytest.fixture(params=THREADED_PROGRAMS)
 91 | def threaded_reader(request):
 92 |     return request.param
 93 | 
 94 | 
 95 | @pytest.fixture(params=ALL_PROGRAMS_WITH_EXTENSION)
 96 | def writer(request):
 97 |     return request.param
 98 | 
 99 | 
def test_reader_readinto(reader):
    """readinto() fills an oversized buffer with exactly the file content."""
    settings, suffix = reader
    source = TEST_DIR / f"file.txt{suffix}"
    with _PipedCompressionProgram(source, "rb", program_settings=settings) as stream:
        buffer = bytearray(len(CONTENT) + 100)
        n_read = stream.readinto(buffer)
        assert n_read == len(CONTENT)
        assert buffer[:n_read] == CONTENT
110 | 
111 | 
def test_reader_textiowrapper(reader):
    """A piped reader can be wrapped in io.TextIOWrapper and read as text."""
    settings, suffix = reader
    source = TEST_DIR / f"file.txt{suffix}"
    expected_text = CONTENT.decode("utf-8")
    with _PipedCompressionProgram(source, "rb", program_settings=settings) as stream:
        text_stream = io.TextIOWrapper(stream, encoding="utf-8")
        assert text_stream.read() == expected_text
119 | 
120 | 
def test_reader_readline(reader):
    """readline() returns the first line of the test file."""
    settings, suffix = reader
    source = TEST_DIR / f"file.txt{suffix}"
    with _PipedCompressionProgram(source, "rb", program_settings=settings) as stream:
        first_line = stream.readline()
        assert first_line == CONTENT_LINES[0]
129 | 
130 | 
def test_reader_readlines(reader):
    """readlines() returns every line of the test file."""
    settings, suffix = reader
    source = TEST_DIR / f"file.txt{suffix}"
    with _PipedCompressionProgram(source, "rb", program_settings=settings) as stream:
        all_lines = stream.readlines()
        assert all_lines == CONTENT_LINES
137 | 
138 | 
@pytest.mark.parametrize("threads", [None, 1, 2])
def test_piped_reader_iter(threads, threaded_reader):
    """Iterating over a piped reader yields the first line for each thread count."""
    program_settings, extension = threaded_reader
    with _PipedCompressionProgram(
        TEST_DIR / f"file.txt{extension}",
        "rb",
        # Forward the parametrized value; previously it was ignored, so all
        # three parametrizations exercised the same configuration.
        threads=threads,
        program_settings=program_settings,
    ) as f:
        lines = list(f)
        assert lines[0] == CONTENT_LINES[0]
149 | 
150 | 
def test_writer(tmp_path, writer):
    """Data written through a piped program can be read back with xopen."""
    settings, suffix = writer
    target = tmp_path / f"out{suffix}"
    payload = b"hello"
    with _PipedCompressionProgram(target, mode="wb", program_settings=settings) as out:
        out.write(payload)
    with xopen(target, mode="rb") as inp:
        assert inp.read() == payload
160 | 
161 | 
def test_writer_has_iter_method(tmp_path, writer):
    """A piped writer exposes an __iter__ attribute."""
    settings, suffix = writer
    target = tmp_path / f"out{suffix}"
    with _PipedCompressionProgram(target, mode="wb", program_settings=settings) as out:
        out.write(b"hello")
        assert hasattr(out, "__iter__")
172 | 
173 | 
def test_reader_iter_without_with(reader):
    """Iteration works on a piped reader that is not used as a context manager."""
    program_settings, extension = reader
    f = _PipedCompressionProgram(
        TEST_DIR / f"file.txt{extension}", program_settings=program_settings
    )
    # Close explicitly even when the assertion fails so that no external
    # process is left behind.
    try:
        it = iter(f)
        assert CONTENT_LINES[0] == next(it)
    finally:
        f.close()
182 | 
183 | 
def test_reader_close(reader, create_large_file):
    """Leaving the context after reading one line of a large file must succeed."""
    settings, suffix = reader
    big_path = create_large_file(suffix)
    with _PipedCompressionProgram(big_path, "rb", program_settings=settings) as stream:
        stream.readline()
191 | 
192 | 
def test_invalid_gzip_compression_level(gzip_writer, tmp_path):
    """Compression level 17 is rejected with a ValueError for gzip programs."""
    target = tmp_path / "out.gz"
    with pytest.raises(ValueError) as excinfo:
        with _PipedCompressionProgram(
            target, mode="w", compresslevel=17, program_settings=gzip_writer
        ) as out:
            out.write(b"hello")  # pragma: no cover
    message = excinfo.value.args[0]
    assert "compresslevel must be" in message
203 | 
204 | 
def test_invalid_xz_compression_level(tmp_path):
    """Compression level 17 is rejected with a ValueError for xz."""
    target = tmp_path / "out.xz"
    xz_settings = _PROGRAM_SETTINGS["xz"]
    with pytest.raises(ValueError) as excinfo:
        with _PipedCompressionProgram(
            target, mode="w", compresslevel=17, program_settings=xz_settings
        ) as out:
            out.write(b"hello")  # pragma: no cover
    message = excinfo.value.args[0]
    assert "compresslevel must be" in message
215 | 
216 | 
def test_invalid_zstd_compression_level(tmp_path):
    """Compression level 25 is rejected with a ValueError for zstd."""
    target = tmp_path / "out.zst"
    zstd_settings = _PROGRAM_SETTINGS["zstd"]
    with pytest.raises(ValueError) as excinfo:
        with _PipedCompressionProgram(
            target, mode="w", compresslevel=25, program_settings=zstd_settings
        ) as out:
            out.write(b"hello")  # pragma: no cover
    message = excinfo.value.args[0]
    assert "compresslevel must be" in message
227 | 
228 | 
def test_readers_read(reader):
    """read() without arguments returns the whole decompressed content."""
    settings, suffix = reader
    source = TEST_DIR / f"file.txt{suffix}"
    with _PipedCompressionProgram(source, "rb", program_settings=settings) as stream:
        everything = stream.read()
        assert everything == CONTENT
235 | 
236 | 
@pytest.mark.skipif(
    not (hasattr(fcntl, "F_GETPIPE_SZ") and _MAX_PIPE_SIZE is not None),
    reason="Pipe size modifications not available on this platform.",
)
def test_pipesize_changed(tmp_path):
    """The pipe of a piped writer reports _MAX_PIPE_SIZE as its size."""
    target = tmp_path / "hello.gz"
    # Higher compression level to avoid opening with threaded opener
    with _PipedCompressionProgram(target, "wb", compresslevel=5) as out:
        pipe_size = fcntl.fcntl(out._file.fileno(), fcntl.F_GETPIPE_SZ)
        assert pipe_size == _MAX_PIPE_SIZE
245 | 
246 | 
def test_pipedcompressionwriter_wrong_mode(tmp_path):
    """Mode 'xb' is rejected with a ValueError naming the offending mode."""
    target = tmp_path / "test"
    with pytest.raises(ValueError, match="Mode is 'xb', but it must be"):
        _PipedCompressionProgram(target, "xb")
251 | 
252 | 
def test_pipedcompressionwriter_wrong_program(tmp_path):
    """Using a program name that does not exist raises OSError."""
    bogus_settings = _ProgramSettings(("XVXCLSKDLA",))
    target = tmp_path / "test"
    with pytest.raises(OSError):
        _PipedCompressionProgram(target, "wb", program_settings=bogus_settings)
258 | 
259 | 
def test_compression_level(tmp_path, gzip_writer):
    """Output written at compression level 2 is valid gzip data."""
    # Currently only the gzip writers handle compression levels.
    target = tmp_path / "test.gz"
    with _PipedCompressionProgram(target, "wb", 2, program_settings=gzip_writer) as out:
        out.write(b"test")
    compressed = target.read_bytes()
    assert gzip.decompress(compressed) == b"test"
268 | 
269 | 
def test_iter_method_writers(writer, tmp_path):
    """iter() on a piped writer returns the writer object itself."""
    program_settings, extension = writer
    writer = _PipedCompressionProgram(
        tmp_path / f"test{extension}", "wb", program_settings=program_settings
    )
    # Always close the writer so a failing assertion does not leak the
    # external compression process.
    try:
        assert iter(writer) == writer
    finally:
        writer.close()
277 | 
278 | 
def test_next_method_writers(writer, tmp_path):
    """next() on a piped writer raises io.UnsupportedOperation mentioning 'read'."""
    program_settings, extension = writer
    writer = _PipedCompressionProgram(
        tmp_path / f"test{extension}", "wb", program_settings=program_settings
    )
    # Always close the writer so a failing check does not leak the
    # external compression process.
    try:
        with pytest.raises(io.UnsupportedOperation) as error:
            next(writer)
        error.match("read")
    finally:
        writer.close()
288 | 
289 | 
def test_pipedcompressionprogram_wrong_mode():
    """Mode 'xb' is rejected with a ValueError naming the offending mode."""
    with pytest.raises(ValueError, match="Mode is 'xb', but it must be"):
        _PipedCompressionProgram("test", "xb")
294 | 
295 | 
def test_piped_compression_reader_peek_binary(reader):
    """peek(1) returns data that begins with the first byte of the file."""
    settings, suffix = reader
    source = TEST_DIR / f"file.txt{suffix}"
    with _PipedCompressionProgram(source, "rb", program_settings=settings) as stream:
        # peek() may hand back more than the requested amount depending on
        # the underlying stream, so only the prefix is checked.
        peeked = stream.peek(1)
        assert peeked.startswith(b"T")
305 | 
306 | 
@pytest.mark.skipif(
    sys.platform != "win32", reason="seeking only works on Windows for now"
)
def test_piped_compression_reader_seek_and_tell(reader):
    """After seeking back to the initial tell() position, data is re-read."""
    settings, suffix = reader
    source = TEST_DIR / f"file.txt{suffix}"
    with _PipedCompressionProgram(source, "rb", program_settings=settings) as stream:
        start = stream.tell()
        first_four = stream.read(4)
        assert first_four == b"Test"
        stream.seek(start)
        first_eight = stream.read(8)
        assert first_eight == b"Testing,"
318 | 
319 | 
@pytest.mark.parametrize("mode", ["r", "rb"])
def test_piped_compression_reader_peek_text(reader, mode):
    """peek(1) starts with the first content byte for modes 'r' and 'rb'."""
    settings, suffix = reader
    source = TEST_DIR / f"file.txt{suffix}"
    with _PipedCompressionProgram(source, mode, program_settings=settings) as stream:
        first_byte = stream.peek(1)[0]
        assert first_byte == CONTENT[0]
328 | 
329 | 
def writers_and_levels():
    """Generate (program settings, compression level) pairs for gzip programs."""
    for settings in PIPED_GZIP_PROGRAMS:
        args = settings.program_args
        if "gzip" in args:
            # Levels 1-9 are supported
            levels = range(1, 10)
        elif "pigz" in args:
            # Levels 0-9 + 11 are supported
            levels = [*range(10), 11]
        else:
            raise NotImplementedError(
                f"Test should be implemented for {settings}"
            )  # pragma: no cover
        for level in levels:
            yield settings, level
342 | 
343 | 
@pytest.mark.parametrize(["writer", "level"], writers_and_levels())
def test_valid_compression_levels(writer, level, tmp_path):
    """Each supported level produces output that gzip can decompress."""
    target = tmp_path / "test.gz"
    payload = b"test"
    with _PipedCompressionProgram(target, "wb", level, program_settings=writer) as out:
        out.write(payload)
    assert gzip.decompress(target.read_bytes()) == payload
350 | 
351 | 
def test_reproducible_gzip_compression(gzip_writer, tmp_path):
    """The gzip header has no FNAME flag set and its mtime bytes are zero."""
    target = tmp_path / "file.gz"
    with _PipedCompressionProgram(target, mode="wb", program_settings=gzip_writer) as out:
        out.write(b"hello")

    header = target.read_bytes()
    flags = header[3]
    mtime_field = header[4:8]
    assert not flags & gzip.FNAME, "gzip header contains file name"
    assert mtime_field == bytes(4), "gzip header contains mtime"
360 | 
361 | 
def test_piped_tool_fails_on_close(tmp_path):
    """A program that exits with status 5 makes the context raise OSError."""
    # The stand-in program consumes stdin, prints a newline and exits with 5.
    failing_script = "import sys\nfor line in sys.stdin: pass\nprint()\nsys.exit(5)"
    failing_settings = _ProgramSettings((sys.executable, "-c", failing_script))
    target = tmp_path / "out.txt"
    with pytest.raises(OSError) as excinfo:
        with _PipedCompressionProgram(
            target, "wb", program_settings=failing_settings
        ) as out:
            out.write(b"Hello")
    assert "exit code 5" in excinfo.value.args[0]
378 | 


--------------------------------------------------------------------------------
/tests/test_xopen.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Tests for the xopen.xopen function
  3 | """
  4 | import bz2
  5 | import subprocess
  6 | import sys
  7 | import tempfile
  8 | from contextlib import contextmanager
  9 | import functools
 10 | import gzip
 11 | import io
 12 | import lzma
 13 | import os
 14 | from pathlib import Path
 15 | import shutil
 16 | 
 17 | import pytest
 18 | 
 19 | from xopen import xopen, _detect_format_from_content
 20 | 
 21 | try:
 22 |     import zstandard
 23 | except ImportError:
 24 |     zstandard = None
 25 | 
 26 | 
 27 | # TODO this is duplicated in test_piped.py
 28 | TEST_DIR = Path(__file__).parent
 29 | CONTENT_LINES = ["Testing, testing ...\n", "The second line.\n"]
 30 | CONTENT = "".join(CONTENT_LINES)
 31 | extensions = ["", ".gz", ".bz2", ".xz"]
 32 | if shutil.which("zstd") or zstandard:
 33 |     extensions += [".zst"]
 34 | base = os.path.join(os.path.dirname(__file__), "file.txt")
 35 | files = [base + ext for ext in extensions]
 36 | 
 37 | 
 38 | @contextmanager
 39 | def disable_binary(tmp_path, binary_name):
 40 |     """
 41 |     Find the location of the binary by its name, then set PATH to a directory that contains
 42 |     the binary with permissions set to 000. If no suitable binary could be found,
 43 |     PATH is set to an empty directory
 44 |     """
 45 |     binary_path = shutil.which(binary_name)
 46 |     if binary_path:
 47 |         shutil.copy(binary_path, tmp_path)
 48 |         os.chmod(tmp_path / Path(binary_path).name, 0)
 49 |     path = os.environ["PATH"]
 50 |     try:
 51 |         os.environ["PATH"] = str(tmp_path)
 52 |         yield
 53 |     finally:
 54 |         os.environ["PATH"] = path
 55 | 
 56 | 
 57 | @pytest.fixture(params=extensions)
 58 | def ext(request):
 59 |     return request.param
 60 | 
 61 | 
 62 | @pytest.fixture(params=files)
 63 | def fname(request):
 64 |     return request.param
 65 | 
 66 | 
 67 | @pytest.fixture
 68 | def lacking_pigz_permissions(tmp_path):
 69 |     with disable_binary(tmp_path, "pigz"):
 70 |         yield
 71 | 
 72 | 
 73 | @pytest.fixture
 74 | def lacking_pbzip2_permissions(tmp_path):
 75 |     with disable_binary(tmp_path, "pbzip2"):
 76 |         yield
 77 | 
 78 | 
 79 | @pytest.fixture
 80 | def lacking_xz_permissions(tmp_path):
 81 |     with disable_binary(tmp_path, "xz"):
 82 |         yield
 83 | 
 84 | 
 85 | @pytest.fixture
 86 | def xopen_without_igzip(monkeypatch):
 87 |     import xopen  # xopen local overrides xopen global variable
 88 | 
 89 |     monkeypatch.setattr(xopen, "igzip", None)
 90 |     return xopen.xopen
 91 | 
 92 | 
 93 | def test_text(fname):
 94 |     with xopen(fname, "rt") as f:
 95 |         lines = list(f)
 96 |         assert len(lines) == 2
 97 |         assert lines[1] == "The second line.\n", fname
 98 | 
 99 | 
def test_binary(fname):
    """Binary mode yields two lines, the second being the expected one."""
    with xopen(fname, "rb") as handle:
        lines = [line for line in handle]
        assert len(lines) == 2
        assert lines[1] == b"The second line.\n", fname
105 | 
106 | 
@pytest.mark.parametrize("mode", ["b", "", "t"])
@pytest.mark.parametrize("threads", [None, 0])
def test_roundtrip(ext, tmp_path, threads, mode):
    """Data written with xopen is read back unchanged in the same mode."""
    if ext == ".zst" and threads == 0 and zstandard is None:
        # Report the case as skipped instead of silently passing.
        pytest.skip("No zstandard installed")
    path = tmp_path / f"file{ext}"
    # Binary mode needs bytes; the default and "t" modes need str.
    data = b"Hello" if mode == "b" else "Hello"
    with xopen(path, "w" + mode, threads=threads) as f:
        f.write(data)
    with xopen(path, "r" + mode, threads=threads) as f:
        assert f.read() == data
118 | 
119 | 
def test_binary_no_isal_no_threads(fname, xopen_without_igzip):
    """Binary reading works with igzip patched out and threads=0."""
    if fname.endswith(".zst") and zstandard is None:
        # Report the case as skipped instead of silently passing.
        pytest.skip("No zstandard installed")
    with xopen_without_igzip(fname, "rb", threads=0) as f:
        lines = list(f)
        assert len(lines) == 2
        assert lines[1] == b"The second line.\n", fname
127 | 
128 | 
def test_binary_no_isal(fname, xopen_without_igzip):
    """Binary reading works with igzip patched out and threads=1."""
    with xopen_without_igzip(fname, "rb", threads=1) as handle:
        lines = [line for line in handle]
        assert len(lines) == 2
        assert lines[1] == b"The second line.\n", fname
134 | 
135 | 
def test_no_context_manager_text(fname):
    """Text reading and close() work without a ``with`` statement."""
    f = xopen(fname, "rt")
    # Close in ``finally`` so a failing assertion does not leak the file
    # (or an external process behind it).
    try:
        lines = list(f)
        assert len(lines) == 2
        assert lines[1] == "The second line.\n", fname
    finally:
        f.close()
    assert f.closed
143 | 
144 | 
def test_no_context_manager_binary(fname):
    """Binary reading and close() work without a ``with`` statement."""
    f = xopen(fname, "rb")
    # Close in ``finally`` so a failing assertion does not leak the file
    # (or an external process behind it).
    try:
        lines = list(f)
        assert len(lines) == 2
        assert lines[1] == b"The second line.\n", fname
    finally:
        f.close()
    assert f.closed
152 | 
153 | 
def test_bytes_path(fname):
    """A UTF-8 encoded bytes path is accepted as the file name."""
    encoded_name = fname.encode("utf-8")
    with xopen(encoded_name, "rt") as handle:
        lines = [line for line in handle]
        assert len(lines) == 2
        assert lines[1] == "The second line.\n", fname
160 | 
161 | 
def test_readinto(fname):
    """readinto() fills an oversized buffer with exactly the file content."""
    expected = CONTENT.encode("utf-8")
    with xopen(fname, "rb") as handle:
        buffer = bytearray(len(expected) + 100)
        n_read = handle.readinto(buffer)
        assert n_read == len(expected)
        assert buffer[:n_read] == expected
169 | 
170 | 
def test_detect_format_from_content(ext):
    """Content detection returns the extension without its dot, or None."""
    sample = Path(__file__).parent / f"file.txt{ext}"
    with open(sample, "rb") as raw:
        detected = _detect_format_from_content(raw)
    if not ext:
        # The uncompressed file has no recognizable format.
        assert detected is None
        return
    assert ext[1:] == detected
178 | 
179 | 
def test_detect_file_format_from_content(ext, tmp_path):
    """A file whose name ends in '.test' is still read correctly."""
    original = TEST_DIR / f"file.txt{ext}"
    renamed = tmp_path / f"file.txt{ext}.test"
    shutil.copy(original, renamed)
    expected_first_line = CONTENT_LINES[0].encode("utf-8")
    with xopen(renamed, "rb") as handle:
        assert handle.readline() == expected_first_line
185 | 
186 | 
def test_readline(fname):
    """readline() in binary mode returns the first line as bytes."""
    with xopen(fname, "rb") as handle:
        line = handle.readline()
        assert line == CONTENT_LINES[0].encode("utf-8")
191 | 
192 | 
def test_readline_text(fname):
    """readline() in mode 'r' returns the first line as str."""
    with xopen(fname, "r") as handle:
        line = handle.readline()
        assert line == CONTENT_LINES[0]
196 | 
197 | 
def test_next(fname):
    """Calling next() twice returns the second line on the second call."""
    with xopen(fname, "rt") as handle:
        next(handle)  # discard the first line
        second = next(handle)
        assert second == "The second line.\n", fname
203 | 
204 | 
def test_has_iter_method(ext, tmp_path):
    """A file opened for writing exposes an __iter__ attribute."""
    target = tmp_path / f"out{ext}"
    with xopen(target, mode="w") as out:
        # Writing anything is not strictly necessary, but without it
        # pbzip2 causes a delay of one second.
        out.write("hello")
        assert hasattr(out, "__iter__")
212 | 
213 | 
def test_iter_without_with(fname):
    """Iteration works on a file that is not used as a context manager."""
    f = xopen(fname, "rt")
    # Close in ``finally`` so a failing assertion does not leak the file
    # (or an external process behind it).
    try:
        it = iter(f)
        assert CONTENT_LINES[0] == next(it)
    finally:
        f.close()
219 | 
220 | 
@pytest.mark.parametrize("extension", [".gz", ".bz2"])
def test_partial_iteration_closes_correctly(extension, create_large_file):
    """Closing after consuming a single line of a large file must succeed."""

    class LineReader:
        """Iterate over the text lines of a file opened in binary mode."""

        def __init__(self, file):
            self.file = xopen(file, "rb")

        def __iter__(self):
            text_lines = io.TextIOWrapper(self.file, encoding="utf-8")
            yield from text_lines

    line_reader = LineReader(create_large_file(extension))
    line_iterator = iter(line_reader)
    next(line_iterator)
    line_reader.file.close()
235 | 
236 | 
def test_nonexisting_file(ext):
    """Opening a missing file for reading raises OSError."""
    missing = "this-file-does-not-exist" + ext
    # IOError is an alias of OSError in Python 3.
    with pytest.raises(OSError):
        with xopen(missing):
            pass  # pragma: no cover
241 | 
242 | 
def test_write_to_nonexisting_dir(ext):
    """Opening a file for writing in a missing directory raises OSError."""
    unreachable = "this/path/does/not/exist/file.txt" + ext
    # IOError is an alias of OSError in Python 3.
    with pytest.raises(OSError):
        with xopen(unreachable, "w"):
            pass  # pragma: no cover
247 | 
248 | 
def test_invalid_mode(ext):
    """An unknown mode string raises ValueError."""
    # ``ext`` already contains its leading dot, so it is appended directly;
    # the previous "file.txt.{ext}" never referred to an existing test file.
    with pytest.raises(ValueError):
        with xopen(TEST_DIR / f"file.txt{ext}", mode="hallo"):
            pass  # pragma: no cover
253 | 
254 | 
def test_filename_invalid_type():
    """Passing an integer as the file name raises TypeError."""
    not_a_path = 123
    with pytest.raises(TypeError):
        with xopen(not_a_path, mode="r"):
            pass  # pragma: no cover
259 | 
260 | 
def test_invalid_compression_level(tmp_path):
    """Compression level 17 is rejected with a ValueError for .gz output."""
    target = tmp_path / "out.gz"
    with pytest.raises(ValueError) as excinfo:
        with xopen(target, mode="w", compresslevel=17) as out:
            out.write("hello")  # pragma: no cover
    message = excinfo.value.args[0]
    assert "compresslevel must be" in message
266 | 
267 | 
@pytest.mark.parametrize("ext", extensions)
@pytest.mark.parametrize("threads", (0, 1))
def test_append(ext, threads, tmp_path):
    """Two binary appends to the same file are read back as one joined line."""
    if ext == ".zst" and zstandard is None and threads == 0:
        pytest.skip("No zstandard installed")
    chunk = b"AB"
    target = tmp_path / f"the-file{ext}"
    for _ in range(2):
        with xopen(target, "ab", threads=threads) as out:
            out.write(chunk)
    expected = (chunk + chunk).decode("utf-8")
    with xopen(target, "r") as inp:
        # Keep only the last line that iteration produces.
        for appended in inp:
            pass
        assert appended == expected
285 | 
286 | 
@pytest.mark.parametrize("ext", extensions)
def test_append_text(ext, tmp_path):
    """Two text appends to the same file are read back as one joined line."""
    chunk = "AB"
    target = tmp_path / f"the-file{ext}"
    for _ in range(2):
        with xopen(target, "at") as out:
            out.write(chunk)
    expected = chunk + chunk
    with xopen(target, "rt") as inp:
        # Keep only the last line that iteration produces.
        for appended in inp:
            pass
        assert appended == expected
300 | 
301 | 
@pytest.mark.timeout(5)
@pytest.mark.parametrize("extension", [".gz", ".bz2", ".xz"])
def test_truncated_file(extension, create_truncated_file):
    """read() on a truncated file raises EOFError or OSError."""
    broken = create_truncated_file(extension)
    # IOError is an alias of OSError in Python 3.
    with pytest.raises((EOFError, OSError)):
        handle = xopen(broken, "r")
        handle.read()
        handle.close()  # pragma: no cover
310 | 
311 | 
@pytest.mark.timeout(5)
@pytest.mark.parametrize("extension", [".gz", ".bz2", ".xz"])
def test_truncated_iter(extension, create_truncated_file):
    """Iterating over a truncated file raises EOFError or OSError."""
    broken = create_truncated_file(extension)
    # IOError is an alias of OSError in Python 3.
    with pytest.raises((EOFError, OSError)):
        handle = xopen(broken, "r")
        for _line in handle:
            pass
        handle.close()  # pragma: no cover
321 | 
322 | 
@pytest.mark.timeout(5)
@pytest.mark.parametrize("extension", [".gz", ".bz2", ".xz"])
def test_truncated_with(extension, create_truncated_file):
    """read() inside a ``with`` block on a truncated file raises an error."""
    broken = create_truncated_file(extension)
    # IOError is an alias of OSError in Python 3.
    with pytest.raises((EOFError, OSError)):
        with xopen(broken, "r") as handle:
            handle.read()
330 | 
331 | 
@pytest.mark.timeout(5)
@pytest.mark.parametrize("extension", [".gz", ".bz2", ".xz"])
def test_truncated_iter_with(extension, create_truncated_file):
    """Iterating inside a ``with`` block on a truncated file raises an error."""
    broken = create_truncated_file(extension)
    # IOError is an alias of OSError in Python 3.
    with pytest.raises((EOFError, OSError)):
        with xopen(broken, "r") as handle:
            for _line in handle:
                pass
340 | 
341 | 
def test_bare_read_from_gz():
    """Reading hello.gz in text mode yields the string 'hello'."""
    with xopen(TEST_DIR / "hello.gz", "rt") as handle:
        text = handle.read()
        assert text == "hello"
346 | 
347 | 
@pytest.mark.parametrize("threads", [None, 0, 2])
def test_concatenated_gzip(tmp_path, threads):
    """A gzip file written in two separate steps is read as two lines."""
    target = tmp_path / "hello.gz"
    # First create the file, then append to it.
    for gzip_mode, word in (("wt", "Hello"), ("at", "world")):
        with gzip.open(target, mode=gzip_mode) as out:
            print(word, file=out)

    with xopen(target, threads=threads) as inp:
        lines = [line for line in inp]

    assert lines == ["Hello\n", "world\n"]
360 | 
361 | 
def test_read_no_threads(ext):
    """With threads=0, reading returns an instance of the expected class."""
    if ext == ".zst" and zstandard is None:
        # Report the case as skipped instead of silently passing.
        pytest.skip("No zstandard installed")
    klasses = {
        ".bz2": bz2.BZ2File,
        ".gz": gzip.GzipFile,
        ".xz": lzma.LZMAFile,
        ".zst": io.BufferedReader,
        "": io.BufferedReader,
    }
    klass = klasses[ext]
    with xopen(TEST_DIR / f"file.txt{ext}", "rb", threads=0) as f:
        assert isinstance(f, klass), f
375 | 
376 | 
def test_write_threads(tmp_path, ext):
    """Text written with threads=3 is read back unchanged."""
    # ``ext`` already contains its leading dot; the previous "out.{ext}"
    # produced names such as "out..gz" and "out." for the uncompressed case.
    path = tmp_path / f"out{ext}"
    with xopen(path, mode="w", threads=3) as f:
        f.write("hello")
    with xopen(path) as f:
        assert f.read() == "hello"
383 | 
384 | 
def test_write_pigz_threads_no_isal(tmp_path, xopen_without_igzip):
    """With igzip patched out, a .gz file written with threads=3 reads back."""
    target = tmp_path / "out.gz"
    payload = "hello"
    with xopen_without_igzip(target, mode="w", threads=3) as out:
        out.write(payload)
    with xopen_without_igzip(target) as inp:
        assert inp.read() == payload
391 | 
392 | 
def test_write_no_threads(tmp_path, ext):
    """With threads=0, writing uses the expected compression class.

    Nothing is asserted for the uncompressed case (empty extension).
    """
    if ext == ".zst":
        # If python-zstandard is not installed, we fall back to an external
        # process even when threads=0, so the class cannot be checked.
        pytest.skip("zst may use an external process even when threads=0")
    klasses = {
        ".bz2": bz2.BZ2File,
        ".gz": gzip.GzipFile,
        ".xz": lzma.LZMAFile,
        "": io.BufferedWriter,
    }
    klass = klasses[ext]
    with xopen(tmp_path / f"out{ext}", "wb", threads=0) as f:
        if ext:
            # The compression object may be wrapped in an io.BufferedWriter.
            target = f.raw if isinstance(f, io.BufferedWriter) else f
            assert isinstance(target, klass), f
412 | 
413 | 
def test_write_gzip_no_threads_no_isal(tmp_path, xopen_without_igzip):
    """With igzip patched out and threads=0, .gz output uses gzip.GzipFile."""
    with xopen_without_igzip(tmp_path / "out.gz", "wb", threads=0) as f:
        # From Python 3.12 on the GzipFile is expected directly; on older
        # versions it is expected as the ``raw`` attribute of the result.
        if sys.version_info >= (3, 12):
            assert isinstance(f, gzip.GzipFile), f
        else:
            assert isinstance(f.raw, gzip.GzipFile)
422 | 
423 | 
def test_write_stdout():
    """Closing the file returned for '-' leaves standard output usable."""
    out = xopen("-", mode="w")
    print("Hello", file=out)
    out.close()
    # A subsequent print must still work, i.e. stdout was not closed.
    print("Still there?")
430 | 
431 | 
def test_write_stdout_contextmanager():
    """Leaving a ``with`` block for '-' leaves standard output usable."""
    with xopen("-", mode="w") as out:
        print("Hello", file=out)
    # A subsequent print must still work, i.e. stdout was not closed.
    print("Still there?")
438 | 
439 | 
def test_read_pathlib(fname):
    """A pathlib.Path is accepted for reading in text mode."""
    with xopen(Path(fname), mode="rt") as handle:
        text = handle.read()
        assert text == CONTENT
444 | 
445 | 
def test_read_pathlib_binary(fname):
    """A pathlib.Path is accepted for reading in binary mode."""
    expected = bytes(CONTENT, "ascii")
    with xopen(Path(fname), mode="rb") as handle:
        assert handle.read() == expected
450 | 
451 | 
def test_write_pathlib(ext, tmp_path):
    """Text written to a pathlib.Path target can be read back unchanged."""
    target = tmp_path / f"hello.txt{ext}"
    with xopen(target, mode="wt") as writer:
        writer.write("hello")
    with xopen(target, mode="rt") as reader:
        roundtripped = reader.read()
    assert roundtripped == "hello"
458 | 
459 | 
def test_write_pathlib_binary(ext, tmp_path):
    """Bytes written to a pathlib.Path target can be read back unchanged."""
    target = tmp_path / f"hello.txt{ext}"
    with xopen(target, mode="wb") as writer:
        writer.write(b"hello")
    with xopen(target, mode="rb") as reader:
        roundtripped = reader.read()
    assert roundtripped == b"hello"
466 | 
467 | 
def test_falls_back_to_gzip_open(lacking_pigz_permissions):
    # NOTE(review): the fixture presumably makes pigz unusable; see conftest.
    first_line = CONTENT_LINES[0].encode("utf-8")
    with xopen(TEST_DIR / "file.txt.gz", "rb") as handle:
        assert handle.readline() == first_line
471 | 
472 | 
def test_falls_back_to_gzip_open_no_isal(lacking_pigz_permissions, xopen_without_igzip):
    # NOTE(review): the fixtures presumably disable pigz and igzip; see conftest.
    first_line = CONTENT_LINES[0].encode("utf-8")
    with xopen_without_igzip(TEST_DIR / "file.txt.gz", "rb") as handle:
        assert handle.readline() == first_line
476 | 
477 | 
def test_fals_back_to_gzip_open_write_no_isal(
    lacking_pigz_permissions, xopen_without_igzip, tmp_path
):
    """Output written through the fallback path is valid gzip data."""
    target = tmp_path / "test.gz"
    with xopen_without_igzip(target, "wb") as writer:
        writer.write(b"hello")
    # Decompress with the stdlib to verify the file independently of xopen.
    compressed = target.read_bytes()
    assert gzip.decompress(compressed) == b"hello"
485 | 
486 | 
def test_falls_back_to_bzip2_open(lacking_pbzip2_permissions):
    # NOTE(review): the fixture presumably makes pbzip2 unusable; see conftest.
    first_line = CONTENT_LINES[0].encode("utf-8")
    with xopen(TEST_DIR / "file.txt.bz2", "rb") as handle:
        assert handle.readline() == first_line
490 | 
491 | 
def test_falls_back_to_lzma_open(lacking_xz_permissions):
    # NOTE(review): the fixture presumably makes xz unusable; see conftest.
    first_line = CONTENT_LINES[0].encode("utf-8")
    with xopen(TEST_DIR / "file.txt.xz", "rb") as handle:
        assert handle.readline() == first_line
495 | 
496 | 
def test_open_many_writers(tmp_path, ext):
    """Many writers can be held open at once and closed afterwards."""
    # Because lzma.open allocates a lot of memory,
    # open fewer files to avoid MemoryError on 32-bit architectures
    writer_count = 20 if ext == ".xz" else 60
    writers = []
    for index in range(1, writer_count + 1):
        writer = xopen(tmp_path / f"{index:03d}.txt{ext}", "wb", threads=2)
        writer.write(b"hello")
        writers.append(writer)
    # Close only after all writers have been opened and written to.
    for writer in writers:
        writer.close()
509 | 
510 | 
def test_override_output_format(tmp_path):
    """format="gz" yields gzip output even without a .gz file extension."""
    target = tmp_path / "test_gzip_compressed"
    with xopen(target, mode="wb", format="gz") as writer:
        writer.write(b"test")
    raw = target.read_bytes()
    assert raw[:2] == b"\x1f\x8b"  # Gzip magic
    assert gzip.decompress(raw) == b"test"
518 | 
519 | 
def test_override_output_format_unsupported_format(tmp_path):
    """An unknown format raises ValueError naming the offending format."""
    target = tmp_path / "test_fairy_format_compressed"
    with pytest.raises(ValueError) as excinfo:
        xopen(target, mode="wb", format="fairy")
    # Both fragments must appear in the error message.
    for fragment in ("not supported", "fairy"):
        excinfo.match(fragment)
526 | 
527 | 
def test_override_output_format_wrong_format(tmp_path):
    """Reading plain text as format="gz" fails with an OSError."""
    plain = tmp_path / "not_compressed"
    plain.write_text("I am not compressed.", encoding="utf-8")
    # BadGzipFile is a subclass of OSError
    with pytest.raises(OSError), xopen(plain, "rt", format="gz") as opened_file:
        opened_file.read()
534 | 
535 | 
# Openers used to parametrize tests: the default (threaded) xopen and a
# variant with threads=0 (non-threaded).
OPENERS = (
    xopen,
    functools.partial(xopen, threads=0),
)
538 | 
539 | 
@pytest.mark.parametrize("opener", OPENERS)
@pytest.mark.parametrize("extension", extensions)
def test_text_encoding_newline_passthrough(opener, extension, tmp_path):
    """The encoding and newline arguments are honoured when reading text."""
    if zstandard is None and extension == ".zst":
        return
    # "Eén ree\rTwee reeën\r": latin-1 encoded, with \r as line separator.
    raw_bytes = b"E\xe9n ree\rTwee ree\xebn\r"
    target = tmp_path / f"test.txt{extension}"
    with opener(target, "wb") as writer:
        writer.write(raw_bytes)
    with opener(target, "rt", encoding="latin-1", newline="\r") as reader:
        decoded = reader.read()
    assert decoded == "Eén ree\rTwee reeën\r"
553 | 
554 | 
@pytest.mark.parametrize("opener", OPENERS)
@pytest.mark.parametrize("extension", extensions)
def test_text_encoding_errors(opener, extension, tmp_path):
    """The errors argument is honoured when decoding text."""
    if zstandard is None and extension == ".zst":
        return
    # "Eén ree\nTwee reeën\n" latin-1 encoded. This is not valid ascii.
    raw_bytes = b"E\xe9n ree\nTwee ree\xebn\n"
    target = tmp_path / f"test.txt{extension}"
    with opener(target, "wb") as writer:
        writer.write(raw_bytes)
    with opener(target, "rt", encoding="ascii", errors="replace") as reader:
        decoded = reader.read()
    # Each non-ascii byte is expected to come back as the replacement character.
    assert decoded == "E�n ree\nTwee ree�n\n"
568 | 
569 | 
@pytest.mark.parametrize("compresslevel", [1, 6])
def test_gzip_compression_is_reproducible_without_piping(tmp_path, compresslevel):
    """The gzip header written with threads=0 has no file name and no mtime."""
    # compresslevel 1 should give us igzip and 6 should give us regular gzip
    target = tmp_path / "test.gz"
    with xopen(target, mode="wb", compresslevel=compresslevel, threads=0) as writer:
        writer.write(b"hello")
    header = target.read_bytes()
    # Byte 3 holds the flag bits; bytes 4..7 hold the modification time.
    flags = header[3]
    mtime_field = header[4:8]
    assert (flags & gzip.FNAME) == 0, "gzip header contains file name"
    assert mtime_field == b"\0\0\0\0", "gzip header contains mtime"
579 | 
580 | 
def test_read_devnull():
    """Opening the null device with default arguments must not raise."""
    null_device = os.devnull
    with xopen(null_device):
        pass
584 | 
585 | 
def test_xopen_zst_fails_when_zstandard_not_available(monkeypatch):
    """Reading .zst with threads=0 raises ImportError without zstandard."""
    # Import the module itself (not the xopen function) so that its
    # "zstandard" attribute can be patched.
    import xopen as xopen_module

    monkeypatch.setattr(xopen_module, "zstandard", None)
    zst_path = TEST_DIR / "file.txt.zst"
    with pytest.raises(ImportError):
        with xopen_module.xopen(zst_path, mode="rb", threads=0) as handle:
            handle.read()
593 | 
594 | 
@pytest.mark.parametrize("threads", (0, 1))
def test_xopen_zst_long_window_size(threads):
    """A zstd file compressed with --long=31 can be read."""
    # threads=0 needs the zstandard module, threads=1 needs the zstd binary.
    lacks_python_zstd = threads == 0 and zstandard is None
    lacks_zstd_binary = threads == 1 and not shutil.which("zstd")
    if lacks_python_zstd or lacks_zstd_binary:
        return
    # File created with:
    # cat /dev/zero | head -c 2147483648 > only_zeroes
    # Then compressed with
    # zstd --long=31 -19 only_zeroes
    zst_path = Path(__file__).parent / "only_zeroes.zst"
    with xopen(zst_path, "rb", threads=threads) as handle:
        chunk = handle.read(1024)
    assert chunk == bytes(1024)
609 | 
610 | 
@pytest.mark.parametrize("threads", (0, 1))
@pytest.mark.parametrize("ext", extensions)
def test_pass_file_object_for_reading(ext, threads):
    """xopen can read from an already opened binary file object."""
    if zstandard is None and ext == ".zst":
        return

    source = TEST_DIR / f"file.txt{ext}"
    with open(source, "rb") as fh, xopen(fh, mode="rb", threads=threads) as f:
        assert f.readline() == CONTENT_LINES[0].encode("utf-8")
620 | 
621 | 
@pytest.mark.parametrize("threads", (0, 1))
@pytest.mark.parametrize("ext", extensions)
def test_pass_file_object_for_writing(tmp_path, ext, threads):
    """xopen can write through an already opened binary file object.

    The data is written via the file object and read back via the path.
    """
    if ext == ".zst" and zstandard is None:
        return
    first_line = CONTENT_LINES[0].encode("utf-8")
    # The f-string is required: without it every parametrization would
    # write to a file literally named "out{ext}" and the extension would
    # never be exercised.
    path = tmp_path / f"out{ext}"
    with open(path, "wb") as fh:
        with xopen(fh, "wb", threads=threads) as f:
            f.write(first_line)
    with xopen(path, "rb", threads=threads) as f:
        assert f.readline() == first_line
633 | 
634 | 
@pytest.mark.parametrize("threads", (0, 1))
@pytest.mark.parametrize("ext", extensions)
def test_pass_bytesio_for_reading_and_writing(ext, threads):
    """Data round-trips through an in-memory BytesIO, which stays open."""
    if zstandard is None and ext == ".zst":
        return
    # Extension without the leading dot; None when there is no extension.
    fmt = ext[1:] or None
    buffer = io.BytesIO()
    first_line = CONTENT_LINES[0].encode("utf-8")
    writer = xopen(buffer, "wb", format=fmt, threads=threads)
    writer.write(first_line)
    # Only close the writer when it is a separate object from the buffer.
    if writer is not buffer:
        writer.close()
    assert not buffer.closed
    buffer.seek(0)
    with xopen(buffer, "rb", format=fmt, threads=threads) as reader:
        assert reader.readline() == first_line
653 | 
654 | 
@pytest.mark.parametrize("threads", (0, 1))
def test_xopen_stdin(monkeypatch, ext, threads):
    """Opening "-" for reading reads from the patched sys.stdin."""
    if zstandard is None and ext == ".zst":
        return
    source = TEST_DIR / f"file.txt{ext}"
    # Add encoding to suppress encoding warnings
    with open(source, "rt", encoding="latin-1") as fake_stdin:
        monkeypatch.setattr("sys.stdin", fake_stdin)
        with xopen("-", "rt", threads=threads) as reader:
            text = reader.read()
        assert text == CONTENT
665 | 
666 | 
def test_xopen_stdout(monkeypatch):
    """Opening "-" for writing writes to the patched sys.stdout."""
    # Add encoding to suppress encoding warnings
    with tempfile.TemporaryFile(mode="w+t", encoding="latin-1") as fake_stdout:
        monkeypatch.setattr("sys.stdout", fake_stdout)
        with xopen("-", "wt") as writer:
            writer.write("Hello world!")
        # Rewind the stand-in stdout and check what arrived in it.
        fake_stdout.seek(0)
        captured = fake_stdout.read()
    assert captured == "Hello world!"
676 | 
677 | 
@pytest.mark.parametrize("threads", (0, 1))
def test_xopen_read_from_pipe(ext, threads):
    """xopen can read from the stdout pipe of a child process."""
    if zstandard is None and ext == ".zst":
        return
    source = TEST_DIR / f"file.txt{ext}"
    # "cat" streams the test file into the pipe.
    child = subprocess.Popen(("cat", str(source)), stdout=subprocess.PIPE)
    with xopen(child.stdout, "rt", threads=threads) as reader:
        text = reader.read()
    child.wait()
    child.stdout.close()
    assert text == CONTENT
689 | 
690 | 
@pytest.mark.parametrize("threads", (0, 1))
def test_xopen_write_to_pipe(threads, ext):
    """Data written to a child's stdin pipe is read back from its stdout."""
    if zstandard is None and ext == ".zst":
        return
    # Extension without leading dots; None when there is no extension.
    fmt = ext.lstrip(".") or None
    # "cat" copies its stdin to its stdout unchanged.
    child = subprocess.Popen(("cat",), stdout=subprocess.PIPE, stdin=subprocess.PIPE)
    with xopen(child.stdin, "wt", threads=threads, format=fmt) as writer:
        writer.write(CONTENT)
    child.stdin.close()
    with xopen(child.stdout, "rt", threads=threads) as reader:
        text = reader.read()
    child.wait()
    child.stdout.close()
    assert text == CONTENT
707 | 
708 | 
@pytest.mark.skipif(
    not os.path.exists("/dev/stdin"), reason="/dev/stdin does not exist"
)
@pytest.mark.parametrize("threads", (0, 1))
def test_xopen_dev_stdin_read(threads, ext):
    """xopen can read piped data through the /dev/stdin path.

    A shell pipeline feeds the test file into a child interpreter that
    opens "/dev/stdin" with xopen and prints what it read.
    """
    import shlex

    if ext == ".zst" and zstandard is None:
        return
    file = str(Path(__file__).parent / f"file.txt{ext}")
    script = (
        "import xopen; "
        f'f=xopen.xopen("/dev/stdin", "rt", threads={threads});print(f.read())'
    )
    # Quote every interpolated piece so paths containing spaces or shell
    # metacharacters work, and run the interpreter executing this test
    # instead of whatever "python" happens to be first on PATH.
    command = (
        f"cat {shlex.quote(file)} | "
        f"{shlex.quote(sys.executable)} -c {shlex.quote(script)}"
    )
    result = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        encoding="ascii",
    )
    # print() in the child appends one newline to the file content.
    assert result.stdout == CONTENT + "\n"
725 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = black,flake8,mypy,py39,py310,py311,py312,py313,pypy3
 3 | isolated_build = True
 4 | 
 5 | [testenv]
 6 | deps =
 7 |     pytest
 8 |     pytest-timeout
 9 |     coverage
10 | setenv =
11 |     PYTHONDEVMODE = 1
12 |     PYTHONWARNDEFAULTENCODING = 1
13 | commands =
14 |     coverage run --branch --source=xopen,tests -m pytest -v --doctest-modules tests
15 |     coverage report
16 |     coverage xml
17 |     coverage html
18 | 
19 | [testenv:zstd]
20 | deps =
21 |     {[testenv]deps}
22 |     zstandard
23 | 
24 | [testenv:no-libs]
25 | commands=
26 |     pip uninstall -y isal zlib-ng
27 |     {[testenv]commands}
28 | 
29 | [testenv:black]
30 | basepython = python3.10
31 | deps = black==22.3.0
32 | skip_install = true
33 | commands = black --check src/ tests/
34 | 
35 | [testenv:flake8]
36 | basepython = python3.10
37 | deps = flake8
38 | commands = flake8 src/ tests/
39 | skip_install = true
40 | 
41 | [testenv:mypy]
42 | basepython = python3.10
43 | deps = mypy
44 | commands = mypy src/
45 | 
46 | [flake8]
47 | max-line-length = 99
48 | max-complexity = 10
49 | extend_ignore = E731
50 | 
51 | [coverage:report]
52 | exclude_lines =
53 |     pragma: no cover
54 |     def __repr__
55 |     @overload
56 | 


--------------------------------------------------------------------------------