├── .codecov.yml ├── .editorconfig ├── .gitattributes ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.rst ├── pyproject.toml ├── src └── xopen │ ├── __init__.py │ ├── _version.pyi │ └── py.typed ├── tests ├── conftest.py ├── file.txt ├── file.txt.bz2 ├── file.txt.gz ├── file.txt.xz ├── file.txt.zst ├── hello.gz ├── only_zeroes.zst ├── test_piped.py └── test_xopen.py └── tox.ini /.codecov.yml: -------------------------------------------------------------------------------- 1 | comment: off 2 | 3 | codecov: 4 | require_ci_to_pass: no 5 | 6 | coverage: 7 | precision: 1 8 | round: down 9 | range: "70...100" 10 | 11 | status: 12 | project: yes 13 | patch: no 14 | changes: no 15 | 16 | comment: off 17 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | [*.py] 2 | charset=utf-8 3 | end_of_line=lf 4 | insert_final_newline=true 5 | indent_style=space 6 | indent_size=4 7 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | tests/file.txt eol=lf 2 | tests/file.txt.test eol=lf 3 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | # Run for PRs only if they come from a forked repo (avoids duplicate runs) 8 | if: >- 9 | github.event_name != 'pull_request' || 10 | github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name 11 | timeout-minutes: 10 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: ["3.10"] 16 | toxenv: [black, flake8, mypy] 17 | steps: 18 | - uses: actions/checkout@v4 19 | - 
name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: python -m pip install tox 25 | - name: Run tox ${{ matrix.toxenv }} 26 | run: tox -e ${{ matrix.toxenv }} 27 | 28 | test: 29 | if: >- 30 | github.event_name != 'pull_request' || 31 | github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name 32 | timeout-minutes: 10 33 | runs-on: ${{ matrix.os }} 34 | strategy: 35 | matrix: 36 | os: [ubuntu-latest] 37 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "pypy-3.9"] 38 | optional-deps: [true] 39 | with-libs: [true] 40 | include: 41 | - os: macos-latest 42 | python-version: "3.10" 43 | optional-deps: true 44 | - os: ubuntu-latest 45 | python-version: "3.10" 46 | with-libs: false 47 | optional-deps: false 48 | - os: ubuntu-latest 49 | python-version: "3.10" 50 | with-libs: false 51 | optional-deps: true 52 | - os: ubuntu-latest 53 | python-version: "3.10" 54 | optional-deps: false 55 | with-libs: false 56 | with-zstandard: true 57 | - os: windows-latest 58 | python-version: "3.10" 59 | steps: 60 | - name: Install optional tools macOS 61 | if: runner.os == 'macOS' && matrix.optional-deps 62 | run: brew install pigz pbzip2 isa-l zstd 63 | - name: Install optional tools Linux 64 | if: runner.os == 'Linux' && matrix.optional-deps 65 | run: sudo apt-get install pigz pbzip2 isal zstd 66 | - name: Remove xz 67 | if: runner.os == 'Linux' && !matrix.optional-deps 68 | run: while which xz; do sudo rm $(which xz); done 69 | - uses: actions/checkout@v4 70 | with: 71 | fetch-depth: 0 72 | - name: Set up Python ${{ matrix.python-version }} 73 | uses: actions/setup-python@v4 74 | with: 75 | python-version: ${{ matrix.python-version }} 76 | - name: Install dependencies 77 | run: python -m pip install tox 78 | - name: Test 79 | run: tox -e py 80 | if: matrix.with-libs 81 | - name: Test without python-isal and 
python-zlib-ng 82 | run: tox -e no-libs 83 | if: true && !matrix.with-libs 84 | - name: Test with zstandard 85 | if: matrix.with-zstandard 86 | run: tox -e zstd 87 | - name: Upload coverage report 88 | uses: codecov/codecov-action@v3 89 | 90 | deploy: 91 | timeout-minutes: 10 92 | runs-on: ubuntu-latest 93 | needs: [lint, test] 94 | if: startsWith(github.ref, 'refs/tags') 95 | steps: 96 | - uses: actions/checkout@v4 97 | with: 98 | fetch-depth: 0 # required for setuptools_scm 99 | - name: Set up Python 100 | uses: actions/setup-python@v4 101 | with: 102 | python-version: "3.10" 103 | - name: Make distributions 104 | run: | 105 | python -m pip install build 106 | python -m build 107 | ls -l dist/ 108 | - name: Publish to PyPI 109 | uses: pypa/gh-action-pypi-publish@release/v1 110 | with: 111 | user: __token__ 112 | password: ${{ secrets.pypi_password }} 113 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.pyc 3 | *.egg-info 4 | *~ 5 | .tox 6 | venv/ 7 | src/xopen/_version.py 8 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: end-of-file-fixer 6 | - id: trailing-whitespace 7 | - repo: https://github.com/psf/black 8 | rev: 22.3.0 9 | hooks: 10 | - id: black 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010 The xopen developers 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without 
restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://github.com/pycompression/xopen/workflows/CI/badge.svg 2 | :target: https://github.com/pycompression/xopen 3 | :alt: 4 | 5 | .. image:: https://img.shields.io/pypi/v/xopen.svg?branch=main 6 | :target: https://pypi.python.org/pypi/xopen 7 | 8 | .. image:: https://img.shields.io/conda/v/conda-forge/xopen.svg 9 | :target: https://anaconda.org/conda-forge/xopen 10 | :alt: 11 | 12 | .. image:: https://codecov.io/gh/pycompression/xopen/branch/main/graph/badge.svg 13 | :target: https://codecov.io/gh/pycompression/xopen 14 | :alt: 15 | 16 | ===== 17 | xopen 18 | ===== 19 | 20 | This Python module provides an ``xopen`` function that works like Python’s 21 | built-in ``open`` function but also transparently deals with compressed files. 22 | ``xopen`` selects the most efficient method for reading or writing a compressed file. 
23 | 24 | Supported compression formats are: 25 | 26 | - gzip (``.gz``) 27 | - bzip2 (``.bz2``) 28 | - xz (``.xz``) 29 | - Zstandard (``.zst``) (optional) 30 | 31 | 32 | Example usage 33 | ------------- 34 | 35 | Open a file for reading:: 36 | 37 | from xopen import xopen 38 | 39 | with xopen("file.txt.gz") as f: 40 | content = f.read() 41 | 42 | Write to a file in binary mode, 43 | set the compression level 44 | and avoid using an external process:: 45 | 46 | from xopen import xopen 47 | 48 | with xopen("file.txt.xz", mode="wb", threads=0, compresslevel=3) as f: 49 | f.write(b"Hello") 50 | 51 | 52 | The ``xopen`` function 53 | ---------------------- 54 | 55 | The ``xopen`` module offers a single function named ``xopen`` with the following 56 | signature:: 57 | 58 | xopen( 59 | filename: str | bytes | os.PathLike, 60 | mode: Literal["r", "w", "a", "rt", "rb", "wt", "wb", "at", "ab"] = "r", 61 | compresslevel: Optional[int] = None, 62 | threads: Optional[int] = None, 63 | *, 64 | encoding: str = "utf-8", 65 | errors: Optional[str] = None, 66 | newline: Optional[str] = None, 67 | format: Optional[str] = None, 68 | ) -> IO 69 | 70 | The function opens the file using a function suitable for the detected 71 | file format and returns an open file-like object. 72 | 73 | When writing, the file format is chosen based on the file name extension: 74 | ``.gz``, ``.bz2``, ``.xz``, ``.zst``. This can be overridden with ``format``. 75 | If the extension is not recognized, no compression is used. 76 | 77 | When reading and a file name extension is available, the format is detected 78 | from the extension. 79 | When reading and no file name extension is available, 80 | the format is detected from the 81 | `file signature `. 82 | 83 | Parameters 84 | ~~~~~~~~~~ 85 | 86 | **filename** (str, bytes, or `os.PathLike `_): 87 | Name of the file to open. 88 | 89 | If set to ``"-"``, standard output (in mode ``"w"``) or 90 | standard input (in mode ``"r"``) is returned. 
91 | 92 | **mode**, **encoding**, **errors**, **newline**: 93 | These parameters have the same meaning as in Python’s built-in 94 | `open function `_ 95 | except that the default encoding is always UTF-8 instead of the 96 | preferred locale encoding. 97 | ``encoding``, ``errors`` and ``newline`` are only used when opening a file in text mode. 98 | 99 | **compresslevel**: 100 | The compression level for writing to gzip, xz and Zstandard files. 101 | If set to None, a default depending on the format is used: 102 | gzip: 1, xz: 6, Zstandard: 3. 103 | 104 | This parameter is ignored for other compression formats. 105 | 106 | **format**: 107 | Override the autodetection of the input or output format. 108 | Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``. 109 | 110 | **threads**: 111 | Set the number of additional threads spawned for compression or decompression. 112 | May be ignored if the backend does not support threads. 113 | 114 | If *threads* is None (the default), as many threads as available CPU cores are 115 | used, but not more than four. 116 | 117 | xopen tries to offload the (de)compression to other threads 118 | to free up the main Python thread for the application. 119 | This can either be done by using a subprocess to an external application or 120 | using a library that supports threads. 121 | 122 | Set threads to 0 to force xopen to use only the main Python thread. 123 | 124 | 125 | Backends 126 | -------- 127 | 128 | Opening of gzip files is delegated to one of these programs or libraries: 129 | 130 | * `python-isal `_. 131 | Supports multiple threads and compression levels up to 3. 132 | * `python-zlib-ng `_ 133 | * `pigz `_ (a parallel version of ``gzip``) 134 | * `gzip `_ 135 | 136 | For xz files, a pipe to the ``xz`` program is used because it has 137 | built-in support for multithreaded compression. 138 | 139 | For bz2 files, `pbzip2 (parallel bzip2) `_ is used. 
140 | 141 | ``xopen`` falls back to Python’s built-in functions 142 | (``gzip.open``, ``lzma.open``, ``bz2.open``) 143 | if none of the other methods can be used. 144 | 145 | 146 | Reproducibility 147 | --------------- 148 | 149 | xopen writes gzip files in a reproducible manner. 150 | 151 | Normally, gzip files contain a timestamp in the file header, 152 | which means that compressing the same data at different times results in different output files. 153 | xopen disables this for all of the supported gzip compression backends. 154 | For example, when using an external process, it sets the command-line option 155 | ``--no-name`` (same as ``-n``). 156 | 157 | Note that different gzip compression backends typically do not produce 158 | identical output, so reproducibility may no longer be given when the execution environment changes 159 | from one ``xopen()`` invocation to the next. 160 | This includes the CPU architecture as `igzip adjusts its algorithm 161 | depending on it `_. 162 | 163 | bzip2 and xz compression methods do not store timestamps in the file headers, 164 | so output from them is also reproducible. 165 | 166 | 167 | Optional Zstandard support 168 | -------------------------- 169 | 170 | For reading and writing Zstandard (``.zst``) files, either the ``zstd`` command-line 171 | program or the Python ``zstandard`` package needs to be installed. 172 | 173 | * If the ``threads`` parameter to ``xopen()`` is ``None`` (the default) or any value greater than 0, 174 | ``xopen`` uses an external ``zstd`` process. 175 | * If the above fails (because no ``zstd`` program is available) or if ``threads`` is 0, 176 | the ``zstandard`` package is used. 177 | 178 | To ensure that you get the correct ``zstandard`` version, you can specify the ``zstd`` extra for 179 | ``xopen``, that is, install it using ``pip install xopen[zstd]``. 
180 | 181 | 182 | Changelog 183 | --------- 184 | 185 | development version 186 | ~~~~~~~~~~~~~~~~~~~ 187 | 188 | * Dropped support for Python 3.8 189 | * Started supporting Python 3.13 190 | 191 | v2.0.2 (2024-06-12) 192 | ~~~~~~~~~~~~~~~~~~~ 193 | * #161: Fix a bug that was triggered when reading large compressed files with 194 | an external program. 195 | 196 | v2.0.1 (2024-03-28) 197 | ~~~~~~~~~~~~~~~~~~~ 198 | + #158: Fixed a bug where reading from stdin and other pipes would discard the 199 | first bytes from the input. 200 | + #156: Zstd files compressed with the ``--long=31`` option can now be opened 201 | without throwing errors. 202 | 203 | v2.0.0 (2024-03-26) 204 | ~~~~~~~~~~~~~~~~~~~ 205 | 206 | * #154: Support for gzip levels has been made more consistent. Levels 0-9 207 | are supported. Level 11 which was only available when the ``pigz`` backend was 208 | present is not supported anymore. Level 0, gzip format without compression, 209 | led to crashes when the ``gzip`` application backend was used as this does 210 | not have a ``-0`` flag. ``xopen()`` now defers to other backends in that case. 211 | * #152: ``xopen()`` now accepts `file-like objects 212 | `_ for its filename 213 | argument. 214 | * #146, #147, #148: Various refactors for better code size and readability: 215 | 216 | * PipedCompressionReader/Writer are now combined into the _PipedCompressionProgram 217 | class. 218 | * _PipedCompressionProgram is binary-only. For text reading and writing 219 | it is wrapped in an ``io.TextIOWrapper`` in the ``xopen()`` function. 220 | * Classes that derive from PipedCompressionReader/Writer have been removed. 221 | * #148: xopen's classes, variables and functions pertaining to piped reading 222 | and writing are all made private by prefixing them with an underscore. 223 | These are not part of the API and may change between releases. 
224 | 225 | v1.9.0 (2024-01-31) 226 | ~~~~~~~~~~~~~~~~~~~ 227 | * #142: The python-isal compression backend is now only used for compression 228 | levels 1 and 2. Contrary to other backends, python-isal level 0 gave 229 | compressed rather than uncompressed data in gzip format. Level 3 on 230 | python-isal did not provide better compression than level 2. 231 | * #140: PipedCompressionReader/Writer now derive from the `io.IOBase 232 | `_ abstract class. 233 | * #138: The gzip default compression level is now 1 when no value is provided 234 | by the calling function. The default used to be determined by the backend. 235 | * #135: xopen now uses zlib-ng when available and applicable. 236 | * #133: Piped ``igzip`` is no longer used as a (de)compression backend as 237 | python-isal's threaded mode is a better choice in all use cases. 238 | 239 | v1.8.0 (2023-11-03) 240 | ~~~~~~~~~~~~~~~~~~~ 241 | * #131: xopen now defers to the ``isal.igzip_threaded`` module rather than 242 | piping to external programs in applicable cases. This makes reading and 243 | writing to gzip files using threads more efficient. 244 | * Support for Python 3.7 is dropped and support for Python 3.12 is added. 245 | 246 | v1.7.0 (2022-11-03) 247 | ~~~~~~~~~~~~~~~~~~~ 248 | 249 | * #91: Added optional support for Zstandard (``.zst``) files. 250 | This requires that the Python ``zstandard`` package is installed 251 | or that the ``zstd`` command-line program is available. 252 | 253 | v1.6.0 (2022-08-10) 254 | ~~~~~~~~~~~~~~~~~~~ 255 | 256 | * #94: When writing gzip files, the timestamp and name of the original 257 | file is omitted (equivalent to using ``gzip --no-name`` (or ``-n``) on the 258 | command line). This allows files to be written in a reproducible manner. 259 | 260 | v1.5.0 (2022-03-23) 261 | ~~~~~~~~~~~~~~~~~~~ 262 | 263 | * #100: Dropped Python 3.6 support 264 | * #101: Added support for piping into and from an external ``xz`` process. Contributed by @fanninpm. 
265 | * #102: Support setting the xz compression level. Contributed by @tsibley. 266 | 267 | v1.4.0 (2022-01-14) 268 | ~~~~~~~~~~~~~~~~~~~ 269 | 270 | * Add ``seek()`` and ``tell()`` to the ``PipedCompressionReader`` classes 271 | (for Windows compatibility) 272 | 273 | v1.3.0 (2022-01-10) 274 | ~~~~~~~~~~~~~~~~~~~ 275 | 276 | * xopen is now available on Windows (in addition to Linux and macOS). 277 | * For greater compatibility with `the built-in open() 278 | function `_, 279 | ``xopen()`` has gained the parameters *encoding*, *errors* and *newline* 280 | with the same meaning as in ``open()``. Unlike built-in ``open()``, though, 281 | encoding is UTF-8 by default. 282 | * A parameter *format* has been added that allows forcing the compression 283 | file format. 284 | 285 | v1.2.0 (2021-09-21) 286 | ~~~~~~~~~~~~~~~~~~~ 287 | 288 | * `pbzip2 `_ is now used to open ``.bz2`` files if 289 | ``threads`` is greater than zero (contributed by @DriesSchaumont). 290 | 291 | v1.1.0 (2021-01-20) 292 | ~~~~~~~~~~~~~~~~~~~ 293 | 294 | * Python 3.5 support is dropped. 295 | * On Linux systems, `python-isal `_ 296 | is now added as a requirement. This will speed up the reading of gzip files 297 | significantly when no external processes are used. 298 | 299 | v1.0.0 (2020-11-05) 300 | ~~~~~~~~~~~~~~~~~~~ 301 | 302 | * If installed, the ``igzip`` program (part of 303 | `Intel ISA-L `_) is now used for reading 304 | and writing gzip-compressed files at compression levels 1-3, which results 305 | in a significant speedup. 306 | 307 | v0.9.0 (2020-04-02) 308 | ~~~~~~~~~~~~~~~~~~~ 309 | 310 | * #80: When the file name extension of a file to be opened for reading is not 311 | available, the content is inspected (if possible) and used to determine 312 | which compression format applies (contributed by @bvaisvil). 313 | * This release drops Python 2.7 and 3.4 support. Python 3.5 or later is 314 | now required. 
315 | 316 | v0.8.4 (2019-10-24) 317 | ~~~~~~~~~~~~~~~~~~~ 318 | 319 | * When reading gzipped files, force ``pigz`` to use only a single process. 320 | ``pigz`` cannot use multiple cores anyway when decompressing. By default, 321 | it would use extra I/O processes, which slightly reduces wall-clock time, 322 | but increases CPU time. Single-core decompression with ``pigz`` is still 323 | about twice as fast as regular ``gzip``. 324 | * Allow ``threads=0`` for specifying that no external ``pigz``/``gzip`` 325 | process should be used (then regular ``gzip.open()`` is used instead). 326 | 327 | v0.8.3 (2019-10-18) 328 | ~~~~~~~~~~~~~~~~~~~ 329 | 330 | * #20: When reading gzipped files, let ``pigz`` use at most four threads by default. 331 | This limit previously only applied when writing to a file. Contributed by @bernt-matthias. 332 | * Support Python 3.8 333 | 334 | v0.8.0 (2019-08-14) 335 | ~~~~~~~~~~~~~~~~~~~ 336 | 337 | * #14: Speed improvements when iterating over gzipped files. 338 | 339 | v0.6.0 (2019-05-23) 340 | ~~~~~~~~~~~~~~~~~~~ 341 | 342 | * For reading from gzipped files, xopen will now use a ``pigz`` subprocess. 343 | This is faster than using ``gzip.open``. 344 | * Python 2 support will be dropped in one of the next releases. 345 | 346 | v0.5.0 (2019-01-30) 347 | ~~~~~~~~~~~~~~~~~~~ 348 | 349 | * By default, pigz is now only allowed to use at most four threads. This hopefully reduces 350 | problems some users had with too many threads when opening many files at the same time. 351 | * xopen now accepts pathlib.Path objects. 352 | 353 | v0.4.0 (2019-01-07) 354 | ~~~~~~~~~~~~~~~~~~~ 355 | 356 | * Drop Python 3.3 support 357 | * Add a ``threads`` parameter (passed on to ``pigz``) 358 | 359 | v0.3.2 (2017-11-22) 360 | ~~~~~~~~~~~~~~~~~~~ 361 | 362 | * #6: Make multi-block bz2 work on Python 2 by using external bz2file library. 
363 | 364 | v0.3.1 (2017-11-22) 365 | ~~~~~~~~~~~~~~~~~~~ 366 | 367 | * Drop Python 2.6 support 368 | * #5: Fix PipedGzipReader.read() not returning anything 369 | 370 | v0.3.0 (2017-11-15) 371 | ~~~~~~~~~~~~~~~~~~~ 372 | 373 | * Add gzip compression parameter 374 | 375 | v0.2.1 (2017-05-31) 376 | ~~~~~~~~~~~~~~~~~~~ 377 | 378 | * #3: Allow appending to bz2 and lzma files where possible 379 | 380 | v0.1.1 (2016-12-02) 381 | ~~~~~~~~~~~~~~~~~~~ 382 | 383 | * Fix a deadlock 384 | 385 | v0.1.0 (2016-09-09) 386 | ~~~~~~~~~~~~~~~~~~~ 387 | 388 | * Initial release 389 | 390 | Credits 391 | ------- 392 | 393 | The name ``xopen`` was taken from the C function of the same name in the 394 | `utils.h file that is part of 395 | BWA `_. 396 | 397 | Some ideas were taken from the `canopener project `_. 398 | If you also want to open S3 files, you may want to use that module instead. 399 | 400 | @kyleabeauchamp contributed support for appending to files before this repository was created. 401 | 402 | 403 | Maintainers 404 | ----------- 405 | 406 | * Marcel Martin 407 | * Ruben Vorderman 408 | * See also the `full list of contributors `_. 
409 | 410 | 411 | Links 412 | ----- 413 | 414 | * `Source code `_ 415 | * `Report an issue `_ 416 | * `Project page on PyPI (Python package index) `_ 417 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 77.0.3", "setuptools_scm[toml]>=6.2"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "xopen" 7 | authors = [ 8 | {name = "Marcel Martin", email = "marcel.martin@scilifelab.se"}, 9 | {name = "Ruben Vorderman", email = "r.h.p.vorderman@lumc.nl"} 10 | ] 11 | description = "Open compressed files transparently" 12 | readme = "README.rst" 13 | license = "MIT" 14 | classifiers = [ 15 | "Development Status :: 5 - Production/Stable", 16 | "Programming Language :: Python :: 3" 17 | ] 18 | requires-python = ">=3.9" 19 | dynamic = ["version"] 20 | dependencies = [ 21 | 'isal>=1.6.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"', 22 | 'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"' 23 | ] 24 | 25 | [project.urls] 26 | homepage = "https://github.com/pycompression/xopen/" 27 | 28 | [project.optional-dependencies] 29 | dev = ["pytest"] 30 | zstd = ["zstandard<1"] 31 | 32 | [tool.setuptools_scm] 33 | write_to = "src/xopen/_version.py" 34 | 35 | [tool.pytest.ini_options] 36 | addopts = "--strict-markers" 37 | -------------------------------------------------------------------------------- /src/xopen/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Open compressed files transparently. 
3 | """ 4 | 5 | __all__ = [ 6 | "xopen", 7 | "_PipedCompressionProgram", 8 | "__version__", 9 | ] 10 | 11 | import dataclasses 12 | import gzip 13 | import stat 14 | import sys 15 | import io 16 | import os 17 | import bz2 18 | import lzma 19 | import signal 20 | import pathlib 21 | import subprocess 22 | import tempfile 23 | import threading 24 | import time 25 | from typing import ( 26 | Dict, 27 | Optional, 28 | Union, 29 | IO, 30 | overload, 31 | BinaryIO, 32 | Literal, 33 | Tuple, 34 | ) 35 | from types import ModuleType 36 | 37 | from ._version import version as __version__ 38 | 39 | # 128K buffer size also used by cat, pigz etc. It is faster than the 8K default. 40 | BUFFER_SIZE = max(io.DEFAULT_BUFFER_SIZE, 128 * 1024) 41 | 42 | XOPEN_DEFAULT_GZIP_COMPRESSION = 1 43 | XOPEN_DEFAULT_BZ2_COMPRESSION = 9 44 | XOPEN_DEFAULT_XZ_COMPRESSION = 6 45 | XOPEN_DEFAULT_ZST_COMPRESSION = 3 46 | 47 | igzip: Optional[ModuleType] 48 | isal_zlib: Optional[ModuleType] 49 | igzip_threaded: Optional[ModuleType] 50 | zlib_ng: Optional[ModuleType] 51 | gzip_ng: Optional[ModuleType] 52 | gzip_ng_threaded: Optional[ModuleType] 53 | 54 | try: 55 | from isal import igzip, igzip_threaded, isal_zlib 56 | except ImportError: 57 | igzip = None 58 | isal_zlib = None 59 | igzip_threaded = None 60 | 61 | try: 62 | from zlib_ng import gzip_ng, gzip_ng_threaded, zlib_ng 63 | except ImportError: 64 | gzip_ng = None 65 | gzip_ng_threaded = None 66 | zlib_ng = None 67 | 68 | try: 69 | import zstandard # type: ignore 70 | except ImportError: 71 | zstandard = None # type: ignore 72 | 73 | try: 74 | import fcntl 75 | 76 | # fcntl.F_SETPIPE_SZ will be available in python 3.10. 77 | # https://github.com/python/cpython/pull/21921 78 | # If not available: set it to the correct value for known platforms. 
79 | if not hasattr(fcntl, "F_SETPIPE_SZ") and sys.platform == "linux": 80 | setattr(fcntl, "F_SETPIPE_SZ", 1031) 81 | except ImportError: 82 | fcntl = None # type: ignore 83 | 84 | _MAX_PIPE_SIZE_PATH = pathlib.Path("/proc/sys/fs/pipe-max-size") 85 | try: 86 | _MAX_PIPE_SIZE = int( 87 | _MAX_PIPE_SIZE_PATH.read_text(encoding="ascii") 88 | ) # type: Optional[int] 89 | except ( 90 | OSError 91 | ): # Catches file not found and permission errors. Possible other errors too. 92 | _MAX_PIPE_SIZE = None 93 | 94 | 95 | FilePath = Union[str, bytes, os.PathLike] 96 | FileOrPath = Union[FilePath, IO] 97 | 98 | 99 | @dataclasses.dataclass 100 | class _ProgramSettings: 101 | program_args: Tuple[str, ...] 102 | acceptable_compression_levels: Tuple[int, ...] = tuple(range(1, 10)) 103 | threads_flag: Optional[str] = None 104 | # This exit code is not interpreted as an error when terminating the process 105 | allowed_exit_code: Optional[int] = -signal.SIGTERM 106 | # If this message is printed on stderr on terminating the process, 107 | # it is not interpreted as an error 108 | allowed_exit_message: Optional[bytes] = None 109 | 110 | 111 | _PROGRAM_SETTINGS: Dict[str, _ProgramSettings] = { 112 | "pbzip2": _ProgramSettings( 113 | ("pbzip2",), 114 | tuple(range(1, 10)), 115 | "-p", 116 | allowed_exit_code=None, 117 | allowed_exit_message=b"\n *Control-C or similar caught [sig=15], quitting...", 118 | ), 119 | "xz": _ProgramSettings(("xz",), tuple(range(0, 10)), "-T"), 120 | "zstd": _ProgramSettings(("zstd",), tuple(range(1, 20)), "-T"), 121 | "pigz": _ProgramSettings(("pigz", "--no-name"), tuple(range(0, 10)) + (11,), "-p"), 122 | "gzip": _ProgramSettings(("gzip", "--no-name"), tuple(range(1, 10))), 123 | } 124 | 125 | 126 | def _available_cpu_count() -> int: 127 | """ 128 | Number of available virtual or physical CPUs on this system 129 | Adapted from http://stackoverflow.com/a/1006301/715090 130 | """ 131 | try: 132 | return len(os.sched_getaffinity(0)) 133 | except 
AttributeError: 134 | pass 135 | import re 136 | 137 | try: 138 | with open("/proc/self/status") as f: 139 | status = f.read() 140 | m = re.search(r"(?m)^Cpus_allowed:\s*(.*)$", status) 141 | if m: 142 | res = bin(int(m.group(1).replace(",", ""), 16)).count("1") 143 | if res > 0: 144 | return res 145 | except OSError: 146 | pass 147 | count = os.cpu_count() 148 | return 1 if count is None else count 149 | 150 | 151 | def _set_pipe_size_to_max(fd: int) -> None: 152 | """ 153 | Set pipe size to maximum on platforms that support it. 154 | :param fd: The file descriptor to increase the pipe size for. 155 | """ 156 | if not hasattr(fcntl, "F_SETPIPE_SZ") or not _MAX_PIPE_SIZE: 157 | return 158 | try: 159 | fcntl.fcntl(fd, fcntl.F_SETPIPE_SZ, _MAX_PIPE_SIZE) # type: ignore 160 | except OSError: 161 | pass 162 | 163 | 164 | class _PipedCompressionProgram(io.IOBase): 165 | """ 166 | Read and write compressed files by running an external process and piping into it. 167 | """ 168 | 169 | def __init__( 170 | self, 171 | filename: FileOrPath, 172 | mode="rb", 173 | compresslevel: Optional[int] = None, 174 | threads: Optional[int] = None, 175 | program_settings: _ProgramSettings = _ProgramSettings(("gzip", "--no-name")), 176 | ): 177 | """ 178 | mode -- one of 'w', 'wb', 'a', 'ab' 179 | compresslevel -- compression level 180 | threads_flag -- which flag is used to denote the number of threads in the program. 181 | If set to none, program will be called without threads flag. 182 | threads (int) -- number of threads. If this is set to None, a reasonable default is 183 | used. At the moment, this means that the number of available CPU cores is used, capped 184 | at four to avoid creating too many threads. Use 0 to use all available cores. 
185 | """ 186 | self._error_raised = False 187 | self._program_args = list(program_settings.program_args) 188 | self._allowed_exit_code = program_settings.allowed_exit_code 189 | self._allowed_exit_message = program_settings.allowed_exit_message 190 | if mode not in ("r", "rb", "w", "wb", "a", "ab"): 191 | raise ValueError( 192 | f"Mode is '{mode}', but it must be 'r', 'rb', 'w', 'wb', 'a', or 'ab'" 193 | ) 194 | if "b" not in mode: 195 | mode += "b" 196 | if ( 197 | compresslevel is not None 198 | and compresslevel not in program_settings.acceptable_compression_levels 199 | ): 200 | raise ValueError( 201 | f"compresslevel must be in {program_settings.acceptable_compression_levels}." 202 | ) 203 | self._compresslevel = compresslevel 204 | self.fileobj, self.closefd = _file_or_path_to_binary_stream(filename, mode) 205 | self._path = _filepath_from_path_or_filelike(filename) 206 | self.name: str = str(self._path) 207 | self._mode: str = mode 208 | self._stderr = tempfile.TemporaryFile("w+b") 209 | self._threads_flag: Optional[str] = program_settings.threads_flag 210 | 211 | if threads is None: 212 | if "r" in mode: 213 | # Reading occurs single threaded by default. This has the least 214 | # amount of overhead and is fast enough for most use cases. 215 | threads = 1 216 | else: 217 | threads = min(_available_cpu_count(), 4) 218 | self._threads = threads 219 | 220 | self._open_process() 221 | 222 | def _open_process(self): 223 | if self._threads != 0 and self._threads_flag is not None: 224 | self._program_args += [f"{self._threads_flag}{self._threads}"] 225 | 226 | # Setting close_fds to True in the Popen arguments is necessary due to 227 | # . 228 | # However, close_fds is not supported on Windows. See 229 | # . 
230 | close_fds = False 231 | if sys.platform != "win32": 232 | close_fds = True 233 | 234 | self.in_pipe = None 235 | self.in_thread = None 236 | self._feeding = True 237 | if "r" in self._mode: 238 | self._program_args += ["-c", "-d"] # type: ignore 239 | stdout = subprocess.PIPE 240 | else: 241 | if self._compresslevel is not None: 242 | self._program_args += ["-" + str(self._compresslevel)] 243 | stdout = self.fileobj # type: ignore 244 | try: 245 | self.process = subprocess.Popen( 246 | self._program_args, 247 | stderr=self._stderr, 248 | stdout=stdout, 249 | stdin=subprocess.PIPE, 250 | close_fds=close_fds, 251 | ) # type: ignore 252 | except OSError: 253 | if self.closefd: 254 | self.fileobj.close() 255 | raise 256 | assert self.process.stdin is not None 257 | if "r" in self._mode: 258 | self.in_pipe = self.process.stdin 259 | # A python subprocess can read and write from pipes, but not from 260 | # Python in-memory objects. In order for a program to read from an 261 | # in-memory object, a pipe must be created. This pipe must be fed 262 | # data from the in-memory object. This must be done in a separate 263 | # thread, because IO operations will block when the pipe is full 264 | # when writing, or empty when reading. Since the quantity of output 265 | # data generated by a certain amount of input data is unknown, the 266 | # only way to prevent a blocking application is to write 267 | # data continuously to the process stdin on another thread. 
268 | self.in_thread = threading.Thread(target=self._feed_pipe) 269 | self.in_thread.start() 270 | self._process_explicitly_terminated = False 271 | self._file: BinaryIO = self.process.stdout # type: ignore 272 | self._wait_for_output_or_process_exit() 273 | self._raise_if_error() 274 | else: 275 | self._file = self.process.stdin # type: ignore 276 | 277 | _set_pipe_size_to_max(self._file.fileno()) 278 | 279 | def __repr__(self): 280 | return ( 281 | f"{self.__class__.__name__}" 282 | f"('{self.name}', mode='{self._mode}', " 283 | f"program='{' '.join(self._program_args)}', " 284 | f"threads={self._threads})" 285 | ) 286 | 287 | def _feed_pipe(self): 288 | try: 289 | while self._feeding: 290 | chunk = self.fileobj.read(BUFFER_SIZE) 291 | if chunk == b"": 292 | self.in_pipe.close() 293 | return 294 | try: 295 | self.in_pipe.write(chunk) 296 | except BrokenPipeError: 297 | if not self._process_explicitly_terminated: 298 | raise 299 | finally: 300 | self.in_pipe.close() 301 | 302 | def write(self, arg: bytes) -> int: 303 | return self._file.write(arg) 304 | 305 | def read(self, *args) -> bytes: 306 | return self._file.read(*args) 307 | 308 | def readinto(self, *args): 309 | return self._file.readinto(*args) 310 | 311 | def readline(self, *args) -> bytes: 312 | return self._file.readline(*args) 313 | 314 | def seekable(self) -> bool: 315 | return self._file.seekable() 316 | 317 | def tell(self) -> int: 318 | return self._file.tell() 319 | 320 | def peek(self, n: Optional[int] = None): 321 | return self._file.peek(n) # type: ignore 322 | 323 | def seek(self, offset, whence=0) -> int: 324 | return self._file.seek(offset, whence) 325 | 326 | def close(self) -> None: 327 | if self.closed: 328 | return 329 | super().close() 330 | if not hasattr(self, "process"): 331 | # Exception was raised during __init__ 332 | if hasattr(self, "_stderr"): 333 | self._stderr.close() 334 | return 335 | check_allowed_code_and_message = False 336 | if "r" in self._mode: 337 | retcode = 
    def close(self) -> None:
        """
        Shut down the subprocess and release every resource held by this object.

        Closing twice is a no-op. In read mode a still-running process is
        terminated; in write mode the process's stdin is closed and the
        process is awaited so that all output is written.

        Raises OSError (via ``_raise_if_error``) if the process exited with a
        disallowed nonzero code and no error was reported earlier.
        """
        if self.closed:
            return
        super().close()
        if not hasattr(self, "process"):
            # Exception was raised during __init__
            if hasattr(self, "_stderr"):
                self._stderr.close()
            return
        check_allowed_code_and_message = False
        if "r" in self._mode:
            retcode = self.process.poll()
            if retcode is None:
                # still running
                # Set the flag before terminating so the feeder thread treats
                # the resulting broken pipe as expected.
                self._process_explicitly_terminated = True
                self.process.terminate()
                # A process killed on purpose may exit with a program-specific
                # code or message; those are tolerated below.
                check_allowed_code_and_message = True
            self.process.wait()
            self._feeding = False
            # Drain whatever output is left in the pipe before closing it.
            self._file.read()
            if self.in_thread:
                self.in_thread.join()
            self._file.close()
        else:
            # Closing stdin signals end-of-input; then wait for the program to
            # finish writing its output.
            self._file.close()
            self.process.wait()
        if self.closefd:
            self.fileobj.close()
        # Read stderr before closing the capture file; it is needed for the
        # error message below.
        stderr_message = self._read_error_message()
        self._stderr.close()
        if not self._error_raised:
            # Only check for errors if none have been found earlier.
            self._raise_if_error(check_allowed_code_and_message, stderr_message)

    def _wait_for_output_or_process_exit(self):
        """
        Wait for the process to produce at least some output, or has exited.
        """
        # The program may crash due to a non-existing file, internal error etc.
        # In that case we need to check. However the 'time-to-crash' differs
        # between programs. Some crash faster than others.
        # Therefore we peek the first character(s) of stdout. Peek will return at
        # least one byte of data, unless the buffer is empty or at EOF. If at EOF,
        # we should wait for the program to exit. This way we ensure the program
        # has at least decompressed some output, or stopped before we continue.

        # stdout is io.BufferedReader if set to PIPE
        while True:
            first_output = self.process.stdout.peek(1)
            exit_code = self.process.poll()
            if first_output or exit_code is not None:
                break
            # Neither output nor an exit code yet: poll again shortly.
            time.sleep(0.01)

    def _raise_if_error(
        self, check_allowed_code_and_message: bool = False, stderr_message: bytes = b""
    ) -> None:
        """
        Raise OSError if process is not running anymore and the exit code is
        nonzero. If check_allowed_code_and_message is set, OSError is not raised when
        (1) the exit value of the process is equal to the value of the allowed_exit_code
        attribute or (2) the allowed_exit_message attribute is set and it matches with
        stderr_message.

        Before raising, ``self._file`` is closed and ``self._error_raised`` is
        set so that ``close()`` does not report the same failure again.
        """
        retcode = self.process.poll()

        if sys.platform == "win32" and retcode == 1 and stderr_message == b"":
            # Special case for Windows. Winapi terminates processes with exit code 1
            # and an empty error message.
            return

        if retcode is None:
            # process still running
            return
        if retcode == 0:
            # process terminated successfully
            return

        if check_allowed_code_and_message:
            if retcode == self._allowed_exit_code:
                # terminated with allowed exit code
                return
            if self._allowed_exit_message and stderr_message.startswith(
                self._allowed_exit_message
            ):
                # terminated with another exit code, but message is allowed
                return

        if not stderr_message:
            # The caller did not supply the message; fetch it from the capture file.
            stderr_message = self._read_error_message()

        self._file.close()
        self._error_raised = True
        raise OSError(f"{stderr_message!r} (exit code {retcode})")
If check_allowed_code_and_message is set, OSError is not raised when 386 | (1) the exit value of the process is equal to the value of the allowed_exit_code 387 | attribute or (2) the allowed_exit_message attribute is set and it matches with 388 | stderr_message. 389 | """ 390 | retcode = self.process.poll() 391 | 392 | if sys.platform == "win32" and retcode == 1 and stderr_message == b"": 393 | # Special case for Windows. Winapi terminates processes with exit code 1 394 | # and an empty error message. 395 | return 396 | 397 | if retcode is None: 398 | # process still running 399 | return 400 | if retcode == 0: 401 | # process terminated successfully 402 | return 403 | 404 | if check_allowed_code_and_message: 405 | if retcode == self._allowed_exit_code: 406 | # terminated with allowed exit code 407 | return 408 | if self._allowed_exit_message and stderr_message.startswith( 409 | self._allowed_exit_message 410 | ): 411 | # terminated with another exit code, but message is allowed 412 | return 413 | 414 | if not stderr_message: 415 | stderr_message = self._read_error_message() 416 | 417 | self._file.close() 418 | self._error_raised = True 419 | raise OSError(f"{stderr_message!r} (exit code {retcode})") 420 | 421 | def _read_error_message(self): 422 | if self._stderr.closed: 423 | return b"" 424 | self._stderr.flush() 425 | self._stderr.seek(0) 426 | return self._stderr.read() 427 | 428 | def __iter__(self): 429 | return self 430 | 431 | def __next__(self) -> bytes: 432 | return self._file.__next__() 433 | 434 | def readable(self): 435 | return self._file.readable() 436 | 437 | def writable(self): 438 | return self._file.writable() 439 | 440 | def flush(self) -> None: 441 | return None 442 | 443 | 444 | def _open_stdin_or_out(mode: str) -> BinaryIO: 445 | assert mode in ("rb", "ab", "wb") 446 | std = sys.stdin if mode == "rb" else sys.stdout 447 | return open(std.fileno(), mode=mode, closefd=False) # type: ignore 448 | 449 | 450 | def _open_bz2( 451 | filename: 
FileOrPath, 452 | mode: str, 453 | compresslevel: Optional[int], 454 | threads: Optional[int], 455 | ): 456 | assert mode in ("rb", "ab", "wb") 457 | if compresslevel is None: 458 | compresslevel = XOPEN_DEFAULT_BZ2_COMPRESSION 459 | if threads != 0: 460 | try: 461 | # pbzip2 can compress using multiple cores. 462 | return _PipedCompressionProgram( 463 | filename, 464 | mode, 465 | compresslevel, 466 | threads=threads, 467 | program_settings=_PROGRAM_SETTINGS["pbzip2"], 468 | ) 469 | except OSError: 470 | pass # We try without threads. 471 | 472 | bz2_file = bz2.open(filename, mode, compresslevel) 473 | if "r" in mode: 474 | return bz2_file 475 | # Buffer writes on bz2.open to mitigate overhead of small writes 476 | return io.BufferedWriter(bz2_file) # type: ignore 477 | 478 | 479 | def _open_xz( 480 | filename: FileOrPath, 481 | mode: str, 482 | compresslevel: Optional[int], 483 | threads: Optional[int], 484 | ): 485 | assert mode in ("rb", "ab", "wb") 486 | if compresslevel is None: 487 | compresslevel = XOPEN_DEFAULT_XZ_COMPRESSION 488 | 489 | if threads != 0: 490 | try: 491 | # xz can compress using multiple cores. 492 | return _PipedCompressionProgram( 493 | filename, 494 | mode, 495 | compresslevel, 496 | threads, 497 | _PROGRAM_SETTINGS["xz"], 498 | ) 499 | except OSError: 500 | pass # We try without threads. 
def _open_zst(
    filename: FileOrPath,
    mode: str,
    compresslevel: Optional[int],
    threads: Optional[int],
):
    """
    Open a zstandard file, preferring an external zstd process.

    With ``threads == 0``, or when starting zstd fails with OSError and the
    ``zstandard`` module is importable, the module is used instead.

    Raises OSError if zstd cannot be started and no fallback module exists,
    and ImportError if ``threads == 0`` and the module is not available.
    """
    assert mode in ("rb", "ab", "wb")
    assert compresslevel != 0
    if compresslevel is None:
        compresslevel = XOPEN_DEFAULT_ZST_COMPRESSION
    if zstandard:
        max_window_bits = zstandard.WINDOWLOG_MAX
    else:
        max_window_bits = 31
    if threads != 0:
        try:
            # zstd can compress using multiple cores
            program_args: Tuple[str, ...] = ("zstd",)
            if "r" in mode:
                # Only use --long=31 for decompression. Using it for
                # compression overrides level settings for window size and
                # forces other zstd users to use `--long=31` to decompress any
                # archive that has been compressed by xopen.
                program_args += (f"--long={max_window_bits}",)
            return _PipedCompressionProgram(
                filename,
                mode,
                compresslevel,
                threads,
                _ProgramSettings(program_args, tuple(range(1, 20)), "-T"),
            )
        except OSError:
            if zstandard is None:
                # No fallback available
                raise

    if zstandard is None:
        raise ImportError("zstandard module (python-zstandard) not available")
    dctx = zstandard.ZstdDecompressor(max_window_size=2**max_window_bits)
    cctx = zstandard.ZstdCompressor(level=compresslevel)
    f = zstandard.open(filename, mode, cctx=cctx, dctx=dctx)  # type: ignore
    if mode == "rb":
        return io.BufferedReader(f)
    return io.BufferedWriter(f)  # mode "ab" and "wb"


def _open_gz(
    filename: FileOrPath,
    mode: str,
    compresslevel: Optional[int],
    threads: Optional[int],
):
    """
    Open a gzip file. The ISA-L library is preferred when applicable because
    it is the fastest. Then zlib-ng which is not as fast, but supports all
    compression levels. After that comes pigz, which can utilize multiple
    threads and is more efficient than gzip, even on one core. gzip is chosen
    when none of the alternatives are available. Despite it being able to use
    only one core, it still finishes faster than using the builtin gzip library
    as the (de)compression is moved to another thread.

    With ``threads == 0``, or when none of the threaded backends can be used,
    the file is opened in-process via ``_open_reproducible_gzip``.

    Raises ValueError if compresslevel is outside 0-9.
    """
    assert mode in ("rb", "ab", "wb")
    if compresslevel is None:
        # Force the same compression level on every tool regardless of
        # library defaults
        compresslevel = XOPEN_DEFAULT_GZIP_COMPRESSION
    if compresslevel not in range(10):
        # Level 0-9 are supported regardless of backend support
        # (zlib_ng supports -1, pigz supports 11 etc.)
        raise ValueError(
            f"gzip compresslevel must be in range 0-9, got {compresslevel}."
        )

    if threads != 0:
        # Igzip level 0 does not output uncompressed deflate blocks as zlib does
        # and level 3 is slower but does not compress better than level 1 and 2.
        if igzip_threaded and (compresslevel in (1, 2) or "r" in mode):
            return igzip_threaded.open(  # type: ignore
                filename,
                mode,
                compresslevel,
                threads=1,
            )
        if gzip_ng_threaded and zlib_ng:
            return gzip_ng_threaded.open(
                filename,
                mode,
                # zlib-ng level 1 is 50% bigger than zlib level 1. Level
                # 2 gives a size close to expectations.
                compresslevel=2 if compresslevel == 1 else compresslevel,
                # Cap the default at 4 threads, matching the default used
                # for external programs (max() here would mean "at least 4"
                # and scale without bound on large machines).
                threads=threads or min(_available_cpu_count(), 4),
            )

        for program in ("pigz", "gzip"):
            try:
                return _PipedCompressionProgram(
                    filename,
                    mode,
                    compresslevel,
                    threads,
                    _PROGRAM_SETTINGS[program],
                )
            # ValueError when compresslevel is not supported. i.e. gzip and level 0
            except (OSError, ValueError):
                pass  # We try without threads.
    return _open_reproducible_gzip(filename, mode=mode, compresslevel=compresslevel)


def _open_reproducible_gzip(filename, mode: str, compresslevel: int):
    """
    Open a gzip file for reading or writing without external processes.

    When writing, the header contains neither mtime nor the file name
    (equivalent to gzip --no-name), so identical input gives identical output.
    """
    assert mode in ("rb", "wb", "ab")
    assert compresslevel is not None
    fileobj, closefd = _file_or_path_to_binary_stream(filename, mode)
    # Neither gzip.open nor igzip.open have an mtime option, and they will
    # always write the file name, so we need to open the file separately
    # and pass it to gzip.GzipFile/igzip.IGzipFile.
    kwargs = dict(
        fileobj=fileobj,
        filename="",
        mode=mode,
        mtime=0,
    )
    # Igzip level 0 does not output uncompressed deflate blocks as zlib does
    # and level 3 is slower but does not compress better than level 1 and 2.
    if igzip is not None and (compresslevel in (1, 2) or "r" in mode):
        gzip_file = igzip.IGzipFile(**kwargs, compresslevel=compresslevel)
    elif gzip_ng is not None:
        # Zlib-ng level 1 creates much bigger files than zlib level 1.
        gzip_file = gzip_ng.GzipNGFile(
            **kwargs, compresslevel=2 if compresslevel == 1 else compresslevel
        )
    else:
        gzip_file = gzip.GzipFile(**kwargs, compresslevel=compresslevel)  # type: ignore
    # When (I)GzipFile is created with a fileobj instead of a filename,
    # the passed file object is not closed when (I)GzipFile.close()
    # is called. This forces it to be closed.
    if closefd:
        gzip_file.myfileobj = fileobj
    if sys.version_info < (3, 12) and "r" not in mode:
        # From version 3.12 onwards, gzip is properly internally buffered for writing.
        return io.BufferedWriter(gzip_file)  # type: ignore
    return gzip_file
652 | return io.BufferedWriter(gzip_file) # type: ignore 653 | return gzip_file 654 | 655 | 656 | def _detect_format_from_content(filename: FileOrPath) -> Optional[str]: 657 | """ 658 | Attempts to detect file format from the content by reading the first 659 | 6 bytes. Returns None if no format could be detected. 660 | """ 661 | fileobj, closefd = _file_or_path_to_binary_stream(filename, "rb") 662 | try: 663 | if not fileobj.readable(): 664 | return None 665 | if hasattr(fileobj, "peek"): 666 | bs = fileobj.peek(6) 667 | elif hasattr(fileobj, "seekable") and fileobj.seekable(): 668 | current_pos = fileobj.tell() 669 | bs = fileobj.read(6) 670 | fileobj.seek(current_pos) 671 | else: 672 | return None 673 | 674 | if bs[:2] == b"\x1f\x8b": 675 | # https://tools.ietf.org/html/rfc1952#page-6 676 | return "gz" 677 | elif bs[:3] == b"\x42\x5a\x68": 678 | # https://en.wikipedia.org/wiki/List_of_file_signatures 679 | return "bz2" 680 | elif bs[:6] == b"\xfd\x37\x7a\x58\x5a\x00": 681 | # https://tukaani.org/xz/xz-file-format.txt 682 | return "xz" 683 | elif bs[:4] == b"\x28\xb5\x2f\xfd": 684 | # https://datatracker.ietf.org/doc/html/rfc8478#section-3.1.1 685 | return "zst" 686 | return None 687 | finally: 688 | if closefd: 689 | fileobj.close() 690 | 691 | 692 | def _detect_format_from_extension(filename: Union[str, bytes]) -> Optional[str]: 693 | """ 694 | Attempt to detect file format from the filename extension. 695 | Return None if no format could be detected. 696 | """ 697 | for ext in ("bz2", "xz", "gz", "zst"): 698 | if isinstance(filename, bytes): 699 | if filename.endswith(b"." + ext.encode()): 700 | return ext 701 | else: 702 | if filename.endswith("." 
+ ext): 703 | return ext 704 | return None 705 | 706 | 707 | def _file_or_path_to_binary_stream( 708 | file_or_path: FileOrPath, binary_mode: str 709 | ) -> Tuple[BinaryIO, bool]: 710 | assert binary_mode in ("rb", "wb", "ab") 711 | if isinstance(file_or_path, (str, bytes)) or hasattr(file_or_path, "__fspath__"): 712 | return open(os.fspath(file_or_path), binary_mode), True # type: ignore 713 | if isinstance(file_or_path, io.TextIOWrapper): 714 | return file_or_path.buffer, False 715 | if hasattr(file_or_path, "readinto") or hasattr(file_or_path, "write"): 716 | # Very lenient fallback for all filelike objects. If the filelike 717 | # object is not binary, this will crash at a later point. 718 | return file_or_path, False # type: ignore 719 | raise TypeError( 720 | f"Unsupported type for {file_or_path}, " f"{file_or_path.__class__.__name__}." 721 | ) 722 | 723 | 724 | def _filepath_from_path_or_filelike(fileorpath: FileOrPath) -> str: 725 | try: 726 | return os.fspath(fileorpath) # type: ignore 727 | except TypeError: 728 | pass 729 | if hasattr(fileorpath, "name"): 730 | name = fileorpath.name 731 | if isinstance(name, str): 732 | return name 733 | elif isinstance(name, bytes): 734 | return name.decode() 735 | return "" 736 | 737 | 738 | def _file_is_a_socket_or_pipe(filepath): 739 | try: 740 | mode = os.stat(filepath).st_mode 741 | # Treat anything that is not a regular file as special 742 | return not stat.S_ISREG(mode) 743 | except (OSError, TypeError): # Type error for unexpected types in stat. 744 | return False 745 | 746 | 747 | @overload 748 | def xopen( 749 | filename: FileOrPath, 750 | mode: Literal["r", "w", "a", "rt", "wt", "at"] = ..., 751 | compresslevel: Optional[int] = ..., 752 | threads: Optional[int] = ..., 753 | *, 754 | encoding: str = ..., 755 | errors: Optional[str] = ..., 756 | newline: Optional[str] = ..., 757 | format: Optional[str] = ..., 758 | ) -> io.TextIOWrapper: 759 | ... 
@overload
def xopen(
    filename: FileOrPath,
    mode: Literal["rb", "wb", "ab"],
    compresslevel: Optional[int] = ...,
    threads: Optional[int] = ...,
    *,
    encoding: str = ...,
    errors: None = ...,
    newline: None = ...,
    format: Optional[str] = ...,
) -> BinaryIO:
    ...


def xopen(  # noqa: C901
    filename: FileOrPath,
    mode: Literal["r", "w", "a", "rt", "rb", "wt", "wb", "at", "ab"] = "r",
    compresslevel: Optional[int] = None,
    threads: Optional[int] = None,
    *,
    encoding: str = "utf-8",
    errors: Optional[str] = None,
    newline: Optional[str] = None,
    format: Optional[str] = None,
) -> IO:
    """
    A replacement for the "open" function that can also read and write
    compressed files transparently. The supported compression formats are gzip,
    bzip2, xz and zstandard. If the filename is '-', standard output (mode 'w') or
    standard input (mode 'r') is returned. Filename can be a string or a
    file object. (See https://docs.python.org/3/glossary.html#term-file-object.)

    When writing, the file format is chosen based on the file name extension:
    - .gz uses gzip compression
    - .bz2 uses bzip2 compression
    - .xz uses xz/lzma compression
    - .zst uses zstandard compression
    - otherwise, no compression is used

    When reading, if a file name extension is available, the format is detected
    using it, but if not, the format is detected from the contents.

    mode can be: 'rt', 'rb', 'at', 'ab', 'wt', or 'wb'. Also, the 't' can be omitted,
    so instead of 'rt', 'wt' and 'at', the abbreviations 'r', 'w' and 'a' can be used.

    compresslevel is the compression level for writing compressed files (gzip,
    bzip2, xz and zst). It is ignored when no compression is used.
    If set to None, a default depending on the format is used:
    gzip: 6, xz: 6, zstd: 3.

    When threads is None (the default), compressed file formats are read or written
    using a threaded Python library or a pipe to a subprocess running an external
    tool such as ``pigz``, ``pbzip2`` or ``xz``, whichever is available.
    If the backend supports multiple threads, *threads* can be set to an int
    specifying the number of threads to use.
    If no external tool supporting the compression format is available, the file is
    opened calling the appropriate Python function
    (that is, no subprocess is spawned).

    Set threads to 0 to force opening the file without using a subprocess.

    encoding, errors and newline are used when opening a file in text mode.
    The parameters have the same meaning as in the built-in open function,
    except that the default encoding is always UTF-8 instead of the
    preferred locale encoding.

    format overrides the autodetection of input and output formats. This can be
    useful when compressed output needs to be written to a file without an
    extension. Possible values are "gz", "xz", "bz2", "zst".

    Raises ValueError for an unsupported mode or format. If opening fails
    after this function has already created a stream or subprocess, that
    resource is closed before the exception propagates; file objects supplied
    by the caller are never closed here.
    """
    if mode in ("r", "w", "a"):
        mode += "t"  # type: ignore
    if mode not in ("rt", "rb", "wt", "wb", "at", "ab"):
        raise ValueError("Mode '{}' not supported".format(mode))
    # Validate all arguments before touching the file system so that a bad
    # format never leaves a freshly opened pipe behind.
    if format not in (None, "gz", "xz", "bz2", "zst"):
        raise ValueError(
            f"Format not supported: {format}. "
            f"Choose one of: 'gz', 'xz', 'bz2', 'zst'"
        )
    binary_mode = mode[0] + "b"
    filepath = _filepath_from_path_or_filelike(filename)

    # Open non-regular files such as pipes and sockets here to force opening
    # them once.
    opened_here = False
    if filename == "-":
        filename = _open_stdin_or_out(binary_mode)
        opened_here = True
    elif _file_is_a_socket_or_pipe(filename):
        filename = open(filename, binary_mode)  # type: ignore
        opened_here = True

    opened_file = None
    # Compressed wrappers are always created by this call. For plain files
    # ownership is reported by _file_or_path_to_binary_stream.
    owns_opened_file = True
    try:
        detected_format = format or _detect_format_from_extension(filepath)
        if detected_format is None and "r" in mode:
            detected_format = _detect_format_from_content(filename)

        if detected_format == "gz":
            opened_file = _open_gz(filename, binary_mode, compresslevel, threads)
        elif detected_format == "xz":
            opened_file = _open_xz(filename, binary_mode, compresslevel, threads)
        elif detected_format == "bz2":
            opened_file = _open_bz2(filename, binary_mode, compresslevel, threads)
        elif detected_format == "zst":
            opened_file = _open_zst(filename, binary_mode, compresslevel, threads)
        else:
            opened_file, owns_opened_file = _file_or_path_to_binary_stream(
                filename, binary_mode
            )

        if "t" in mode:
            return io.TextIOWrapper(opened_file, encoding, errors, newline)
        return opened_file
    except Exception:
        # Release only what this call created (e.g. when the text wrapper
        # rejects the encoding or a backend is unavailable).
        if opened_file is not None and owns_opened_file:
            opened_file.close()
        if opened_here:
            filename.close()  # type: ignore
        raise
@pytest.fixture
def create_large_file(tmp_path):
    """
    Fixture returning a factory that writes 1 MiB of deterministic pseudo-random
    text to ``large<extension>`` in the temporary directory via xopen and
    returns the path.
    """

    def _create_large_file(extension):
        path = tmp_path / f"large{extension}"
        # A private generator gives the same text on every call without
        # reseeding the module-level RNG that other tests may rely on.
        rng = random.Random(0)
        chars = string.ascii_lowercase + "\n"
        # Do not decrease this length. The generated file needs to have
        # a certain length after compression to trigger some bugs
        # (in particular, 512 kB is not sufficient).
        random_text = "".join(rng.choices(chars, k=1024 * 1024))
        with xopen(path, "w") as f:
            f.write(random_text)
        return path

    return _create_large_file


@pytest.fixture
def create_truncated_file(create_large_file):
    """
    Fixture returning a factory that creates the large file and then cuts off
    its last 10 bytes, producing a truncated (corrupt) compressed file.
    """

    def _create_truncated_file(extension):
        large_file = create_large_file(extension)
        # Truncate by path; no need to open the binary file in text mode.
        os.truncate(large_file, os.stat(large_file).st_size - 10)
        return large_file

    return _create_truncated_file
3 | -------------------------------------------------------------------------------- /tests/file.txt.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycompression/xopen/d1931cb5485f1f5a055edb35eec162d6a349722c/tests/file.txt.bz2 -------------------------------------------------------------------------------- /tests/file.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycompression/xopen/d1931cb5485f1f5a055edb35eec162d6a349722c/tests/file.txt.gz -------------------------------------------------------------------------------- /tests/file.txt.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycompression/xopen/d1931cb5485f1f5a055edb35eec162d6a349722c/tests/file.txt.xz -------------------------------------------------------------------------------- /tests/file.txt.zst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycompression/xopen/d1931cb5485f1f5a055edb35eec162d6a349722c/tests/file.txt.zst -------------------------------------------------------------------------------- /tests/hello.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycompression/xopen/d1931cb5485f1f5a055edb35eec162d6a349722c/tests/hello.gz -------------------------------------------------------------------------------- /tests/only_zeroes.zst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycompression/xopen/d1931cb5485f1f5a055edb35eec162d6a349722c/tests/only_zeroes.zst -------------------------------------------------------------------------------- /tests/test_piped.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the 
PipedCompression classes 3 | """ 4 | import gzip 5 | import io 6 | import os 7 | import shutil 8 | import sys 9 | import pytest 10 | from pathlib import Path 11 | from itertools import cycle 12 | 13 | from xopen import ( 14 | xopen, 15 | _PipedCompressionProgram, 16 | _MAX_PIPE_SIZE, 17 | _PROGRAM_SETTINGS, 18 | _ProgramSettings, 19 | ) 20 | 21 | extensions = ["", ".gz", ".bz2", ".xz", ".zst"] 22 | 23 | try: 24 | import fcntl 25 | 26 | if not hasattr(fcntl, "F_GETPIPE_SZ") and sys.platform == "linux": 27 | setattr(fcntl, "F_GETPIPE_SZ", 1032) 28 | except ImportError: 29 | fcntl = None 30 | 31 | base = os.path.join(os.path.dirname(__file__), "file.txt") 32 | files = [base + ext for ext in extensions] 33 | TEST_DIR = Path(__file__).parent 34 | CONTENT_LINES = [b"Testing, testing ...\n", b"The second line.\n"] 35 | CONTENT = b"".join(CONTENT_LINES) 36 | 37 | 38 | def available_gzip_programs(): 39 | return [_PROGRAM_SETTINGS[prog] for prog in ("gzip", "pigz") if shutil.which(prog)] 40 | 41 | 42 | def available_bzip2_programs(): 43 | if shutil.which("pbzip2"): 44 | return [_PROGRAM_SETTINGS["pbzip2"]] 45 | return [] 46 | 47 | 48 | def available_xz_programs(): 49 | if shutil.which("xz"): 50 | return [_PROGRAM_SETTINGS["xz"]] 51 | return [] 52 | 53 | 54 | def available_zstd_programs(): 55 | if shutil.which("zstd"): 56 | return [_PROGRAM_SETTINGS["zstd"]] 57 | return [] 58 | 59 | 60 | PIPED_GZIP_PROGRAMS = available_gzip_programs() 61 | PIPED_BZIP2_PROGRAMS = available_bzip2_programs() 62 | PIPED_XZ_PROGRAMS = available_xz_programs() 63 | PIPED_ZST_PROGRAMS = available_zstd_programs() 64 | 65 | ALL_PROGRAMS_WITH_EXTENSION = ( 66 | list(zip(PIPED_GZIP_PROGRAMS, cycle([".gz"]))) 67 | + list(zip(PIPED_BZIP2_PROGRAMS, cycle([".bz2"]))) 68 | + list(zip(PIPED_XZ_PROGRAMS, cycle([".xz"]))) 69 | + list(zip(PIPED_ZST_PROGRAMS, cycle([".zst"]))) 70 | ) 71 | 72 | 73 | THREADED_PROGRAMS = [ 74 | settings 75 | for settings in ALL_PROGRAMS_WITH_EXTENSION 76 | if "pbzip2" in 
@pytest.mark.parametrize("threads", [None, 1, 2])
def test_piped_reader_iter(threads, threaded_reader):
    """
    Iterating over a reader yields the file's lines for every thread setting.

    The parametrized ``threads`` value is passed on to the reader, so each
    variant exercises a different thread configuration.
    """
    program_settings, extension = threaded_reader
    with _PipedCompressionProgram(
        TEST_DIR / f"file.txt{extension}",
        "rb",
        threads=threads,
        program_settings=program_settings,
    ) as f:
        lines = list(f)
        assert lines[0] == CONTENT_LINES[0]
_PipedCompressionProgram( 208 | tmp_path / "out.xz", 209 | mode="w", 210 | compresslevel=17, 211 | program_settings=_PROGRAM_SETTINGS["xz"], 212 | ) as f: 213 | f.write(b"hello") # pragma: no cover 214 | assert "compresslevel must be" in e.value.args[0] 215 | 216 | 217 | def test_invalid_zstd_compression_level(tmp_path): 218 | with pytest.raises(ValueError) as e: 219 | with _PipedCompressionProgram( 220 | tmp_path / "out.zst", 221 | mode="w", 222 | compresslevel=25, 223 | program_settings=_PROGRAM_SETTINGS["zstd"], 224 | ) as f: 225 | f.write(b"hello") # pragma: no cover 226 | assert "compresslevel must be" in e.value.args[0] 227 | 228 | 229 | def test_readers_read(reader): 230 | program_settings, extension = reader 231 | with _PipedCompressionProgram( 232 | TEST_DIR / f"file.txt{extension}", "rb", program_settings=program_settings 233 | ) as f: 234 | assert f.read() == CONTENT 235 | 236 | 237 | @pytest.mark.skipif( 238 | not hasattr(fcntl, "F_GETPIPE_SZ") or _MAX_PIPE_SIZE is None, 239 | reason="Pipe size modifications not available on this platform.", 240 | ) 241 | def test_pipesize_changed(tmp_path): 242 | # Higher compression level to avoid opening with threaded opener 243 | with _PipedCompressionProgram(tmp_path / "hello.gz", "wb", compresslevel=5) as f: 244 | assert fcntl.fcntl(f._file.fileno(), fcntl.F_GETPIPE_SZ) == _MAX_PIPE_SIZE 245 | 246 | 247 | def test_pipedcompressionwriter_wrong_mode(tmp_path): 248 | with pytest.raises(ValueError) as error: 249 | _PipedCompressionProgram(tmp_path / "test", "xb") 250 | error.match("Mode is 'xb', but it must be") 251 | 252 | 253 | def test_pipedcompressionwriter_wrong_program(tmp_path): 254 | with pytest.raises(OSError): 255 | _PipedCompressionProgram( 256 | tmp_path / "test", "wb", program_settings=_ProgramSettings(("XVXCLSKDLA",)) 257 | ) 258 | 259 | 260 | def test_compression_level(tmp_path, gzip_writer): 261 | # Currently only the gzip writers handle compression levels. 
262 | path = tmp_path / "test.gz" 263 | with _PipedCompressionProgram( 264 | path, "wb", 2, program_settings=gzip_writer 265 | ) as test_h: 266 | test_h.write(b"test") 267 | assert gzip.decompress(path.read_bytes()) == b"test" 268 | 269 | 270 | def test_iter_method_writers(writer, tmp_path): 271 | program_settings, extension = writer 272 | writer = _PipedCompressionProgram( 273 | tmp_path / f"test{extension}", "wb", program_settings=program_settings 274 | ) 275 | assert iter(writer) == writer 276 | writer.close() 277 | 278 | 279 | def test_next_method_writers(writer, tmp_path): 280 | program_settings, extension = writer 281 | writer = _PipedCompressionProgram( 282 | tmp_path / f"test{extension}", "wb", program_settings=program_settings 283 | ) 284 | with pytest.raises(io.UnsupportedOperation) as error: 285 | next(writer) 286 | error.match("read") 287 | writer.close() 288 | 289 | 290 | def test_pipedcompressionprogram_wrong_mode(): 291 | with pytest.raises(ValueError) as error: 292 | _PipedCompressionProgram("test", "xb") 293 | error.match("Mode is 'xb', but it must be") 294 | 295 | 296 | def test_piped_compression_reader_peek_binary(reader): 297 | program_settings, extension = reader 298 | filegz = TEST_DIR / f"file.txt{extension}" 299 | with _PipedCompressionProgram( 300 | filegz, "rb", program_settings=program_settings 301 | ) as read_h: 302 | # Peek returns at least the amount of characters but maybe more 303 | # depending on underlying stream. Hence startswith not ==. 
304 | assert read_h.peek(1).startswith(b"T") 305 | 306 | 307 | @pytest.mark.skipif( 308 | sys.platform != "win32", reason="seeking only works on Windows for now" 309 | ) 310 | def test_piped_compression_reader_seek_and_tell(reader): 311 | program_settings, extension = reader 312 | filegz = TEST_DIR / f"file.txt{extension}" 313 | with _PipedCompressionProgram(filegz, "rb", program_settings=program_settings) as f: 314 | original_position = f.tell() 315 | assert f.read(4) == b"Test" 316 | f.seek(original_position) 317 | assert f.read(8) == b"Testing," 318 | 319 | 320 | @pytest.mark.parametrize("mode", ["r", "rb"]) 321 | def test_piped_compression_reader_peek_text(reader, mode): 322 | program_settings, extension = reader 323 | compressed_file = TEST_DIR / f"file.txt{extension}" 324 | with _PipedCompressionProgram( 325 | compressed_file, mode, program_settings=program_settings 326 | ) as read_h: 327 | assert read_h.peek(1)[0] == CONTENT[0] 328 | 329 | 330 | def writers_and_levels(): 331 | for writer in PIPED_GZIP_PROGRAMS: 332 | if "gzip" in writer.program_args: 333 | # Levels 1-9 are supported 334 | yield from ((writer, i) for i in range(1, 10)) 335 | elif "pigz" in writer.program_args: 336 | # Levels 0-9 + 11 are supported 337 | yield from ((writer, i) for i in list(range(10)) + [11]) 338 | else: 339 | raise NotImplementedError( 340 | f"Test should be implemented for " f"{writer}" 341 | ) # pragma: no cover 342 | 343 | 344 | @pytest.mark.parametrize(["writer", "level"], writers_and_levels()) 345 | def test_valid_compression_levels(writer, level, tmp_path): 346 | path = tmp_path / "test.gz" 347 | with _PipedCompressionProgram(path, "wb", level, program_settings=writer) as handle: 348 | handle.write(b"test") 349 | assert gzip.decompress(path.read_bytes()) == b"test" 350 | 351 | 352 | def test_reproducible_gzip_compression(gzip_writer, tmp_path): 353 | path = tmp_path / "file.gz" 354 | with _PipedCompressionProgram(path, mode="wb", program_settings=gzip_writer) as f: 355 
| f.write(b"hello") 356 | 357 | data = path.read_bytes() 358 | assert (data[3] & gzip.FNAME) == 0, "gzip header contains file name" 359 | assert data[4:8] == b"\0\0\0\0", "gzip header contains mtime" 360 | 361 | 362 | def test_piped_tool_fails_on_close(tmp_path): 363 | # This test exercises the retcode != 0 case in PipedCompressionWriter.close() 364 | with pytest.raises(OSError) as e: 365 | with _PipedCompressionProgram( 366 | tmp_path / "out.txt", 367 | "wb", 368 | program_settings=_ProgramSettings( 369 | ( 370 | sys.executable, 371 | "-c", 372 | "import sys\nfor line in sys.stdin: pass\nprint()\nsys.exit(5)", 373 | ), 374 | ), 375 | ) as f: 376 | f.write(b"Hello") 377 | assert "exit code 5" in e.value.args[0] 378 | -------------------------------------------------------------------------------- /tests/test_xopen.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the xopen.xopen function 3 | """ 4 | import bz2 5 | import subprocess 6 | import sys 7 | import tempfile 8 | from contextlib import contextmanager 9 | import functools 10 | import gzip 11 | import io 12 | import lzma 13 | import os 14 | from pathlib import Path 15 | import shutil 16 | 17 | import pytest 18 | 19 | from xopen import xopen, _detect_format_from_content 20 | 21 | try: 22 | import zstandard 23 | except ImportError: 24 | zstandard = None 25 | 26 | 27 | # TODO this is duplicated in test_piped.py 28 | TEST_DIR = Path(__file__).parent 29 | CONTENT_LINES = ["Testing, testing ...\n", "The second line.\n"] 30 | CONTENT = "".join(CONTENT_LINES) 31 | extensions = ["", ".gz", ".bz2", ".xz"] 32 | if shutil.which("zstd") or zstandard: 33 | extensions += [".zst"] 34 | base = os.path.join(os.path.dirname(__file__), "file.txt") 35 | files = [base + ext for ext in extensions] 36 | 37 | 38 | @contextmanager 39 | def disable_binary(tmp_path, binary_name): 40 | """ 41 | Find the location of the binary by its name, then set PATH to a directory that contains 
42 | the binary with permissions set to 000. If no suitable binary could be found, 43 | PATH is set to an empty directory 44 | """ 45 | binary_path = shutil.which(binary_name) 46 | if binary_path: 47 | shutil.copy(binary_path, tmp_path) 48 | os.chmod(tmp_path / Path(binary_path).name, 0) 49 | path = os.environ["PATH"] 50 | try: 51 | os.environ["PATH"] = str(tmp_path) 52 | yield 53 | finally: 54 | os.environ["PATH"] = path 55 | 56 | 57 | @pytest.fixture(params=extensions) 58 | def ext(request): 59 | return request.param 60 | 61 | 62 | @pytest.fixture(params=files) 63 | def fname(request): 64 | return request.param 65 | 66 | 67 | @pytest.fixture 68 | def lacking_pigz_permissions(tmp_path): 69 | with disable_binary(tmp_path, "pigz"): 70 | yield 71 | 72 | 73 | @pytest.fixture 74 | def lacking_pbzip2_permissions(tmp_path): 75 | with disable_binary(tmp_path, "pbzip2"): 76 | yield 77 | 78 | 79 | @pytest.fixture 80 | def lacking_xz_permissions(tmp_path): 81 | with disable_binary(tmp_path, "xz"): 82 | yield 83 | 84 | 85 | @pytest.fixture 86 | def xopen_without_igzip(monkeypatch): 87 | import xopen # xopen local overrides xopen global variable 88 | 89 | monkeypatch.setattr(xopen, "igzip", None) 90 | return xopen.xopen 91 | 92 | 93 | def test_text(fname): 94 | with xopen(fname, "rt") as f: 95 | lines = list(f) 96 | assert len(lines) == 2 97 | assert lines[1] == "The second line.\n", fname 98 | 99 | 100 | def test_binary(fname): 101 | with xopen(fname, "rb") as f: 102 | lines = list(f) 103 | assert len(lines) == 2 104 | assert lines[1] == b"The second line.\n", fname 105 | 106 | 107 | @pytest.mark.parametrize("mode", ["b", "", "t"]) 108 | @pytest.mark.parametrize("threads", [None, 0]) 109 | def test_roundtrip(ext, tmp_path, threads, mode): 110 | if ext == ".zst" and threads == 0 and zstandard is None: 111 | return 112 | path = tmp_path / f"file{ext}" 113 | data = b"Hello" if mode == "b" else "Hello" 114 | with xopen(path, "w" + mode, threads=threads) as f: 115 | f.write(data) 
116 | with xopen(path, "r" + mode, threads=threads) as f: 117 | assert f.read() == data 118 | 119 | 120 | def test_binary_no_isal_no_threads(fname, xopen_without_igzip): 121 | if fname.endswith(".zst") and zstandard is None: 122 | return 123 | with xopen_without_igzip(fname, "rb", threads=0) as f: 124 | lines = list(f) 125 | assert len(lines) == 2 126 | assert lines[1] == b"The second line.\n", fname 127 | 128 | 129 | def test_binary_no_isal(fname, xopen_without_igzip): 130 | with xopen_without_igzip(fname, "rb", threads=1) as f: 131 | lines = list(f) 132 | assert len(lines) == 2 133 | assert lines[1] == b"The second line.\n", fname 134 | 135 | 136 | def test_no_context_manager_text(fname): 137 | f = xopen(fname, "rt") 138 | lines = list(f) 139 | assert len(lines) == 2 140 | assert lines[1] == "The second line.\n", fname 141 | f.close() 142 | assert f.closed 143 | 144 | 145 | def test_no_context_manager_binary(fname): 146 | f = xopen(fname, "rb") 147 | lines = list(f) 148 | assert len(lines) == 2 149 | assert lines[1] == b"The second line.\n", fname 150 | f.close() 151 | assert f.closed 152 | 153 | 154 | def test_bytes_path(fname): 155 | path = fname.encode("utf-8") 156 | with xopen(path, "rt") as f: 157 | lines = list(f) 158 | assert len(lines) == 2 159 | assert lines[1] == "The second line.\n", fname 160 | 161 | 162 | def test_readinto(fname): 163 | content = CONTENT.encode("utf-8") 164 | with xopen(fname, "rb") as f: 165 | b = bytearray(len(content) + 100) 166 | length = f.readinto(b) 167 | assert length == len(content) 168 | assert b[:length] == content 169 | 170 | 171 | def test_detect_format_from_content(ext): 172 | with open(Path(__file__).parent / f"file.txt{ext}", "rb") as f: 173 | detected = _detect_format_from_content(f) 174 | if ext == "": 175 | assert detected is None 176 | else: 177 | assert ext[1:] == detected 178 | 179 | 180 | def test_detect_file_format_from_content(ext, tmp_path): 181 | path = tmp_path / f"file.txt{ext}.test" 182 | 
shutil.copy(TEST_DIR / f"file.txt{ext}", path) 183 | with xopen(path, "rb") as fh: 184 | assert fh.readline() == CONTENT_LINES[0].encode("utf-8") 185 | 186 | 187 | def test_readline(fname): 188 | first_line = CONTENT_LINES[0].encode("utf-8") 189 | with xopen(fname, "rb") as f: 190 | assert f.readline() == first_line 191 | 192 | 193 | def test_readline_text(fname): 194 | with xopen(fname, "r") as f: 195 | assert f.readline() == CONTENT_LINES[0] 196 | 197 | 198 | def test_next(fname): 199 | with xopen(fname, "rt") as f: 200 | _ = next(f) 201 | line2 = next(f) 202 | assert line2 == "The second line.\n", fname 203 | 204 | 205 | def test_has_iter_method(ext, tmp_path): 206 | path = tmp_path / f"out{ext}" 207 | with xopen(path, mode="w") as f: 208 | # Writing anything isn’t strictly necessary, but if we don’t, then 209 | # pbzip2 causes a delay of one second 210 | f.write("hello") 211 | assert hasattr(f, "__iter__") 212 | 213 | 214 | def test_iter_without_with(fname): 215 | f = xopen(fname, "rt") 216 | it = iter(f) 217 | assert CONTENT_LINES[0] == next(it) 218 | f.close() 219 | 220 | 221 | @pytest.mark.parametrize("extension", [".gz", ".bz2"]) 222 | def test_partial_iteration_closes_correctly(extension, create_large_file): 223 | class LineReader: 224 | def __init__(self, file): 225 | self.file = xopen(file, "rb") 226 | 227 | def __iter__(self): 228 | wrapper = io.TextIOWrapper(self.file, encoding="utf-8") 229 | yield from wrapper 230 | 231 | large_file = create_large_file(extension) 232 | f = LineReader(large_file) 233 | next(iter(f)) 234 | f.file.close() 235 | 236 | 237 | def test_nonexisting_file(ext): 238 | with pytest.raises(IOError): 239 | with xopen("this-file-does-not-exist" + ext): 240 | pass # pragma: no cover 241 | 242 | 243 | def test_write_to_nonexisting_dir(ext): 244 | with pytest.raises(IOError): 245 | with xopen("this/path/does/not/exist/file.txt" + ext, "w"): 246 | pass # pragma: no cover 247 | 248 | 249 | def test_invalid_mode(ext): 250 | with 
pytest.raises(ValueError): 251 | with xopen(TEST_DIR / f"file.txt.{ext}", mode="hallo"): 252 | pass # pragma: no cover 253 | 254 | 255 | def test_filename_invalid_type(): 256 | with pytest.raises(TypeError): 257 | with xopen(123, mode="r"): 258 | pass # pragma: no cover 259 | 260 | 261 | def test_invalid_compression_level(tmp_path): 262 | with pytest.raises(ValueError) as e: 263 | with xopen(tmp_path / "out.gz", mode="w", compresslevel=17) as f: 264 | f.write("hello") # pragma: no cover 265 | assert "compresslevel must be" in e.value.args[0] 266 | 267 | 268 | @pytest.mark.parametrize("ext", extensions) 269 | @pytest.mark.parametrize("threads", (0, 1)) 270 | def test_append(ext, threads, tmp_path): 271 | if ext == ".zst" and zstandard is None and threads == 0: 272 | pytest.skip("No zstandard installed") 273 | text = b"AB" 274 | reference = text + text 275 | path = tmp_path / f"the-file{ext}" 276 | with xopen(path, "ab", threads=threads) as f: 277 | f.write(text) 278 | with xopen(path, "ab", threads=threads) as f: 279 | f.write(text) 280 | with xopen(path, "r") as f: 281 | for appended in f: 282 | pass 283 | reference = reference.decode("utf-8") 284 | assert appended == reference 285 | 286 | 287 | @pytest.mark.parametrize("ext", extensions) 288 | def test_append_text(ext, tmp_path): 289 | text = "AB" 290 | reference = text + text 291 | path = tmp_path / f"the-file{ext}" 292 | with xopen(path, "at") as f: 293 | f.write(text) 294 | with xopen(path, "at") as f: 295 | f.write(text) 296 | with xopen(path, "rt") as f: 297 | for appended in f: 298 | pass 299 | assert appended == reference 300 | 301 | 302 | @pytest.mark.timeout(5) 303 | @pytest.mark.parametrize("extension", [".gz", ".bz2", ".xz"]) 304 | def test_truncated_file(extension, create_truncated_file): 305 | truncated_file = create_truncated_file(extension) 306 | with pytest.raises((EOFError, IOError)): 307 | f = xopen(truncated_file, "r") 308 | f.read() 309 | f.close() # pragma: no cover 310 | 311 | 312 | 
@pytest.mark.timeout(5)
@pytest.mark.parametrize("extension", [".gz", ".bz2", ".xz"])
def test_truncated_iter(extension, create_truncated_file):
    """Iterating over a truncated compressed file raises EOFError or IOError."""
    truncated_file = create_truncated_file(extension)
    with pytest.raises((EOFError, IOError)):
        f = xopen(truncated_file, "r")
        for line in f:
            pass
        f.close()  # pragma: no cover


@pytest.mark.timeout(5)
@pytest.mark.parametrize("extension", [".gz", ".bz2", ".xz"])
def test_truncated_with(extension, create_truncated_file):
    """Reading a truncated file inside a ``with`` block raises EOFError or IOError."""
    truncated_file = create_truncated_file(extension)
    with pytest.raises((EOFError, IOError)):
        with xopen(truncated_file, "r") as f:
            f.read()


@pytest.mark.timeout(5)
@pytest.mark.parametrize("extension", [".gz", ".bz2", ".xz"])
def test_truncated_iter_with(extension, create_truncated_file):
    """Iterating a truncated file inside a ``with`` block raises EOFError or IOError."""
    truncated_file = create_truncated_file(extension)
    with pytest.raises((EOFError, IOError)):
        with xopen(truncated_file, "r") as f:
            for line in f:
                pass


def test_bare_read_from_gz():
    """A plain ``read()`` on ``hello.gz`` in text mode returns its content."""
    hello_file = TEST_DIR / "hello.gz"
    with xopen(hello_file, "rt") as f:
        assert f.read() == "hello"


@pytest.mark.parametrize("threads", [None, 0, 2])
def test_concatenated_gzip(tmp_path, threads):
    """A gzip file made of two appended members is read as one stream."""
    path = tmp_path / "hello.gz"
    with gzip.open(path, mode="wt") as f:
        print("Hello", file=f)
    with gzip.open(path, mode="at") as f:
        print("world", file=f)

    with xopen(path, threads=threads) as f:
        lines = list(f)

    assert lines == ["Hello\n", "world\n"]


def test_read_no_threads(ext):
    """With ``threads=0``, reading returns the expected in-process file class."""
    klasses = {
        ".bz2": bz2.BZ2File,
        ".gz": gzip.GzipFile,
        ".xz": lzma.LZMAFile,
        ".zst": io.BufferedReader,
        "": io.BufferedReader,
    }
    if ext == ".zst" and zstandard is None:
        pytest.skip("No zstandard installed")
    klass = klasses[ext]
    with xopen(TEST_DIR / f"file.txt{ext}", "rb", threads=0) as f:
        assert isinstance(f, klass), f


def test_write_threads(tmp_path, ext):
    """Text written with ``threads=3`` can be read back unchanged."""
    # ``ext`` already starts with a dot (or is empty); do not add another one.
    path = tmp_path / f"out{ext}"
    with xopen(path, mode="w", threads=3) as f:
        f.write("hello")
    with xopen(path) as f:
        assert f.read() == "hello"


def test_write_pigz_threads_no_isal(tmp_path, xopen_without_igzip):
    """Threaded gzip writing round-trips when ``igzip`` is patched out."""
    path = tmp_path / "out.gz"
    with xopen_without_igzip(path, mode="w", threads=3) as f:
        f.write("hello")
    with xopen_without_igzip(path) as f:
        assert f.read() == "hello"


def test_write_no_threads(tmp_path, ext):
    """With ``threads=0``, writing uses the expected in-process file class."""
    klasses = {
        ".bz2": bz2.BZ2File,
        ".gz": gzip.GzipFile,
        ".xz": lzma.LZMAFile,
        "": io.BufferedWriter,
    }
    if ext == ".zst":
        # Skip zst because if python-zstandard is not installed,
        # we fall back to an external process even when threads=0
        pytest.skip("zst may fall back to an external process")
    klass = klasses[ext]
    with xopen(tmp_path / f"out{ext}", "wb", threads=0) as f:
        if isinstance(f, io.BufferedWriter):
            # The compressed file object may be wrapped in a BufferedWriter.
            if ext:
                assert isinstance(f.raw, klass), f
        else:
            if ext:
                assert isinstance(f, klass)


def test_write_gzip_no_threads_no_isal(tmp_path, xopen_without_igzip):
    """Without igzip and threads, gzip writing ends up in ``gzip.GzipFile``."""
    with xopen_without_igzip(tmp_path / "out.gz", "wb", threads=0) as f:
        if sys.version_info >= (3, 12):
            assert isinstance(f, gzip.GzipFile), f
        else:
            assert isinstance(f.raw, gzip.GzipFile)


def test_write_stdout():
    """Closing a handle opened on ``"-"`` leaves stdout usable."""
    f = xopen("-", mode="w")
    print("Hello", file=f)
    f.close()
    # ensure stdout is not closed
    print("Still there?")


def test_write_stdout_contextmanager():
    """Leaving a ``with xopen("-")`` block leaves stdout usable."""
    # Do not close stdout
    with xopen("-", mode="w") as f:
        print("Hello", file=f)
    # ensure stdout is not closed
    print("Still there?")


def test_read_pathlib(fname):
    """``xopen`` accepts a ``pathlib.Path`` for text reading."""
    path = Path(fname)
    with xopen(path, mode="rt") as f:
        assert f.read() == CONTENT


def test_read_pathlib_binary(fname):
    """``xopen`` accepts a ``pathlib.Path`` for binary reading."""
    path = Path(fname)
    with xopen(path, mode="rb") as f:
        assert f.read() == bytes(CONTENT, "ascii")


def test_write_pathlib(ext, tmp_path):
    """Text written to a ``pathlib.Path`` can be read back."""
    path = tmp_path / f"hello.txt{ext}"
    with xopen(path, mode="wt") as f:
        f.write("hello")
    with xopen(path, mode="rt") as f:
        assert f.read() == "hello"


def test_write_pathlib_binary(ext, tmp_path):
    """Bytes written to a ``pathlib.Path`` can be read back."""
    path = tmp_path / f"hello.txt{ext}"
    with xopen(path, mode="wb") as f:
        f.write(b"hello")
    with xopen(path, mode="rb") as f:
        assert f.read() == b"hello"


def test_falls_back_to_gzip_open(lacking_pigz_permissions):
    """Reading ``.gz`` still works when pigz cannot be executed."""
    with xopen(TEST_DIR / "file.txt.gz", "rb") as f:
        assert f.readline() == CONTENT_LINES[0].encode("utf-8")


def test_falls_back_to_gzip_open_no_isal(lacking_pigz_permissions, xopen_without_igzip):
    """Reading ``.gz`` still works without an executable pigz and without igzip."""
    with xopen_without_igzip(TEST_DIR / "file.txt.gz", "rb") as f:
        assert f.readline() == CONTENT_LINES[0].encode("utf-8")


def test_fals_back_to_gzip_open_write_no_isal(
    lacking_pigz_permissions, xopen_without_igzip, tmp_path
):
    """Writing ``.gz`` still works without an executable pigz and without igzip."""
    tmp = tmp_path / "test.gz"
    with xopen_without_igzip(tmp, "wb") as f:
        f.write(b"hello")
    assert gzip.decompress(tmp.read_bytes()) == b"hello"


def test_falls_back_to_bzip2_open(lacking_pbzip2_permissions):
    """Reading ``.bz2`` still works when pbzip2 cannot be executed."""
    with xopen(TEST_DIR / "file.txt.bz2", "rb") as f:
        assert f.readline() == CONTENT_LINES[0].encode("utf-8")


def test_falls_back_to_lzma_open(lacking_xz_permissions):
    """Reading ``.xz`` still works when xz cannot be executed."""
    with xopen(TEST_DIR / "file.txt.xz", "rb") as f:
        assert f.readline() == CONTENT_LINES[0].encode("utf-8")


def test_open_many_writers(tmp_path, ext):
    """Many writers with ``threads=2`` can be open at the same time."""
    files = []
    # Because lzma.open allocates a lot of memory,
    # open fewer files to avoid MemoryError on 32-bit architectures
    n = 21 if ext == ".xz" else 61
    for i in range(1, n):
        path = tmp_path / f"{i:03d}.txt{ext}"
        f = xopen(path, "wb", threads=2)
        f.write(b"hello")
        files.append(f)
    for f in files:
        f.close()


def test_override_output_format(tmp_path):
    """``format="gz"`` produces gzip output even without a ``.gz`` extension."""
    path = tmp_path / "test_gzip_compressed"
    with xopen(path, mode="wb", format="gz") as f:
        f.write(b"test")
    test_contents = path.read_bytes()
    assert test_contents.startswith(b"\x1f\x8b")  # Gzip magic
    assert gzip.decompress(test_contents) == b"test"


def test_override_output_format_unsupported_format(tmp_path):
    """An unknown ``format`` raises ValueError naming the offending format."""
    path = tmp_path / "test_fairy_format_compressed"
    with pytest.raises(ValueError) as error:
        xopen(path, mode="wb", format="fairy")
    # These checks must run after the ``raises`` block has been left.
    error.match("not supported")
    error.match("fairy")


def test_override_output_format_wrong_format(tmp_path):
    """Forcing ``format="gz"`` on an uncompressed file raises OSError on read."""
    path = tmp_path / "not_compressed"
    path.write_text("I am not compressed.", encoding="utf-8")
    with pytest.raises(OSError):  # BadGzipFile is a subclass of OSError
        with xopen(path, "rt", format="gz") as opened_file:
            opened_file.read()


# Test for threaded and non-threaded.
OPENERS = (xopen, functools.partial(xopen, threads=0))


@pytest.mark.parametrize("opener", OPENERS)
@pytest.mark.parametrize("extension", extensions)
def test_text_encoding_newline_passthrough(opener, extension, tmp_path):
    """``encoding`` and ``newline`` are honoured when reading in text mode."""
    if extension == ".zst" and zstandard is None:
        pytest.skip("No zstandard installed")
    # "Eén ree\nTwee reeën\n" latin-1 encoded with \r for as line separator.
    encoded_text = b"E\xe9n ree\rTwee ree\xebn\r"
    path = tmp_path / f"test.txt{extension}"
    with opener(path, "wb") as f:
        f.write(encoded_text)
    with opener(path, "rt", encoding="latin-1", newline="\r") as f:
        result = f.read()
    assert result == "Eén ree\rTwee reeën\r"


@pytest.mark.parametrize("opener", OPENERS)
@pytest.mark.parametrize("extension", extensions)
def test_text_encoding_errors(opener, extension, tmp_path):
    """``errors="replace"`` is honoured when reading in text mode."""
    if extension == ".zst" and zstandard is None:
        pytest.skip("No zstandard installed")
    # "Eén ree\nTwee reeën\n" latin-1 encoded. This is not valid ascii.
    encoded_text = b"E\xe9n ree\nTwee ree\xebn\n"
    path = tmp_path / f"test.txt{extension}"
    with opener(path, "wb") as f:
        f.write(encoded_text)
    with opener(path, "rt", encoding="ascii", errors="replace") as f:
        result = f.read()
    assert result == "E�n ree\nTwee ree�n\n"


@pytest.mark.parametrize("compresslevel", [1, 6])
def test_gzip_compression_is_reproducible_without_piping(tmp_path, compresslevel):
    """The gzip header written with ``threads=0`` has no file name and no mtime."""
    # compresslevel 1 should give us igzip and 6 should give us regular gzip
    path = tmp_path / "test.gz"
    with xopen(path, mode="wb", compresslevel=compresslevel, threads=0) as f:
        f.write(b"hello")
    data = path.read_bytes()
    assert (data[3] & gzip.FNAME) == 0, "gzip header contains file name"
    assert data[4:8] == b"\0\0\0\0", "gzip header contains mtime"


def test_read_devnull():
    """Opening ``os.devnull`` for reading does not fail."""
    with xopen(os.devnull):
        pass


def test_xopen_zst_fails_when_zstandard_not_available(monkeypatch):
    """Reading ``.zst`` with ``threads=0`` raises ImportError without zstandard."""
    import xopen

    monkeypatch.setattr(xopen, "zstandard", None)
    with pytest.raises(ImportError):
        with xopen.xopen(TEST_DIR / "file.txt.zst", mode="rb", threads=0) as f:
            f.read()


@pytest.mark.parametrize("threads", (0, 1))
def test_xopen_zst_long_window_size(threads):
    """A zstd file compressed with ``--long=31`` can be read."""
    if threads == 0 and zstandard is None:
        pytest.skip("No zstandard installed")
    elif threads == 1 and not shutil.which("zstd"):
        pytest.skip("No zstd binary available")
    # File created with:
    # cat /dev/zero | head -c 2147483648 > only_zeroes
    # Then compressed with
    # zstd --long=31 -19 only_zeroes
    test_zst = Path(__file__).parent / "only_zeroes.zst"
    with xopen(test_zst, "rb", threads=threads) as f:
        data = f.read(1024)
    assert data == bytes(1024)


@pytest.mark.parametrize("threads", (0, 1))
@pytest.mark.parametrize("ext", extensions)
def test_pass_file_object_for_reading(ext, threads):
    """``xopen`` accepts an already opened binary file object for reading."""
    if ext == ".zst" and zstandard is None:
        pytest.skip("No zstandard installed")

    with open(TEST_DIR / f"file.txt{ext}", "rb") as fh:
        with xopen(fh, mode="rb", threads=threads) as f:
            assert f.readline() == CONTENT_LINES[0].encode("utf-8")


@pytest.mark.parametrize("threads", (0, 1))
@pytest.mark.parametrize("ext", extensions)
def test_pass_file_object_for_writing(tmp_path, ext, threads):
    """``xopen`` accepts an already opened binary file object for writing."""
    if ext == ".zst" and zstandard is None:
        pytest.skip("No zstandard installed")
    first_line = CONTENT_LINES[0].encode("utf-8")
    # The extension must really be part of the file name; a missing f-prefix
    # used to create a literal "out{ext}" file for every parametrization.
    path = tmp_path / f"out{ext}"
    with open(path, "wb") as fh:
        with xopen(fh, "wb", threads=threads) as f:
            f.write(first_line)
    with xopen(path, "rb", threads=threads) as fh:
        assert fh.readline() == first_line


@pytest.mark.parametrize("threads", (0, 1))
@pytest.mark.parametrize("ext", extensions)
def test_pass_bytesio_for_reading_and_writing(ext, threads):
    """An ``io.BytesIO`` can be written to and read from, and is left open."""
    filelike = io.BytesIO()
    format = ext[1:]
    if ext == "":
        format = None
    if ext == ".zst" and zstandard is None:
        pytest.skip("No zstandard installed")
    first_line = CONTENT_LINES[0].encode("utf-8")
    writer = xopen(filelike, "wb", format=format, threads=threads)
    writer.write(first_line)
    if writer is not filelike:
        writer.close()
    assert not filelike.closed
    filelike.seek(0)
    with xopen(filelike, "rb", format=format, threads=threads) as fh:
        assert fh.readline() == first_line


@pytest.mark.parametrize("threads", (0, 1))
def test_xopen_stdin(monkeypatch, ext, threads):
    """Opening ``"-"`` for reading takes its data from the patched ``sys.stdin``."""
    if ext == ".zst" and zstandard is None:
        pytest.skip("No zstandard installed")
    # Add encoding to suppress encoding warnings
    with open(TEST_DIR / f"file.txt{ext}", "rt", encoding="latin-1") as in_file:
        monkeypatch.setattr("sys.stdin", in_file)
        with xopen("-", "rt", threads=threads) as f:
            data = f.read()
            assert data == CONTENT


def test_xopen_stdout(monkeypatch):
    """Opening ``"-"`` for writing sends its data to the patched ``sys.stdout``."""
    # Add encoding to suppress encoding warnings
    with tempfile.TemporaryFile(mode="w+t", encoding="latin-1") as raw:
        monkeypatch.setattr("sys.stdout", raw)
        with xopen("-", "wt") as f:
            f.write("Hello world!")
        raw.seek(0)
        data = raw.read()
        assert data == "Hello world!"


@pytest.mark.parametrize("threads", (0, 1))
def test_xopen_read_from_pipe(ext, threads):
    """``xopen`` can read from the stdout pipe of a ``cat`` subprocess."""
    if ext == ".zst" and zstandard is None:
        pytest.skip("No zstandard installed")
    in_file = TEST_DIR / f"file.txt{ext}"
    process = subprocess.Popen(("cat", str(in_file)), stdout=subprocess.PIPE)
    with xopen(process.stdout, "rt", threads=threads) as f:
        data = f.read()
    process.wait()
    process.stdout.close()
    assert data == CONTENT


@pytest.mark.parametrize("threads", (0, 1))
def test_xopen_write_to_pipe(threads, ext):
    """Data written to the stdin pipe of ``cat`` is read back from its stdout."""
    if ext == ".zst" and zstandard is None:
        pytest.skip("No zstandard installed")
    format = ext.lstrip(".")
    if format == "":
        format = None
    process = subprocess.Popen(("cat",), stdout=subprocess.PIPE, stdin=subprocess.PIPE)
    with xopen(process.stdin, "wt", threads=threads, format=format) as f:
        f.write(CONTENT)
    process.stdin.close()
    with xopen(process.stdout, "rt", threads=threads) as f:
        data = f.read()
    process.wait()
    process.stdout.close()
    assert data == CONTENT


| 709 | @pytest.mark.skipif( 710 | not os.path.exists("/dev/stdin"), reason="/dev/stdin does not exist" 711 | ) 712 | @pytest.mark.parametrize("threads", (0, 1)) 713 | def test_xopen_dev_stdin_read(threads, ext): 714 | if ext == ".zst" and zstandard is None: 715 | return 716 | file = str(Path(__file__).parent / f"file.txt{ext}") 717 | result = subprocess.run( 718 | f"cat {file} | python -c 'import xopen; " 719 | f'f=xopen.xopen("/dev/stdin", "rt", threads={threads});print(f.read())\'', 720 | shell=True, 721 | stdout=subprocess.PIPE, 722 | encoding="ascii", 723 | ) 724 | assert result.stdout == CONTENT + "\n" 725 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = black,flake8,mypy,py39,py310,py311,py312,py313,pypy3 3 | isolated_build = True 4 | 5 | [testenv] 6 | deps = 7 | pytest 8 | pytest-timeout 9 | coverage 10 | setenv = 11 | PYTHONDEVMODE = 1 12 | PYTHONWARNDEFAULTENCODING = 1 13 | commands = 14 | coverage run --branch --source=xopen,tests -m pytest -v --doctest-modules tests 15 | coverage report 16 | coverage xml 17 | coverage html 18 | 19 | [testenv:zstd] 20 | deps = 21 | {[testenv]deps} 22 | zstandard 23 | 24 | [testenv:no-libs] 25 | commands= 26 | pip uninstall -y isal zlib-ng 27 | {[testenv]commands} 28 | 29 | [testenv:black] 30 | basepython = python3.10 31 | deps = black==22.3.0 32 | skip_install = true 33 | commands = black --check src/ tests/ 34 | 35 | [testenv:flake8] 36 | basepython = python3.10 37 | deps = flake8 38 | commands = flake8 src/ tests/ 39 | skip_install = true 40 | 41 | [testenv:mypy] 42 | basepython = python3.10 43 | deps = mypy 44 | commands = mypy src/ 45 | 46 | [flake8] 47 | max-line-length = 99 48 | max-complexity = 10 49 | extend_ignore = E731 50 | 51 | [coverage:report] 52 | exclude_lines = 53 | pragma: no cover 54 | def __repr__ 55 | @overload 56 | 
--------------------------------------------------------------------------------