├── .editorconfig ├── .flake8 ├── .github ├── dependabot.yml └── workflows │ └── ci.yml ├── .gitignore ├── CHANGES.rst ├── CONTRIBUTING.md ├── LICENSE.txt ├── README.rst ├── asv.conf.json ├── benchmarks ├── __init__.py └── benchmarks.py ├── codecov.yml ├── docs ├── _static │ └── .gitkeep ├── changes.rst ├── conf.py ├── index.rst └── sphinxext │ └── ignore_missing_refs.py ├── mypy.ini ├── pyproject.toml ├── rle_array ├── __init__.py ├── _algorithms.py ├── _slicing.py ├── array.py ├── autoconversion.py ├── dtype.py ├── testing.py └── types.py ├── scripts ├── fmt.sh └── test.sh ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── test_algorithms.py ├── test_astype.py ├── test_autoconversion.py ├── test_builtins.py ├── test_constructors.py ├── test_dtype.py ├── test_fastpath.py ├── test_indexing.py ├── test_misc_operations.py ├── test_operators.py ├── test_pandas.py ├── test_reduce.py ├── test_regressions.py ├── test_slicing.py ├── test_testing.py ├── test_ufunc.py └── test_view.py /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | end_of_line = lf 6 | insert_final_newline = true 7 | 8 | [*.py] 9 | include_trailing_comma = true 10 | indent_size = 4 11 | indent_style = space 12 | trim_trailing_whitespace = true 13 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, W503 3 | max-line-length = 80 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 6 | exclude = 7 | build, 8 | dist 9 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | tags: 8 | - v* 9 | pull_request: 10 | 11 | env: 12 | IS_TAG: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')}} 13 | MASTER_PYTHON: "3.8" 14 | 15 | jobs: 16 | lint: 17 | runs-on: ubuntu-latest 18 | timeout-minutes: 10 19 | steps: 20 | - name: Checkout 21 | uses: actions/checkout@v2 22 | - name: Set up Python ${{ env.MASTER_PYTHON }} 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ env.MASTER_PYTHON }} 26 | - name: Install Poetry Itself 27 | run: pip install poetry 28 | - name: Poetry Install 29 | run: poetry install 30 | - name: Flake8 31 | run: poetry run flake8 32 | - name: Mypy 33 | run: poetry run mypy . 34 | - name: Black 35 | run: poetry run black --check . 36 | - name: Isort 37 | run: poetry run isort --check-only . 
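      # NOTE: shellcheck comes pre-installed on the GitHub-hosted Ubuntu runners,
      # so no extra install step is needed before this point.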
38 | - name: Shellcheck 39 | run: shellcheck scripts/*.sh 40 | 41 | test: 42 | strategy: 43 | matrix: 44 | python: ["3.6", "3.7", "3.8"] 45 | runs-on: ubuntu-latest 46 | timeout-minutes: 10 47 | steps: 48 | - name: Checkout 49 | uses: actions/checkout@v2 50 | - name: Set up Python ${{ matrix.python }} 51 | uses: actions/setup-python@v2 52 | with: 53 | python-version: ${{ matrix.python }} 54 | - name: Install Poetry Itself 55 | run: pip install poetry 56 | - name: Poetry Install 57 | run: poetry install 58 | - name: Pytest 59 | run: poetry run pytest 60 | - name: ASV 61 | run: | 62 | poetry run asv machine --machine travis --os unknown --arch unknown --cpu unknown --ram unknown 63 | poetry run asv run --show-stderr --environment existing --quick 64 | - name: Codecov 65 | uses: codecov/codecov-action@v1.2.1 66 | with: 67 | # NOTE: `token` is not required, because the rle-array repo is public 68 | file: ./coverage.xml 69 | name: pytest-${{ runner.OS }}-${{ matrix.python }} 70 | 71 | docs: 72 | runs-on: ubuntu-latest 73 | timeout-minutes: 10 74 | steps: 75 | - name: Checkout 76 | uses: actions/checkout@v2 77 | - name: Set up Python ${{ env.MASTER_PYTHON }} 78 | uses: actions/setup-python@v2 79 | with: 80 | python-version: ${{ env.MASTER_PYTHON }} 81 | - name: Install Poetry Itself 82 | run: pip install poetry 83 | - name: Poetry Install 84 | run: poetry install 85 | - name: Sphinx 86 | run: | 87 | poetry run python setup.py build_sphinx 88 | touch ./docs/_build/html/.nojekyll 89 | - name: Preserve Docs 90 | uses: actions/upload-artifact@v2.2.2 91 | with: 92 | name: docs 93 | path: docs/_build/html 94 | - name: Deploy Docs 95 | if: github.event_name == 'push' && github.event.ref == 'refs/heads/master' 96 | uses: peaceiris/actions-gh-pages@v3 97 | with: 98 | github_token: ${{ secrets.GITHUB_TOKEN }} 99 | publish_dir: ./docs/_build/html 100 | 101 | release: 102 | runs-on: ubuntu-latest 103 | needs: [lint, test, docs] 104 | steps: 105 | - name: Checkout 106 | uses: actions/checkout@v2 107 | - name: Set up Python ${{ env.MASTER_PYTHON }} 108 | uses: actions/setup-python@v2 109 | with: 110 | python-version: ${{ env.MASTER_PYTHON }} 111 | - name: Install Poetry Itself 112 | run: pip install poetry 113 | - name: Poetry Install 114 | run: poetry install 115 | - name: Build 116 | run: poetry build 117 | - name: Prepare Release Notes 118 | run: awk 'BEGIN{found=0} {if (match($0, "==============")) {if (found == 1) exit; found=1}; if (found == 1) {print last}; last=$0}' CHANGES.rst > release_notes.rst 119 | - name: Create Release Notes 120 | uses: docker://pandoc/core:2.9 121 | with: 122 | args: --from=rst --to=markdown -o release_notes.md release_notes.rst 123 | - name: Preserve Dist 124 | uses: actions/upload-artifact@v2.2.2 125 | with: 126 | name: dist 127 | path: dist 128 | - name: Preserve Release Notes 129 | uses: actions/upload-artifact@v2.2.2 130 | with: 131 | name: release_notes.md 132 | path: release_notes.md 133 | - name: Publish to PyPI 134 | if: env.IS_TAG == 'true' 135 | run: poetry publish 136 | env: 137 | POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_TOKEN }} 138 | - name: Create GitHub Release 139 | if: env.IS_TAG == 'true' 140 | uses: actions/create-release@v1.1.4 141 | env: 142 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 143 | with: 144 | tag_name: ${{ github.ref }} 145 | release_name: rle-array ${{ github.ref }} 146 | body_path: release_notes.md 147 | draft: false 148 | prerelease: false 149 | -------------------------------------------------------------------------------- /.gitignore: 
--------------------------------------------------------------------------------
1 | *.asv/
2 | *.egg-info/
3 | .coverage
4 | .mypy_cache/
5 | .pytest_cache/
6 | .venv/
7 | __pycache__/
8 | build/
9 | coverage.xml
10 | dist/
11 | docs/_build/
12 | docs/_rst/
13 | pip-wheel-metadata/
14 | poetry.lock
15 | 
--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
1 | =========
2 | Changelog
3 | =========
4 | 
5 | Version 0.1.0 (unreleased)
6 | ==========================
7 | Initial public release.
8 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How To Contribute
2 | 
3 | 
4 | ## Bugs
5 | 
6 | If you've found a bug, please report it to the issue tracker and
7 | 
8 | * Describe the bug you encountered and what the expected behavior should be.
9 | * Provide a [Minimal, Reproducible Example](https://stackoverflow.com/help/mcve) (if possible).
10 | * Be as explicit about your environment as possible, e.g. provide the output of `pip freeze` / `conda list`.
11 | 
12 | ## Code Contributions
13 | 
14 | **Unless you explicitly state otherwise, any contribution you intentionally submit for inclusion in the work shall be
15 | licensed under the MIT license, without any additional terms or conditions.**
16 | 
17 | Please file a GitHub pull request with your contribution. See the [Development](#Development) section for details on
18 | tooling. See the "Development Plan" in the README for the overall prioritization.
19 | 
20 | 
21 | ## Development
22 | 
23 | ### Installation
24 | To get started, set up a new virtual environment and install all requirements:
25 | 
26 | ```bash
27 | virtualenv --python=python3.6 .venv
28 | source .venv/bin/activate
29 | pip install poetry
30 | poetry install
31 | ```
32 | 
33 | ### Code style
34 | 
35 | To ensure a consistent code style across the code base, we're using the following tools:
36 | 
37 | - [`black`](https://github.com/psf/black): code formatter
38 | - [`flake8`](https://gitlab.com/pycqa/flake8): linting
39 | - [`isort`](https://github.com/timothycrosley/isort): sorting of imports
40 | 
41 | We have a convenience script that runs all these tools and a code style check for you:
42 | 
43 | ```bash
44 | poetry run ./scripts/fmt.sh
45 | ```
46 | 
47 | ### Testing
48 | Several tools ensure that the library is well tested and well presented. To run them all at once (useful during
49 | development), use:
50 | 
51 | ```bash
52 | poetry run ./scripts/test.sh
53 | ```
54 | 
55 | ### Pytest
56 | We're using [pytest](https://pytest.org) as a testing framework and make heavy use of `fixtures` and `parametrization`.
57 | To run the tests, simply run:
58 | 
59 | ```bash
60 | poetry run pytest
61 | ```
62 | 
63 | ### Benchmarks
64 | For performance-critical code paths, we have [asv](https://asv.readthedocs.io/) benchmarks in place in the subfolder
65 | `benchmarks`. To run the benchmarks a single time and receive immediate feedback, run:
66 | 
67 | ```bash
68 | poetry run asv run --python=same --show-stderr
69 | ```
70 | 
71 | ### Documentation
72 | Documentation is created using [Sphinx](https://www.sphinx-doc.org/) and can be built using:
73 | 
74 | ```bash
75 | poetry run python setup.py build_sphinx
76 | ```
77 | 
78 | ### Typing
79 | We use [mypy](http://mypy-lang.org/) to check Python types. It can be run using:
80 | 
81 | ```bash
82 | poetry run mypy .
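# configuration lives in mypy.ini; a single module can be checked as well, e.g.:
poetry run mypy rle_array/_slicing.py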
83 | ```
84 | 
85 | ## Performance Improvements
86 | If you wish to contribute a performance improvement, please ensure that a benchmark (in `benchmarks`) exists or that you
87 | provide one in your pull request. Please run that benchmark before and after your change and add both values to the
88 | commit message of your contribution.
89 | 
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019-2020 Blue Yonder Group, Inc
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | =========
2 | rle-array
3 | =========
4 | 
5 | .. image:: https://github.com/JDASoftwareGroup/rle-array/workflows/CI/badge.svg?branch=master
6 |    :target: https://github.com/JDASoftwareGroup/rle-array/actions?query=branch%3Amaster+workflow%3ACI
7 |    :alt: Build Status
8 | .. image:: https://codecov.io/gh/JDASoftwareGroup/rle-array/branch/master/graph/badge.svg?token=y2q96vlHqc
9 |    :target: https://codecov.io/gh/JDASoftwareGroup/rle-array
10 |    :alt: Coverage Status
11 | 
12 | `Extension Array`_ for `Pandas`_ that implements `Run-length Encoding`_.
13 | 
14 | 
15 | .. contents:: Table of Contents
16 | 
17 | 
18 | Quick Start
19 | ***********
20 | 
21 | Some basic setup first:
22 | 
23 | >>> import pandas as pd
24 | >>> pd.set_option("display.max_rows", 40)
25 | >>> pd.set_option("display.width", None)
26 | 
27 | We need some example data, so let's create some pseudo-weather data:
28 | 
29 | >>> from rle_array.testing import generate_example
30 | >>> df = generate_example()
31 | >>> df.head(10)
32 |          date  month  year    city    country   avg_temp   rain   mood
33 | 0  2000-01-01      1  2000  city_0  country_0  12.400000  False     ok
34 | 1  2000-01-02      1  2000  city_0  country_0   4.000000  False     ok
35 | 2  2000-01-03      1  2000  city_0  country_0  17.200001  False  great
36 | 3  2000-01-04      1  2000  city_0  country_0   8.400000  False     ok
37 | 4  2000-01-05      1  2000  city_0  country_0   6.400000  False     ok
38 | 5  2000-01-06      1  2000  city_0  country_0  14.400000  False     ok
39 | 6  2000-01-07      1  2000  city_0  country_0  14.300000   True     ok
40 | 7  2000-01-08      1  2000  city_0  country_0   6.800000  False     ok
41 | 8  2000-01-09      1  2000  city_0  country_0  10.100000  False     ok
42 | 9  2000-01-10      1  2000  city_0  country_0  -1.200000  False     ok
43 | 
44 | Due to the large number of attributes for locations and the date, the data size is quite large:
45 | 
46 | >>> df.memory_usage()
47 | Index            128
48 | date        32000000
49 | month        4000000
50 | year         8000000
51 | city        32000000
52 | country     32000000
53 | avg_temp    16000000
54 | rain         4000000
55 | mood        32000000
56 | dtype: int64
57 | >>> df.memory_usage().sum()
58 | 160000128
59 | 
60 | To compress the data, we can use ``rle-array``:
61 | 
62 | >>> import rle_array
63 | >>> df_rle = df.astype({
64 | ...     "city": "RLEDtype[object]",
65 | ...     "country": "RLEDtype[object]",
66 | ...     "month": "RLEDtype[int8]",
67 | ...     "mood": "RLEDtype[object]",
68 | ...     "rain": "RLEDtype[bool]",
69 | ...     "year": "RLEDtype[int16]",
70 | ... })
71 | >>> df_rle.memory_usage()
72 | Index            128
73 | date        32000000
74 | month        1188000
75 | year          120000
76 | city           32000
77 | country           64
78 | avg_temp    16000000
79 | rain         6489477
80 | mood        17153296
81 | dtype: int64
82 | >>> df_rle.memory_usage().sum()
83 | 72982965
84 | 
85 | This works better the longer the runs are. In the above example, it does not work too well for ``"rain"``.
86 | 
87 | 
88 | Development Plan
89 | ****************
90 | 
91 | The development of ``rle-array`` has the following priorities (in decreasing order):
92 | 
93 | 1. **Correctness:** All results must be correct. The `Pandas`_-provided test suite must pass. Approximations are not
94 |    allowed.
95 | 2. **Transparency:** The user can use :class:`~rle_array.RLEDtype` and :class:`~rle_array.RLEArray` like other `Pandas`_
96 |    types. No special parameters or extra functions are required (see the sketch after this list).
97 | 3. **Features:** Support all features that `Pandas`_ offers, even if it is slow (but inform the user using a
98 |    :class:`pandas.errors.PerformanceWarning`).
99 | 4. **Simplicity:** Do not use `Python C Extensions`_ or `Cython`_ (`NumPy`_ and `Numba`_ are allowed).
100 | 5. **Memory Reduction:** Do not decompress the encoded data when not required; try to do as many calculations as
101 |    possible directly on the compressed representation.
102 | 6. **Performance:** It should be quick; for large data, ideally faster than working on the uncompressed data. Use
103 |    `Numba`_ to speed up code.
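
As a quick sketch of point 2, the compressed frame from the Quick Start behaves just like the uncompressed one;
comparisons, for example, decompress transparently under the hood:

>>> bool((df_rle["city"] == df["city"]).all())
True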
104 | 
105 | 
106 | Implementation
107 | **************
108 | 
109 | Imagine the following data array:
110 | 
111 | +-------+------+
112 | | Index | Data |
113 | +=======+======+
114 | | 1     | "a"  |
115 | +-------+------+
116 | | 2     | "a"  |
117 | +-------+------+
118 | | 3     | "a"  |
119 | +-------+------+
120 | | 4     | "x"  |
121 | +-------+------+
122 | | 5     | "c"  |
123 | +-------+------+
124 | | 6     | "c"  |
125 | +-------+------+
126 | | 7     | "a"  |
127 | +-------+------+
128 | | 8     | "a"  |
129 | +-------+------+
130 | 
131 | Some data points are valid for multiple consecutive entries:
132 | 
133 | +-------+------+
134 | | Index | Data |
135 | +=======+======+
136 | | 1     | "a"  |
137 | +-------+      +
138 | | 2     |      |
139 | +-------+      +
140 | | 3     |      |
141 | +-------+------+
142 | | 4     | "x"  |
143 | +-------+------+
144 | | 5     | "c"  |
145 | +-------+      +
146 | | 6     |      |
147 | +-------+------+
148 | | 7     | "a"  |
149 | +-------+      +
150 | | 8     |      |
151 | +-------+------+
152 | 
153 | These sections are also called *runs* and can be encoded by their value and their length:
154 | 
155 | +--------+-------+
156 | | Length | Value |
157 | +========+=======+
158 | | 3      | "a"   |
159 | +--------+-------+
160 | | 1      | "x"   |
161 | +--------+-------+
162 | | 2      | "c"   |
163 | +--------+-------+
164 | | 2      | "a"   |
165 | +--------+-------+
166 | 
167 | This representation is called `Run-length Encoding`_. To integrate this encoding better with `Pandas`_ and `NumPy`_ and
168 | to support operations like slicing and random access (e.g. via :func:`pandas.api.extensions.ExtensionArray.take`), we
169 | store the end position (the cumulative sum of the length column) instead of the length:
170 | 
171 | +--------------+-------+
172 | | End-position | Value |
173 | +==============+=======+
174 | | 3            | "a"   |
175 | +--------------+-------+
176 | | 4            | "x"   |
177 | +--------------+-------+
178 | | 6            | "c"   |
179 | +--------------+-------+
180 | | 8            | "a"   |
181 | +--------------+-------+
182 | 
183 | The value array is a :class:`numpy.ndarray` with the same dtype as the original data and the end-positions are a
184 | :class:`numpy.ndarray` with the dtype ``int64``.
185 | 
186 | 
187 | License
188 | *******
189 | 
190 | Licensed under:
191 | 
192 | - MIT License (``LICENSE.txt`` or https://opensource.org/licenses/MIT)
193 | 
194 | 
195 | .. _Cython: https://cython.org/
196 | .. _Extension Array: https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extensionarray
197 | .. _Numba: https://numba.pydata.org/
198 | .. _NumPy: https://numpy.org/
199 | .. _Pandas: https://pandas.pydata.org/
200 | .. _Python C Extensions: https://docs.python.org/3/extending/building.html
201 | .. _Run-length Encoding: https://en.wikipedia.org/wiki/Run-length_encoding
202 | 
--------------------------------------------------------------------------------
/asv.conf.json:
--------------------------------------------------------------------------------
1 | {
2 |     // The version of the config file format. Do not change, unless
3 |     // you know what you are doing.
4 |     "version": 1,
5 | 
6 |     // The name of the project being benchmarked
7 |     "project": "rle-array",
8 | 
9 |     // The project's homepage
10 |     "project_url": "http://project-homepage.org/",
11 | 
12 |     // The URL or local path of the source code repository for the
13 |     // project being benchmarked
14 |     "repo": "..",
15 | 
16 |     // The Python project's subdirectory in your repo. If missing or
17 |     // the empty string, the project is assumed to be located at the root
18 |     // of the repository.
19 | // "repo_subdir": "", 20 | 21 | // Customizable commands for building, installing, and 22 | // uninstalling the project. See asv.conf.json documentation. 23 | // 24 | // "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"], 25 | // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], 26 | // "build_command": [ 27 | // "python setup.py build", 28 | // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" 29 | // ], 30 | 31 | // List of branches to benchmark. If not provided, defaults to "master" 32 | // (for git) or "default" (for mercurial). 33 | // "branches": ["master"], // for git 34 | // "branches": ["default"], // for mercurial 35 | 36 | // The DVCS being used. If not set, it will be automatically 37 | // determined from "repo" by looking at the protocol in the URL 38 | // (if remote), or by looking for special directories, such as 39 | // ".git" (if local). 40 | // "dvcs": "git", 41 | 42 | // The tool to use to create environments. May be "conda", 43 | // "virtualenv" or other value depending on the plugins in use. 44 | // If missing or the empty string, the tool will be automatically 45 | // determined by looking for tools on the PATH environment 46 | // variable. 47 | "environment_type": "virtualenv", 48 | 49 | // timeout in seconds for installing any dependencies in environment 50 | // defaults to 10 min 51 | //"install_timeout": 600, 52 | 53 | // the base URL to show a commit for the project. 54 | // "show_commit_url": "http://github.com/owner/project/commit/", 55 | 56 | // The Pythons you'd like to test against. If not provided, defaults 57 | // to the current version of Python used to run `asv`. 58 | "pythons": ["3.6", "3.7", "3.8"], 59 | 60 | // The list of conda channel names to be searched for benchmark 61 | // dependency packages in the specified order 62 | // "conda_channels": ["conda-forge", "defaults"], 63 | 64 | // The matrix of dependencies to test. Each key is the name of a 65 | // package (in PyPI) and the values are version numbers. An empty 66 | // list or empty string indicates to just test against the default 67 | // (latest) version. null indicates that the package is to not be 68 | // installed. If the package to be tested is only available from 69 | // PyPi, and the 'environment_type' is conda, then you can preface 70 | // the package name by 'pip+', and the package will be installed via 71 | // pip (with all the conda available packages installed first, 72 | // followed by the pip installed packages). 73 | // 74 | // "matrix": { 75 | // "numpy": ["1.6", "1.7"], 76 | // "six": ["", null], // test with and without six installed 77 | // "pip+emcee": [""], // emcee is only available for install with pip. 78 | // }, 79 | 80 | // Combinations of libraries/python versions can be excluded/included 81 | // from the set to test. Each entry is a dictionary containing additional 82 | // key-value pairs to include/exclude. 83 | // 84 | // An exclude entry excludes entries where all values match. The 85 | // values are regexps that should match the whole string. 86 | // 87 | // An include entry adds an environment. Only the packages listed 88 | // are installed. The 'python' key is required. The exclude rules 89 | // do not apply to includes. 90 | // 91 | // In addition to package names, the following keys are available: 92 | // 93 | // - python 94 | // Python version, as in the *pythons* variable above. 95 | // - environment_type 96 | // Environment type, as above. 
97 | // - sys_platform 98 | // Platform, as in sys.platform. Possible values for the common 99 | // cases: 'linux2', 'win32', 'cygwin', 'darwin'. 100 | // 101 | // "exclude": [ 102 | // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows 103 | // {"environment_type": "conda", "six": null}, // don't run without six on conda 104 | // ], 105 | // 106 | // "include": [ 107 | // // additional env for python2.7 108 | // {"python": "2.7", "numpy": "1.8"}, 109 | // // additional env if run on windows+conda 110 | // {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""}, 111 | // ], 112 | 113 | // The directory (relative to the current directory) that benchmarks are 114 | // stored in. If not provided, defaults to "benchmarks" 115 | // "benchmark_dir": "benchmarks", 116 | 117 | // The directory (relative to the current directory) to cache the Python 118 | // environments in. If not provided, defaults to "env" 119 | "env_dir": ".asv/env", 120 | 121 | // The directory (relative to the current directory) that raw benchmark 122 | // results are stored in. If not provided, defaults to "results". 123 | "results_dir": ".asv/results", 124 | 125 | // The directory (relative to the current directory) that the html tree 126 | // should be written to. If not provided, defaults to "html". 127 | "html_dir": ".asv/html", 128 | 129 | // The number of characters to retain in the commit hashes. 130 | // "hash_length": 8, 131 | 132 | // `asv` will cache results of the recent builds in each 133 | // environment, making them faster to install next time. This is 134 | // the number of builds to keep, per environment. 135 | // "build_cache_size": 2, 136 | 137 | // The commits after which the regression search in `asv publish` 138 | // should start looking for regressions. Dictionary whose keys are 139 | // regexps matching to benchmark names, and values corresponding to 140 | // the commit (exclusive) after which to start looking for 141 | // regressions. The default is to start from the first commit 142 | // with results. If the commit is `null`, regression detection is 143 | // skipped for the matching benchmark. 144 | // 145 | // "regressions_first_commits": { 146 | // "some_benchmark": "352cdf", // Consider regressions only after this commit 147 | // "another_benchmark": null, // Skip regression detection altogether 148 | // }, 149 | 150 | // The thresholds for relative change in results, after which `asv 151 | // publish` starts reporting regressions. Dictionary of the same 152 | // form as in ``regressions_first_commits``, with values 153 | // indicating the thresholds. If multiple entries match, the 154 | // maximum is taken. If no entry matches, the default is 5%. 
155 | // 156 | // "regressions_thresholds": { 157 | // "some_benchmark": 0.01, // Threshold of 1% 158 | // "another_benchmark": 0.5, // Threshold of 50% 159 | // }, 160 | } 161 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JDASoftwareGroup/rle-array/e5201b9185079f4fc4fd907d8f591426df79946e/benchmarks/__init__.py -------------------------------------------------------------------------------- /benchmarks/benchmarks.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from contextlib import contextmanager 3 | from typing import Generator 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from pandas.errors import PerformanceWarning 8 | 9 | from rle_array.autoconversion import auto_convert_to_rle, decompress 10 | from rle_array.testing import const_col, dim_col, generate_test_dataframe 11 | 12 | 13 | class Base: 14 | min_run_count = 10 15 | processes = 1 16 | repeat = 5 17 | sample_time = 1.0 18 | warmup_time = 1.0 19 | 20 | def gen_baseline(self) -> pd.DataFrame: 21 | return generate_test_dataframe(n_dims=3, size=100) 22 | 23 | def setup(self) -> None: 24 | self.df_baseline = self.gen_baseline() 25 | self.df_rle = self.df_baseline.astype("RLEDtype[int64]") 26 | 27 | @contextmanager 28 | def ignore_performance_warnings(self) -> Generator[None, None, None]: 29 | with warnings.catch_warnings(): 30 | warnings.simplefilter(action="ignore", category=PerformanceWarning) 31 | yield 32 | 33 | 34 | class TimeAutoConversion(Base): 35 | def time_auto_convert_to_rle_compress_all(self) -> None: 36 | auto_convert_to_rle(self.df_baseline) 37 | 38 | def time_auto_convert_to_rle_no_compression_allowed(self) -> None: 39 | auto_convert_to_rle(self.df_baseline, 0.0) 40 | 41 | def time_auto_convert_to_rle_already_compressed(self) -> None: 42 | auto_convert_to_rle(self.df_rle) 43 | 44 | def time_decompress_compressed(self) -> None: 45 | decompress(self.df_rle) 46 | 47 | def time_decompress_noop(self) -> None: 48 | decompress(self.df_baseline) 49 | 50 | 51 | class TimeCompression(Base): 52 | def time_decompress_array_astype(self) -> None: 53 | with self.ignore_performance_warnings(): 54 | self.df_rle[const_col([1, 2])].array.astype(np.int64) 55 | 56 | def time_decompress_to_numpy(self) -> None: 57 | with self.ignore_performance_warnings(): 58 | self.df_rle[const_col([1, 2])].to_numpy() 59 | 60 | 61 | class TimeTake(Base): 62 | def setup(self) -> None: 63 | super().setup() 64 | 65 | self.shuffle_dim2_unstable = self.df_baseline.sort_values( 66 | dim_col(2), kind="quicksort" 67 | ).index.values 68 | self.shuffle_dim2_stable = self.df_baseline.sort_values( 69 | dim_col(2), kind="mergesort" 70 | ).index.values 71 | 72 | def time_unstable_const12_base(self) -> None: 73 | self.df_baseline[const_col([1, 2])].take(self.shuffle_dim2_unstable) 74 | 75 | def time_unstable_const12_rle(self) -> None: 76 | self.df_rle[const_col([1, 2])].take(self.shuffle_dim2_unstable) 77 | 78 | def time_stable_const12_base(self) -> None: 79 | self.df_baseline[const_col([1, 2])].take(self.shuffle_dim2_stable) 80 | 81 | def time_stable_const12_rle(self) -> None: 82 | self.df_rle[const_col([1, 2])].take(self.shuffle_dim2_stable) 83 | 84 | 85 | class TimeGroupByReduce(Base): 86 | def setup(self) -> None: 87 | super().setup() 88 | 89 | df_rle_wo_dims = self.df_rle.copy() 90 | for d in range(3): 91 | 
df_rle_wo_dims[dim_col(d)] = self.df_baseline[dim_col(d)].copy() 92 | self.df_rle_wo_dims = df_rle_wo_dims 93 | 94 | def time_key2_opsum_const12_baseline(self) -> None: 95 | self.df_baseline.groupby(dim_col(2))[const_col([1, 2])].sum() 96 | 97 | def time_key2_opsum_const12_rle(self) -> None: 98 | with self.ignore_performance_warnings(): 99 | self.df_rle_wo_dims.groupby(dim_col(2))[const_col([1, 2])].sum() 100 | 101 | 102 | class TimeSeriesReduce(Base): 103 | def time_sum_const12_baseline(self) -> None: 104 | self.df_baseline[const_col([1, 2])].sum() 105 | 106 | def time_sum_const12_rle(self) -> None: 107 | self.df_rle[const_col([1, 2])].sum() 108 | 109 | def time_sum_const012_baseline(self) -> None: 110 | self.df_baseline[const_col([0, 1, 2])].sum() 111 | 112 | def time_sum_const012_rle(self) -> None: 113 | self.df_rle[const_col([0, 1, 2])].sum() 114 | 115 | 116 | class TimeShift(Base): 117 | def time_int_const12_base(self) -> None: 118 | self.df_baseline[const_col([1, 2])].shift(periods=1, fill_value=1) 119 | 120 | def time_int_const12_rle(self) -> None: 121 | self.df_rle[const_col([1, 2])].shift(periods=1, fill_value=1) 122 | 123 | def time_float_const12_base(self) -> None: 124 | self.df_baseline[const_col([1, 2])].shift(periods=1) 125 | 126 | def time_float_const12_rle(self) -> None: 127 | self.df_rle[const_col([1, 2])].shift(periods=1) 128 | 129 | 130 | class TimeUnique(Base): 131 | def time_const12_base(self) -> None: 132 | self.df_baseline[const_col([1, 2])].unique() 133 | 134 | def time_const12_rle(self) -> None: 135 | self.df_rle[const_col([1, 2])].unique() 136 | 137 | 138 | class TimeOperator(Base): 139 | def time_add_const12_baseline(self) -> None: 140 | self.df_baseline[const_col([1, 2])] + self.df_baseline[const_col([1, 2])] 141 | 142 | def time_add_const12_rle(self) -> None: 143 | self.df_rle[const_col([1, 2])] + self.df_rle[const_col([1, 2])] 144 | 145 | def time_eq_const12_baseline(self) -> None: 146 | self.df_baseline[const_col([1, 2])] == self.df_baseline[const_col([1, 2])] 147 | 148 | def time_eq_const12_rle(self) -> None: 149 | self.df_rle[const_col([1, 2])] == self.df_rle[const_col([1, 2])] 150 | 151 | 152 | class TimeGenerateTestDataFrame(Base): 153 | def time(self) -> None: 154 | self.gen_baseline() 155 | 156 | 157 | class TimeFactorize(Base): 158 | def time_const12_base(self) -> None: 159 | self.df_baseline[const_col([1, 2])].factorize() 160 | 161 | def time_const12_rle(self) -> None: 162 | with self.ignore_performance_warnings(): 163 | self.df_rle[const_col([1, 2])].factorize() 164 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | coverage: 3 | status: 4 | patch: off 5 | project: 6 | default: 7 | target: 100% 8 | threshold: 0% 9 | -------------------------------------------------------------------------------- /docs/_static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JDASoftwareGroup/rle-array/e5201b9185079f4fc4fd907d8f591426df79946e/docs/_static/.gitkeep -------------------------------------------------------------------------------- /docs/changes.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../CHANGES.rst 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | import sys 4 | 5 | from sphinx.ext import apidoc 6 | 7 | # Generate module references 8 | __location__ = os.path.join( 9 | os.getcwd(), os.path.dirname(inspect.getfile(inspect.currentframe())) 10 | ) 11 | output_dir = os.path.abspath(os.path.join(__location__, "..", "docs", "_rst")) 12 | module_dir = os.path.abspath(os.path.join(__location__, "..", "rle_array")) 13 | apidoc_parameters = ["-f", "-e", "-o", output_dir, module_dir] 14 | apidoc.main(apidoc_parameters) 15 | 16 | sys.path.append(os.path.abspath(os.path.join(__location__, "sphinxext"))) 17 | 18 | add_module_names = False 19 | author = "Blue Yonder Group, Inc" 20 | copyright = "2019-2020, Blue Yonder Group, Inc" 21 | project = "rle-array" 22 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 23 | extensions = [ 24 | "ignore_missing_refs", 25 | "sphinx.ext.autodoc", 26 | "sphinx.ext.doctest", 27 | "sphinx.ext.napoleon", 28 | ] 29 | html_static_path = ["_static"] 30 | html_theme = "alabaster" 31 | nitpicky = True 32 | templates_path = ["_templates"] 33 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | 3 | 4 | Contents 5 | ******** 6 | 7 | .. toctree:: 8 | 9 | Module Reference <_rst/modules> 10 | Changelog 11 | 12 | 13 | Indices and tables 14 | ****************** 15 | 16 | * :ref:`genindex` 17 | * :ref:`modindex` 18 | * :ref:`search` 19 | -------------------------------------------------------------------------------- /docs/sphinxext/ignore_missing_refs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from docutils import nodes 3 | 4 | PACKAGES = ["rle_array"] 5 | 6 | 7 | def _is_external_target(target): 8 | return not any(((target == p) or target.startswith(p + ".") for p in PACKAGES)) 9 | 10 | 11 | def _is_private_target(target): 12 | return any((part.startswith("_") for part in target.split("."))) 13 | 14 | 15 | def missing_reference(app, env, node, contnode): 16 | target = node["reftarget"] 17 | if _is_external_target(target) or _is_private_target(target): 18 | newnode = nodes.reference("", "", internal=False, refuri="#", reftitle="") 19 | newnode.append(contnode) 20 | return newnode 21 | 22 | 23 | def setup(app): 24 | app.connect("missing-reference", missing_reference) 25 | return {"version": "0.1", "parallel_read_safe": True} 26 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | # Specify the target platform details in config, so your developers are 3 | # free to run mypy on Windows, Linux, or macOS and get consistent 4 | # results. 
5 | python_version = 3.8
6 | platform = linux
7 | 
8 | # flake8-mypy expects the following for sensible formatting
9 | show_column_numbers = True
10 | 
11 | # show error messages from unrelated files
12 | follow_imports = normal
13 | 
14 | # be strict
15 | strict = True
16 | disallow_subclassing_any = False
17 | disallow_untyped_decorators = False
18 | 
19 | [mypy-numba.*]
20 | ignore_missing_imports = True
21 | 
22 | [mypy-numpy.*]
23 | ignore_missing_imports = True
24 | 
25 | [mypy-pandas.*]
26 | ignore_missing_imports = True
27 | 
28 | [mypy-setuptools.*]
29 | ignore_missing_imports = True
30 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "poetry-core>=1.0.0",
4 | ]
5 | build-backend = "poetry.core.masonry.api"
6 | 
7 | [tool.isort]
8 | profile = "black"
9 | 
10 | [tool.poetry]
11 | name = "rle-array"
12 | description = "Run-length encoded pandas."
13 | authors = [
14 |     "Blue Yonder Group, Inc",
15 | ]
16 | version = "0.1"
17 | readme = "README.rst"
18 | license = "MIT"
19 | packages = [
20 |     { include = "rle_array" },
21 | ]
22 | repository = "https://github.com/JDASoftwareGroup/rle-array"
23 | keywords = [
24 |     "python",
25 | ]
26 | classifiers = [
27 |     "Development Status :: 4 - Beta",
28 |     "Environment :: Console",
29 |     "Intended Audience :: Developers",
30 |     "Natural Language :: English",
31 |     "Programming Language :: Python",
32 |     "Programming Language :: Python :: 3",
33 | ]
34 | 
35 | [tool.poetry.dependencies]
36 | python = ">=3.6.1,<3.9"
37 | numba = ">=0.51.2"
38 | numpy = ">=1.17"
39 | pandas = ">=1.1.5,<1.2"
40 | 
41 | [tool.poetry.dev-dependencies]
42 | asv = "*"
43 | black = "19.10b0"
44 | flake8-mutable = "1.2.0"
45 | flake8 = "3.8.3"
46 | isort = "5.0.9"
47 | mypy = "*"
48 | pytest = ">=6"
49 | pytest-cov = "*"
50 | sphinx = "*"
51 | 
52 | [tool.pytest.ini_options]
53 | addopts = "--cov=rle_array --cov-report term-missing --cov-report xml"
54 | testpaths = "tests"
55 | 
--------------------------------------------------------------------------------
/rle_array/__init__.py:
--------------------------------------------------------------------------------
1 | from .array import RLEArray
2 | from .autoconversion import auto_convert_to_rle
3 | from .dtype import RLEDtype
4 | 
5 | __all__ = ("auto_convert_to_rle", "RLEArray", "RLEDtype")
6 | 
--------------------------------------------------------------------------------
/rle_array/_algorithms.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Iterator, List, Optional, Tuple
2 | 
3 | import numba
4 | import numpy as np
5 | import pandas as pd
6 | 
7 | from ._slicing import NormalizedSlice
8 | from .types import POSITIONS_DTYPE
9 | 
10 | 
11 | def calc_lengths(positions: np.ndarray) -> np.ndarray:
12 |     """
13 |     Calculate lengths of runs.
14 | 
15 |     Parameters
16 |     ----------
17 |     positions:
18 |         End positions of runs.
19 | 
20 |     Returns
21 |     -------
22 |     lengths:
23 |         Lengths of runs.
24 |     """
25 |     return np.concatenate([positions[:1], positions[1:] - positions[:-1]])
26 | 
27 | 
28 | def compress(scalars: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
29 |     """
30 |     Compress given array of scalars to RLE.
31 | 
32 |     Parameters
33 |     ----------
34 |     scalars:
35 |         Scalars to compress.
36 | 
37 |     Returns
38 |     -------
39 |     data:
40 |         Data at start of each run.
41 |     positions:
42 |         End positions of runs.
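
    Examples
    --------
    A minimal sketch with integer input:

    >>> import numpy as np
    >>> data, positions = compress(np.array([1, 1, 2, 2, 2, 3]))
    >>> list(data), list(positions)
    ([1, 2, 3], [2, 5, 6])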
43 | 
44 |     Raises
45 |     ------
46 |     ValueError: If non-1-dimensional arrays are compressed.
47 |     """
48 |     if scalars.ndim != 1:
49 |         raise ValueError("Only 1-dimensional arrays can be compressed.")
50 |     if len(scalars) == 0:
51 |         return (scalars, np.array([], dtype=POSITIONS_DTYPE))
52 | 
53 |     changes = detect_changes(scalars)
54 | 
55 |     data = np.concatenate([scalars[:-1][changes], scalars[-1:]])
56 |     positions = np.concatenate(
57 |         [np.where(changes)[0] + 1, np.asarray([len(scalars)], dtype=POSITIONS_DTYPE)]
58 |     )
59 |     return (data, positions)
60 | 
61 | 
62 | def concat(
63 |     data_parts: List[np.ndarray], positions_parts: List[np.ndarray]
64 | ) -> Tuple[np.ndarray, np.ndarray]:
65 |     """
66 |     Concatenate RLE data.
67 | 
68 |     Parameters
69 |     ----------
70 |     data_parts:
71 |         For each part: Data at start of each run.
72 |     positions_parts:
73 |         For each part: End positions of runs.
74 | 
75 |     Returns
76 |     -------
77 |     data:
78 |         Data at start of each run.
79 |     positions:
80 |         End positions of runs.
81 |     """
82 |     assert len(data_parts) == len(positions_parts)
83 |     if len(data_parts) == 0:
84 |         return (np.array([]), np.array([], dtype=POSITIONS_DTYPE))
85 | 
86 |     lengths = np.asarray([get_len(positions) for positions in positions_parts])
87 |     offsets = np.roll(np.cumsum(lengths), 1)
88 |     offsets[0] = 0
89 | 
90 |     data = np.concatenate(data_parts)
91 |     positions = np.concatenate(
92 |         [positions + o for positions, o in zip(positions_parts, offsets)]
93 |     )
94 | 
95 |     data, positions = recompress(data, positions)
96 |     return (data, positions)
97 | 
98 | 
99 | @numba.jit(nopython=True, cache=True, nogil=True)
100 | def _inplace_repeat(
101 |     data: np.ndarray, positions: np.ndarray, out: np.ndarray
102 | ) -> None:
103 |     n = len(positions)
104 |     assert len(data) == n
105 |     assert n > 0
106 | 
107 |     out[0 : positions[0]] = data[0]
108 | 
109 |     if n == 1:
110 |         return
111 | 
112 |     for i in range(1, n):
113 |         out[positions[i - 1] : positions[i]] = data[i]
114 |     return
115 | 
116 | 
117 | def decompress(
118 |     data: np.ndarray, positions: np.ndarray, dtype: Optional[Any] = None
119 | ) -> np.ndarray:
120 |     """
121 |     Decompress RLE data.
122 | 
123 |     Parameters
124 |     ----------
125 |     data:
126 |         Data at start of each run.
127 |     positions:
128 |         End positions of runs.
129 |     dtype:
130 |         Optional dtype for conversion.
131 | 
132 |     Returns
133 |     -------
134 |     scalars:
135 |         Scalars, decompressed.
136 |     """
137 |     target_dtype = dtype if dtype is not None else data.dtype
138 |     if len(data) == 0:
139 |         return np.empty(0, dtype=target_dtype)
140 | 
141 |     if dtype is not None:
142 |         data = data.astype(target_dtype, copy=False)
143 | 
144 |     if (target_dtype != np.dtype(object)) and not np.issubdtype(
145 |         target_dtype, np.flexible
146 |     ):
147 |         out = np.empty(positions[-1], dtype=target_dtype)
148 |         _inplace_repeat(data, positions, out)
149 |         return out
150 |     else:
151 |         lengths = calc_lengths(positions)
152 |         return np.repeat(data, lengths)
153 | 
154 | 
155 | def detect_changes(scalars: np.ndarray) -> np.ndarray:
156 |     """
157 |     Detect changes in an array of scalars. These changes can be used as boundaries for RLE runs.
158 | 
159 |     Parameters
160 |     ----------
161 |     scalars:
162 |         Scalars to compress.
163 | 
164 |     Returns
165 |     -------
166 |     changes:
167 |         Change points (boolean mask).
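
    Examples
    --------
    A short sketch with integer input:

    >>> import numpy as np
    >>> list(detect_changes(np.array([1, 1, 2, 2])))
    [False, True, False]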
168 |     """
169 |     nulls = pd.isna(scalars)
170 |     identical = (scalars[1:] == scalars[:-1]) | (nulls[1:] & nulls[:-1])
171 |     return ~identical
172 | 
173 | 
174 | def dropna(data: np.ndarray, positions: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
175 |     """
176 |     Drop NULL-values from RLE data.
177 | 
178 |     Parameters
179 |     ----------
180 |     data:
181 |         Data at start of each run.
182 |     positions:
183 |         End positions of runs.
184 | 
185 |     Returns
186 |     -------
187 |     data:
188 |         Data at start of each run.
189 |     positions:
190 |         End positions of runs.
191 |     """
192 |     mask = pd.notnull(data)
193 |     data = data[mask]
194 |     lengths = calc_lengths(positions)
195 |     positions = (
196 |         positions
197 |         - np.cumsum(lengths * (~mask).astype(POSITIONS_DTYPE), dtype=POSITIONS_DTYPE)
198 |     )[mask]
199 |     return (data, positions)
200 | 
201 | 
202 | def find_single_index(data: np.ndarray, positions: np.ndarray, i: int) -> Any:
203 |     """
204 |     Find single element in RLE data.
205 | 
206 |     .. important::
207 |         This function does NOT handle negative indices.
208 | 
209 |     Parameters
210 |     ----------
211 |     data:
212 |         Data at start of each run.
213 |     positions:
214 |         End positions of runs.
215 | 
216 |     Returns
217 |     -------
218 |     element:
219 |         Found element.
220 | 
221 |     Raises
222 |     ------
223 |     IndexError: In case of an out-of-bounds index request.
224 |     """
225 |     if (i < 0) or (i >= get_len(positions)):
226 |         raise IndexError(f"{i} out of bounds")
227 |     return data[np.searchsorted(positions, i, side="right")]
228 | 
229 | 
230 | def find_slice(
231 |     data: np.ndarray, positions: np.ndarray, s: slice
232 | ) -> Tuple[np.ndarray, np.ndarray]:
233 |     """
234 |     Get slice of RLE data.
235 | 
236 |     Parameters
237 |     ----------
238 |     data:
239 |         Data at start of each run.
240 |     positions:
241 |         End positions of runs.
242 | 
243 |     Returns
244 |     -------
245 |     data:
246 |         Data at start of each run.
247 |     positions:
248 |         End positions of runs.
249 |     """
250 |     length = get_len(positions)
251 |     s_norm = NormalizedSlice.from_slice(length, s)
252 | 
253 |     start, stop, step = s_norm.start, s_norm.stop, s_norm.step
254 |     invert = False
255 |     if step < 0:
256 |         invert = True
257 |         start, stop = stop + 1, start + 1
258 |         step = abs(step)
259 | 
260 |     if start == 0:
261 |         idx_start = 0
262 |     else:
263 |         idx_start = np.searchsorted(positions, start, side="right")
264 |         # start >= length cannot occur here because NormalizedSlice sets start=0 and stop=0 for empty slices
265 | 
266 |     if stop == 0:
267 |         idx_stop = 0
268 |     elif stop >= length:
269 |         idx_stop = len(positions)
270 |     else:
271 |         idx_stop = np.searchsorted(positions, stop, side="left") + 1
272 | 
273 |     data = data[idx_start:idx_stop]
274 |     positions = positions[idx_start:idx_stop] - start
275 |     if len(positions) > 0:
276 |         positions[-1] = stop - start
277 | 
278 |     if invert:
279 |         lengths = calc_lengths(positions)
280 |         lengths = lengths[::-1]
281 |         positions = np.cumsum(lengths)
282 |         data = data[::-1]
283 | 
284 |     if step != 1:
285 |         positions = ((positions - 1) // step) + 1
286 | 
287 |     mask = np.empty(len(positions), dtype=bool)
288 |     if len(positions) > 0:
289 |         mask[0] = True
290 |         mask[1:] = positions[1:] != positions[:-1]
291 | 
292 |     data = data[mask]
293 |     positions = positions[mask]
294 | 
295 |     data, positions = recompress(data, positions)
296 | 
297 |     return (data, positions)
298 | 
299 | 
300 | def gen_iterator(data: np.ndarray, positions: np.ndarray) -> Iterator[Any]:
301 |     """
302 |     Generate iterator over RLE data.
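
    A lazy sketch of what :func:`decompress` materializes eagerly:

    >>> import numpy as np
    >>> list(gen_iterator(np.array([1, 2]), np.array([3, 5])))
    [1, 1, 1, 2, 2]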
303 | 
304 |     Parameters
305 |     ----------
306 |     data:
307 |         Data at start of each run.
308 |     positions:
309 |         End positions of runs.
310 | 
311 |     Returns
312 |     -------
313 |     it:
314 |         Iterator over uncompressed values.
315 |     """
316 |     old_p = 0
317 |     for x, p in zip(data, positions):
318 |         for _ in range(p - old_p):
319 |             yield x
320 |         old_p = p
321 | 
322 | 
323 | def get_len(positions: np.ndarray) -> int:
324 |     """
325 |     Get length of RLE data.
326 | 
327 |     Parameters
328 |     ----------
329 |     positions:
330 |         End positions of runs.
331 | 
332 |     Returns
333 |     -------
334 |     len:
335 |         Length.
336 |     """
337 |     if len(positions) > 0:
338 |         return int(positions[-1])
339 |     else:
340 |         return 0
341 | 
342 | 
343 | def recompress(
344 |     data: np.ndarray, positions: np.ndarray
345 | ) -> Tuple[np.ndarray, np.ndarray]:
346 |     """
347 |     Try to compress RLE data even more.
348 | 
349 |     Parameters
350 |     ----------
351 |     data:
352 |         Data at start of each run.
353 |     positions:
354 |         End positions of runs.
355 | 
356 |     Returns
357 |     -------
358 |     data:
359 |         Data at start of each run.
360 |     positions:
361 |         End positions of runs.
362 |     """
363 |     changes = detect_changes(data)
364 | 
365 |     data = np.concatenate([data[:-1][changes], data[-1:]])
366 |     positions = np.concatenate([positions[:-1][changes], positions[-1:]])
367 |     return (data, positions)
368 | 
369 | 
370 | @numba.jit((numba.int64[:], numba.int64[:]), nopython=True, cache=True, nogil=True)
371 | def _take_kernel(
372 |     positions: np.ndarray, indices: np.ndarray
373 | ) -> Tuple[np.ndarray, np.ndarray]:
374 |     n = len(indices)
375 | 
376 |     # pre-allocate output buffers
377 |     result_data_idx = np.empty(n, dtype=POSITIONS_DTYPE)
378 |     result_positions = np.empty(n, dtype=POSITIONS_DTYPE)
379 | 
380 |     current = -2
381 |     run_start = 0
382 |     run_stop = 0
383 |     out_count = 0
384 |     for pos in range(n):
385 |         i = indices[pos]
386 |         if i == -1:
387 |             # fill
388 |             idx = -1
389 |         elif current >= 0 and (run_start <= i) and (i < run_stop):
390 |             # great, same RLE-run
391 |             idx = current
392 |         else:
393 |             # run full search
394 |             idx = np.searchsorted(positions, i, side="right")
395 | 
396 |         # flush?
397 |         if idx != current:
398 |             if current != -2:
399 |                 result_data_idx[out_count] = current
400 |                 result_positions[out_count] = pos
401 |                 out_count += 1
402 |             current = idx
403 | 
404 |             if current > 0:
405 |                 run_start = positions[current - 1]
406 |             else:
407 |                 run_start = 0
408 | 
409 |             if current >= 0:
410 |                 run_stop = positions[current]
411 | 
412 |     # flush?
413 |     if current != -2:
414 |         result_data_idx[out_count] = current
415 |         result_positions[out_count] = n
416 |         out_count += 1
417 | 
418 |     # return clean-cut outputs
419 |     return result_data_idx[:out_count].copy(), result_positions[:out_count].copy()
420 | 
421 | 
422 | def take(
423 |     data: np.ndarray,
424 |     positions: np.ndarray,
425 |     indices: np.ndarray,
426 |     allow_fill: bool,
427 |     fill_value: Any,
428 | ) -> Tuple[np.ndarray, np.ndarray]:
429 |     """
430 |     Take values from RLE array.
431 | 
432 |     Parameters
433 |     ----------
434 |     data:
435 |         Data at start of each run.
436 |     positions:
437 |         End positions of runs.
438 |     indices:
439 |         Indices to take. If ``allow_fill`` is set, the only negative element allowed is ``-1``. If ``allow_fill`` is not
440 |         set, then negative entries will be counted from the end of the array.
441 |     allow_fill:
442 |         If filling with missing values is allowed. In that case, ``-1`` in ``indices`` will be filled with
443 |         ``fill_value``.
444 |     fill_value:
445 |         Fill-value in case ``allow_fill`` is set.
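
    Examples
    --------
    A small sketch (the runs ``[1, 1, 2, 2]``) with filling enabled:

    >>> import numpy as np
    >>> d, p = take(np.array([1, 2]), np.array([2, 4]), np.array([0, 3, -1]), True, -9)
    >>> list(d), list(p)
    ([1, 2, -9], [1, 2, 3])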
446 | 
447 |     Returns
448 |     -------
449 |     data:
450 |         Data at start of each run.
451 |     positions:
452 |         End positions of runs.
453 |     """
454 |     length = get_len(positions)
455 |     indices = indices.copy()
456 | 
457 |     if (length == 0) and ((np.any(indices != -1) and allow_fill) or not allow_fill):
458 |         raise IndexError("cannot do a non-empty take")
459 | 
460 |     if allow_fill:
461 |         out_of_bounds_mask = indices < -1
462 |         if np.any(out_of_bounds_mask):
463 |             raise ValueError(f"{indices[out_of_bounds_mask][0]}")
464 |         min_idx_allowed = -1
465 |     else:
466 |         indices[indices < 0] += length
467 |         min_idx_allowed = 0
468 | 
469 |     out_of_bounds_mask = (indices < min_idx_allowed) | (indices >= length)
470 |     if np.any(out_of_bounds_mask):
471 |         raise IndexError(f"{indices[out_of_bounds_mask][0]} out of bounds")
472 | 
473 |     result_data_idx, result_positions = _take_kernel(positions, indices)
474 | 
475 |     result_data_mask = result_data_idx != -1
476 |     result_data = np.empty(len(result_data_idx), dtype=data.dtype)
477 |     result_data[result_data_mask] = data[result_data_idx[result_data_mask]]
478 |     if np.any(~result_data_mask):
479 |         result_data[~result_data_mask] = fill_value
480 | 
481 |     return recompress(result_data, result_positions)
482 | 
483 | 
484 | @numba.jit((numba.int64[:], numba.int64[:]), nopython=True, cache=True, nogil=True)
485 | def _extend_positions_kernel(
486 |     positions1: np.ndarray, positions2: np.ndarray
487 | ) -> np.ndarray:
488 |     n1 = len(positions1)
489 |     n2 = len(positions2)
490 | 
491 |     # pre-allocate output buffers
492 |     result = np.empty(n1 + n2, dtype=POSITIONS_DTYPE)
493 | 
494 |     i_out = 0
495 |     i1 = 0
496 |     i2 = 0
497 | 
498 |     while (i1 < n1) and (i2 < n2):
499 |         x1 = positions1[i1]
500 |         x2 = positions2[i2]
501 | 
502 |         if x1 == x2:
503 |             result[i_out] = x1
504 |             i1 += 1
505 |             i2 += 1
506 |         elif x1 < x2:
507 |             result[i_out] = x1
508 |             i1 += 1
509 |         else:
510 |             # x2 < x1
511 |             result[i_out] = x2
512 |             i2 += 1
513 | 
514 |         i_out += 1
515 | 
516 |     while i1 < n1:
517 |         result[i_out] = positions1[i1]
518 |         i1 += 1
519 |         i_out += 1
520 | 
521 |     while i2 < n2:
522 |         result[i_out] = positions2[i2]
523 |         i2 += 1
524 |         i_out += 1
525 | 
526 |     # return clean-cut output
527 |     return result[:i_out].copy()
528 | 
529 | 
530 | def extend_positions(positions1: np.ndarray, positions2: np.ndarray) -> np.ndarray:
531 |     """
532 |     Create union of two position arrays.
533 | 
534 |     Parameters
535 |     ----------
536 |     positions1
537 |         First position array.
538 |     positions2
539 |         Second position array.
540 | 
541 |     Returns
542 |     -------
543 |     extended_positions
544 |         Sorted position array that contains all entries from input arrays (without duplicates).
545 |     """
546 |     return _extend_positions_kernel(positions1, positions2)
547 | 
548 | 
549 | @numba.jit(nopython=True, cache=True, nogil=True)
550 | def _extend_data_kernel(
551 |     data: np.ndarray, positions: np.ndarray, extended_positions: np.ndarray
552 | ) -> np.ndarray:
553 |     n = extended_positions.shape[0]
554 |     extended_array = np.empty(n, dtype=data.dtype)
555 | 
556 |     k = 0  # current index for data/positions
557 |     for i in range(n):
558 |         if extended_positions[i] > positions[k]:
559 |             k += 1
560 |         extended_array[i] = data[k]
561 | 
562 |     return extended_array
563 | 
564 | 
565 | def extend_data(
566 |     data: np.ndarray, positions: np.ndarray, extended_positions: np.ndarray
567 | ) -> np.ndarray:
568 |     """
569 |     Extend data array to match new positions.
570 | 
571 |     Parameters
572 |     ----------
573 |     data
574 |         Data at start of each run.
575 |     positions
576 |         End positions of runs.
577 |     extended_positions
578 |         Extended position array (superset of ``positions``). See :func:`extend_positions`.
579 | 
580 |     Returns
581 |     -------
582 |     extended_data
583 |         Extended data array.
584 |     """
585 |     return _extend_data_kernel(data, positions, extended_positions)
586 | 
--------------------------------------------------------------------------------
/rle_array/_slicing.py:
--------------------------------------------------------------------------------
1 | """
2 | Helpers that allow us to deal with Python slicing.
3 | 
4 | The issues with Python slicing are:
5 | 
6 | - ``slice`` type:
7 |   - the types in ``slice`` are completely unchecked (can even be a string or any user-provided type)
8 |   - the consistency of the values in ``slice`` is unchecked
9 |   - there is no information about the container size (which makes consistency checks more complicated)
10 | 
11 | - ``slice.step`` value:
12 |   - ``step`` has the implicit default 1
13 |   - there can be forward and backward slices depending on the ``step`` value
14 |   - there can be step sizes other than 1
15 | 
16 | - ``slice.start`` and ``slice.stop`` values:
17 |   - the implicit defaults of ``start`` and ``stop`` depend on ``step`` (is it positive or negative?)
18 |   - ``start`` and ``stop`` can be negative (aka "from the end")
19 |   - ``start`` and ``stop`` can over/underflow the container
20 | 
21 | We do not want to deal with all these edge cases in every code snippet that deals with slicing, so we introduce
22 | :class:`NormalizedSlice`, which solves these issues in a central place.
23 | """
24 | from typing import Optional
25 | 
26 | import numpy as np
27 | 
28 | 
29 | class NormalizedSlice:
30 |     """
31 |     A normalized slice.
32 | 
33 |     .. important::
34 | 
35 |         Do not try to construct this class by hand. Use :func:`NormalizedSlice.from_slice` instead!
36 | 
37 |     Parameters
38 |     ----------
39 |     start
40 |         First absolute index in the container (inclusive start). Never negative.
41 |     stop
42 |         Last absolute index not being part of the slice (exclusive end). Counted from the container start. Can be
43 |         negative for reversed slices (aka ``step < 0``). Must be normalized so that ``abs(stop - start) % step == 0``.
44 |         For forward slices (``step > 0``), this must be greater than ``start``. For backward slices (``step < 0``) this
45 |         must be less than ``start``. For empty slices (``start = stop``), ``start``, ``stop`` and ``step`` have the
46 |         fixed values 0, 0 and 1.
47 |     step
48 |         Step size. Must not be ``0``.
49 |     container_length
50 |         Size of the container this slice applies to. Must not be negative. For empty containers
51 |         (``container_length = 0``), ``start``, ``stop`` and ``step`` have the fixed values 0, 0 and 1.
52 |     """
53 | 
54 |     def __init__(self, start: int, stop: int, step: int, container_length: int):
55 |         if not isinstance(start, int):
56 |             raise TypeError(f"start must be int but is {type(start).__name__}")
57 |         if not isinstance(stop, int):
58 |             raise TypeError(f"stop must be int but is {type(stop).__name__}")
59 |         if not isinstance(step, int):
60 |             raise TypeError(f"step must be int but is {type(step).__name__}")
61 |         if not isinstance(container_length, int):
62 |             raise TypeError(
63 |                 f"container_length must be int but is {type(container_length).__name__}"
64 |             )
65 | 
66 |         self._start = start
67 |         self._stop = stop
68 |         self._step = step
69 |         self._container_length = container_length
70 | 
71 |         self._verify()
72 | 
73 |     def _verify(self) -> None:
74 |         """
75 |         Verify integrity.
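
        Raises
        ------
        ValueError: If any of the invariants described in the class docstring is violated.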
76 |         """
77 |         if self.container_length < 0:
78 |             raise ValueError(
79 |                 f"container_length ({self.container_length}) must be greater or equal to zero"
80 |             )
81 |         elif self.container_length == 0:
82 |             self._verify_container_empty()
83 |         else:
84 |             self._verify_container_not_empty()
85 | 
86 |     def _verify_container_empty(self) -> None:
87 |         """
88 |         Verify integrity in case the container is empty (``container_length = 0``).
89 |         """
90 |         # empty container => special values required
91 |         if self.start != 0:
92 |             raise ValueError(
93 |                 f"for empty containers, start must be 0 but is {self.start}"
94 |             )
95 | 
96 |         if self.stop != 0:
97 |             raise ValueError(f"for empty containers, stop must be 0 but is {self.stop}")
98 | 
99 |         if self.step != 1:
100 |             raise ValueError(f"for empty containers, step must be 1 but is {self.step}")
101 | 
102 |     def _verify_container_not_empty(self) -> None:
103 |         """
104 |         Verify integrity in case the container is not empty (``container_length > 0``).
105 |         """
106 |         if (self.start < 0) or (self.start >= self.container_length):
107 |             raise ValueError(
108 |                 f"start ({self.start}) must be in [0,{self.container_length}) but is not"
109 |             )
110 | 
111 |         if (self.stop < -abs(self.step)) or (
112 |             self.stop >= self.container_length + abs(self.step)
113 |         ):
114 |             raise ValueError(
115 |                 f"stop ({self.stop}) must be in [{-abs(self.step)},{self.container_length + abs(self.step)}) but is not"
116 |             )
117 | 
118 |         if self.start == self.stop:
119 |             # empty slice
120 |             if self.start != 0:
121 |                 raise ValueError(
122 |                     f"for empty slices, start and stop must be 0 but are {self.start}"
123 |                 )
124 |             if self.step != 1:
125 |                 raise ValueError(f"for empty slices, step must be 1 but is {self.step}")
126 |         else:
127 |             # non-empty slice
128 |             if self.step == 0:
129 |                 raise ValueError("step cannot be zero")
130 |             elif self.step > 0:
131 |                 # forward slice
132 |                 if self.start > self.stop:
133 |                     raise ValueError(
134 |                         "for forward slices, stop must be greater or equal to start"
135 |                     )
136 |             else:
137 |                 # backward slice
138 |                 if self.stop > self.start:
139 |                     raise ValueError(
140 |                         "for backward slices, start must be greater or equal to stop"
141 |                     )
142 | 
143 |             if abs(self.start - self.stop) % abs(self.step) != 0:
144 |                 raise ValueError(
145 |                     "The distance between start and stop must be divisible by the step size"
146 |                 )
147 | 
148 |     @property
149 |     def start(self) -> int:
150 |         """
151 |         Start index of the slice. Inclusive start.
152 |         """
153 |         return self._start
154 | 
155 |     @property
156 |     def stop(self) -> int:
157 |         """
158 |         Stop index of the slice. Exclusive end.
159 |         """
160 |         return self._stop
161 | 
162 |     @property
163 |     def step(self) -> int:
164 |         """
165 |         Step width.
166 |         """
167 |         return self._step
168 | 
169 |     @property
170 |     def container_length(self) -> int:
171 |         """
172 |         Length of the container.
173 |         """
174 |         return self._container_length
175 | 
176 |     def __repr__(self) -> str:
177 |         return (
178 |             f"{type(self).__name__}(start={self.start}, stop={self.stop}, step={self.step}, container_length="
179 |             f"{self.container_length})"
180 |         )
181 | 
182 |     def __len__(self) -> int:
183 |         return self._calc_len(start=self.start, stop=self.stop, step=self.step)
184 | 
185 |     @classmethod
186 |     def _calc_len(cls, start: int, stop: int, step: int) -> int:
187 |         """
188 |         Calculate slice length.
189 | 
190 |         Parameters
191 |         ----------
192 |         start
193 |             Inclusive start index.
194 |         stop
195 |             Exclusive stop index.
196 |         step
197 |             Step width.
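
        Returns
        -------
        length
            Number of elements addressed by the slice, e.g.:

            >>> NormalizedSlice._calc_len(start=0, stop=10, step=3)
            4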
198 | """ 199 | delta = abs(stop - start) 200 | steps = delta // abs(step) 201 | if delta % abs(step) != 0: 202 | steps += 1 203 | return steps 204 | 205 | @classmethod 206 | def _check_and_prepare_slice(cls, s: Optional[slice]) -> slice: 207 | """ 208 | Check and prepare input slice for conversion. 209 | """ 210 | if s is None: 211 | s = slice(None, None, None) 212 | 213 | if not isinstance(s, slice): 214 | raise TypeError(f"slice must be a slice but is {type(s).__name__}") 215 | 216 | if (s.start is not None) and not isinstance(s.start, (int, np.int64)): 217 | raise TypeError( 218 | f"slice start must be int or None but is {type(s.start).__name__}" 219 | ) 220 | start = None if s.start is None else int(s.start) 221 | 222 | if (s.stop is not None) and not isinstance(s.stop, (int, np.int64)): 223 | raise TypeError( 224 | f"slice stop must be int or None but is {type(s.stop).__name__}" 225 | ) 226 | stop = None if s.stop is None else int(s.stop) 227 | 228 | if (s.step is not None) and not isinstance(s.step, (int, np.int64)): 229 | raise TypeError( 230 | f"slice step must be int or None but is {type(s.step).__name__}" 231 | ) 232 | if s.step == 0: 233 | raise ValueError("slice step cannot be zero") 234 | step = None if s.step is None else int(s.step) 235 | 236 | return slice(start, stop, step) 237 | 238 | @classmethod 239 | def from_slice(cls, container_length: int, s: Optional[slice]) -> "NormalizedSlice": 240 | """ 241 | Create a new :class:`NormalizedSlice` from a given Python ``slice`` and container length. 242 | 243 | Parameters 244 | ---------- 245 | container_length 246 | Non-negative container length. 247 | s 248 | Slice or ``None`` (for "take all"). 249 | 250 | Raises 251 | ------ 252 | TypeError: If ``s`` is not ``None`` and not a ``slice`` or any of the arguments for ``slice`` are neither 253 | ``None`` nor an integer. 254 | ValueError: Illegal ``slice`` values or ``container_length``. 
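
Example
-------
>>> # ``stop`` gets normalized so that ``abs(stop - start)`` is divisible by ``step``
>>> NormalizedSlice.from_slice(10, slice(2, 9, 3))
NormalizedSlice(start=2, stop=11, step=3, container_length=10)

>>> # backward slices use the defaults ``start = length - 1`` and ``stop = -1``
>>> NormalizedSlice.from_slice(10, slice(None, None, -1))
NormalizedSlice(start=9, stop=-1, step=-1, container_length=10)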
255 | """
256 | s2 = cls._check_and_prepare_slice(s)
257 | 
258 | if not isinstance(container_length, (int, np.int64)):
259 | raise TypeError(
260 | f"container_length must be an int but is {type(container_length).__name__}"
261 | )
262 | if container_length < 0:
263 | raise ValueError("container_length cannot be negative")
264 | 
265 | if container_length == 0:
266 | return cls(start=0, stop=0, step=1, container_length=0)
267 | 
268 | container_length = int(container_length)
269 | 
270 | default_start, default_stop = 0, container_length
271 | 
272 | if s2.step is not None:
273 | step = s2.step
274 | if step < 0:
275 | default_start, default_stop = default_stop - 1, default_start - 1
276 | else:
277 | step = 1
278 | 
279 | def limit(x: int) -> int:
280 | a = min(default_start, default_stop)
281 | b = max(default_start, default_stop)
282 | return max(a, min(b, x))
283 | 
284 | if s2.start is not None:
285 | if s2.start < 0:
286 | start = limit(container_length + s2.start)
287 | else:
288 | start = limit(s2.start)
289 | else:
290 | start = default_start
291 | 
292 | if s2.stop is not None:
293 | if s2.stop < 0:
294 | stop = limit(container_length + s2.stop)
295 | else:
296 | stop = limit(s2.stop)
297 | else:
298 | stop = default_stop
299 | 
300 | if step > 0:
301 | if stop < start:
302 | stop = start
303 | else:
304 | if stop > start:
305 | stop = start
306 | 
307 | if start == stop:
308 | return cls(start=0, stop=0, step=1, container_length=container_length)
309 | 
310 | # re-adjusting the range to be modulo `step`
311 | stop = start + step * cls._calc_len(start=start, stop=stop, step=step)
312 | 
313 | return cls(start=start, stop=stop, step=step, container_length=container_length)
314 | 
315 | def project(self, child: "NormalizedSlice") -> "NormalizedSlice":
316 | """
317 | Project a slice.
318 | 
319 | Given a parent slice (``self``) which is applied first, calculate what the combined slice would look like so it
320 | can be applied to the original data.
321 | 
322 | Parameters
323 | ----------
324 | child
325 | Second slice to apply.
326 | 
327 | Raises
328 | ------
329 | TypeError: If ``child`` is not a ``NormalizedSlice``.
330 | ValueError: If ``child.container_length`` is not the length of ``self``.
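
Returns
-------
NormalizedSlice
    The combined slice, expressed in coordinates of the original container.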
331 | 
332 | Example
333 | -------
334 | >>> # given some unknown data:
335 | >>> data = list(range(100))
336 | 
337 | >>> # and two slices:
338 | >>> parent = slice(10, -8, 2)
339 | >>> child = slice(-20, -1, -1)
340 | 
341 | >>> # and the application of both slices
342 | >>> expected = data[parent][child]
343 | 
344 | >>> # construct a slice that does both steps at once
345 | >>> from rle_array._slicing import NormalizedSlice
346 | >>> parent_normalized = NormalizedSlice.from_slice(len(data), parent)
347 | >>> child_normalized = NormalizedSlice.from_slice(len(data[parent]), child)
348 | >>> projected = parent_normalized.project(child_normalized).to_slice()
349 | >>> actual = data[projected]
350 | >>> assert actual == expected
351 | """
352 | if not isinstance(child, NormalizedSlice):
353 | raise TypeError(
354 | f"child must be NormalizedSlice but is {type(child).__name__}"
355 | )
356 | if child.container_length != len(self):
357 | raise ValueError(
358 | f"container_length of child ({child.container_length}) must be length of parent ({len(self)})"
359 | )
360 | 
361 | start = self.start + child.start * self.step
362 | stop = self.start + child.stop * self.step
363 | step = self.step * child.step
364 | 
365 | if start == stop:
366 | return type(self)(
367 | start=0, stop=0, step=1, container_length=self.container_length
368 | )
369 | 
370 | return type(self)(
371 | start=start, stop=stop, step=step, container_length=self.container_length
372 | )
373 | 
374 | def to_slice(self) -> Optional[slice]:
375 | """
376 | Convert :class:`NormalizedSlice` back to a slice.
377 | 
378 | Returns ``None`` if no slicing is applied (e.g. the whole container with ``step=1`` is taken).
379 | """
380 | start: Optional[int] = self.start
381 | stop: Optional[int] = self.stop
382 | step: Optional[int] = self.step
383 | 
384 | if self.step > 0:
385 | # forwards
386 | if self.start <= 0:
387 | start = None
388 | if self.stop >= self.container_length:
389 | stop = None
390 | if self.step == 1:
391 | step = None
392 | else:
393 | # backward
394 | if self.start >= self.container_length - 1:
395 | start = None
396 | if self.stop < 0:
397 | stop = None
398 | 
399 | if (start is None) and (stop is None) and (step is None):
400 | return None
401 | else:
402 | return slice(start, stop, step)
403 | 
-------------------------------------------------------------------------------- /rle_array/array.py: --------------------------------------------------------------------------------
1 | import logging
2 | import operator
3 | import warnings
4 | from collections import namedtuple
5 | from copy import copy
6 | from typing import Any, Callable, Dict, Iterator, Optional, Sequence, Tuple, Union
7 | from weakref import WeakSet, ref
8 | 
9 | import numpy as np
10 | import pandas as pd
11 | from pandas.api.extensions import ExtensionArray
12 | from pandas.arrays import BooleanArray, IntegerArray, StringArray
13 | from pandas.core import ops
14 | from pandas.core.algorithms import factorize, unique
15 | from pandas.core.arrays.boolean import coerce_to_array as coerce_to_boolean_array
16 | from pandas.core.dtypes.common import is_array_like
17 | from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
18 | from pandas.core.dtypes.inference import is_scalar
19 | from pandas.core.dtypes.missing import isna
20 | from pandas.errors import PerformanceWarning
21 | 
22 | from ._algorithms import (
23 | calc_lengths,
24 | compress,
25 | concat,
26 | decompress,
27 | dropna,
28 | extend_data,
29 | extend_positions,
30 | find_single_index,
31 | 
find_slice,
32 | gen_iterator,
33 | get_len,
34 | recompress,
35 | take,
36 | )
37 | from ._slicing import NormalizedSlice
38 | from .dtype import RLEDtype
39 | from .types import POSITIONS_DTYPE
40 | 
41 | _logger = logging.getLogger(__name__)
42 | 
43 | 
44 | def _normalize_arraylike_indexing(arr: Any, length: int) -> np.ndarray:
45 | """
46 | Normalize array-like index arguments for ``__getitem__`` and ``__setitem__``.
47 | 
48 | This is required since pandas can pass us many different types with potentially nullable data.
49 | 
50 | Parameters
51 | ----------
52 | arr
53 | Index argument passed to ``__getitem__`` and ``__setitem__`` if arraylike.
54 | length
55 | Array length.
56 | """
57 | if isinstance(arr, BooleanArray):
58 | result = np.asarray(arr.fillna(False), dtype=bool)
59 | elif isinstance(arr, IntegerArray):
60 | try:
61 | return np.asarray(arr, dtype=int)
62 | except ValueError:
63 | raise ValueError(
64 | "Cannot index with an integer indexer containing NA values"
65 | )
66 | elif isinstance(arr, RLEArray):
67 | result = np.asarray(arr, dtype=arr.dtype._dtype)
68 | elif isinstance(arr, list):
69 | if any((pd.isna(x) for x in arr)):
70 | raise ValueError(
71 | "Cannot index with an integer indexer containing NA values"
72 | )
73 | result = np.asarray(arr)
74 | else:
75 | result = np.asarray(arr)
76 | 
77 | if (result.dtype == np.bool_) and (len(result) != length):
78 | raise IndexError("Indexer has wrong length")
79 | 
80 | return result
81 | 
82 | 
83 | class _ViewAnchor:
84 | """
85 | Anchor object that references an RLEArray; needed because RLEArray itself is not hashable and therefore cannot be stored in a ``WeakSet`` directly.
86 | """
87 | 
88 | def __init__(self, array: "RLEArray") -> None:
89 | self.array = ref(array)
90 | 
91 | def __hash__(self) -> int:
92 | return id(self.array)
93 | 
94 | 
95 | class _ViewMaster:
96 | """
97 | Collection of all views to an array.
98 | 
99 | This tracks the original data as well as all views.
100 | """
101 | 
102 | def __init__(self, data: np.ndarray, positions: np.ndarray):
103 | self.data = data
104 | self.positions = positions
105 | self.views: WeakSet[_ViewAnchor] = WeakSet()
106 | 
107 | @classmethod
108 | def register_first(cls, array: "RLEArray") -> "_Projection":
109 | """
110 | Register array with new master.
111 | 
112 | The array must not have a view master yet!
113 | """
114 | assert getattr(array, "_projection", None) is None
115 | 
116 | projection = _Projection(
117 | projection_slice=None,
118 | master=cls(data=array._data, positions=array._positions),
119 | )
120 | projection.master.views.add(array._view_anchor)
121 | return projection
122 | 
123 | def register_change(
124 | self, array: "RLEArray", projection_slice: Optional[slice]
125 | ) -> None:
126 | """
127 | Re-register array with new view-master.
128 | 
129 | The array must currently be registered with a single, orphan master (i.e. a master that only tracks this array)!
130 | """
131 | # ensure the array is only registered with another orphan master
132 | assert array._projection is not None
133 | assert array._projection.projection_slice is None
134 | assert array._projection.master is not self
135 | assert len(array._projection.master.views) == 1
136 | assert array._view_anchor not in self.views
137 | 
138 | array._projection = _Projection(projection_slice=projection_slice, master=self)
139 | self.views.add(array._view_anchor)
140 | 
141 | def modify(self, data: np.ndarray, positions: np.ndarray) -> None:
142 | """
143 | Modify the original (unprojected) data and propagate the change to all views.
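
Parameters
----------
data
    New (unprojected) run data.
positions
    New (unprojected) run end-positions.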
144 | """
145 | self.data = data
146 | self.positions = positions
147 | 
148 | for view in self.views:
149 | array = view.array()
150 | assert array is not None
151 | assert array._projection is not None
152 | assert array._projection.master is self
153 | 
154 | if array._projection.projection_slice is not None:
155 | data2, positions2 = find_slice(
156 | data=self.data,
157 | positions=self.positions,
158 | s=array._projection.projection_slice,
159 | )
160 | else:
161 | data2, positions2 = self.data, self.positions
162 | 
163 | array._data = data2
164 | array._positions = positions2
165 | 
166 | 
167 | _Projection = namedtuple("_Projection", ["master", "projection_slice"])
168 | 
169 | 
170 | class RLEArray(ExtensionArray):
171 | """
172 | Run-length encoded array.
173 | 
174 | Parameters
175 | ----------
176 | data
177 | Data for each run. Must be one-dimensional. All Pandas-supported dtypes are supported.
178 | positions
179 | End-positions for each run. Must be one-dimensional and must have same length as ``data``. dtype must be
180 | ``POSITIONS_DTYPE``.
181 | """
182 | 
183 | _HANDLED_TYPES = tuple(
184 | t for types in np.sctypes.values() for t in types if t is not object
185 | ) + (np.ndarray, list, tuple, int, float, complex)
186 | 
187 | # For comparisons, so that numpy uses our implementation.
188 | __array_priority__ = 1000
189 | 
190 | def __init__(self, data: np.ndarray, positions: np.ndarray):
191 | if not isinstance(data, np.ndarray):
192 | raise TypeError(f"data must be an ndarray but is {type(data).__name__}")
193 | if not isinstance(positions, np.ndarray):
194 | raise TypeError(
195 | f"positions must be an ndarray but is {type(positions).__name__}"
196 | )
197 | if data.ndim != 1:
198 | raise ValueError(
199 | f"data must be a 1-dimensional ndarray but has {data.ndim} dimensions"
200 | )
201 | if positions.ndim != 1:
202 | raise ValueError(
203 | f"positions must be a 1-dimensional ndarray but has {positions.ndim} dimensions"
204 | )
205 | if positions.dtype != POSITIONS_DTYPE:
206 | raise ValueError(
207 | f"positions must have dtype {POSITIONS_DTYPE.__name__} but has {positions.dtype}"
208 | )
209 | if len(data) != len(positions):
210 | raise ValueError(
211 | f"data and positions must have same length but have {len(data)} and {len(positions)}"
212 | )
213 | if np.any(positions[1:] <= positions[:-1]):
214 | raise ValueError("positions must be strictly increasing")
215 | 
216 | _logger.debug(
217 | "RLEArray.__init__(data=%s(len=%r, dtype=%r), positions=%s(len=%r, dtype=%r))",
218 | type(data).__name__,
219 | len(data),
220 | data.dtype,
221 | type(positions).__name__,
222 | len(positions),
223 | positions.dtype,
224 | )
225 | 
226 | self._dtype = RLEDtype(data.dtype)
227 | self._data = data
228 | self._positions = positions
229 | self._setup_view_system()
230 | 
231 | def _setup_view_system(self) -> None:
232 | """
233 | Set up all view-related tracking parts.
234 | 
235 | Must be called after initialization or unpickling.
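Pickling drops the view tracking state (see ``__getstate__``), since weak references cannot be
serialized, so ``__setstate__`` has to re-create it via this method.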
236 | """
237 | self._view_anchor = _ViewAnchor(self)
238 | self._projection = _ViewMaster.register_first(self)
239 | 
240 | def __getstate__(self) -> Dict[str, Any]:
241 | state = copy(self.__dict__)
242 | del state["_view_anchor"]
243 | del state["_projection"]
244 | return state
245 | 
246 | def __setstate__(self, state: Dict[str, Any]) -> None:
247 | self.__dict__.update(state)
248 | self._setup_view_system()
249 | 
250 | @property
251 | def _lengths(self) -> Any:
252 | return calc_lengths(self._positions)
253 | 
254 | @classmethod
255 | def _from_sequence(
256 | cls, scalars: Any, dtype: Any = None, copy: bool = False
257 | ) -> "RLEArray":
258 | _logger.debug(
259 | "RLEArray._from_sequence(scalars=%s(...), dtype=%r, copy=%r)",
260 | type(scalars).__name__,
261 | dtype,
262 | copy,
263 | )
264 | if isinstance(dtype, RLEDtype):
265 | dtype = dtype._dtype
266 | 
267 | if isinstance(scalars, np.ndarray):
268 | if (dtype is not None) and (dtype != scalars.dtype):
269 | # some cast required
270 | if dtype == np.bool_:
271 | # bool case
272 | scalars, mask = coerce_to_boolean_array(scalars)
273 | if mask.any():
274 | raise TypeError("Masked booleans are not supported")
275 | else:
276 | # catch-them-all case
277 | # TODO: get rid of this unsafe cast
278 | scalars = scalars.astype(dtype)
279 | else:
280 | scalars = np.asarray(scalars, dtype=dtype)
281 | data, positions = compress(scalars)
282 | return RLEArray(data=data, positions=positions)
283 | 
284 | @classmethod
285 | def _from_factorized(cls, data: Any, original: "RLEArray") -> "RLEArray":
286 | _logger.debug("RLEArray._from_factorized(...)")
287 | return cls._from_sequence(np.asarray(data, dtype=original.dtype._dtype))
288 | 
289 | def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
290 | # decompressing version of `_values_for_factorize` which is not only required for `factorize` but also for other
291 | # things like `pandas.core.util.hashing.hash_array`
292 | return decompress(self._data, self._positions), self.dtype.na_value
293 | 
294 | def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "RLEArray"]:
295 | # optimized version of `ExtensionArray.factorize`:
296 | # 1. replace `_values_for_factorize` with a version that does not decompress the data
297 | # 2. passing compressed data to `factorize` (instead of `_factorize_array` because that does not handle NA
298 | # values nicely)
299 | # 3. decompress `codes`
300 | arr = self._data
301 | 
302 | codes, uniques = factorize(arr, na_sentinel=na_sentinel)
303 | 
304 | uniques = self._from_factorized(uniques, self)
305 | codes = decompress(codes, self._positions)
306 | return codes, uniques
307 | 
308 | def __getitem__(self, arr: Any) -> Any:
309 | _logger.debug("RLEArray.__getitem__(arr=%s(...))", type(arr).__name__)
310 | if isinstance(arr, tuple):
311 | # This is for example called by Pandas as values[:, None] to prepare the data for the cythonized
312 | # aggregation. Since we do not want to support aggregation by decompressing the data, it is OK to not
313 | # implement this.
314 | raise NotImplementedError( 315 | "__getitem__ does currently only work w/ a single parameter" 316 | ) 317 | 318 | if is_array_like(arr) or isinstance(arr, list): 319 | arr = _normalize_arraylike_indexing(arr, len(self)) 320 | 321 | if arr.dtype == np.bool_: 322 | arr = np.arange(len(self))[arr] 323 | else: 324 | arr = arr.astype(int) 325 | 326 | if len(arr) == 0: 327 | return RLEArray(data=self._data[[]], positions=self._positions[[]]) 328 | 329 | arr[arr < 0] += len(self) 330 | 331 | data, positions = take( 332 | data=self._data, 333 | positions=self._positions, 334 | indices=arr, 335 | allow_fill=False, 336 | fill_value=self.dtype.na_value, 337 | ) 338 | return RLEArray(data=data, positions=positions) 339 | elif isinstance(arr, slice): 340 | data, positions = find_slice(self._data, self._positions, arr) 341 | parent_normalized = NormalizedSlice.from_slice( 342 | get_len(self._projection.master.positions), 343 | self._projection.projection_slice, 344 | ) 345 | child_normalized = NormalizedSlice.from_slice(len(self), arr) 346 | subslice = parent_normalized.project(child_normalized).to_slice() 347 | result = RLEArray(data=data, positions=positions) 348 | self._projection.master.register_change(result, subslice) 349 | return result 350 | else: 351 | if arr < 0: 352 | arr = arr + len(self) 353 | return find_single_index(self._data, self._positions, arr) 354 | 355 | def __setitem__(self, index: Any, data: Any) -> None: 356 | _logger.debug("RLEArray.__setitem__(...)") 357 | 358 | # get master data 359 | orig = decompress( 360 | data=self._projection.master.data, 361 | positions=self._projection.master.positions, 362 | ) 363 | 364 | # get our view 365 | if self._projection.projection_slice is not None: 366 | sub = orig[self._projection.projection_slice] 367 | else: 368 | sub = orig 369 | 370 | # prepare index 371 | if is_array_like(index) or isinstance(index, list): 372 | index = _normalize_arraylike_indexing(index, len(self)) 373 | 374 | # modify master data through view 375 | sub[index] = data 376 | 377 | # commit to all views (including self) 378 | data, positions = compress(orig) 379 | self._projection.master.modify(data, positions) 380 | 381 | def __len__(self) -> int: 382 | _logger.debug("RLEArray.__len__()") 383 | return get_len(self._positions) 384 | 385 | @property 386 | def dtype(self) -> RLEDtype: 387 | _logger.debug("RLEArray.dtype") 388 | return self._dtype 389 | 390 | @property 391 | def nbytes(self) -> int: 392 | _logger.debug("RLEArray.nbytes") 393 | return int(self._data.nbytes) + int(self._positions.nbytes) 394 | 395 | def isna(self) -> "RLEArray": 396 | _logger.debug("RLEArray.isna()") 397 | return RLEArray(data=pd.isna(self._data), positions=self._positions.copy()) 398 | 399 | def take( 400 | self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None 401 | ) -> "RLEArray": 402 | _logger.debug( 403 | "RLEArray.take(indices=%s(len=%s), allow_fill=%r, fill_value=%r)", 404 | type(indices).__name__, 405 | len(indices), 406 | allow_fill, 407 | fill_value, 408 | ) 409 | if fill_value is None: 410 | fill_value = self.dtype.na_value 411 | 412 | indices = np.asarray(indices) 413 | 414 | data, positions = take( 415 | self._data, self._positions, indices, allow_fill, fill_value 416 | ) 417 | return RLEArray(data=data, positions=positions) 418 | 419 | def copy(self) -> "RLEArray": 420 | _logger.debug("RLEArray.copy()") 421 | return RLEArray(data=self._data.copy(), positions=self._positions.copy()) 422 | 423 | @classmethod 424 | def _concat_same_type(cls, 
to_concat: Sequence["RLEArray"]) -> "RLEArray": 425 | t_to_concat = type(to_concat) 426 | to_concat = list(to_concat) 427 | _logger.debug( 428 | "RLEArray._concat_same_type(to_concat=%s(len=%i))", 429 | t_to_concat.__name__, 430 | len(to_concat), 431 | ) 432 | data, positions = concat( 433 | [s._data for s in to_concat], [s._positions for s in to_concat] 434 | ) 435 | return RLEArray(data=data, positions=positions) 436 | 437 | def __array__(self, dtype: Any = None) -> Any: 438 | _logger.debug("RLEArray.__array__(dtype=%r)", dtype) 439 | warnings.warn("performance: __array__ blows up data", PerformanceWarning) 440 | if dtype is None: 441 | dtype = self.dtype._dtype 442 | 443 | return decompress(self._data, self._positions, dtype) 444 | 445 | def astype(self, dtype: Any, copy: bool = True, casting: str = "unsafe") -> Any: 446 | _logger.debug("RLEArray.astype(dtype=%r, copy=%r)", dtype, copy) 447 | if isinstance(dtype, RLEDtype): 448 | if (not copy) and (dtype == self.dtype): 449 | return self 450 | return RLEArray( 451 | data=self._data.astype(dtype._dtype, casting=casting), 452 | positions=self._positions.copy(), 453 | ) 454 | if isinstance(dtype, pd.StringDtype): 455 | # TODO: fast-path 456 | return StringArray._from_sequence([str(x) for x in self]) 457 | 458 | if casting != "unsafe": 459 | return np.array(self, copy=copy).astype(dtype=dtype, casting=casting) 460 | else: 461 | return np.array(self, dtype=dtype, copy=copy) 462 | 463 | def _get_reduce_data(self, skipna: bool) -> Any: 464 | data = self._data 465 | if skipna: 466 | data = data[pd.notnull(data)] 467 | return data 468 | 469 | def _get_reduce_data_len(self, skipna: bool) -> Any: 470 | data = self._data 471 | lengths = self._lengths 472 | if skipna: 473 | mask = pd.notnull(data) 474 | data = data[mask] 475 | lengths = lengths[mask] 476 | return data, lengths 477 | 478 | def all(self, axis: Optional[int] = 0, out: Any = None) -> bool: 479 | _logger.debug("RLEArray.all()") 480 | if (axis is not None) and (axis != 0): 481 | raise NotImplementedError("Only axis=0 is supported.") 482 | if out is not None: 483 | raise NotImplementedError("out parameter is not supported.") 484 | 485 | return bool(np.all(self._data)) 486 | 487 | def any(self, axis: Optional[int] = 0, out: Any = None) -> bool: 488 | _logger.debug("RLEArray.any(axis=%r, out=%r)", axis, out) 489 | if (axis is not None) and (axis != 0): 490 | raise NotImplementedError("Only axis=0 is supported.") 491 | if out is not None: 492 | raise NotImplementedError("out parameter is not supported.") 493 | 494 | return bool(np.any(self._data)) 495 | 496 | def kurt(self, skipna: bool = True) -> Any: 497 | _logger.debug("RLEArray.kurt(skipna=%r)", skipna) 498 | # TODO: fast-path 499 | data = np.asarray(self) 500 | return pd.Series(data).kurt(skipna=skipna) 501 | 502 | def max(self, skipna: bool = True, axis: Optional[int] = 0, out: Any = None) -> Any: 503 | _logger.debug("RLEArray.max(skipna=%r)", skipna) 504 | if (axis is not None) and (axis != 0): 505 | raise NotImplementedError("Only axis=0 is supported.") 506 | if out is not None: 507 | raise NotImplementedError("out parameter is not supported.") 508 | 509 | data = self._get_reduce_data(skipna) 510 | if len(data): 511 | return np.max(data) 512 | else: 513 | return self.dtype.na_value 514 | 515 | def mean( 516 | self, 517 | skipna: bool = True, 518 | dtype: Optional[Any] = None, 519 | axis: Optional[int] = 0, 520 | out: Any = None, 521 | ) -> Any: 522 | _logger.debug("RLEArray.mean(skipna=%r)", skipna) 523 | if (axis is not None) and 
(axis != 0): 524 | raise NotImplementedError("Only axis=0 is supported.") 525 | if out is not None: 526 | raise NotImplementedError("out parameter is not supported.") 527 | if dtype is not None: 528 | raise NotImplementedError("dtype parameter is not supported.") 529 | 530 | data, lengths = self._get_reduce_data_len(skipna) 531 | n = lengths.sum() if skipna else len(self) 532 | if n == 0: 533 | return self.dtype.na_value 534 | else: 535 | return np.dot(data, lengths) / np.float64(n) 536 | 537 | def median( 538 | self, skipna: bool = True, axis: Optional[int] = 0, out: Any = None 539 | ) -> Any: 540 | _logger.debug("RLEArray.median(skipna=%r)", skipna) 541 | if (axis is not None) and (axis != 0): 542 | raise NotImplementedError("Only axis=0 is supported.") 543 | if out is not None: 544 | raise NotImplementedError("out parameter is not supported.") 545 | 546 | # TODO: fast-path 547 | data = np.asarray(self) 548 | if skipna: 549 | data = data[pd.notnull(data)] 550 | 551 | if len(data) == 0: 552 | return self.dtype.na_value 553 | else: 554 | return np.median(data) 555 | 556 | def min(self, skipna: bool = True, axis: Optional[int] = 0, out: Any = None) -> Any: 557 | _logger.debug("RLEArray.min(skipna=%r)", skipna) 558 | if (axis is not None) and (axis != 0): 559 | raise NotImplementedError("Only axis=0 is supported.") 560 | if out is not None: 561 | raise NotImplementedError("out parameter is not supported.") 562 | 563 | data = self._get_reduce_data(skipna) 564 | if len(data): 565 | return np.min(data) 566 | else: 567 | return self.dtype.na_value 568 | 569 | def prod( 570 | self, skipna: bool = True, axis: Optional[int] = 0, out: Any = None 571 | ) -> Any: 572 | _logger.debug("RLEArray.prod(skipna=%r)", skipna) 573 | if (axis is not None) and (axis != 0): 574 | raise NotImplementedError("Only axis=0 is supported.") 575 | if out is not None: 576 | raise NotImplementedError("out parameter is not supported.") 577 | 578 | data, lengths = self._get_reduce_data_len(skipna) 579 | return np.prod(np.power(data, lengths)) 580 | 581 | def skew(self, skipna: bool = True) -> Any: 582 | _logger.debug("RLEArray.skew(skipna=%r)", skipna) 583 | # TODO: fast-path 584 | data = np.asarray(self) 585 | return pd.Series(data).skew(skipna=skipna) 586 | 587 | def std( 588 | self, 589 | skipna: bool = True, 590 | ddof: int = 1, 591 | dtype: Optional[Any] = None, 592 | axis: Optional[int] = 0, 593 | out: Any = None, 594 | ) -> Any: 595 | _logger.debug("RLEArray.std(skipna=%r, ddof=%r)", skipna, ddof) 596 | if (axis is not None) and (axis != 0): 597 | raise NotImplementedError("Only axis=0 is supported.") 598 | if out is not None: 599 | raise NotImplementedError("out parameter is not supported.") 600 | if dtype is not None: 601 | raise NotImplementedError("dtype parameter is not supported.") 602 | 603 | # TODO: fast-path 604 | data = np.asarray(self).astype(dtype) 605 | # use pandas-style std, since numpy results in different results 606 | return pd.Series(data).std(skipna=skipna, ddof=ddof) 607 | 608 | def sum(self, skipna: bool = True, axis: Optional[int] = 0, out: Any = None) -> Any: 609 | _logger.debug("RLEArray.sum(skipna=%r)", skipna) 610 | if (axis is not None) and (axis != 0): 611 | raise NotImplementedError("Only axis=0 is supported.") 612 | if out is not None: 613 | raise NotImplementedError("out parameter is not supported.") 614 | 615 | data, lengths = self._get_reduce_data_len(skipna) 616 | return np.dot(data, lengths) 617 | 618 | def var( 619 | self, 620 | skipna: bool = True, 621 | ddof: int = 1, 622 | 
dtype: Optional[Any] = None, 623 | axis: Optional[int] = 0, 624 | out: Any = None, 625 | ) -> Any: 626 | _logger.debug("RLEArray.var(skipna=%r)", skipna) 627 | if (axis is not None) and (axis != 0): 628 | raise NotImplementedError("Only axis=0 is supported.") 629 | if out is not None: 630 | raise NotImplementedError("out parameter is not supported.") 631 | if dtype is not None: 632 | raise NotImplementedError("dtype parameter is not supported.") 633 | 634 | # TODO: fast-path 635 | data = np.asarray(self).astype(dtype) 636 | # use pandas-style var, since numpy results in different results 637 | return pd.Series(data).var(skipna=skipna, ddof=ddof) 638 | 639 | def _reduce(self, name: str, skipna: bool = True, **kwargs: Any) -> Any: 640 | _logger.debug( 641 | "RLEArray._reduce(name=%r, skipna=%r, **kwargs=%r)", name, skipna, kwargs 642 | ) 643 | if name == "all": 644 | return self.all() 645 | elif name == "any": 646 | return self.any() 647 | elif name == "kurt": 648 | return self.kurt(skipna=skipna) 649 | elif name == "max": 650 | return self.max(skipna=skipna) 651 | elif name == "mean": 652 | return self.mean(skipna=skipna) 653 | elif name == "median": 654 | return self.median(skipna=skipna) 655 | elif name == "min": 656 | return self.min(skipna=skipna) 657 | elif name == "prod": 658 | return self.prod(skipna=skipna) 659 | elif name == "skew": 660 | return self.skew(skipna=skipna) 661 | elif name == "std": 662 | return self.std(skipna=skipna, ddof=int(kwargs.get("ddof", 1))) 663 | elif name == "sum": 664 | return self.sum(skipna=skipna) 665 | elif name == "var": 666 | return self.var(skipna=skipna) 667 | else: 668 | raise NotImplementedError(f"reduction {name} is not implemented.") 669 | 670 | def view(self, dtype: Optional[Any] = None) -> Any: 671 | _logger.debug("RLEArray.view(dtype=%r)", dtype) 672 | if dtype is None: 673 | dtype = self.dtype._dtype 674 | if isinstance(dtype, RLEDtype): 675 | dtype = dtype._dtype 676 | if dtype != self.dtype._dtype: 677 | raise ValueError("Cannot create view with different dtype.") 678 | 679 | result = RLEArray(data=self._data.copy(), positions=self._positions.copy()) 680 | self._projection.master.register_change(result, None) 681 | return result 682 | 683 | def dropna(self) -> "RLEArray": 684 | _logger.debug("RLEArray.dropna()") 685 | data, positions = dropna(self._data, self._positions) 686 | return RLEArray(data=data, positions=positions) 687 | 688 | def value_counts(self, dropna: bool = True) -> pd.Series: 689 | _logger.debug("RLEArray.value_counts(dropna=%r)", dropna) 690 | # TODO: add fast-path 691 | return pd.Series(np.asarray(self)).value_counts(dropna=dropna) 692 | 693 | def __iter__(self) -> Iterator[Any]: 694 | _logger.debug("RLEArray.__iter__()") 695 | warnings.warn("performance: __iter__ blows up entire data", PerformanceWarning) 696 | return gen_iterator(self._data, self._positions) 697 | 698 | def __array_ufunc__( 699 | self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any 700 | ) -> Union[None, "RLEArray", np.ndarray]: 701 | _logger.debug("RLEArray.__array_ufunc__(...)") 702 | out = kwargs.get("out", ()) 703 | for x in inputs + out: 704 | # Only support operations with instances of _HANDLED_TYPES. 705 | # Use ArrayLike instead of type(self) for isinstance to 706 | # allow subclasses that don't override __array_ufunc__ to 707 | # handle ArrayLike objects. 708 | if not isinstance(x, self._HANDLED_TYPES + (RLEArray,)): 709 | return NotImplemented 710 | 711 | # Defer to the implementation of the ufunc on unwrapped values. 
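# Overall flow: (1) remember whether any plain ndarray takes part (a hint that the operation
# broadcasts), (2) decompress all RLEArray inputs and ``out`` targets to plain ndarrays,
# (3) run the ufunc on the decompressed data, and (4) re-compress 1-dimensional results back
# to RLEArray unless a plain ndarray was involved.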
712 | inputs_has_ndarray = any(isinstance(x, np.ndarray) for x in inputs) 713 | inputs = tuple(np.asarray(x) if isinstance(x, RLEArray) else x for x in inputs) 714 | 715 | if out: 716 | kwargs["out"] = tuple( 717 | np.asarray(x) if isinstance(x, RLEArray) else x for x in out 718 | ) 719 | result = getattr(ufunc, method)(*inputs, **kwargs) 720 | if out: 721 | for x, y in zip(out, kwargs["out"]): 722 | if isinstance(x, RLEArray): 723 | x[:] = y 724 | 725 | def maybe_from_sequence(x: np.ndarray) -> Union[RLEArray, np.ndarray]: 726 | if (x.ndim == 1) and (not inputs_has_ndarray): 727 | # suitable for RLE compression 728 | return type(self)._from_sequence(x) 729 | else: 730 | # likely a broadcast operation 731 | return x 732 | 733 | if type(result) is tuple: 734 | # multiple return values 735 | return tuple(maybe_from_sequence(x) for x in result) 736 | elif method == "at": 737 | assert result is None 738 | 739 | # inplace modification 740 | self[:] = inputs[0] 741 | 742 | # no return value 743 | return None 744 | else: 745 | # one return value 746 | return maybe_from_sequence(result) 747 | 748 | def __eq__(self, other: Any) -> Union["RLEArray", np.ndarray]: 749 | return self._apply_binary_operator(other, op=operator.eq) 750 | 751 | def __ne__(self, other: Any) -> Union["RLEArray", np.ndarray]: 752 | return self._apply_binary_operator(other, op=operator.ne) 753 | 754 | def __gt__(self, other: Any) -> Union["RLEArray", np.ndarray]: 755 | return self._apply_binary_operator(other, op=operator.gt) 756 | 757 | def __ge__(self, other: Any) -> Union["RLEArray", np.ndarray]: 758 | return self._apply_binary_operator(other, op=operator.ge) 759 | 760 | def __lt__(self, other: Any) -> Union["RLEArray", np.ndarray]: 761 | return self._apply_binary_operator(other, op=operator.lt) 762 | 763 | def __le__(self, other: Any) -> Union["RLEArray", np.ndarray]: 764 | return self._apply_binary_operator(other, op=operator.le) 765 | 766 | def __add__(self, other: Any) -> Union["RLEArray", np.ndarray]: 767 | return self._apply_binary_operator(other, op=operator.add) 768 | 769 | def __radd__(self, other: Any) -> Union["RLEArray", np.ndarray]: 770 | return self._apply_binary_operator(other, op=ops.radd) 771 | 772 | def __sub__(self, other: Any) -> Union["RLEArray", np.ndarray]: 773 | return self._apply_binary_operator(other, op=operator.sub) 774 | 775 | def __rsub__(self, other: Any) -> Union["RLEArray", np.ndarray]: 776 | return self._apply_binary_operator(other, op=ops.rsub) 777 | 778 | def __mul__(self, other: Any) -> Union["RLEArray", np.ndarray]: 779 | return self._apply_binary_operator(other, op=operator.mul) 780 | 781 | def __rmul__(self, other: Any) -> Union["RLEArray", np.ndarray]: 782 | return self._apply_binary_operator(other, op=ops.rmul) 783 | 784 | def __truediv__(self, other: Any) -> Union["RLEArray", np.ndarray]: 785 | return self._apply_binary_operator(other, op=operator.truediv) 786 | 787 | def __rtruediv__(self, other: Any) -> Union["RLEArray", np.ndarray]: 788 | return self._apply_binary_operator(other, op=ops.rtruediv) 789 | 790 | def __floordiv__(self, other: Any) -> Union["RLEArray", np.ndarray]: 791 | return self._apply_binary_operator(other, op=operator.floordiv) 792 | 793 | def __rfloordiv__(self, other: Any) -> Union["RLEArray", np.ndarray]: 794 | return self._apply_binary_operator(other, op=ops.rfloordiv) 795 | 796 | def __mod__(self, other: Any) -> Union["RLEArray", np.ndarray]: 797 | return self._apply_binary_operator(other, op=operator.mod) 798 | 799 | def __rmod__(self, other: Any) -> 
Union["RLEArray", np.ndarray]: 800 | return self._apply_binary_operator(other, op=ops.rmod) 801 | 802 | def __pow__(self, other: Any) -> Union["RLEArray", np.ndarray]: 803 | return self._apply_binary_operator(other, op=operator.pow) 804 | 805 | def __rpow__(self, other: Any) -> Union["RLEArray", np.ndarray]: 806 | return self._apply_binary_operator(other, op=ops.rpow) 807 | 808 | def __and__(self, other: Any) -> Union["RLEArray", np.ndarray]: 809 | return self._apply_binary_operator(other, op=operator.and_) 810 | 811 | def __rand__(self, other: Any) -> Union["RLEArray", np.ndarray]: 812 | return self._apply_binary_operator(other, op=ops.rand_) 813 | 814 | def __or__(self, other: Any) -> Union["RLEArray", np.ndarray]: 815 | return self._apply_binary_operator(other, op=operator.or_) 816 | 817 | def __ror__(self, other: Any) -> Union["RLEArray", np.ndarray]: 818 | return self._apply_binary_operator(other, op=ops.ror_) 819 | 820 | def __xor__(self, other: Any) -> Union["RLEArray", np.ndarray]: 821 | return self._apply_binary_operator(other, op=operator.xor) 822 | 823 | def __rxor__(self, other: Any) -> Union["RLEArray", np.ndarray]: 824 | return self._apply_binary_operator(other, op=ops.rxor) 825 | 826 | def __pos__(self) -> "RLEArray": 827 | return self._apply_unary_operator(op=operator.pos) 828 | 829 | def __neg__(self) -> "RLEArray": 830 | return self._apply_unary_operator(op=operator.neg) 831 | 832 | def __abs__(self) -> "RLEArray": 833 | return self._apply_unary_operator(op=operator.abs) 834 | 835 | def __invert__(self) -> "RLEArray": 836 | _logger.debug("RLEArray.__invert__()") 837 | return self._apply_unary_operator(op=operator.inv) 838 | 839 | def _apply_binary_operator( 840 | self, other: Any, op: Any 841 | ) -> Union["RLEArray", np.ndarray]: 842 | if isinstance(other, (ABCSeries, ABCIndexClass)): 843 | # rely on pandas to unbox and dispatch to us 844 | return NotImplemented 845 | 846 | if is_scalar(other): 847 | with np.errstate(invalid="ignore"): 848 | new_data = op(self._data, other) 849 | return RLEArray(*recompress(new_data, self._positions)) 850 | elif isinstance(other, RLEArray): 851 | if len(self) != len(other): 852 | raise ValueError("arrays have different lengths") 853 | extended_positions = extend_positions(self._positions, other._positions) 854 | data_self = extend_data( 855 | data=self._data, 856 | positions=self._positions, 857 | extended_positions=extended_positions, 858 | ) 859 | data_other = extend_data( 860 | data=other._data, 861 | positions=other._positions, 862 | extended_positions=extended_positions, 863 | ) 864 | with np.errstate(invalid="ignore"): 865 | new_data = op(data_self, data_other) 866 | return RLEArray(*recompress(new_data, extended_positions)) 867 | else: 868 | array = self.__array__() 869 | with np.errstate(invalid="ignore"): 870 | return op(array, other) 871 | 872 | def _apply_unary_operator(self, op: Any) -> "RLEArray": 873 | return RLEArray(data=op(self._data), positions=self._positions.copy()) 874 | 875 | def shift(self, periods: int = 1, fill_value: object = None) -> "RLEArray": 876 | self2 = self 877 | dtype = self.dtype 878 | 879 | if isna(fill_value): 880 | fill_value = self.dtype.na_value 881 | np_dtype_fill = np.asarray([fill_value]).dtype 882 | if np_dtype_fill.kind != dtype.kind: 883 | dtype = RLEDtype(np_dtype_fill) 884 | self2 = self.astype(dtype) 885 | 886 | if not len(self) or periods == 0: 887 | return self2.copy() 888 | 889 | empty = RLEArray( 890 | data=np.asarray([fill_value], dtype=dtype._dtype), 891 | 
positions=np.asarray([min(abs(periods), len(self))], dtype=POSITIONS_DTYPE),
892 | )
893 | 
894 | if periods > 0:
895 | a = empty
896 | b = self2[:-periods]
897 | else:
898 | a = self2[abs(periods) :]
899 | b = empty
900 | return self._concat_same_type([a, b])
901 | 
902 | def fillna(
903 | self,
904 | value: Any = None,
905 | method: Optional[str] = None,
906 | limit: Optional[int] = None,
907 | ) -> "RLEArray":
908 | # TODO: fast-path
909 | arr = pd.Series(np.asarray(self)).array.fillna(value, method, limit).to_numpy()
910 | data, positions = compress(arr)
911 | return RLEArray(data=data, positions=positions)
912 | 
913 | def round(self, decimals: int = 0) -> "RLEArray":
914 | _logger.debug("RLEArray.round(decimals=%r)", decimals)
915 | new_data = self._data.round(decimals)
916 | return RLEArray(*recompress(new_data, self._positions))
917 | 
918 | def unique(self) -> "RLEArray":
919 | uniques = unique(self._data)
920 | return RLEArray(
921 | data=uniques,
922 | positions=np.arange(1, 1 + len(uniques), dtype=POSITIONS_DTYPE),
923 | )
924 | 
-------------------------------------------------------------------------------- /rle_array/autoconversion.py: --------------------------------------------------------------------------------
1 | import warnings
2 | from typing import Optional, Union
3 | 
4 | import numpy as np
5 | import pandas as pd
6 | from pandas.api.extensions import ExtensionDtype
7 | from pandas.errors import PerformanceWarning
8 | 
9 | from .array import RLEArray
10 | from .dtype import RLEDtype
11 | 
12 | 
13 | def _is_rle_dtype(dtype: Union[np.dtype, ExtensionDtype]) -> bool:
14 | """
15 | Checks if the given dtype is already RLE compressed.
16 | 
17 | Parameters
18 | ----------
19 | dtype
20 | Input dtype.
21 | """
22 | return isinstance(dtype, RLEDtype)
23 | 
24 | 
25 | def _uses_datetimeblock(dtype: Union[np.dtype, ExtensionDtype]) -> bool:
26 | """
27 | Detects if the RLEArray would use a pandas ``DatetimeBlock``.
28 | 
29 | It seems to be a bug in pandas that it cannot deal with datetime extension arrays.
30 | 
31 | Parameters
32 | ----------
33 | dtype
34 | Dtype of the original, uncompressed array.
35 | """
36 | vtype = dtype.type
37 | return issubclass(vtype, np.datetime64)
38 | 
39 | 
40 | def auto_convert_to_rle(
41 | df: pd.DataFrame, threshold: Optional[float] = None
42 | ) -> pd.DataFrame:
43 | """
44 | Auto-convert given DataFrame to RLE compressed DataFrame.
45 | 
46 | .. important::
47 | 
48 | Datetime columns are currently not compressed due to pandas not supporting them.
49 | 
50 | Please note that RLE can, under some circumstances, require MORE memory than the uncompressed data. It is not
51 | advisable to set ``threshold`` to a value larger than 1 except for testing purposes.
52 | 
53 | Parameters
54 | ----------
55 | df
56 | Input DataFrame, may already contain RLE columns. This input data MIGHT not be copied!
57 | threshold
58 | Compression threshold, e.g.:
59 | 
60 | - ``None``: compress all columns
61 | - ``1.0``: compress only if RLE does NOT take up more space
62 | - ``0.5``: compress only if at least 50% of the memory is saved
63 | - ``0.0``: do not compress at all
64 | 
65 | Raises
66 | ------
67 | ValueError
68 | If threshold is negative.
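
Example
-------
>>> # a constant column compresses well and gets converted
>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame({"x": np.array([1, 1, 1, 1], dtype=np.int64)})
>>> auto_convert_to_rle(df).dtypes
x    RLEDtype[int64]
dtype: object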
69 | """ 70 | if (threshold is not None) and (threshold < 0.0): 71 | raise ValueError(f"threshold ({threshold}) must be non-negative") 72 | 73 | index = df.index 74 | 75 | data = {} 76 | for col in df.columns: 77 | series = df[col] 78 | array_orig = series.array 79 | 80 | array_target = array_orig 81 | 82 | dtype = series.dtype 83 | 84 | if not _is_rle_dtype(dtype): 85 | if _uses_datetimeblock(dtype): 86 | warnings.warn( 87 | f"Column {col} would use a DatetimeBlock and can currently not be RLE compressed." 88 | ) 89 | else: 90 | array_rle = RLEArray._from_sequence( 91 | scalars=array_orig, dtype=dtype, copy=True 92 | ) 93 | if threshold is None: 94 | array_target = array_rle 95 | elif threshold > 0: 96 | if (len(array_orig) == 0) or ( 97 | array_rle.nbytes / array_orig.nbytes <= threshold 98 | ): 99 | array_target = array_rle 100 | 101 | data[col] = array_target 102 | 103 | return pd.DataFrame(data, index=index) 104 | 105 | 106 | def decompress(df: pd.DataFrame) -> pd.DataFrame: 107 | """ 108 | Decompress all RLE columns in the provided DataFrame. 109 | 110 | Parameters 111 | ---------- 112 | df 113 | Input DataFrame. This input data MIGHT not be copied! 114 | """ 115 | index = df.index 116 | 117 | data = {} 118 | for col in df.columns: 119 | series = df[col] 120 | array = series.array 121 | dtype = series.dtype 122 | 123 | if _is_rle_dtype(dtype): 124 | with warnings.catch_warnings(): 125 | warnings.simplefilter("ignore", category=PerformanceWarning) 126 | array = array.astype(dtype._dtype) 127 | 128 | data[col] = array 129 | 130 | return pd.DataFrame(data, index=index) 131 | -------------------------------------------------------------------------------- /rle_array/dtype.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, List, Optional, cast 2 | 3 | import numpy as np 4 | from pandas.api.extensions import ExtensionDtype, register_extension_dtype 5 | from pandas.core.dtypes.cast import find_common_type 6 | 7 | import rle_array 8 | 9 | 10 | @register_extension_dtype 11 | class RLEDtype(ExtensionDtype): 12 | _metadata = ("_dtype",) 13 | 14 | def __init__(self, dtype: Any): 15 | self._dtype = np.dtype(dtype) 16 | 17 | @property 18 | def type(self) -> Callable[..., Any]: 19 | return cast(Callable[..., Any], self._dtype.type) 20 | 21 | @property 22 | def kind(self) -> str: 23 | return str(self._dtype.kind) 24 | 25 | @property 26 | def name(self) -> str: 27 | return f"RLEDtype[{self._dtype}]" 28 | 29 | @classmethod 30 | def construct_from_string(cls, string: str) -> "RLEDtype": 31 | """ 32 | Strict construction from a string, raise a TypeError if not possible. 33 | """ 34 | if not isinstance(string, str): 35 | raise TypeError( 36 | f"'construct_from_string' expects a string, got {type(string)}" 37 | ) 38 | 39 | prefix = "RLEDtype[" 40 | suffix = "]" 41 | if not (string.startswith(prefix) and string.endswith(suffix)): 42 | raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") 43 | sub = string[len(prefix) : -len(suffix)] 44 | return cls(np.dtype(sub)) 45 | 46 | @classmethod 47 | def construct_array_type( 48 | cls, 49 | ) -> Callable[[np.ndarray, np.ndarray], "rle_array.RLEArray"]: 50 | return rle_array.RLEArray 51 | 52 | @property 53 | def _is_numeric(self) -> bool: 54 | # exclude object, str, unicode, void. 
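# (numpy kind codes: b=boolean, i=signed integer, u=unsigned integer, f=floating, c=complex)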
55 | return self.kind in set("biufc")
56 | 
57 | @property
58 | def _is_boolean(self) -> bool:
59 | return self.kind == "b"
60 | 
61 | def _get_common_dtype(self, dtypes: List[Any]) -> Optional[Any]:
62 | unpacked_dtypes = []
63 | only_rle = True
64 | for t in dtypes:
65 | if isinstance(t, RLEDtype):
66 | unpacked_dtypes.append(t._dtype)
67 | else:
68 | unpacked_dtypes.append(t)
69 | only_rle = False
70 | 
71 | # ask pandas for help
72 | suggested_type = find_common_type(unpacked_dtypes)
73 | 
74 | # prefer RLE
75 | if (suggested_type is not None) and only_rle:
76 | return RLEDtype(suggested_type)
77 | else:
78 | return suggested_type
79 | 
80 | def __repr__(self) -> str:
81 | return f"RLEDtype({self._dtype!r})"
82 | 
-------------------------------------------------------------------------------- /rle_array/testing.py: --------------------------------------------------------------------------------
1 | """
2 | Functionality useful for testing and documentation.
3 | """
4 | import itertools
5 | from typing import Iterable
6 | 
7 | import numpy as np
8 | import pandas as pd
9 | 
10 | 
11 | def dim_col(d: int) -> str:
12 | """
13 | Name of a dimension column.
14 | 
15 | Parameters
16 | ----------
17 | d
18 | Dimension number.
19 | 
20 | Returns
21 | -------
22 | name: str
23 | Dimension name.
24 | 
25 | Example
26 | -------
27 | >>> from rle_array.testing import dim_col
28 | >>> dim_col(1)
29 | 'dim_1'
30 | """
31 | return f"dim_{d}"
32 | 
33 | 
34 | def const_col(dims: Iterable[int]) -> str:
35 | """
36 | Name of a constant column.
37 | 
38 | Parameters
39 | ----------
40 | dims
41 | Dimensions that describe the column content.
42 | 
43 | Returns
44 | -------
45 | name: str
46 | Column name.
47 | 
48 | Example
49 | -------
50 | >>> from rle_array.testing import const_col
51 | >>> const_col([1, 2])
52 | 'const_1_2'
53 | >>> const_col([2, 1])
54 | 'const_1_2'
55 | """
56 | dims = sorted(dims)
57 | dims_str = [str(d) for d in dims]
58 | return f"const_{'_'.join(dims_str)}"
59 | 
60 | 
61 | def _insert_sorted(df: pd.DataFrame, column: str, value: np.ndarray) -> None:
62 | pos = 0
63 | for i, c in enumerate(df.columns):
64 | if c > column:
65 | break
66 | pos = i + 1
67 | df.insert(pos, column, value)
68 | 
69 | 
70 | def _setup_dim_df(n_dims: int, size: int) -> pd.DataFrame:
71 | elements = np.arange(size ** n_dims)
72 | df = pd.DataFrame(index=pd.RangeIndex(0, len(elements)))
73 | for i in range(n_dims):
74 | _insert_sorted(df, dim_col(i), (elements // (size ** i)) % size)
75 | return df
76 | 
77 | 
78 | def _add_const_cols(df: pd.DataFrame, n_dims: int, size: int) -> pd.DataFrame:
79 | for dims in itertools.chain(
80 | *(
81 | itertools.combinations(range(n_dims), dims_len + 1)
82 | for dims_len in range(n_dims)
83 | )
84 | ):
85 | data = None
86 | for d in dims:
87 | if data is None:
88 | data = df[dim_col(d)].values
89 | else:
90 | data = data * size + df[dim_col(d)].values
91 | _insert_sorted(df, const_col(dims), data)
92 | return df
93 | 
94 | 
95 | def generate_test_dataframe(n_dims: int, size: int) -> pd.DataFrame:
96 | """
97 | Generate testing data.
98 | 
99 | Parameters
100 | ----------
101 | n_dims
102 | Number of dimensions of test cube.
103 | size
104 | Size of every dimension (edge length).
105 | 
106 | Returns
107 | -------
108 | df: pd.DataFrame
109 | Test DataFrame.
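
Example
-------
>>> # 2 dimensions of edge length 2 result in 4 rows and one constant
>>> # column per non-empty subset of dimensions
>>> df = generate_test_dataframe(n_dims=2, size=2)
>>> list(df.columns)
['const_0', 'const_0_1', 'const_1', 'dim_0', 'dim_1']
>>> len(df)
4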
110 | """ 111 | df = _setup_dim_df(n_dims, size) 112 | df = _add_const_cols(df, n_dims, size) 113 | return df 114 | 115 | 116 | def generate_example() -> pd.DataFrame: 117 | """ 118 | Generate example DataFrame for documentation purposes. 119 | 120 | Returns 121 | ------- 122 | df: pd.DataFrame 123 | Example DataFrame. 124 | """ 125 | rng = np.random.RandomState(1234) 126 | 127 | df = generate_test_dataframe(n_dims=2, size=2000) 128 | df["date"] = pd.Timestamp("2000-01-01") + pd.to_timedelta(df["dim_0"], unit="D") 129 | df["month"] = df["date"].dt.month.astype(np.int8) 130 | df["year"] = df["date"].dt.year.astype(np.int16) 131 | df["city"] = "city_" + df["dim_1"].astype("str") 132 | df["country"] = "country_" + (df["dim_1"] // 500).astype("str") 133 | df["avg_temp"] = ( 134 | rng.normal(loc=10.0, scale=5.0, size=len(df)) 135 | .round(decimals=1) 136 | .astype(np.float32) 137 | ) 138 | df["rain"] = rng.rand(len(df)) > 0.9 139 | df["mood"] = "ok" 140 | df.loc[(~df["rain"]) & (df["avg_temp"] > 15), "mood"] = "great" 141 | df.loc[(df["rain"]) & (df["avg_temp"] < 5), "mood"] = "sad" 142 | return df[["date", "month", "year", "city", "country", "avg_temp", "rain", "mood"]] 143 | -------------------------------------------------------------------------------- /rle_array/types.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | __all__ = ("POSITIONS_DTYPE",) 4 | 5 | 6 | #: Data type used to encode positions of RLE-runs. 7 | POSITIONS_DTYPE = np.int64 8 | -------------------------------------------------------------------------------- /scripts/fmt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -exuo pipefail 4 | 5 | black . 6 | isort --atomic . 7 | -------------------------------------------------------------------------------- /scripts/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -exuo pipefail 4 | 5 | mypy . 6 | pytest 7 | black --check . 8 | isort --check-only . 
9 | flake8 10 | asv run --show-stderr --environment existing --quick 11 | python setup.py build_sphinx 12 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [build_sphinx] 2 | source-dir = docs 3 | build-dir = docs/_build 4 | builder = doctest,html 5 | warning-is-error = true 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import setuptools 3 | 4 | if __name__ == "__main__": 5 | setuptools.setup() 6 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JDASoftwareGroup/rle-array/e5201b9185079f4fc4fd907d8f591426df79946e/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_astype.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | from rle_array import RLEDtype 6 | 7 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 8 | 9 | 10 | @pytest.fixture 11 | def series() -> pd.Series: 12 | return pd.Series([1, 1, 2]).astype(RLEDtype(int)) 13 | 14 | 15 | def test_no_copy(series: pd.Series) -> None: 16 | series2 = series.astype(series.dtype, copy=False) 17 | assert series2.values is series.values 18 | assert series2.dtype == RLEDtype(int) 19 | 20 | 21 | def test_copy_different_dtype(series: pd.Series) -> None: 22 | series2 = series.astype(RLEDtype(float), copy=False) 23 | assert series2.values is not series.values 24 | assert series2.dtype == RLEDtype(float) 25 | 26 | 27 | def test_cast_to_np_array(series: pd.Series) -> None: 28 | series2 = series.astype(int, copy=False) 29 | assert series2.values is not series.values 30 | assert series2.dtype == np.dtype(int) 31 | -------------------------------------------------------------------------------- /tests/test_autoconversion.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from pandas import testing as pdt 7 | 8 | from rle_array.autoconversion import auto_convert_to_rle, decompress 9 | from rle_array.dtype import RLEDtype 10 | 11 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "orig, threshold, expected", 16 | [ 17 | ( 18 | # orig 19 | pd.DataFrame( 20 | { 21 | "int64": pd.Series([1], dtype=np.int64), 22 | "int32": pd.Series([1], dtype=np.int32), 23 | "uint64": pd.Series([1], dtype=np.uint64), 24 | "float64": pd.Series([1.2], dtype=np.float64), 25 | "bool": pd.Series([True], dtype=np.bool_), 26 | "object": pd.Series(["foo"], dtype=np.object_), 27 | "datetime64": pd.Series( 28 | [pd.Timestamp("2020-01-01")], dtype="datetime64[ns]" 29 | ), 30 | } 31 | ), 32 | # threshold 33 | None, 34 | # expected 35 | pd.DataFrame( 36 | { 37 | "int64": pd.Series([1], dtype=RLEDtype(np.int64)), 38 | "int32": pd.Series([1], dtype=RLEDtype(np.int32)), 39 | "uint64": pd.Series([1], dtype=RLEDtype(np.uint64)), 40 | "float64": pd.Series([1.2], dtype=RLEDtype(np.float64)), 41 | "bool": pd.Series([True], dtype=RLEDtype(np.bool_)), 42 | "object": 
pd.Series(["foo"]).astype(RLEDtype(np.object_)), 43 | "datetime64": pd.Series( 44 | [pd.Timestamp("2020-01-01")], dtype="datetime64[ns]" 45 | ), 46 | } 47 | ), 48 | ), 49 | ( 50 | # orig 51 | pd.DataFrame( 52 | { 53 | "int64": pd.Series([1], dtype=np.int64), 54 | "int32": pd.Series([1], dtype=np.int32), 55 | "uint64": pd.Series([1], dtype=np.uint64), 56 | "float64": pd.Series([1.2], dtype=np.float64), 57 | "bool": pd.Series([True], dtype=np.bool_), 58 | "object": pd.Series(["foo"], dtype=np.object_), 59 | "datetime64": pd.Series( 60 | [pd.Timestamp("2020-01-01")], dtype="datetime64[ns]" 61 | ), 62 | } 63 | ), 64 | # threshold 65 | 2.0, 66 | # expected 67 | pd.DataFrame( 68 | { 69 | "int64": pd.Series([1], dtype=RLEDtype(np.int64)), 70 | "int32": pd.Series([1], dtype=np.int32), 71 | "uint64": pd.Series([1], dtype=RLEDtype(np.uint64)), 72 | "float64": pd.Series([1.2], dtype=RLEDtype(np.float64)), 73 | "bool": pd.Series([True], dtype=np.bool_), 74 | "object": pd.Series(["foo"]).astype(RLEDtype(np.object_)), 75 | "datetime64": pd.Series( 76 | [pd.Timestamp("2020-01-01")], dtype="datetime64[ns]" 77 | ), 78 | } 79 | ), 80 | ), 81 | ( 82 | # orig 83 | pd.DataFrame( 84 | { 85 | "single_value": pd.Series([1, 1, 1, 1, 1, 1], dtype=np.int64), 86 | "two_values": pd.Series([1, 1, 1, 2, 2, 2], dtype=np.int64), 87 | "increasing": pd.Series([1, 2, 3, 4, 5, 6], dtype=np.int64), 88 | } 89 | ), 90 | # threshold 91 | None, 92 | # expected 93 | pd.DataFrame( 94 | { 95 | "single_value": pd.Series( 96 | [1, 1, 1, 1, 1, 1], dtype=RLEDtype(np.int64) 97 | ), 98 | "two_values": pd.Series( 99 | [1, 1, 1, 2, 2, 2], dtype=RLEDtype(np.int64) 100 | ), 101 | "increasing": pd.Series( 102 | [1, 2, 3, 4, 5, 6], dtype=RLEDtype(np.int64) 103 | ), 104 | } 105 | ), 106 | ), 107 | ( 108 | # orig 109 | pd.DataFrame( 110 | { 111 | "single_value": pd.Series([1, 1, 1, 1, 1, 1], dtype=np.int64), 112 | "two_values": pd.Series([1, 1, 1, 2, 2, 2], dtype=np.int64), 113 | "increasing": pd.Series([1, 2, 3, 4, 5, 6], dtype=np.int64), 114 | } 115 | ), 116 | # threshold 117 | 0.9, 118 | # expected 119 | pd.DataFrame( 120 | { 121 | "single_value": pd.Series( 122 | [1, 1, 1, 1, 1, 1], dtype=RLEDtype(np.int64) 123 | ), 124 | "two_values": pd.Series( 125 | [1, 1, 1, 2, 2, 2], dtype=RLEDtype(np.int64) 126 | ), 127 | "increasing": pd.Series([1, 2, 3, 4, 5, 6], dtype=np.int64), 128 | } 129 | ), 130 | ), 131 | ( 132 | # orig 133 | pd.DataFrame( 134 | { 135 | "single_value": pd.Series([1, 1, 1, 1, 1, 1], dtype=np.int64), 136 | "two_values": pd.Series([1, 1, 1, 2, 2, 2], dtype=np.int64), 137 | "increasing": pd.Series([1, 2, 3, 4, 5, 6], dtype=np.int64), 138 | } 139 | ), 140 | # threshold 141 | 0.5, 142 | # expected 143 | pd.DataFrame( 144 | { 145 | "single_value": pd.Series( 146 | [1, 1, 1, 1, 1, 1], dtype=RLEDtype(np.int64) 147 | ), 148 | "two_values": pd.Series([1, 1, 1, 2, 2, 2], dtype=np.int64), 149 | "increasing": pd.Series([1, 2, 3, 4, 5, 6], dtype=np.int64), 150 | } 151 | ), 152 | ), 153 | ( 154 | # orig 155 | pd.DataFrame( 156 | { 157 | "single_value": pd.Series([1, 1, 1, 1, 1, 1], dtype=np.int64), 158 | "two_values": pd.Series([1, 1, 1, 2, 2, 2], dtype=np.int64), 159 | "increasing": pd.Series([1, 2, 3, 4, 5, 6], dtype=np.int64), 160 | } 161 | ), 162 | # threshold 163 | 0.0, 164 | # expected 165 | pd.DataFrame( 166 | { 167 | "single_value": pd.Series([1, 1, 1, 1, 1, 1], dtype=np.int64), 168 | "two_values": pd.Series([1, 1, 1, 2, 2, 2], dtype=np.int64), 169 | "increasing": pd.Series([1, 2, 3, 4, 5, 6], dtype=np.int64), 170 | } 171 | ), 172 | ), 
173 | ( 174 | # orig 175 | pd.DataFrame({"x": pd.Series([], dtype=np.int64)}), 176 | # threshold 177 | 0.0, 178 | # expected 179 | pd.DataFrame({"x": pd.Series([], dtype=np.int64)}), 180 | ), 181 | ( 182 | # orig 183 | pd.DataFrame({"x": pd.Series([], dtype=np.int64)}), 184 | # threshold 185 | 0.1, 186 | # expected 187 | pd.DataFrame({"x": pd.Series([], dtype=RLEDtype(np.int64))}), 188 | ), 189 | ( 190 | # orig 191 | pd.DataFrame( 192 | { 193 | "single_value": pd.Series([1, 1, 1, 1, 1, 1], dtype=np.int64), 194 | "two_values": pd.Series([1, 1, 1, 2, 2, 2], dtype=np.int64), 195 | "increasing": pd.Series( 196 | [1, 2, 3, 4, 5, 6], dtype=RLEDtype(np.int64) 197 | ), 198 | } 199 | ), 200 | # threshold 201 | 0.5, 202 | # expected 203 | pd.DataFrame( 204 | { 205 | "single_value": pd.Series( 206 | [1, 1, 1, 1, 1, 1], dtype=RLEDtype(np.int64) 207 | ), 208 | "two_values": pd.Series([1, 1, 1, 2, 2, 2], dtype=np.int64), 209 | "increasing": pd.Series( 210 | [1, 2, 3, 4, 5, 6], dtype=RLEDtype(np.int64) 211 | ), 212 | } 213 | ), 214 | ), 215 | ( 216 | # orig 217 | pd.DataFrame({"x": pd.Series(range(10), dtype=np.int64)}), 218 | # threshold 219 | 1.0, 220 | # expected 221 | pd.DataFrame({"x": pd.Series(range(10), dtype=np.int64)}), 222 | ), 223 | ( 224 | # orig 225 | pd.DataFrame(), 226 | # threshold 227 | None, 228 | # expected 229 | pd.DataFrame(), 230 | ), 231 | ], 232 | ) 233 | @pytest.mark.filterwarnings("ignore:.*would use a DatetimeBlock:UserWarning") 234 | def test_auto_convert_to_rle_ok( 235 | orig: pd.DataFrame, threshold: Optional[float], expected: pd.DataFrame 236 | ) -> None: 237 | actual = auto_convert_to_rle(orig, threshold) 238 | pdt.assert_frame_equal(actual, expected) 239 | 240 | 241 | def test_datetime_warns() -> None: 242 | df = pd.DataFrame( 243 | { 244 | "i1": pd.Series([1], dtype=np.int64), 245 | "d1": pd.Series([pd.Timestamp("2020-01-01")], dtype="datetime64[ns]"), 246 | "i2": pd.Series([1], dtype=np.int64), 247 | "d2": pd.Series([pd.Timestamp("2020-01-01")], dtype="datetime64[ns]"), 248 | } 249 | ) 250 | with pytest.warns(None) as record: 251 | auto_convert_to_rle(df, 0.5) 252 | assert len(record) == 2 253 | assert ( 254 | str(record[0].message) 255 | == "Column d1 would use a DatetimeBlock and can currently not be RLE compressed." 256 | ) 257 | assert ( 258 | str(record[1].message) 259 | == "Column d2 would use a DatetimeBlock and can currently not be RLE compressed." 
260 | ) 261 | 262 | 263 | def test_auto_convert_to_rle_threshold_out_of_range() -> None: 264 | df = pd.DataFrame({"x": [1]}) 265 | 266 | with pytest.raises(ValueError, match=r"threshold \(-0.1\) must be non-negative"): 267 | auto_convert_to_rle(df, -0.1) 268 | 269 | 270 | @pytest.mark.parametrize( 271 | "orig, expected", 272 | [ 273 | ( 274 | # orig 275 | pd.DataFrame( 276 | { 277 | "int64": pd.Series([1], dtype=RLEDtype(np.int64)), 278 | "int32": pd.Series([1], dtype=RLEDtype(np.int32)), 279 | "uint64": pd.Series([1], dtype=RLEDtype(np.uint64)), 280 | "float64": pd.Series([1.2], dtype=RLEDtype(np.float64)), 281 | "bool": pd.Series([True], dtype=RLEDtype(np.bool_)), 282 | "object": pd.Series(["foo"]).astype(RLEDtype(np.object_)), 283 | "datetime64": pd.Series( 284 | [pd.Timestamp("2020-01-01")], dtype="datetime64[ns]" 285 | ), 286 | } 287 | ), 288 | # expected 289 | pd.DataFrame( 290 | { 291 | "int64": pd.Series([1], dtype=np.int64), 292 | "int32": pd.Series([1], dtype=np.int32), 293 | "uint64": pd.Series([1], dtype=np.uint64), 294 | "float64": pd.Series([1.2], dtype=np.float64), 295 | "bool": pd.Series([True], dtype=np.bool_), 296 | "object": pd.Series(["foo"], dtype=np.object_), 297 | "datetime64": pd.Series( 298 | [pd.Timestamp("2020-01-01")], dtype="datetime64[ns]" 299 | ), 300 | } 301 | ), 302 | ), 303 | ( 304 | # orig 305 | pd.DataFrame( 306 | { 307 | "int64": pd.Series([1], dtype=np.int64), 308 | "int32": pd.Series([1], dtype=np.int32), 309 | "uint64": pd.Series([1], dtype=np.uint64), 310 | "float64": pd.Series([1.2], dtype=np.float64), 311 | "bool": pd.Series([True], dtype=np.bool_), 312 | "object": pd.Series(["foo"], dtype=np.object_), 313 | "datetime64": pd.Series( 314 | [pd.Timestamp("2020-01-01")], dtype="datetime64[ns]" 315 | ), 316 | } 317 | ), 318 | # expected 319 | pd.DataFrame( 320 | { 321 | "int64": pd.Series([1], dtype=np.int64), 322 | "int32": pd.Series([1], dtype=np.int32), 323 | "uint64": pd.Series([1], dtype=np.uint64), 324 | "float64": pd.Series([1.2], dtype=np.float64), 325 | "bool": pd.Series([True], dtype=np.bool_), 326 | "object": pd.Series(["foo"], dtype=np.object_), 327 | "datetime64": pd.Series( 328 | [pd.Timestamp("2020-01-01")], dtype="datetime64[ns]" 329 | ), 330 | } 331 | ), 332 | ), 333 | ( 334 | # orig 335 | pd.DataFrame(), 336 | # expected 337 | pd.DataFrame(), 338 | ), 339 | ], 340 | ) 341 | def test_decompress_ok(orig: pd.DataFrame, expected: pd.DataFrame) -> None: 342 | actual = decompress(orig) 343 | pdt.assert_frame_equal(actual, expected) 344 | 345 | 346 | def test_decompress_does_not_warn() -> None: 347 | df = pd.DataFrame({"x": pd.Series([1] * 10, dtype=RLEDtype(np.int64))}) 348 | 349 | with pytest.warns(None) as record: 350 | decompress(df) 351 | 352 | assert len(record) == 0 353 | -------------------------------------------------------------------------------- /tests/test_builtins.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Union, cast 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from _pytest.fixtures import SubRequest 7 | from numpy import testing as npt 8 | from pandas import testing as pdt 9 | 10 | from rle_array import RLEArray, RLEDtype 11 | 12 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 13 | 14 | FComp = Callable[[Union[pd.Series, np.ndarray], Union[pd.Series, RLEArray]], None] 15 | 16 | 17 | @pytest.fixture 18 | def series_orig() -> pd.Series: 19 | return pd.Series([1, 1, 2, 3, 3], dtype=int) 20 | 21 | 22 | 
@pytest.fixture 23 | def array_orig(series_orig: pd.Series) -> np.ndarray: 24 | return series_orig.values 25 | 26 | 27 | @pytest.fixture 28 | def series_rle(series_orig: pd.Series) -> pd.Series: 29 | return series_orig.astype(RLEDtype(series_orig.dtype)) 30 | 31 | 32 | @pytest.fixture 33 | def array_rle(series_rle: pd.Series) -> RLEArray: 34 | values = series_rle.values 35 | assert isinstance(values, RLEArray) 36 | return values 37 | 38 | 39 | @pytest.fixture(params=["series", "array"]) 40 | def mode(request: SubRequest) -> str: 41 | m = request.param 42 | assert isinstance(m, str) 43 | return m 44 | 45 | 46 | @pytest.fixture 47 | def object_orig( 48 | series_orig: pd.Series, array_orig: np.ndarray, mode: str 49 | ) -> Union[pd.Series, np.ndarray]: 50 | if mode == "series": 51 | return series_orig 52 | elif mode == "array": 53 | return array_orig 54 | else: 55 | raise ValueError(f"Unknown mode {mode}") 56 | 57 | 58 | @pytest.fixture 59 | def object_rle( 60 | series_rle: pd.Series, array_rle: RLEArray, mode: str 61 | ) -> Union[pd.Series, RLEArray]: 62 | if mode == "series": 63 | return series_rle 64 | elif mode == "array": 65 | return array_rle 66 | else: 67 | raise ValueError(f"Unknown mode {mode}") 68 | 69 | 70 | @pytest.fixture 71 | def comp(mode: str) -> FComp: 72 | if mode == "series": 73 | return cast(FComp, pdt.assert_series_equal) 74 | elif mode == "array": 75 | return cast(FComp, npt.assert_array_equal) 76 | else: 77 | raise ValueError(f"Unknown mode {mode}") 78 | 79 | 80 | def test_sum( 81 | object_orig: Union[pd.Series, np.ndarray], 82 | object_rle: Union[pd.Series, RLEArray], 83 | comp: FComp, 84 | ) -> None: 85 | elements_orig = [object_orig, object_orig] 86 | elements_rle = [object_rle, object_rle] 87 | elements_mixed = [object_rle, object_orig] 88 | 89 | result_orig: Union[pd.Series, np.ndarray] = sum(elements_orig) 90 | result_rle: Union[pd.Series, RLEArray] = sum(elements_rle) 91 | result_mixed: Union[pd.Series, RLEArray] = sum(elements_mixed) 92 | 93 | result_converted1 = result_rle.astype(int) 94 | comp(result_orig, result_converted1) 95 | 96 | result_converted2 = result_mixed.astype(int) 97 | comp(result_orig, result_converted2) 98 | -------------------------------------------------------------------------------- /tests/test_constructors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from rle_array import RLEArray 5 | from rle_array.types import POSITIONS_DTYPE 6 | 7 | 8 | def test_valid() -> None: 9 | RLEArray( 10 | data=np.asarray([1.0, 2.0]), 11 | positions=np.asarray([10, 20], dtype=POSITIONS_DTYPE), 12 | ) 13 | 14 | 15 | def test_data_invalid_type() -> None: 16 | with pytest.raises(TypeError, match="data must be an ndarray but is int"): 17 | RLEArray(data=1, positions=np.asarray([10, 20], dtype=POSITIONS_DTYPE)) 18 | 19 | 20 | def test_positions_invalid_type() -> None: 21 | with pytest.raises(TypeError, match="positions must be an ndarray but is int"): 22 | RLEArray(data=np.asarray([1.0, 2.0]), positions=1) 23 | 24 | 25 | def test_data_invalid_dims() -> None: 26 | with pytest.raises( 27 | ValueError, match="data must be an 1-dimensional ndarray but has 2 dimensions" 28 | ): 29 | RLEArray( 30 | data=np.asarray([[1.0, 2.0], [3.0, 4.0]]), 31 | positions=np.asarray([10, 20], dtype=POSITIONS_DTYPE), 32 | ) 33 | 34 | 35 | def test_positions_invalid_dims() -> None: 36 | with pytest.raises( 37 | ValueError, 38 | match="positions must be an 1-dimensional ndarray but has 2 dimensions", 39 | ): 40 | RLEArray( 41 | data=np.asarray([1.0, 2.0]), 42 
| positions=np.asarray([[10, 20], [30, 40]], dtype=POSITIONS_DTYPE), 43 | ) 44 | 45 | 46 | def test_positions_invalid_dtype() -> None: 47 | with pytest.raises( 48 | ValueError, match="positions must have dtype int64 but has uint64" 49 | ): 50 | RLEArray( 51 | data=np.asarray([1.0, 2.0]), positions=np.asarray([10, 20], dtype=np.uint64) 52 | ) 53 | 54 | 55 | def test_different_lengths() -> None: 56 | with pytest.raises( 57 | ValueError, match="data and positions must have same length but have 3 and 2" 58 | ): 59 | RLEArray( 60 | data=np.asarray([1.0, 2.0, 3.0]), 61 | positions=np.asarray([10, 20], dtype=POSITIONS_DTYPE), 62 | ) 63 | 64 | 65 | def test_not_sorted_1() -> None: 66 | with pytest.raises(ValueError, match="positions must be strictly sorted"): 67 | RLEArray( 68 | data=np.asarray([1.0, 2.0]), 69 | positions=np.asarray([10, 9], dtype=POSITIONS_DTYPE), 70 | ) 71 | 72 | 73 | def test_not_sorted_2() -> None: 74 | with pytest.raises(ValueError, match="positions must be strictly sorted"): 75 | RLEArray( 76 | data=np.asarray([1.0, 2.0]), 77 | positions=np.asarray([10, 10], dtype=POSITIONS_DTYPE), 78 | ) 79 | -------------------------------------------------------------------------------- /tests/test_dtype.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from rle_array import RLEDtype 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "a, b, expected", 11 | [ 12 | ( 13 | # a 14 | RLEDtype(int), 15 | # b 16 | RLEDtype(int), 17 | # expected 18 | True, 19 | ), 20 | ( 21 | # a 22 | RLEDtype(int), 23 | # b 24 | RLEDtype(float), 25 | # expected 26 | False, 27 | ), 28 | ( 29 | # a 30 | RLEDtype(int), 31 | # b 32 | RLEDtype(np.int64), 33 | # expected 34 | True, 35 | ), 36 | ], 37 | ) 38 | def test_eq(a: RLEDtype, b: RLEDtype, expected: bool) -> None: 39 | actual = a == b 40 | assert actual is expected 41 | 42 | 43 | @pytest.mark.parametrize( 44 | "dtype, dtypes, expected", 45 | [ 46 | ( # RLE: idempotent 47 | # dtype 48 | RLEDtype(np.int8), 49 | # dtypes 50 | [RLEDtype(np.int8)], 51 | # expected 52 | RLEDtype(np.int8), 53 | ), 54 | ( # RLE: same types 55 | # dtype 56 | RLEDtype(np.int8), 57 | # dtypes 58 | [RLEDtype(np.int8), RLEDtype(np.int8)], 59 | # expected 60 | RLEDtype(np.int8), 61 | ), 62 | ( # RLE: larger integer 63 | # dtype 64 | RLEDtype(np.int8), 65 | # dtypes 66 | [RLEDtype(np.int8), RLEDtype(np.int16)], 67 | # expected 68 | RLEDtype(np.int16), 69 | ), 70 | ( # RLE: choose float 71 | # dtype 72 | RLEDtype(np.int8), 73 | # dtypes 74 | [RLEDtype(np.int8), RLEDtype(np.float32)], 75 | # expected 76 | RLEDtype(np.float32), 77 | ), 78 | ( # RLE: use special pandas rule and choose object 79 | # dtype 80 | RLEDtype(np.bool_), 81 | # dtypes 82 | [RLEDtype(np.bool_), RLEDtype(np.float32)], 83 | # expected 84 | RLEDtype(object), 85 | ), 86 | ( # uncompressed: same types 87 | # dtype 88 | RLEDtype(np.int8), 89 | # dtypes 90 | [RLEDtype(np.int8), np.dtype(np.int8)], 91 | # expected 92 | np.dtype(np.int8), 93 | ), 94 | ( # uncompressed: larger integer 95 | # dtype 96 | RLEDtype(np.int8), 97 | # dtypes 98 | [RLEDtype(np.int8), np.dtype(np.int16)], 99 | # expected 100 | np.dtype(np.int16), 101 | ), 102 | ( # uncompressed: choose float 103 | # dtype 104 | RLEDtype(np.int8), 105 | # dtypes 106 | [RLEDtype(np.int8), np.dtype(np.float32)], 107 | # expected 108 | np.dtype(np.float32), 109 | ), 110 | ( # uncompressed: use special pandas rule and choose object 111 | # dtype 112 | 
RLEDtype(np.bool_), 113 | # dtypes 114 | [RLEDtype(np.bool_), np.dtype(np.float32)], 115 | # expected 116 | np.dtype(object), 117 | ), 118 | ], 119 | ) 120 | def test_get_common_dtype(dtype: RLEDtype, dtypes: List[Any], expected: Any) -> None: 121 | actual = dtype._get_common_dtype(dtypes) 122 | assert actual == expected 123 | 124 | 125 | @pytest.mark.parametrize( 126 | "dtype, expected", 127 | [ 128 | ( 129 | # dtype 130 | RLEDtype(np.dtype(int)), 131 | # expected 132 | "RLEDtype(dtype('int64'))", 133 | ), 134 | ( 135 | # dtype 136 | RLEDtype(int), 137 | # expected 138 | "RLEDtype(dtype('int64'))", 139 | ), 140 | ], 141 | ) 142 | def test_repr(dtype: RLEDtype, expected: str) -> None: 143 | assert repr(dtype) == expected 144 | -------------------------------------------------------------------------------- /tests/test_fastpath.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | from rle_array import RLEDtype # noqa 5 | 6 | pytestmark = pytest.mark.filterwarnings("error:performance") 7 | 8 | 9 | @pytest.fixture 10 | def series() -> pd.Series: 11 | return pd.Series(range(10), dtype="RLEDtype[int64]") 12 | 13 | 14 | @pytest.fixture 15 | def df(series: pd.Series) -> pd.DataFrame: 16 | return pd.DataFrame({"x": series, "y": series}) 17 | 18 | 19 | def test_array_slice(series: pd.Series) -> None: 20 | series.array[:] 21 | series.array[::-1] 22 | 23 | 24 | def test_create_series(series: pd.Series) -> None: 25 | pass 26 | 27 | 28 | def test_create_df(df: pd.Series) -> None: 29 | pass 30 | 31 | 32 | def test_getitem_single(series: pd.Series) -> None: 33 | assert series[2] == 2 34 | 35 | 36 | def test_sum(series: pd.Series) -> None: 37 | assert series.sum() == 45 38 | -------------------------------------------------------------------------------- /tests/test_indexing.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from rle_array import RLEArray 4 | 5 | 6 | def test_fail_two_dim_indexing() -> None: 7 | array = RLEArray._from_sequence(range(10)) 8 | with pytest.raises( 9 | NotImplementedError, 10 | match="__getitem__ does currently only work w/ a single parameter", 11 | ): 12 | array[1, 2] 13 | -------------------------------------------------------------------------------- /tests/test_misc_operations.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from _pytest.fixtures import SubRequest 7 | from pandas import testing as pdt 8 | 9 | from rle_array import RLEDtype 10 | 11 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 12 | 13 | 14 | @pytest.fixture( 15 | params=[ 16 | "single_int", 17 | "single_float", 18 | "single_float32", 19 | "empty_int", 20 | "empty_float", 21 | "empty_float32", 22 | "multi_int", 23 | "multi_float", 24 | "multi_float32", 25 | ] 26 | ) 27 | def data_orig(request: SubRequest) -> pd.Series: 28 | f1 = 1.2363 29 | f2 = 2.6263 30 | if request.param == "single_int": 31 | return pd.Series([1], dtype=int) 32 | elif request.param == "single_float": 33 | return pd.Series([f1], dtype=float) 34 | elif request.param == "single_float32": 35 | return pd.Series([f1], dtype=np.float32) 36 | elif request.param == "empty_int": 37 | return pd.Series([], dtype=int) 38 | elif request.param == "empty_float": 39 | return pd.Series([], dtype=float) 40 | elif request.param == "empty_float32": 41 | return 
pd.Series([], dtype=np.float32) 42 | elif request.param == "multi_int": 43 | return pd.Series([1, 1, 2, 2], dtype=int) 44 | elif request.param == "multi_float": 45 | return pd.Series([f1, f1, f2, f2], dtype=float) 46 | elif request.param == "multi_float32": 47 | return pd.Series([f1, f1, f2, f2], dtype=np.float32) 48 | else: 49 | raise ValueError(f"Unknown data variant: {request.param}") 50 | 51 | 52 | @pytest.fixture 53 | def data_rle(data_orig: pd.Series) -> pd.Series: 54 | return data_orig.astype(RLEDtype(data_orig.dtype)) 55 | 56 | 57 | @pytest.mark.parametrize("periods", [0, -1, 1, -2, 2]) 58 | @pytest.mark.parametrize("fill_value", [1, np.nan]) 59 | def test_shift( 60 | data_orig: pd.Series, data_rle: pd.Series, periods: int, fill_value: Any 61 | ) -> None: 62 | result_orig = data_orig.shift(periods=periods, fill_value=fill_value) 63 | result_rle = data_rle.shift(periods=periods, fill_value=fill_value) 64 | 65 | assert result_rle.dtype == RLEDtype(result_orig.dtype) 66 | 67 | result_converted = result_rle.astype(result_rle.dtype._dtype) 68 | pdt.assert_series_equal(result_orig, result_converted) 69 | 70 | 71 | @pytest.mark.parametrize("decimals", [0, 1, 2, 3, 4]) 72 | def test_round(data_orig: pd.Series, data_rle: pd.Series, decimals: int) -> None: 73 | result_orig = data_orig.round(decimals=decimals) 74 | result_rle = data_rle.round(decimals=decimals) 75 | 76 | assert result_rle.dtype == RLEDtype(result_orig.dtype) 77 | 78 | result_converted = result_rle.astype(result_rle.dtype._dtype) 79 | pdt.assert_series_equal(result_orig, result_converted) 80 | -------------------------------------------------------------------------------- /tests/test_operators.py: -------------------------------------------------------------------------------- 1 | import operator 2 | from typing import Any, Callable, cast 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | from _pytest.fixtures import SubRequest 8 | from numpy import testing as npt 9 | from pandas.core import ops 10 | 11 | from rle_array import RLEArray, RLEDtype 12 | 13 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 14 | 15 | FCompareOperator = Callable[[Any, Any], Any] 16 | FUnaryOperator = Callable[[Any], Any] 17 | FUnaryBoolOperator = Callable[[Any], Any] 18 | FBinaryOperator = Callable[[Any, Any], Any] 19 | FBinaryBoolOperator = Callable[[Any, Any], Any] 20 | 21 | 22 | @pytest.fixture 23 | def values() -> np.ndarray: 24 | return np.array([2.0, 2.0, 2.0, 3.0, 3.0, 2.0, np.nan, np.nan, 1.0, 1.0]) 25 | 26 | 27 | @pytest.fixture 28 | def scalar(values: np.ndarray) -> float: 29 | return float(values[0]) 30 | 31 | 32 | @pytest.fixture 33 | def uncompressed_series(values: np.ndarray) -> pd.Series: 34 | return pd.Series(values, index=np.arange(len(values)) + 1) 35 | 36 | 37 | @pytest.fixture 38 | def uncompressed_series2(values: np.ndarray) -> pd.Series: 39 | return pd.Series(values[::-1], index=np.arange(len(values)) + 1) 40 | 41 | 42 | @pytest.fixture 43 | def rle_series(values: np.ndarray) -> pd.Series: 44 | return pd.Series(RLEArray._from_sequence(values), index=np.arange(len(values)) + 1) 45 | 46 | 47 | @pytest.fixture 48 | def rle_series2(values: np.ndarray) -> pd.Series: 49 | return pd.Series( 50 | RLEArray._from_sequence(values[::-1]), index=np.arange(len(values)) + 1 51 | ) 52 | 53 | 54 | @pytest.fixture 55 | def bool_values() -> np.ndarray: 56 | return np.array([False] * 4 + [True] * 5 + [False]) 57 | 58 | 59 | @pytest.fixture 60 | def bool_scalar(bool_values: np.ndarray) -> bool: 61 | return 
bool(bool_values[0]) 62 | 63 | 64 | @pytest.fixture 65 | def uncompressed_bool_series(bool_values: np.ndarray) -> pd.Series: 66 | return pd.Series(bool_values) 67 | 68 | 69 | @pytest.fixture 70 | def uncompressed_bool_series2(bool_values: np.ndarray) -> pd.Series: 71 | return pd.Series(bool_values[::-1]) 72 | 73 | 74 | @pytest.fixture 75 | def rle_bool_series(bool_values: np.ndarray) -> pd.Series: 76 | return pd.Series(RLEArray._from_sequence(bool_values)) 77 | 78 | 79 | @pytest.fixture 80 | def rle_bool_series2(bool_values: np.ndarray) -> pd.Series: 81 | # TODO: Use `index=np.arange(len(bool_values)) + 1`. 82 | # For some reason, pandas casts us back to dtype=bool in that case. 83 | return pd.Series(RLEArray._from_sequence(bool_values[::-1])) 84 | 85 | 86 | @pytest.fixture( 87 | params=[ 88 | operator.eq, 89 | operator.ne, 90 | operator.lt, 91 | operator.gt, 92 | operator.le, 93 | operator.ge, 94 | ], 95 | ids=lambda op: str(op.__name__), 96 | ) 97 | def compare_operator(request: SubRequest) -> FCompareOperator: 98 | return cast(FCompareOperator, request.param) 99 | 100 | 101 | @pytest.fixture( 102 | params=[operator.abs, operator.neg, operator.pos], ids=lambda op: str(op.__name__) 103 | ) 104 | def unary_operator(request: SubRequest) -> FUnaryOperator: 105 | return cast(FUnaryOperator, request.param) 106 | 107 | 108 | @pytest.fixture(params=[operator.inv], ids=lambda op: str(op.__name__)) 109 | def unary_bool_operator(request: SubRequest) -> FUnaryBoolOperator: 110 | return cast(FUnaryBoolOperator, request.param) 111 | 112 | 113 | @pytest.fixture( 114 | params=[ 115 | operator.add, 116 | operator.iadd, 117 | ops.radd, 118 | operator.sub, 119 | operator.isub, 120 | ops.rsub, 121 | operator.mul, 122 | operator.imul, 123 | ops.rmul, 124 | operator.truediv, 125 | operator.itruediv, 126 | ops.rtruediv, 127 | operator.floordiv, 128 | operator.ifloordiv, 129 | ops.rfloordiv, 130 | operator.mod, 131 | operator.imod, 132 | ops.rmod, 133 | operator.pow, 134 | operator.ipow, 135 | ops.rpow, 136 | ], 137 | ids=lambda op: str(op.__name__), 138 | ) 139 | def binary_operator(request: SubRequest) -> FBinaryOperator: 140 | return cast(FBinaryOperator, request.param) 141 | 142 | 143 | @pytest.fixture( 144 | params=[ 145 | operator.and_, 146 | operator.iand, 147 | ops.rand_, 148 | operator.or_, 149 | operator.ior, 150 | ops.ror_, 151 | operator.xor, 152 | operator.ixor, 153 | ops.rxor, 154 | ], 155 | ids=lambda op: str(op.__name__), 156 | ) 157 | def binary_bool_operator(request: SubRequest) -> FBinaryBoolOperator: 158 | return cast(FBinaryBoolOperator, request.param) 159 | 160 | 161 | def test_compare_scalar( 162 | rle_series: pd.Series, 163 | uncompressed_series: pd.Series, 164 | scalar: float, 165 | compare_operator: FCompareOperator, 166 | ) -> None: 167 | actual = compare_operator(rle_series, scalar) 168 | assert actual.dtype == RLEDtype(bool) 169 | 170 | expected = compare_operator(uncompressed_series, scalar).astype("RLEDtype[bool]") 171 | pd.testing.assert_series_equal(actual, expected) 172 | 173 | 174 | def test_compare_rle_series( 175 | rle_series: pd.Series, 176 | rle_series2: pd.Series, 177 | uncompressed_series: pd.Series, 178 | uncompressed_series2: pd.Series, 179 | compare_operator: FCompareOperator, 180 | ) -> None: 181 | actual = compare_operator(rle_series, rle_series2) 182 | assert actual.dtype == RLEDtype(bool) 183 | 184 | expected = compare_operator(uncompressed_series, uncompressed_series2).astype( 185 | "RLEDtype[bool]" 186 | ) 187 | pd.testing.assert_series_equal(actual, expected) 
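# ---------------------------------------------------------------------------
# Editor's note (minimal sketch, not part of the original suite): the two
# tests above pin down the rule that comparing RLE-backed series keeps the
# result RLE-compressed, e.g.
#
#     a = pd.Series(RLEArray._from_sequence([1.0, 1.0, 2.0]))
#     b = pd.Series(RLEArray._from_sequence([2.0, 1.0, 2.0]))
#     assert (a == b).dtype == RLEDtype(bool)
#
# (values invented for illustration; the assertion mirrors
# test_compare_scalar / test_compare_rle_series)
# ---------------------------------------------------------------------------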
188 | 189 | 190 | def test_compare_uncompressed_series( 191 | rle_series: pd.Series, 192 | uncompressed_series: pd.Series, 193 | compare_operator: FCompareOperator, 194 | ) -> None: 195 | actual = compare_operator(rle_series, uncompressed_series) 196 | assert actual.dtype == bool 197 | 198 | expected = compare_operator(uncompressed_series, uncompressed_series) 199 | pd.testing.assert_series_equal(actual, expected) 200 | 201 | 202 | def test_binary_operator_scalar( 203 | rle_series: pd.Series, 204 | uncompressed_series: pd.Series, 205 | scalar: float, 206 | binary_operator: FBinaryOperator, 207 | ) -> None: 208 | actual = binary_operator(rle_series, scalar) 209 | assert actual.dtype == RLEDtype(float) 210 | 211 | expected = binary_operator(uncompressed_series, scalar).astype("RLEDtype[float]") 212 | pd.testing.assert_series_equal(actual, expected) 213 | 214 | 215 | def test_binary_operator_rle_series( 216 | rle_series: pd.Series, 217 | rle_series2: pd.Series, 218 | uncompressed_series: pd.Series, 219 | uncompressed_series2: pd.Series, 220 | binary_operator: FBinaryOperator, 221 | ) -> None: 222 | actual = binary_operator(rle_series, rle_series2) 223 | assert actual.dtype == RLEDtype(float) 224 | 225 | expected = binary_operator(uncompressed_series, uncompressed_series2).astype( 226 | "RLEDtype[float]" 227 | ) 228 | pd.testing.assert_series_equal(actual, expected) 229 | 230 | 231 | def test_binary_operator_uncompressed_series( 232 | rle_series: pd.Series, 233 | uncompressed_series: pd.Series, 234 | uncompressed_series2: pd.Series, 235 | binary_operator: FBinaryOperator, 236 | ) -> None: 237 | actual = binary_operator(rle_series, uncompressed_series2) 238 | assert actual.dtype == float 239 | 240 | expected = binary_operator(uncompressed_series, uncompressed_series2) 241 | pd.testing.assert_series_equal(actual, expected) 242 | 243 | 244 | def test_binary_bool_operator_scalar( 245 | rle_bool_series: pd.Series, 246 | uncompressed_bool_series: pd.Series, 247 | bool_scalar: bool, 248 | binary_bool_operator: FBinaryBoolOperator, 249 | ) -> None: 250 | actual = binary_bool_operator(rle_bool_series, bool_scalar) 251 | assert actual.dtype == RLEDtype(bool) 252 | 253 | expected = binary_bool_operator(uncompressed_bool_series, bool_scalar).astype( 254 | RLEDtype(bool) 255 | ) 256 | pd.testing.assert_series_equal(actual, expected) 257 | 258 | 259 | def test_binary_bool_operator_rle_series( 260 | rle_bool_series: pd.Series, 261 | rle_bool_series2: pd.Series, 262 | uncompressed_bool_series: pd.Series, 263 | uncompressed_bool_series2: pd.Series, 264 | binary_bool_operator: FBinaryBoolOperator, 265 | ) -> None: 266 | actual = binary_bool_operator(rle_bool_series, rle_bool_series2) 267 | assert actual.dtype == RLEDtype(bool) 268 | 269 | expected = binary_bool_operator( 270 | uncompressed_bool_series, uncompressed_bool_series2 271 | ).astype(RLEDtype(bool)) 272 | pd.testing.assert_series_equal(actual, expected) 273 | 274 | 275 | def test_binary_bool_operator_uncompressed_series( 276 | rle_bool_series: pd.Series, 277 | uncompressed_bool_series: pd.Series, 278 | uncompressed_bool_series2: pd.Series, 279 | binary_bool_operator: FBinaryBoolOperator, 280 | ) -> None: 281 | actual = binary_bool_operator(rle_bool_series, uncompressed_bool_series2) 282 | assert actual.dtype == bool 283 | 284 | expected = binary_bool_operator(uncompressed_bool_series, uncompressed_bool_series2) 285 | pd.testing.assert_series_equal(actual, expected) 286 | 287 | 288 | def test_unary_operator( 289 | rle_series: pd.Series, 290 | 
uncompressed_series: pd.Series, 291 | unary_operator: FUnaryOperator, 292 | ) -> None: 293 | actual = unary_operator(rle_series) 294 | assert actual.dtype == RLEDtype(float) 295 | 296 | expected = unary_operator(uncompressed_series).astype(RLEDtype(float)) 297 | pd.testing.assert_series_equal(actual, expected) 298 | 299 | 300 | def test_unary_operator_array( 301 | rle_series: pd.Series, 302 | uncompressed_series: pd.Series, 303 | unary_operator: FUnaryOperator, 304 | ) -> None: 305 | actual = unary_operator(rle_series.array) 306 | assert actual.dtype == RLEDtype(float) 307 | 308 | expected = unary_operator(uncompressed_series.array) 309 | npt.assert_array_equal(actual, expected) 310 | 311 | 312 | def test_unary_bool_operator( 313 | rle_bool_series: pd.Series, 314 | uncompressed_bool_series: pd.Series, 315 | unary_bool_operator: FUnaryBoolOperator, 316 | ) -> None: 317 | actual = unary_bool_operator(rle_bool_series) 318 | assert actual.dtype == RLEDtype(bool) 319 | 320 | expected = unary_bool_operator(uncompressed_bool_series).astype(RLEDtype(bool)) 321 | pd.testing.assert_series_equal(actual, expected) 322 | 323 | 324 | def test_unary_bool_operator_array( 325 | rle_bool_series: pd.Series, 326 | uncompressed_bool_series: pd.Series, 327 | unary_bool_operator: FUnaryBoolOperator, 328 | ) -> None: 329 | actual = unary_bool_operator(rle_bool_series.array) 330 | assert actual.dtype == RLEDtype(bool) 331 | 332 | expected = unary_bool_operator(uncompressed_bool_series.array) 333 | npt.assert_array_equal(actual, expected) 334 | 335 | 336 | def test_different_length_raises(values: np.ndarray) -> None: 337 | array1 = RLEArray._from_sequence(values) 338 | array2 = RLEArray._from_sequence(values[:-1]) 339 | with pytest.raises(ValueError, match="arrays have different lengths"): 340 | array1 + array2 341 | -------------------------------------------------------------------------------- /tests/test_pandas.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Generator, cast 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from _pytest.fixtures import SubRequest 7 | from pandas.tests.extension import base 8 | 9 | from rle_array import RLEArray, RLEDtype 10 | from rle_array.types import POSITIONS_DTYPE 11 | 12 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 13 | 14 | 15 | _all_arithmetic_operators = [ 16 | "__add__", 17 | "__radd__", 18 | "__sub__", 19 | "__rsub__", 20 | "__mul__", 21 | "__rmul__", 22 | "__floordiv__", 23 | "__rfloordiv__", 24 | "__truediv__", 25 | "__rtruediv__", 26 | "__pow__", 27 | "__rpow__", 28 | "__mod__", 29 | "__rmod__", 30 | ] 31 | 32 | 33 | @pytest.fixture(params=_all_arithmetic_operators) 34 | def all_arithmetic_operators(request: SubRequest) -> str: 35 | """ 36 | Fixture for dunder names for common arithmetic operations 37 | """ 38 | op = request.param 39 | assert isinstance(op, str) 40 | return op 41 | 42 | 43 | @pytest.fixture(params=["__eq__", "__ne__", "__le__", "__lt__", "__ge__", "__gt__"]) 44 | def all_compare_operators(request: SubRequest) -> str: 45 | """ 46 | Fixture for dunder names for common compare operations 47 | 48 | * >= 49 | * > 50 | * == 51 | * != 52 | * < 53 | * <= 54 | """ 55 | op = request.param 56 | assert isinstance(op, str) 57 | return op 58 | 59 | 60 | _all_boolean_reductions = ["all", "any"] 61 | 62 | 63 | @pytest.fixture(params=_all_boolean_reductions) 64 | def all_boolean_reductions(request: SubRequest) -> str: 65 | """ 66 | Fixture for 
boolean reduction names 67 | """ 68 | op = request.param 69 | assert isinstance(op, str) 70 | return op 71 | 72 | 73 | @pytest.fixture(params=["data", "data_missing"]) 74 | def all_data(request: SubRequest, data: RLEArray, data_missing: RLEArray) -> RLEArray: 75 | """Parametrized fixture giving 'data' and 'data_missing'""" 76 | if request.param == "data": 77 | return data 78 | elif request.param == "data_missing": 79 | return data_missing 80 | else: 81 | raise RuntimeError(f"Unknown all_data type: {request.param}") 82 | 83 | 84 | _all_numeric_reductions = [ 85 | "sum", 86 | "max", 87 | "min", 88 | "mean", 89 | "prod", 90 | "std", 91 | "var", 92 | "median", 93 | "kurt", 94 | "skew", 95 | ] 96 | 97 | 98 | @pytest.fixture(params=_all_numeric_reductions) 99 | def all_numeric_reductions(request: SubRequest) -> str: 100 | """ 101 | Fixture for numeric reduction names 102 | """ 103 | op = request.param 104 | assert isinstance(op, str) 105 | return op 106 | 107 | 108 | @pytest.fixture(params=[True, False]) 109 | def as_array(request: SubRequest) -> bool: 110 | """ 111 | Boolean fixture to support ExtensionDtype _from_sequence method testing. 112 | """ 113 | b = request.param 114 | assert isinstance(b, bool) 115 | return b 116 | 117 | 118 | @pytest.fixture(params=[True, False]) 119 | def as_frame(request: SubRequest) -> bool: 120 | """ 121 | Boolean fixture to support Series and Series.to_frame() comparison testing. 122 | """ 123 | b = request.param 124 | assert isinstance(b, bool) 125 | return b 126 | 127 | 128 | @pytest.fixture(params=[True, False]) 129 | def as_series(request: SubRequest) -> bool: 130 | """ 131 | Boolean fixture to support arr and Series(arr) comparison testing. 132 | """ 133 | b = request.param 134 | assert isinstance(b, bool) 135 | return b 136 | 137 | 138 | @pytest.fixture(params=[True, False]) 139 | def box_in_series(request: SubRequest) -> bool: 140 | """Whether to box the data in a Series""" 141 | b = request.param 142 | assert isinstance(b, bool) 143 | return b 144 | 145 | 146 | @pytest.fixture 147 | def data() -> RLEArray: 148 | """Length-100 array for this type. 149 | * data[0] and data[1] should both be non missing 150 | * data[0] and data[1] should not be equal 151 | """ 152 | return RLEArray( 153 | data=np.asarray([13, -1, -2, 42], dtype=np.float32), 154 | positions=np.asarray([1, 2, 4, 100], dtype=POSITIONS_DTYPE), 155 | ) 156 | 157 | 158 | @pytest.fixture 159 | def data_for_grouping() -> RLEArray: 160 | """Data for factorization, grouping, and unique tests. 161 | Expected to be like [B, B, NA, NA, A, A, B, C] 162 | Where A < B < C and NA is missing 163 | """ 164 | return RLEArray( 165 | data=np.asarray([2.0, np.nan, 1.0, 2.0, 3.0], dtype=np.float32), 166 | positions=np.asarray([2, 4, 6, 7, 8], dtype=POSITIONS_DTYPE), 167 | ) 168 | 169 | 170 | @pytest.fixture 171 | def data_for_sorting() -> RLEArray: 172 | """Length-3 array with a known sort order. 
173 | This should be three items [B, C, A] with 174 | A < B < C 175 | """ 176 | return RLEArray( 177 | data=np.asarray([2.0, 3.0, 1.0], dtype=np.float32), 178 | positions=np.asarray([1, 2, 3], dtype=POSITIONS_DTYPE), 179 | ) 180 | 181 | 182 | @pytest.fixture 183 | def data_for_twos() -> RLEArray: 184 | """Length-100 array in which all the elements are two.""" 185 | return RLEArray( 186 | data=np.asarray([2.0], dtype=np.float32), 187 | positions=np.asarray([100], dtype=POSITIONS_DTYPE), 188 | ) 189 | 190 | 191 | @pytest.fixture 192 | def data_missing() -> RLEArray: 193 | """Length-2 array with [NA, Valid]""" 194 | return RLEArray( 195 | data=np.asarray([np.nan, 42], dtype=np.float32), 196 | positions=np.asarray([1, 2], dtype=POSITIONS_DTYPE), 197 | ) 198 | 199 | 200 | @pytest.fixture 201 | def data_missing_for_sorting() -> RLEArray: 202 | """Length-3 array with a known sort order. 203 | This should be three items [B, NA, A] with 204 | A < B and NA missing. 205 | """ 206 | return RLEArray( 207 | data=np.asarray([2.0, np.nan, 1.0], dtype=np.float32), 208 | positions=np.asarray([1, 2, 3], dtype=POSITIONS_DTYPE), 209 | ) 210 | 211 | 212 | @pytest.fixture 213 | def data_repeated(data: RLEArray) -> Callable[[int], Generator[RLEArray, None, None]]: 214 | """ 215 | Generate many datasets. 216 | Parameters 217 | ---------- 218 | data : fixture implementing `data` 219 | Returns 220 | ------- 221 | Callable[[int], Generator]: 222 | A callable that takes a `count` argument and 223 | returns a generator yielding `count` datasets. 224 | """ 225 | 226 | def gen(count: int) -> Generator[RLEArray, None, None]: 227 | for _ in range(count): 228 | yield data 229 | 230 | return gen 231 | 232 | 233 | @pytest.fixture 234 | def dtype() -> RLEDtype: 235 | """A fixture providing the ExtensionDtype to validate.""" 236 | return RLEDtype(np.float32) 237 | 238 | 239 | @pytest.fixture(params=["ffill", "bfill"]) 240 | def fillna_method(request: SubRequest) -> str: 241 | """ 242 | Parametrized fixture giving method parameters 'ffill' and 'bfill' for 243 | Series.fillna(method=) testing. 244 | """ 245 | op = request.param 246 | assert isinstance(op, str) 247 | return op 248 | 249 | 250 | @pytest.fixture( 251 | params=[ 252 | lambda x: 1, 253 | lambda x: [1] * len(x), 254 | lambda x: pd.Series([1] * len(x)), 255 | lambda x: x, 256 | ], 257 | ids=["scalar", "list", "series", "object"], 258 | ) 259 | def groupby_apply_op(request: SubRequest) -> Callable[..., Any]: 260 | """ 261 | Functions to test groupby.apply(). 262 | """ 263 | return cast(Callable[..., Any], request.param) 264 | 265 | 266 | @pytest.fixture 267 | def na_cmp() -> Callable[[Any, Any], Any]: 268 | """Binary operator for comparing NA values. 269 | Should return a function of two arguments that returns 270 | True if both arguments are (scalar) NA for your type. 271 | By default, uses ``operator.is_`` 272 | """ 273 | return lambda x, y: np.isnan(x) and np.isnan(y) 274 | 275 | 276 | @pytest.fixture 277 | def na_value() -> float: 278 | """The scalar missing value for this type. Default 'None'""" 279 | return np.nan 280 | 281 | 282 | @pytest.fixture(params=[True, False]) 283 | def use_numpy(request: SubRequest) -> bool: 284 | """ 285 | Boolean fixture to support comparison testing of ExtensionDtype array 286 | and numpy array. 
287 | """ 288 | b = request.param 289 | assert isinstance(b, bool) 290 | return b 291 | 292 | 293 | @pytest.fixture(params=[None, lambda x: x]) 294 | def sort_by_key(request: SubRequest) -> Any: 295 | """ 296 | Simple fixture for testing keys in sorting methods. 297 | Tests None (no key) and the identity key. 298 | """ 299 | return request.param 300 | 301 | 302 | class TestArithmeticOps(base.BaseArithmeticOpsTests): 303 | frame_scalar_exc = None 304 | series_array_exc = None 305 | series_scalar_exc = None 306 | 307 | def test_error(self) -> None: 308 | pytest.skip("upstream test is broken?") 309 | 310 | def _check_op( 311 | self, s: Any, op: Any, other: Any, op_name: str, exc: type = NotImplementedError 312 | ) -> None: 313 | # upstream version checks dtype -> we return an RLEDtype 314 | if exc is None: 315 | result = op(s, other) 316 | expected = s.combine(other, op) 317 | self.assert_series_equal(result, expected, check_dtype=False) 318 | else: 319 | with pytest.raises(exc): 320 | op(s, other) 321 | 322 | 323 | class TestBooleanReduce(base.BaseBooleanReduceTests): 324 | pass 325 | 326 | 327 | class TestCasting(base.BaseCastingTests): 328 | pass 329 | 330 | 331 | class TestConstructors(base.BaseConstructorsTests): 332 | pass 333 | 334 | 335 | class TestDtype(base.BaseDtypeTests): 336 | pass 337 | 338 | 339 | class TestGetitem(base.BaseGetitemTests): 340 | pass 341 | 342 | 343 | class TestGroupby(base.BaseGroupbyTests): 344 | pass 345 | 346 | 347 | class TestInterface(base.BaseInterfaceTests): 348 | pass 349 | 350 | 351 | class TestMethods(base.BaseMethodsTests): 352 | def test_combine_le(self) -> None: 353 | pytest.skip("upstream test is broken?") 354 | 355 | 356 | class TestMissing(base.BaseMissingTests): 357 | def test_isna(self) -> None: 358 | pytest.skip("upstream test is broken") 359 | 360 | 361 | class TestNumericReduce(base.BaseNumericReduceTests): 362 | pass 363 | 364 | 365 | class TestPrinting(base.BasePrintingTests): 366 | pass 367 | 368 | 369 | class TestReshaping(base.BaseReshapingTests): 370 | pass 371 | 372 | 373 | class TestSetitem(base.BaseSetitemTests): 374 | pass 375 | 376 | 377 | class TestComparisonOps(base.BaseComparisonOpsTests): 378 | def _compare_other(self, s: Any, data: Any, op_name: str, other: Any) -> None: 379 | # upstream version looks pretty broken... 
380 | op = self.get_op_from_name(op_name) 381 | if op_name == "__eq__": 382 | assert getattr(data, op_name)(other) is NotImplemented 383 | assert not op(s, other).all() 384 | else: 385 | assert getattr(data, op_name)(other) is NotImplemented 386 | 387 | def test_compare_scalar(self, data: RLEArray, all_compare_operators: str) -> None: 388 | pytest.skip("upstream test is broken: comparison with scalar works") 389 | -------------------------------------------------------------------------------- /tests/test_reduce.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from _pytest.fixtures import SubRequest 5 | 6 | from rle_array import RLEDtype 7 | 8 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 9 | 10 | 11 | @pytest.fixture(params=["single", "multi", "empty", "sparse"]) 12 | def data_orig(request: SubRequest) -> pd.Series: 13 | if request.param == "single": 14 | return pd.Series([1], dtype=int) 15 | elif request.param == "multi": 16 | return pd.Series([1, 1, 2, 3, 1, 1], dtype=int) 17 | elif request.param == "empty": 18 | return pd.Series([], dtype=int) 19 | elif request.param == "sparse": 20 | return pd.Series([1, 1, np.nan, np.nan, 1, 1], dtype=float) 21 | else: 22 | raise ValueError(f"Unknown data type: {request.param}") 23 | 24 | 25 | @pytest.fixture(params=["single", "multi", "empty"]) 26 | def data_orig_bool(request: SubRequest) -> pd.Series: 27 | if request.param == "single": 28 | return pd.Series([False], dtype=bool) 29 | elif request.param == "multi": 30 | return pd.Series([False, False, True, False], dtype=bool) 31 | elif request.param == "empty": 32 | return pd.Series([], dtype=bool) 33 | else: 34 | raise ValueError(f"Unknown data type: {request.param}") 35 | 36 | 37 | @pytest.fixture 38 | def data_rle(data_orig: pd.Series) -> pd.Series: 39 | return data_orig.astype(RLEDtype(data_orig.dtype)) 40 | 41 | 42 | @pytest.fixture 43 | def data_rle_bool(data_orig_bool: pd.Series) -> pd.Series: 44 | return data_orig_bool.astype(RLEDtype(data_orig_bool.dtype)) 45 | 46 | 47 | @pytest.fixture(params=[True, False]) 48 | def skipna(request: SubRequest) -> bool: 49 | b = request.param 50 | assert isinstance(b, bool) 51 | return b 52 | 53 | 54 | @pytest.fixture( 55 | params=["min", "max", "mean", "median", "prod", "skew", "std", "sum", "var", "kurt"] 56 | ) 57 | def name(request: SubRequest) -> str: 58 | n = request.param 59 | assert isinstance(n, str) 60 | return n 61 | 62 | 63 | @pytest.fixture(params=["any", "all"]) 64 | def name_bool(request: SubRequest) -> str: 65 | n = request.param 66 | assert isinstance(n, str) 67 | return n 68 | 69 | 70 | @pytest.fixture(params=["max", "mean", "median", "min", "prod", "std", "sum", "var"]) 71 | def numpy_op(request: SubRequest) -> str: 72 | n = request.param 73 | assert isinstance(n, str) 74 | return n 75 | 76 | 77 | @pytest.fixture(params=["all", "any"]) 78 | def numpy_op_bool(request: SubRequest) -> str: 79 | op = request.param 80 | assert isinstance(op, str) 81 | return op 82 | 83 | 84 | @pytest.fixture(params=["mean", "std", "var"]) 85 | def numpy_op_with_dtype(request: SubRequest) -> str: 86 | op = request.param 87 | assert isinstance(op, str) 88 | return op 89 | 90 | 91 | def test_reduce( 92 | data_orig: pd.Series, data_rle: pd.Series, skipna: bool, name: str 93 | ) -> None: 94 | f_orig = getattr(data_orig, name) 95 | f_rle = getattr(data_rle, name) 96 | result_orig = f_orig(skipna=skipna) 97 | result_rle = f_rle(skipna=skipna) 98 
| assert ( 99 | (np.isnan(result_orig) & np.isnan(result_rle)) | (result_orig == result_rle) 100 | ).all() 101 | # don't check type here since pandas does some magic casting from numpy to python 102 | 103 | 104 | def test_reduce_bool( 105 | data_orig_bool: pd.Series, data_rle_bool: pd.Series, name_bool: str 106 | ) -> None: 107 | f_orig = getattr(data_orig_bool, name_bool) 108 | f_rle = getattr(data_rle_bool, name_bool) 109 | result_orig = f_orig() 110 | result_rle = f_rle() 111 | assert (result_orig == result_rle).all() 112 | # don't check type here since pandas does some magic casting from numpy to python 113 | 114 | 115 | def test_array_numpy_bool_axis_notimplemented( 116 | data_rle_bool: pd.Series, numpy_op_bool: str 117 | ) -> None: 118 | f = getattr(data_rle_bool.array, numpy_op_bool) 119 | with pytest.raises(NotImplementedError, match="Only axis=0 is supported."): 120 | f(axis=2) 121 | 122 | 123 | def test_array_numpy_bool_out_notimplemented( 124 | data_rle_bool: pd.Series, numpy_op_bool: str 125 | ) -> None: 126 | f = getattr(data_rle_bool.array, numpy_op_bool) 127 | out = data_rle_bool.array.copy() 128 | with pytest.raises(NotImplementedError, match="out parameter is not supported."): 129 | f(out=out) 130 | 131 | 132 | def test_array_reduction_not_implemented(data_rle: pd.Series) -> None: 133 | with pytest.raises(NotImplementedError, match="reduction foo is not implemented."): 134 | data_rle.array._reduce(name="foo") 135 | 136 | 137 | def test_array_numpy_bool( 138 | data_orig_bool: pd.Series, data_rle_bool: pd.Series, numpy_op_bool: str 139 | ) -> None: 140 | f = getattr(np, numpy_op_bool) 141 | result_orig = f(data_rle_bool.array) 142 | result_rle = f(data_rle_bool.array) 143 | assert result_orig == result_rle 144 | assert type(result_orig) == type(result_rle) 145 | 146 | 147 | def test_array_numpy(data_orig: pd.Series, data_rle: pd.Series, numpy_op: str) -> None: 148 | f = getattr(np, numpy_op) 149 | result_orig = f(data_orig.array) 150 | result_rle = f(data_rle.array) 151 | assert (pd.isna(result_orig) and pd.isna(result_rle)) or (result_orig == result_rle) 152 | if len(data_orig) > 0: 153 | assert type(result_orig) == type(result_rle) 154 | else: 155 | # pandas might use pd.NA, while we still use float, see https://github.com/pandas-dev/pandas/issues/35475 156 | if isinstance(result_orig, type(pd.NA)): 157 | assert type(result_rle) == float 158 | else: 159 | assert type(result_orig) == type(result_rle) 160 | 161 | 162 | def test_array_numpy_axis_notimplemented(data_rle: pd.Series, numpy_op: str) -> None: 163 | f = getattr(data_rle.array, numpy_op) 164 | with pytest.raises(NotImplementedError, match="Only axis=0 is supported."): 165 | f(axis=2) 166 | 167 | 168 | def test_array_numpy_out_notimplemented(data_rle: pd.Series, numpy_op: str) -> None: 169 | f = getattr(data_rle.array, numpy_op) 170 | out = data_rle.array.copy() 171 | with pytest.raises(NotImplementedError, match="out parameter is not supported."): 172 | f(out=out) 173 | 174 | 175 | def test_array_numpy_dtype(data_rle: pd.Series, numpy_op_with_dtype: str) -> None: 176 | f = getattr(np, numpy_op_with_dtype) 177 | with pytest.raises(NotImplementedError, match="dtype parameter is not supported."): 178 | f(data_rle.array, dtype=np.float16) 179 | -------------------------------------------------------------------------------- /tests/test_regressions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Misc collection of regression tests. 
3 | """ 4 | import pickle 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pytest 9 | from numpy import testing as npt 10 | from pandas.core.dtypes.common import ensure_int_or_float 11 | 12 | from rle_array import RLEArray, RLEDtype 13 | 14 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 15 | 16 | 17 | def test_object_isna() -> None: 18 | array = RLEArray._from_sequence(["foo", None], dtype=object) 19 | actual = array.isna() 20 | expected = np.asarray([False, True]) 21 | npt.assert_equal(actual, expected) 22 | 23 | 24 | def test_mean_divisor_overflow() -> None: 25 | # https://github.com/JDASoftwareGroup/rle-array/issues/22 26 | array = RLEArray._from_sequence([1] * 256, dtype=np.uint8) 27 | assert array.mean() == 1 28 | 29 | 30 | def test_pickle() -> None: 31 | array = RLEArray._from_sequence([1]) 32 | 33 | # roundtrip 34 | s = pickle.dumps(array) 35 | array2 = pickle.loads(s) 36 | npt.assert_array_equal(array, array2) 37 | 38 | # views must not be linked (A) 39 | array2_orig = array2.copy() 40 | array[:] = 2 41 | npt.assert_array_equal(array2, array2_orig) 42 | 43 | # views must not be linked (B) 44 | array_orig = array.copy() 45 | array2[:] = 3 46 | npt.assert_array_equal(array, array_orig) 47 | 48 | 49 | def test_inplace_update() -> None: 50 | array = RLEArray._from_sequence([1], dtype=np.int64) 51 | array[[True]] = 2 52 | 53 | expected = np.array([2], dtype=np.int64) 54 | npt.assert_array_equal(array, expected) 55 | 56 | assert array._dtype._dtype == np.int64 57 | assert array._data.dtype == np.int64 58 | 59 | 60 | def test_append_mixed() -> None: 61 | actual = pd.concat( 62 | [pd.Series([1], dtype=np.int8), pd.Series([1], dtype=RLEDtype(np.int8))] 63 | ) 64 | assert actual.dtype == np.int8 65 | 66 | 67 | def test_bool_ensure_int_or_float() -> None: 68 | array = RLEArray._from_sequence([False, True], dtype=np.bool_) 69 | actual = ensure_int_or_float(array) 70 | 71 | expected = np.array([0, 1], dtype=np.int64) 72 | assert actual.dtype == expected.dtype 73 | npt.assert_array_equal(actual, expected) 74 | 75 | 76 | def test_groupby_bool_first() -> None: 77 | df = pd.DataFrame({"x": pd.Series([True, True], dtype=RLEDtype(bool)), "g": 1}) 78 | series = df.groupby("g")["x"].first() 79 | assert series.dtype == RLEDtype(bool) 80 | 81 | expected = RLEArray._from_sequence([True]) 82 | npt.assert_array_equal(series.array, expected) 83 | 84 | 85 | def test_from_sequence_bool() -> None: 86 | array = RLEArray._from_sequence( 87 | np.array([0, 1], dtype=np.int64), dtype=RLEDtype(bool) 88 | ) 89 | npt.assert_array_equal(array, np.array([False, True])) 90 | 91 | array = RLEArray._from_sequence( 92 | np.array([0.0, 1.0], dtype=np.float64), dtype=RLEDtype(bool) 93 | ) 94 | npt.assert_array_equal(array, np.array([False, True])) 95 | 96 | with pytest.raises(TypeError, match="Need to pass bool-like values"): 97 | RLEArray._from_sequence(np.array([1, 2], dtype=np.int64), dtype=RLEDtype(bool)) 98 | 99 | with pytest.raises(TypeError, match="Need to pass bool-like values"): 100 | RLEArray._from_sequence(np.array([-1, 1], dtype=np.int64), dtype=RLEDtype(bool)) 101 | 102 | with pytest.raises(TypeError, match="Masked booleans are not supported"): 103 | RLEArray._from_sequence( 104 | np.array([np.nan, 1.0], dtype=np.float64), dtype=RLEDtype(bool) 105 | ) 106 | 107 | 108 | def test_groupby_bool_sum() -> None: 109 | # Cython routines for integer addition are not available, so we need to accept floats here. 
110 | df = pd.DataFrame({"x": pd.Series([True, True], dtype=RLEDtype(bool)), "g": 1}) 111 | series = df.groupby("g")["x"].sum() 112 | assert series.dtype == np.float64 113 | 114 | expected = np.array([2], dtype=np.float64) 115 | npt.assert_array_equal(series.to_numpy(), expected) 116 | 117 | 118 | def test_factorize_int() -> None: 119 | array = RLEArray._from_sequence([42, -10, -10], dtype=RLEDtype(np.int32)) 120 | codes_actual, uniques_actual = array.factorize() 121 | 122 | codes_expected = np.array([0, 1, 1], dtype=np.int64) 123 | assert codes_actual.dtype == codes_expected.dtype 124 | npt.assert_array_equal(codes_actual, codes_expected) 125 | 126 | uniques_expected = RLEArray._from_sequence([42, -10], dtype=np.int32) 127 | assert uniques_actual.dtype == uniques_expected.dtype 128 | npt.assert_array_equal(uniques_actual, uniques_expected) 129 | -------------------------------------------------------------------------------- /tests/test_slicing.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, cast 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from rle_array._slicing import NormalizedSlice 7 | 8 | 9 | class TestConstructor: 10 | def test_ok_simple(self) -> None: 11 | s = NormalizedSlice(start=1, stop=11, step=2, container_length=100) 12 | assert s.start == 1 13 | assert s.stop == 11 14 | assert s.step == 2 15 | assert s.container_length == 100 16 | 17 | def test_ok_start_at_zero(self) -> None: 18 | NormalizedSlice(start=0, stop=10, step=2, container_length=100) 19 | 20 | def test_ok_stop_at_modulo_end(self) -> None: 21 | NormalizedSlice(start=0, stop=12, step=3, container_length=10) 22 | 23 | def test_ok_stop_at_modulo_begin(self) -> None: 24 | NormalizedSlice(start=0, stop=-3, step=-3, container_length=10) 25 | 26 | def test_ok_zero_length(self) -> None: 27 | NormalizedSlice(start=0, stop=0, step=1, container_length=0) 28 | 29 | def test_fail_start_none(self) -> None: 30 | with pytest.raises(TypeError, match="start must be int but is None"): 31 | NormalizedSlice( 32 | start=cast(int, None), stop=10, step=2, container_length=100 33 | ) 34 | 35 | def test_fail_stop_none(self) -> None: 36 | with pytest.raises(TypeError, match="stop must be int but is None"): 37 | NormalizedSlice(start=1, stop=cast(int, None), step=2, container_length=100) 38 | 39 | def test_fail_step_none(self) -> None: 40 | with pytest.raises(TypeError, match="step must be int but is None"): 41 | NormalizedSlice( 42 | start=1, stop=10, step=cast(int, None), container_length=100 43 | ) 44 | 45 | def test_fail_container_length_none(self) -> None: 46 | with pytest.raises(TypeError, match="container_length must be int but is None"): 47 | NormalizedSlice(start=1, stop=10, step=2, container_length=cast(int, None)) 48 | 49 | def test_fail_step_zero(self) -> None: 50 | with pytest.raises(ValueError, match="step cannot be zero"): 51 | NormalizedSlice(start=1, stop=10, step=0, container_length=100) 52 | 53 | def test_fail_start_negative(self) -> None: 54 | with pytest.raises( 55 | ValueError, match=r"start \(-1\) must be in \[0,100\) but is not" 56 | ): 57 | NormalizedSlice(start=-1, stop=10, step=1, container_length=100) 58 | 59 | def test_fail_start_large(self) -> None: 60 | with pytest.raises( 61 | ValueError, match=r"start \(100\) must be in \[0,100\) but is not" 62 | ): 63 | NormalizedSlice(start=100, stop=10, step=1, container_length=100) 64 | 65 | def test_fail_stop_small(self) -> None: 66 | with pytest.raises( 67 | ValueError, match=r"stop \(-2\) must 
be in \[-1,101\) but is not" 68 | ): 69 | NormalizedSlice(start=2, stop=-2, step=-1, container_length=100) 70 | 71 | def test_fail_stop_large(self) -> None: 72 | with pytest.raises( 73 | ValueError, match=r"stop \(102\) must be in \[-1,101\) but is not" 74 | ): 75 | NormalizedSlice(start=2, stop=102, step=1, container_length=100) 76 | 77 | def test_fail_container_length_negative(self) -> None: 78 | with pytest.raises( 79 | ValueError, 80 | match=r"container_length \(-1\) must be greater or equal to zero", 81 | ): 82 | NormalizedSlice(start=2, stop=102, step=1, container_length=-1) 83 | 84 | def test_fail_container_empty_start_fail(self) -> None: 85 | with pytest.raises( 86 | ValueError, match="for empty containers, start must be 0 but is 1" 87 | ): 88 | NormalizedSlice(start=1, stop=0, step=1, container_length=0) 89 | 90 | def test_fail_container_empty_stop_fail(self) -> None: 91 | with pytest.raises( 92 | ValueError, match="for empty containers, stop must be 0 but is 1" 93 | ): 94 | NormalizedSlice(start=0, stop=1, step=1, container_length=0) 95 | 96 | def test_fail_container_empty_step_fail(self) -> None: 97 | with pytest.raises( 98 | ValueError, match="for empty containers, step must be 1 but is 2" 99 | ): 100 | NormalizedSlice(start=0, stop=0, step=2, container_length=0) 101 | 102 | def test_fail_forward_slice_not_forward(self) -> None: 103 | with pytest.raises( 104 | ValueError, 105 | match="for forward slices, stop must be greater or equal to start", 106 | ): 107 | NormalizedSlice(start=1, stop=0, step=1, container_length=100) 108 | 109 | def test_fail_backward_slice_not_backward(self) -> None: 110 | with pytest.raises( 111 | ValueError, 112 | match="for backward slices, start must be greater or equal to stop", 113 | ): 114 | NormalizedSlice(start=0, stop=1, step=-1, container_length=100) 115 | 116 | def test_fail_slice_empty_start(self) -> None: 117 | with pytest.raises( 118 | ValueError, match="for empty slices, start and stop must be 0 but are 1" 119 | ): 120 | NormalizedSlice(start=1, stop=1, step=1, container_length=100) 121 | 122 | def test_fail_slice_empty_step(self) -> None: 123 | with pytest.raises( 124 | ValueError, match="for empty slices, step must be 1 but is 2" 125 | ): 126 | NormalizedSlice(start=0, stop=0, step=2, container_length=100) 127 | 128 | def test_fail_distance_not_modulo(self) -> None: 129 | with pytest.raises( 130 | ValueError, 131 | match="The distance between start and stop most be divisible by the step size", 132 | ): 133 | NormalizedSlice(start=0, stop=10, step=3, container_length=100) 134 | 135 | 136 | class TestFrozen: 137 | def test_start(self) -> None: 138 | s = NormalizedSlice(start=1, stop=11, step=2, container_length=100) 139 | with pytest.raises(AttributeError, match="can't set attribute"): 140 | s.start = 2 # type: ignore 141 | 142 | def test_stop(self) -> None: 143 | s = NormalizedSlice(start=1, stop=11, step=2, container_length=100) 144 | with pytest.raises(AttributeError, match="can't set attribute"): 145 | s.stop = 2 # type: ignore 146 | 147 | def test_step(self) -> None: 148 | s = NormalizedSlice(start=1, stop=11, step=2, container_length=100) 149 | with pytest.raises(AttributeError, match="can't set attribute"): 150 | s.step = 3 # type: ignore 151 | 152 | def test_container_length(self) -> None: 153 | s = NormalizedSlice(start=1, stop=11, step=2, container_length=100) 154 | with pytest.raises(AttributeError, match="can't set attribute"): 155 | s.container_length = 3 # type: ignore 156 | 157 | 158 | def test_repr() -> None: 159 | s = 
NormalizedSlice(start=1, stop=11, step=2, container_length=100) 160 | assert repr(s) == "NormalizedSlice(start=1, stop=11, step=2, container_length=100)" 161 | 162 | 163 | @pytest.mark.parametrize( 164 | "s, expected", 165 | [ 166 | ( # empty 167 | # s 168 | NormalizedSlice(start=0, stop=0, step=1, container_length=0), 169 | # expected 170 | 0, 171 | ), 172 | ( # simple, forward 173 | # s 174 | NormalizedSlice(start=0, stop=10, step=1, container_length=100), 175 | # expected 176 | 10, 177 | ), 178 | ( # simple, backward 179 | # s 180 | NormalizedSlice(start=9, stop=-1, step=-1, container_length=100), 181 | # expected 182 | 10, 183 | ), 184 | ( # even, forward 185 | # s 186 | NormalizedSlice(start=0, stop=10, step=2, container_length=100), 187 | # expected 188 | 5, 189 | ), 190 | ( # even, backward 191 | # s 192 | NormalizedSlice(start=9, stop=-1, step=-2, container_length=100), 193 | # expected 194 | 5, 195 | ), 196 | ( # complex, forward 197 | # s 198 | NormalizedSlice(start=10, stop=22, step=3, container_length=100), 199 | # expected 200 | 4, 201 | ), 202 | ( # complex, backward 203 | # s 204 | NormalizedSlice(start=19, stop=7, step=-3, container_length=100), 205 | # expected 206 | 4, 207 | ), 208 | ], 209 | ) 210 | def test_len(s: NormalizedSlice, expected: int) -> None: 211 | assert len(s) == expected 212 | 213 | 214 | class TestFromSlice: 215 | def test_fail_slice_wrong_type(self) -> None: 216 | with pytest.raises(TypeError, match="slice must be a slice but is str"): 217 | NormalizedSlice.from_slice(container_length=10, s=cast(slice, "foo")) 218 | 219 | def test_fail_slice_start_wrong_type(self) -> None: 220 | with pytest.raises( 221 | TypeError, match="slice start must be int or None but is str" 222 | ): 223 | NormalizedSlice.from_slice(container_length=10, s=slice("foo", 20, 2)) 224 | 225 | def test_fail_slice_stop_wrong_type(self) -> None: 226 | with pytest.raises( 227 | TypeError, match="slice stop must be int or None but is str" 228 | ): 229 | NormalizedSlice.from_slice(container_length=10, s=slice(2, "foo", 2)) 230 | 231 | def test_fail_slice_step_wrong_type(self) -> None: 232 | with pytest.raises( 233 | TypeError, match="slice step must be int or None but is str" 234 | ): 235 | NormalizedSlice.from_slice(container_length=10, s=slice(2, 20, "foo")) 236 | 237 | def test_fail_step_zero(self) -> None: 238 | with pytest.raises(ValueError, match="slice step cannot be zero"): 239 | NormalizedSlice.from_slice(container_length=10, s=slice(2, 10, 0)) 240 | 241 | def test_fail_container_length_wrong_type(self) -> None: 242 | with pytest.raises( 243 | TypeError, match="container_length must be an int but is str" 244 | ): 245 | NormalizedSlice.from_slice( 246 | container_length=cast(int, "foo"), s=slice(2, 10, 2) 247 | ) 248 | 249 | def test_fail_container_length_negative(self) -> None: 250 | with pytest.raises(ValueError, match="container_length cannot be negative"): 251 | NormalizedSlice.from_slice(container_length=-1, s=slice(2, 10, 2)) 252 | 253 | @pytest.mark.parametrize( 254 | "container_length, s, expected", 255 | [ 256 | ( # empty 257 | # container_length 258 | 0, 259 | # s 260 | None, 261 | # expected 262 | NormalizedSlice(start=0, stop=0, step=1, container_length=0), 263 | ), 264 | ( # implicit full via None 265 | # container_length 266 | 100, 267 | # s 268 | None, 269 | # expected 270 | NormalizedSlice(start=0, stop=100, step=1, container_length=100), 271 | ), 272 | ( # explicit full via slice 273 | # container_length 274 | 100, 275 | # s 276 | slice(None, None, None), 277 | # 
expected 278 | NormalizedSlice(start=0, stop=100, step=1, container_length=100), 279 | ), 280 | ( # explicit full 281 | # container_length 282 | 100, 283 | # s 284 | slice(0, 100, 1), 285 | # expected 286 | NormalizedSlice(start=0, stop=100, step=1, container_length=100), 287 | ), 288 | ( # full reverse 289 | # container_length 290 | 100, 291 | # s 292 | slice(None, None, -1), 293 | # expected 294 | NormalizedSlice(start=99, stop=-1, step=-1, container_length=100), 295 | ), 296 | ( # start negative 297 | # container_length 298 | 100, 299 | # s 300 | slice(-20, None, None), 301 | # expected 302 | NormalizedSlice(start=80, stop=100, step=1, container_length=100), 303 | ), 304 | ( # start negative overflow container 305 | # container_length 306 | 100, 307 | # s 308 | slice(-1000, None, None), 309 | # expected 310 | NormalizedSlice(start=0, stop=100, step=1, container_length=100), 311 | ), 312 | ( # stop negative 313 | # container_length 314 | 100, 315 | # s 316 | slice(None, -20, None), 317 | # expected 318 | NormalizedSlice(start=0, stop=80, step=1, container_length=100), 319 | ), 320 | ( # stop negative overflow container 321 | # container_length 322 | 100, 323 | # s 324 | slice(None, -1000, None), 325 | # expected 326 | NormalizedSlice(start=0, stop=0, step=1, container_length=100), 327 | ), 328 | ( # stop negative overflow start 329 | # container_length 330 | 100, 331 | # s 332 | slice(10, -1000, None), 333 | # expected 334 | NormalizedSlice(start=0, stop=0, step=1, container_length=100), 335 | ), 336 | ( # stop negative overflow start reverse 337 | # container_length 338 | 100, 339 | # s 340 | slice(10, -10, -1), 341 | # expected 342 | NormalizedSlice(start=0, stop=0, step=1, container_length=100), 343 | ), 344 | ( # modulo normalization forward 345 | # container_length 346 | 10, 347 | # s 348 | slice(0, 10, 3), 349 | # expected 350 | NormalizedSlice(start=0, stop=12, step=3, container_length=10), 351 | ), 352 | ( # modulo normalization forward, empty 353 | # container_length 354 | 10, 355 | # s 356 | slice(0, 0, 3), 357 | # expected 358 | NormalizedSlice(start=0, stop=0, step=1, container_length=10), 359 | ), 360 | ( # modulo normalization backward 361 | # container_length 362 | 10, 363 | # s 364 | slice(0, -1000, -3), 365 | # expected 366 | NormalizedSlice(start=0, stop=-3, step=-3, container_length=10), 367 | ), 368 | ( # modulo normalization backward, empty 369 | # container_length 370 | 10, 371 | # s 372 | slice(0, 0, -3), 373 | # expected 374 | NormalizedSlice(start=0, stop=0, step=1, container_length=10), 375 | ), 376 | ( # numpy.int64 377 | # container_length 378 | np.int64(100), 379 | # s 380 | slice(np.int64(0), np.int64(100), np.int64(1)), 381 | # expected 382 | NormalizedSlice(start=0, stop=100, step=1, container_length=100), 383 | ), 384 | ], 385 | ) 386 | def test_ok( 387 | self, container_length: int, s: Optional[slice], expected: NormalizedSlice 388 | ) -> None: 389 | actual = NormalizedSlice.from_slice(container_length, s) 390 | assert type(actual) == NormalizedSlice 391 | assert actual.start == expected.start 392 | assert type(actual.start) == int 393 | assert actual.stop == expected.stop 394 | assert type(actual.stop) == int 395 | assert actual.step == expected.step 396 | assert type(actual.step) == int 397 | assert actual.container_length == expected.container_length 398 | assert type(actual.container_length) == int 399 | 400 | 401 | class TestProject: 402 | def test_fail_no_normalizedslice(self) -> None: 403 | s1 = NormalizedSlice(start=0, stop=10, step=1, 
container_length=100) 404 | s2 = slice(1, 2, 1) 405 | with pytest.raises( 406 | TypeError, match="child must be NormalizedSlice but is slice" 407 | ): 408 | s1.project(cast(NormalizedSlice, s2)) 409 | 410 | def test_fail_len_diff(self) -> None: 411 | s1 = NormalizedSlice(start=0, stop=10, step=1, container_length=100) 412 | s2 = NormalizedSlice(start=0, stop=10, step=1, container_length=20) 413 | with pytest.raises( 414 | ValueError, 415 | match=r"container_length of child \(20\) must be length of parent \(10\)", 416 | ): 417 | s1.project(s2) 418 | 419 | @pytest.mark.parametrize( 420 | "s1, s2, expected", 421 | [ 422 | ( # simple full take 423 | # s1 424 | NormalizedSlice(start=0, stop=10, step=1, container_length=100), 425 | # s2 426 | NormalizedSlice(start=0, stop=10, step=1, container_length=10), 427 | # expected 428 | NormalizedSlice(start=0, stop=10, step=1, container_length=100), 429 | ), 430 | ( # reverse reverse 431 | # s1 432 | NormalizedSlice(start=9, stop=-1, step=-1, container_length=100), 433 | # s2 434 | NormalizedSlice(start=9, stop=-1, step=-1, container_length=10), 435 | # expected 436 | NormalizedSlice(start=0, stop=10, step=1, container_length=100), 437 | ), 438 | ( # two modulos 439 | # s1 440 | NormalizedSlice(start=2, stop=29, step=3, container_length=100), 441 | # s2 442 | NormalizedSlice(start=1, stop=7, step=3, container_length=9), 443 | # expected 444 | NormalizedSlice(start=5, stop=23, step=9, container_length=100), 445 | ), 446 | ( # take empty 447 | # s1 448 | NormalizedSlice(start=1, stop=9, step=2, container_length=100), 449 | # s2 450 | NormalizedSlice(start=0, stop=0, step=1, container_length=4), 451 | # expected 452 | NormalizedSlice(start=0, stop=0, step=1, container_length=100), 453 | ), 454 | ], 455 | ) 456 | def test_ok( 457 | self, s1: NormalizedSlice, s2: NormalizedSlice, expected: NormalizedSlice 458 | ) -> None: 459 | actual = s1.project(s2) 460 | assert type(actual) == NormalizedSlice 461 | assert actual.start == expected.start 462 | assert actual.stop == expected.stop 463 | assert actual.step == expected.step 464 | assert actual.container_length == expected.container_length 465 | 466 | 467 | @pytest.mark.parametrize( 468 | "s, expected", 469 | [ 470 | ( # full take 471 | # s 472 | NormalizedSlice(start=0, stop=100, step=1, container_length=100), 473 | # expected 474 | None, 475 | ), 476 | ( # full reverse 477 | # s 478 | NormalizedSlice(start=99, stop=-1, step=-1, container_length=100), 479 | # expected 480 | slice(None, None, -1), 481 | ), 482 | ( # only start 483 | # s 484 | NormalizedSlice(start=1, stop=100, step=1, container_length=100), 485 | # expected 486 | slice(1, None, None), 487 | ), 488 | ( # only stop 489 | # s 490 | NormalizedSlice(start=0, stop=99, step=1, container_length=100), 491 | # expected 492 | slice(None, 99, None), 493 | ), 494 | ( # only step 495 | # s 496 | NormalizedSlice(start=0, stop=100, step=2, container_length=100), 497 | # expected 498 | slice(None, None, 2), 499 | ), 500 | ( # complex 501 | # s 502 | NormalizedSlice(start=1, stop=22, step=3, container_length=100), 503 | # expected 504 | slice(1, 22, 3), 505 | ), 506 | ], 507 | ) 508 | def test_to_slice(s: NormalizedSlice, expected: Optional[slice]) -> None: 509 | actual = s.to_slice() 510 | if expected is None: 511 | assert actual is None 512 | else: 513 | assert isinstance(actual, slice) 514 | assert type(actual) == slice 515 | assert actual.start == expected.start 516 | assert actual.stop == expected.stop 517 | assert actual.step == expected.step 518 | 
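519 | 520 | # Illustrative round-trip sketch (an added example, not part of the original 521 | # suite): a normalized slice should select exactly the same elements as the 522 | # raw slice it was built from; `to_slice` may return None for a full take, 523 | # hence the fallback to slice(None). 524 | def test_round_trip_sketch() -> None: 525 | data = list(range(10)) 526 | raw = slice(1, 9, 3) 527 | normalized = NormalizedSlice.from_slice(container_length=len(data), s=raw) 528 | assert len(normalized) == len(data[raw]) 529 | recovered = normalized.to_slice() 530 | effective = slice(None) if recovered is None else recovered 531 | assert data[effective] == data[raw]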
-------------------------------------------------------------------------------- /tests/test_testing.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import List, cast 3 | 4 | import pandas as pd 5 | import pytest 6 | from _pytest.fixtures import SubRequest 7 | from pandas import testing as pdt 8 | 9 | from rle_array.testing import ( 10 | const_col, 11 | dim_col, 12 | generate_example, 13 | generate_test_dataframe, 14 | ) 15 | 16 | 17 | @pytest.mark.parametrize( 18 | "dims, expected", 19 | [ 20 | ( 21 | # dims 22 | [1], 23 | # expected 24 | "const_1", 25 | ), 26 | ( 27 | # dims 28 | [1, 2], 29 | # expected 30 | "const_1_2", 31 | ), 32 | ( 33 | # dims 34 | [2, 1], 35 | # expected 36 | "const_1_2", 37 | ), 38 | ], 39 | ) 40 | def test_const_col(dims: List[int], expected: str) -> None: 41 | actual = const_col(dims) 42 | assert actual == expected 43 | 44 | 45 | @pytest.mark.parametrize( 46 | "d, expected", 47 | [ 48 | ( 49 | # d 50 | 1, 51 | # expected 52 | "dim_1", 53 | ), 54 | ( 55 | # d 56 | 2, 57 | # expected 58 | "dim_2", 59 | ), 60 | ], 61 | ) 62 | def test_dim_col(d: int, expected: str) -> None: 63 | actual = dim_col(d) 64 | assert actual == expected 65 | 66 | 67 | SIZE = 4 68 | N_DIMS = 3 69 | 70 | 71 | class TestGenerateTestDataFrame: 72 | @pytest.fixture 73 | def df(self) -> pd.DataFrame: 74 | return generate_test_dataframe(n_dims=N_DIMS, size=SIZE) 75 | 76 | @pytest.fixture(params=list(range(N_DIMS))) 77 | def d(self, request: SubRequest) -> int: 78 | i = request.param 79 | assert isinstance(i, int) 80 | return i 81 | 82 | @pytest.fixture( 83 | params=list( 84 | itertools.chain( 85 | *( 86 | itertools.combinations(range(N_DIMS), r) 87 | for r in range(1, N_DIMS + 1) 88 | ) 89 | ) 90 | ) 91 | ) 92 | def dims(self, request: SubRequest) -> List[int]: 93 | return cast(List[int], request.param) 94 | 95 | def test_len(self, df: pd.DataFrame) -> None: 96 | assert len(df) == SIZE ** N_DIMS 97 | 98 | def test_index(self, df: pd.DataFrame) -> None: 99 | pdt.assert_index_equal(df.index, pd.RangeIndex(0, len(df))) 100 | assert isinstance(df.index, pd.RangeIndex) 101 | 102 | def test_dim_nunique(self, df: pd.DataFrame, d: int) -> None: 103 | assert df[dim_col(d)].nunique() == SIZE 104 | 105 | def test_dim_value_counts(self, df: pd.DataFrame, d: int) -> None: 106 | assert (df[dim_col(d)].value_counts() == SIZE ** (N_DIMS - 1)).all() 107 | 108 | def test_dims_sorted(self, df: pd.DataFrame, d: int) -> None: 109 | delta = df[dim_col(d)].values[1:] - df[dim_col(d)].values[:-1] 110 | assert ((delta == 0) | (delta == 1) | (delta == -(SIZE - 1))).all() 111 | 112 | def test_const_nunique(self, df: pd.DataFrame, dims: List[int]) -> None: 113 | assert df[const_col(dims)].nunique() == SIZE ** len(dims) 114 | 115 | def test_const_value_counts(self, df: pd.DataFrame, dims: List[int]) -> None: 116 | assert ( 117 | df[const_col(dims)].value_counts() == SIZE ** (N_DIMS - len(dims)) 118 | ).all() 119 | 120 | def test_cols_sorted(self, df: pd.DataFrame) -> None: 121 | assert list(df.columns) == sorted(df.columns) 122 | 123 | 124 | def test_generate_example() -> None: 125 | df = generate_example() 126 | assert len(df) == 2000 ** 2 127 | assert list(df.columns) == [ 128 | "date", 129 | "month", 130 | "year", 131 | "city", 132 | "country", 133 | "avg_temp", 134 | "rain", 135 | "mood", 136 | ] 137 | -------------------------------------------------------------------------------- /tests/test_ufunc.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from numpy import testing as npt 5 | 6 | from rle_array import RLEArray 7 | 8 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 9 | 10 | 11 | @pytest.fixture 12 | def array_orig() -> np.ndarray: 13 | return np.array([1, 1, 2, 1], dtype=np.int32) 14 | 15 | 16 | @pytest.fixture 17 | def array_rle(array_orig: np.ndarray) -> RLEArray: 18 | return RLEArray._from_sequence(array_orig) 19 | 20 | 21 | def test_square(array_orig: np.ndarray, array_rle: RLEArray) -> None: 22 | expected = np.square(array_orig) 23 | actual = np.square(array_rle) 24 | npt.assert_array_equal(actual, expected) 25 | 26 | 27 | @pytest.mark.parametrize("out_is_rle", [False, True]) 28 | def test_square_out( 29 | array_orig: np.ndarray, array_rle: RLEArray, out_is_rle: bool 30 | ) -> None: 31 | out_orig = np.array([0] * len(array_orig), dtype=array_orig.dtype) 32 | if out_is_rle: 33 | out_rle = RLEArray._from_sequence(out_orig) 34 | else: 35 | out_rle = out_orig.copy() 36 | 37 | np.square(array_orig, out=out_orig) 38 | np.square(array_rle, out=out_rle) 39 | 40 | npt.assert_array_equal(out_orig, out_rle) 41 | 42 | 43 | def test_add_at(array_orig: np.ndarray, array_rle: RLEArray) -> None: 44 | expected = np.add.at(array_orig, [0, 2], 10) 45 | actual = np.add.at(array_rle, [0, 2], 10) 46 | assert expected is None 47 | assert actual is None 48 | npt.assert_array_equal(array_orig, array_rle) 49 | 50 | 51 | def test_divmod(array_orig: np.ndarray, array_rle: RLEArray) -> None: 52 | expected1, expected2 = np.divmod(array_orig, 2) 53 | actual1, actual2 = np.divmod(array_rle, 2) 54 | npt.assert_array_equal(actual1, expected1) 55 | npt.assert_array_equal(actual2, expected2) 56 | 57 | 58 | @pytest.mark.parametrize("t", [pd.Series, pd.DataFrame, pd.Index]) 59 | def test_add_unhandled(array_orig: np.ndarray, array_rle: RLEArray, t: type) -> None: 60 | other = t(array_orig) 61 | 62 | # the pandas docs say we should not handle these 63 | assert ( 64 | array_rle.__array_ufunc__(np.add, "__call__", array_rle, other) 65 | is NotImplemented 66 | ) 67 | 68 | 69 | def test_2d_broadcast_add(array_orig: np.ndarray, array_rle: RLEArray) -> None: 70 | # ufuncs can result in high-dimensional arrays. In that case, just return a normal NumPy array. 71 | other = np.vstack([array_orig, array_orig]) 72 | assert other.shape == (2, len(array_orig)) 73 | 74 | expected = other * array_orig 75 | actual = other * array_rle 76 | assert actual.dtype == expected.dtype 77 | npt.assert_array_equal(actual, expected) 78 | 79 | 80 | def test_2d_broadcast_divmod(array_orig: np.ndarray, array_rle: RLEArray) -> None: 81 | # ufuncs can result in high-dimensional arrays. In that case, just return a normal NumPy array. 
82 | other = np.vstack([array_orig, array_orig]) 83 | assert other.shape == (2, len(array_orig)) 84 | 85 | expected1, expected2 = np.divmod(other, array_orig) 86 | actual1, actual2 = np.divmod(other, array_rle) 87 | assert actual1.dtype == expected1.dtype 88 | assert actual2.dtype == expected2.dtype 89 | npt.assert_array_equal(actual1, expected1) 90 | npt.assert_array_equal(actual2, expected2) 91 | 92 | 93 | def test_mixed_typing_mul(array_orig: np.ndarray, array_rle: RLEArray) -> None: 94 | actual = array_orig * array_rle 95 | 96 | expected = array_orig * array_orig 97 | assert actual.dtype == expected.dtype 98 | npt.assert_array_equal(actual, expected) 99 | 100 | 101 | def test_mixed_typing_divmod(array_orig: np.ndarray, array_rle: RLEArray) -> None: 102 | actual1, actual2 = np.divmod(array_orig, array_rle) 103 | 104 | expected1, expected2 = np.divmod(array_orig, array_orig) 105 | assert actual1.dtype == expected1.dtype 106 | assert actual2.dtype == expected2.dtype 107 | npt.assert_array_equal(actual1, expected1) 108 | npt.assert_array_equal(actual2, expected2) 109 | -------------------------------------------------------------------------------- /tests/test_view.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import numpy as np 4 | import pytest 5 | from numpy import testing as npt 6 | 7 | from rle_array.array import RLEArray, _ViewAnchor 8 | 9 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 10 | 11 | 12 | def test_view_raises_different_dtype() -> None: 13 | orig = RLEArray._from_sequence(np.arange(10)) 14 | with pytest.raises(ValueError, match="Cannot create view with different dtype"): 15 | orig.view(np.int8) 16 | 17 | 18 | @pytest.mark.parametrize("dtype", ["none", "numpy", "rle"]) 19 | def test_plain_view(dtype: str) -> None: 20 | orig = RLEArray._from_sequence(np.arange(10)) 21 | 22 | if dtype == "none": 23 | dtype_view = None 24 | elif dtype == "numpy": 25 | dtype_view = orig.dtype._dtype 26 | elif dtype == "rle": 27 | dtype_view = orig.dtype 28 | else: 29 | raise ValueError(f"unknown dtype variant {dtype}") 30 | view = orig.view(dtype_view) 31 | 32 | assert view is not orig 33 | assert view.dtype == orig.dtype 34 | npt.assert_array_equal(orig, view) 35 | 36 | orig[[0, 1]] = [100, 101] 37 | view[[0, 8, 9]] = [1000, 108, 109] 38 | 39 | result = RLEArray._from_sequence([1000, 101, 2, 3, 4, 5, 6, 7, 108, 109]) 40 | 41 | npt.assert_array_equal(orig, result) 42 | npt.assert_array_equal(orig, view) 43 | 44 | 45 | def test_view_tree() -> None: 46 | # o-->1-+->11 47 | # +->12 48 | orig = RLEArray._from_sequence(np.arange(10)) 49 | 50 | view1 = orig.view() 51 | view11 = view1.view() 52 | view12 = view1.view() 53 | 54 | assert view1 is not orig 55 | assert view11 is not orig 56 | assert view12 is not orig 57 | assert view11 is not view1 58 | assert view12 is not view1 59 | assert view11 is not view12 60 | npt.assert_array_equal(orig, view1) 61 | npt.assert_array_equal(orig, view11) 62 | npt.assert_array_equal(orig, view12) 63 | 64 | view11[[8, 9]] = [108, 109] 65 | view1[[0, 1, 9]] = [100, 101, 1009] 66 | 67 | result = RLEArray._from_sequence([100, 101, 2, 3, 4, 5, 6, 7, 108, 1009]) 68 | 69 | npt.assert_array_equal(orig, result) 70 | npt.assert_array_equal(orig, view1) 71 | npt.assert_array_equal(orig, view11) 72 | npt.assert_array_equal(orig, view12) 73 | 74 | 75 | def test_slicing() -> None: 76 | N = 100 77 | orig_np = np.arange(N) 78 | orig_rle = RLEArray._from_sequence(orig_np) 79 | 80 | ops = [ 81 | slice(None, 
None, None), 82 | slice(1, -3, 2), 83 | slice(None, None, -1), 84 | slice(None, None, -1), 85 | slice(3, 4, -3), 86 | ] 87 | 88 | arrays_np = [orig_np] 89 | arrays_rle = [orig_rle] 90 | for i, o in enumerate(ops): 91 | last_np = arrays_np[-1] 92 | last_rle = arrays_rle[-1] 93 | npt.assert_array_equal(last_np, last_rle) 94 | 95 | sub_np = last_np[o] 96 | sub_rle = last_rle[o] 97 | 98 | assert sub_np is not last_np 99 | assert sub_rle is not last_rle 100 | npt.assert_array_equal(sub_np, sub_rle) 101 | 102 | delta = np.arange(len(sub_np)) * (N ** i) 103 | 104 | # Avoid `+=` here: it rebinds sub_rle to a fresh result (possibly a plain ndarray) instead of writing through the view, so assign via `[:]`. 105 | sub_np[:] = sub_np + delta 106 | sub_rle[:] = sub_rle + delta 107 | 108 | arrays_np.append(sub_np) 109 | arrays_rle.append(sub_rle) 110 | 111 | for arr_np, arr_rle in zip(arrays_np, arrays_rle): 112 | npt.assert_array_equal(arr_np, arr_rle) 113 | 114 | 115 | def test_anchor_ref() -> None: 116 | try: 117 | gc.disable() 118 | gc.collect() 119 | 120 | n_objects_pre = len( 121 | [o for o in gc.get_objects() if isinstance(o, (RLEArray, _ViewAnchor))] 122 | ) 123 | 124 | RLEArray._from_sequence(np.arange(10)) 125 | 126 | n_objects_post = len( 127 | [o for o in gc.get_objects() if isinstance(o, (RLEArray, _ViewAnchor))] 128 | ) 129 | assert n_objects_pre == n_objects_post 130 | finally: 131 | gc.enable() 132 | --------------------------------------------------------------------------------