├── .editorconfig ├── .flake8 ├── .github ├── dependabot.yml └── workflows │ └── ci.yml ├── .gitignore ├── CHANGES.rst ├── CONTRIBUTING.md ├── LICENSE.txt ├── README.rst ├── asv.conf.json ├── benchmarks ├── __init__.py └── benchmarks.py ├── codecov.yml ├── docs ├── _static │ └── .gitkeep ├── changes.rst ├── conf.py ├── index.rst └── sphinxext │ └── ignore_missing_refs.py ├── mypy.ini ├── pyproject.toml ├── rle_array ├── __init__.py ├── _algorithms.py ├── _slicing.py ├── array.py ├── autoconversion.py ├── dtype.py ├── testing.py └── types.py ├── scripts ├── fmt.sh └── test.sh ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── test_algorithms.py ├── test_astype.py ├── test_autoconversion.py ├── test_builtins.py ├── test_constructors.py ├── test_dtype.py ├── test_fastpath.py ├── test_indexing.py ├── test_misc_operations.py ├── test_operators.py ├── test_pandas.py ├── test_reduce.py ├── test_regressions.py ├── test_slicing.py ├── test_testing.py ├── test_ufunc.py └── test_view.py /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | end_of_line = lf 6 | insert_final_newline = true 7 | 8 | [*.py] 9 | include_trailing_comma = true 10 | indent_size = 4 11 | indent_style = space 12 | trim_trailing_whitespace = true 13 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, W503 3 | max-line-length = 80 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 6 | exclude = 7 | build, 8 | dist 9 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | tags: 8 | - v* 9 | pull_request: 10 | 11 | env: 12 | IS_TAG: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')}} 13 | MASTER_PYTHON: "3.8" 14 | 15 | jobs: 16 | lint: 17 | runs-on: ubuntu-latest 18 | timeout-minutes: 10 19 | steps: 20 | - name: Checkout 21 | uses: actions/checkout@v2 22 | - name: Set up Python ${{ env.MASTER_PYTHON }} 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ env.MASTER_PYTHON }} 26 | - name: Install Poetry Itself 27 | run: pip install poetry 28 | - name: Poetry Install 29 | run: poetry install 30 | - name: Flake8 31 | run: poetry run flake8 32 | - name: Mypy 33 | run: poetry run mypy . 34 | - name: Black 35 | run: poetry run black --check . 36 | - name: Isort 37 | run: poetry run isort --check-only . 
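      # NOTE: shellcheck comes pre-installed on the GitHub-hosted Ubuntu runners,
      # so no extra install step is needed before this point.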
38 | - name: Shellcheck 39 | run: shellcheck scripts/*.sh 40 | 41 | test: 42 | strategy: 43 | matrix: 44 | python: ["3.6", "3.7", "3.8"] 45 | runs-on: ubuntu-latest 46 | timeout-minutes: 10 47 | steps: 48 | - name: Checkout 49 | uses: actions/checkout@v2 50 | - name: Set up Python ${{ matrix.python }} 51 | uses: actions/setup-python@v2 52 | with: 53 | python-version: ${{ matrix.python }} 54 | - name: Install Poetry Itself 55 | run: pip install poetry 56 | - name: Poetry Install 57 | run: poetry install 58 | - name: Pytest 59 | run: poetry run pytest 60 | - name: ASV 61 | run: | 62 | poetry run asv machine --machine travis --os unknown --arch unknown --cpu unknown --ram unknown 63 | poetry run asv run --show-stderr --environment existing --quick 64 | - name: Codecov 65 | uses: codecov/codecov-action@v1.2.1 66 | with: 67 | # NOTE: `token` is not required, because the rle-array repo is public 68 | file: ./coverage.xml 69 | name: pytest-${{ runner.OS }}-${{ matrix.python }} 70 | 71 | docs: 72 | runs-on: ubuntu-latest 73 | timeout-minutes: 10 74 | steps: 75 | - name: Checkout 76 | uses: actions/checkout@v2 77 | - name: Set up Python ${{ env.MASTER_PYTHON }} 78 | uses: actions/setup-python@v2 79 | with: 80 | python-version: ${{ env.MASTER_PYTHON }} 81 | - name: Install Poetry Itself 82 | run: pip install poetry 83 | - name: Poetry Install 84 | run: poetry install 85 | - name: Sphinx 86 | run: | 87 | poetry run python setup.py build_sphinx 88 | touch ./docs/_build/html/.nojekyll 89 | - name: Preserve Docs 90 | uses: actions/upload-artifact@v2.2.2 91 | with: 92 | name: docs 93 | path: docs/_build/html 94 | - name: Deploy Docs 95 | if: github.event_name == 'push' && github.event.ref == 'refs/heads/master' 96 | uses: peaceiris/actions-gh-pages@v3 97 | with: 98 | github_token: ${{ secrets.GITHUB_TOKEN }} 99 | publish_dir: ./docs/_build/html 100 | 101 | release: 102 | runs-on: ubuntu-latest 103 | needs: [lint, test, docs] 104 | steps: 105 | - name: Checkout 106 | uses: actions/checkout@v2 107 | - name: Set up Python ${{ env.MASTER_PYTHON }} 108 | uses: actions/setup-python@v2 109 | with: 110 | python-version: ${{ env.MASTER_PYTHON }} 111 | - name: Install Poetry Itself 112 | run: pip install poetry 113 | - name: Poetry Install 114 | run: poetry install 115 | - name: Build 116 | run: poetry build 117 | - name: Prepare Release Notes 118 | run: awk 'BEGIN{found=0} {if (match($0, "==============")) {if (found == 1) exit; found=1}; if (found == 1) {print last}; last=$0}' CHANGES.rst > release_notes.rst 119 | - name: Create Release Notes 120 | uses: docker://pandoc/core:2.9 121 | with: 122 | args: --from=rst --to=markdown -o release_notes.md release_notes.rst 123 | - name: Preserve Dist 124 | uses: actions/upload-artifact@v2.2.2 125 | with: 126 | name: dist 127 | path: dist 128 | - name: Preserve Release Notes 129 | uses: actions/upload-artifact@v2.2.2 130 | with: 131 | name: release_notes.md 132 | path: release_notes.md 133 | - name: Publish to PyPI 134 | if: env.IS_TAG == 'true' 135 | run: poetry publish 136 | env: 137 | POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_TOKEN }} 138 | - name: Create GitHub Release 139 | if: env.IS_TAG == 'true' 140 | uses: actions/create-release@v1.1.4 141 | env: 142 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 143 | with: 144 | tag_name: ${{ github.ref }} 145 | release_name: rle-array ${{ github.ref }} 146 | body_path: release_notes.md 147 | draft: false 148 | prerelease: false 149 | -------------------------------------------------------------------------------- /.gitignore: 
--------------------------------------------------------------------------------
1 | *.asv/
2 | *.egg-info/
3 | .coverage
4 | .mypy_cache/
5 | .pytest_cache/
6 | .venv/
7 | __pycache__/
8 | build/
9 | coverage.xml
10 | dist/
11 | docs/_build/
12 | docs/_rst/
13 | pip-wheel-metadata/
14 | poetry.lock
15 | 
--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
1 | =========
2 | Changelog
3 | =========
4 | 
5 | Version 0.1.0 (unreleased)
6 | ==========================
7 | Initial public release.
8 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How To Contribute
2 | 
3 | 
4 | ## Bugs
5 | 
6 | If you've found a bug, please report it to the issue tracker and
7 | 
8 | * Describe the bug you encountered and what the expected behavior should be.
9 | * Provide a [Minimal, Reproducible Example](https://stackoverflow.com/help/mcve) (if possible).
10 | * Be as explicit about your environment as possible, e.g. provide the output of `pip freeze` / `conda list`.
11 | 
12 | ## Code Contributions
13 | 
14 | **Unless you explicitly state otherwise, any contribution you intentionally submit for inclusion in the work shall be
15 | licensed under the MIT license, without any additional terms or conditions.**
16 | 
17 | Please file a GitHub pull request with your contribution. See the [Development](#Development) section for details on
18 | tooling. See the "Development Plan" in the README for the overall prioritization.
19 | 
20 | 
21 | ## Development
22 | 
23 | ### Installation
24 | To get started, set up a new virtual environment and install all requirements:
25 | 
26 | ```bash
27 | virtualenv --python=python3.6 .venv
28 | source .venv/bin/activate
29 | pip install poetry
30 | poetry install
31 | ```
32 | 
33 | ### Code style
34 | 
35 | To ensure a consistent code style across the code base, we're using the following tools:
36 | 
37 | - [`black`](https://github.com/psf/black): code formatter
38 | - [`flake8`](https://gitlab.com/pycqa/flake8): linting
39 | - [`isort`](https://github.com/timothycrosley/isort): sorting of imports
40 | 
41 | We have a convenience script that runs all these tools and a code style check for you:
42 | 
43 | ```bash
44 | poetry run ./scripts/fmt.sh
45 | ```
46 | 
47 | ### Testing
48 | Several tools ensure that the library is well tested and well presented. To run them all at once (useful during
49 | development), use:
50 | 
51 | ```bash
52 | poetry run ./scripts/test.sh
53 | ```
54 | 
55 | ### Pytest
56 | We're using [pytest](https://pytest.org) as a testing framework and make heavy use of `fixtures` and `parametrization`.
57 | To run the tests, simply run:
58 | 
59 | ```bash
60 | poetry run pytest
61 | ```
62 | 
63 | ### Benchmarks
64 | For performance-critical code paths, we have [asv](https://asv.readthedocs.io/) benchmarks in place in the subfolder
65 | `benchmarks`. To run the benchmarks a single time and receive immediate feedback, run:
66 | 
67 | ```bash
68 | poetry run asv run --python=same --show-stderr
69 | ```
70 | 
71 | ### Documentation
72 | Documentation is created using [Sphinx](https://www.sphinx-doc.org/) and can be built using:
73 | 
74 | ```bash
75 | poetry run python setup.py build_sphinx
76 | ```
77 | 
78 | ### Typing
79 | We use [mypy](http://mypy-lang.org/) to check Python types. It can be run using:
80 | 
81 | ```bash
82 | poetry run mypy .
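# configuration lives in mypy.ini; a single module can be checked as well, e.g.:
poetry run mypy rle_array/_slicing.py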
83 | ```
84 | 
85 | ## Performance Improvements
86 | If you wish to contribute a performance improvement, please ensure that a benchmark (in `benchmarks`) exists or that you
87 | provide one in your pull request. Please run that benchmark before and after your change and add both values to the
88 | commit message of your contribution.
89 | 
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019-2020 Blue Yonder Group, Inc
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | =========
2 | rle-array
3 | =========
4 | 
5 | .. image:: https://github.com/JDASoftwareGroup/rle-array/workflows/CI/badge.svg?branch=master
6 |    :target: https://github.com/JDASoftwareGroup/rle-array/actions?query=branch%3Amaster+workflow%3ACI
7 |    :alt: Build Status
8 | .. image:: https://codecov.io/gh/JDASoftwareGroup/rle-array/branch/master/graph/badge.svg?token=y2q96vlHqc
9 |    :target: https://codecov.io/gh/JDASoftwareGroup/rle-array
10 |    :alt: Coverage Status
11 | 
12 | `Extension Array`_ for `Pandas`_ that implements `Run-length Encoding`_.
13 | 
14 | 
15 | .. contents:: Table of Contents
16 | 
17 | 
18 | Quick Start
19 | ***********
20 | 
21 | Some basic setup first:
22 | 
23 | >>> import pandas as pd
24 | >>> pd.set_option("display.max_rows", 40)
25 | >>> pd.set_option("display.width", None)
26 | 
27 | We need some example data, so let's create some pseudo-weather data:
28 | 
29 | >>> from rle_array.testing import generate_example
30 | >>> df = generate_example()
31 | >>> df.head(10)
32 |          date  month  year    city    country   avg_temp   rain   mood
33 | 0  2000-01-01      1  2000  city_0  country_0  12.400000  False     ok
34 | 1  2000-01-02      1  2000  city_0  country_0   4.000000  False     ok
35 | 2  2000-01-03      1  2000  city_0  country_0  17.200001  False  great
36 | 3  2000-01-04      1  2000  city_0  country_0   8.400000  False     ok
37 | 4  2000-01-05      1  2000  city_0  country_0   6.400000  False     ok
38 | 5  2000-01-06      1  2000  city_0  country_0  14.400000  False     ok
39 | 6  2000-01-07      1  2000  city_0  country_0  14.300000   True     ok
40 | 7  2000-01-08      1  2000  city_0  country_0   6.800000  False     ok
41 | 8  2000-01-09      1  2000  city_0  country_0  10.100000  False     ok
42 | 9  2000-01-10      1  2000  city_0  country_0  -1.200000  False     ok
43 | 
44 | Due to the large number of attributes for locations and the date, the data size is quite large:
45 | 
46 | >>> df.memory_usage()
47 | Index            128
48 | date        32000000
49 | month        4000000
50 | year         8000000
51 | city        32000000
52 | country     32000000
53 | avg_temp    16000000
54 | rain         4000000
55 | mood        32000000
56 | dtype: int64
57 | >>> df.memory_usage().sum()
58 | 160000128
59 | 
60 | To compress the data, we can use ``rle-array``:
61 | 
62 | >>> import rle_array
63 | >>> df_rle = df.astype({
64 | ...     "city": "RLEDtype[object]",
65 | ...     "country": "RLEDtype[object]",
66 | ...     "month": "RLEDtype[int8]",
67 | ...     "mood": "RLEDtype[object]",
68 | ...     "rain": "RLEDtype[bool]",
69 | ...     "year": "RLEDtype[int16]",
70 | ... })
71 | >>> df_rle.memory_usage()
72 | Index            128
73 | date        32000000
74 | month        1188000
75 | year          120000
76 | city           32000
77 | country           64
78 | avg_temp    16000000
79 | rain         6489477
80 | mood        17153296
81 | dtype: int64
82 | >>> df_rle.memory_usage().sum()
83 | 72982965
84 | 
85 | This works better the longer the runs are. In the above example, it does not work too well for ``"rain"``.
86 | 
87 | 
88 | Development Plan
89 | ****************
90 | 
91 | The development of ``rle-array`` has the following priorities (in decreasing order):
92 | 
93 | 1. **Correctness:** All results must be correct. The `Pandas`_-provided test suite must pass. Approximations are not
94 |    allowed.
95 | 2. **Transparency:** The user can use :class:`~rle_array.RLEDtype` and :class:`~rle_array.RLEArray` like other `Pandas`_
96 |    types. No special parameters or extra functions are required (see the sketch after this list).
97 | 3. **Features:** Support all features that `Pandas`_ offers, even if it is slow (but inform the user using a
98 |    :class:`pandas.errors.PerformanceWarning`).
99 | 4. **Simplicity:** Do not use `Python C Extensions`_ or `Cython`_ (`NumPy`_ and `Numba`_ are allowed).
100 | 5. **Memory Reduction:** Do not decompress the encoded data when not required; try to do as many calculations as
101 |    possible directly on the compressed representation.
102 | 6. **Performance:** It should be quick; for large data, ideally faster than working on the uncompressed data. Use
103 |    `Numba`_ to speed up code.
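
As a quick sketch of point 2, the compressed frame from the Quick Start behaves just like the uncompressed one;
comparisons, for example, decompress transparently under the hood:

>>> bool((df_rle["city"] == df["city"]).all())
True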
104 | 
105 | 
106 | Implementation
107 | **************
108 | 
109 | Imagine the following data array:
110 | 
111 | +-------+------+
112 | | Index | Data |
113 | +=======+======+
114 | | 1     | "a"  |
115 | +-------+------+
116 | | 2     | "a"  |
117 | +-------+------+
118 | | 3     | "a"  |
119 | +-------+------+
120 | | 4     | "x"  |
121 | +-------+------+
122 | | 5     | "c"  |
123 | +-------+------+
124 | | 6     | "c"  |
125 | +-------+------+
126 | | 7     | "a"  |
127 | +-------+------+
128 | | 8     | "a"  |
129 | +-------+------+
130 | 
131 | Some data points are valid for multiple consecutive entries:
132 | 
133 | +-------+------+
134 | | Index | Data |
135 | +=======+======+
136 | | 1     | "a"  |
137 | +-------+      +
138 | | 2     |      |
139 | +-------+      +
140 | | 3     |      |
141 | +-------+------+
142 | | 4     | "x"  |
143 | +-------+------+
144 | | 5     | "c"  |
145 | +-------+      +
146 | | 6     |      |
147 | +-------+------+
148 | | 7     | "a"  |
149 | +-------+      +
150 | | 8     |      |
151 | +-------+------+
152 | 
153 | These sections are also called *runs* and can be encoded by their value and their length:
154 | 
155 | +--------+-------+
156 | | Length | Value |
157 | +========+=======+
158 | | 3      | "a"   |
159 | +--------+-------+
160 | | 1      | "x"   |
161 | +--------+-------+
162 | | 2      | "c"   |
163 | +--------+-------+
164 | | 2      | "a"   |
165 | +--------+-------+
166 | 
167 | This representation is called `Run-length Encoding`_. To integrate this encoding better with `Pandas`_ and `NumPy`_ and
168 | to support operations like slicing and random access (e.g. via :func:`pandas.api.extensions.ExtensionArray.take`), we
169 | store the end position (the cumulative sum of the length column) instead of the length:
170 | 
171 | +--------------+-------+
172 | | End-position | Value |
173 | +==============+=======+
174 | | 3            | "a"   |
175 | +--------------+-------+
176 | | 4            | "x"   |
177 | +--------------+-------+
178 | | 6            | "c"   |
179 | +--------------+-------+
180 | | 8            | "a"   |
181 | +--------------+-------+
182 | 
183 | The value array is a :class:`numpy.ndarray` with the same dtype as the original data and the end-positions are a
184 | :class:`numpy.ndarray` with the dtype ``int64``.
185 | 
186 | 
187 | License
188 | *******
189 | 
190 | Licensed under:
191 | 
192 | - MIT License (``LICENSE.txt`` or https://opensource.org/licenses/MIT)
193 | 
194 | 
195 | .. _Cython: https://cython.org/
196 | .. _Extension Array: https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extensionarray
197 | .. _Numba: https://numba.pydata.org/
198 | .. _NumPy: https://numpy.org/
199 | .. _Pandas: https://pandas.pydata.org/
200 | .. _Python C Extensions: https://docs.python.org/3/extending/building.html
201 | .. _Run-length Encoding: https://en.wikipedia.org/wiki/Run-length_encoding
202 | 
--------------------------------------------------------------------------------
/asv.conf.json:
--------------------------------------------------------------------------------
1 | {
2 |     // The version of the config file format. Do not change, unless
3 |     // you know what you are doing.
4 |     "version": 1,
5 | 
6 |     // The name of the project being benchmarked
7 |     "project": "rle-array",
8 | 
9 |     // The project's homepage
10 |     "project_url": "http://project-homepage.org/",
11 | 
12 |     // The URL or local path of the source code repository for the
13 |     // project being benchmarked
14 |     "repo": "..",
15 | 
16 |     // The Python project's subdirectory in your repo. If missing or
17 |     // the empty string, the project is assumed to be located at the root
18 |     // of the repository.
19 | // "repo_subdir": "", 20 | 21 | // Customizable commands for building, installing, and 22 | // uninstalling the project. See asv.conf.json documentation. 23 | // 24 | // "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"], 25 | // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], 26 | // "build_command": [ 27 | // "python setup.py build", 28 | // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" 29 | // ], 30 | 31 | // List of branches to benchmark. If not provided, defaults to "master" 32 | // (for git) or "default" (for mercurial). 33 | // "branches": ["master"], // for git 34 | // "branches": ["default"], // for mercurial 35 | 36 | // The DVCS being used. If not set, it will be automatically 37 | // determined from "repo" by looking at the protocol in the URL 38 | // (if remote), or by looking for special directories, such as 39 | // ".git" (if local). 40 | // "dvcs": "git", 41 | 42 | // The tool to use to create environments. May be "conda", 43 | // "virtualenv" or other value depending on the plugins in use. 44 | // If missing or the empty string, the tool will be automatically 45 | // determined by looking for tools on the PATH environment 46 | // variable. 47 | "environment_type": "virtualenv", 48 | 49 | // timeout in seconds for installing any dependencies in environment 50 | // defaults to 10 min 51 | //"install_timeout": 600, 52 | 53 | // the base URL to show a commit for the project. 54 | // "show_commit_url": "http://github.com/owner/project/commit/", 55 | 56 | // The Pythons you'd like to test against. If not provided, defaults 57 | // to the current version of Python used to run `asv`. 58 | "pythons": ["3.6", "3.7", "3.8"], 59 | 60 | // The list of conda channel names to be searched for benchmark 61 | // dependency packages in the specified order 62 | // "conda_channels": ["conda-forge", "defaults"], 63 | 64 | // The matrix of dependencies to test. Each key is the name of a 65 | // package (in PyPI) and the values are version numbers. An empty 66 | // list or empty string indicates to just test against the default 67 | // (latest) version. null indicates that the package is to not be 68 | // installed. If the package to be tested is only available from 69 | // PyPi, and the 'environment_type' is conda, then you can preface 70 | // the package name by 'pip+', and the package will be installed via 71 | // pip (with all the conda available packages installed first, 72 | // followed by the pip installed packages). 73 | // 74 | // "matrix": { 75 | // "numpy": ["1.6", "1.7"], 76 | // "six": ["", null], // test with and without six installed 77 | // "pip+emcee": [""], // emcee is only available for install with pip. 78 | // }, 79 | 80 | // Combinations of libraries/python versions can be excluded/included 81 | // from the set to test. Each entry is a dictionary containing additional 82 | // key-value pairs to include/exclude. 83 | // 84 | // An exclude entry excludes entries where all values match. The 85 | // values are regexps that should match the whole string. 86 | // 87 | // An include entry adds an environment. Only the packages listed 88 | // are installed. The 'python' key is required. The exclude rules 89 | // do not apply to includes. 90 | // 91 | // In addition to package names, the following keys are available: 92 | // 93 | // - python 94 | // Python version, as in the *pythons* variable above. 95 | // - environment_type 96 | // Environment type, as above. 
97 | // - sys_platform 98 | // Platform, as in sys.platform. Possible values for the common 99 | // cases: 'linux2', 'win32', 'cygwin', 'darwin'. 100 | // 101 | // "exclude": [ 102 | // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows 103 | // {"environment_type": "conda", "six": null}, // don't run without six on conda 104 | // ], 105 | // 106 | // "include": [ 107 | // // additional env for python2.7 108 | // {"python": "2.7", "numpy": "1.8"}, 109 | // // additional env if run on windows+conda 110 | // {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""}, 111 | // ], 112 | 113 | // The directory (relative to the current directory) that benchmarks are 114 | // stored in. If not provided, defaults to "benchmarks" 115 | // "benchmark_dir": "benchmarks", 116 | 117 | // The directory (relative to the current directory) to cache the Python 118 | // environments in. If not provided, defaults to "env" 119 | "env_dir": ".asv/env", 120 | 121 | // The directory (relative to the current directory) that raw benchmark 122 | // results are stored in. If not provided, defaults to "results". 123 | "results_dir": ".asv/results", 124 | 125 | // The directory (relative to the current directory) that the html tree 126 | // should be written to. If not provided, defaults to "html". 127 | "html_dir": ".asv/html", 128 | 129 | // The number of characters to retain in the commit hashes. 130 | // "hash_length": 8, 131 | 132 | // `asv` will cache results of the recent builds in each 133 | // environment, making them faster to install next time. This is 134 | // the number of builds to keep, per environment. 135 | // "build_cache_size": 2, 136 | 137 | // The commits after which the regression search in `asv publish` 138 | // should start looking for regressions. Dictionary whose keys are 139 | // regexps matching to benchmark names, and values corresponding to 140 | // the commit (exclusive) after which to start looking for 141 | // regressions. The default is to start from the first commit 142 | // with results. If the commit is `null`, regression detection is 143 | // skipped for the matching benchmark. 144 | // 145 | // "regressions_first_commits": { 146 | // "some_benchmark": "352cdf", // Consider regressions only after this commit 147 | // "another_benchmark": null, // Skip regression detection altogether 148 | // }, 149 | 150 | // The thresholds for relative change in results, after which `asv 151 | // publish` starts reporting regressions. Dictionary of the same 152 | // form as in ``regressions_first_commits``, with values 153 | // indicating the thresholds. If multiple entries match, the 154 | // maximum is taken. If no entry matches, the default is 5%. 
155 | // 156 | // "regressions_thresholds": { 157 | // "some_benchmark": 0.01, // Threshold of 1% 158 | // "another_benchmark": 0.5, // Threshold of 50% 159 | // }, 160 | } 161 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JDASoftwareGroup/rle-array/e5201b9185079f4fc4fd907d8f591426df79946e/benchmarks/__init__.py -------------------------------------------------------------------------------- /benchmarks/benchmarks.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from contextlib import contextmanager 3 | from typing import Generator 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from pandas.errors import PerformanceWarning 8 | 9 | from rle_array.autoconversion import auto_convert_to_rle, decompress 10 | from rle_array.testing import const_col, dim_col, generate_test_dataframe 11 | 12 | 13 | class Base: 14 | min_run_count = 10 15 | processes = 1 16 | repeat = 5 17 | sample_time = 1.0 18 | warmup_time = 1.0 19 | 20 | def gen_baseline(self) -> pd.DataFrame: 21 | return generate_test_dataframe(n_dims=3, size=100) 22 | 23 | def setup(self) -> None: 24 | self.df_baseline = self.gen_baseline() 25 | self.df_rle = self.df_baseline.astype("RLEDtype[int64]") 26 | 27 | @contextmanager 28 | def ignore_performance_warnings(self) -> Generator[None, None, None]: 29 | with warnings.catch_warnings(): 30 | warnings.simplefilter(action="ignore", category=PerformanceWarning) 31 | yield 32 | 33 | 34 | class TimeAutoConversion(Base): 35 | def time_auto_convert_to_rle_compress_all(self) -> None: 36 | auto_convert_to_rle(self.df_baseline) 37 | 38 | def time_auto_convert_to_rle_no_compression_allowed(self) -> None: 39 | auto_convert_to_rle(self.df_baseline, 0.0) 40 | 41 | def time_auto_convert_to_rle_already_compressed(self) -> None: 42 | auto_convert_to_rle(self.df_rle) 43 | 44 | def time_decompress_compressed(self) -> None: 45 | decompress(self.df_rle) 46 | 47 | def time_decompress_noop(self) -> None: 48 | decompress(self.df_baseline) 49 | 50 | 51 | class TimeCompression(Base): 52 | def time_decompress_array_astype(self) -> None: 53 | with self.ignore_performance_warnings(): 54 | self.df_rle[const_col([1, 2])].array.astype(np.int64) 55 | 56 | def time_decompress_to_numpy(self) -> None: 57 | with self.ignore_performance_warnings(): 58 | self.df_rle[const_col([1, 2])].to_numpy() 59 | 60 | 61 | class TimeTake(Base): 62 | def setup(self) -> None: 63 | super().setup() 64 | 65 | self.shuffle_dim2_unstable = self.df_baseline.sort_values( 66 | dim_col(2), kind="quicksort" 67 | ).index.values 68 | self.shuffle_dim2_stable = self.df_baseline.sort_values( 69 | dim_col(2), kind="mergesort" 70 | ).index.values 71 | 72 | def time_unstable_const12_base(self) -> None: 73 | self.df_baseline[const_col([1, 2])].take(self.shuffle_dim2_unstable) 74 | 75 | def time_unstable_const12_rle(self) -> None: 76 | self.df_rle[const_col([1, 2])].take(self.shuffle_dim2_unstable) 77 | 78 | def time_stable_const12_base(self) -> None: 79 | self.df_baseline[const_col([1, 2])].take(self.shuffle_dim2_stable) 80 | 81 | def time_stable_const12_rle(self) -> None: 82 | self.df_rle[const_col([1, 2])].take(self.shuffle_dim2_stable) 83 | 84 | 85 | class TimeGroupByReduce(Base): 86 | def setup(self) -> None: 87 | super().setup() 88 | 89 | df_rle_wo_dims = self.df_rle.copy() 90 | for d in range(3): 91 | 
df_rle_wo_dims[dim_col(d)] = self.df_baseline[dim_col(d)].copy() 92 | self.df_rle_wo_dims = df_rle_wo_dims 93 | 94 | def time_key2_opsum_const12_baseline(self) -> None: 95 | self.df_baseline.groupby(dim_col(2))[const_col([1, 2])].sum() 96 | 97 | def time_key2_opsum_const12_rle(self) -> None: 98 | with self.ignore_performance_warnings(): 99 | self.df_rle_wo_dims.groupby(dim_col(2))[const_col([1, 2])].sum() 100 | 101 | 102 | class TimeSeriesReduce(Base): 103 | def time_sum_const12_baseline(self) -> None: 104 | self.df_baseline[const_col([1, 2])].sum() 105 | 106 | def time_sum_const12_rle(self) -> None: 107 | self.df_rle[const_col([1, 2])].sum() 108 | 109 | def time_sum_const012_baseline(self) -> None: 110 | self.df_baseline[const_col([0, 1, 2])].sum() 111 | 112 | def time_sum_const012_rle(self) -> None: 113 | self.df_rle[const_col([0, 1, 2])].sum() 114 | 115 | 116 | class TimeShift(Base): 117 | def time_int_const12_base(self) -> None: 118 | self.df_baseline[const_col([1, 2])].shift(periods=1, fill_value=1) 119 | 120 | def time_int_const12_rle(self) -> None: 121 | self.df_rle[const_col([1, 2])].shift(periods=1, fill_value=1) 122 | 123 | def time_float_const12_base(self) -> None: 124 | self.df_baseline[const_col([1, 2])].shift(periods=1) 125 | 126 | def time_float_const12_rle(self) -> None: 127 | self.df_rle[const_col([1, 2])].shift(periods=1) 128 | 129 | 130 | class TimeUnique(Base): 131 | def time_const12_base(self) -> None: 132 | self.df_baseline[const_col([1, 2])].unique() 133 | 134 | def time_const12_rle(self) -> None: 135 | self.df_rle[const_col([1, 2])].unique() 136 | 137 | 138 | class TimeOperator(Base): 139 | def time_add_const12_baseline(self) -> None: 140 | self.df_baseline[const_col([1, 2])] + self.df_baseline[const_col([1, 2])] 141 | 142 | def time_add_const12_rle(self) -> None: 143 | self.df_rle[const_col([1, 2])] + self.df_rle[const_col([1, 2])] 144 | 145 | def time_eq_const12_baseline(self) -> None: 146 | self.df_baseline[const_col([1, 2])] == self.df_baseline[const_col([1, 2])] 147 | 148 | def time_eq_const12_rle(self) -> None: 149 | self.df_rle[const_col([1, 2])] == self.df_rle[const_col([1, 2])] 150 | 151 | 152 | class TimeGenerateTestDataFrame(Base): 153 | def time(self) -> None: 154 | self.gen_baseline() 155 | 156 | 157 | class TimeFactorize(Base): 158 | def time_const12_base(self) -> None: 159 | self.df_baseline[const_col([1, 2])].factorize() 160 | 161 | def time_const12_rle(self) -> None: 162 | with self.ignore_performance_warnings(): 163 | self.df_rle[const_col([1, 2])].factorize() 164 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | coverage: 3 | status: 4 | patch: off 5 | project: 6 | default: 7 | target: 100% 8 | threshold: 0% 9 | -------------------------------------------------------------------------------- /docs/_static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JDASoftwareGroup/rle-array/e5201b9185079f4fc4fd907d8f591426df79946e/docs/_static/.gitkeep -------------------------------------------------------------------------------- /docs/changes.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../CHANGES.rst 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | import sys 4 | 5 | from sphinx.ext import apidoc 6 | 7 | # Generate module references 8 | __location__ = os.path.join( 9 | os.getcwd(), os.path.dirname(inspect.getfile(inspect.currentframe())) 10 | ) 11 | output_dir = os.path.abspath(os.path.join(__location__, "..", "docs", "_rst")) 12 | module_dir = os.path.abspath(os.path.join(__location__, "..", "rle_array")) 13 | apidoc_parameters = ["-f", "-e", "-o", output_dir, module_dir] 14 | apidoc.main(apidoc_parameters) 15 | 16 | sys.path.append(os.path.abspath(os.path.join(__location__, "sphinxext"))) 17 | 18 | add_module_names = False 19 | author = "Blue Yonder Group, Inc" 20 | copyright = "2019-2020, Blue Yonder Group, Inc" 21 | project = "rle-array" 22 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 23 | extensions = [ 24 | "ignore_missing_refs", 25 | "sphinx.ext.autodoc", 26 | "sphinx.ext.doctest", 27 | "sphinx.ext.napoleon", 28 | ] 29 | html_static_path = ["_static"] 30 | html_theme = "alabaster" 31 | nitpicky = True 32 | templates_path = ["_templates"] 33 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | 3 | 4 | Contents 5 | ******** 6 | 7 | .. toctree:: 8 | 9 | Module Reference <_rst/modules> 10 | Changelog 11 | 12 | 13 | Indices and tables 14 | ****************** 15 | 16 | * :ref:`genindex` 17 | * :ref:`modindex` 18 | * :ref:`search` 19 | -------------------------------------------------------------------------------- /docs/sphinxext/ignore_missing_refs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from docutils import nodes 3 | 4 | PACKAGES = ["rle_array"] 5 | 6 | 7 | def _is_external_target(target): 8 | return not any(((target == p) or target.startswith(p + ".") for p in PACKAGES)) 9 | 10 | 11 | def _is_private_target(target): 12 | return any((part.startswith("_") for part in target.split("."))) 13 | 14 | 15 | def missing_reference(app, env, node, contnode): 16 | target = node["reftarget"] 17 | if _is_external_target(target) or _is_private_target(target): 18 | newnode = nodes.reference("", "", internal=False, refuri="#", reftitle="") 19 | newnode.append(contnode) 20 | return newnode 21 | 22 | 23 | def setup(app): 24 | app.connect("missing-reference", missing_reference) 25 | return {"version": "0.1", "parallel_read_safe": True} 26 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | # Specify the target platform details in config, so your developers are 3 | # free to run mypy on Windows, Linux, or macOS and get consistent 4 | # results. 
5 | python_version = 3.8
6 | platform = linux
7 | 
8 | # flake8-mypy expects the following for sensible formatting
9 | show_column_numbers = True
10 | 
11 | # show error messages from unrelated files
12 | follow_imports = normal
13 | 
14 | # be strict
15 | strict = True
16 | disallow_subclassing_any = False
17 | disallow_untyped_decorators = False
18 | 
19 | [mypy-numba.*]
20 | ignore_missing_imports = True
21 | 
22 | [mypy-numpy.*]
23 | ignore_missing_imports = True
24 | 
25 | [mypy-pandas.*]
26 | ignore_missing_imports = True
27 | 
28 | [mypy-setuptools.*]
29 | ignore_missing_imports = True
30 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "poetry-core>=1.0.0",
4 | ]
5 | build-backend = "poetry.core.masonry.api"
6 | 
7 | [tool.isort]
8 | profile = "black"
9 | 
10 | [tool.poetry]
11 | name = "rle-array"
12 | description = "Run-length encoded pandas."
13 | authors = [
14 |     "Blue Yonder Group, Inc",
15 | ]
16 | version = "0.1"
17 | readme = "README.rst"
18 | license = "MIT"
19 | packages = [
20 |     { include = "rle_array" },
21 | ]
22 | repository = "https://github.com/JDASoftwareGroup/rle-array"
23 | keywords = [
24 |     "python",
25 | ]
26 | classifiers = [
27 |     "Development Status :: 4 - Beta",
28 |     "Environment :: Console",
29 |     "Intended Audience :: Developers",
30 |     "Natural Language :: English",
31 |     "Programming Language :: Python",
32 |     "Programming Language :: Python :: 3",
33 | ]
34 | 
35 | [tool.poetry.dependencies]
36 | python = ">=3.6.1,<3.9"
37 | numba = ">=0.51.2"
38 | numpy = ">=1.17"
39 | pandas = ">=1.1.5,<1.2"
40 | 
41 | [tool.poetry.dev-dependencies]
42 | asv = "*"
43 | black = "19.10b0"
44 | flake8-mutable = "1.2.0"
45 | flake8 = "3.8.3"
46 | isort = "5.0.9"
47 | mypy = "*"
48 | pytest = ">=6"
49 | pytest-cov = "*"
50 | sphinx = "*"
51 | 
52 | [tool.pytest.ini_options]
53 | addopts = "--cov=rle_array --cov-report term-missing --cov-report xml"
54 | testpaths = "tests"
55 | 
--------------------------------------------------------------------------------
/rle_array/__init__.py:
--------------------------------------------------------------------------------
1 | from .array import RLEArray
2 | from .autoconversion import auto_convert_to_rle
3 | from .dtype import RLEDtype
4 | 
5 | __all__ = ("auto_convert_to_rle", "RLEArray", "RLEDtype")
6 | 
--------------------------------------------------------------------------------
/rle_array/_algorithms.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Iterator, List, Optional, Tuple
2 | 
3 | import numba
4 | import numpy as np
5 | import pandas as pd
6 | 
7 | from ._slicing import NormalizedSlice
8 | from .types import POSITIONS_DTYPE
9 | 
10 | 
11 | def calc_lengths(positions: np.ndarray) -> np.ndarray:
12 |     """
13 |     Calculate lengths of runs.
14 | 
15 |     Parameters
16 |     ----------
17 |     positions:
18 |         End positions of runs.
19 | 
20 |     Returns
21 |     -------
22 |     lengths:
23 |         Lengths of runs.
24 |     """
25 |     return np.concatenate([positions[:1], positions[1:] - positions[:-1]])
26 | 
27 | 
28 | def compress(scalars: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
29 |     """
30 |     Compress given array of scalars to RLE.
31 | 
32 |     Parameters
33 |     ----------
34 |     scalars:
35 |         Scalars to compress.
36 | 
37 |     Returns
38 |     -------
39 |     data:
40 |         Data at start of each run.
41 |     positions:
42 |         End positions of runs.
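
    Examples
    --------
    A minimal sketch with integer input:

    >>> import numpy as np
    >>> data, positions = compress(np.array([1, 1, 2, 2, 2, 3]))
    >>> list(data), list(positions)
    ([1, 2, 3], [2, 5, 6])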
43 | 
44 |     Raises
45 |     ------
46 |     ValueError: If non-1-dimensional arrays are compressed.
47 |     """
48 |     if scalars.ndim != 1:
49 |         raise ValueError("Only 1-dimensional arrays can be compressed.")
50 |     if len(scalars) == 0:
51 |         return (scalars, np.array([], dtype=POSITIONS_DTYPE))
52 | 
53 |     changes = detect_changes(scalars)
54 | 
55 |     data = np.concatenate([scalars[:-1][changes], scalars[-1:]])
56 |     positions = np.concatenate(
57 |         [np.where(changes)[0] + 1, np.asarray([len(scalars)], dtype=POSITIONS_DTYPE)]
58 |     )
59 |     return (data, positions)
60 | 
61 | 
62 | def concat(
63 |     data_parts: List[np.ndarray], positions_parts: List[np.ndarray]
64 | ) -> Tuple[np.ndarray, np.ndarray]:
65 |     """
66 |     Concatenate RLE data.
67 | 
68 |     Parameters
69 |     ----------
70 |     data_parts:
71 |         For each part: Data at start of each run.
72 |     positions_parts:
73 |         For each part: End positions of runs.
74 | 
75 |     Returns
76 |     -------
77 |     data:
78 |         Data at start of each run.
79 |     positions:
80 |         End positions of runs.
81 |     """
82 |     assert len(data_parts) == len(positions_parts)
83 |     if len(data_parts) == 0:
84 |         return (np.array([]), np.array([], dtype=POSITIONS_DTYPE))
85 | 
86 |     lengths = np.asarray([get_len(positions) for positions in positions_parts])
87 |     offsets = np.roll(np.cumsum(lengths), 1)
88 |     offsets[0] = 0
89 | 
90 |     data = np.concatenate(data_parts)
91 |     positions = np.concatenate(
92 |         [positions + o for positions, o in zip(positions_parts, offsets)]
93 |     )
94 | 
95 |     data, positions = recompress(data, positions)
96 |     return (data, positions)
97 | 
98 | 
99 | @numba.jit(nopython=True, cache=True, nogil=True)
100 | def _inplace_repeat(
101 |     data: np.ndarray, positions: np.ndarray, out: np.ndarray
102 | ) -> None:
103 |     n = len(positions)
104 |     assert len(data) == n
105 |     assert n > 0
106 | 
107 |     out[0 : positions[0]] = data[0]
108 | 
109 |     if n == 1:
110 |         return
111 | 
112 |     for i in range(1, n):
113 |         out[positions[i - 1] : positions[i]] = data[i]
114 |     return
115 | 
116 | 
117 | def decompress(
118 |     data: np.ndarray, positions: np.ndarray, dtype: Optional[Any] = None
119 | ) -> np.ndarray:
120 |     """
121 |     Decompress RLE data.
122 | 
123 |     Parameters
124 |     ----------
125 |     data:
126 |         Data at start of each run.
127 |     positions:
128 |         End positions of runs.
129 |     dtype:
130 |         Optional dtype for conversion.
131 | 
132 |     Returns
133 |     -------
134 |     scalars:
135 |         Scalars, decompressed.
136 |     """
137 |     target_dtype = dtype if dtype is not None else data.dtype
138 |     if len(data) == 0:
139 |         return np.empty(0, dtype=target_dtype)
140 | 
141 |     if dtype is not None:
142 |         data = data.astype(target_dtype, copy=False)
143 | 
144 |     if (target_dtype != np.dtype(object)) and not np.issubdtype(
145 |         target_dtype, np.flexible
146 |     ):
147 |         out = np.empty(positions[-1], dtype=target_dtype)
148 |         _inplace_repeat(data, positions, out)
149 |         return out
150 |     else:
151 |         lengths = calc_lengths(positions)
152 |         return np.repeat(data, lengths)
153 | 
154 | 
155 | def detect_changes(scalars: np.ndarray) -> np.ndarray:
156 |     """
157 |     Detect changes in an array of scalars. These changes can be used as boundaries for RLE runs.
158 | 
159 |     Parameters
160 |     ----------
161 |     scalars:
162 |         Scalars to compress.
163 | 
164 |     Returns
165 |     -------
166 |     changes:
167 |         Change points (boolean mask).
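
    Examples
    --------
    A short sketch with integer input:

    >>> import numpy as np
    >>> list(detect_changes(np.array([1, 1, 2, 2])))
    [False, True, False]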
168 |     """
169 |     nulls = pd.isna(scalars)
170 |     identical = (scalars[1:] == scalars[:-1]) | (nulls[1:] & nulls[:-1])
171 |     return ~identical
172 | 
173 | 
174 | def dropna(data: np.ndarray, positions: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
175 |     """
176 |     Drop NULL-values from RLE data.
177 | 
178 |     Parameters
179 |     ----------
180 |     data:
181 |         Data at start of each run.
182 |     positions:
183 |         End positions of runs.
184 | 
185 |     Returns
186 |     -------
187 |     data:
188 |         Data at start of each run.
189 |     positions:
190 |         End positions of runs.
191 |     """
192 |     mask = pd.notnull(data)
193 |     data = data[mask]
194 |     lengths = calc_lengths(positions)
195 |     positions = (
196 |         positions
197 |         - np.cumsum(lengths * (~mask).astype(POSITIONS_DTYPE), dtype=POSITIONS_DTYPE)
198 |     )[mask]
199 |     return (data, positions)
200 | 
201 | 
202 | def find_single_index(data: np.ndarray, positions: np.ndarray, i: int) -> Any:
203 |     """
204 |     Find single element in RLE data.
205 | 
206 |     .. important::
207 |         This function does NOT handle negative indices.
208 | 
209 |     Parameters
210 |     ----------
211 |     data:
212 |         Data at start of each run.
213 |     positions:
214 |         End positions of runs.
215 | 
216 |     Returns
217 |     -------
218 |     element:
219 |         Found element.
220 | 
221 |     Raises
222 |     ------
223 |     IndexError: In case of an out-of-bounds index request.
224 |     """
225 |     if (i < 0) or (i >= get_len(positions)):
226 |         raise IndexError(f"{i} out of bounds")
227 |     return data[np.searchsorted(positions, i, side="right")]
228 | 
229 | 
230 | def find_slice(
231 |     data: np.ndarray, positions: np.ndarray, s: slice
232 | ) -> Tuple[np.ndarray, np.ndarray]:
233 |     """
234 |     Get slice of RLE data.
235 | 
236 |     Parameters
237 |     ----------
238 |     data:
239 |         Data at start of each run.
240 |     positions:
241 |         End positions of runs.
242 | 
243 |     Returns
244 |     -------
245 |     data:
246 |         Data at start of each run.
247 |     positions:
248 |         End positions of runs.
249 |     """
250 |     length = get_len(positions)
251 |     s_norm = NormalizedSlice.from_slice(length, s)
252 | 
253 |     start, stop, step = s_norm.start, s_norm.stop, s_norm.step
254 |     invert = False
255 |     if step < 0:
256 |         invert = True
257 |         start, stop = stop + 1, start + 1
258 |         step = abs(step)
259 | 
260 |     if start == 0:
261 |         idx_start = 0
262 |     else:
263 |         idx_start = np.searchsorted(positions, start, side="right")
264 |         # start >= length cannot occur here because NormalizedSlice sets start=0 and stop=0 for empty slices
265 | 
266 |     if stop == 0:
267 |         idx_stop = 0
268 |     elif stop >= length:
269 |         idx_stop = len(positions)
270 |     else:
271 |         idx_stop = np.searchsorted(positions, stop, side="left") + 1
272 | 
273 |     data = data[idx_start:idx_stop]
274 |     positions = positions[idx_start:idx_stop] - start
275 |     if len(positions) > 0:
276 |         positions[-1] = stop - start
277 | 
278 |     if invert:
279 |         lengths = calc_lengths(positions)
280 |         lengths = lengths[::-1]
281 |         positions = np.cumsum(lengths)
282 |         data = data[::-1]
283 | 
284 |     if step != 1:
285 |         positions = ((positions - 1) // step) + 1
286 | 
287 |     mask = np.empty(len(positions), dtype=bool)
288 |     if len(positions) > 0:
289 |         mask[0] = True
290 |         mask[1:] = positions[1:] != positions[:-1]
291 | 
292 |     data = data[mask]
293 |     positions = positions[mask]
294 | 
295 |     data, positions = recompress(data, positions)
296 | 
297 |     return (data, positions)
298 | 
299 | 
300 | def gen_iterator(data: np.ndarray, positions: np.ndarray) -> Iterator[Any]:
301 |     """
302 |     Generate iterator over RLE data.
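
    A lazy sketch of what :func:`decompress` materializes eagerly:

    >>> import numpy as np
    >>> list(gen_iterator(np.array([1, 2]), np.array([3, 5])))
    [1, 1, 1, 2, 2]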
303 | 
304 |     Parameters
305 |     ----------
306 |     data:
307 |         Data at start of each run.
308 |     positions:
309 |         End positions of runs.
310 | 
311 |     Returns
312 |     -------
313 |     it:
314 |         Iterator over uncompressed values.
315 |     """
316 |     old_p = 0
317 |     for x, p in zip(data, positions):
318 |         for _ in range(p - old_p):
319 |             yield x
320 |         old_p = p
321 | 
322 | 
323 | def get_len(positions: np.ndarray) -> int:
324 |     """
325 |     Get length of RLE data.
326 | 
327 |     Parameters
328 |     ----------
329 |     positions:
330 |         End positions of runs.
331 | 
332 |     Returns
333 |     -------
334 |     len:
335 |         Length.
336 |     """
337 |     if len(positions) > 0:
338 |         return int(positions[-1])
339 |     else:
340 |         return 0
341 | 
342 | 
343 | def recompress(
344 |     data: np.ndarray, positions: np.ndarray
345 | ) -> Tuple[np.ndarray, np.ndarray]:
346 |     """
347 |     Try to compress RLE data even more.
348 | 
349 |     Parameters
350 |     ----------
351 |     data:
352 |         Data at start of each run.
353 |     positions:
354 |         End positions of runs.
355 | 
356 |     Returns
357 |     -------
358 |     data:
359 |         Data at start of each run.
360 |     positions:
361 |         End positions of runs.
362 |     """
363 |     changes = detect_changes(data)
364 | 
365 |     data = np.concatenate([data[:-1][changes], data[-1:]])
366 |     positions = np.concatenate([positions[:-1][changes], positions[-1:]])
367 |     return (data, positions)
368 | 
369 | 
370 | @numba.jit((numba.int64[:], numba.int64[:]), nopython=True, cache=True, nogil=True)
371 | def _take_kernel(
372 |     positions: np.ndarray, indices: np.ndarray
373 | ) -> Tuple[np.ndarray, np.ndarray]:
374 |     n = len(indices)
375 | 
376 |     # pre-allocate output buffers
377 |     result_data_idx = np.empty(n, dtype=POSITIONS_DTYPE)
378 |     result_positions = np.empty(n, dtype=POSITIONS_DTYPE)
379 | 
380 |     current = -2
381 |     run_start = 0
382 |     run_stop = 0
383 |     out_count = 0
384 |     for pos in range(n):
385 |         i = indices[pos]
386 |         if i == -1:
387 |             # fill
388 |             idx = -1
389 |         elif current >= 0 and (run_start <= i) and (i < run_stop):
390 |             # great, same RLE-run
391 |             idx = current
392 |         else:
393 |             # run full search
394 |             idx = np.searchsorted(positions, i, side="right")
395 | 
396 |         # flush?
397 |         if idx != current:
398 |             if current != -2:
399 |                 result_data_idx[out_count] = current
400 |                 result_positions[out_count] = pos
401 |                 out_count += 1
402 |             current = idx
403 | 
404 |             if current > 0:
405 |                 run_start = positions[current - 1]
406 |             else:
407 |                 run_start = 0
408 | 
409 |             if current >= 0:
410 |                 run_stop = positions[current]
411 | 
412 |     # flush?
413 |     if current != -2:
414 |         result_data_idx[out_count] = current
415 |         result_positions[out_count] = n
416 |         out_count += 1
417 | 
418 |     # return clean-cut outputs
419 |     return result_data_idx[:out_count].copy(), result_positions[:out_count].copy()
420 | 
421 | 
422 | def take(
423 |     data: np.ndarray,
424 |     positions: np.ndarray,
425 |     indices: np.ndarray,
426 |     allow_fill: bool,
427 |     fill_value: Any,
428 | ) -> Tuple[np.ndarray, np.ndarray]:
429 |     """
430 |     Take values from RLE array.
431 | 
432 |     Parameters
433 |     ----------
434 |     data:
435 |         Data at start of each run.
436 |     positions:
437 |         End positions of runs.
438 |     indices:
439 |         Indices to take. If ``allow_fill`` is set, the only negative element allowed is ``-1``. If ``allow_fill`` is not
440 |         set, then negative entries will be counted from the end of the array.
441 |     allow_fill:
442 |         If filling with missing values is allowed. In that case, ``-1`` in ``indices`` will be filled with
443 |         ``fill_value``.
444 |     fill_value:
445 |         Fill-value in case ``allow_fill`` is set.
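
    Examples
    --------
    A small sketch (the runs ``[1, 1, 2, 2]``) with filling enabled:

    >>> import numpy as np
    >>> d, p = take(np.array([1, 2]), np.array([2, 4]), np.array([0, 3, -1]), True, -9)
    >>> list(d), list(p)
    ([1, 2, -9], [1, 2, 3])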
446 | 
447 |     Returns
448 |     -------
449 |     data:
450 |         Data at start of each run.
451 |     positions:
452 |         End positions of runs.
453 |     """
454 |     length = get_len(positions)
455 |     indices = indices.copy()
456 | 
457 |     if (length == 0) and ((np.any(indices != -1) and allow_fill) or not allow_fill):
458 |         raise IndexError("cannot do a non-empty take")
459 | 
460 |     if allow_fill:
461 |         out_of_bounds_mask = indices < -1
462 |         if np.any(out_of_bounds_mask):
463 |             raise ValueError(f"{indices[out_of_bounds_mask][0]}")
464 |         min_idx_allowed = -1
465 |     else:
466 |         indices[indices < 0] += length
467 |         min_idx_allowed = 0
468 | 
469 |     out_of_bounds_mask = (indices < min_idx_allowed) | (indices >= length)
470 |     if np.any(out_of_bounds_mask):
471 |         raise IndexError(f"{indices[out_of_bounds_mask][0]} out of bounds")
472 | 
473 |     result_data_idx, result_positions = _take_kernel(positions, indices)
474 | 
475 |     result_data_mask = result_data_idx != -1
476 |     result_data = np.empty(len(result_data_idx), dtype=data.dtype)
477 |     result_data[result_data_mask] = data[result_data_idx[result_data_mask]]
478 |     if np.any(~result_data_mask):
479 |         result_data[~result_data_mask] = fill_value
480 | 
481 |     return recompress(result_data, result_positions)
482 | 
483 | 
484 | @numba.jit((numba.int64[:], numba.int64[:]), nopython=True, cache=True, nogil=True)
485 | def _extend_positions_kernel(
486 |     positions1: np.ndarray, positions2: np.ndarray
487 | ) -> np.ndarray:
488 |     n1 = len(positions1)
489 |     n2 = len(positions2)
490 | 
491 |     # pre-allocate output buffers
492 |     result = np.empty(n1 + n2, dtype=POSITIONS_DTYPE)
493 | 
494 |     i_out = 0
495 |     i1 = 0
496 |     i2 = 0
497 | 
498 |     while (i1 < n1) and (i2 < n2):
499 |         x1 = positions1[i1]
500 |         x2 = positions2[i2]
501 | 
502 |         if x1 == x2:
503 |             result[i_out] = x1
504 |             i1 += 1
505 |             i2 += 1
506 |         elif x1 < x2:
507 |             result[i_out] = x1
508 |             i1 += 1
509 |         else:
510 |             # x2 < x1
511 |             result[i_out] = x2
512 |             i2 += 1
513 | 
514 |         i_out += 1
515 | 
516 |     while i1 < n1:
517 |         result[i_out] = positions1[i1]
518 |         i1 += 1
519 |         i_out += 1
520 | 
521 |     while i2 < n2:
522 |         result[i_out] = positions2[i2]
523 |         i2 += 1
524 |         i_out += 1
525 | 
526 |     # return clean-cut output
527 |     return result[:i_out].copy()
528 | 
529 | 
530 | def extend_positions(positions1: np.ndarray, positions2: np.ndarray) -> np.ndarray:
531 |     """
532 |     Create union of two position arrays.
533 | 
534 |     Parameters
535 |     ----------
536 |     positions1
537 |         First position array.
538 |     positions2
539 |         Second position array.
540 | 
541 |     Returns
542 |     -------
543 |     extended_positions
544 |         Sorted position array that contains all entries from input arrays (without duplicates).
545 |     """
546 |     return _extend_positions_kernel(positions1, positions2)
547 | 
548 | 
549 | @numba.jit(nopython=True, cache=True, nogil=True)
550 | def _extend_data_kernel(
551 |     data: np.ndarray, positions: np.ndarray, extended_positions: np.ndarray
552 | ) -> np.ndarray:
553 |     n = extended_positions.shape[0]
554 |     extended_array = np.empty(n, dtype=data.dtype)
555 | 
556 |     k = 0  # current index for data/positions
557 |     for i in range(n):
558 |         if extended_positions[i] > positions[k]:
559 |             k += 1
560 |         extended_array[i] = data[k]
561 | 
562 |     return extended_array
563 | 
564 | 
565 | def extend_data(
566 |     data: np.ndarray, positions: np.ndarray, extended_positions: np.ndarray
567 | ) -> np.ndarray:
568 |     """
569 |     Extend data array to match new positions.
570 | 
571 |     Parameters
572 |     ----------
573 |     data
574 |         Data at start of each run.
575 |     positions
576 |         End positions of runs.
577 |     extended_positions
578 |         Extended position array (superset of ``positions``). See :func:`extend_positions`.
579 | 
580 |     Returns
581 |     -------
582 |     extended_data
583 |         Extended data array.
584 |     """
585 |     return _extend_data_kernel(data, positions, extended_positions)
586 | 
--------------------------------------------------------------------------------
/rle_array/_slicing.py:
--------------------------------------------------------------------------------
1 | """
2 | Helpers that allow us to deal with Python slicing.
3 | 
4 | The issues with Python slicing are:
5 | 
6 | - ``slice`` type:
7 |   - the types in ``slice`` are completely unchecked (can even be a string or any user-provided type)
8 |   - the consistency of the values in ``slice`` is unchecked
9 |   - there is no information about the container size (which makes consistency checks more complicated)
10 | 
11 | - ``slice.step`` value:
12 |   - ``step`` has the implicit default 1
13 |   - there can be forward and backward slices depending on the ``step`` value
14 |   - there can be step sizes other than 1
15 | 
16 | - ``slice.start`` and ``slice.stop`` values:
17 |   - the implicit defaults of ``start`` and ``stop`` depend on ``step`` (is it positive or negative?)
18 |   - ``start`` and ``stop`` can be negative (aka "from the end")
19 |   - ``start`` and ``stop`` can over/underflow the container
20 | 
21 | We do not want to deal with all these edge cases in every code snippet that deals with slicing, so we introduce
22 | :class:`NormalizedSlice`, which solves these issues in a central place.
23 | """
24 | from typing import Optional
25 | 
26 | import numpy as np
27 | 
28 | 
29 | class NormalizedSlice:
30 |     """
31 |     A normalized slice.
32 | 
33 |     .. important::
34 | 
35 |         Do not try to construct this class by hand. Use :func:`NormalizedSlice.from_slice` instead!
36 | 
37 |     Parameters
38 |     ----------
39 |     start
40 |         First absolute index in the container (inclusive start). Never negative.
41 |     stop
42 |         Last absolute index not being part of the slice (exclusive end). Counted from the container start. Can be
43 |         negative for reversed slices (aka ``step < 0``). Must be normalized so that ``abs(stop - start) % step == 0``.
44 |         For forward slices (``step > 0``), this must be greater than ``start``. For backward slices (``step < 0``) this
45 |         must be less than ``start``. For empty slices (``start = stop``), ``start``, ``stop`` and ``step`` have the
46 |         fixed values 0, 0 and 1.
47 |     step
48 |         Step size. Must not be ``0``.
49 |     container_length
50 |         Size of the container this slice applies to. Must not be negative. For empty containers
51 |         (``container_length = 0``), ``start``, ``stop`` and ``step`` have the fixed values 0, 0 and 1.
52 |     """
53 | 
54 |     def __init__(self, start: int, stop: int, step: int, container_length: int):
55 |         if not isinstance(start, int):
56 |             raise TypeError(f"start must be int but is {type(start).__name__}")
57 |         if not isinstance(stop, int):
58 |             raise TypeError(f"stop must be int but is {type(stop).__name__}")
59 |         if not isinstance(step, int):
60 |             raise TypeError(f"step must be int but is {type(step).__name__}")
61 |         if not isinstance(container_length, int):
62 |             raise TypeError(
63 |                 f"container_length must be int but is {type(container_length).__name__}"
64 |             )
65 | 
66 |         self._start = start
67 |         self._stop = stop
68 |         self._step = step
69 |         self._container_length = container_length
70 | 
71 |         self._verify()
72 | 
73 |     def _verify(self) -> None:
74 |         """
75 |         Verify integrity.
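
        Raises
        ------
        ValueError: If any of the invariants described in the class docstring is violated.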
76 |         """
77 |         if self.container_length < 0:
78 |             raise ValueError(
79 |                 f"container_length ({self.container_length}) must be greater or equal to zero"
80 |             )
81 |         elif self.container_length == 0:
82 |             self._verify_container_empty()
83 |         else:
84 |             self._verify_container_not_empty()
85 | 
86 |     def _verify_container_empty(self) -> None:
87 |         """
88 |         Verify integrity in case the container is empty (``container_length = 0``).
89 |         """
90 |         # empty container => special values required
91 |         if self.start != 0:
92 |             raise ValueError(
93 |                 f"for empty containers, start must be 0 but is {self.start}"
94 |             )
95 | 
96 |         if self.stop != 0:
97 |             raise ValueError(f"for empty containers, stop must be 0 but is {self.stop}")
98 | 
99 |         if self.step != 1:
100 |             raise ValueError(f"for empty containers, step must be 1 but is {self.step}")
101 | 
102 |     def _verify_container_not_empty(self) -> None:
103 |         """
104 |         Verify integrity in case the container is not empty (``container_length > 0``).
105 |         """
106 |         if (self.start < 0) or (self.start >= self.container_length):
107 |             raise ValueError(
108 |                 f"start ({self.start}) must be in [0,{self.container_length}) but is not"
109 |             )
110 | 
111 |         if (self.stop < -abs(self.step)) or (
112 |             self.stop >= self.container_length + abs(self.step)
113 |         ):
114 |             raise ValueError(
115 |                 f"stop ({self.stop}) must be in [{-abs(self.step)},{self.container_length + abs(self.step)}) but is not"
116 |             )
117 | 
118 |         if self.start == self.stop:
119 |             # empty slice
120 |             if self.start != 0:
121 |                 raise ValueError(
122 |                     f"for empty slices, start and stop must be 0 but are {self.start}"
123 |                 )
124 |             if self.step != 1:
125 |                 raise ValueError(f"for empty slices, step must be 1 but is {self.step}")
126 |         else:
127 |             # non-empty slice
128 |             if self.step == 0:
129 |                 raise ValueError("step cannot be zero")
130 |             elif self.step > 0:
131 |                 # forward slice
132 |                 if self.start > self.stop:
133 |                     raise ValueError(
134 |                         "for forward slices, stop must be greater or equal to start"
135 |                     )
136 |             else:
137 |                 # backward slice
138 |                 if self.stop > self.start:
139 |                     raise ValueError(
140 |                         "for backward slices, start must be greater or equal to stop"
141 |                     )
142 | 
143 |             if abs(self.start - self.stop) % abs(self.step) != 0:
144 |                 raise ValueError(
145 |                     "The distance between start and stop must be divisible by the step size"
146 |                 )
147 | 
148 |     @property
149 |     def start(self) -> int:
150 |         """
151 |         Start index of the slice. Inclusive start.
152 |         """
153 |         return self._start
154 | 
155 |     @property
156 |     def stop(self) -> int:
157 |         """
158 |         Stop index of the slice. Exclusive end.
159 |         """
160 |         return self._stop
161 | 
162 |     @property
163 |     def step(self) -> int:
164 |         """
165 |         Step width.
166 |         """
167 |         return self._step
168 | 
169 |     @property
170 |     def container_length(self) -> int:
171 |         """
172 |         Length of the container.
173 |         """
174 |         return self._container_length
175 | 
176 |     def __repr__(self) -> str:
177 |         return (
178 |             f"{type(self).__name__}(start={self.start}, stop={self.stop}, step={self.step}, container_length="
179 |             f"{self.container_length})"
180 |         )
181 | 
182 |     def __len__(self) -> int:
183 |         return self._calc_len(start=self.start, stop=self.stop, step=self.step)
184 | 
185 |     @classmethod
186 |     def _calc_len(cls, start: int, stop: int, step: int) -> int:
187 |         """
188 |         Calculate slice length.
189 | 
190 |         Parameters
191 |         ----------
192 |         start
193 |             Inclusive start index.
194 |         stop
195 |             Exclusive stop index.
196 |         step
197 |             Step width.
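
        Returns
        -------
        length
            Number of elements addressed by the slice, e.g.:

            >>> NormalizedSlice._calc_len(start=0, stop=10, step=3)
            4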
198 | """ 199 | delta = abs(stop - start) 200 | steps = delta // abs(step) 201 | if delta % abs(step) != 0: 202 | steps += 1 203 | return steps 204 | 205 | @classmethod 206 | def _check_and_prepare_slice(cls, s: Optional[slice]) -> slice: 207 | """ 208 | Check and prepare input slice for conversion. 209 | """ 210 | if s is None: 211 | s = slice(None, None, None) 212 | 213 | if not isinstance(s, slice): 214 | raise TypeError(f"slice must be a slice but is {type(s).__name__}") 215 | 216 | if (s.start is not None) and not isinstance(s.start, (int, np.int64)): 217 | raise TypeError( 218 | f"slice start must be int or None but is {type(s.start).__name__}" 219 | ) 220 | start = None if s.start is None else int(s.start) 221 | 222 | if (s.stop is not None) and not isinstance(s.stop, (int, np.int64)): 223 | raise TypeError( 224 | f"slice stop must be int or None but is {type(s.stop).__name__}" 225 | ) 226 | stop = None if s.stop is None else int(s.stop) 227 | 228 | if (s.step is not None) and not isinstance(s.step, (int, np.int64)): 229 | raise TypeError( 230 | f"slice step must be int or None but is {type(s.step).__name__}" 231 | ) 232 | if s.step == 0: 233 | raise ValueError("slice step cannot be zero") 234 | step = None if s.step is None else int(s.step) 235 | 236 | return slice(start, stop, step) 237 | 238 | @classmethod 239 | def from_slice(cls, container_length: int, s: Optional[slice]) -> "NormalizedSlice": 240 | """ 241 | Create a new :class:`NormalizedSlice` from a given Python ``slice`` and container length. 242 | 243 | Parameters 244 | ---------- 245 | container_length 246 | Non-negative container length. 247 | s 248 | Slice or ``None`` (for "take all"). 249 | 250 | Raises 251 | ------ 252 | TypeError: If ``s`` is not ``None`` and not a ``slice`` or any of the arguments for ``slice`` are neither 253 | ``None`` nor an integer. 254 | ValueError: Illegal ``slice`` values or ``container_length``. 
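
Example
-------
>>> # ``stop`` gets normalized so that ``abs(stop - start)`` is divisible by ``step``
>>> NormalizedSlice.from_slice(10, slice(2, 9, 3))
NormalizedSlice(start=2, stop=11, step=3, container_length=10)

>>> # backward slices use the defaults ``start = length - 1`` and ``stop = -1``
>>> NormalizedSlice.from_slice(10, slice(None, None, -1))
NormalizedSlice(start=9, stop=-1, step=-1, container_length=10)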
255 | """
256 | s2 = cls._check_and_prepare_slice(s)
257 | 
258 | if not isinstance(container_length, (int, np.int64)):
259 | raise TypeError(
260 | f"container_length must be an int but is {type(container_length).__name__}"
261 | )
262 | if container_length < 0:
263 | raise ValueError("container_length cannot be negative")
264 | 
265 | if container_length == 0:
266 | return cls(start=0, stop=0, step=1, container_length=0)
267 | 
268 | container_length = int(container_length)
269 | 
270 | default_start, default_stop = 0, container_length
271 | 
272 | if s2.step is not None:
273 | step = s2.step
274 | if step < 0:
275 | default_start, default_stop = default_stop - 1, default_start - 1
276 | else:
277 | step = 1
278 | 
279 | def limit(x: int) -> int:
280 | a = min(default_start, default_stop)
281 | b = max(default_start, default_stop)
282 | return max(a, min(b, x))
283 | 
284 | if s2.start is not None:
285 | if s2.start < 0:
286 | start = limit(container_length + s2.start)
287 | else:
288 | start = limit(s2.start)
289 | else:
290 | start = default_start
291 | 
292 | if s2.stop is not None:
293 | if s2.stop < 0:
294 | stop = limit(container_length + s2.stop)
295 | else:
296 | stop = limit(s2.stop)
297 | else:
298 | stop = default_stop
299 | 
300 | if step > 0:
301 | if stop < start:
302 | stop = start
303 | else:
304 | if stop > start:
305 | stop = start
306 | 
307 | if start == stop:
308 | return cls(start=0, stop=0, step=1, container_length=container_length)
309 | 
310 | # re-adjusting the range to be modulo `step`
311 | stop = start + step * cls._calc_len(start=start, stop=stop, step=step)
312 | 
313 | return cls(start=start, stop=stop, step=step, container_length=container_length)
314 | 
315 | def project(self, child: "NormalizedSlice") -> "NormalizedSlice":
316 | """
317 | Project a slice.
318 | 
319 | Given a parent slice (``self``) which is applied first, calculate what the combined slice would look like so it
320 | can be applied to the original data.
321 | 
322 | Parameters
323 | ----------
324 | child
325 | Second slice to apply.
326 | 
327 | Raises
328 | ------
329 | TypeError: If ``child`` is not a ``NormalizedSlice``.
330 | ValueError: If ``child.container_length`` is not the length of ``self``.
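
Returns
-------
NormalizedSlice
    The combined slice, expressed in coordinates of the original container.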
331 | 
332 | Example
333 | -------
334 | >>> # given some unknown data:
335 | >>> data = list(range(100))
336 | 
337 | >>> # and two slices:
338 | >>> parent = slice(10, -8, 2)
339 | >>> child = slice(-20, -1, -1)
340 | 
341 | >>> # and the application of both slices
342 | >>> expected = data[parent][child]
343 | 
344 | >>> # construct a slice that does both steps at once
345 | >>> from rle_array._slicing import NormalizedSlice
346 | >>> parent_normalized = NormalizedSlice.from_slice(len(data), parent)
347 | >>> child_normalized = NormalizedSlice.from_slice(len(data[parent]), child)
348 | >>> projected = parent_normalized.project(child_normalized).to_slice()
349 | >>> actual = data[projected]
350 | >>> assert actual == expected
351 | """
352 | if not isinstance(child, NormalizedSlice):
353 | raise TypeError(
354 | f"child must be NormalizedSlice but is {type(child).__name__}"
355 | )
356 | if child.container_length != len(self):
357 | raise ValueError(
358 | f"container_length of child ({child.container_length}) must be length of parent ({len(self)})"
359 | )
360 | 
361 | start = self.start + child.start * self.step
362 | stop = self.start + child.stop * self.step
363 | step = self.step * child.step
364 | 
365 | if start == stop:
366 | return type(self)(
367 | start=0, stop=0, step=1, container_length=self.container_length
368 | )
369 | 
370 | return type(self)(
371 | start=start, stop=stop, step=step, container_length=self.container_length
372 | )
373 | 
374 | def to_slice(self) -> Optional[slice]:
375 | """
376 | Convert :class:`NormalizedSlice` back to a slice.
377 | 
378 | Returns ``None`` if no slicing is applied (e.g. the whole container with ``step=1`` is taken).
379 | """
380 | start: Optional[int] = self.start
381 | stop: Optional[int] = self.stop
382 | step: Optional[int] = self.step
383 | 
384 | if self.step > 0:
385 | # forwards
386 | if self.start <= 0:
387 | start = None
388 | if self.stop >= self.container_length:
389 | stop = None
390 | if self.step == 1:
391 | step = None
392 | else:
393 | # backward
394 | if self.start >= self.container_length - 1:
395 | start = None
396 | if self.stop < 0:
397 | stop = None
398 | 
399 | if (start is None) and (stop is None) and (step is None):
400 | return None
401 | else:
402 | return slice(start, stop, step)
403 | 
-------------------------------------------------------------------------------- /rle_array/array.py: --------------------------------------------------------------------------------
1 | import logging
2 | import operator
3 | import warnings
4 | from collections import namedtuple
5 | from copy import copy
6 | from typing import Any, Callable, Dict, Iterator, Optional, Sequence, Tuple, Union
7 | from weakref import WeakSet, ref
8 | 
9 | import numpy as np
10 | import pandas as pd
11 | from pandas.api.extensions import ExtensionArray
12 | from pandas.arrays import BooleanArray, IntegerArray, StringArray
13 | from pandas.core import ops
14 | from pandas.core.algorithms import factorize, unique
15 | from pandas.core.arrays.boolean import coerce_to_array as coerce_to_boolean_array
16 | from pandas.core.dtypes.common import is_array_like
17 | from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
18 | from pandas.core.dtypes.inference import is_scalar
19 | from pandas.core.dtypes.missing import isna
20 | from pandas.errors import PerformanceWarning
21 | 
22 | from ._algorithms import (
23 | calc_lengths,
24 | compress,
25 | concat,
26 | decompress,
27 | dropna,
28 | extend_data,
29 | extend_positions,
30 | find_single_index,
31 | 
find_slice,
32 | gen_iterator,
33 | get_len,
34 | recompress,
35 | take,
36 | )
37 | from ._slicing import NormalizedSlice
38 | from .dtype import RLEDtype
39 | from .types import POSITIONS_DTYPE
40 | 
41 | _logger = logging.getLogger(__name__)
42 | 
43 | 
44 | def _normalize_arraylike_indexing(arr: Any, length: int) -> np.ndarray:
45 | """
46 | Normalize array-like index arguments for ``__getitem__`` and ``__setitem__``.
47 | 
48 | This is required since pandas can pass us many different types with potentially nullable data.
49 | 
50 | Parameters
51 | ----------
52 | arr
53 | Index argument passed to ``__getitem__`` and ``__setitem__`` if arraylike.
54 | length
55 | Array length.
56 | """
57 | if isinstance(arr, BooleanArray):
58 | result = np.asarray(arr.fillna(False), dtype=bool)
59 | elif isinstance(arr, IntegerArray):
60 | try:
61 | return np.asarray(arr, dtype=int)
62 | except ValueError:
63 | raise ValueError(
64 | "Cannot index with an integer indexer containing NA values"
65 | )
66 | elif isinstance(arr, RLEArray):
67 | result = np.asarray(arr, dtype=arr.dtype._dtype)
68 | elif isinstance(arr, list):
69 | if any((pd.isna(x) for x in arr)):
70 | raise ValueError(
71 | "Cannot index with an integer indexer containing NA values"
72 | )
73 | result = np.asarray(arr)
74 | else:
75 | result = np.asarray(arr)
76 | 
77 | if (result.dtype == np.bool_) and (len(result) != length):
78 | raise IndexError("Indexer has wrong length")
79 | 
80 | return result
81 | 
82 | 
83 | class _ViewAnchor:
84 | """
85 | Anchor object that references an RLEArray; needed because RLEArray itself is not hashable and therefore cannot be stored in a ``WeakSet`` directly.
86 | """
87 | 
88 | def __init__(self, array: "RLEArray") -> None:
89 | self.array = ref(array)
90 | 
91 | def __hash__(self) -> int:
92 | return id(self.array)
93 | 
94 | 
95 | class _ViewMaster:
96 | """
97 | Collection of all views to an array.
98 | 
99 | This tracks the original data as well as all views.
100 | """
101 | 
102 | def __init__(self, data: np.ndarray, positions: np.ndarray):
103 | self.data = data
104 | self.positions = positions
105 | self.views: WeakSet[_ViewAnchor] = WeakSet()
106 | 
107 | @classmethod
108 | def register_first(cls, array: "RLEArray") -> "_Projection":
109 | """
110 | Register array with new master.
111 | 
112 | The array must not have a view master yet!
113 | """
114 | assert getattr(array, "_projection", None) is None
115 | 
116 | projection = _Projection(
117 | projection_slice=None,
118 | master=cls(data=array._data, positions=array._positions),
119 | )
120 | projection.master.views.add(array._view_anchor)
121 | return projection
122 | 
123 | def register_change(
124 | self, array: "RLEArray", projection_slice: Optional[slice]
125 | ) -> None:
126 | """
127 | Re-register array with new view-master.
128 | 
129 | The array must currently be registered with a single, orphan master (i.e. a master that only tracks this array)!
130 | """
131 | # ensure the array is only registered with another orphan master
132 | assert array._projection is not None
133 | assert array._projection.projection_slice is None
134 | assert array._projection.master is not self
135 | assert len(array._projection.master.views) == 1
136 | assert array._view_anchor not in self.views
137 | 
138 | array._projection = _Projection(projection_slice=projection_slice, master=self)
139 | self.views.add(array._view_anchor)
140 | 
141 | def modify(self, data: np.ndarray, positions: np.ndarray) -> None:
142 | """
143 | Modify the original (unprojected) data and propagate the change to all views.
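
Parameters
----------
data
    New (unprojected) run data.
positions
    New (unprojected) run end-positions.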
144 | """
145 | self.data = data
146 | self.positions = positions
147 | 
148 | for view in self.views:
149 | array = view.array()
150 | assert array is not None
151 | assert array._projection is not None
152 | assert array._projection.master is self
153 | 
154 | if array._projection.projection_slice is not None:
155 | data2, positions2 = find_slice(
156 | data=self.data,
157 | positions=self.positions,
158 | s=array._projection.projection_slice,
159 | )
160 | else:
161 | data2, positions2 = self.data, self.positions
162 | 
163 | array._data = data2
164 | array._positions = positions2
165 | 
166 | 
167 | _Projection = namedtuple("_Projection", ["master", "projection_slice"])
168 | 
169 | 
170 | class RLEArray(ExtensionArray):
171 | """
172 | Run-length encoded array.
173 | 
174 | Parameters
175 | ----------
176 | data
177 | Data for each run. Must be one-dimensional. All Pandas-supported dtypes are supported.
178 | positions
179 | End-positions for each run. Must be one-dimensional and must have same length as ``data``. dtype must be
180 | ``POSITIONS_DTYPE``.
181 | """
182 | 
183 | _HANDLED_TYPES = tuple(
184 | t for types in np.sctypes.values() for t in types if t is not object
185 | ) + (np.ndarray, list, tuple, int, float, complex)
186 | 
187 | # For comparisons, so that numpy uses our implementation.
188 | __array_priority__ = 1000
189 | 
190 | def __init__(self, data: np.ndarray, positions: np.ndarray):
191 | if not isinstance(data, np.ndarray):
192 | raise TypeError(f"data must be an ndarray but is {type(data).__name__}")
193 | if not isinstance(positions, np.ndarray):
194 | raise TypeError(
195 | f"positions must be an ndarray but is {type(positions).__name__}"
196 | )
197 | if data.ndim != 1:
198 | raise ValueError(
199 | f"data must be a 1-dimensional ndarray but has {data.ndim} dimensions"
200 | )
201 | if positions.ndim != 1:
202 | raise ValueError(
203 | f"positions must be a 1-dimensional ndarray but has {positions.ndim} dimensions"
204 | )
205 | if positions.dtype != POSITIONS_DTYPE:
206 | raise ValueError(
207 | f"positions must have dtype {POSITIONS_DTYPE.__name__} but has {positions.dtype}"
208 | )
209 | if len(data) != len(positions):
210 | raise ValueError(
211 | f"data and positions must have same length but have {len(data)} and {len(positions)}"
212 | )
213 | if np.any(positions[1:] <= positions[:-1]):
214 | raise ValueError("positions must be strictly increasing")
215 | 
216 | _logger.debug(
217 | "RLEArray.__init__(data=%s(len=%r, dtype=%r), positions=%s(len=%r, dtype=%r))",
218 | type(data).__name__,
219 | len(data),
220 | data.dtype,
221 | type(positions).__name__,
222 | len(positions),
223 | positions.dtype,
224 | )
225 | 
226 | self._dtype = RLEDtype(data.dtype)
227 | self._data = data
228 | self._positions = positions
229 | self._setup_view_system()
230 | 
231 | def _setup_view_system(self) -> None:
232 | """
233 | Set up all view-related tracking parts.
234 | 
235 | Must be called after initialization or unpickling.
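Pickling drops the view tracking state (see ``__getstate__``), since weak references cannot be
serialized, so ``__setstate__`` has to re-create it via this method.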
236 | """
237 | self._view_anchor = _ViewAnchor(self)
238 | self._projection = _ViewMaster.register_first(self)
239 | 
240 | def __getstate__(self) -> Dict[str, Any]:
241 | state = copy(self.__dict__)
242 | del state["_view_anchor"]
243 | del state["_projection"]
244 | return state
245 | 
246 | def __setstate__(self, state: Dict[str, Any]) -> None:
247 | self.__dict__.update(state)
248 | self._setup_view_system()
249 | 
250 | @property
251 | def _lengths(self) -> Any:
252 | return calc_lengths(self._positions)
253 | 
254 | @classmethod
255 | def _from_sequence(
256 | cls, scalars: Any, dtype: Any = None, copy: bool = False
257 | ) -> "RLEArray":
258 | _logger.debug(
259 | "RLEArray._from_sequence(scalars=%s(...), dtype=%r, copy=%r)",
260 | type(scalars).__name__,
261 | dtype,
262 | copy,
263 | )
264 | if isinstance(dtype, RLEDtype):
265 | dtype = dtype._dtype
266 | 
267 | if isinstance(scalars, np.ndarray):
268 | if (dtype is not None) and (dtype != scalars.dtype):
269 | # some cast required
270 | if dtype == np.bool_:
271 | # bool case
272 | scalars, mask = coerce_to_boolean_array(scalars)
273 | if mask.any():
274 | raise TypeError("Masked booleans are not supported")
275 | else:
276 | # catch-them-all case
277 | # TODO: get rid of this unsafe cast
278 | scalars = scalars.astype(dtype)
279 | else:
280 | scalars = np.asarray(scalars, dtype=dtype)
281 | data, positions = compress(scalars)
282 | return RLEArray(data=data, positions=positions)
283 | 
284 | @classmethod
285 | def _from_factorized(cls, data: Any, original: "RLEArray") -> "RLEArray":
286 | _logger.debug("RLEArray._from_factorized(...)")
287 | return cls._from_sequence(np.asarray(data, dtype=original.dtype._dtype))
288 | 
289 | def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
290 | # decompressing version of `_values_for_factorize` which is not only required for `factorize` but also for other
291 | # things like `pandas.core.util.hashing.hash_array`
292 | return decompress(self._data, self._positions), self.dtype.na_value
293 | 
294 | def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "RLEArray"]:
295 | # optimized version of `ExtensionArray.factorize`:
296 | # 1. replace `_values_for_factorize` with a version that does not decompress the data
297 | # 2. passing compressed data to `factorize` (instead of `_factorize_array` because that does not handle NA
298 | # values nicely)
299 | # 3. decompress `codes`
300 | arr = self._data
301 | 
302 | codes, uniques = factorize(arr, na_sentinel=na_sentinel)
303 | 
304 | uniques = self._from_factorized(uniques, self)
305 | codes = decompress(codes, self._positions)
306 | return codes, uniques
307 | 
308 | def __getitem__(self, arr: Any) -> Any:
309 | _logger.debug("RLEArray.__getitem__(arr=%s(...))", type(arr).__name__)
310 | if isinstance(arr, tuple):
311 | # This is for example called by Pandas as values[:, None] to prepare the data for the cythonized
312 | # aggregation. Since we do not want to support aggregation by decompressing the data, it is OK to not
313 | # implement this.
314 | raise NotImplementedError( 315 | "__getitem__ does currently only work w/ a single parameter" 316 | ) 317 | 318 | if is_array_like(arr) or isinstance(arr, list): 319 | arr = _normalize_arraylike_indexing(arr, len(self)) 320 | 321 | if arr.dtype == np.bool_: 322 | arr = np.arange(len(self))[arr] 323 | else: 324 | arr = arr.astype(int) 325 | 326 | if len(arr) == 0: 327 | return RLEArray(data=self._data[[]], positions=self._positions[[]]) 328 | 329 | arr[arr < 0] += len(self) 330 | 331 | data, positions = take( 332 | data=self._data, 333 | positions=self._positions, 334 | indices=arr, 335 | allow_fill=False, 336 | fill_value=self.dtype.na_value, 337 | ) 338 | return RLEArray(data=data, positions=positions) 339 | elif isinstance(arr, slice): 340 | data, positions = find_slice(self._data, self._positions, arr) 341 | parent_normalized = NormalizedSlice.from_slice( 342 | get_len(self._projection.master.positions), 343 | self._projection.projection_slice, 344 | ) 345 | child_normalized = NormalizedSlice.from_slice(len(self), arr) 346 | subslice = parent_normalized.project(child_normalized).to_slice() 347 | result = RLEArray(data=data, positions=positions) 348 | self._projection.master.register_change(result, subslice) 349 | return result 350 | else: 351 | if arr < 0: 352 | arr = arr + len(self) 353 | return find_single_index(self._data, self._positions, arr) 354 | 355 | def __setitem__(self, index: Any, data: Any) -> None: 356 | _logger.debug("RLEArray.__setitem__(...)") 357 | 358 | # get master data 359 | orig = decompress( 360 | data=self._projection.master.data, 361 | positions=self._projection.master.positions, 362 | ) 363 | 364 | # get our view 365 | if self._projection.projection_slice is not None: 366 | sub = orig[self._projection.projection_slice] 367 | else: 368 | sub = orig 369 | 370 | # prepare index 371 | if is_array_like(index) or isinstance(index, list): 372 | index = _normalize_arraylike_indexing(index, len(self)) 373 | 374 | # modify master data through view 375 | sub[index] = data 376 | 377 | # commit to all views (including self) 378 | data, positions = compress(orig) 379 | self._projection.master.modify(data, positions) 380 | 381 | def __len__(self) -> int: 382 | _logger.debug("RLEArray.__len__()") 383 | return get_len(self._positions) 384 | 385 | @property 386 | def dtype(self) -> RLEDtype: 387 | _logger.debug("RLEArray.dtype") 388 | return self._dtype 389 | 390 | @property 391 | def nbytes(self) -> int: 392 | _logger.debug("RLEArray.nbytes") 393 | return int(self._data.nbytes) + int(self._positions.nbytes) 394 | 395 | def isna(self) -> "RLEArray": 396 | _logger.debug("RLEArray.isna()") 397 | return RLEArray(data=pd.isna(self._data), positions=self._positions.copy()) 398 | 399 | def take( 400 | self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None 401 | ) -> "RLEArray": 402 | _logger.debug( 403 | "RLEArray.take(indices=%s(len=%s), allow_fill=%r, fill_value=%r)", 404 | type(indices).__name__, 405 | len(indices), 406 | allow_fill, 407 | fill_value, 408 | ) 409 | if fill_value is None: 410 | fill_value = self.dtype.na_value 411 | 412 | indices = np.asarray(indices) 413 | 414 | data, positions = take( 415 | self._data, self._positions, indices, allow_fill, fill_value 416 | ) 417 | return RLEArray(data=data, positions=positions) 418 | 419 | def copy(self) -> "RLEArray": 420 | _logger.debug("RLEArray.copy()") 421 | return RLEArray(data=self._data.copy(), positions=self._positions.copy()) 422 | 423 | @classmethod 424 | def _concat_same_type(cls, 
to_concat: Sequence["RLEArray"]) -> "RLEArray": 425 | t_to_concat = type(to_concat) 426 | to_concat = list(to_concat) 427 | _logger.debug( 428 | "RLEArray._concat_same_type(to_concat=%s(len=%i))", 429 | t_to_concat.__name__, 430 | len(to_concat), 431 | ) 432 | data, positions = concat( 433 | [s._data for s in to_concat], [s._positions for s in to_concat] 434 | ) 435 | return RLEArray(data=data, positions=positions) 436 | 437 | def __array__(self, dtype: Any = None) -> Any: 438 | _logger.debug("RLEArray.__array__(dtype=%r)", dtype) 439 | warnings.warn("performance: __array__ blows up data", PerformanceWarning) 440 | if dtype is None: 441 | dtype = self.dtype._dtype 442 | 443 | return decompress(self._data, self._positions, dtype) 444 | 445 | def astype(self, dtype: Any, copy: bool = True, casting: str = "unsafe") -> Any: 446 | _logger.debug("RLEArray.astype(dtype=%r, copy=%r)", dtype, copy) 447 | if isinstance(dtype, RLEDtype): 448 | if (not copy) and (dtype == self.dtype): 449 | return self 450 | return RLEArray( 451 | data=self._data.astype(dtype._dtype, casting=casting), 452 | positions=self._positions.copy(), 453 | ) 454 | if isinstance(dtype, pd.StringDtype): 455 | # TODO: fast-path 456 | return StringArray._from_sequence([str(x) for x in self]) 457 | 458 | if casting != "unsafe": 459 | return np.array(self, copy=copy).astype(dtype=dtype, casting=casting) 460 | else: 461 | return np.array(self, dtype=dtype, copy=copy) 462 | 463 | def _get_reduce_data(self, skipna: bool) -> Any: 464 | data = self._data 465 | if skipna: 466 | data = data[pd.notnull(data)] 467 | return data 468 | 469 | def _get_reduce_data_len(self, skipna: bool) -> Any: 470 | data = self._data 471 | lengths = self._lengths 472 | if skipna: 473 | mask = pd.notnull(data) 474 | data = data[mask] 475 | lengths = lengths[mask] 476 | return data, lengths 477 | 478 | def all(self, axis: Optional[int] = 0, out: Any = None) -> bool: 479 | _logger.debug("RLEArray.all()") 480 | if (axis is not None) and (axis != 0): 481 | raise NotImplementedError("Only axis=0 is supported.") 482 | if out is not None: 483 | raise NotImplementedError("out parameter is not supported.") 484 | 485 | return bool(np.all(self._data)) 486 | 487 | def any(self, axis: Optional[int] = 0, out: Any = None) -> bool: 488 | _logger.debug("RLEArray.any(axis=%r, out=%r)", axis, out) 489 | if (axis is not None) and (axis != 0): 490 | raise NotImplementedError("Only axis=0 is supported.") 491 | if out is not None: 492 | raise NotImplementedError("out parameter is not supported.") 493 | 494 | return bool(np.any(self._data)) 495 | 496 | def kurt(self, skipna: bool = True) -> Any: 497 | _logger.debug("RLEArray.kurt(skipna=%r)", skipna) 498 | # TODO: fast-path 499 | data = np.asarray(self) 500 | return pd.Series(data).kurt(skipna=skipna) 501 | 502 | def max(self, skipna: bool = True, axis: Optional[int] = 0, out: Any = None) -> Any: 503 | _logger.debug("RLEArray.max(skipna=%r)", skipna) 504 | if (axis is not None) and (axis != 0): 505 | raise NotImplementedError("Only axis=0 is supported.") 506 | if out is not None: 507 | raise NotImplementedError("out parameter is not supported.") 508 | 509 | data = self._get_reduce_data(skipna) 510 | if len(data): 511 | return np.max(data) 512 | else: 513 | return self.dtype.na_value 514 | 515 | def mean( 516 | self, 517 | skipna: bool = True, 518 | dtype: Optional[Any] = None, 519 | axis: Optional[int] = 0, 520 | out: Any = None, 521 | ) -> Any: 522 | _logger.debug("RLEArray.mean(skipna=%r)", skipna) 523 | if (axis is not None) and 
(axis != 0): 524 | raise NotImplementedError("Only axis=0 is supported.") 525 | if out is not None: 526 | raise NotImplementedError("out parameter is not supported.") 527 | if dtype is not None: 528 | raise NotImplementedError("dtype parameter is not supported.") 529 | 530 | data, lengths = self._get_reduce_data_len(skipna) 531 | n = lengths.sum() if skipna else len(self) 532 | if n == 0: 533 | return self.dtype.na_value 534 | else: 535 | return np.dot(data, lengths) / np.float64(n) 536 | 537 | def median( 538 | self, skipna: bool = True, axis: Optional[int] = 0, out: Any = None 539 | ) -> Any: 540 | _logger.debug("RLEArray.median(skipna=%r)", skipna) 541 | if (axis is not None) and (axis != 0): 542 | raise NotImplementedError("Only axis=0 is supported.") 543 | if out is not None: 544 | raise NotImplementedError("out parameter is not supported.") 545 | 546 | # TODO: fast-path 547 | data = np.asarray(self) 548 | if skipna: 549 | data = data[pd.notnull(data)] 550 | 551 | if len(data) == 0: 552 | return self.dtype.na_value 553 | else: 554 | return np.median(data) 555 | 556 | def min(self, skipna: bool = True, axis: Optional[int] = 0, out: Any = None) -> Any: 557 | _logger.debug("RLEArray.min(skipna=%r)", skipna) 558 | if (axis is not None) and (axis != 0): 559 | raise NotImplementedError("Only axis=0 is supported.") 560 | if out is not None: 561 | raise NotImplementedError("out parameter is not supported.") 562 | 563 | data = self._get_reduce_data(skipna) 564 | if len(data): 565 | return np.min(data) 566 | else: 567 | return self.dtype.na_value 568 | 569 | def prod( 570 | self, skipna: bool = True, axis: Optional[int] = 0, out: Any = None 571 | ) -> Any: 572 | _logger.debug("RLEArray.prod(skipna=%r)", skipna) 573 | if (axis is not None) and (axis != 0): 574 | raise NotImplementedError("Only axis=0 is supported.") 575 | if out is not None: 576 | raise NotImplementedError("out parameter is not supported.") 577 | 578 | data, lengths = self._get_reduce_data_len(skipna) 579 | return np.prod(np.power(data, lengths)) 580 | 581 | def skew(self, skipna: bool = True) -> Any: 582 | _logger.debug("RLEArray.skew(skipna=%r)", skipna) 583 | # TODO: fast-path 584 | data = np.asarray(self) 585 | return pd.Series(data).skew(skipna=skipna) 586 | 587 | def std( 588 | self, 589 | skipna: bool = True, 590 | ddof: int = 1, 591 | dtype: Optional[Any] = None, 592 | axis: Optional[int] = 0, 593 | out: Any = None, 594 | ) -> Any: 595 | _logger.debug("RLEArray.std(skipna=%r, ddof=%r)", skipna, ddof) 596 | if (axis is not None) and (axis != 0): 597 | raise NotImplementedError("Only axis=0 is supported.") 598 | if out is not None: 599 | raise NotImplementedError("out parameter is not supported.") 600 | if dtype is not None: 601 | raise NotImplementedError("dtype parameter is not supported.") 602 | 603 | # TODO: fast-path 604 | data = np.asarray(self).astype(dtype) 605 | # use pandas-style std, since numpy results in different results 606 | return pd.Series(data).std(skipna=skipna, ddof=ddof) 607 | 608 | def sum(self, skipna: bool = True, axis: Optional[int] = 0, out: Any = None) -> Any: 609 | _logger.debug("RLEArray.sum(skipna=%r)", skipna) 610 | if (axis is not None) and (axis != 0): 611 | raise NotImplementedError("Only axis=0 is supported.") 612 | if out is not None: 613 | raise NotImplementedError("out parameter is not supported.") 614 | 615 | data, lengths = self._get_reduce_data_len(skipna) 616 | return np.dot(data, lengths) 617 | 618 | def var( 619 | self, 620 | skipna: bool = True, 621 | ddof: int = 1, 622 | 
dtype: Optional[Any] = None, 623 | axis: Optional[int] = 0, 624 | out: Any = None, 625 | ) -> Any: 626 | _logger.debug("RLEArray.var(skipna=%r)", skipna) 627 | if (axis is not None) and (axis != 0): 628 | raise NotImplementedError("Only axis=0 is supported.") 629 | if out is not None: 630 | raise NotImplementedError("out parameter is not supported.") 631 | if dtype is not None: 632 | raise NotImplementedError("dtype parameter is not supported.") 633 | 634 | # TODO: fast-path 635 | data = np.asarray(self).astype(dtype) 636 | # use pandas-style var, since numpy results in different results 637 | return pd.Series(data).var(skipna=skipna, ddof=ddof) 638 | 639 | def _reduce(self, name: str, skipna: bool = True, **kwargs: Any) -> Any: 640 | _logger.debug( 641 | "RLEArray._reduce(name=%r, skipna=%r, **kwargs=%r)", name, skipna, kwargs 642 | ) 643 | if name == "all": 644 | return self.all() 645 | elif name == "any": 646 | return self.any() 647 | elif name == "kurt": 648 | return self.kurt(skipna=skipna) 649 | elif name == "max": 650 | return self.max(skipna=skipna) 651 | elif name == "mean": 652 | return self.mean(skipna=skipna) 653 | elif name == "median": 654 | return self.median(skipna=skipna) 655 | elif name == "min": 656 | return self.min(skipna=skipna) 657 | elif name == "prod": 658 | return self.prod(skipna=skipna) 659 | elif name == "skew": 660 | return self.skew(skipna=skipna) 661 | elif name == "std": 662 | return self.std(skipna=skipna, ddof=int(kwargs.get("ddof", 1))) 663 | elif name == "sum": 664 | return self.sum(skipna=skipna) 665 | elif name == "var": 666 | return self.var(skipna=skipna) 667 | else: 668 | raise NotImplementedError(f"reduction {name} is not implemented.") 669 | 670 | def view(self, dtype: Optional[Any] = None) -> Any: 671 | _logger.debug("RLEArray.view(dtype=%r)", dtype) 672 | if dtype is None: 673 | dtype = self.dtype._dtype 674 | if isinstance(dtype, RLEDtype): 675 | dtype = dtype._dtype 676 | if dtype != self.dtype._dtype: 677 | raise ValueError("Cannot create view with different dtype.") 678 | 679 | result = RLEArray(data=self._data.copy(), positions=self._positions.copy()) 680 | self._projection.master.register_change(result, None) 681 | return result 682 | 683 | def dropna(self) -> "RLEArray": 684 | _logger.debug("RLEArray.dropna()") 685 | data, positions = dropna(self._data, self._positions) 686 | return RLEArray(data=data, positions=positions) 687 | 688 | def value_counts(self, dropna: bool = True) -> pd.Series: 689 | _logger.debug("RLEArray.value_counts(dropna=%r)", dropna) 690 | # TODO: add fast-path 691 | return pd.Series(np.asarray(self)).value_counts(dropna=dropna) 692 | 693 | def __iter__(self) -> Iterator[Any]: 694 | _logger.debug("RLEArray.__iter__()") 695 | warnings.warn("performance: __iter__ blows up entire data", PerformanceWarning) 696 | return gen_iterator(self._data, self._positions) 697 | 698 | def __array_ufunc__( 699 | self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any 700 | ) -> Union[None, "RLEArray", np.ndarray]: 701 | _logger.debug("RLEArray.__array_ufunc__(...)") 702 | out = kwargs.get("out", ()) 703 | for x in inputs + out: 704 | # Only support operations with instances of _HANDLED_TYPES. 705 | # Use ArrayLike instead of type(self) for isinstance to 706 | # allow subclasses that don't override __array_ufunc__ to 707 | # handle ArrayLike objects. 708 | if not isinstance(x, self._HANDLED_TYPES + (RLEArray,)): 709 | return NotImplemented 710 | 711 | # Defer to the implementation of the ufunc on unwrapped values. 
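# Overall flow: (1) remember whether any plain ndarray takes part (a hint that the operation
# broadcasts), (2) decompress all RLEArray inputs and ``out`` targets to plain ndarrays,
# (3) run the ufunc on the decompressed data, and (4) re-compress 1-dimensional results back
# to RLEArray unless a plain ndarray was involved.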
712 | inputs_has_ndarray = any(isinstance(x, np.ndarray) for x in inputs) 713 | inputs = tuple(np.asarray(x) if isinstance(x, RLEArray) else x for x in inputs) 714 | 715 | if out: 716 | kwargs["out"] = tuple( 717 | np.asarray(x) if isinstance(x, RLEArray) else x for x in out 718 | ) 719 | result = getattr(ufunc, method)(*inputs, **kwargs) 720 | if out: 721 | for x, y in zip(out, kwargs["out"]): 722 | if isinstance(x, RLEArray): 723 | x[:] = y 724 | 725 | def maybe_from_sequence(x: np.ndarray) -> Union[RLEArray, np.ndarray]: 726 | if (x.ndim == 1) and (not inputs_has_ndarray): 727 | # suitable for RLE compression 728 | return type(self)._from_sequence(x) 729 | else: 730 | # likely a broadcast operation 731 | return x 732 | 733 | if type(result) is tuple: 734 | # multiple return values 735 | return tuple(maybe_from_sequence(x) for x in result) 736 | elif method == "at": 737 | assert result is None 738 | 739 | # inplace modification 740 | self[:] = inputs[0] 741 | 742 | # no return value 743 | return None 744 | else: 745 | # one return value 746 | return maybe_from_sequence(result) 747 | 748 | def __eq__(self, other: Any) -> Union["RLEArray", np.ndarray]: 749 | return self._apply_binary_operator(other, op=operator.eq) 750 | 751 | def __ne__(self, other: Any) -> Union["RLEArray", np.ndarray]: 752 | return self._apply_binary_operator(other, op=operator.ne) 753 | 754 | def __gt__(self, other: Any) -> Union["RLEArray", np.ndarray]: 755 | return self._apply_binary_operator(other, op=operator.gt) 756 | 757 | def __ge__(self, other: Any) -> Union["RLEArray", np.ndarray]: 758 | return self._apply_binary_operator(other, op=operator.ge) 759 | 760 | def __lt__(self, other: Any) -> Union["RLEArray", np.ndarray]: 761 | return self._apply_binary_operator(other, op=operator.lt) 762 | 763 | def __le__(self, other: Any) -> Union["RLEArray", np.ndarray]: 764 | return self._apply_binary_operator(other, op=operator.le) 765 | 766 | def __add__(self, other: Any) -> Union["RLEArray", np.ndarray]: 767 | return self._apply_binary_operator(other, op=operator.add) 768 | 769 | def __radd__(self, other: Any) -> Union["RLEArray", np.ndarray]: 770 | return self._apply_binary_operator(other, op=ops.radd) 771 | 772 | def __sub__(self, other: Any) -> Union["RLEArray", np.ndarray]: 773 | return self._apply_binary_operator(other, op=operator.sub) 774 | 775 | def __rsub__(self, other: Any) -> Union["RLEArray", np.ndarray]: 776 | return self._apply_binary_operator(other, op=ops.rsub) 777 | 778 | def __mul__(self, other: Any) -> Union["RLEArray", np.ndarray]: 779 | return self._apply_binary_operator(other, op=operator.mul) 780 | 781 | def __rmul__(self, other: Any) -> Union["RLEArray", np.ndarray]: 782 | return self._apply_binary_operator(other, op=ops.rmul) 783 | 784 | def __truediv__(self, other: Any) -> Union["RLEArray", np.ndarray]: 785 | return self._apply_binary_operator(other, op=operator.truediv) 786 | 787 | def __rtruediv__(self, other: Any) -> Union["RLEArray", np.ndarray]: 788 | return self._apply_binary_operator(other, op=ops.rtruediv) 789 | 790 | def __floordiv__(self, other: Any) -> Union["RLEArray", np.ndarray]: 791 | return self._apply_binary_operator(other, op=operator.floordiv) 792 | 793 | def __rfloordiv__(self, other: Any) -> Union["RLEArray", np.ndarray]: 794 | return self._apply_binary_operator(other, op=ops.rfloordiv) 795 | 796 | def __mod__(self, other: Any) -> Union["RLEArray", np.ndarray]: 797 | return self._apply_binary_operator(other, op=operator.mod) 798 | 799 | def __rmod__(self, other: Any) -> 
Union["RLEArray", np.ndarray]: 800 | return self._apply_binary_operator(other, op=ops.rmod) 801 | 802 | def __pow__(self, other: Any) -> Union["RLEArray", np.ndarray]: 803 | return self._apply_binary_operator(other, op=operator.pow) 804 | 805 | def __rpow__(self, other: Any) -> Union["RLEArray", np.ndarray]: 806 | return self._apply_binary_operator(other, op=ops.rpow) 807 | 808 | def __and__(self, other: Any) -> Union["RLEArray", np.ndarray]: 809 | return self._apply_binary_operator(other, op=operator.and_) 810 | 811 | def __rand__(self, other: Any) -> Union["RLEArray", np.ndarray]: 812 | return self._apply_binary_operator(other, op=ops.rand_) 813 | 814 | def __or__(self, other: Any) -> Union["RLEArray", np.ndarray]: 815 | return self._apply_binary_operator(other, op=operator.or_) 816 | 817 | def __ror__(self, other: Any) -> Union["RLEArray", np.ndarray]: 818 | return self._apply_binary_operator(other, op=ops.ror_) 819 | 820 | def __xor__(self, other: Any) -> Union["RLEArray", np.ndarray]: 821 | return self._apply_binary_operator(other, op=operator.xor) 822 | 823 | def __rxor__(self, other: Any) -> Union["RLEArray", np.ndarray]: 824 | return self._apply_binary_operator(other, op=ops.rxor) 825 | 826 | def __pos__(self) -> "RLEArray": 827 | return self._apply_unary_operator(op=operator.pos) 828 | 829 | def __neg__(self) -> "RLEArray": 830 | return self._apply_unary_operator(op=operator.neg) 831 | 832 | def __abs__(self) -> "RLEArray": 833 | return self._apply_unary_operator(op=operator.abs) 834 | 835 | def __invert__(self) -> "RLEArray": 836 | _logger.debug("RLEArray.__invert__()") 837 | return self._apply_unary_operator(op=operator.inv) 838 | 839 | def _apply_binary_operator( 840 | self, other: Any, op: Any 841 | ) -> Union["RLEArray", np.ndarray]: 842 | if isinstance(other, (ABCSeries, ABCIndexClass)): 843 | # rely on pandas to unbox and dispatch to us 844 | return NotImplemented 845 | 846 | if is_scalar(other): 847 | with np.errstate(invalid="ignore"): 848 | new_data = op(self._data, other) 849 | return RLEArray(*recompress(new_data, self._positions)) 850 | elif isinstance(other, RLEArray): 851 | if len(self) != len(other): 852 | raise ValueError("arrays have different lengths") 853 | extended_positions = extend_positions(self._positions, other._positions) 854 | data_self = extend_data( 855 | data=self._data, 856 | positions=self._positions, 857 | extended_positions=extended_positions, 858 | ) 859 | data_other = extend_data( 860 | data=other._data, 861 | positions=other._positions, 862 | extended_positions=extended_positions, 863 | ) 864 | with np.errstate(invalid="ignore"): 865 | new_data = op(data_self, data_other) 866 | return RLEArray(*recompress(new_data, extended_positions)) 867 | else: 868 | array = self.__array__() 869 | with np.errstate(invalid="ignore"): 870 | return op(array, other) 871 | 872 | def _apply_unary_operator(self, op: Any) -> "RLEArray": 873 | return RLEArray(data=op(self._data), positions=self._positions.copy()) 874 | 875 | def shift(self, periods: int = 1, fill_value: object = None) -> "RLEArray": 876 | self2 = self 877 | dtype = self.dtype 878 | 879 | if isna(fill_value): 880 | fill_value = self.dtype.na_value 881 | np_dtype_fill = np.asarray([fill_value]).dtype 882 | if np_dtype_fill.kind != dtype.kind: 883 | dtype = RLEDtype(np_dtype_fill) 884 | self2 = self.astype(dtype) 885 | 886 | if not len(self) or periods == 0: 887 | return self2.copy() 888 | 889 | empty = RLEArray( 890 | data=np.asarray([fill_value], dtype=dtype._dtype), 891 | 
positions=np.asarray([min(abs(periods), len(self))], dtype=POSITIONS_DTYPE),
892 | )
893 | 
894 | if periods > 0:
895 | a = empty
896 | b = self2[:-periods]
897 | else:
898 | a = self2[abs(periods) :]
899 | b = empty
900 | return self._concat_same_type([a, b])
901 | 
902 | def fillna(
903 | self,
904 | value: Any = None,
905 | method: Optional[str] = None,
906 | limit: Optional[int] = None,
907 | ) -> "RLEArray":
908 | # TODO: fast-path
909 | arr = pd.Series(np.asarray(self)).array.fillna(value, method, limit).to_numpy()
910 | data, positions = compress(arr)
911 | return RLEArray(data=data, positions=positions)
912 | 
913 | def round(self, decimals: int = 0) -> "RLEArray":
914 | _logger.debug("RLEArray.round(decimals=%r)", decimals)
915 | new_data = self._data.round(decimals)
916 | return RLEArray(*recompress(new_data, self._positions))
917 | 
918 | def unique(self) -> "RLEArray":
919 | uniques = unique(self._data)
920 | return RLEArray(
921 | data=uniques,
922 | positions=np.arange(1, 1 + len(uniques), dtype=POSITIONS_DTYPE),
923 | )
924 | 
-------------------------------------------------------------------------------- /rle_array/autoconversion.py: --------------------------------------------------------------------------------
1 | import warnings
2 | from typing import Optional, Union
3 | 
4 | import numpy as np
5 | import pandas as pd
6 | from pandas.api.extensions import ExtensionDtype
7 | from pandas.errors import PerformanceWarning
8 | 
9 | from .array import RLEArray
10 | from .dtype import RLEDtype
11 | 
12 | 
13 | def _is_rle_dtype(dtype: Union[np.dtype, ExtensionDtype]) -> bool:
14 | """
15 | Checks if the given dtype is already RLE compressed.
16 | 
17 | Parameters
18 | ----------
19 | dtype
20 | Input dtype.
21 | """
22 | return isinstance(dtype, RLEDtype)
23 | 
24 | 
25 | def _uses_datetimeblock(dtype: Union[np.dtype, ExtensionDtype]) -> bool:
26 | """
27 | Detects if the RLEArray would use a pandas ``DatetimeBlock``.
28 | 
29 | It seems to be a bug in pandas that it cannot deal with datetime extension arrays.
30 | 
31 | Parameters
32 | ----------
33 | dtype
34 | Dtype of the original, uncompressed array.
35 | """
36 | vtype = dtype.type
37 | return issubclass(vtype, np.datetime64)
38 | 
39 | 
40 | def auto_convert_to_rle(
41 | df: pd.DataFrame, threshold: Optional[float] = None
42 | ) -> pd.DataFrame:
43 | """
44 | Auto-convert given DataFrame to RLE compressed DataFrame.
45 | 
46 | .. important::
47 | 
48 | Datetime columns are currently not compressed due to pandas not supporting them.
49 | 
50 | Please note that RLE can, under some circumstances, require MORE memory than the uncompressed data. It is not
51 | advisable to set ``threshold`` to a value larger than 1 except for testing purposes.
52 | 
53 | Parameters
54 | ----------
55 | df
56 | Input DataFrame, may already contain RLE columns. This input data MIGHT not be copied!
57 | threshold
58 | Compression threshold, e.g.:
59 | 
60 | - ``None``: compress all columns
61 | - ``1.0``: compress only if RLE does NOT take up more space
62 | - ``0.5``: compress only if at least 50% of the memory is saved
63 | - ``0.0``: do not compress at all
64 | 
65 | Raises
66 | ------
67 | ValueError
68 | If threshold is negative.
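
Example
-------
>>> # a constant column compresses well and gets converted
>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame({"x": np.array([1, 1, 1, 1], dtype=np.int64)})
>>> auto_convert_to_rle(df).dtypes
x    RLEDtype[int64]
dtype: object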
69 | """ 70 | if (threshold is not None) and (threshold < 0.0): 71 | raise ValueError(f"threshold ({threshold}) must be non-negative") 72 | 73 | index = df.index 74 | 75 | data = {} 76 | for col in df.columns: 77 | series = df[col] 78 | array_orig = series.array 79 | 80 | array_target = array_orig 81 | 82 | dtype = series.dtype 83 | 84 | if not _is_rle_dtype(dtype): 85 | if _uses_datetimeblock(dtype): 86 | warnings.warn( 87 | f"Column {col} would use a DatetimeBlock and can currently not be RLE compressed." 88 | ) 89 | else: 90 | array_rle = RLEArray._from_sequence( 91 | scalars=array_orig, dtype=dtype, copy=True 92 | ) 93 | if threshold is None: 94 | array_target = array_rle 95 | elif threshold > 0: 96 | if (len(array_orig) == 0) or ( 97 | array_rle.nbytes / array_orig.nbytes <= threshold 98 | ): 99 | array_target = array_rle 100 | 101 | data[col] = array_target 102 | 103 | return pd.DataFrame(data, index=index) 104 | 105 | 106 | def decompress(df: pd.DataFrame) -> pd.DataFrame: 107 | """ 108 | Decompress all RLE columns in the provided DataFrame. 109 | 110 | Parameters 111 | ---------- 112 | df 113 | Input DataFrame. This input data MIGHT not be copied! 114 | """ 115 | index = df.index 116 | 117 | data = {} 118 | for col in df.columns: 119 | series = df[col] 120 | array = series.array 121 | dtype = series.dtype 122 | 123 | if _is_rle_dtype(dtype): 124 | with warnings.catch_warnings(): 125 | warnings.simplefilter("ignore", category=PerformanceWarning) 126 | array = array.astype(dtype._dtype) 127 | 128 | data[col] = array 129 | 130 | return pd.DataFrame(data, index=index) 131 | -------------------------------------------------------------------------------- /rle_array/dtype.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, List, Optional, cast 2 | 3 | import numpy as np 4 | from pandas.api.extensions import ExtensionDtype, register_extension_dtype 5 | from pandas.core.dtypes.cast import find_common_type 6 | 7 | import rle_array 8 | 9 | 10 | @register_extension_dtype 11 | class RLEDtype(ExtensionDtype): 12 | _metadata = ("_dtype",) 13 | 14 | def __init__(self, dtype: Any): 15 | self._dtype = np.dtype(dtype) 16 | 17 | @property 18 | def type(self) -> Callable[..., Any]: 19 | return cast(Callable[..., Any], self._dtype.type) 20 | 21 | @property 22 | def kind(self) -> str: 23 | return str(self._dtype.kind) 24 | 25 | @property 26 | def name(self) -> str: 27 | return f"RLEDtype[{self._dtype}]" 28 | 29 | @classmethod 30 | def construct_from_string(cls, string: str) -> "RLEDtype": 31 | """ 32 | Strict construction from a string, raise a TypeError if not possible. 33 | """ 34 | if not isinstance(string, str): 35 | raise TypeError( 36 | f"'construct_from_string' expects a string, got {type(string)}" 37 | ) 38 | 39 | prefix = "RLEDtype[" 40 | suffix = "]" 41 | if not (string.startswith(prefix) and string.endswith(suffix)): 42 | raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") 43 | sub = string[len(prefix) : -len(suffix)] 44 | return cls(np.dtype(sub)) 45 | 46 | @classmethod 47 | def construct_array_type( 48 | cls, 49 | ) -> Callable[[np.ndarray, np.ndarray], "rle_array.RLEArray"]: 50 | return rle_array.RLEArray 51 | 52 | @property 53 | def _is_numeric(self) -> bool: 54 | # exclude object, str, unicode, void. 
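# (numpy kind codes: b=boolean, i=signed integer, u=unsigned integer, f=floating, c=complex)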
55 | return self.kind in set("biufc")
56 | 
57 | @property
58 | def _is_boolean(self) -> bool:
59 | return self.kind == "b"
60 | 
61 | def _get_common_dtype(self, dtypes: List[Any]) -> Optional[Any]:
62 | unpacked_dtypes = []
63 | only_rle = True
64 | for t in dtypes:
65 | if isinstance(t, RLEDtype):
66 | unpacked_dtypes.append(t._dtype)
67 | else:
68 | unpacked_dtypes.append(t)
69 | only_rle = False
70 | 
71 | # ask pandas for help
72 | suggested_type = find_common_type(unpacked_dtypes)
73 | 
74 | # prefer RLE
75 | if (suggested_type is not None) and only_rle:
76 | return RLEDtype(suggested_type)
77 | else:
78 | return suggested_type
79 | 
80 | def __repr__(self) -> str:
81 | return f"RLEDtype({self._dtype!r})"
82 | 
-------------------------------------------------------------------------------- /rle_array/testing.py: --------------------------------------------------------------------------------
1 | """
2 | Functionality useful for testing and documentation.
3 | """
4 | import itertools
5 | from typing import Iterable
6 | 
7 | import numpy as np
8 | import pandas as pd
9 | 
10 | 
11 | def dim_col(d: int) -> str:
12 | """
13 | Name of a dimension column.
14 | 
15 | Parameters
16 | ----------
17 | d
18 | Dimension number.
19 | 
20 | Returns
21 | -------
22 | name: str
23 | Dimension name.
24 | 
25 | Example
26 | -------
27 | >>> from rle_array.testing import dim_col
28 | >>> dim_col(1)
29 | 'dim_1'
30 | """
31 | return f"dim_{d}"
32 | 
33 | 
34 | def const_col(dims: Iterable[int]) -> str:
35 | """
36 | Name of a constant column.
37 | 
38 | Parameters
39 | ----------
40 | dims
41 | Dimensions that describe the column content.
42 | 
43 | Returns
44 | -------
45 | name: str
46 | Column name.
47 | 
48 | Example
49 | -------
50 | >>> from rle_array.testing import const_col
51 | >>> const_col([1, 2])
52 | 'const_1_2'
53 | >>> const_col([2, 1])
54 | 'const_1_2'
55 | """
56 | dims = sorted(dims)
57 | dims_str = [str(d) for d in dims]
58 | return f"const_{'_'.join(dims_str)}"
59 | 
60 | 
61 | def _insert_sorted(df: pd.DataFrame, column: str, value: np.ndarray) -> None:
62 | pos = 0
63 | for i, c in enumerate(df.columns):
64 | if c > column:
65 | break
66 | pos = i + 1
67 | df.insert(pos, column, value)
68 | 
69 | 
70 | def _setup_dim_df(n_dims: int, size: int) -> pd.DataFrame:
71 | elements = np.arange(size ** n_dims)
72 | df = pd.DataFrame(index=pd.RangeIndex(0, len(elements)))
73 | for i in range(n_dims):
74 | _insert_sorted(df, dim_col(i), (elements // (size ** i)) % size)
75 | return df
76 | 
77 | 
78 | def _add_const_cols(df: pd.DataFrame, n_dims: int, size: int) -> pd.DataFrame:
79 | for dims in itertools.chain(
80 | *(
81 | itertools.combinations(range(n_dims), dims_len + 1)
82 | for dims_len in range(n_dims)
83 | )
84 | ):
85 | data = None
86 | for d in dims:
87 | if data is None:
88 | data = df[dim_col(d)].values
89 | else:
90 | data = data * size + df[dim_col(d)].values
91 | _insert_sorted(df, const_col(dims), data)
92 | return df
93 | 
94 | 
95 | def generate_test_dataframe(n_dims: int, size: int) -> pd.DataFrame:
96 | """
97 | Generate testing data.
98 | 
99 | Parameters
100 | ----------
101 | n_dims
102 | Number of dimensions of test cube.
103 | size
104 | Size of every dimension (edge length).
105 | 
106 | Returns
107 | -------
108 | df: pd.DataFrame
109 | Test DataFrame.
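
Example
-------
>>> # 2 dimensions of edge length 2 result in 4 rows and one constant
>>> # column per non-empty subset of dimensions
>>> df = generate_test_dataframe(n_dims=2, size=2)
>>> list(df.columns)
['const_0', 'const_0_1', 'const_1', 'dim_0', 'dim_1']
>>> len(df)
4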
110 | """ 111 | df = _setup_dim_df(n_dims, size) 112 | df = _add_const_cols(df, n_dims, size) 113 | return df 114 | 115 | 116 | def generate_example() -> pd.DataFrame: 117 | """ 118 | Generate example DataFrame for documentation purposes. 119 | 120 | Returns 121 | ------- 122 | df: pd.DataFrame 123 | Example DataFrame. 124 | """ 125 | rng = np.random.RandomState(1234) 126 | 127 | df = generate_test_dataframe(n_dims=2, size=2000) 128 | df["date"] = pd.Timestamp("2000-01-01") + pd.to_timedelta(df["dim_0"], unit="D") 129 | df["month"] = df["date"].dt.month.astype(np.int8) 130 | df["year"] = df["date"].dt.year.astype(np.int16) 131 | df["city"] = "city_" + df["dim_1"].astype("str") 132 | df["country"] = "country_" + (df["dim_1"] // 500).astype("str") 133 | df["avg_temp"] = ( 134 | rng.normal(loc=10.0, scale=5.0, size=len(df)) 135 | .round(decimals=1) 136 | .astype(np.float32) 137 | ) 138 | df["rain"] = rng.rand(len(df)) > 0.9 139 | df["mood"] = "ok" 140 | df.loc[(~df["rain"]) & (df["avg_temp"] > 15), "mood"] = "great" 141 | df.loc[(df["rain"]) & (df["avg_temp"] < 5), "mood"] = "sad" 142 | return df[["date", "month", "year", "city", "country", "avg_temp", "rain", "mood"]] 143 | -------------------------------------------------------------------------------- /rle_array/types.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | __all__ = ("POSITIONS_DTYPE",) 4 | 5 | 6 | #: Data type used to encode positions of RLE-runs. 7 | POSITIONS_DTYPE = np.int64 8 | -------------------------------------------------------------------------------- /scripts/fmt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -exuo pipefail 4 | 5 | black . 6 | isort --atomic . 7 | -------------------------------------------------------------------------------- /scripts/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -exuo pipefail 4 | 5 | mypy . 6 | pytest 7 | black --check . 8 | isort --check-only . 
9 | flake8 10 | asv run --show-stderr --environment existing --quick 11 | python setup.py build_sphinx 12 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [build_sphinx] 2 | source-dir = docs 3 | build-dir = docs/_build 4 | builder = doctest,html 5 | warning-is-error = true 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import setuptools 3 | 4 | if __name__ == "__main__": 5 | setuptools.setup() 6 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JDASoftwareGroup/rle-array/e5201b9185079f4fc4fd907d8f591426df79946e/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_astype.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | from rle_array import RLEDtype 6 | 7 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 8 | 9 | 10 | @pytest.fixture 11 | def series() -> pd.Series: 12 | return pd.Series([1, 1, 2]).astype(RLEDtype(int)) 13 | 14 | 15 | def test_no_copy(series: pd.Series) -> None: 16 | series2 = series.astype(series.dtype, copy=False) 17 | assert series2.values is series.values 18 | assert series2.dtype == RLEDtype(int) 19 | 20 | 21 | def test_copy_different_dtype(series: pd.Series) -> None: 22 | series2 = series.astype(RLEDtype(float), copy=False) 23 | assert series2.values is not series.values 24 | assert series2.dtype == RLEDtype(float) 25 | 26 | 27 | def test_cast_to_np_array(series: pd.Series) -> None: 28 | series2 = series.astype(int, copy=False) 29 | assert series2.values is not series.values 30 | assert series2.dtype == np.dtype(int) 31 | -------------------------------------------------------------------------------- /tests/test_autoconversion.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from pandas import testing as pdt 7 | 8 | from rle_array.autoconversion import auto_convert_to_rle, decompress 9 | from rle_array.dtype import RLEDtype 10 | 11 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "orig, threshold, expected", 16 | [ 17 | ( 18 | # orig 19 | pd.DataFrame( 20 | { 21 | "int64": pd.Series([1], dtype=np.int64), 22 | "int32": pd.Series([1], dtype=np.int32), 23 | "uint64": pd.Series([1], dtype=np.uint64), 24 | "float64": pd.Series([1.2], dtype=np.float64), 25 | "bool": pd.Series([True], dtype=np.bool_), 26 | "object": pd.Series(["foo"], dtype=np.object_), 27 | "datetime64": pd.Series( 28 | [pd.Timestamp("2020-01-01")], dtype="datetime64[ns]" 29 | ), 30 | } 31 | ), 32 | # threshold 33 | None, 34 | # expected 35 | pd.DataFrame( 36 | { 37 | "int64": pd.Series([1], dtype=RLEDtype(np.int64)), 38 | "int32": pd.Series([1], dtype=RLEDtype(np.int32)), 39 | "uint64": pd.Series([1], dtype=RLEDtype(np.uint64)), 40 | "float64": pd.Series([1.2], dtype=RLEDtype(np.float64)), 41 | "bool": pd.Series([True], dtype=RLEDtype(np.bool_)), 42 | "object": 
pd.Series(["foo"]).astype(RLEDtype(np.object_)), 43 | "datetime64": pd.Series( 44 | [pd.Timestamp("2020-01-01")], dtype="datetime64[ns]" 45 | ), 46 | } 47 | ), 48 | ), 49 | ( 50 | # orig 51 | pd.DataFrame( 52 | { 53 | "int64": pd.Series([1], dtype=np.int64), 54 | "int32": pd.Series([1], dtype=np.int32), 55 | "uint64": pd.Series([1], dtype=np.uint64), 56 | "float64": pd.Series([1.2], dtype=np.float64), 57 | "bool": pd.Series([True], dtype=np.bool_), 58 | "object": pd.Series(["foo"], dtype=np.object_), 59 | "datetime64": pd.Series( 60 | [pd.Timestamp("2020-01-01")], dtype="datetime64[ns]" 61 | ), 62 | } 63 | ), 64 | # threshold 65 | 2.0, 66 | # expected 67 | pd.DataFrame( 68 | { 69 | "int64": pd.Series([1], dtype=RLEDtype(np.int64)), 70 | "int32": pd.Series([1], dtype=np.int32), 71 | "uint64": pd.Series([1], dtype=RLEDtype(np.uint64)), 72 | "float64": pd.Series([1.2], dtype=RLEDtype(np.float64)), 73 | "bool": pd.Series([True], dtype=np.bool_), 74 | "object": pd.Series(["foo"]).astype(RLEDtype(np.object_)), 75 | "datetime64": pd.Series( 76 | [pd.Timestamp("2020-01-01")], dtype="datetime64[ns]" 77 | ), 78 | } 79 | ), 80 | ), 81 | ( 82 | # orig 83 | pd.DataFrame( 84 | { 85 | "single_value": pd.Series([1, 1, 1, 1, 1, 1], dtype=np.int64), 86 | "two_values": pd.Series([1, 1, 1, 2, 2, 2], dtype=np.int64), 87 | "increasing": pd.Series([1, 2, 3, 4, 5, 6], dtype=np.int64), 88 | } 89 | ), 90 | # threshold 91 | None, 92 | # expected 93 | pd.DataFrame( 94 | { 95 | "single_value": pd.Series( 96 | [1, 1, 1, 1, 1, 1], dtype=RLEDtype(np.int64) 97 | ), 98 | "two_values": pd.Series( 99 | [1, 1, 1, 2, 2, 2], dtype=RLEDtype(np.int64) 100 | ), 101 | "increasing": pd.Series( 102 | [1, 2, 3, 4, 5, 6], dtype=RLEDtype(np.int64) 103 | ), 104 | } 105 | ), 106 | ), 107 | ( 108 | # orig 109 | pd.DataFrame( 110 | { 111 | "single_value": pd.Series([1, 1, 1, 1, 1, 1], dtype=np.int64), 112 | "two_values": pd.Series([1, 1, 1, 2, 2, 2], dtype=np.int64), 113 | "increasing": pd.Series([1, 2, 3, 4, 5, 6], dtype=np.int64), 114 | } 115 | ), 116 | # threshold 117 | 0.9, 118 | # expected 119 | pd.DataFrame( 120 | { 121 | "single_value": pd.Series( 122 | [1, 1, 1, 1, 1, 1], dtype=RLEDtype(np.int64) 123 | ), 124 | "two_values": pd.Series( 125 | [1, 1, 1, 2, 2, 2], dtype=RLEDtype(np.int64) 126 | ), 127 | "increasing": pd.Series([1, 2, 3, 4, 5, 6], dtype=np.int64), 128 | } 129 | ), 130 | ), 131 | ( 132 | # orig 133 | pd.DataFrame( 134 | { 135 | "single_value": pd.Series([1, 1, 1, 1, 1, 1], dtype=np.int64), 136 | "two_values": pd.Series([1, 1, 1, 2, 2, 2], dtype=np.int64), 137 | "increasing": pd.Series([1, 2, 3, 4, 5, 6], dtype=np.int64), 138 | } 139 | ), 140 | # threshold 141 | 0.5, 142 | # expected 143 | pd.DataFrame( 144 | { 145 | "single_value": pd.Series( 146 | [1, 1, 1, 1, 1, 1], dtype=RLEDtype(np.int64) 147 | ), 148 | "two_values": pd.Series([1, 1, 1, 2, 2, 2], dtype=np.int64), 149 | "increasing": pd.Series([1, 2, 3, 4, 5, 6], dtype=np.int64), 150 | } 151 | ), 152 | ), 153 | ( 154 | # orig 155 | pd.DataFrame( 156 | { 157 | "single_value": pd.Series([1, 1, 1, 1, 1, 1], dtype=np.int64), 158 | "two_values": pd.Series([1, 1, 1, 2, 2, 2], dtype=np.int64), 159 | "increasing": pd.Series([1, 2, 3, 4, 5, 6], dtype=np.int64), 160 | } 161 | ), 162 | # threshold 163 | 0.0, 164 | # expected 165 | pd.DataFrame( 166 | { 167 | "single_value": pd.Series([1, 1, 1, 1, 1, 1], dtype=np.int64), 168 | "two_values": pd.Series([1, 1, 1, 2, 2, 2], dtype=np.int64), 169 | "increasing": pd.Series([1, 2, 3, 4, 5, 6], dtype=np.int64), 170 | } 171 | ), 172 | ), 
173 | ( 174 | # orig 175 | pd.DataFrame({"x": pd.Series([], dtype=np.int64)}), 176 | # threshold 177 | 0.0, 178 | # expected 179 | pd.DataFrame({"x": pd.Series([], dtype=np.int64)}), 180 | ), 181 | ( 182 | # orig 183 | pd.DataFrame({"x": pd.Series([], dtype=np.int64)}), 184 | # threshold 185 | 0.1, 186 | # expected 187 | pd.DataFrame({"x": pd.Series([], dtype=RLEDtype(np.int64))}), 188 | ), 189 | ( 190 | # orig 191 | pd.DataFrame( 192 | { 193 | "single_value": pd.Series([1, 1, 1, 1, 1, 1], dtype=np.int64), 194 | "two_values": pd.Series([1, 1, 1, 2, 2, 2], dtype=np.int64), 195 | "increasing": pd.Series( 196 | [1, 2, 3, 4, 5, 6], dtype=RLEDtype(np.int64) 197 | ), 198 | } 199 | ), 200 | # threshold 201 | 0.5, 202 | # expected 203 | pd.DataFrame( 204 | { 205 | "single_value": pd.Series( 206 | [1, 1, 1, 1, 1, 1], dtype=RLEDtype(np.int64) 207 | ), 208 | "two_values": pd.Series([1, 1, 1, 2, 2, 2], dtype=np.int64), 209 | "increasing": pd.Series( 210 | [1, 2, 3, 4, 5, 6], dtype=RLEDtype(np.int64) 211 | ), 212 | } 213 | ), 214 | ), 215 | ( 216 | # orig 217 | pd.DataFrame({"x": pd.Series(range(10), dtype=np.int64)}), 218 | # threshold 219 | 1.0, 220 | # expected 221 | pd.DataFrame({"x": pd.Series(range(10), dtype=np.int64)}), 222 | ), 223 | ( 224 | # orig 225 | pd.DataFrame(), 226 | # threshold 227 | None, 228 | # expected 229 | pd.DataFrame(), 230 | ), 231 | ], 232 | ) 233 | @pytest.mark.filterwarnings("ignore:.*would use a DatetimeBlock:UserWarning") 234 | def test_auto_convert_to_rle_ok( 235 | orig: pd.DataFrame, threshold: Optional[float], expected: pd.DataFrame 236 | ) -> None: 237 | actual = auto_convert_to_rle(orig, threshold) 238 | pdt.assert_frame_equal(actual, expected) 239 | 240 | 241 | def test_datetime_warns() -> None: 242 | df = pd.DataFrame( 243 | { 244 | "i1": pd.Series([1], dtype=np.int64), 245 | "d1": pd.Series([pd.Timestamp("2020-01-01")], dtype="datetime64[ns]"), 246 | "i2": pd.Series([1], dtype=np.int64), 247 | "d2": pd.Series([pd.Timestamp("2020-01-01")], dtype="datetime64[ns]"), 248 | } 249 | ) 250 | with pytest.warns(None) as record: 251 | auto_convert_to_rle(df, 0.5) 252 | assert len(record) == 2 253 | assert ( 254 | str(record[0].message) 255 | == "Column d1 would use a DatetimeBlock and can currently not be RLE compressed." 256 | ) 257 | assert ( 258 | str(record[1].message) 259 | == "Column d2 would use a DatetimeBlock and can currently not be RLE compressed." 
260 | ) 261 | 262 | 263 | def test_auto_convert_to_rle_threshold_out_of_range() -> None: 264 | df = pd.DataFrame({"x": [1]}) 265 | 266 | with pytest.raises(ValueError, match=r"threshold \(-0.1\) must be non-negative"): 267 | auto_convert_to_rle(df, -0.1) 268 | 269 | 270 | @pytest.mark.parametrize( 271 | "orig, expected", 272 | [ 273 | ( 274 | # orig 275 | pd.DataFrame( 276 | { 277 | "int64": pd.Series([1], dtype=RLEDtype(np.int64)), 278 | "int32": pd.Series([1], dtype=RLEDtype(np.int32)), 279 | "uint64": pd.Series([1], dtype=RLEDtype(np.uint64)), 280 | "float64": pd.Series([1.2], dtype=RLEDtype(np.float64)), 281 | "bool": pd.Series([True], dtype=RLEDtype(np.bool_)), 282 | "object": pd.Series(["foo"]).astype(RLEDtype(np.object_)), 283 | "datetime64": pd.Series( 284 | [pd.Timestamp("2020-01-01")], dtype="datetime64[ns]" 285 | ), 286 | } 287 | ), 288 | # expected 289 | pd.DataFrame( 290 | { 291 | "int64": pd.Series([1], dtype=np.int64), 292 | "int32": pd.Series([1], dtype=np.int32), 293 | "uint64": pd.Series([1], dtype=np.uint64), 294 | "float64": pd.Series([1.2], dtype=np.float64), 295 | "bool": pd.Series([True], dtype=np.bool_), 296 | "object": pd.Series(["foo"], dtype=np.object_), 297 | "datetime64": pd.Series( 298 | [pd.Timestamp("2020-01-01")], dtype="datetime64[ns]" 299 | ), 300 | } 301 | ), 302 | ), 303 | ( 304 | # orig 305 | pd.DataFrame( 306 | { 307 | "int64": pd.Series([1], dtype=np.int64), 308 | "int32": pd.Series([1], dtype=np.int32), 309 | "uint64": pd.Series([1], dtype=np.uint64), 310 | "float64": pd.Series([1.2], dtype=np.float64), 311 | "bool": pd.Series([True], dtype=np.bool_), 312 | "object": pd.Series(["foo"], dtype=np.object_), 313 | "datetime64": pd.Series( 314 | [pd.Timestamp("2020-01-01")], dtype="datetime64[ns]" 315 | ), 316 | } 317 | ), 318 | # expected 319 | pd.DataFrame( 320 | { 321 | "int64": pd.Series([1], dtype=np.int64), 322 | "int32": pd.Series([1], dtype=np.int32), 323 | "uint64": pd.Series([1], dtype=np.uint64), 324 | "float64": pd.Series([1.2], dtype=np.float64), 325 | "bool": pd.Series([True], dtype=np.bool_), 326 | "object": pd.Series(["foo"], dtype=np.object_), 327 | "datetime64": pd.Series( 328 | [pd.Timestamp("2020-01-01")], dtype="datetime64[ns]" 329 | ), 330 | } 331 | ), 332 | ), 333 | ( 334 | # orig 335 | pd.DataFrame(), 336 | # expected 337 | pd.DataFrame(), 338 | ), 339 | ], 340 | ) 341 | def test_decompress_ok(orig: pd.DataFrame, expected: pd.DataFrame) -> None: 342 | actual = decompress(orig) 343 | pdt.assert_frame_equal(actual, expected) 344 | 345 | 346 | def test_decompress_does_not_warn() -> None: 347 | df = pd.DataFrame({"x": pd.Series([1] * 10, dtype=RLEDtype(np.int64))}) 348 | 349 | with pytest.warns(None) as record: 350 | decompress(df) 351 | 352 | assert len(record) == 0 353 | -------------------------------------------------------------------------------- /tests/test_builtins.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Union, cast 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from _pytest.fixtures import SubRequest 7 | from numpy import testing as npt 8 | from pandas import testing as pdt 9 | 10 | from rle_array import RLEArray, RLEDtype 11 | 12 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 13 | 14 | FComp = Callable[[Union[pd.Series, np.ndarray], Union[pd.Series, RLEArray]], None] 15 | 16 | 17 | @pytest.fixture 18 | def series_orig() -> pd.Series: 19 | return pd.Series([1, 1, 2, 3, 3], dtype=int) 20 | 21 | 22 | 
@pytest.fixture 23 | def array_orig(series_orig: pd.Series) -> np.ndarray: 24 | return series_orig.values 25 | 26 | 27 | @pytest.fixture 28 | def series_rle(series_orig: pd.Series) -> pd.Series: 29 | return series_orig.astype(RLEDtype(series_orig.dtype)) 30 | 31 | 32 | @pytest.fixture 33 | def array_rle(series_rle: pd.Series) -> RLEArray: 34 | values = series_rle.values 35 | assert isinstance(values, RLEArray) 36 | return values 37 | 38 | 39 | @pytest.fixture(params=["series", "array"]) 40 | def mode(request: SubRequest) -> str: 41 | m = request.param 42 | assert isinstance(m, str) 43 | return m 44 | 45 | 46 | @pytest.fixture 47 | def object_orig( 48 | series_orig: pd.Series, array_orig: np.ndarray, mode: str 49 | ) -> Union[pd.Series, np.ndarray]: 50 | if mode == "series": 51 | return series_orig 52 | elif mode == "array": 53 | return array_orig 54 | else: 55 | raise ValueError(f"Unknown mode {mode}") 56 | 57 | 58 | @pytest.fixture 59 | def object_rle( 60 | series_rle: pd.Series, array_rle: RLEArray, mode: str 61 | ) -> Union[pd.Series, RLEArray]: 62 | if mode == "series": 63 | return series_rle 64 | elif mode == "array": 65 | return array_rle 66 | else: 67 | raise ValueError(f"Unknown mode {mode}") 68 | 69 | 70 | @pytest.fixture 71 | def comp(mode: str) -> FComp: 72 | if mode == "series": 73 | return cast(FComp, pdt.assert_series_equal) 74 | elif mode == "array": 75 | return cast(FComp, npt.assert_array_equal) 76 | else: 77 | raise ValueError(f"Unknown mode {mode}") 78 | 79 | 80 | def test_sum( 81 | object_orig: Union[pd.Series, np.ndarray], 82 | object_rle: Union[pd.Series, RLEArray], 83 | comp: FComp, 84 | ) -> None: 85 | elements_orig = [object_orig, object_orig] 86 | elements_rle = [object_rle, object_rle] 87 | elements_mixed = [object_rle, object_orig] 88 | 89 | result_orig: Union[pd.Series, np.ndarray] = sum(elements_orig) 90 | result_rle: Union[pd.Series, RLEArray] = sum(elements_rle) 91 | result_mixed: Union[pd.Series, RLEArray] = sum(elements_mixed) 92 | 93 | result_converted1 = result_rle.astype(int) 94 | comp(result_orig, result_converted1) 95 | 96 | result_converted2 = result_mixed.astype(int) 97 | comp(result_orig, result_converted2) 98 | -------------------------------------------------------------------------------- /tests/test_constructors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from rle_array import RLEArray 5 | from rle_array.types import POSITIONS_DTYPE 6 | 7 | 8 | def test_valid() -> None: 9 | RLEArray( 10 | data=np.asarray([1.0, 2.0]), 11 | positions=np.asarray([10, 20], dtype=POSITIONS_DTYPE), 12 | ) 13 | 14 | 15 | def test_data_invalid_type() -> None: 16 | with pytest.raises(TypeError, match="data must be an ndarray but is int"): 17 | RLEArray(data=1, positions=np.asarray([10, 20], dtype=POSITIONS_DTYPE)) 18 | 19 | 20 | def test_positions_invalid_type() -> None: 21 | with pytest.raises(TypeError, match="positions must be an ndarray but is int"): 22 | RLEArray(data=np.asarray([1.0, 2.0]), positions=1) 23 | 24 | 25 | def test_data_invalid_dims() -> None: 26 | with pytest.raises( 27 | ValueError, match="data must be an 1-dimensional ndarray but has 2 dimensions" 28 | ): 29 | RLEArray( 30 | data=np.asarray([[1.0, 2.0], [3.0, 4.0]]), 31 | positions=np.asarray([10, 20], dtype=POSITIONS_DTYPE), 32 | ) 33 | 34 | 35 | def test_positions_invalid_dims() -> None: 36 | with pytest.raises( 37 | ValueError, 38 | match="positions must be an 1-dimensional ndarray but has 2 dimensions", 39 | ): 40 | RLEArray( 41 | data=np.asarray([1.0, 2.0]), 42 
| positions=np.asarray([[10, 20], [30, 40]], dtype=POSITIONS_DTYPE), 43 | ) 44 | 45 | 46 | def test_positions_invalid_dtype() -> None: 47 | with pytest.raises( 48 | ValueError, match="positions must have dtype int64 but has uint64" 49 | ): 50 | RLEArray( 51 | data=np.asarray([1.0, 2.0]), positions=np.asarray([10, 20], dtype=np.uint64) 52 | ) 53 | 54 | 55 | def test_different_lengths() -> None: 56 | with pytest.raises( 57 | ValueError, match="data and positions must have same length but have 3 and 2" 58 | ): 59 | RLEArray( 60 | data=np.asarray([1.0, 2.0, 3.0]), 61 | positions=np.asarray([10, 20], dtype=POSITIONS_DTYPE), 62 | ) 63 | 64 | 65 | def test_not_sorted_1() -> None: 66 | with pytest.raises(ValueError, match="positions must be strictly sorted"): 67 | RLEArray( 68 | data=np.asarray([1.0, 2.0]), 69 | positions=np.asarray([10, 9], dtype=POSITIONS_DTYPE), 70 | ) 71 | 72 | 73 | def test_not_sorted_2() -> None: 74 | with pytest.raises(ValueError, match="positions must be strictly sorted"): 75 | RLEArray( 76 | data=np.asarray([1.0, 2.0]), 77 | positions=np.asarray([10, 10], dtype=POSITIONS_DTYPE), 78 | ) 79 | -------------------------------------------------------------------------------- /tests/test_dtype.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from rle_array import RLEDtype 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "a, b, expected", 11 | [ 12 | ( 13 | # a 14 | RLEDtype(int), 15 | # b 16 | RLEDtype(int), 17 | # expected 18 | True, 19 | ), 20 | ( 21 | # a 22 | RLEDtype(int), 23 | # b 24 | RLEDtype(float), 25 | # expected 26 | False, 27 | ), 28 | ( 29 | # a 30 | RLEDtype(int), 31 | # b 32 | RLEDtype(np.int64), 33 | # expected 34 | True, 35 | ), 36 | ], 37 | ) 38 | def test_eq(a: RLEDtype, b: RLEDtype, expected: bool) -> None: 39 | actual = a == b 40 | assert actual is expected 41 | 42 | 43 | @pytest.mark.parametrize( 44 | "dtype, dtypes, expected", 45 | [ 46 | ( # RLE: idempotent 47 | # dtype 48 | RLEDtype(np.int8), 49 | # dtypes 50 | [RLEDtype(np.int8)], 51 | # expected 52 | RLEDtype(np.int8), 53 | ), 54 | ( # RLE: same types 55 | # dtype 56 | RLEDtype(np.int8), 57 | # dtypes 58 | [RLEDtype(np.int8), RLEDtype(np.int8)], 59 | # expected 60 | RLEDtype(np.int8), 61 | ), 62 | ( # RLE: larger integer 63 | # dtype 64 | RLEDtype(np.int8), 65 | # dtypes 66 | [RLEDtype(np.int8), RLEDtype(np.int16)], 67 | # expected 68 | RLEDtype(np.int16), 69 | ), 70 | ( # RLE: choose float 71 | # dtype 72 | RLEDtype(np.int8), 73 | # dtypes 74 | [RLEDtype(np.int8), RLEDtype(np.float32)], 75 | # expected 76 | RLEDtype(np.float32), 77 | ), 78 | ( # RLE: use special pandas rule and choose object 79 | # dtype 80 | RLEDtype(np.bool_), 81 | # dtypes 82 | [RLEDtype(np.bool_), RLEDtype(np.float32)], 83 | # expected 84 | RLEDtype(object), 85 | ), 86 | ( # uncompressed: same types 87 | # dtype 88 | RLEDtype(np.int8), 89 | # dtypes 90 | [RLEDtype(np.int8), np.dtype(np.int8)], 91 | # expected 92 | np.dtype(np.int8), 93 | ), 94 | ( # uncompressed: larger integer 95 | # dtype 96 | RLEDtype(np.int8), 97 | # dtypes 98 | [RLEDtype(np.int8), np.dtype(np.int16)], 99 | # expected 100 | np.dtype(np.int16), 101 | ), 102 | ( # uncompressed: choose float 103 | # dtype 104 | RLEDtype(np.int8), 105 | # dtypes 106 | [RLEDtype(np.int8), np.dtype(np.float32)], 107 | # expected 108 | np.dtype(np.float32), 109 | ), 110 | ( # uncompressed: use special pandas rule and choose object 111 | # dtype 112 | 
RLEDtype(np.bool_), 113 | # dtypes 114 | [RLEDtype(np.bool_), np.dtype(np.float32)], 115 | # expected 116 | np.dtype(object), 117 | ), 118 | ], 119 | ) 120 | def test_get_common_dtype(dtype: RLEDtype, dtypes: List[Any], expected: Any) -> None: 121 | actual = dtype._get_common_dtype(dtypes) 122 | assert actual == expected 123 | 124 | 125 | @pytest.mark.parametrize( 126 | "dtype, expected", 127 | [ 128 | ( 129 | # dtype 130 | RLEDtype(np.dtype(int)), 131 | # expected 132 | "RLEDtype(dtype('int64'))", 133 | ), 134 | ( 135 | # dtype 136 | RLEDtype(int), 137 | # expected 138 | "RLEDtype(dtype('int64'))", 139 | ), 140 | ], 141 | ) 142 | def test_repr(dtype: RLEDtype, expected: str) -> None: 143 | assert repr(dtype) == expected 144 | -------------------------------------------------------------------------------- /tests/test_fastpath.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | from rle_array import RLEDtype # noqa 5 | 6 | pytestmark = pytest.mark.filterwarnings("error:performance") 7 | 8 | 9 | @pytest.fixture 10 | def series() -> pd.Series: 11 | return pd.Series(range(10), dtype="RLEDtype[int64]") 12 | 13 | 14 | @pytest.fixture 15 | def df(series: pd.Series) -> pd.DataFrame: 16 | return pd.DataFrame({"x": series, "y": series}) 17 | 18 | 19 | def test_array_slice(series: pd.Series) -> None: 20 | series.array[:] 21 | series.array[::-1] 22 | 23 | 24 | def test_create_series(series: pd.Series) -> None: 25 | pass 26 | 27 | 28 | def test_create_df(df: pd.Series) -> None: 29 | pass 30 | 31 | 32 | def test_getitem_single(series: pd.Series) -> None: 33 | assert series[2] == 2 34 | 35 | 36 | def test_sum(series: pd.Series) -> None: 37 | assert series.sum() == 45 38 | -------------------------------------------------------------------------------- /tests/test_indexing.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from rle_array import RLEArray 4 | 5 | 6 | def test_fail_two_dim_indexing() -> None: 7 | array = RLEArray._from_sequence(range(10)) 8 | with pytest.raises( 9 | NotImplementedError, 10 | match="__getitem__ does currently only work w/ a single parameter", 11 | ): 12 | array[1, 2] 13 | -------------------------------------------------------------------------------- /tests/test_misc_operations.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from _pytest.fixtures import SubRequest 7 | from pandas import testing as pdt 8 | 9 | from rle_array import RLEDtype 10 | 11 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 12 | 13 | 14 | @pytest.fixture( 15 | params=[ 16 | "single_int", 17 | "single_float", 18 | "single_float32", 19 | "empty_int", 20 | "empty_float", 21 | "empty_float32", 22 | "multi_int", 23 | "multi_float", 24 | "multi_float32", 25 | ] 26 | ) 27 | def data_orig(request: SubRequest) -> pd.Series: 28 | f1 = 1.2363 29 | f2 = 2.6263 30 | if request.param == "single_int": 31 | return pd.Series([1], dtype=int) 32 | elif request.param == "single_float": 33 | return pd.Series([f1], dtype=float) 34 | elif request.param == "single_float32": 35 | return pd.Series([f1], dtype=np.float32) 36 | elif request.param == "empty_int": 37 | return pd.Series([], dtype=int) 38 | elif request.param == "empty_float": 39 | return pd.Series([], dtype=float) 40 | elif request.param == "empty_float32": 41 | return 
pd.Series([], dtype=np.float32) 42 | elif request.param == "multi_int": 43 | return pd.Series([1, 1, 2, 2], dtype=int) 44 | elif request.param == "multi_float": 45 | return pd.Series([f1, f1, f2, f2], dtype=float) 46 | elif request.param == "multi_float32": 47 | return pd.Series([f1, f1, f2, f2], dtype=np.float32) 48 | else: 49 | raise ValueError(f"Unknown data variant: {request.param}") 50 | 51 | 52 | @pytest.fixture 53 | def data_rle(data_orig: pd.Series) -> pd.Series: 54 | return data_orig.astype(RLEDtype(data_orig.dtype)) 55 | 56 | 57 | @pytest.mark.parametrize("periods", [0, -1, 1, -2, 2]) 58 | @pytest.mark.parametrize("fill_value", [1, np.nan]) 59 | def test_shift( 60 | data_orig: pd.Series, data_rle: pd.Series, periods: int, fill_value: Any 61 | ) -> None: 62 | result_orig = data_orig.shift(periods=periods, fill_value=fill_value) 63 | result_rle = data_rle.shift(periods=periods, fill_value=fill_value) 64 | 65 | assert result_rle.dtype == RLEDtype(result_orig.dtype) 66 | 67 | result_converted = result_rle.astype(result_rle.dtype._dtype) 68 | pdt.assert_series_equal(result_orig, result_converted) 69 | 70 | 71 | @pytest.mark.parametrize("decimals", [0, 1, 2, 3, 4]) 72 | def test_round(data_orig: pd.Series, data_rle: pd.Series, decimals: int) -> None: 73 | result_orig = data_orig.round(decimals=decimals) 74 | result_rle = data_rle.round(decimals=decimals) 75 | 76 | assert result_rle.dtype == RLEDtype(result_orig.dtype) 77 | 78 | result_converted = result_rle.astype(result_rle.dtype._dtype) 79 | pdt.assert_series_equal(result_orig, result_converted) 80 | -------------------------------------------------------------------------------- /tests/test_operators.py: -------------------------------------------------------------------------------- 1 | import operator 2 | from typing import Any, Callable, cast 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | from _pytest.fixtures import SubRequest 8 | from numpy import testing as npt 9 | from pandas.core import ops 10 | 11 | from rle_array import RLEArray, RLEDtype 12 | 13 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 14 | 15 | FCompareOperator = Callable[[Any, Any], Any] 16 | FUnaryOperator = Callable[[Any], Any] 17 | FUnaryBoolOperator = Callable[[Any], Any] 18 | FBinaryOperator = Callable[[Any, Any], Any] 19 | FBinaryBoolOperator = Callable[[Any, Any], Any] 20 | 21 | 22 | @pytest.fixture 23 | def values() -> np.ndarray: 24 | return np.array([2.0, 2.0, 2.0, 3.0, 3.0, 2.0, np.nan, np.nan, 1.0, 1.0]) 25 | 26 | 27 | @pytest.fixture 28 | def scalar(values: np.ndarray) -> float: 29 | return float(values[0]) 30 | 31 | 32 | @pytest.fixture 33 | def uncompressed_series(values: np.ndarray) -> pd.Series: 34 | return pd.Series(values, index=np.arange(len(values)) + 1) 35 | 36 | 37 | @pytest.fixture 38 | def uncompressed_series2(values: np.ndarray) -> pd.Series: 39 | return pd.Series(values[::-1], index=np.arange(len(values)) + 1) 40 | 41 | 42 | @pytest.fixture 43 | def rle_series(values: np.ndarray) -> pd.Series: 44 | return pd.Series(RLEArray._from_sequence(values), index=np.arange(len(values)) + 1) 45 | 46 | 47 | @pytest.fixture 48 | def rle_series2(values: np.ndarray) -> pd.Series: 49 | return pd.Series( 50 | RLEArray._from_sequence(values[::-1]), index=np.arange(len(values)) + 1 51 | ) 52 | 53 | 54 | @pytest.fixture 55 | def bool_values() -> np.ndarray: 56 | return np.array([False] * 4 + [True] * 5 + [False]) 57 | 58 | 59 | @pytest.fixture 60 | def bool_scalar(bool_values: np.ndarray) -> bool: 61 | return 
bool(bool_values[0]) 62 | 63 | 64 | @pytest.fixture 65 | def uncompressed_bool_series(bool_values: np.ndarray) -> pd.Series: 66 | return pd.Series(bool_values) 67 | 68 | 69 | @pytest.fixture 70 | def uncompressed_bool_series2(bool_values: np.ndarray) -> pd.Series: 71 | return pd.Series(bool_values[::-1]) 72 | 73 | 74 | @pytest.fixture 75 | def rle_bool_series(bool_values: np.ndarray) -> pd.Series: 76 | return pd.Series(RLEArray._from_sequence(bool_values)) 77 | 78 | 79 | @pytest.fixture 80 | def rle_bool_series2(bool_values: np.ndarray) -> pd.Series: 81 | # TODO: Use `index=np.arange(len(bool_values)) + 1`. 82 | # For some reason, pandas casts us back to dtype=bool in that case. 83 | return pd.Series(RLEArray._from_sequence(bool_values[::-1])) 84 | 85 | 86 | @pytest.fixture( 87 | params=[ 88 | operator.eq, 89 | operator.ne, 90 | operator.lt, 91 | operator.gt, 92 | operator.le, 93 | operator.ge, 94 | ], 95 | ids=lambda op: str(op.__name__), 96 | ) 97 | def compare_operator(request: SubRequest) -> FCompareOperator: 98 | return cast(FCompareOperator, request.param) 99 | 100 | 101 | @pytest.fixture( 102 | params=[operator.abs, operator.neg, operator.pos], ids=lambda op: str(op.__name__) 103 | ) 104 | def unary_operator(request: SubRequest) -> FUnaryOperator: 105 | return cast(FUnaryOperator, request.param) 106 | 107 | 108 | @pytest.fixture(params=[operator.inv], ids=lambda op: str(op.__name__)) 109 | def unary_bool_operator(request: SubRequest) -> FUnaryBoolOperator: 110 | return cast(FUnaryBoolOperator, request.param) 111 | 112 | 113 | @pytest.fixture( 114 | params=[ 115 | operator.add, 116 | operator.iadd, 117 | ops.radd, 118 | operator.sub, 119 | operator.isub, 120 | ops.rsub, 121 | operator.mul, 122 | operator.imul, 123 | ops.rmul, 124 | operator.truediv, 125 | operator.itruediv, 126 | ops.rtruediv, 127 | operator.floordiv, 128 | operator.ifloordiv, 129 | ops.rfloordiv, 130 | operator.mod, 131 | operator.imod, 132 | ops.rmod, 133 | operator.pow, 134 | operator.ipow, 135 | ops.rpow, 136 | ], 137 | ids=lambda op: str(op.__name__), 138 | ) 139 | def binary_operator(request: SubRequest) -> FBinaryOperator: 140 | return cast(FBinaryOperator, request.param) 141 | 142 | 143 | @pytest.fixture( 144 | params=[ 145 | operator.and_, 146 | operator.iand, 147 | ops.rand_, 148 | operator.or_, 149 | operator.ior, 150 | ops.ror_, 151 | operator.xor, 152 | operator.ixor, 153 | ops.rxor, 154 | ], 155 | ids=lambda op: str(op.__name__), 156 | ) 157 | def binary_bool_operator(request: SubRequest) -> FBinaryBoolOperator: 158 | return cast(FBinaryBoolOperator, request.param) 159 | 160 | 161 | def test_compare_scalar( 162 | rle_series: pd.Series, 163 | uncompressed_series: pd.Series, 164 | scalar: float, 165 | compare_operator: FCompareOperator, 166 | ) -> None: 167 | actual = compare_operator(rle_series, scalar) 168 | assert actual.dtype == RLEDtype(bool) 169 | 170 | expected = compare_operator(uncompressed_series, scalar).astype("RLEDtype[bool]") 171 | pd.testing.assert_series_equal(actual, expected) 172 | 173 | 174 | def test_compare_rle_series( 175 | rle_series: pd.Series, 176 | rle_series2: pd.Series, 177 | uncompressed_series: pd.Series, 178 | uncompressed_series2: pd.Series, 179 | compare_operator: FCompareOperator, 180 | ) -> None: 181 | actual = compare_operator(rle_series, rle_series2) 182 | assert actual.dtype == RLEDtype(bool) 183 | 184 | expected = compare_operator(uncompressed_series, uncompressed_series2).astype( 185 | "RLEDtype[bool]" 186 | ) 187 | pd.testing.assert_series_equal(actual, expected) 
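# ---------------------------------------------------------------------------
# Editor's note (minimal sketch, not part of the original suite): the two
# tests above pin down the rule that comparing RLE-backed series keeps the
# result RLE-compressed, e.g.
#
#     a = pd.Series(RLEArray._from_sequence([1.0, 1.0, 2.0]))
#     b = pd.Series(RLEArray._from_sequence([2.0, 1.0, 2.0]))
#     assert (a == b).dtype == RLEDtype(bool)
#
# (values invented for illustration; the assertion mirrors
# test_compare_scalar / test_compare_rle_series)
# ---------------------------------------------------------------------------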
188 | 189 | 190 | def test_compare_uncompressed_series( 191 | rle_series: pd.Series, 192 | uncompressed_series: pd.Series, 193 | compare_operator: FCompareOperator, 194 | ) -> None: 195 | actual = compare_operator(rle_series, uncompressed_series) 196 | assert actual.dtype == bool 197 | 198 | expected = compare_operator(uncompressed_series, uncompressed_series) 199 | pd.testing.assert_series_equal(actual, expected) 200 | 201 | 202 | def test_binary_operator_scalar( 203 | rle_series: pd.Series, 204 | uncompressed_series: pd.Series, 205 | scalar: float, 206 | binary_operator: FBinaryOperator, 207 | ) -> None: 208 | actual = binary_operator(rle_series, scalar) 209 | assert actual.dtype == RLEDtype(float) 210 | 211 | expected = binary_operator(uncompressed_series, scalar).astype("RLEDtype[float]") 212 | pd.testing.assert_series_equal(actual, expected) 213 | 214 | 215 | def test_binary_operator_rle_series( 216 | rle_series: pd.Series, 217 | rle_series2: pd.Series, 218 | uncompressed_series: pd.Series, 219 | uncompressed_series2: pd.Series, 220 | binary_operator: FBinaryOperator, 221 | ) -> None: 222 | actual = binary_operator(rle_series, rle_series2) 223 | assert actual.dtype == RLEDtype(float) 224 | 225 | expected = binary_operator(uncompressed_series, uncompressed_series2).astype( 226 | "RLEDtype[float]" 227 | ) 228 | pd.testing.assert_series_equal(actual, expected) 229 | 230 | 231 | def test_binary_operator_uncompressed_series( 232 | rle_series: pd.Series, 233 | uncompressed_series: pd.Series, 234 | uncompressed_series2: pd.Series, 235 | binary_operator: FBinaryOperator, 236 | ) -> None: 237 | actual = binary_operator(rle_series, uncompressed_series2) 238 | assert actual.dtype == float 239 | 240 | expected = binary_operator(uncompressed_series, uncompressed_series2) 241 | pd.testing.assert_series_equal(actual, expected) 242 | 243 | 244 | def test_binary_bool_operator_scalar( 245 | rle_bool_series: pd.Series, 246 | uncompressed_bool_series: pd.Series, 247 | bool_scalar: bool, 248 | binary_bool_operator: FBinaryBoolOperator, 249 | ) -> None: 250 | actual = binary_bool_operator(rle_bool_series, bool_scalar) 251 | assert actual.dtype == RLEDtype(bool) 252 | 253 | expected = binary_bool_operator(uncompressed_bool_series, bool_scalar).astype( 254 | RLEDtype(bool) 255 | ) 256 | pd.testing.assert_series_equal(actual, expected) 257 | 258 | 259 | def test_binary_bool_operator_rle_series( 260 | rle_bool_series: pd.Series, 261 | rle_bool_series2: pd.Series, 262 | uncompressed_bool_series: pd.Series, 263 | uncompressed_bool_series2: pd.Series, 264 | binary_bool_operator: FBinaryBoolOperator, 265 | ) -> None: 266 | actual = binary_bool_operator(rle_bool_series, rle_bool_series2) 267 | assert actual.dtype == RLEDtype(bool) 268 | 269 | expected = binary_bool_operator( 270 | uncompressed_bool_series, uncompressed_bool_series2 271 | ).astype(RLEDtype(bool)) 272 | pd.testing.assert_series_equal(actual, expected) 273 | 274 | 275 | def test_binary_bool_operator_uncompressed_series( 276 | rle_bool_series: pd.Series, 277 | uncompressed_bool_series: pd.Series, 278 | uncompressed_bool_series2: pd.Series, 279 | binary_bool_operator: FBinaryBoolOperator, 280 | ) -> None: 281 | actual = binary_bool_operator(rle_bool_series, uncompressed_bool_series2) 282 | assert actual.dtype == bool 283 | 284 | expected = binary_bool_operator(uncompressed_bool_series, uncompressed_bool_series2) 285 | pd.testing.assert_series_equal(actual, expected) 286 | 287 | 288 | def test_unary_operator( 289 | rle_series: pd.Series, 290 | 
uncompressed_series: pd.Series, 291 | unary_operator: FUnaryOperator, 292 | ) -> None: 293 | actual = unary_operator(rle_series) 294 | assert actual.dtype == RLEDtype(float) 295 | 296 | expected = unary_operator(uncompressed_series).astype(RLEDtype(float)) 297 | pd.testing.assert_series_equal(actual, expected) 298 | 299 | 300 | def test_unary_operator_array( 301 | rle_series: pd.Series, 302 | uncompressed_series: pd.Series, 303 | unary_operator: FUnaryOperator, 304 | ) -> None: 305 | actual = unary_operator(rle_series.array) 306 | assert actual.dtype == RLEDtype(float) 307 | 308 | expected = unary_operator(uncompressed_series.array) 309 | npt.assert_array_equal(actual, expected) 310 | 311 | 312 | def test_unary_bool_operator( 313 | rle_bool_series: pd.Series, 314 | uncompressed_bool_series: pd.Series, 315 | unary_bool_operator: FUnaryBoolOperator, 316 | ) -> None: 317 | actual = unary_bool_operator(rle_bool_series) 318 | assert actual.dtype == RLEDtype(bool) 319 | 320 | expected = unary_bool_operator(uncompressed_bool_series).astype(RLEDtype(bool)) 321 | pd.testing.assert_series_equal(actual, expected) 322 | 323 | 324 | def test_unary_bool_operator_array( 325 | rle_bool_series: pd.Series, 326 | uncompressed_bool_series: pd.Series, 327 | unary_bool_operator: FUnaryBoolOperator, 328 | ) -> None: 329 | actual = unary_bool_operator(rle_bool_series.array) 330 | assert actual.dtype == RLEDtype(bool) 331 | 332 | expected = unary_bool_operator(uncompressed_bool_series.array) 333 | npt.assert_array_equal(actual, expected) 334 | 335 | 336 | def test_different_length_raises(values: np.ndarray) -> None: 337 | array1 = RLEArray._from_sequence(values) 338 | array2 = RLEArray._from_sequence(values[:-1]) 339 | with pytest.raises(ValueError, match="arrays have different lengths"): 340 | array1 + array2 341 | -------------------------------------------------------------------------------- /tests/test_pandas.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Generator, cast 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from _pytest.fixtures import SubRequest 7 | from pandas.tests.extension import base 8 | 9 | from rle_array import RLEArray, RLEDtype 10 | from rle_array.types import POSITIONS_DTYPE 11 | 12 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 13 | 14 | 15 | _all_arithmetic_operators = [ 16 | "__add__", 17 | "__radd__", 18 | "__sub__", 19 | "__rsub__", 20 | "__mul__", 21 | "__rmul__", 22 | "__floordiv__", 23 | "__rfloordiv__", 24 | "__truediv__", 25 | "__rtruediv__", 26 | "__pow__", 27 | "__rpow__", 28 | "__mod__", 29 | "__rmod__", 30 | ] 31 | 32 | 33 | @pytest.fixture(params=_all_arithmetic_operators) 34 | def all_arithmetic_operators(request: SubRequest) -> str: 35 | """ 36 | Fixture for dunder names for common arithmetic operations 37 | """ 38 | op = request.param 39 | assert isinstance(op, str) 40 | return op 41 | 42 | 43 | @pytest.fixture(params=["__eq__", "__ne__", "__le__", "__lt__", "__ge__", "__gt__"]) 44 | def all_compare_operators(request: SubRequest) -> str: 45 | """ 46 | Fixture for dunder names for common compare operations 47 | 48 | * >= 49 | * > 50 | * == 51 | * != 52 | * < 53 | * <= 54 | """ 55 | op = request.param 56 | assert isinstance(op, str) 57 | return op 58 | 59 | 60 | _all_boolean_reductions = ["all", "any"] 61 | 62 | 63 | @pytest.fixture(params=_all_boolean_reductions) 64 | def all_boolean_reductions(request: SubRequest) -> str: 65 | """ 66 | Fixture for 
boolean reduction names 67 | """ 68 | op = request.param 69 | assert isinstance(op, str) 70 | return op 71 | 72 | 73 | @pytest.fixture(params=["data", "data_missing"]) 74 | def all_data(request: SubRequest, data: RLEArray, data_missing: RLEArray) -> RLEArray: 75 | """Parametrized fixture giving 'data' and 'data_missing'""" 76 | if request.param == "data": 77 | return data 78 | elif request.param == "data_missing": 79 | return data_missing 80 | else: 81 | raise RuntimeError(f"Unknown all_data type: {request.param}") 82 | 83 | 84 | _all_numeric_reductions = [ 85 | "sum", 86 | "max", 87 | "min", 88 | "mean", 89 | "prod", 90 | "std", 91 | "var", 92 | "median", 93 | "kurt", 94 | "skew", 95 | ] 96 | 97 | 98 | @pytest.fixture(params=_all_numeric_reductions) 99 | def all_numeric_reductions(request: SubRequest) -> str: 100 | """ 101 | Fixture for numeric reduction names 102 | """ 103 | op = request.param 104 | assert isinstance(op, str) 105 | return op 106 | 107 | 108 | @pytest.fixture(params=[True, False]) 109 | def as_array(request: SubRequest) -> bool: 110 | """ 111 | Boolean fixture to support ExtensionDtype _from_sequence method testing. 112 | """ 113 | b = request.param 114 | assert isinstance(b, bool) 115 | return b 116 | 117 | 118 | @pytest.fixture(params=[True, False]) 119 | def as_frame(request: SubRequest) -> bool: 120 | """ 121 | Boolean fixture to support Series and Series.to_frame() comparison testing. 122 | """ 123 | b = request.param 124 | assert isinstance(b, bool) 125 | return b 126 | 127 | 128 | @pytest.fixture(params=[True, False]) 129 | def as_series(request: SubRequest) -> bool: 130 | """ 131 | Boolean fixture to support arr and Series(arr) comparison testing. 132 | """ 133 | b = request.param 134 | assert isinstance(b, bool) 135 | return b 136 | 137 | 138 | @pytest.fixture(params=[True, False]) 139 | def box_in_series(request: SubRequest) -> bool: 140 | """Whether to box the data in a Series""" 141 | b = request.param 142 | assert isinstance(b, bool) 143 | return b 144 | 145 | 146 | @pytest.fixture 147 | def data() -> RLEArray: 148 | """Length-100 array for this type. 149 | * data[0] and data[1] should both be non missing 150 | * data[0] and data[1] should not be equal 151 | """ 152 | return RLEArray( 153 | data=np.asarray([13, -1, -2, 42], dtype=np.float32), 154 | positions=np.asarray([1, 2, 4, 100], dtype=POSITIONS_DTYPE), 155 | ) 156 | 157 | 158 | @pytest.fixture 159 | def data_for_grouping() -> RLEArray: 160 | """Data for factorization, grouping, and unique tests. 161 | Expected to be like [B, B, NA, NA, A, A, B, C] 162 | Where A < B < C and NA is missing 163 | """ 164 | return RLEArray( 165 | data=np.asarray([2.0, np.nan, 1.0, 2.0, 3.0], dtype=np.float32), 166 | positions=np.asarray([2, 4, 6, 7, 8], dtype=POSITIONS_DTYPE), 167 | ) 168 | 169 | 170 | @pytest.fixture 171 | def data_for_sorting() -> RLEArray: 172 | """Length-3 array with a known sort order. 
173 | This should be three items [B, C, A] with 174 | A < B < C 175 | """ 176 | return RLEArray( 177 | data=np.asarray([2.0, 3.0, 1.0], dtype=np.float32), 178 | positions=np.asarray([1, 2, 3], dtype=POSITIONS_DTYPE), 179 | ) 180 | 181 | 182 | @pytest.fixture 183 | def data_for_twos() -> RLEArray: 184 | """Length-100 array in which all the elements are two.""" 185 | return RLEArray( 186 | data=np.asarray([2.0], dtype=np.float32), 187 | positions=np.asarray([100], dtype=POSITIONS_DTYPE), 188 | ) 189 | 190 | 191 | @pytest.fixture 192 | def data_missing() -> RLEArray: 193 | """Length-2 array with [NA, Valid]""" 194 | return RLEArray( 195 | data=np.asarray([np.nan, 42], dtype=np.float32), 196 | positions=np.asarray([1, 2], dtype=POSITIONS_DTYPE), 197 | ) 198 | 199 | 200 | @pytest.fixture 201 | def data_missing_for_sorting() -> RLEArray: 202 | """Length-3 array with a known sort order. 203 | This should be three items [B, NA, A] with 204 | A < B and NA missing. 205 | """ 206 | return RLEArray( 207 | data=np.asarray([2.0, np.nan, 1.0], dtype=np.float32), 208 | positions=np.asarray([1, 2, 3], dtype=POSITIONS_DTYPE), 209 | ) 210 | 211 | 212 | @pytest.fixture 213 | def data_repeated(data: RLEArray) -> Callable[[int], Generator[RLEArray, None, None]]: 214 | """ 215 | Generate many datasets. 216 | Parameters 217 | ---------- 218 | data : fixture implementing `data` 219 | Returns 220 | ------- 221 | Callable[[int], Generator]: 222 | A callable that takes a `count` argument and 223 | returns a generator yielding `count` datasets. 224 | """ 225 | 226 | def gen(count: int) -> Generator[RLEArray, None, None]: 227 | for _ in range(count): 228 | yield data 229 | 230 | return gen 231 | 232 | 233 | @pytest.fixture 234 | def dtype() -> RLEDtype: 235 | """A fixture providing the ExtensionDtype to validate.""" 236 | return RLEDtype(np.float32) 237 | 238 | 239 | @pytest.fixture(params=["ffill", "bfill"]) 240 | def fillna_method(request: SubRequest) -> str: 241 | """ 242 | Parametrized fixture giving method parameters 'ffill' and 'bfill' for 243 | Series.fillna(method=) testing. 244 | """ 245 | op = request.param 246 | assert isinstance(op, str) 247 | return op 248 | 249 | 250 | @pytest.fixture( 251 | params=[ 252 | lambda x: 1, 253 | lambda x: [1] * len(x), 254 | lambda x: pd.Series([1] * len(x)), 255 | lambda x: x, 256 | ], 257 | ids=["scalar", "list", "series", "object"], 258 | ) 259 | def groupby_apply_op(request: SubRequest) -> Callable[..., Any]: 260 | """ 261 | Functions to test groupby.apply(). 262 | """ 263 | return cast(Callable[..., Any], request.param) 264 | 265 | 266 | @pytest.fixture 267 | def na_cmp() -> Callable[[Any, Any], Any]: 268 | """Binary operator for comparing NA values. 269 | Should return a function of two arguments that returns 270 | True if both arguments are (scalar) NA for your type. 271 | By default, uses ``operator.is_`` 272 | """ 273 | return lambda x, y: np.isnan(x) and np.isnan(y) 274 | 275 | 276 | @pytest.fixture 277 | def na_value() -> float: 278 | """The scalar missing value for this type. Default 'None'""" 279 | return np.nan 280 | 281 | 282 | @pytest.fixture(params=[True, False]) 283 | def use_numpy(request: SubRequest) -> bool: 284 | """ 285 | Boolean fixture to support comparison testing of ExtensionDtype array 286 | and numpy array. 
287 | """ 288 | b = request.param 289 | assert isinstance(b, bool) 290 | return b 291 | 292 | 293 | @pytest.fixture(params=[None, lambda x: x]) 294 | def sort_by_key(request: SubRequest) -> Any: 295 | """ 296 | Simple fixture for testing keys in sorting methods. 297 | Tests None (no key) and the identity key. 298 | """ 299 | return request.param 300 | 301 | 302 | class TestArithmeticOps(base.BaseArithmeticOpsTests): 303 | frame_scalar_exc = None 304 | series_array_exc = None 305 | series_scalar_exc = None 306 | 307 | def test_error(self) -> None: 308 | pytest.skip("upstream test is broken?") 309 | 310 | def _check_op( 311 | self, s: Any, op: Any, other: Any, op_name: str, exc: type = NotImplementedError 312 | ) -> None: 313 | # upstream version checks dtype -> we return an RLEDtype 314 | if exc is None: 315 | result = op(s, other) 316 | expected = s.combine(other, op) 317 | self.assert_series_equal(result, expected, check_dtype=False) 318 | else: 319 | with pytest.raises(exc): 320 | op(s, other) 321 | 322 | 323 | class TestBooleanReduce(base.BaseBooleanReduceTests): 324 | pass 325 | 326 | 327 | class TestCasting(base.BaseCastingTests): 328 | pass 329 | 330 | 331 | class TestConstructors(base.BaseConstructorsTests): 332 | pass 333 | 334 | 335 | class TestDtype(base.BaseDtypeTests): 336 | pass 337 | 338 | 339 | class TestGetitem(base.BaseGetitemTests): 340 | pass 341 | 342 | 343 | class TestGroupby(base.BaseGroupbyTests): 344 | pass 345 | 346 | 347 | class TestInterface(base.BaseInterfaceTests): 348 | pass 349 | 350 | 351 | class TestMethods(base.BaseMethodsTests): 352 | def test_combine_le(self) -> None: 353 | pytest.skip("upstream test is broken?") 354 | 355 | 356 | class TestMissing(base.BaseMissingTests): 357 | def test_isna(self) -> None: 358 | pytest.skip("upstream test is broken") 359 | 360 | 361 | class TestNumericReduce(base.BaseNumericReduceTests): 362 | pass 363 | 364 | 365 | class TestPrinting(base.BasePrintingTests): 366 | pass 367 | 368 | 369 | class TestReshaping(base.BaseReshapingTests): 370 | pass 371 | 372 | 373 | class TestSetitem(base.BaseSetitemTests): 374 | pass 375 | 376 | 377 | class TestComparisonOps(base.BaseComparisonOpsTests): 378 | def _compare_other(self, s: Any, data: Any, op_name: str, other: Any) -> None: 379 | # upstream version looks pretty broken... 
380 | op = self.get_op_from_name(op_name) 381 | if op_name == "__eq__": 382 | assert getattr(data, op_name)(other) is NotImplemented 383 | assert not op(s, other).all() 384 | else: 385 | assert getattr(data, op_name)(other) is NotImplemented 386 | 387 | def test_compare_scalar(self, data: RLEArray, all_compare_operators: str) -> None: 388 | pytest.skip("upstream test is broken: comparison with scalar works") 389 | -------------------------------------------------------------------------------- /tests/test_reduce.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from _pytest.fixtures import SubRequest 5 | 6 | from rle_array import RLEDtype 7 | 8 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 9 | 10 | 11 | @pytest.fixture(params=["single", "multi", "empty", "sparse"]) 12 | def data_orig(request: SubRequest) -> pd.Series: 13 | if request.param == "single": 14 | return pd.Series([1], dtype=int) 15 | elif request.param == "multi": 16 | return pd.Series([1, 1, 2, 3, 1, 1], dtype=int) 17 | elif request.param == "empty": 18 | return pd.Series([], dtype=int) 19 | elif request.param == "sparse": 20 | return pd.Series([1, 1, np.nan, np.nan, 1, 1], dtype=float) 21 | else: 22 | raise ValueError(f"Unknown data type: {request.param}") 23 | 24 | 25 | @pytest.fixture(params=["single", "multi", "empty"]) 26 | def data_orig_bool(request: SubRequest) -> pd.Series: 27 | if request.param == "single": 28 | return pd.Series([False], dtype=bool) 29 | elif request.param == "multi": 30 | return pd.Series([False, False, True, False], dtype=bool) 31 | elif request.param == "empty": 32 | return pd.Series([], dtype=bool) 33 | else: 34 | raise ValueError(f"Unknown data type: {request.param}") 35 | 36 | 37 | @pytest.fixture 38 | def data_rle(data_orig: pd.Series) -> pd.Series: 39 | return data_orig.astype(RLEDtype(data_orig.dtype)) 40 | 41 | 42 | @pytest.fixture 43 | def data_rle_bool(data_orig_bool: pd.Series) -> pd.Series: 44 | return data_orig_bool.astype(RLEDtype(data_orig_bool.dtype)) 45 | 46 | 47 | @pytest.fixture(params=[True, False]) 48 | def skipna(request: SubRequest) -> bool: 49 | b = request.param 50 | assert isinstance(b, bool) 51 | return b 52 | 53 | 54 | @pytest.fixture( 55 | params=["min", "max", "mean", "median", "prod", "skew", "std", "sum", "var", "kurt"] 56 | ) 57 | def name(request: SubRequest) -> str: 58 | n = request.param 59 | assert isinstance(n, str) 60 | return n 61 | 62 | 63 | @pytest.fixture(params=["any", "all"]) 64 | def name_bool(request: SubRequest) -> str: 65 | n = request.param 66 | assert isinstance(n, str) 67 | return n 68 | 69 | 70 | @pytest.fixture(params=["max", "mean", "median", "min", "prod", "std", "sum", "var"]) 71 | def numpy_op(request: SubRequest) -> str: 72 | n = request.param 73 | assert isinstance(n, str) 74 | return n 75 | 76 | 77 | @pytest.fixture(params=["all", "any"]) 78 | def numpy_op_bool(request: SubRequest) -> str: 79 | op = request.param 80 | assert isinstance(op, str) 81 | return op 82 | 83 | 84 | @pytest.fixture(params=["mean", "std", "var"]) 85 | def numpy_op_with_dtype(request: SubRequest) -> str: 86 | op = request.param 87 | assert isinstance(op, str) 88 | return op 89 | 90 | 91 | def test_reduce( 92 | data_orig: pd.Series, data_rle: pd.Series, skipna: bool, name: str 93 | ) -> None: 94 | f_orig = getattr(data_orig, name) 95 | f_rle = getattr(data_rle, name) 96 | result_orig = f_orig(skipna=skipna) 97 | result_rle = f_rle(skipna=skipna) 98 
| assert ( 99 | (np.isnan(result_orig) & np.isnan(result_rle)) | (result_orig == result_rle) 100 | ).all() 101 | # don't check type here since pandas does some magic casting from numpy to python 102 | 103 | 104 | def test_reduce_bool( 105 | data_orig_bool: pd.Series, data_rle_bool: pd.Series, name_bool: str 106 | ) -> None: 107 | f_orig = getattr(data_orig_bool, name_bool) 108 | f_rle = getattr(data_rle_bool, name_bool) 109 | result_orig = f_orig() 110 | result_rle = f_rle() 111 | assert (result_orig == result_rle).all() 112 | # don't check type here since pandas does some magic casting from numpy to python 113 | 114 | 115 | def test_array_numpy_bool_axis_notimplemented( 116 | data_rle_bool: pd.Series, numpy_op_bool: str 117 | ) -> None: 118 | f = getattr(data_rle_bool.array, numpy_op_bool) 119 | with pytest.raises(NotImplementedError, match="Only axis=0 is supported."): 120 | f(axis=2) 121 | 122 | 123 | def test_array_numpy_bool_out_notimplemented( 124 | data_rle_bool: pd.Series, numpy_op_bool: str 125 | ) -> None: 126 | f = getattr(data_rle_bool.array, numpy_op_bool) 127 | out = data_rle_bool.array.copy() 128 | with pytest.raises(NotImplementedError, match="out parameter is not supported."): 129 | f(out=out) 130 | 131 | 132 | def test_array_reduction_not_implemented(data_rle: pd.Series) -> None: 133 | with pytest.raises(NotImplementedError, match="reduction foo is not implemented."): 134 | data_rle.array._reduce(name="foo") 135 | 136 | 137 | def test_array_numpy_bool( 138 | data_orig_bool: pd.Series, data_rle_bool: pd.Series, numpy_op_bool: str 139 | ) -> None: 140 | f = getattr(np, numpy_op_bool) 141 | result_orig = f(data_rle_bool.array) 142 | result_rle = f(data_rle_bool.array) 143 | assert result_orig == result_rle 144 | assert type(result_orig) == type(result_rle) 145 | 146 | 147 | def test_array_numpy(data_orig: pd.Series, data_rle: pd.Series, numpy_op: str) -> None: 148 | f = getattr(np, numpy_op) 149 | result_orig = f(data_orig.array) 150 | result_rle = f(data_rle.array) 151 | assert (pd.isna(result_orig) and pd.isna(result_rle)) or (result_orig == result_rle) 152 | if len(data_orig) > 0: 153 | assert type(result_orig) == type(result_rle) 154 | else: 155 | # pandas might use pd.NA, while we still use float, see https://github.com/pandas-dev/pandas/issues/35475 156 | if isinstance(result_orig, type(pd.NA)): 157 | assert type(result_rle) == float 158 | else: 159 | assert type(result_orig) == type(result_rle) 160 | 161 | 162 | def test_array_numpy_axis_notimplemented(data_rle: pd.Series, numpy_op: str) -> None: 163 | f = getattr(data_rle.array, numpy_op) 164 | with pytest.raises(NotImplementedError, match="Only axis=0 is supported."): 165 | f(axis=2) 166 | 167 | 168 | def test_array_numpy_out_notimplemented(data_rle: pd.Series, numpy_op: str) -> None: 169 | f = getattr(data_rle.array, numpy_op) 170 | out = data_rle.array.copy() 171 | with pytest.raises(NotImplementedError, match="out parameter is not supported."): 172 | f(out=out) 173 | 174 | 175 | def test_array_numpy_dtype(data_rle: pd.Series, numpy_op_with_dtype: str) -> None: 176 | f = getattr(np, numpy_op_with_dtype) 177 | with pytest.raises(NotImplementedError, match="dtype parameter is not supported."): 178 | f(data_rle.array, dtype=np.float16) 179 | -------------------------------------------------------------------------------- /tests/test_regressions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Misc collection of regression tests. 
3 | """ 4 | import pickle 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pytest 9 | from numpy import testing as npt 10 | from pandas.core.dtypes.common import ensure_int_or_float 11 | 12 | from rle_array import RLEArray, RLEDtype 13 | 14 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 15 | 16 | 17 | def test_object_isna() -> None: 18 | array = RLEArray._from_sequence(["foo", None], dtype=object) 19 | actual = array.isna() 20 | expected = np.asarray([False, True]) 21 | npt.assert_equal(actual, expected) 22 | 23 | 24 | def test_mean_divisor_overflow() -> None: 25 | # https://github.com/JDASoftwareGroup/rle-array/issues/22 26 | array = RLEArray._from_sequence([1] * 256, dtype=np.uint8) 27 | assert array.mean() == 1 28 | 29 | 30 | def test_pickle() -> None: 31 | array = RLEArray._from_sequence([1]) 32 | 33 | # roundtrip 34 | s = pickle.dumps(array) 35 | array2 = pickle.loads(s) 36 | npt.assert_array_equal(array, array2) 37 | 38 | # views must not be linked (A) 39 | array2_orig = array2.copy() 40 | array[:] = 2 41 | npt.assert_array_equal(array2, array2_orig) 42 | 43 | # views must not be linked (B) 44 | array_orig = array.copy() 45 | array2[:] = 3 46 | npt.assert_array_equal(array, array_orig) 47 | 48 | 49 | def test_inplace_update() -> None: 50 | array = RLEArray._from_sequence([1], dtype=np.int64) 51 | array[[True]] = 2 52 | 53 | expected = np.array([2], dtype=np.int64) 54 | npt.assert_array_equal(array, expected) 55 | 56 | assert array._dtype._dtype == np.int64 57 | assert array._data.dtype == np.int64 58 | 59 | 60 | def test_append_mixed() -> None: 61 | actual = pd.concat( 62 | [pd.Series([1], dtype=np.int8), pd.Series([1], dtype=RLEDtype(np.int8))] 63 | ) 64 | assert actual.dtype == np.int8 65 | 66 | 67 | def test_bool_ensure_int_or_float() -> None: 68 | array = RLEArray._from_sequence([False, True], dtype=np.bool_) 69 | actual = ensure_int_or_float(array) 70 | 71 | expected = np.array([0, 1], dtype=np.int64) 72 | assert actual.dtype == expected.dtype 73 | npt.assert_array_equal(actual, expected) 74 | 75 | 76 | def test_groupby_bool_first() -> None: 77 | df = pd.DataFrame({"x": pd.Series([True, True], dtype=RLEDtype(bool)), "g": 1}) 78 | series = df.groupby("g")["x"].first() 79 | assert series.dtype == RLEDtype(bool) 80 | 81 | expected = RLEArray._from_sequence([True]) 82 | npt.assert_array_equal(series.array, expected) 83 | 84 | 85 | def test_from_sequence_bool() -> None: 86 | array = RLEArray._from_sequence( 87 | np.array([0, 1], dtype=np.int64), dtype=RLEDtype(bool) 88 | ) 89 | npt.assert_array_equal(array, np.array([False, True])) 90 | 91 | array = RLEArray._from_sequence( 92 | np.array([0.0, 1.0], dtype=np.float64), dtype=RLEDtype(bool) 93 | ) 94 | npt.assert_array_equal(array, np.array([False, True])) 95 | 96 | with pytest.raises(TypeError, match="Need to pass bool-like values"): 97 | RLEArray._from_sequence(np.array([1, 2], dtype=np.int64), dtype=RLEDtype(bool)) 98 | 99 | with pytest.raises(TypeError, match="Need to pass bool-like values"): 100 | RLEArray._from_sequence(np.array([-1, 1], dtype=np.int64), dtype=RLEDtype(bool)) 101 | 102 | with pytest.raises(TypeError, match="Masked booleans are not supported"): 103 | RLEArray._from_sequence( 104 | np.array([np.nan, 1.0], dtype=np.float64), dtype=RLEDtype(bool) 105 | ) 106 | 107 | 108 | def test_groupby_bool_sum() -> None: 109 | # Cython routines for integer addition are not available, so we need to accept floats here. 
110 | df = pd.DataFrame({"x": pd.Series([True, True], dtype=RLEDtype(bool)), "g": 1}) 111 | series = df.groupby("g")["x"].sum() 112 | assert series.dtype == np.float64 113 | 114 | expected = np.array([2], dtype=np.float64) 115 | npt.assert_array_equal(series.to_numpy(), expected) 116 | 117 | 118 | def test_factorize_int() -> None: 119 | array = RLEArray._from_sequence([42, -10, -10], dtype=RLEDtype(np.int32)) 120 | codes_actual, uniques_actual = array.factorize() 121 | 122 | codes_expected = np.array([0, 1, 1], dtype=np.int64) 123 | assert codes_actual.dtype == codes_expected.dtype 124 | npt.assert_array_equal(codes_actual, codes_expected) 125 | 126 | uniques_expected = RLEArray._from_sequence([42, -10], dtype=np.int32) 127 | assert uniques_actual.dtype == uniques_expected.dtype 128 | npt.assert_array_equal(uniques_actual, uniques_expected) 129 | -------------------------------------------------------------------------------- /tests/test_slicing.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, cast 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from rle_array._slicing import NormalizedSlice 7 | 8 | 9 | class TestConstructor: 10 | def test_ok_simple(self) -> None: 11 | s = NormalizedSlice(start=1, stop=11, step=2, container_length=100) 12 | assert s.start == 1 13 | assert s.stop == 11 14 | assert s.step == 2 15 | assert s.container_length == 100 16 | 17 | def test_ok_start_at_zero(self) -> None: 18 | NormalizedSlice(start=0, stop=10, step=2, container_length=100) 19 | 20 | def test_ok_stop_at_modulo_end(self) -> None: 21 | NormalizedSlice(start=0, stop=12, step=3, container_length=10) 22 | 23 | def test_ok_stop_at_modulo_begin(self) -> None: 24 | NormalizedSlice(start=0, stop=-3, step=-3, container_length=10) 25 | 26 | def test_ok_zero_length(self) -> None: 27 | NormalizedSlice(start=0, stop=0, step=1, container_length=0) 28 | 29 | def test_fail_start_none(self) -> None: 30 | with pytest.raises(TypeError, match="start must be int but is None"): 31 | NormalizedSlice( 32 | start=cast(int, None), stop=10, step=2, container_length=100 33 | ) 34 | 35 | def test_fail_stop_none(self) -> None: 36 | with pytest.raises(TypeError, match="stop must be int but is None"): 37 | NormalizedSlice(start=1, stop=cast(int, None), step=2, container_length=100) 38 | 39 | def test_fail_step_none(self) -> None: 40 | with pytest.raises(TypeError, match="step must be int but is None"): 41 | NormalizedSlice( 42 | start=1, stop=10, step=cast(int, None), container_length=100 43 | ) 44 | 45 | def test_fail_container_length_none(self) -> None: 46 | with pytest.raises(TypeError, match="container_length must be int but is None"): 47 | NormalizedSlice(start=1, stop=10, step=2, container_length=cast(int, None)) 48 | 49 | def test_fail_step_zero(self) -> None: 50 | with pytest.raises(ValueError, match="step cannot be zero"): 51 | NormalizedSlice(start=1, stop=10, step=0, container_length=100) 52 | 53 | def test_fail_start_negative(self) -> None: 54 | with pytest.raises( 55 | ValueError, match=r"start \(-1\) must be in \[0,100\) but is not" 56 | ): 57 | NormalizedSlice(start=-1, stop=10, step=1, container_length=100) 58 | 59 | def test_fail_start_large(self) -> None: 60 | with pytest.raises( 61 | ValueError, match=r"start \(100\) must be in \[0,100\) but is not" 62 | ): 63 | NormalizedSlice(start=100, stop=10, step=1, container_length=100) 64 | 65 | def test_fail_stop_small(self) -> None: 66 | with pytest.raises( 67 | ValueError, match=r"stop \(-2\) must 
be in \[-1,101\) but is not" 68 | ): 69 | NormalizedSlice(start=2, stop=-2, step=-1, container_length=100) 70 | 71 | def test_fail_stop_large(self) -> None: 72 | with pytest.raises( 73 | ValueError, match=r"stop \(102\) must be in \[-1,101\) but is not" 74 | ): 75 | NormalizedSlice(start=2, stop=102, step=1, container_length=100) 76 | 77 | def test_fail_container_length_negative(self) -> None: 78 | with pytest.raises( 79 | ValueError, 80 | match=r"container_length \(-1\) must be greater or equal to zero", 81 | ): 82 | NormalizedSlice(start=2, stop=102, step=1, container_length=-1) 83 | 84 | def test_fail_container_empty_start_fail(self) -> None: 85 | with pytest.raises( 86 | ValueError, match="for empty containers, start must be 0 but is 1" 87 | ): 88 | NormalizedSlice(start=1, stop=0, step=1, container_length=0) 89 | 90 | def test_fail_container_empty_stop_fail(self) -> None: 91 | with pytest.raises( 92 | ValueError, match="for empty containers, stop must be 0 but is 1" 93 | ): 94 | NormalizedSlice(start=0, stop=1, step=1, container_length=0) 95 | 96 | def test_fail_container_empty_step_fail(self) -> None: 97 | with pytest.raises( 98 | ValueError, match="for empty containers, step must be 1 but is 2" 99 | ): 100 | NormalizedSlice(start=0, stop=0, step=2, container_length=0) 101 | 102 | def test_fail_forward_slice_not_forward(self) -> None: 103 | with pytest.raises( 104 | ValueError, 105 | match="for forward slices, stop must be greater or equal to start", 106 | ): 107 | NormalizedSlice(start=1, stop=0, step=1, container_length=100) 108 | 109 | def test_fail_backward_slice_not_backward(self) -> None: 110 | with pytest.raises( 111 | ValueError, 112 | match="for backward slices, start must be greater or equal to stop", 113 | ): 114 | NormalizedSlice(start=0, stop=1, step=-1, container_length=100) 115 | 116 | def test_fail_slice_empty_start(self) -> None: 117 | with pytest.raises( 118 | ValueError, match="for empty slices, start and stop must be 0 but are 1" 119 | ): 120 | NormalizedSlice(start=1, stop=1, step=1, container_length=100) 121 | 122 | def test_fail_slice_empty_step(self) -> None: 123 | with pytest.raises( 124 | ValueError, match="for empty slices, step must be 1 but is 2" 125 | ): 126 | NormalizedSlice(start=0, stop=0, step=2, container_length=100) 127 | 128 | def test_fail_distance_not_modulo(self) -> None: 129 | with pytest.raises( 130 | ValueError, 131 | match="The distance between start and stop most be divisible by the step size", 132 | ): 133 | NormalizedSlice(start=0, stop=10, step=3, container_length=100) 134 | 135 | 136 | class TestFrozen: 137 | def test_start(self) -> None: 138 | s = NormalizedSlice(start=1, stop=11, step=2, container_length=100) 139 | with pytest.raises(AttributeError, match="can't set attribute"): 140 | s.start = 2 # type: ignore 141 | 142 | def test_stop(self) -> None: 143 | s = NormalizedSlice(start=1, stop=11, step=2, container_length=100) 144 | with pytest.raises(AttributeError, match="can't set attribute"): 145 | s.stop = 2 # type: ignore 146 | 147 | def test_step(self) -> None: 148 | s = NormalizedSlice(start=1, stop=11, step=2, container_length=100) 149 | with pytest.raises(AttributeError, match="can't set attribute"): 150 | s.step = 3 # type: ignore 151 | 152 | def test_container_length(self) -> None: 153 | s = NormalizedSlice(start=1, stop=11, step=2, container_length=100) 154 | with pytest.raises(AttributeError, match="can't set attribute"): 155 | s.container_length = 3 # type: ignore 156 | 157 | 158 | def test_repr() -> None: 159 | s = 
NormalizedSlice(start=1, stop=11, step=2, container_length=100) 160 | assert repr(s) == "NormalizedSlice(start=1, stop=11, step=2, container_length=100)" 161 | 162 | 163 | @pytest.mark.parametrize( 164 | "s, expected", 165 | [ 166 | ( # empty 167 | # s 168 | NormalizedSlice(start=0, stop=0, step=1, container_length=0), 169 | # expected 170 | 0, 171 | ), 172 | ( # simple, forward 173 | # s 174 | NormalizedSlice(start=0, stop=10, step=1, container_length=100), 175 | # expected 176 | 10, 177 | ), 178 | ( # simple, backward 179 | # s 180 | NormalizedSlice(start=9, stop=-1, step=-1, container_length=100), 181 | # expected 182 | 10, 183 | ), 184 | ( # even, forward 185 | # s 186 | NormalizedSlice(start=0, stop=10, step=2, container_length=100), 187 | # expected 188 | 5, 189 | ), 190 | ( # even, backward 191 | # s 192 | NormalizedSlice(start=9, stop=-1, step=-2, container_length=100), 193 | # expected 194 | 5, 195 | ), 196 | ( # complex, forward 197 | # s 198 | NormalizedSlice(start=10, stop=22, step=3, container_length=100), 199 | # expected 200 | 4, 201 | ), 202 | ( # complex, backward 203 | # s 204 | NormalizedSlice(start=19, stop=7, step=-3, container_length=100), 205 | # expected 206 | 4, 207 | ), 208 | ], 209 | ) 210 | def test_len(s: NormalizedSlice, expected: int) -> None: 211 | assert len(s) == expected 212 | 213 | 214 | class TestFromSlice: 215 | def test_fail_slice_wrong_type(self) -> None: 216 | with pytest.raises(TypeError, match="slice must be a slice but is str"): 217 | NormalizedSlice.from_slice(container_length=10, s=cast(slice, "foo")) 218 | 219 | def test_fail_slice_start_wrong_type(self) -> None: 220 | with pytest.raises( 221 | TypeError, match="slice start must be int or None but is str" 222 | ): 223 | NormalizedSlice.from_slice(container_length=10, s=slice("foo", 20, 2)) 224 | 225 | def test_fail_slice_stop_wrong_type(self) -> None: 226 | with pytest.raises( 227 | TypeError, match="slice stop must be int or None but is str" 228 | ): 229 | NormalizedSlice.from_slice(container_length=10, s=slice(2, "foo", 2)) 230 | 231 | def test_fail_slice_step_wrong_type(self) -> None: 232 | with pytest.raises( 233 | TypeError, match="slice step must be int or None but is str" 234 | ): 235 | NormalizedSlice.from_slice(container_length=10, s=slice(2, 20, "foo")) 236 | 237 | def test_fail_step_zero(self) -> None: 238 | with pytest.raises(ValueError, match="slice step cannot be zero"): 239 | NormalizedSlice.from_slice(container_length=10, s=slice(2, 10, 0)) 240 | 241 | def test_fail_container_length_wrong_type(self) -> None: 242 | with pytest.raises( 243 | TypeError, match="container_length must be an int but is str" 244 | ): 245 | NormalizedSlice.from_slice( 246 | container_length=cast(int, "foo"), s=slice(2, 10, 2) 247 | ) 248 | 249 | def test_fail_container_length_negative(self) -> None: 250 | with pytest.raises(ValueError, match="container_length cannot be negative"): 251 | NormalizedSlice.from_slice(container_length=-1, s=slice(2, 10, 2)) 252 | 253 | @pytest.mark.parametrize( 254 | "container_length, s, expected", 255 | [ 256 | ( # empty 257 | # container_length 258 | 0, 259 | # s 260 | None, 261 | # expected 262 | NormalizedSlice(start=0, stop=0, step=1, container_length=0), 263 | ), 264 | ( # implicit full via None 265 | # container_length 266 | 100, 267 | # s 268 | None, 269 | # expected 270 | NormalizedSlice(start=0, stop=100, step=1, container_length=100), 271 | ), 272 | ( # explicit full via slice 273 | # container_length 274 | 100, 275 | # s 276 | slice(None, None, None), 277 | # 
expected 278 | NormalizedSlice(start=0, stop=100, step=1, container_length=100), 279 | ), 280 | ( # explicit full 281 | # container_length 282 | 100, 283 | # s 284 | slice(0, 100, 1), 285 | # expected 286 | NormalizedSlice(start=0, stop=100, step=1, container_length=100), 287 | ), 288 | ( # full reverse 289 | # container_length 290 | 100, 291 | # s 292 | slice(None, None, -1), 293 | # expected 294 | NormalizedSlice(start=99, stop=-1, step=-1, container_length=100), 295 | ), 296 | ( # start negative 297 | # container_length 298 | 100, 299 | # s 300 | slice(-20, None, None), 301 | # expected 302 | NormalizedSlice(start=80, stop=100, step=1, container_length=100), 303 | ), 304 | ( # start negative overflow container 305 | # container_length 306 | 100, 307 | # s 308 | slice(-1000, None, None), 309 | # expected 310 | NormalizedSlice(start=0, stop=100, step=1, container_length=100), 311 | ), 312 | ( # stop negative 313 | # container_length 314 | 100, 315 | # s 316 | slice(None, -20, None), 317 | # expected 318 | NormalizedSlice(start=0, stop=80, step=1, container_length=100), 319 | ), 320 | ( # stop negative overflow container 321 | # container_length 322 | 100, 323 | # s 324 | slice(None, -1000, None), 325 | # expected 326 | NormalizedSlice(start=0, stop=0, step=1, container_length=100), 327 | ), 328 | ( # stop negative overflow start 329 | # container_length 330 | 100, 331 | # s 332 | slice(10, -1000, None), 333 | # expected 334 | NormalizedSlice(start=0, stop=0, step=1, container_length=100), 335 | ), 336 | ( # stop negative overflow start reverse 337 | # container_length 338 | 100, 339 | # s 340 | slice(10, -10, -1), 341 | # expected 342 | NormalizedSlice(start=0, stop=0, step=1, container_length=100), 343 | ), 344 | ( # modulo normalization forward 345 | # container_length 346 | 10, 347 | # s 348 | slice(0, 10, 3), 349 | # expected 350 | NormalizedSlice(start=0, stop=12, step=3, container_length=10), 351 | ), 352 | ( # modulo normalization forward, empty 353 | # container_length 354 | 10, 355 | # s 356 | slice(0, 0, 3), 357 | # expected 358 | NormalizedSlice(start=0, stop=0, step=1, container_length=10), 359 | ), 360 | ( # modulo normalization backward 361 | # container_length 362 | 10, 363 | # s 364 | slice(0, -1000, -3), 365 | # expected 366 | NormalizedSlice(start=0, stop=-3, step=-3, container_length=10), 367 | ), 368 | ( # modulo normalization backward, empty 369 | # container_length 370 | 10, 371 | # s 372 | slice(0, 0, -3), 373 | # expected 374 | NormalizedSlice(start=0, stop=0, step=1, container_length=10), 375 | ), 376 | ( # numpy.int64 377 | # container_length 378 | np.int64(100), 379 | # s 380 | slice(np.int64(0), np.int64(100), np.int64(1)), 381 | # expected 382 | NormalizedSlice(start=0, stop=100, step=1, container_length=100), 383 | ), 384 | ], 385 | ) 386 | def test_ok( 387 | self, container_length: int, s: Optional[slice], expected: NormalizedSlice 388 | ) -> None: 389 | actual = NormalizedSlice.from_slice(container_length, s) 390 | assert type(actual) == NormalizedSlice 391 | assert actual.start == expected.start 392 | assert type(actual.start) == int 393 | assert actual.stop == expected.stop 394 | assert type(actual.stop) == int 395 | assert actual.step == expected.step 396 | assert type(actual.step) == int 397 | assert actual.container_length == expected.container_length 398 | assert type(actual.container_length) == int 399 | 400 | 401 | class TestProject: 402 | def test_fail_no_normalizedslice(self) -> None: 403 | s1 = NormalizedSlice(start=0, stop=10, step=1, 
container_length=100) 404 | s2 = slice(1, 2, 1) 405 | with pytest.raises( 406 | TypeError, match="child must be NormalizedSlice but is slice" 407 | ): 408 | s1.project(cast(NormalizedSlice, s2)) 409 | 410 | def test_fail_len_diff(self) -> None: 411 | s1 = NormalizedSlice(start=0, stop=10, step=1, container_length=100) 412 | s2 = NormalizedSlice(start=0, stop=10, step=1, container_length=20) 413 | with pytest.raises( 414 | ValueError, 415 | match=r"container_length of child \(20\) must be length of parent \(10\)", 416 | ): 417 | s1.project(s2) 418 | 419 | @pytest.mark.parametrize( 420 | "s1, s2, expected", 421 | [ 422 | ( # simple full take 423 | # s1 424 | NormalizedSlice(start=0, stop=10, step=1, container_length=100), 425 | # s2 426 | NormalizedSlice(start=0, stop=10, step=1, container_length=10), 427 | # expected 428 | NormalizedSlice(start=0, stop=10, step=1, container_length=100), 429 | ), 430 | ( # reverse reverse 431 | # s1 432 | NormalizedSlice(start=9, stop=-1, step=-1, container_length=100), 433 | # s2 434 | NormalizedSlice(start=9, stop=-1, step=-1, container_length=10), 435 | # expected 436 | NormalizedSlice(start=0, stop=10, step=1, container_length=100), 437 | ), 438 | ( # two modulos 439 | # s1 440 | NormalizedSlice(start=2, stop=29, step=3, container_length=100), 441 | # s2 442 | NormalizedSlice(start=1, stop=7, step=3, container_length=9), 443 | # expected 444 | NormalizedSlice(start=5, stop=23, step=9, container_length=100), 445 | ), 446 | ( # take empty 447 | # s1 448 | NormalizedSlice(start=1, stop=9, step=2, container_length=100), 449 | # s2 450 | NormalizedSlice(start=0, stop=0, step=1, container_length=4), 451 | # expected 452 | NormalizedSlice(start=0, stop=0, step=1, container_length=100), 453 | ), 454 | ], 455 | ) 456 | def test_ok( 457 | self, s1: NormalizedSlice, s2: NormalizedSlice, expected: NormalizedSlice 458 | ) -> None: 459 | actual = s1.project(s2) 460 | assert type(actual) == NormalizedSlice 461 | assert actual.start == expected.start 462 | assert actual.stop == expected.stop 463 | assert actual.step == expected.step 464 | assert actual.container_length == expected.container_length 465 | 466 | 467 | @pytest.mark.parametrize( 468 | "s, expected", 469 | [ 470 | ( # full take 471 | # s 472 | NormalizedSlice(start=0, stop=100, step=1, container_length=100), 473 | # expected 474 | None, 475 | ), 476 | ( # full reverse 477 | # s 478 | NormalizedSlice(start=99, stop=-1, step=-1, container_length=100), 479 | # expected 480 | slice(None, None, -1), 481 | ), 482 | ( # only start 483 | # s 484 | NormalizedSlice(start=1, stop=100, step=1, container_length=100), 485 | # expected 486 | slice(1, None, None), 487 | ), 488 | ( # only stop 489 | # s 490 | NormalizedSlice(start=0, stop=99, step=1, container_length=100), 491 | # expected 492 | slice(None, 99, None), 493 | ), 494 | ( # only step 495 | # s 496 | NormalizedSlice(start=0, stop=100, step=2, container_length=100), 497 | # expected 498 | slice(None, None, 2), 499 | ), 500 | ( # complex 501 | # s 502 | NormalizedSlice(start=1, stop=22, step=3, container_length=100), 503 | # expected 504 | slice(1, 22, 3), 505 | ), 506 | ], 507 | ) 508 | def test_to_slice(s: NormalizedSlice, expected: Optional[slice]) -> None: 509 | actual = s.to_slice() 510 | if expected is None: 511 | assert actual is None 512 | else: 513 | assert isinstance(actual, slice) 514 | assert type(actual) == slice 515 | assert actual.start == expected.start 516 | assert actual.stop == expected.stop 517 | assert actual.step == expected.step 518 | 
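519 | 520 | # Illustrative round-trip sketch (an added example, not part of the original 521 | # suite): a normalized slice should select exactly the same elements as the 522 | # raw slice it was built from; `to_slice` may return None for a full take, 523 | # hence the fallback to slice(None). 524 | def test_round_trip_sketch() -> None: 525 | data = list(range(10)) 526 | raw = slice(1, 9, 3) 527 | normalized = NormalizedSlice.from_slice(container_length=len(data), s=raw) 528 | assert len(normalized) == len(data[raw]) 529 | recovered = normalized.to_slice() 530 | effective = slice(None) if recovered is None else recovered 531 | assert data[effective] == data[raw]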
-------------------------------------------------------------------------------- /tests/test_testing.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import List, cast 3 | 4 | import pandas as pd 5 | import pytest 6 | from _pytest.fixtures import SubRequest 7 | from pandas import testing as pdt 8 | 9 | from rle_array.testing import ( 10 | const_col, 11 | dim_col, 12 | generate_example, 13 | generate_test_dataframe, 14 | ) 15 | 16 | 17 | @pytest.mark.parametrize( 18 | "dims, expected", 19 | [ 20 | ( 21 | # dims 22 | [1], 23 | # expected 24 | "const_1", 25 | ), 26 | ( 27 | # dims 28 | [1, 2], 29 | # expected 30 | "const_1_2", 31 | ), 32 | ( 33 | # dims 34 | [2, 1], 35 | # expected 36 | "const_1_2", 37 | ), 38 | ], 39 | ) 40 | def test_const_col(dims: List[int], expected: str) -> None: 41 | actual = const_col(dims) 42 | assert actual == expected 43 | 44 | 45 | @pytest.mark.parametrize( 46 | "d, expected", 47 | [ 48 | ( 49 | # d 50 | 1, 51 | # expected 52 | "dim_1", 53 | ), 54 | ( 55 | # d 56 | 2, 57 | # expected 58 | "dim_2", 59 | ), 60 | ], 61 | ) 62 | def test_dim_col(d: int, expected: str) -> None: 63 | actual = dim_col(d) 64 | assert actual == expected 65 | 66 | 67 | SIZE = 4 68 | N_DIMS = 3 69 | 70 | 71 | class TestGenerateTestDataFrame: 72 | @pytest.fixture 73 | def df(self) -> pd.DataFrame: 74 | return generate_test_dataframe(n_dims=N_DIMS, size=SIZE) 75 | 76 | @pytest.fixture(params=list(range(N_DIMS))) 77 | def d(self, request: SubRequest) -> int: 78 | i = request.param 79 | assert isinstance(i, int) 80 | return i 81 | 82 | @pytest.fixture( 83 | params=list( 84 | itertools.chain( 85 | *( 86 | itertools.combinations(range(N_DIMS), r) 87 | for r in range(1, N_DIMS + 1) 88 | ) 89 | ) 90 | ) 91 | ) 92 | def dims(self, request: SubRequest) -> List[int]: 93 | return cast(List[int], request.param) 94 | 95 | def test_len(self, df: pd.DataFrame) -> None: 96 | assert len(df) == SIZE ** N_DIMS 97 | 98 | def test_index(self, df: pd.DataFrame) -> None: 99 | pdt.assert_index_equal(df.index, pd.RangeIndex(0, len(df))) 100 | assert isinstance(df.index, pd.RangeIndex) 101 | 102 | def test_dim_nunique(self, df: pd.DataFrame, d: int) -> None: 103 | assert df[dim_col(d)].nunique() == SIZE 104 | 105 | def test_dim_value_counts(self, df: pd.DataFrame, d: int) -> None: 106 | assert (df[dim_col(d)].value_counts() == SIZE ** (N_DIMS - 1)).all() 107 | 108 | def test_dims_sorted(self, df: pd.DataFrame, d: int) -> None: 109 | delta = df[dim_col(d)].values[1:] - df[dim_col(d)].values[:-1] 110 | assert ((delta == 0) | (delta == 1) | (delta == -(SIZE - 1))).all() 111 | 112 | def test_const_nunique(self, df: pd.DataFrame, dims: List[int]) -> None: 113 | assert df[const_col(dims)].nunique() == SIZE ** len(dims) 114 | 115 | def test_const_value_counts(self, df: pd.DataFrame, dims: List[int]) -> None: 116 | assert ( 117 | df[const_col(dims)].value_counts() == SIZE ** (N_DIMS - len(dims)) 118 | ).all() 119 | 120 | def test_cols_sorted(self, df: pd.DataFrame) -> None: 121 | assert list(df.columns) == sorted(df.columns) 122 | 123 | 124 | def test_generate_example() -> None: 125 | df = generate_example() 126 | assert len(df) == 2000 ** 2 127 | assert list(df.columns) == [ 128 | "date", 129 | "month", 130 | "year", 131 | "city", 132 | "country", 133 | "avg_temp", 134 | "rain", 135 | "mood", 136 | ] 137 | -------------------------------------------------------------------------------- /tests/test_ufunc.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from numpy import testing as npt 5 | 6 | from rle_array import RLEArray 7 | 8 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 9 | 10 | 11 | @pytest.fixture 12 | def array_orig() -> np.ndarray: 13 | return np.array([1, 1, 2, 1], dtype=np.int32) 14 | 15 | 16 | @pytest.fixture 17 | def array_rle(array_orig: np.ndarray) -> RLEArray: 18 | return RLEArray._from_sequence(array_orig) 19 | 20 | 21 | def test_square(array_orig: np.ndarray, array_rle: RLEArray) -> None: 22 | expected = np.square(array_orig) 23 | actual = np.square(array_rle) 24 | npt.assert_array_equal(actual, expected) 25 | 26 | 27 | @pytest.mark.parametrize("out_is_rle", [False, True]) 28 | def test_square_out( 29 | array_orig: np.ndarray, array_rle: RLEArray, out_is_rle: bool 30 | ) -> None: 31 | out_orig = np.array([0] * len(array_orig), dtype=array_orig.dtype) 32 | if out_is_rle: 33 | out_rle = RLEArray._from_sequence(out_orig) 34 | else: 35 | out_rle = out_orig.copy() 36 | 37 | np.square(array_orig, out=out_orig) 38 | np.square(array_rle, out=out_rle) 39 | 40 | npt.assert_array_equal(out_orig, out_rle) 41 | 42 | 43 | def test_add_at(array_orig: np.ndarray, array_rle: RLEArray) -> None: 44 | expected = np.add.at(array_orig, [0, 2], 10) 45 | actual = np.add.at(array_rle, [0, 2], 10) 46 | assert expected is None 47 | assert actual is None 48 | npt.assert_array_equal(array_orig, array_rle) 49 | 50 | 51 | def test_divmod(array_orig: np.ndarray, array_rle: RLEArray) -> None: 52 | expected1, expected2 = np.divmod(array_orig, 2) 53 | actual1, actual2 = np.divmod(array_rle, 2) 54 | npt.assert_array_equal(actual1, expected1) 55 | npt.assert_array_equal(actual2, expected2) 56 | 57 | 58 | @pytest.mark.parametrize("t", [pd.Series, pd.DataFrame, pd.Index]) 59 | def test_add_unhandled(array_orig: np.ndarray, array_rle: RLEArray, t: type) -> None: 60 | other = t(array_orig) 61 | 62 | # the pandas docs say we should not handle these 63 | assert ( 64 | array_rle.__array_ufunc__(np.add, "__call__", array_rle, other) 65 | is NotImplemented 66 | ) 67 | 68 | 69 | def test_2d_broadcast_add(array_orig: np.ndarray, array_rle: RLEArray) -> None: 70 | # ufuncs can result in high-dimensional arrays. In that case, just return a normal NumPy array. 71 | other = np.vstack([array_orig, array_orig]) 72 | assert other.shape == (2, len(array_orig)) 73 | 74 | expected = other * array_orig 75 | actual = other * array_rle 76 | assert actual.dtype == expected.dtype 77 | npt.assert_array_equal(actual, expected) 78 | 79 | 80 | def test_2d_broadcast_divmod(array_orig: np.ndarray, array_rle: RLEArray) -> None: 81 | # ufuncs can result in high-dimensional arrays. In that case, just return a normal NumPy array. 
82 | other = np.vstack([array_orig, array_orig]) 83 | assert other.shape == (2, len(array_orig)) 84 | 85 | expected1, expected2 = np.divmod(other, array_orig) 86 | actual1, actual2 = np.divmod(other, array_rle) 87 | assert actual1.dtype == expected1.dtype 88 | assert actual2.dtype == expected2.dtype 89 | npt.assert_array_equal(actual1, expected1) 90 | npt.assert_array_equal(actual2, expected2) 91 | 92 | 93 | def test_mixed_typing_mul(array_orig: np.ndarray, array_rle: RLEArray) -> None: 94 | actual = array_orig * array_rle 95 | 96 | expected = array_orig * array_orig 97 | assert actual.dtype == expected.dtype 98 | npt.assert_array_equal(actual, expected) 99 | 100 | 101 | def test_mixed_typing_divmod(array_orig: np.ndarray, array_rle: RLEArray) -> None: 102 | actual1, actual2 = np.divmod(array_orig, array_rle) 103 | 104 | expected1, expected2 = np.divmod(array_orig, array_orig) 105 | assert actual1.dtype == expected1.dtype 106 | assert actual2.dtype == expected2.dtype 107 | npt.assert_array_equal(actual1, expected1) 108 | npt.assert_array_equal(actual2, expected2) 109 | -------------------------------------------------------------------------------- /tests/test_view.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import numpy as np 4 | import pytest 5 | from numpy import testing as npt 6 | 7 | from rle_array.array import RLEArray, _ViewAnchor 8 | 9 | pytestmark = pytest.mark.filterwarnings("ignore:performance") 10 | 11 | 12 | def test_view_raises_different_dtype() -> None: 13 | orig = RLEArray._from_sequence(np.arange(10)) 14 | with pytest.raises(ValueError, match="Cannot create view with different dtype"): 15 | orig.view(np.int8) 16 | 17 | 18 | @pytest.mark.parametrize("dtype", ["none", "numpy", "rle"]) 19 | def test_plain_view(dtype: str) -> None: 20 | orig = RLEArray._from_sequence(np.arange(10)) 21 | 22 | if dtype == "none": 23 | dtype_view = None 24 | elif dtype == "numpy": 25 | dtype_view = orig.dtype._dtype 26 | elif dtype == "rle": 27 | dtype_view = orig.dtype 28 | else: 29 | raise ValueError(f"unknown dtype variant {dtype}") 30 | view = orig.view(dtype_view) 31 | 32 | assert view is not orig 33 | assert view.dtype == orig.dtype 34 | npt.assert_array_equal(orig, view) 35 | 36 | orig[[0, 1]] = [100, 101] 37 | view[[0, 8, 9]] = [1000, 108, 109] 38 | 39 | result = RLEArray._from_sequence([1000, 101, 2, 3, 4, 5, 6, 7, 108, 109]) 40 | 41 | npt.assert_array_equal(orig, result) 42 | npt.assert_array_equal(orig, view) 43 | 44 | 45 | def test_view_tree() -> None: 46 | # o-->1-+->11 47 | # +->12 48 | orig = RLEArray._from_sequence(np.arange(10)) 49 | 50 | view1 = orig.view() 51 | view11 = view1.view() 52 | view12 = view1.view() 53 | 54 | assert view1 is not orig 55 | assert view11 is not orig 56 | assert view12 is not orig 57 | assert view11 is not view1 58 | assert view12 is not view1 59 | assert view11 is not view12 60 | npt.assert_array_equal(orig, view1) 61 | npt.assert_array_equal(orig, view11) 62 | npt.assert_array_equal(orig, view12) 63 | 64 | view11[[8, 9]] = [108, 109] 65 | view1[[0, 1, 9]] = [100, 101, 1009] 66 | 67 | result = RLEArray._from_sequence([100, 101, 2, 3, 4, 5, 6, 7, 108, 1009]) 68 | 69 | npt.assert_array_equal(orig, result) 70 | npt.assert_array_equal(orig, view1) 71 | npt.assert_array_equal(orig, view11) 72 | npt.assert_array_equal(orig, view12) 73 | 74 | 75 | def test_slicing() -> None: 76 | N = 100 77 | orig_np = np.arange(N) 78 | orig_rle = RLEArray._from_sequence(orig_np) 79 | 80 | ops = [ 81 | slice(None, 
None, None), 82 | slice(1, -3, 2), 83 | slice(None, None, -1), 84 | slice(None, None, -1), 85 | slice(3, 4, -3), 86 | ] 87 | 88 | arrays_np = [orig_np] 89 | arrays_rle = [orig_rle] 90 | for i, o in enumerate(ops): 91 | last_np = arrays_np[-1] 92 | last_rle = arrays_rle[-1] 93 | npt.assert_array_equal(last_np, last_rle) 94 | 95 | sub_np = last_np[o] 96 | sub_rle = last_rle[o] 97 | 98 | assert sub_np is not last_np 99 | assert sub_rle is not last_rle 100 | npt.assert_array_equal(sub_np, sub_rle) 101 | 102 | delta = np.arange(len(sub_np)) * (N ** i) 103 | 104 | # Avoid `+=` here: it rebinds sub_rle to a fresh result (possibly a plain ndarray) instead of writing through the view, so assign via `[:]`. 105 | sub_np[:] = sub_np + delta 106 | sub_rle[:] = sub_rle + delta 107 | 108 | arrays_np.append(sub_np) 109 | arrays_rle.append(sub_rle) 110 | 111 | for arr_np, arr_rle in zip(arrays_np, arrays_rle): 112 | npt.assert_array_equal(arr_np, arr_rle) 113 | 114 | 115 | def test_anchor_ref() -> None: 116 | try: 117 | gc.disable() 118 | gc.collect() 119 | 120 | n_objects_pre = len( 121 | [o for o in gc.get_objects() if isinstance(o, (RLEArray, _ViewAnchor))] 122 | ) 123 | 124 | RLEArray._from_sequence(np.arange(10)) 125 | 126 | n_objects_post = len( 127 | [o for o in gc.get_objects() if isinstance(o, (RLEArray, _ViewAnchor))] 128 | ) 129 | assert n_objects_pre == n_objects_post 130 | finally: 131 | gc.enable() 132 | --------------------------------------------------------------------------------