├── .github └── workflows │ ├── check_remote_polars_version.yml │ ├── ci.yml │ └── pytest.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── Makefile ├── README.md ├── check_polars_version.py ├── docs ├── abs.md ├── aggregate.md ├── arguments.md ├── arrays.md ├── assets │ ├── array00.png │ ├── array01.png │ ├── image.png │ ├── life_toad.gif │ ├── life_toad_df.gif │ ├── list_chunked_memory_layout.png │ ├── struct_array_memory_layout.png │ ├── struct_example_Point2D.png │ ├── structchunked_fields_memory_layout.png │ └── timings.png ├── branch_mispredictions.md ├── cum_sum.md ├── index.md ├── life_pt1.md ├── life_pt2.md ├── lists.md ├── lists_in_lists_out.md ├── lost_in_space.md ├── noop.md ├── prerequisites.md ├── publishing.md ├── requirements-docs.txt ├── stem.md ├── stringify.md ├── struct.md ├── sum.md ├── vec_of_option.md └── where_to_go.md ├── minimal_plugin ├── __init__.py └── typing.py ├── mkdocs.yml ├── perf.py ├── perf_list.py ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── run.py ├── rust-toolchain.toml ├── rustfmt.toml ├── src ├── arrays.rs ├── expressions.rs └── lib.rs └── test_plugin.py /.github/workflows/check_remote_polars_version.yml: -------------------------------------------------------------------------------- 1 | name: Remote polars version check 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.ref }} 13 | cancel-in-progress: true 14 | 15 | permissions: 16 | contents: read 17 | 18 | env: 19 | RUSTFLAGS: "-Dwarnings" 20 | 21 | jobs: 22 | polars_version_check: 23 | runs-on: ubuntu-latest 24 | strategy: 25 | matrix: 26 | target: [x86_64] 27 | python-version: ["3.11", "3.12"] 28 | steps: 29 | - uses: actions/checkout@v3 30 | - uses: actions/setup-python@v4 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | 34 | - name: Check polars version 
used by the remote cookiecutter template 35 | run: python check_polars_version.py 36 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | on: 3 | push: 4 | branches: 5 | - master 6 | - main 7 | permissions: 8 | contents: write 9 | jobs: 10 | deploy: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Configure Git Credentials 15 | run: | 16 | git config user.name github-actions[bot] 17 | git config user.email 41898282+github-actions[bot]@users.noreply.github.com 18 | - uses: actions/setup-python@v4 19 | with: 20 | python-version: 3.x 21 | - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 22 | - uses: actions/cache@v3 23 | with: 24 | key: mkdocs-material-${{ env.cache_id }} 25 | path: .cache 26 | restore-keys: | 27 | mkdocs-material- 28 | - run: pip install mkdocs-material 29 | - run: mkdocs gh-deploy --force 30 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by maturin v1.4.0 2 | # To update, run 3 | # 4 | # maturin generate-ci github --pytest 5 | # 6 | name: CI 7 | 8 | on: 9 | push: 10 | branches: 11 | - main 12 | - master 13 | tags: 14 | - '*' 15 | pull_request: 16 | workflow_dispatch: 17 | 18 | concurrency: 19 | group: ${{ github.workflow }}-${{ github.ref }} 20 | cancel-in-progress: true 21 | 22 | permissions: 23 | contents: read 24 | 25 | # Make sure CI fails on all warnings, including Clippy lints 26 | env: 27 | RUSTFLAGS: "-Dwarnings" 28 | 29 | jobs: 30 | linux_tests: 31 | runs-on: ubuntu-latest 32 | strategy: 33 | matrix: 34 | target: [x86_64] 35 | python-version: ["3.9", "3.10", "3.11"] 36 | steps: 37 | - uses: actions/checkout@v3 38 | - uses: actions/setup-python@v4 39 | with: 40 | 
python-version: ${{ matrix.python-version }} 41 | 42 | - name: Set up Rust 43 | run: rustup show 44 | - uses: mozilla-actions/sccache-action@v0.0.3 45 | - run: curl -LsSf https://astral.sh/uv/install.sh | sh 46 | - run: uv venv --seed 47 | - run: . .venv/bin/activate && uv pip install -r requirements.txt -r requirements-dev.txt 48 | - run: make pre-commit 49 | - run: make install 50 | - run: make test 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/codespell-project/codespell 3 | rev: v2.2.5 4 | hooks: 5 | - id: codespell 6 | args: [--ignore-words-list=crate] 7 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 
55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | https://www.linkedin.com/in/marcogorelli/. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 
99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | # Name of the project goes here 3 | # Note - it should be the same as the folder which you store your code in! 4 | name = "minimal_plugin" 5 | version = "0.1.0" 6 | edition = "2021" 7 | 8 | [lib] 9 | # Name of the project goes here 10 | # Note - it should be the same as the folder which you store your code in! 
11 | name = "minimal_plugin" 12 | crate-type= ["cdylib"] 13 | 14 | [dependencies] 15 | pyo3 = { version = "0.23", features = ["extension-module", "abi3-py38"] } 16 | pyo3-polars = { version = "0.20", features = ["derive", "dtype-struct", "dtype-decimal", "dtype-array"] } 17 | serde = { version = "1", features = ["derive"] } 18 | polars = { version = "0.46.0", features = ["dtype-struct"], default-features = false } 19 | polars-arrow = { version = "0.46.0", default-features = false } 20 | polars-core = { version = "0.46.0", features = ["dtype-array"], default-features = false } 21 | polars-sql = { version = "0.46.0", default-features = false } 22 | reverse_geocoder = "4.1.1" 23 | num-traits = "0.2.19" 24 | # rust-stemmers = "1.2.0" 25 | 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Marco Edward Gorelli 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | SHELL=/bin/bash 3 | 4 | install: 5 | unset CONDA_PREFIX && \ 6 | source .venv/bin/activate && maturin develop -m Cargo.toml 7 | 8 | install-release: 9 | unset CONDA_PREFIX && \ 10 | source .venv/bin/activate && maturin develop --release -m Cargo.toml 11 | 12 | pre-commit: 13 | cargo +nightly fmt --all --manifest-path Cargo.toml && cargo clippy --all-features --manifest-path Cargo.toml 14 | .venv/bin/python -m ruff format minimal_plugin test_plugin.py 15 | .venv/bin/python -m ruff check minimal_plugin test_plugin.py 16 | 17 | run: install 18 | source .venv/bin/activate && python run.py 19 | 20 | run-release: install-release 21 | source .venv/bin/activate && python run.py 22 | 23 | test: 24 | source .venv/bin/activate && pytest test_plugin.py 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # How you (yes, you!) can write a Polars Plugin 2 | 3 |

4 | image 8 |

9 | 10 | - ✅ Unlock super-high performance 11 | - ✅ Have a tonne of fun 12 | - ✅ Impress everybody with your superpowers 13 | 14 | This repository is meant to accompany the tutorial from https://marcogorelli.github.io/polars-plugins-tutorial/. 15 | 16 | Logo 17 | ---- 18 | 19 | Thanks to [Olha Urdeichuk](https://www.fiverr.com/olhaurdeichuk) for the illustration. 20 | 21 | Funding 22 | ------- 23 | 24 | Thank you to [Quansight Labs](https://labs.quansight.org/) for having provided support to this project during their internships 25 | programme, during which [Bruno Conde](https://github.com/condekind) made very significant contributions to chapters on Strings, 26 | performance (`Vec<Option<T>>` vs. `Vec<T>`), nested data types (Array, List, Struct), the CI process, and the "can we run doom?" extra. 27 | -------------------------------------------------------------------------------- /check_polars_version.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import tomllib 4 | import re 5 | from typing import Any 6 | 7 | 8 | # Remote Cargo.toml from which we extract the version of 'polars' 9 | remote_cargo_url = r"https://raw.githubusercontent.com/MarcoGorelli/cookiecutter-polars-plugins/main/%7B%7B%20cookiecutter.project_slug%20%7D%7D/Cargo.toml" 10 | # Packages that should have the same version as the 'polars' package from above 11 | pinned_packages = ["polars", "polars-arrow", "polars-core"] 12 | 13 | 14 | def fetch_url_content(url: str) -> str | None: 15 | try: 16 | res = subprocess.run( 17 | ["curl", "-s", url], 18 | check=True, 19 | stdout=subprocess.PIPE, 20 | stderr=subprocess.PIPE, 21 | text=True, 22 | ) 23 | return res.stdout 24 | except subprocess.CalledProcessError as e: 25 | print(f"Error fetching URL: {e.stderr}") 26 | return None 27 | 28 | 29 | # Fetch contents of remote (template) Cargo.toml 30 | raw_content = fetch_url_content(remote_cargo_url) 31 | if not raw_content: 32 | print("Fetched 
template Cargo.toml is empty, try again") 33 | sys.exit(1) 34 | 35 | # Load it as a dict with tomllib 36 | try: 37 | template_cargo_toml = tomllib.loads(raw_content) 38 | except tomllib.TOMLDecodeError as e: 39 | print(f"Error decoding template Cargo.toml: {e}") 40 | sys.exit(1) 41 | 42 | # Store remote (template) polars version 43 | template_polars_version: str = template_cargo_toml["dependencies"]["polars"]["version"] 44 | 45 | # Load local Cargo.toml as a dict 46 | local_cargo_toml: dict[str, Any] 47 | with open("Cargo.toml", mode="rb") as local_cargo_toml_file: 48 | try: 49 | local_cargo_toml = tomllib.load(local_cargo_toml_file) 50 | except tomllib.TOMLDecodeError as e: 51 | print(f"Error decoding local Cargo.toml: {e}") 52 | sys.exit(1) 53 | 54 | # Check each local pkg that should be pinned with the same ver. as the remote 55 | for pkg in pinned_packages: 56 | version = local_cargo_toml["dependencies"][pkg]["version"] 57 | if version != template_polars_version: 58 | print(f"{pkg=} {version=} does not match {template_polars_version=}") 59 | sys.exit(1) 60 | 61 | 62 | # Additionally, check other locations that reference polars version 63 | def find_local_polars_reference() -> str | None: 64 | """ 65 | This will output a string with the format: 66 | 67 | 68 | ... 
69 | """ 70 | try: 71 | res = subprocess.run( 72 | ["grep", "-rEZ", "--exclude-dir=target", r"^[+-]?polars = ", "."], 73 | check=True, 74 | stdout=subprocess.PIPE, 75 | stderr=subprocess.PIPE, 76 | text=True, 77 | ) 78 | return res.stdout 79 | except subprocess.CalledProcessError: 80 | print("Error running `grep -rEZ --exclude-dir=target '^[+-]?polars = ' .`") 81 | return None 82 | 83 | 84 | grep_result = find_local_polars_reference() 85 | if not grep_result: 86 | print("Error running grep, try again") 87 | sys.exit(1) 88 | 89 | # Iterate each non-empty line of the grep result 90 | for line in [ln for ln in grep_result.split("\n") if ln.strip()]: 91 | # File name and line contents are separated with a null byte 92 | filename, line = line.split("\0") 93 | 94 | # Use a group to capture the exact version present in the line 95 | m = re.search(r'\bversion = "([^"]+)"', line) 96 | if not m: 97 | print(f"Error extracting version from package in {filename}: {line}") 98 | sys.exit(1) 99 | if (ver := m.group(1)) != template_polars_version: 100 | print( 101 | f"Error in {filename}: local {ver=} does not " 102 | f"match {template_polars_version=}: {line=}" 103 | ) 104 | sys.exit(1) 105 | -------------------------------------------------------------------------------- /docs/abs.md: -------------------------------------------------------------------------------- 1 | # 2. How to do ABSolutely nothing 2 | 3 | OK, the title's misleading. We won't do "nothing", we'll make an `abs` function 4 | which will work on numeric data. 5 | 6 | We'll do this in phases: 7 | 8 | - `abs_i64` will take the absolute value of each row of an `Int64` column 9 | - `abs_numeric` will take the absolute value of each row in any numeric column 10 | 11 | ## `abs_i64` 12 | 13 | Let's start with the Python side - this is almost the same as what 14 | we did for `noop`, we'll just change the names. 
Please add this to 15 | `minimal_plugin/__init__.py`, right below the definition of `noop`: 16 | ```python 17 | def abs_i64(expr: IntoExprColumn) -> pl.Expr: 18 | return register_plugin_function( 19 | args=[expr], 20 | plugin_path=LIB, 21 | function_name="abs_i64", 22 | is_elementwise=True, 23 | ) 24 | ``` 25 | 26 | Then, please add this to `src/expressions.rs`, right below the Rust 27 | definition of `noop`: 28 | 29 | ```Rust 30 | #[polars_expr(output_type=Int64)] 31 | fn abs_i64(inputs: &[Series]) -> PolarsResult<Series> { 32 | let s = &inputs[0]; 33 | let ca: &Int64Chunked = s.i64()?; 34 | // NOTE: there's a faster way of implementing `abs_i64`, which we'll 35 | // cover in section 7. 36 | let out: Int64Chunked = ca.apply(|opt_v: Option<i64>| opt_v.map(|v: i64| v.abs())); 37 | Ok(out.into_series()) 38 | } 39 | ``` 40 | 41 | The general idea here is: 42 | 43 | - Each element `opt_v` can either be `Some(i64)`, or `None`. 44 | If it's `None`, we return `None`, whereas if it's `Some(i64)`, 45 | then we return `Some` of the absolute value of the `i64` value. 46 | 47 | !!!note 48 | 49 | There's a faster way of implementing `abs_i64`, which you'll learn 50 | about in [Branch mispredictions]. 51 | 52 | - We produce a new ChunkedArray, convert it to Series, and return it. 53 | 54 | Let's try this out. Make a Python file `run.py` with the following: 55 | ```python 56 | import polars as pl 57 | import minimal_plugin as mp 58 | 59 | df = pl.DataFrame({ 60 | 'a': [1, -1, None], 61 | 'b': [4.1, 5.2, -6.3], 62 | 'c': ['hello', 'everybody!', '!'] 63 | }) 64 | print(df.with_columns(mp.abs_i64('a').name.suffix('_abs'))) 65 | ``` 66 | Compile it with `maturin develop` (or `maturin develop --release` if you're benchmarking), and run it with `python run.py`. 
67 | If it outputs 68 | ``` 69 | shape: (3, 4) 70 | ┌──────┬──────┬────────────┬───────┐ 71 | │ a ┆ b ┆ c ┆ a_abs │ 72 | │ --- ┆ --- ┆ --- ┆ --- │ 73 | │ i64 ┆ f64 ┆ str ┆ i64 │ 74 | ╞══════╪══════╪════════════╪═══════╡ 75 | │ 1 ┆ 4.1 ┆ hello ┆ 1 │ 76 | │ -1 ┆ 5.2 ┆ everybody! ┆ 1 │ 77 | │ null ┆ -6.3 ┆ ! ┆ null │ 78 | └──────┴──────┴────────────┴───────┘ 79 | ``` 80 | then you did everything correctly! 81 | 82 | [Branch mispredictions]: ../branch_mispredictions/ 83 | 84 | ## `abs_numeric` 85 | 86 | The code above unfortunately only supports `Int64` columns. Let's try to 87 | generalise it a bit, so that it can accept any signed numeric column. 88 | 89 | First, add the following definition to `minimal_plugin/__init__.py`: 90 | 91 | ```python 92 | def abs_numeric(expr: IntoExprColumn) -> pl.Expr: 93 | return register_plugin_function( 94 | args=[expr], 95 | plugin_path=LIB, 96 | function_name="abs_numeric", 97 | is_elementwise=True, 98 | ) 99 | ``` 100 | 101 | Then, we'll go back to `src/expressions.rs`. 102 | Paste in the following: 103 | 104 | ```Rust 105 | fn impl_abs_numeric(ca: &Int64Chunked) -> Int64Chunked { 106 | // NOTE: there's a faster way of implementing `abs`, which we'll 107 | // cover in section 7. 108 | ca.apply(|opt_v: Option<i64>| opt_v.map(|v: i64| v.abs())) 109 | } 110 | 111 | #[polars_expr(output_type=Int64)] 112 | fn abs_numeric(inputs: &[Series]) -> PolarsResult<Series> { 113 | let s = &inputs[0]; 114 | let ca: &Int64Chunked = s.i64()?; 115 | let out = impl_abs_numeric(ca); 116 | Ok(out.into_series()) 117 | } 118 | ``` 119 | 120 | Note how it's exactly like `abs_i64`, but `impl_abs_numeric` was 121 | factored out of the `abs_numeric` function. It's not yet generic, 122 | we need to do a bit more work. 123 | The general idea is: 124 | 125 | - each `ChunkedArray` is of some Polars Type `T` (e.g. `Int64`); 126 | - to each Polars Type `T`, there corresponds a Rust native type `T::Native` (e.g. `i64`). 
127 | 128 | Change `impl_abs_numeric` to: 129 | 130 | ```Rust 131 | fn impl_abs_numeric<T>(ca: &ChunkedArray<T>) -> ChunkedArray<T> 132 | where 133 | T: PolarsNumericType, 134 | T::Native: Signed, 135 | { 136 | // NOTE: there's a faster way of implementing `abs`, which we'll 137 | // cover in section 7. 138 | ca.apply(|opt_v: Option<T::Native>| opt_v.map(|v: T::Native| v.abs())) 139 | } 140 | ``` 141 | Make sure to add 142 | ```Rust 143 | use pyo3_polars::export::polars_core::export::num::Signed; 144 | ``` 145 | to the top of the `src/expressions.rs` file. 146 | 147 | We then need to modify `abs_numeric` as follows: 148 | ```Rust 149 | #[polars_expr(output_type_func=same_output_type)] 150 | fn abs_numeric(inputs: &[Series]) -> PolarsResult<Series> { 151 | let s = &inputs[0]; 152 | match s.dtype() { 153 | DataType::Int32 => Ok(impl_abs_numeric(s.i32().unwrap()).into_series()), 154 | DataType::Int64 => Ok(impl_abs_numeric(s.i64().unwrap()).into_series()), 155 | DataType::Float32 => Ok(impl_abs_numeric(s.f32().unwrap()).into_series()), 156 | DataType::Float64 => Ok(impl_abs_numeric(s.f64().unwrap()).into_series()), 157 | dtype => { 158 | polars_bail!(InvalidOperation:format!("dtype {dtype} not \ 159 | supported for abs_numeric, expected Int32, Int64, Float32, Float64.")) 160 | } 161 | } 162 | } 163 | ``` 164 | That's it! Our function is now generic over signed numeric types, 165 | instead of only accepting the `Int64` type. 166 | 167 | Finally, modify the `print` line of `run.py` to be 168 | ```python 169 | print(df.with_columns(mp.abs_numeric(pl.col('a', 'b')).name.suffix('_abs'))) 170 | ``` 171 | 172 | Compile with `maturin develop` (or `maturin develop --release` 173 | if you're benchmarking) and then run with `python run.py`. 
You should 174 | see: 175 | ``` 176 | shape: (3, 5) 177 | ┌──────┬──────┬────────────┬───────┬───────┐ 178 | │ a ┆ b ┆ c ┆ a_abs ┆ b_abs │ 179 | │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ 180 | │ i64 ┆ f64 ┆ str ┆ i64 ┆ f64 │ 181 | ╞══════╪══════╪════════════╪═══════╪═══════╡ 182 | │ 1 ┆ 4.1 ┆ hello ┆ 1 ┆ 4.1 │ 183 | │ -1 ┆ 5.2 ┆ everybody! ┆ 1 ┆ 5.2 │ 184 | │ null ┆ -6.3 ┆ ! ┆ null ┆ 6.3 │ 185 | └──────┴──────┴────────────┴───────┴───────┘ 186 | ``` 187 | Note how we were able to take the absolute value of both `b` (`f64`) 188 | and `a` (`i64`) columns with `abs_numeric`! 189 | -------------------------------------------------------------------------------- /docs/aggregate.md: -------------------------------------------------------------------------------- 1 | # 15. In (the) aggregate 2 | 3 | Enough transforming columns! Let's aggregate them instead. 4 | 5 | A Polars expression is a function from a DataFrame to a Series. So, 6 | how can we possibly write an expression which produces a scalar? 7 | 8 | Simple: 9 | 10 | - write an expression which returns a 1-row Series 11 | - when you register the expression, pass `returns_scalar = True` 12 | 13 | As an example, let's compute the weighted mean of a column, where 14 | the weights are given by a second column. 15 | 16 | ## Hello Python my old friend 17 | 18 | Nothing fancy here: 19 | 20 | ```python 21 | def vertical_weighted_mean(values: IntoExprColumn, weights: IntoExprColumn) -> pl.Expr: 22 | return register_plugin_function( 23 | args=[values, weights], 24 | plugin_path=LIB, 25 | function_name="vertical_weighted_mean", 26 | is_elementwise=False, 27 | returns_scalar=True, 28 | ) 29 | ``` 30 | 31 | ## Rust 32 | 33 | To keep this example's complexity down, let's just limit it to `Float64` columns. 
34 | 35 | ```rust 36 | #[polars_expr(output_type=Float64)] 37 | fn vertical_weighted_mean(inputs: &[Series]) -> PolarsResult<Series> { 38 | let values = &inputs[0].f64()?; 39 | let weights = &inputs[1].f64()?; 40 | let mut numerator = 0.; 41 | let mut denominator = 0.; 42 | values.iter().zip(weights.iter()).for_each(|(v, w)| { 43 | if let (Some(v), Some(w)) = (v, w) { 44 | numerator += v * w; 45 | denominator += w; 46 | } 47 | }); 48 | let result = numerator / denominator; 49 | Ok(Series::new(PlSmallStr::EMPTY, vec![result])) 50 | } 51 | ``` 52 | 53 | ## Run it! 54 | 55 | Put the following in `run.py`: 56 | 57 | ```python 58 | df = pl.DataFrame({ 59 | 'values': [1., 3, 2, 5, 7], 60 | 'weights': [.5, .3, .2, .1, .9], 61 | 'group': ['a', 'a', 'a', 'b', 'b'], 62 | }) 63 | print(df.group_by('group').agg(weighted_mean = mp.vertical_weighted_mean('values', 'weights'))) 64 | ``` 65 | 66 | If you compile with `maturin develop` (or `maturin develop --release` if benchmarking), you'll see: 67 | 68 | ``` 69 | shape: (2, 2) 70 | ┌───────┬───────────────┐ 71 | │ group ┆ weighted_mean │ 72 | │ --- ┆ --- │ 73 | │ str ┆ f64 │ 74 | ╞═══════╪═══════════════╡ 75 | │ b ┆ 6.166667 │ 76 | │ a ┆ 2.333333 │ 77 | └───────┴───────────────┘ 78 | ``` 79 | 80 | Try omitting `returns_scalar=True` when registering the expression - what changes? 81 | -------------------------------------------------------------------------------- /docs/arguments.md: -------------------------------------------------------------------------------- 1 | # 8. I'd like to have an argument, please 2 | 3 | Say you want to rewrite 4 | ```python 5 | def add_suffix(s, *, suffix): 6 | return s + suffix 7 | 8 | s.map_elements(lambda x: add_suffix(x, suffix='-billy')) 9 | ``` 10 | as a plugin. How can you do that? 11 | 12 | We've covered passing in extra columns, but...how about passing extra 13 | keyword arguments? 14 | 15 | We'll do this with `kwargs`. 
In `minimal_plugin/__init__.py`, add the 16 | following: 17 | 18 | ```python 19 | def add_suffix(expr: IntoExprColumn, *, suffix: str) -> pl.Expr: 20 | return register_plugin_function( 21 | args=[expr], 22 | plugin_path=LIB, 23 | function_name="add_suffix", 24 | is_elementwise=True, 25 | kwargs={"suffix": suffix}, 26 | ) 27 | ``` 28 | 29 | In `src/expressions.rs`, we'll then first have to define a struct to hold 30 | our keyword-arguments: 31 | 32 | ```rust 33 | #[derive(Deserialize)] 34 | struct AddSuffixKwargs { 35 | suffix: String, 36 | } 37 | ``` 38 | Make sure to also add 39 | ```rust 40 | use serde::Deserialize; 41 | ``` 42 | to the top of the file. 43 | 44 | Then, we can just pass an argument of this type to an `add_suffix` function, 45 | which is going to be very similar to the good version of `pig_latinnify`: 46 | 47 | ```rust 48 | #[polars_expr(output_type=String)] 49 | fn add_suffix(inputs: &[Series], kwargs: AddSuffixKwargs) -> PolarsResult<Series> { 50 | let s = &inputs[0]; 51 | let ca = s.str()?; 52 | let out = ca.apply_into_string_amortized(|value, output| { 53 | write!(output, "{}{}", value, kwargs.suffix).unwrap(); 54 | }); 55 | Ok(out.into_series()) 56 | } 57 | ``` 58 | 59 | To see it in action, compile with `maturin develop` (or `maturin develop --release` if you're 60 | benchmarking), and then you should be able to put 61 | ```python 62 | import polars as pl 63 | import minimal_plugin as mp 64 | 65 | df = pl.DataFrame({'a': ['bob', 'billy']}) 66 | print(df.with_columns(mp.add_suffix('a', suffix='-billy'))) 67 | ``` 68 | into `run.py`, and run it to get 69 | ``` 70 | shape: (2, 1) 71 | ┌─────────────┐ 72 | │ a │ 73 | │ --- │ 74 | │ str │ 75 | ╞═════════════╡ 76 | │ bob-billy │ 77 | │ billy-billy │ 78 | └─────────────┘ 79 | ``` 80 | You can add multiple keyword-arguments in the same function, just make sure to 81 | include them in the struct which you define on the Rust side.
82 | -------------------------------------------------------------------------------- /docs/arrays.md: -------------------------------------------------------------------------------- 1 | 2 | # 11. ARRAY, captain! 3 | 4 | We've talked about lists, structs, but what about arrays? 5 | 6 | In this section we're gonna cover how to deal with fixed sized arrays, e.g., x and y coordinates of 2d points *in the same column*: 7 | 8 | ```python 9 | points = pl.Series( 10 | "points", 11 | [ 12 | [6.63, 8.35], 13 | [7.19, 4.85], 14 | [2.1, 4.21], 15 | [3.4, 6.13], 16 | ], 17 | dtype=pl.Array(pl.Float64, 2), 18 | ) 19 | df = pl.DataFrame(points) 20 | 21 | print(df) 22 | ``` 23 | 24 | ``` 25 | shape: (4, 1) 26 | ┌───────────────┐ 27 | │ points │ 28 | │ --- │ 29 | │ array[f64, 2] │ 30 | ╞═══════════════╡ 31 | │ [6.63, 8.35] │ 32 | │ [7.19, 4.85] │ 33 | │ [2.1, 4.21] │ 34 | │ [3.4, 6.13] │ 35 | └───────────────┘ 36 | ``` 37 | 38 | Let's get to work - what if we wanted to make a plugin that takes a Series like `points` above, and, likewise, returned a Series of arrays? 39 | Turns out we _can_ do it! But it's a little bit tricky. 40 | 41 | __First of all__, we need to include `features = ["dtype-array"]` in both `pyo3-polars` and `polars-core` in our `Cargo.toml`. 42 | 43 | Now let's create a plugin that calculates the midpoint between a reference point and each point in a Series like the one above. 44 | This should illustrate both how to unpack an array inside our Rust code and also return a Series of the same type. 
45 | 46 | We'll start by registering our plugin: 47 | 48 | ```python 49 | def midpoint_2d(expr: IntoExprColumn, ref_point: tuple[float, float]) -> pl.Expr: 50 | return register_plugin_function( 51 | args=[expr], 52 | plugin_path=Path(__file__).parent, 53 | function_name="midpoint_2d", 54 | is_elementwise=True, 55 | kwargs={"ref_point": ref_point}, 56 | ) 57 | ``` 58 | 59 | As you can see, we included an additional kwarg: `ref_point`, which we annotated with the type `tuple[float, float]`. 60 | In our Rust code, we won't receive it as a tuple, though, it'll also be an array. 61 | This isn't crucial for this example, so just accept it for now. 62 | As you saw in the __arguments__ chapter, we take kwargs by defining a struct for them: 63 | 64 | ```rust 65 | #[derive(Deserialize)] 66 | struct MidPoint2DKwargs { 67 | ref_point: [f64; 2], 68 | } 69 | ``` 70 | 71 | And we can finally move to the actual plugin code: 72 | 73 | ```rust 74 | // We need this to ensure the output is of dtype array.
75 | // Unfortunately, polars plugins do not support something similar to: 76 | // #[polars_expr(output_type=Array)] 77 | pub fn point_2d_output(_: &[Field]) -> PolarsResult<Field> { 78 | Ok(Field::new( 79 | PlSmallStr::from_static("point_2d"), 80 | DataType::Array(Box::new(DataType::Float64), 2), 81 | )) 82 | } 83 | 84 | #[polars_expr(output_type_func=point_2d_output)] 85 | fn midpoint_2d(inputs: &[Series], kwargs: MidPoint2DKwargs) -> PolarsResult<Series> { 86 | let ca: &ArrayChunked = inputs[0].array()?; 87 | let ref_point = kwargs.ref_point; 88 | 89 | let out: ArrayChunked = unsafe { 90 | ca.try_apply_amortized_same_type(|row| { 91 | let s = row.as_ref(); 92 | let ca = s.f64()?; 93 | let out_inner: Float64Chunked = ca 94 | .iter() 95 | .enumerate() 96 | .map(|(idx, opt_val)| { 97 | opt_val.map(|val| { 98 | (val + ref_point[idx]) / 2.0f64 99 | }) 100 | }).collect_trusted(); 101 | Ok(out_inner.into_series()) 102 | })}?; 103 | 104 | Ok(out.into_series()) 105 | } 106 | ``` 107 | 108 | Uh-oh, unsafe, we're doomed! 109 | 110 | Hold on a moment - it's true that we need unsafe here, but let's not freak out. 111 | If we read the docs of `try_apply_amortized_same_type`, we see the following: 112 | 113 | > ```rust 114 | > /// Try apply a closure `F` to each array. 115 | > /// 116 | > /// # Safety 117 | > /// Return series of `F` must has the same dtype and number of elements as input if it is Ok. 118 | > pub unsafe fn try_apply_amortized_same_type<F>(&self, mut f: F) -> PolarsResult<Self> 119 | > where 120 | > F: FnMut(AmortSeries) -> PolarsResult<Series>, 121 | > ``` 122 | 123 | 124 | In this example, we can uphold that contract - we know we're returning a Series with the same number of elements and same dtype as the input! 125 | 126 | Still, the code looks a bit scary, doesn't it? So let's break it down: 127 | 128 | ```rust 129 | let out: ArrayChunked = unsafe { 130 | 131 | // This is similar to apply_values, but it's amortized and made specifically 132 | // for arrays.
133 | ca.try_apply_amortized_same_type(|row| { 134 | let s = row.as_ref(); 135 | // `s` is a Series which contains two elements. 136 | // We unpack it similarly to the way we've been unpacking Series in the 137 | // previous chapters: 138 | // 139 | // Previously we've been doing this to unpack a column we had behind a 140 | // Series - this time, inside this closure, the Series contains the two 141 | // elements composing the "row" (x and y): 142 | let ca = s.f64()?; 143 | 144 | // There are many ways to extract the x and y coordinates from ca. 145 | // Here, we remain idiomatic and consistent with what we've been doing 146 | // in the past - iterate, enumerate and map: 147 | let out_inner: Float64Chunked = ca 148 | .iter() 149 | .enumerate() 150 | .map(|(idx, opt_val)| { 151 | 152 | // We only use map here because opt_val is an Option 153 | opt_val.map(|val| { 154 | 155 | // Here's where the simple logic of calculating a 156 | // midpoint happens. We take the coordinate (`val`) at 157 | // index `idx`, add it to the `idx-th` entry of our 158 | // reference point (which is a coordinate of our point), 159 | // then divide it by two, since a midpoint is the 160 | // average of the two coordinates. 161 | (val + ref_point[idx]) / 2.0f64 162 | }) 163 | // Our map already returns Some or None, so we don't have to 164 | // worry about wrapping the result in, e.g., Some() 165 | }).collect_trusted(); 166 | 167 | // At last, we convert out_inner (which is a Float64Chunked) back to a 168 | // Series 169 | Ok(out_inner.into_series()) 170 | })}?; 171 | 172 | // And finally, we convert our ArrayChunked into a Series, ready to ship to 173 | // Python-land: 174 | Ok(out.into_series()) 175 | ``` 176 | 177 | That's it. What does the result look like?
178 | In `run.py`, we have: 179 | 180 | ```python 181 | import polars as pl 182 | from minimal_plugin import midpoint_2d 183 | 184 | points = pl.Series( 185 | "points", 186 | [ 187 | [6.63, 8.35], 188 | [7.19, 4.85], 189 | [2.1, 4.21], 190 | [3.4, 6.13], 191 | [2.48, 9.26], 192 | [9.41, 7.26], 193 | [7.45, 8.85], 194 | [6.58, 5.22], 195 | [6.05, 5.77], 196 | [8.57, 4.16], 197 | [3.22, 4.98], 198 | [6.62, 6.62], 199 | [9.36, 7.44], 200 | [8.34, 3.43], 201 | [4.47, 7.61], 202 | [4.34, 5.05], 203 | [5.0, 5.05], 204 | [5.0, 5.0], 205 | [2.07, 7.8], 206 | [9.45, 9.6], 207 | [3.1, 3.26], 208 | [4.37, 5.72], 209 | ], 210 | dtype=pl.Array(pl.Float64, 2), 211 | ) 212 | df = pl.DataFrame(points) 213 | 214 | # Now we call our plugin: 215 | result = df.with_columns(midpoints=midpoint_2d("points", ref_point=(5.0, 5.0))) 216 | print(result) 217 | ``` 218 | 219 | Let's compile and run it: 220 | ```shell 221 | maturin develop 222 | 223 | python run.py 224 | ``` 225 | 226 | 🥁: 227 | ``` 228 | shape: (22, 2) 229 | ┌───────────────┬────────────────┐ 230 | │ points ┆ midpoints │ 231 | │ --- ┆ --- │ 232 | │ array[f64, 2] ┆ array[f64, 2] │ 233 | ╞═══════════════╪════════════════╡ 234 | │ [6.63, 8.35] ┆ [5.815, 6.675] │ 235 | │ [7.19, 4.85] ┆ [6.095, 4.925] │ 236 | │ [2.1, 4.21] ┆ [3.55, 4.605] │ 237 | │ [3.4, 6.13] ┆ [4.2, 5.565] │ 238 | │ [2.48, 9.26] ┆ [3.74, 7.13] │ 239 | │ … ┆ … │ 240 | │ [5.0, 5.0] ┆ [5.0, 5.0] │ 241 | │ [2.07, 7.8] ┆ [3.535, 6.4] │ 242 | │ [9.45, 9.6] ┆ [7.225, 7.3] │ 243 | │ [3.1, 3.26] ┆ [4.05, 4.13] │ 244 | │ [4.37, 5.72] ┆ [4.685, 5.36] │ 245 | └───────────────┴────────────────┘ 246 | ``` 247 | 248 | 249 | !!!note 250 | Notice how the dtype remains the same. 251 | As an exercise, try to achieve the same in pure-Python (without Rust plugins) 252 | without explicitly casting the type of the Series. 253 | 254 | Hurray, we did it! 255 | And why exactly go through all this trouble instead of just doing the same thing in pure Python? 256 | For performance of course! 
257 | 258 | _Spoilers ahead if you haven't tried the exercise from the note above_ 259 | 260 | With the following implementation in Python, we can take some measurements: 261 | 262 | ```python 263 | ref_point = (5.0, 5.0) 264 | 265 | def using_plugin(df=df, ref_point=ref_point): 266 | result = df.with_columns(midpoints=midpoint_2d("points", ref_point=ref_point)) 267 | return result 268 | 269 | def midpoint(points:pl.Series) -> pl.Series: 270 | result=[] 271 | for point in points: 272 | result.append([(point[0]+ref_point[0])/2, (point[1]+ref_point[1])/2]) 273 | return pl.Series(result, dtype=pl.Array(pl.Float64, 2)) 274 | 275 | def using_python(df=df, ref_point=ref_point): 276 | result = ( 277 | df.with_columns( 278 | midpoints=pl.col('points').map_batches(midpoint, return_dtype=pl.Array(pl.Float64, 2)) 279 | ) 280 | ) 281 | return result 282 | ``` 283 | 284 | For the sake of brevity, some extra methods to generate and parse an input file were left out of the code above, as well as the `timeit` bits. 285 | By measuring both versions with 1.000.000 points a few times and taking the average, we got the following result: 286 | 287 | ``` 288 | Using plugin: 289 | min: 0.5307095803339811 290 | max: 0.5741689523274545 291 | mean +/- stderr: 0.5524565599986263 +/- 0.0064489015434971925 292 | 293 | Using python: 294 | min: 6.682447870339577 295 | max: 6.99253460233255 296 | mean +/- stderr: 6.808615755191394 +/- 0.03757884107880601 297 | ``` 298 | 299 | A speedup of __12x__, that's a __big win__! 300 | 301 | !!!note 302 | When benchmarking Rust code, remember to use `maturin develop --release`, otherwise the timings will be much slower! 
303 | -------------------------------------------------------------------------------- /docs/assets/array00.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/array00.png -------------------------------------------------------------------------------- /docs/assets/array01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/array01.png -------------------------------------------------------------------------------- /docs/assets/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/image.png -------------------------------------------------------------------------------- /docs/assets/life_toad.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/life_toad.gif -------------------------------------------------------------------------------- /docs/assets/life_toad_df.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/life_toad_df.gif -------------------------------------------------------------------------------- /docs/assets/list_chunked_memory_layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/list_chunked_memory_layout.png 
-------------------------------------------------------------------------------- /docs/assets/struct_array_memory_layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/struct_array_memory_layout.png -------------------------------------------------------------------------------- /docs/assets/struct_example_Point2D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/struct_example_Point2D.png -------------------------------------------------------------------------------- /docs/assets/structchunked_fields_memory_layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/structchunked_fields_memory_layout.png -------------------------------------------------------------------------------- /docs/assets/timings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/timings.png -------------------------------------------------------------------------------- /docs/branch_mispredictions.md: -------------------------------------------------------------------------------- 1 | # 7. Branch mispredictions 2 | 3 | Time to go back to the past. In Section 2, I told you that the 4 | implementation we had of `abs_i64` wasn't the most efficient one 5 | you could possibly write. Time to see how to improve it! 6 | 7 | Which algorithm do you think would win? 8 | 9 | 1. 
for each row: 10 | - check if it's null or not 11 | - if it's not null, calculate its absolute value 12 | 2. for each row: 13 | - calculate its absolute value, even if we don't need it 14 | because it's a null row 15 | 16 | If you've not come across the concept of branch mispredictions 17 | before, then the answer may surprise you, because the second 18 | one is faster here. This is because `.abs` is a very fast 19 | operation, and wasting time checking whether each element is null 20 | or not actually slows us down! 21 | 22 | Here's how you can make `abs_i64` faster: 23 | 24 | ```Rust 25 | #[polars_expr(output_type=Int64)] 26 | fn abs_i64(inputs: &[Series]) -> PolarsResult<Series> { 27 | let s = &inputs[0]; 28 | let ca = s.i64()?; 29 | let out = ca.apply_values(|x| x.abs()); 30 | Ok(out.into_series()) 31 | } 32 | ``` 33 | 34 | For operations more complex than `.abs`, it may be that computing the operation 35 | for only the non-null values is cheaper. In general, you should 36 | measure, not guess. 37 | If you're just starting out with plugins and only need to beat 38 | `.map_elements`, then either of these solutions will blow it out 39 | of the water. 40 | 41 | ![](assets/timings.png) 42 | 43 | ## Practice! 44 | 45 | Can you go back and make a faster version of `sum_i64`? 46 | -------------------------------------------------------------------------------- /docs/cum_sum.md: -------------------------------------------------------------------------------- 1 | # 4. Yes we SCAN 2 | 3 | The operations we've seen so far have all been elementwise, e.g.: 4 | 5 | - for each row, we calculated the absolute value 6 | - for each row, we summed the respective values in two columns 7 | 8 | Let's do something (completely) different - instead of working with 9 | each row in isolation, we'll calculate a quantity which depends on the 10 | rows which precede it. 11 | 12 | We're going to implement `cum_sum`.
13 | 14 | ## Python side 15 | 16 | Add this to `minimal_plugin/__init__.py`: 17 | ```python 18 | def cum_sum(expr: IntoExprColumn) -> pl.Expr: 19 | return register_plugin_function( 20 | args=[expr], 21 | plugin_path=LIB, 22 | function_name="cum_sum", 23 | is_elementwise=False, 24 | ) 25 | ``` 26 | Note how, unlike in previous examples, we set `is_elementwise=False`. 27 | You'll see why this is so important at the end of this page. 28 | 29 | ## Rust 30 | 31 | Time to learn a new Rust function: `scan`. 32 | If you're not familiar with it, please take a little break from this tutorial 33 | and [read the scan docs](https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.scan). 34 | 35 | Welcome back! Let's use our newfound scan-superpowers to implement `cum_sum`. Here's what goes into `src/expressions.rs`: 36 | ```Rust 37 | #[polars_expr(output_type_func=same_output_type)] 38 | fn cum_sum(inputs: &[Series]) -> PolarsResult<Series> { 39 | let s = &inputs[0]; 40 | let ca: &Int64Chunked = s.i64()?; 41 | let out: Int64Chunked = ca 42 | .iter() 43 | .scan(0_i64, |state: &mut i64, x: Option<i64>| { 44 | match x { 45 | Some(x) => { 46 | *state += x; 47 | Some(Some(*state)) 48 | }, 49 | None => Some(None), 50 | } 51 | }) 52 | .collect_trusted(); 53 | Ok(out.into_series()) 54 | } 55 | ``` 56 | Make sure to also add 57 | ```Rust 58 | use pyo3_polars::export::polars_core::utils::CustomIterTools; 59 | ``` 60 | to the top of the file. 61 | 62 | The `cum_sum` definition may look complex, but it's not too bad once we 63 | break it down: 64 | 65 | - we hold the running sum in `state` 66 | - we iterate over rows, initialising `state` to be `0` 67 | - if the current row is `Some`, then add the current row's value to `state` and emit the current value of `state` 68 | - if the current row is `None`, then don't modify `state` and emit `None` 69 | 70 | Note how we use `collect_trusted` at the end, rather than `collect`.
71 | `collect` would work as well, but if we know the length of the output 72 | (and we do in this case, `cum_sum` doesn't change the column's length) 73 | then we can safely use `collect_trusted` and save some precious time. 74 | 75 | Let's compile with `maturin develop` (or `maturin develop --release` 76 | if you're benchmarking), change the last line of `run.py` to 77 | ```python 78 | print(df.with_columns(a_cum_sum=mp.cum_sum('a'))) 79 | ``` 80 | and then run `python run.py`: 81 | 82 | ``` 83 | shape: (3, 3) 84 | ┌─────┬──────┬───────────┐ 85 | │ a ┆ b ┆ a_cum_sum │ 86 | │ --- ┆ --- ┆ --- │ 87 | │ i64 ┆ i64 ┆ i64 │ 88 | ╞═════╪══════╪═══════════╡ 89 | │ 1 ┆ 3 ┆ 1 │ 90 | │ 5 ┆ null ┆ 6 │ 91 | │ 2 ┆ -1 ┆ 8 │ 92 | └─────┴──────┴───────────┘ 93 | ``` 94 | 95 | ## Elementwise, my dear Watson 96 | 97 | Why was it so important to set `is_elementwise` correctly? Let's see 98 | with an example. 99 | 100 | Put the following in `run.py`: 101 | ```python 102 | import polars as pl 103 | import minimal_plugin as mp 104 | 105 | df = pl.DataFrame({ 106 | 'a': [1, 2, 3, 4, None, 5], 107 | 'b': [1, 1, 1, 2, 2, 2], 108 | }) 109 | print(df.with_columns(a_cum_sum=mp.cum_sum('a'))) 110 | ``` 111 | 112 | Then, run `python run.py`. 113 | 114 | Finally, go to `minimal_plugin/__init__.py` and change `is_elementwise` 115 | from `False` to `True`, and run `python run.py` again. 116 | 117 | In both cases, you should see the following output: 118 | ``` 119 | shape: (6, 3) 120 | ┌──────┬─────┬───────────┐ 121 | │ a ┆ b ┆ a_cum_sum │ 122 | │ --- ┆ --- ┆ --- │ 123 | │ i64 ┆ i64 ┆ i64 │ 124 | ╞══════╪═════╪═══════════╡ 125 | │ 1 ┆ 1 ┆ 1 │ 126 | │ 2 ┆ 1 ┆ 3 │ 127 | │ 3 ┆ 1 ┆ 6 │ 128 | │ 4 ┆ 2 ┆ 10 │ 129 | │ null ┆ 2 ┆ null │ 130 | │ 5 ┆ 2 ┆ 15 │ 131 | └──────┴─────┴───────────┘ 132 | ``` 133 | which looks correct. So, what's the deal with `is_elementwise`? 134 | 135 | The deal is that we need it in order for window functions / `group_by`s 136 | to be correct. 
Change the last line of `run.py` to 137 | ```python 138 | print(df.with_columns(a_cum_sum=mp.cum_sum('a').over('b'))) 139 | ``` 140 | 141 | Now, we get: 142 | 143 | - with `elementwise=True`: 144 | 145 | ``` 146 | shape: (6, 3) 147 | ┌──────┬─────┬───────────┐ 148 | │ a ┆ b ┆ a_cum_sum │ 149 | │ --- ┆ --- ┆ --- │ 150 | │ i64 ┆ i64 ┆ i64 │ 151 | ╞══════╪═════╪═══════════╡ 152 | │ 1 ┆ 1 ┆ 1 │ 153 | │ 2 ┆ 1 ┆ 3 │ 154 | │ 3 ┆ 1 ┆ 6 │ 155 | │ 4 ┆ 2 ┆ 10 │ 156 | │ null ┆ 2 ┆ null │ 157 | │ 5 ┆ 2 ┆ 15 │ 158 | └──────┴─────┴───────────┘ 159 | ``` 160 | 161 | - with `elementwise=False`: 162 | 163 | ``` 164 | shape: (6, 3) 165 | ┌──────┬─────┬───────────┐ 166 | │ a ┆ b ┆ a_cum_sum │ 167 | │ --- ┆ --- ┆ --- │ 168 | │ i64 ┆ i64 ┆ i64 │ 169 | ╞══════╪═════╪═══════════╡ 170 | │ 1 ┆ 1 ┆ 1 │ 171 | │ 2 ┆ 1 ┆ 3 │ 172 | │ 3 ┆ 1 ┆ 6 │ 173 | │ 4 ┆ 2 ┆ 4 │ 174 | │ null ┆ 2 ┆ null │ 175 | │ 5 ┆ 2 ┆ 9 │ 176 | └──────┴─────┴───────────┘ 177 | ``` 178 | 179 | Only `elementwise=False` actually respected the window! This is why 180 | it's important to set `elementwise` correctly. 181 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # How you (yes, you!) can write a Polars Plugin 2 | 3 | - ✅ Unlock super-high performance 4 | - ✅ Have a tonne of fun 5 | - ✅ Impress everybody with your superpowers 6 | 7 | ![](assets/image.png){: style="width:400px"} 8 | 9 | ## Why? 10 | 11 | Polars is an incredible and groundbreaking Dataframe library, and its expressions API 12 | is simply amazing. Sometimes, however, you need to express really custom business logic 13 | which just isn't in scope for the Polars API. In that situation, people tend to use 14 | `map_elements`, which lets you express anything but also kills most of Polars' benefits. 
15 | 16 | But it doesn't have to be that way - with just basic Rust knowledge and this tutorial, 17 | I postulate that you'll be able to address at least 99% of inefficient `map_elements` tasks! 18 | 19 | ## What will you learn 20 | 21 | - Writing simple single-column elementwise expressions 22 | - Writing complex multi-column non-elementwise expressions which use third-party Rust packages 23 | - How to share your plugin superpowers with others 24 | 25 | ## What are people saying? 26 | 27 | **Nelson Griffiths**, Engineering & ML Lead at Double River Investments | Core Maintainer Functime 28 | 29 | > this was an awesome intro. I am no rust expert, though I have written a few plugins. And I learned quite a bit from this! Having my team read it now as well. Thanks for putting this together. I think more content like this for people who don’t know how to write optimal polars code on the rust side will be really useful for people like me who want to work on plugins! 30 | 31 | **Barak David**, Software Engineer 32 | 33 | > Amazing tutorial! I just created nltk plugin, and experienced X50 speedup! 34 | -------------------------------------------------------------------------------- /docs/life_pt1.md: -------------------------------------------------------------------------------- 1 | 2 | # Extra.1 Well... 3 | 4 | > "No." - _Doom Slayer_ 5 | 6 |
7 | 8 | !!!note 9 | This section is completely optional, and is provided for a bit 10 | of nerdy fun. It is by no means essential, feel free to skip 11 | it if it doesn't interest you! 12 | 13 | Well, someone can, probably. But doom in a dataframe would be kinda hard to play, so let's try something simpler. 14 | [Conway's Game of Life](https://en.wikipedia.org/wiki/Conway%27s_Game_of_Life) is a notorious Cellular Automaton that we could perhaps implement with a plugin. 15 | For science, of course. 16 | 17 | ![Toad pattern with period = 2](assets/life_toad.gif) 18 | 19 | Jokes aside, life allows us to show how a plugin can access elements in both neighbouring rows and columns for each element. 20 | With a little bit of extra Python, we can display things in an almost pretty manner. 21 | 22 | !!!note 23 | For this tutorial, we'll assume you created a new plugin from the 24 | cookiecutter template and named it `game_of_life` 25 | (these steps aren't shown here, since they were already covered at the 26 | very beginning of this series). 27 | 28 | In this section we'll cover the developer side of the plugin (both Python and Rust). 29 | In the next section we'll show how a user can import and use what we developed here. 30 | 31 | ## The Python side 32 | 33 | Let's take a look at what we'll implement first, in `game_of_life/__init__.py`: 34 | 35 | ```python 36 | import fileinput 37 | from collections import OrderedDict 38 | from itertools import tee, islice 39 | from os import PathLike 40 | from pathlib import Path 41 | from typing import Iterable, Any 42 | 43 | import polars as pl 44 | from polars._typing import IntoExprColumn 45 | from polars.plugins import register_plugin_function 46 | 47 | 48 | # Parse a board from a file or stdin 49 | def parse_board(ifile: str | ...) -> list[list[int]]: ... 50 | 51 | # Transpose a list of lists 52 | def _transpose(board: list[list[int]]) -> list[list[int]]: ... 
53 | 54 | # Creates a DataFrame from a list of lists 55 | def board_to_df(board: list[list[int]]) -> pl.DataFrame: ... 56 | 57 | # Helper function to help us deal with corner cases 58 | def _nwise_wrapping(iterable: Iterable[Any], n: int): ... 59 | 60 | # Advance the simulation by n steps 61 | def step(df: pl.DataFrame, n: int = 1): ... 62 | 63 | # Register our plugin 64 | def life_step(left: IntoExprColumn, mid: IntoExprColumn, right: IntoExprColumn) -> pl.Expr: ... 65 | ``` 66 | 67 | Starting with the function to parse a board from a file or stdin: 68 | 69 | ```python 70 | def parse_board( 71 | ifile: ( 72 | str 73 | | bytes 74 | | PathLike[str] 75 | | PathLike[bytes] 76 | | Iterable[str | bytes | PathLike[str] | PathLike[bytes]] 77 | ), 78 | ) -> list[list[int]]: 79 | """ 80 | Converts a board in a file containing only 0s and 1s, e.g.:: 81 | 82 | 0010 83 | 0100 84 | 85 | into: 86 | [[0, 0, 1, 0], [0, 1, 0, 0]] 87 | """ 88 | return [ 89 | [c for ch in ln if (c := int(ch)) in [0, 1]] 90 | for line in fileinput.input(ifile) 91 | if len(ln := line.strip()) > 0 92 | ] 93 | ``` 94 | 95 | Next, we have transpose. Why do we need it, anyway? Because the way a dataframe reads our list of lists is counter-intuitive when constructing it from a dict comprehension. 96 | If we start with an input board like: 97 | 98 | ``` 99 | 0000 100 | 1111 101 | ``` 102 | 103 | without transpose, we'd end up with: 104 | 105 | ``` 106 | >>> import polars as pl 107 | >>> board = [[0,0,0,0],[1,1,1,1]] 108 | >>> pl.DataFrame({f"c{idx}": row for idx, row in enumerate(board)}) 109 | shape: (4, 2) 110 | ┌─────┬─────┐ 111 | │ c0 ┆ c1 │ 112 | │ --- ┆ --- │ 113 | │ i64 ┆ i64 │ 114 | ╞═════╪═════╡ 115 | │ 0 ┆ 1 │ 116 | │ 0 ┆ 1 │ 117 | │ 0 ┆ 1 │ 118 | │ 0 ┆ 1 │ 119 | └─────┴─────┘ 120 | ``` 121 | 122 | Not what we expected _visually_, so we transpose the initial board to have the resulting dataframe match it.
123 | 124 | ```python 125 | def _transpose(board: list[list[int]]) -> list[list[int]]: 126 | return [[row[idx] for row in board] for idx in range(len(board[0]))] 127 | ``` 128 | 129 | Next one is `board_to_df`, which calls `_transpose` and constructs the DataFrame in a similar way to the example above. 130 | The padding detail is just to avoid columns with larger names than others, feel free to ignore it: 131 | 132 | ```python 133 | def board_to_df(board: list[list[int]]) -> pl.DataFrame: 134 | """ 135 | Converts a list of lists of integers (0s and 1s) to a Polars DataFrame. 136 | The inner lists must have the same length. 137 | """ 138 | 139 | # This is done because each row will become a column - the user likely 140 | # expects a dataframe that *visually* matches the input file 141 | board = _transpose(board) 142 | 143 | padding_len = len(str(len(board) - 1)) 144 | board_t_dict = {f"{idx:0{padding_len}}": row for idx, row in enumerate(board)} 145 | return pl.DataFrame( 146 | board_t_dict, 147 | ) 148 | ``` 149 | 150 | Let's skip `_nwise_wrapping` and `step` for now and jump straight to the last function - we'll return to the two we skipped soon: 151 | 152 | !!!note 153 | Don't forget to read the comments! 154 | 155 | ```python 156 | def life_step(left: IntoExprColumn, mid: IntoExprColumn, right: IntoExprColumn) -> pl.Expr: 157 | """ 158 | This is the function that registers the polars plugin. To use it directly, 159 | data must be in the correct format. An interesting way to do so is to use 160 | the same column names as the original data frame, so the resulting df will 161 | have the same shape. See how this is done in the `step(df, n)` function. 162 | """ 163 | return register_plugin_function( 164 | args=[left, mid, right], 165 | plugin_path=LIB, 166 | function_name="life_step", 167 | is_elementwise=False, 168 | ) 169 | ``` 170 | 171 | Ok, plugin registered. How do we use it? We create columns in `step` with `with_columns`. 
172 | And we do so in a way that the new columns will have the exact name as the previously existing ones, so they're overridden. 173 | 174 | But wait, there's something we didn't talk about. 175 | What happens at the border of the board (both vertically and horizontally)? 176 | Do we stop the simulation from propagating there, do we wrap around, or something else? 177 | Many implementations stop the simulation at the border, so let's do it differently, let's wrap around! 178 | 179 | Wait, why are we talking about this here - isn't this a concern to be solved by our plugin in Rust? 180 | Yes, but Python-land is where we name our columns. 181 | So in order to have that nice overriding behavior, we need to address it here. 182 | This is also a hint at what the mysterious `_nwise_wrapping` function does: 183 | 184 | ```python 185 | def _nwise_wrapping(iterable: Iterable[Any], n: int): 186 | """ 187 | Returns overlapping n-tuples from an iterable, wrapping around. This means 188 | the result will have the same length as `iterable`. It also means the first 189 | element(s) will include elements from the end of the iterable, and 190 | likewise, the last element(s) will include elements from the start, e.g.:: 191 | 192 | fn('ABCDE', 3) -> 'EAB', 'ABC', 'BCD', 'CDE', 'DEA' 193 | """ 194 | elements = list(iterable) 195 | to_be_wrapped = elements[-(n - 2) :] + elements + elements[: n - 2] 196 | iterators = tee(to_be_wrapped, n) 197 | return [ 198 | list(z) for z in zip(*(islice(it, i, None) for i, it in enumerate(iterators))) 199 | ] 200 | ``` 201 | 202 | The implementation might look a bit complicated, but the docstring should clarify its goal. 203 | 204 | Now we're only missing `step`, which takes a DataFrame already in the expected format and returns another DataFrame with our plugin applied `n` times to it: 205 | 206 | ```python 207 | def step(df: pl.DataFrame, n: int = 1): 208 | """ 209 | Takes a df and returns df.with_columns(...) 
corresponding to `n` advanced 210 | steps in the simulation 211 | """ 212 | padding_len = len(str(df.width - 1)) 213 | 214 | # colnums: [['{n-1}', '00', '01'], ['00', '01', '02'], ['01', '02', '03'], ... ] 215 | colnums = _nwise_wrapping([f"{idx:0{padding_len}}" for idx in range(df.width)], 3) 216 | 217 | # colnames: ['00', '01', '02', '03', ... , '{n-1}'] 218 | colnames = [cols[1] for cols in colnums] 219 | 220 | # colvalues: [, ... ] 221 | colvalues = [life_step(*tuple(cols)) for cols in colnums] 222 | 223 | for _ in range(n): 224 | df = df.with_columns(**OrderedDict(zip(colnames, colvalues))) 225 | return df 226 | ``` 227 | 228 | We're done with the Python side of things. 229 | And if you're wondering: "what plugin did we actually register with `life_step`?" - 230 | you're totally right to be confused, we didn't touch Rust yet! 231 | Why did we leave it for last? 232 | Because surprisingly, it's much simpler than the Python side, and much shorter too. 233 | 234 | ## Let's get rusty 235 | 236 | What do we need to do? 237 | For each element, we need to look at the sum of the 8 neighbours, then apply the rule to decide whether the element will be dead or alive in the next iteration. 238 | Here's what our entire `src/expressions.rs` looks like: 239 | 240 | ```rust 241 | #![allow(clippy::unused_unit)] 242 | use polars::export::arrow::legacy::utils::CustomIterTools; 243 | use polars::prelude::*; 244 | use pyo3_polars::derive::polars_expr; 245 | 246 | #[polars_expr(output_type=Int64)] 247 | fn life_step(inputs: &[Series]) -> PolarsResult { 248 | let (ca_lf, ca_curr, ca_rt) = (inputs[0].i64()?, inputs[1].i64()?, inputs[2].i64()?); 249 | 250 | /* 251 | We're "counting" on the user not to append or modify the DataFrame created 252 | from the board file. 253 | 254 | In general, this might sound insane, but for our Game of Life, this is not 255 | so unreasonable.
256 | */ 257 | let lf = ca_lf 258 | .cont_slice() 259 | .expect("Expected input to be contiguous (in a single chunk)"); 260 | let mid = ca_curr 261 | .cont_slice() 262 | .expect("Expected input to be contiguous (in a single chunk)"); 263 | let rt = ca_rt 264 | .cont_slice() 265 | .expect("Expected input to be contiguous (in a single chunk)"); 266 | 267 | let len = lf.len(); 268 | 269 | let out: Int64Chunked = mid 270 | .iter() 271 | .enumerate() 272 | .map(|(idx, val)| { 273 | // Neighbours above 274 | let prev_row = if 0 == idx { 275 | lf[len - 1] + mid[len - 1] + rt[len - 1] 276 | } else { 277 | lf[idx - 1] + mid[idx - 1] + rt[idx - 1] 278 | }; 279 | 280 | // Curr row does not include cell in the middle, 281 | // a cell is not a neighbour of itself 282 | let curr_row = lf[idx] + rt[idx]; 283 | 284 | // Neighbours below 285 | let next_row = if len - 1 == idx { 286 | lf[0] + mid[0] + rt[0] 287 | } else { 288 | lf[idx + 1] + mid[idx + 1] + rt[idx + 1] 289 | }; 290 | 291 | // Life logic 292 | Some(match (val, prev_row + curr_row + next_row) { 293 | (1, 2) | (1, 3) => 1, 294 | (0, 3) => 1, 295 | _ => 0, 296 | }) 297 | }) 298 | .collect_trusted(); 299 | Ok(out.into_series()) 300 | } 301 | ``` 302 | 303 | Awesome, now what? If we ignore tests, _as plugin developers_, we could say we're done. 304 | Nothing's happened yet, so how could we be done? 305 | In the next section we'll take a look at how the plugin _user_ would call the functions we made available. 306 | -------------------------------------------------------------------------------- /docs/life_pt2.md: -------------------------------------------------------------------------------- 1 | 2 | # Extra.2 Plugin user 3 | 4 | In the last section we saw what the plugin developers made available for a plugin user. 5 | Now we put on the user's hat and demonstrate that _usage_.
6 | For this, we'll implement a CLI app that will parse a board file provided as an argument, then run a step of the simulation every `delay` seconds (also provided as an argument). 7 | 8 | > Tip: place the code from this section in a separate file, e.g., `run.py`. 9 | 10 | Just like what we did previously, let's look at an overview of what's to come: 11 | 12 | ```python 13 | import argparse 14 | import contextlib 15 | import io 16 | import sys 17 | from time import sleep 18 | 19 | from game_of_life import parse_board, board_to_df, step 20 | import polars as pl 21 | 22 | 23 | class Application: 24 | 25 | # Initialize the board 26 | def __init__(self): ... 27 | 28 | # Printing the application object prints the board 29 | def __str__(self) -> str: ... 30 | 31 | # Run a step of the simulation every `delay` seconds, for `n` maximum steps 32 | def start(self, n, delay, print_df): ... 33 | ``` 34 | 35 | Notice how we're importing `parse_board`, `board_to_df` and `step` from our fully-developed plugin. 36 | This could've been installed with pip! Check the [publishing chapter](publishing.md) for more on this. 37 | 38 | So first things first: `__init__`. 39 | Here we use the stdlib `argparse` module to capture the command line arguments we mentioned above. 40 | Then, we call `board_to_df` with the result of `parse_board`, storing the resulting DataFrame in the `Application` object itself.
41 | 42 | ```python 43 | class Application: 44 | 45 | def __init__(self): 46 | self._args = argparse.Namespace() 47 | cli = argparse.ArgumentParser( 48 | prog="python -m game_of_life", description="Options" 49 | ) 50 | cli.add_argument("-i", "--input", type=str, required=True) 51 | cli.add_argument("-d", "--delay", type=float, default=0.2) 52 | cli.add_argument("-n", "--num-steps", type=int, default=sys.maxsize) 53 | 54 | cli.parse_args(namespace=self._args) 55 | 56 | # [-i] 57 | self.ifile: str = self._args.input 58 | 59 | # [-d] 60 | self.delay: float = self._args.delay 61 | 62 | # [-n] 63 | self.steps: int = self._args.num_steps 64 | 65 | # Creates a pl.DataFrame from the provided file 66 | self.df = board_to_df(parse_board(self.ifile)) 67 | ``` 68 | 69 | Next, an optional but handy detail - we implement `__str__` for `Application` in a way that printing an `Application` object will actually print the DataFrame stored internally: 70 | 71 | ```python 72 | class Application: 73 | 74 | # ... 75 | 76 | def __str__(self) -> str: 77 | res = io.StringIO() 78 | with ( 79 | pl.Config(tbl_rows=-1, tbl_cols=-1), 80 | contextlib.redirect_stdout(res), 81 | ): 82 | print(self.df) 83 | return res.getvalue() 84 | ``` 85 | 86 | The `pl.Config` part just removes the default row and column limits when displaying a DataFrame - otherwise we'd see ellipses (`...`) instead of `1`s and `0`s. 87 | 88 | Finally, `start` is where we display the DataFrame and call `step` to advance the simulation, over and over: 89 | 90 | ```python 91 | class Application: 92 | 93 | # ... 
94 | 95 | def start( 96 | self, 97 | n: int | None = None, 98 | delay: float | None = None, 99 | print_df: bool = True, 100 | ): 101 | if n is None: 102 | n = self.steps 103 | 104 | if delay is None: 105 | delay = self.delay 106 | 107 | if print_df: 108 | print(self) 109 | 110 | iteration_cnt = 0 111 | try: 112 | for _ in range(n): 113 | self.df = step(self.df) 114 | iteration_cnt += 1 115 | if print_df: 116 | # Clear screen 117 | print("\033[2J") 118 | print(self) 119 | sleep(delay) 120 | 121 | except KeyboardInterrupt: 122 | print( 123 | f"\nKeyboard Interrupt: ran for {iteration_cnt} iterations. Aborting..." 124 | ) 125 | print(f"max_num_steps={self._args.num_steps}\ndelay={self._args.delay}") 126 | ``` 127 | 128 | To run the program, we only need two more things - an entry point and an input file. 129 | Create a `toad.txt` in an `input` folder, containing: 130 | 131 | ``` 132 | 00000000000 133 | 00000000000 134 | 00000000000 135 | 00001110000 136 | 00011100000 137 | 00000000000 138 | 00000000000 139 | 00000000000 140 | ``` 141 | 142 | and add this entry point at the end of `run.py`: 143 | 144 | ```python 145 | if __name__ == "__main__": 146 | app = Application() 147 | app.start() 148 | ``` 149 | 150 | Now we can see the results of our work, at last: 151 | 152 | ```shell 153 | # Compile the rust code 154 | maturin develop --release 155 | 156 | # Run the application 157 | python run.py -i input/toad.txt -d 0.3 158 | ``` 159 | 160 | ![Toad pattern with period = 2, running in a dataframe](assets/life_toad_df.gif) 161 | 162 | __Victory!__ 163 | 164 | ## Reference 165 | 166 | The entire code for this plugin, including the user's side, can be found on [GitHub](https://github.com/condekind/life_polars_plugin). 
167 | -------------------------------------------------------------------------------- /docs/lists.md: -------------------------------------------------------------------------------- 1 | # 9.0 Weighted-mean watchers 2 | 3 | According to [one YouTube talk](https://youtu.be/u5mIDz5ldmI?si=4AtnyyAwdVk33bYu), 4 | the `list` namespace is one of Polars' main selling points. 5 | If you're also a fan of it, this section will teach you how to extend it even further. 6 | 7 | ## Motivation 8 | 9 | Say you have 10 | ```python 11 | In [10]: df = pl.DataFrame({ 12 | ...: 'values': [[1, 3, 2], [5, 7]], 13 | ...: 'weights': [[.5, .3, .2], [.1, .9]] 14 | ...: }) 15 | 16 | In [11]: df 17 | Out[11]: 18 | shape: (2, 2) 19 | ┌───────────┬─────────────────┐ 20 | │ values ┆ weights │ 21 | │ --- ┆ --- │ 22 | │ list[i64] ┆ list[f64] │ 23 | ╞═══════════╪═════════════════╡ 24 | │ [1, 3, 2] ┆ [0.5, 0.3, 0.2] │ 25 | │ [5, 7] ┆ [0.1, 0.9] │ 26 | └───────────┴─────────────────┘ 27 | ``` 28 | 29 | Can you calculate the mean of the values in `'values'`, weighted by the values in `'weights'`? 30 | 31 | So: 32 | 33 | - `.5*1 + .3*3 + .2*2 = 1.8` 34 | - `5*.1 + 7*.9 = 6.8` 35 | 36 | I don't know of an easy way to do this with Polars expressions. There probably is a way - but 37 | as you'll see here, it's not that hard to write a plugin, and it's probably faster too. 
38 | 39 | ## Weighted mean 40 | 41 | On the Python side, this'll be similar to `sum_i64`: 42 | 43 | ```python 44 | def weighted_mean(expr: IntoExprColumn, weights: IntoExprColumn) -> pl.Expr: 45 | return register_plugin_function( 46 | args=[expr, weights], 47 | plugin_path=LIB, 48 | function_name="weighted_mean", 49 | is_elementwise=True, 50 | ) 51 | ``` 52 | 53 | On the Rust side, we'll define a helper function which will let us work with 54 | pairs of list chunked arrays: 55 | 56 | ```rust 57 | fn binary_amortized_elementwise<'a, T, K, F>( 58 | lhs: &'a ListChunked, 59 | rhs: &'a ListChunked, 60 | mut f: F, 61 | ) -> ChunkedArray 62 | where 63 | T: PolarsDataType, 64 | T::Array: ArrayFromIter>, 65 | F: FnMut(&AmortSeries, &AmortSeries) -> Option + Copy, 66 | { 67 | { 68 | let (lhs, rhs) = align_chunks_binary(lhs, rhs); 69 | lhs.amortized_iter() 70 | .zip(rhs.amortized_iter()) 71 | .map(|(lhs, rhs)| match (lhs, rhs) { 72 | (Some(lhs), Some(rhs)) => f(&lhs, &rhs), 73 | _ => None, 74 | }) 75 | .collect_ca(PlSmallStr::EMPTY) 76 | } 77 | } 78 | ``` 79 | 80 | That's a bit of a mouthful, so let's try to make sense of it. 81 | 82 | - As we learned about in [Prerequisites], Polars Series are backed by chunked arrays. 83 | `align_chunks_binary` just ensures that the chunks have the same lengths. It may need 84 | to rechunk under the hood for us; 85 | - `amortized_iter` returns an iterator of `AmortSeries`, each of which corresponds 86 | to a row from our input. 87 | 88 | We'll explain more about `AmortSeries` in a future iteration of this tutorial. 89 | For now, let's just look at how to use this utility: 90 | 91 | - we pass it `ListChunked` as inputs; 92 | - we also pass a function which takes two `AmortSeries` and produces a scalar 93 | value. 
94 | 95 | ```rust 96 | #[polars_expr(output_type=Float64)] 97 | fn weighted_mean(inputs: &[Series]) -> PolarsResult { 98 | let values = inputs[0].list()?; 99 | let weights = &inputs[1].list()?; 100 | polars_ensure!( 101 | values.dtype() == &DataType::List(Box::new(DataType::Int64)), 102 | ComputeError: "Expected `values` to be of type `List(Int64)`, got: {}", values.dtype() 103 | ); 104 | polars_ensure!( 105 | weights.dtype() == &DataType::List(Box::new(DataType::Float64)), 106 | ComputeError: "Expected `weights` to be of type `List(Float64)`, got: {}", weights.dtype() 107 | ); 108 | 109 | let out: Float64Chunked = binary_amortized_elementwise( 110 | values, 111 | weights, 112 | |values_inner: &AmortSeries, weights_inner: &AmortSeries| -> Option { 113 | let values_inner = values_inner.as_ref().i64().unwrap(); 114 | let weights_inner = weights_inner.as_ref().f64().unwrap(); 115 | if values_inner.len() == 0 { 116 | // Mirror Polars, and return None for empty mean. 117 | return None 118 | } 119 | let mut numerator: f64 = 0.; 120 | let mut denominator: f64 = 0.; 121 | values_inner 122 | .iter() 123 | .zip(weights_inner.iter()) 124 | .for_each(|(v, w)| { 125 | if let (Some(v), Some(w)) = (v, w) { 126 | numerator += v as f64 * w; 127 | denominator += w; 128 | } 129 | }); 130 | Some(numerator / denominator) 131 | }, 132 | ); 133 | Ok(out.into_series()) 134 | } 135 | ``` 136 | 137 | If you just need to get a problem solved, this function works! But let's note its 138 | limitations: 139 | 140 | - it assumes that each inner element of `values` and `weights` has the same 141 | length - it would be better to raise an error if this assumption is not met 142 | - it only accepts `Int64` `values` and `Float64` `weights` 143 | (see section 2 for how you could make it more generic). 
144 | 145 | To try it out, we compile with `maturin develop` (or `maturin develop --release` if you're 146 | benchmarking), and then we should be able to run `run.py`: 147 | 148 | ```python 149 | import polars as pl 150 | import minimal_plugin as mp 151 | 152 | df = pl.DataFrame({ 153 | 'values': [[1, 3, 2], [5, 7]], 154 | 'weights': [[.5, .3, .2], [.1, .9]] 155 | }) 156 | print(df.with_columns(weighted_mean = mp.weighted_mean('values', 'weights'))) 157 | ``` 158 | to see 159 | ``` 160 | shape: (2, 3) 161 | ┌───────────┬─────────────────┬───────────────┐ 162 | │ values ┆ weights ┆ weighted_mean │ 163 | │ --- ┆ --- ┆ --- │ 164 | │ list[i64] ┆ list[f64] ┆ f64 │ 165 | ╞═══════════╪═════════════════╪═══════════════╡ 166 | │ [1, 3, 2] ┆ [0.5, 0.3, 0.2] ┆ 1.8 │ 167 | │ [5, 7] ┆ [0.1, 0.9] ┆ 6.8 │ 168 | └───────────┴─────────────────┴───────────────┘ 169 | ``` 170 | 171 | [Prerequisites]: ../prerequisites/ 172 | 173 | ## Gimme ~~chocolate~~ challenge 174 | 175 | Could you implement a weighted standard deviation calculator? 176 | -------------------------------------------------------------------------------- /docs/lists_in_lists_out.md: -------------------------------------------------------------------------------- 1 | # 9.1 Lists in, lists out, lists all about 2 | 3 | Chapter 9.0 ([Weighted-mean watchers]) was fun. Let's do it all over again! 4 | 5 | Or rather, let's do another list operation. We're going to start with 6 | a dataframe such as: 7 | 8 | ```python 9 | shape: (4, 1) 10 | ┌──────────────┐ 11 | │ dense │ 12 | │ --- │ 13 | │ list[i64] │ 14 | ╞══════════════╡ 15 | │ [0, 9] │ 16 | │ [8, 6, 0, 9] │ 17 | │ null │ 18 | │ [3, 3] │ 19 | └──────────────┘ 20 | ``` 21 | 22 | Before we start, however, let's take a look into how Polars stores lists in memory. 23 | As we saw, lists are backed up by chunks. 24 | Inside each chunk, Polars stores all the lists ("rows") as one single list, while keeping track of where each row starts, and how many elements they have. 
25 | This is consistent with Apache Arrow's columnar format. 26 | It looks something like this: 27 | 28 | ![Diagram showing how Polars stores lists under the hood](assets/list_chunked_memory_layout.png) 29 | 30 | Back to where we were - we're going to try to count the indices which are non-zero. --> 31 | 32 | !!! note 33 | 34 | You don't really need a plugin to do this, you can just do 35 | 36 | ```python 37 | df.with_columns(sparse_indices=pl.col('dense').list.eval(pl.arg_where(pl.element() != 0))) 38 | ``` 39 | 40 | But `eval` won't cover every need you ever have ever, so...it's good 41 | to learn how to do this as a plugin so you can then customize it according to your needs. 42 | 43 | --- 44 | 45 | Polars has a helper function built-in for dealing with this: `apply_amortized`. We can use it to apply 46 | a function to each element of a List Series. In this case, we just want to find the indices of non-zero 47 | elements, so we'll do: 48 | 49 | ```rust 50 | fn list_idx_dtype(input_fields: &[Field]) -> PolarsResult { 51 | let field = Field::new(input_fields[0].name.clone(), DataType::List(Box::new(IDX_DTYPE))); 52 | Ok(field.clone()) 53 | } 54 | 55 | #[polars_expr(output_type_func=list_idx_dtype)] 56 | fn non_zero_indices(inputs: &[Series]) -> PolarsResult { 57 | let ca = inputs[0].list()?; 58 | polars_ensure!( 59 | ca.dtype() == &DataType::List(Box::new(DataType::Int64)), 60 | ComputeError: "Expected `List(Int64)`, got: {}", ca.dtype() 61 | ); 62 | 63 | let out: ListChunked = ca.apply_amortized(|s| { 64 | let s: &Series = s.as_ref(); 65 | let ca: &Int64Chunked = s.i64().unwrap(); 66 | let out: IdxCa = ca 67 | .iter() 68 | .enumerate() 69 | .filter(|(_idx, opt_val)| opt_val != &Some(0)) 70 | .map(|(idx, _opt_val)| Some(idx as IdxSize)) 71 | .collect_ca(PlSmallStr::EMPTY); 72 | out.into_series() 73 | }); 74 | Ok(out.into_series()) 75 | } 76 | ``` 77 | `apply_amortized` is a bit like the `apply_into_string_amortized` function we used in [How to STRING something 
together], 78 | in that it makes a big allocation upfront to amortize the allocation costs. Think of it as a list version 79 | of `apply_values`, where each element is itself a `Series`. 80 | 81 | Something new in this example is: 82 | 83 | - `IdxSize` 84 | - `IdxCa` 85 | - `IDX_DTYPE` 86 | 87 | `IdxSize` is either `u32` or `u64`, depending on your platform, and is what Polars generally uses 88 | for counting-related operations. `IdxCa` is the associated `ChunkedArray`, and `IDX_DTYPE` the associated 89 | Polars dtype. 90 | 91 | [Weighted-mean watchers]: ../lists/ 92 | [How to STRING something together]: ../stringify/ 93 | 94 | To finish this off, the Python side will be a bog-standard: 95 | 96 | ```python 97 | def non_zero_indices(expr: IntoExprColumn) -> pl.Expr: 98 | return register_plugin_function( 99 | args=[expr], plugin_path=LIB, function_name="non_zero_indices", is_elementwise=True 100 | ) 101 | ``` 102 | 103 | If we then make `run.py` with 104 | 105 | ```python 106 | import polars as pl 107 | import minimal_plugin as mp 108 | 109 | pl.Config().set_fmt_table_cell_list_len(10) 110 | 111 | df = pl.DataFrame({'dense': [[0, 9], [8, 6, 0, 9], None, [3, 3]]}) 112 | print(df) 113 | print(df.with_columns(indices=mp.non_zero_indices('dense'))) 114 | ``` 115 | and compile with `maturin develop` (or `maturin develop --release` if you're benchmarking!) 116 | then we'll see 117 | 118 | ``` 119 | shape: (4, 2) 120 | ┌──────────────┬───────────┐ 121 | │ dense ┆ indices │ 122 | │ --- ┆ --- │ 123 | │ list[i64] ┆ list[u32] │ 124 | ╞══════════════╪═══════════╡ 125 | │ [0, 9] ┆ [1] │ 126 | │ [8, 6, 0, 9] ┆ [0, 1, 3] │ 127 | │ null ┆ null │ 128 | │ [3, 3] ┆ [0, 1] │ 129 | └──────────────┴───────────┘ 130 | ``` 131 | 132 | Yay, it worked! And not only that, but it's about 1.5x as fast as the `list.eval` solution 133 | noted above!
134 | -------------------------------------------------------------------------------- /docs/lost_in_space.md: -------------------------------------------------------------------------------- 1 | # 12. Lost in space 2 | 3 | Suppose, hypothetically speaking, that you're lost somewhere and only have access 4 | to your latitude, your longitude, and a laptop on which you can write a Polars Plugin. 5 | How can you find out what the closest city to you is? 6 | 7 | ## Reverse geocoding 8 | 9 | The practice of starting with a (latitude, longitude) pair and finding out which 10 | city it corresponds to is known as reverse geocoding. 11 | We're not going to implement a reverse geocoder from scratch - instead, we'll 12 | use the `reverse-geocoder` crate and make a plugin out of it! 13 | 14 | ## Cargo here, cargo there, cargo everywhere 15 | 16 | Let's add that crate to our project by running `cargo add reverse-geocoder`. 17 | You'll need to activate the nightly Rust channel, which you can do by making 18 | a file ` rust-toolchain.toml` in your root directory 19 | ```toml 20 | [toolchain] 21 | channel = "nightly" 22 | ``` 23 | You'll also need to add `polars-arrow` and `polars-core` to `Cargo.toml` 24 | and pin them to the same version that you pin `polars` to. 25 | Yes, this example is getting a bit heavier... 26 | 27 | The way the `reverse-geocoder` crate works is: 28 | 29 | - you instantiate a `ReverseGeocoder` instance 30 | - you pass a (latitude, longitude) pair to `search` 31 | - you get the city name out 32 | 33 | So our plugin will work by taking two `Float64` columns (one of latitude, one 34 | for longitude) and producing a String output column. 35 | 36 | ## Binary elementwise apply to buffer 37 | 38 | In [How to STRING something together], we learned how to use `StringChunked.apply_into_string_amortized` 39 | to run an elementwise function on a String column. Does Polars have a binary version of that one 40 | which allows us to start from any data type? 
41 | 42 | [Prerequisites]: ../prerequisites/ 43 | [How to STRING something together]: ../stringify/ 44 | 45 | Unfortunately, not. But, this is a good chance to learn about a few new concepts! 46 | 47 | We'll start easy by dealing with the Python side. Add the following to `minimal_plugin/__init__.py`: 48 | 49 | ```python 50 | def reverse_geocode(lat: IntoExprColumn, long: IntoExprColumn) -> pl.Expr: 51 | return register_plugin_function( 52 | args=[lat, long], plugin_path=LIB, function_name="reverse_geocode", is_elementwise=True 53 | ) 54 | ``` 55 | 56 | On the Rust side, in `src/expressions.rs`, get ready for it, we're going to add: 57 | 58 | ```Rust 59 | use polars_arrow::array::MutablePlString; 60 | use polars_core::utils::align_chunks_binary; 61 | use reverse_geocoder::ReverseGeocoder; 62 | 63 | #[polars_expr(output_type=String)] 64 | fn reverse_geocode(inputs: &[Series]) -> PolarsResult { 65 | let latitude = inputs[0].f64()?; 66 | let longitude = inputs[1].f64()?; 67 | let geocoder = ReverseGeocoder::new(); 68 | let out = binary_elementwise_into_string_amortized(latitude, longitude, |lhs, rhs, out| { 69 | let search_result = geocoder.search((lhs, rhs)); 70 | write!(out, "{}", search_result.record.name).unwrap(); 71 | }); 72 | Ok(out.into_series()) 73 | } 74 | ``` 75 | 76 | We use the utility function `binary_elementwise_into_string_amortized`, 77 | which is a binary version of `apply_into_string_amortized` which we learned 78 | about in the [Stringify] chapter. 
79 | 80 | [Stringify]: ../stringify/ 81 | 82 | To run it, put the following in `run.py`: 83 | ```python 84 | import polars as pl 85 | import minimal_plugin as mp 86 | 87 | df = pl.DataFrame({ 88 | 'lat': [37.7749, 51.01, 52.5], 89 | 'lon': [-122.4194, -3.9, -.91] 90 | }) 91 | print(df.with_columns(city=mp.reverse_geocode('lat', 'lon'))) 92 | ``` 93 | then compile with `maturin develop` (or `maturin develop --release` if you're benchmarking) 94 | and you should see 95 | ``` 96 | shape: (3, 3) 97 | ┌─────────┬───────────┬───────────────────┐ 98 | │ lat ┆ lon ┆ city │ 99 | │ --- ┆ --- ┆ --- │ 100 | │ f64 ┆ f64 ┆ str │ 101 | ╞═════════╪═══════════╪═══════════════════╡ 102 | │ 37.7749 ┆ -122.4194 ┆ San Francisco │ 103 | │ 51.01 ┆ -3.9 ┆ South Molton │ 104 | │ 52.5 ┆ -0.91 ┆ Market Harborough │ 105 | └─────────┴───────────┴───────────────────┘ 106 | ``` 107 | in the output! 108 | 109 | Great, now in our hypothetical scenario, you're probably still lost, but 110 | at least you know which city you're closest to. 111 | -------------------------------------------------------------------------------- /docs/noop.md: -------------------------------------------------------------------------------- 1 | # 1. How to do nothing 2 | 3 | That's right - this section is about how to do _nothing_. 4 | 5 | We'll write a Polars plugin which takes an expression, and returns it exactly 6 | as it is. Nothing more, nothing less. This will just be an exercise in setting 7 | everything up! 8 | 9 | If you followed the instructions in [Prerequisites], then your working directory 10 | should look a bit like the following: 11 | ``` 12 | . 13 | ├── Cargo.toml 14 | ├── minimal_plugin 15 | │   ├── __init__.py 16 | │   └── typing.py 17 | ├── pyproject.toml 18 | ├── run.py 19 | ├── src 20 | │ ├── expressions.rs 21 | │ └── lib.rs 22 | └── tests 23 | ``` 24 | The cookiecutter command you ran earlier set up a Polars plugin project with a 25 | sample function called `pig_latinnify` already implemented. 
The [Polars Plugins Cookiecutter](https://github.com/MarcoGorelli/cookiecutter-polars-plugins) 26 | helps you quickly start a Polars plugin project, skipping the boilerplate setup. 27 | Check it out for more details! 28 | 29 | [Prerequisites]: ../prerequisites/ 30 | 31 | ## The Python side 32 | 33 | Let's start by getting the Python side ready. It won't run until we 34 | implement the Rust side too, but it's a necessary step. 35 | Start by adding the following to `minimal_plugin/__init__.py`: 36 | 37 | ```python 38 | def noop(expr: IntoExprColumn) -> pl.Expr: 39 | return register_plugin_function( 40 | args=[expr], 41 | plugin_path=LIB, 42 | function_name="noop", 43 | is_elementwise=True, 44 | ) 45 | ``` 46 | Let's go through this line-by-line: 47 | 48 | - when we compile Rust, it generates a Shared Object file. 49 | The `LIB` variable holds its filepath; 50 | - We'll cover `is_elementwise` in [Yes we SCAN], for now don't pay attention to it; 51 | - We use the Polars utility function [register_plugin_function](https://docs.pola.rs/py-polars/html/reference/plugins.html#polars.plugins.register_plugin_function) to extend its functionality with our own. 52 | 53 | 54 | Note that string literals are parsed as expressions, so that if somebody 55 | calls `noop('a')`, it gets interpreted as `noop(pl.col('a'))`. 56 | 57 | [Yes we SCAN]: ../cum_sum/ 58 | 59 | ## Let's get Rusty 60 | 61 | Let's leave `src/lib.rs` as it is, and add the following to `src/expressions.rs`: 62 | 63 | ``` rust 64 | fn same_output_type(input_fields: &[Field]) -> PolarsResult { 65 | let field = &input_fields[0]; 66 | Ok(field.clone()) 67 | } 68 | 69 | #[polars_expr(output_type_func=same_output_type)] 70 | fn noop(inputs: &[Series]) -> PolarsResult { 71 | let s = &inputs[0]; 72 | Ok(s.clone()) 73 | } 74 | ``` 75 | 76 | There's a lot to cover here so we'll break it down below. 
77 | 78 | ### Defining `noop`'s schema 79 | 80 | Polars needs to know the schema/dtypes resulting from operations to make good 81 | optimization decisions. The way we tell Polars what to expect from our custom 82 | function is with the `polars_expr` attribute. 83 | 84 | Our beautiful `noop` doesn't change the data type (in fact, it doesn't change anything...) 85 | so we'll just write a function which returns the same input type: 86 | 87 | ```Rust 88 | fn same_output_type(input_fields: &[Field]) -> PolarsResult { 89 | let field = &input_fields[0]; 90 | Ok(field.clone()) 91 | } 92 | ``` 93 | and use that to define the function output's schema. Just like 94 | `noop`, this function takes a reference to its only input and 95 | clones it. 96 | 97 | ### Defining `noop`'s body 98 | 99 | The input is an iterable of `Series`. In our case, `noop` just 100 | receives a single Series as input, but as we'll see in later 101 | sections, it's possible to pass multiple Series. 102 | 103 | We said we wanted our function to do nothing, so let's implement 104 | that: take a reference to the first (and only) input Series, 105 | and return a (cheap!) clone of it. 106 | 107 | ## Putting it all together 108 | 109 | Right, does this all work? Let's edit the Python file `run.py`, 110 | which we will use for testing. We'll just make a toy dataframe 111 | and apply `noop` to each column! 112 | ```python 113 | import polars as pl 114 | import minimal_plugin as mp 115 | 116 | df = pl.DataFrame({ 117 | 'a': [1, 1, None], 118 | 'b': [4.1, 5.2, 6.3], 119 | 'c': ['hello', 'everybody!', '!'] 120 | }) 121 | print(df.with_columns(mp.noop(pl.all()).name.suffix('_noop'))) 122 | ``` 123 | 124 | Let's compile! Please run `maturin develop` (or `maturin develop --release` if benchmarking). 125 | You'll need to do this every time you change any of your Rust code. 126 | It may take a while the first time, but subsequent executions will 127 | be significantly faster as the build process is incremental. 
128 | 129 | Finally, you can run your code! If you run `python run.py` and get 130 | the following output: 131 | ``` 132 | shape: (3, 6) 133 | ┌──────┬─────┬────────────┬────────┬────────┬────────────┐ 134 | │ a ┆ b ┆ c ┆ a_noop ┆ b_noop ┆ c_noop │ 135 | │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ 136 | │ i64 ┆ f64 ┆ str ┆ i64 ┆ f64 ┆ str │ 137 | ╞══════╪═════╪════════════╪════════╪════════╪════════════╡ 138 | │ 1 ┆ 4.1 ┆ hello ┆ 1 ┆ 4.1 ┆ hello │ 139 | │ 1 ┆ 5.2 ┆ everybody! ┆ 1 ┆ 5.2 ┆ everybody! │ 140 | │ null ┆ 6.3 ┆ ! ┆ null ┆ 6.3 ┆ ! │ 141 | └──────┴─────┴────────────┴────────┴────────┴────────────┘ 142 | ``` 143 | then it means everything worked correctly. Congrats! 144 | 145 | You're now ready to learn how to do ABSolutely nothing. 146 | -------------------------------------------------------------------------------- /docs/prerequisites.md: -------------------------------------------------------------------------------- 1 | # 0. Prerequisites 2 | 3 | ## Knowledge 4 | 5 | > "But you know what I like more than materialistic things? Knowledge." Tai Lopez 6 | 7 | How much Rust do you need to know to write your own Polars plugin? Less than 8 | you think. 9 | 10 | I'd suggest starting out with the [Rustlings](https://github.com/rust-lang/rustlings) 11 | course, which provides some fun and interactive exercises designed to make you familiar 12 | with the language. I'd suggest starting the following sections: 13 | 14 | - 00 intro 15 | - 01 variables 16 | - 02 functions 17 | - 03 if 18 | - 05 vecs 19 | - 12 options 20 | - 13 error handling 21 | 22 | You'll also need basic Python knowledge: classes, decorators, and functions. 23 | 24 | Alternatively, you could just clone this repo and then hack away 25 | at the examples trial-and-error style until you get what you're looking 26 | for - the compiler will probably help you more than you're expecting. 
27 | 28 | ## Software 29 | 30 | To get started, please [install cookiecutter](https://cookiecutter.readthedocs.io/en/stable/README.html#installation). 31 | 32 | Then, from your home directory (or wherever you store your Python projects) please run 33 | ``` 34 | cookiecutter https://github.com/MarcoGorelli/cookiecutter-polars-plugins 35 | ``` 36 | When prompted, please enter (let's suppose your name is "Maja Anima", but replace that 37 | with your preferred name): 38 | ``` 39 | [1/3] plugin_name (Polars Cookiecutter): Minimal Plugin 40 | [2/3] project_slug (polars_minimal_plugin): 41 | [3/3] author (anonymous): Maja Anima 42 | ``` 43 | This will create a folder called `minimal_plugin`. 44 | Please navigate to it with `cd minimal_plugin`. 45 | 46 | Next, [create a Python3.8+ virtual environment](https://docs.python.org/3/library/venv.html), and install: 47 | 48 | - `polars>=1.3.0` 49 | - `maturin>=1.4.0` 50 | 51 | Finally, you'll also need to [install Rust](https://rustup.rs/). 52 | 53 | That's it! However, you are highly encouraged to also install 54 | [rust-analyzer](https://rust-analyzer.github.io/manual.html) if you want to 55 | improve your Rust-writing experience by exactly 120%. 56 | 57 | ## What's in a Series? 58 | 59 | If you take a look at a Series such as 60 | ```python 61 | In [9]: s = pl.Series([None, 2, 3]) + 42 62 | 63 | In [10]: s 64 | Out[10]: 65 | shape: (3,) 66 | Series: '' [i64] 67 | [ 68 | null 69 | 44 70 | 45 71 | ] 72 | ``` 73 | you may be tempted to conclude that it contains three values: `[null, 44, 45]`. 74 | 75 | However, if you print out `s._get_buffers()`, you'll see 76 | something different: 77 | 78 | - `s._get_buffers()["values"]`: `[42, 44, 45]`. These are the _values_. 79 | - `s._get_buffers()["validity"]`: `[False, True, True]`. These are the _validities_.
80 | 81 | So we don't really have integers and `null` mixed together into a single array - we 82 | have a pair of arrays, one holding values and another one holding booleans indicating 83 | whether each value is valid or not. 84 | If a value appears as `null` to you, then there's no guarantee about what physical number 85 | is behind it! It was `42` here, but it could well be `43`, or any other number, 86 | in another example. 87 | 88 | ## What's a chunk? 89 | 90 | A Series is backed by chunked arrays, each of which holds data which is contiguous in 91 | memory. 92 | 93 | Here's an example of a Series backed by multiple chunks: 94 | ```python 95 | In [27]: s = pl.Series([1,2,3]) 96 | 97 | In [28]: s = s.append(pl.Series([99, 11])) 98 | 99 | In [29]: s 100 | Out[29]: 101 | shape: (5,) 102 | Series: '' [i64] 103 | [ 104 | 1 105 | 2 106 | 3 107 | 99 108 | 11 109 | ] 110 | 111 | In [30]: s.get_chunks() 112 | Out[30]: 113 | [shape: (3,) 114 | Series: '' [i64] 115 | [ 116 | 1 117 | 2 118 | 3 119 | ], 120 | shape: (2,) 121 | Series: '' [i64] 122 | [ 123 | 99 124 | 11 125 | ]] 126 | ``` 127 | Chunked arrays will come up in several examples in this tutorial. 128 | -------------------------------------------------------------------------------- /docs/publishing.md: -------------------------------------------------------------------------------- 1 | # 14. Publishing your plugin to PyPI and becoming famous 2 | 3 | Here are the steps you should follow: 4 | 5 | 1. publish plugin to PyPI 6 | 2. ??? 7 | 3. profit 8 | 9 | This section deals with step 1, and assumes your project lives on GitHub. 10 | 11 | ## Set up trusted publishing 12 | 13 | If you followed the [Prerequisites] steps, you should have `.github/workflows/publish_to_pypi.yml`, 14 | `Makefile`, and `requirements.txt` files. If not, go back and follow the cookiecutter step. 15 | 16 | Next, set up an account on PyPI.org - you can't do much without that. 
17 | 18 | Third, on PyPI, you'll want to (note: this is taken almost verbatim from [PyPA](https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/#configuring-trusted-publishing)): 19 | 20 | 1. Go to https://pypi.org/manage/account/publishing/. 21 | 2. Fill in the name you wish to publish your new PyPI project under (the name value in your pyproject.toml), the GitHub repository owner’s name (org or user), and repository name, and the name of the release workflow file under the .github/ folder, see Creating a workflow definition. Finally, add the name of the GitHub Environment (pypi) we’re going to set up under your repository. Register the trusted publisher. 22 | 23 | Finally, if you make a commit and tag it, and then push, then a release should be triggered! It will then be 24 | available for install across different platforms, which would be really hard (impossible?) to do if you were building 25 | the wheel manually and uploading to PyPI yourself. 26 | 27 | ## PYPI_API_TOKEN 28 | 29 | You'll also need a repository secret called `PYPI_API_TOKEN`. In PyPI, 30 | create an API token scoped just to your project, and then save it in your 31 | repository's secrets using the name `PYPI_API_TOKEN`. 32 | 33 | [Prerequisites]: ../prerequisites/ 34 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | mkdocs 2 | mkdocs-material -------------------------------------------------------------------------------- /docs/stem.md: -------------------------------------------------------------------------------- 1 | # 6. How to CRATE something else entirely 2 | 3 | Take a look at [crates.io](https://crates.io/) - there's _so_ much good stuff there! 4 | There's probably a package for practically any use case. 
5 | 6 | For example, this looks like a fun one: [rust_stemmers](https://crates.io/crates/rust-stemmers). 7 | It lets us input a word, and stem it (i.e. reduce it to a simpler version, e.g. 'fearlessly' -> 8 | 'fearless'). 9 | Can we make a plugin out of it? 10 | 11 | ## Cargo this, cargo that 12 | 13 | If we're going to use `rust_stemmers`, we're going to need to take it on as a dependency. 14 | The easiest way to do this is probably to run `cargo add rust_stemmers` - run this, and 15 | watch how `Cargo.toml` changes! 16 | You should see the line 17 | ```toml 18 | rust-stemmers = "1.2.0" 19 | ``` 20 | somewhere in there. 21 | 22 | ## Writing a Snowball Stemmer 23 | 24 | Let's write a function which: 25 | 26 | - takes a `Utf8` column as input; 27 | - produces a `Utf8` column as output. 28 | 29 | We'd like to be able to call it as follows: 30 | 31 | ```python 32 | df.with_columns(stemmed_word=mp.snowball_stem('word')) 33 | ``` 34 | 35 | On the Python side, let's add the following function to `minimal_plugin/__init__.py`: 36 | 37 | ```python 38 | def snowball_stem(expr: IntoExprColumn) -> pl.Expr: 39 | return register_plugin_function( 40 | args=[expr], 41 | plugin_path=LIB, 42 | function_name="snowball_stem", 43 | is_elementwise=True, 44 | ) 45 | ``` 46 | 47 | Then, we can define the function like this in `src/expressions.rs`: 48 | 49 | ```Rust 50 | use rust_stemmers::{Algorithm, Stemmer}; 51 | 52 | #[polars_expr(output_type=String)] 53 | fn snowball_stem(inputs: &[Series]) -> PolarsResult<Series> { 54 | let ca: &StringChunked = inputs[0].str()?; 55 | let en_stemmer = Stemmer::create(Algorithm::English); 56 | let out: StringChunked = ca.apply_into_string_amortized(|value: &str, output: &mut String| { 57 | write!(output, "{}", en_stemmer.stem(value)).unwrap() 58 | }); 59 | Ok(out.into_series()) 60 | } 61 | ``` 62 | 63 | Let's try it out! 
Put the following in `run.py`: 64 | ```python 65 | import polars as pl 66 | import minimal_plugin as mp 67 | 68 | df = pl.DataFrame({'word': ["fearlessly", "littleness", "lovingly", "devoted"]}) 69 | print(df.with_columns(b=mp.snowball_stem('word'))) 70 | ``` 71 | 72 | If you then compile with `maturin develop` (or `maturin develop --release` 73 | if you're benchmarking), and run it with `python run.py`, you'll see: 74 | ``` 75 | shape: (4, 2) 76 | ┌────────────┬──────────┐ 77 | │ word ┆ b │ 78 | │ --- ┆ --- │ 79 | │ str ┆ str │ 80 | ╞════════════╪══════════╡ 81 | │ fearlessly ┆ fearless │ 82 | │ littleness ┆ littl │ 83 | │ lovingly ┆ love │ 84 | │ devoted ┆ devot │ 85 | └────────────┴──────────┘ 86 | ``` 87 | 88 | In this example, we took on an extra dependency, which increased 89 | the size of the package. By using plugins, we have a way of accessing 90 | extra functionality without having to bloat up the size of the main 91 | Polars install too much! 92 | 93 | ## Stretch goal 94 | 95 | Browse through `crates.io` - is there any other crate you could use 96 | to make your own plugin out of? 97 | -------------------------------------------------------------------------------- /docs/stringify.md: -------------------------------------------------------------------------------- 1 | # 5. How to STRING something together 2 | 3 | Tired of examples which only include numeric data? Me neither. 4 | But we need to address the elephant in the room: strings. 5 | 6 | We're going to start by re-implementing a pig-latinnifier. 7 | This example is already part of the `pyo3-polars` repo examples, 8 | but we'll tackle it with a different spin here by first doing it 9 | the wrong way 😈. 10 | 11 | ## Pig-latinnify - take 1 12 | 13 | Let's start by doing this the wrong way. 14 | We'll use our `abs` example, and adapt it to the 15 | string case. We'll follow the same strategy: 16 | 17 | - iterate over arrow arrays; 18 | - for each element in each array, create a new output value. 
19 | 20 | Put the following in `src/expressions.rs`: 21 | 22 | ```Rust 23 | use std::borrow::Cow; 24 | use std::fmt::Write; 25 | 26 | #[polars_expr(output_type=String)] 27 | fn pig_latinnify(inputs: &[Series]) -> PolarsResult<Series> { 28 | let s = &inputs[0]; 29 | let ca = s.str()?; 30 | let out: StringChunked = ca.apply(|opt_v: Option<&str>| { 31 | opt_v.map(|value: &str| { 32 | // Not the recommended way to do it, 33 | // see below for a better way! 34 | if let Some(first_char) = value.chars().next() { 35 | Cow::Owned(format!("{}{}ay", &value[1..], first_char)) 36 | } else { 37 | Cow::Borrowed(value) 38 | } 39 | }) 40 | }); 41 | Ok(out.into_series()) 42 | } 43 | ``` 44 | If you're not familiar with [clone-on-write](https://doc.rust-lang.org/std/borrow/enum.Cow.html), 45 | don't worry about it - we're about to see a simpler and better way to do this anyway. 46 | What I'd like you to focus on is that for every row, we're creating a new `String`. 47 | 48 | If you combine this with a Python definition (which you should put 49 | in `minimal_plugin/__init__.py`): 50 | 51 | ```python 52 | def pig_latinnify(expr: IntoExprColumn) -> pl.Expr: 53 | return register_plugin_function( 54 | args=[expr], 55 | plugin_path=LIB, 56 | function_name="pig_latinnify", 57 | is_elementwise=True, 58 | ) 59 | ``` 60 | then you'll be able to pig-latinnify a column of strings! 
To see it 61 | in action, compile with `maturin develop` (or `maturin develop --release` 62 | if you're benchmarking) and put the following in `run.py`: 63 | 64 | ```python 65 | import polars as pl 66 | import minimal_plugin as mp 67 | 68 | df = pl.DataFrame({'a': ["I", "love", "pig", "latin"]}) 69 | print(df.with_columns(a_pig_latin=mp.pig_latinnify('a'))) 70 | ``` 71 | ``` 72 | shape: (4, 2) 73 | ┌───────┬─────────────┐ 74 | │ a ┆ a_pig_latin │ 75 | │ --- ┆ --- │ 76 | │ str ┆ str │ 77 | ╞═══════╪═════════════╡ 78 | │ I ┆ Iay │ 79 | │ love ┆ ovelay │ 80 | │ pig ┆ igpay │ 81 | │ latin ┆ atinlay │ 82 | └───────┴─────────────┘ 83 | ``` 84 | 85 | This will already be an order of magnitude faster than using `map_elements`. 86 | But as mentioned earlier, we're creating a new string for every single row. 87 | 88 | Can we do better? 89 | 90 | ## Pig-latinnify - take 2 91 | 92 | Yes! `StringChunked` has a utility `apply_into_string_amortized` method which amortises 93 | the cost of creating new strings for each row by creating a string upfront, 94 | clearing it, and repeatedly writing to it. 95 | This gives a 4x speedup! All you need to do is change `pig_latinnify` to: 96 | 97 | ```Rust 98 | #[polars_expr(output_type=String)] 99 | fn pig_latinnify(inputs: &[Series]) -> PolarsResult<Series> { 100 | let ca: &StringChunked = inputs[0].str()?; 101 | let out: StringChunked = ca.apply_into_string_amortized(|value: &str, output: &mut String| { 102 | if let Some(first_char) = value.chars().next() { 103 | write!(output, "{}{}ay", &value[1..], first_char).unwrap() 104 | } 105 | }); 106 | Ok(out.into_series()) 107 | } 108 | ``` 109 | 110 | Simpler, faster, and more memory-efficient. 111 | _Thinking about allocations_ can really make a difference! 112 | 113 | ## So let's think about allocations! 114 | 115 | If you have an elementwise function which produces `String` output, then chances are it does one of the following: 116 | 117 | - Creates a new string. 
In this case, you can use `apply_into_string_amortized` to amortise the cost of allocating a new string for each input row, 118 | as we did above in `pig_latinnify`. This works by allocating a `String` upfront and then repeatedly re-writing to it. 119 | - Slices the original string. In this case, you can use `apply_values` with `Cow::Borrowed`, for example: 120 | 121 | ```rust 122 | fn remove_last_extension(s: &str) -> &str { 123 | match s.rfind('.') { 124 | Some(pos) => &s[..pos], 125 | None => s, 126 | } 127 | } 128 | 129 | #[polars_expr(output_type=String)] 130 | fn remove_extension(inputs: &[Series]) -> PolarsResult<Series> { 131 | let s = &inputs[0]; 132 | let ca = s.str()?; 133 | let out: StringChunked = ca.apply_values(|val| { 134 | let res = Cow::Borrowed(remove_last_extension(val)); 135 | res 136 | }); 137 | Ok(out.into_series()) 138 | } 139 | ``` 140 | 141 | There are low-level optimisations you can do to take things further, but - if in doubt - `apply_into_string_amortized` / `binary_elementwise_into_string_amortized` are probably good enough. 142 | -------------------------------------------------------------------------------- /docs/struct.md: -------------------------------------------------------------------------------- 1 | # 10. STRUCTin' 2 | 3 | > "Day one, I'm in love with your struct" Thumpasaurus (kinda) 4 | 5 | --- 6 | 7 | For this chapter, we need to start by activating the necessary feature - in `Cargo.toml`, please make this change: 8 | 9 | ```diff 10 | -polars = { version = "0.46.0", default-features = false } 11 | +polars = { version = "0.46.0", features=["dtype-struct"], default-features = false } 12 | ``` 13 | 14 | --- 15 | 16 | How do we consume structs, and how do we return them? 
17 | 18 | Let's try creating a Polars DataFrame in Python that stores a struct similar to this one: 19 | 20 | ```rust 21 | struct Point2D { 22 | x: f64, 23 | y: f64, 24 | rgb: u32, 25 | } 26 | ``` 27 | 28 | ![Diagram showing the struct in a UML-like box](assets/struct_example_Point2D.png){ style="display: block; margin: 0 auto;" } 29 | 30 | There are different ways of doing that, but that doesn't matter now. Here's one way: 31 | ```python 32 | df = pl.DataFrame( 33 | { 34 | "x": [1.0, 1.25, 1.5, 1.75], 35 | "y": [3.0, 2.75, 2.5, 2.25], 36 | "rgba": [0x00FF7FFF, 0xFF7F00FF, 0x7F7F7FFF, 0xD8D8D8FF], 37 | } 38 | ).select( 39 | point_2d_s=pl.struct( 40 | "x", "y", "rgba", 41 | schema={ 42 | "x": pl.Float64, 43 | "y": pl.Float64, 44 | "rgba": pl.UInt32, 45 | } 46 | ) 47 | ) 48 | ``` 49 | 50 | If we `print(df)`, here's what we have: 51 | 52 | ``` 53 | shape: (4, 1) 54 | ┌────────────────────────┐ 55 | │ point_2d_s │ 56 | │ --- │ 57 | │ struct[3] │ 58 | ╞════════════════════════╡ 59 | │ {1.0,3.0,16744447} │ 60 | │ {1.25,2.75,4286513407} │ 61 | │ {1.5,2.5,2139062271} │ 62 | │ {1.75,2.25,3638089983} │ 63 | └────────────────────────┘ 64 | ``` 65 | 66 | Now's an excellent time to ask: how's that stored in memory? Before we get to that answer, consider this other scenario, in Rust: 67 | 68 | ```rust 69 | let v: [Point2D; 4] = [ 70 | Point2D { 71 | x: 1.0, 72 | y: 3.0, 73 | rgb: 0x00FF7FFFu32, 74 | }, 75 | Point2D { 76 | x: 1.25, 77 | y: 2.75, 78 | rgb: 0xFF7F00FFu32, 79 | }, 80 | Point2D { 81 | x: 1.5, 82 | y: 2.5, 83 | rgb: 0x7F7F7FFFu32, 84 | }, 85 | Point2D { 86 | x: 1.75, 87 | y: 2.25, 88 | rgb: 0xD8D8D8FFu32, 89 | }, 90 | ]; 91 | ``` 92 | 93 | How's this one stored in memory? 
You might find that answer easier, it's an array of struct instances, so we have the `x`, `y` and `rgba` fields contiguously in memory, like that: 94 | 95 | ![Diagram showing the contiguous layout memory of an array of structs in Rust](assets/struct_array_memory_layout.png){ style="display: block; margin: 0 auto;" } 96 | 97 | This is consistent with how C, C++, and many other languages store structs in memory. 98 | How's our struct-in-a-DataFrame different? 99 | 100 | Polars follows the Arrow protocol for structs, which means each field of the struct is stored in a Series, backed by chunks. Each chunk is contiguous in memory. 101 | In a scenario in which we have a single chunk for each field, this is what things would look like: 102 | 103 | ![Diagram showing how Series backing struct fields are stored in memory](assets/structchunked_fields_memory_layout.png){ style="display: block; margin: 0 auto;" } 104 | 105 | Since we never modified the DataFrame after creating it, it is the case of our initial example, no more chunks were allocated. 106 | 107 | --- 108 | 109 | Now that we have a better idea of how things work under the hood, let's jump to a practical plugin that takes a struct, and just prints the Series corresponding to each field - it'll return the same struct passed as input, with no alteration. 110 | 111 | First things first - this time we're gonna see something new. Polars does not allow us to write: 112 | 113 | ```rust 114 | #[polars_expr(output_type=Struct)] 115 | fn print_struct_fields(inputs: &[Series]) -> PolarsResult<Series> { ... 
} 116 | ``` 117 | 118 | The way we inform Polars that a struct Series is being returned is a bit cumbersome - we do so by defining a separate function: 119 | 120 | ```rust 121 | fn struct_point_2d_output(input_fields: &[Field]) -> PolarsResult<Field> { 122 | let field = &input_fields[0]; 123 | match field.dtype() { 124 | DataType::Struct(fields) => { 125 | Ok(Field::new("struct_point_2d".into(), DataType::Struct(fields.clone()))) 126 | } 127 | dtype => polars_bail!(InvalidOperation: "expected Struct dtype, got {}", dtype), 128 | } 129 | } 130 | ``` 131 | 132 | Then using that function in our `polars_expr`, with a different "kwarg": 133 | 134 | ```rust 135 | #[polars_expr(output_type_func=struct_point_2d_output)] 136 | fn print_struct_fields(inputs: &[Series]) -> PolarsResult<Series> { 137 | 138 | let struct_ = inputs[0].struct_()?; 139 | let fields = struct_.fields_as_series(); 140 | 141 | if fields.is_empty() { 142 | return Ok(inputs[0].clone()); 143 | } 144 | 145 | let fields = fields 146 | .iter() 147 | .map(|s| { 148 | let s = s.clone(); 149 | println!("{:?}", s); 150 | s 151 | }) 152 | .collect::<Vec<_>>(); 153 | 154 | StructChunked::from_series(struct_.name().clone(), struct_.len(), fields.iter()) 155 | .map(|ca| ca.into_series()) 156 | } 157 | ``` 158 | 159 | This is a very basic, "do-nothing" example. For this reason, we're not gonna spend too much time here. 160 | Still, you're encouraged to register the plugin and try it for yourself, as an exercise. 161 | 162 | Now, let's look at something more interesting. 163 | 164 | --- 165 | 166 | We'll rewrite a plugin which takes a `Struct` as 167 | input, and shifts all values forwards by one key. So, for example, if 168 | the input was `{'a': 1, 'b': 2., 'c': '3'}`, then the output will be 169 | `{'a': 2., 'b': '3', 'c': 1}`. 
170 | 171 | On the Python side, usual business: 172 | 173 | ```python 174 | def shift_struct(expr: IntoExprColumn) -> pl.Expr: 175 | return register_plugin_function( 176 | args=[expr], 177 | plugin_path=LIB, 178 | function_name="shift_struct", 179 | is_elementwise=True, 180 | ) 181 | ``` 182 | 183 | Then, we need to get the schema right. 184 | 185 | ```Rust 186 | fn shifted_struct(input_fields: &[Field]) -> PolarsResult<Field> { 187 | let field = &input_fields[0]; 188 | match field.dtype() { 189 | DataType::Struct(fields) => { 190 | let mut field_0 = fields[0].clone(); 191 | let name = field_0.name.clone(); 192 | field_0.set_name(fields[fields.len() - 1].name().clone()); 193 | let mut fields = fields[1..] 194 | .iter() 195 | .zip(fields[0..fields.len() - 1].iter()) 196 | .map(|(fld, name)| Field::new(name.name().clone(), fld.dtype().clone())) 197 | .collect::<Vec<_>>(); 198 | fields.push(field_0); 199 | Ok(Field::new(name, DataType::Struct(fields))) 200 | } 201 | _ => unreachable!(), 202 | } 203 | } 204 | ``` 205 | 206 | In this case, I put the first field's name as the output struct's name, but it doesn't 207 | really matter what we put, as Polars doesn't allow us to rename expressions within 208 | plugins. You can always rename on the Python side if you really want to, but I'd suggest 209 | to just let Polars follow its usual "left-hand-rule". 210 | 211 | The function definition is going to follow a similar logic: 212 | 213 | ```rust 214 | #[polars_expr(output_type_func=shifted_struct)] 215 | fn shift_struct(inputs: &[Series]) -> PolarsResult<Series> { 216 | let struct_ = inputs[0].struct_()?; 217 | let fields = struct_.fields_as_series(); 218 | if fields.is_empty() { 219 | return Ok(inputs[0].clone()); 220 | } 221 | let mut field_0 = fields[0].clone(); 222 | let name = field_0.name().clone(); 223 | field_0.rename(fields[fields.len() - 1].name().clone()); 224 | let mut fields = fields[1..] 
225 | .iter() 226 | .zip(fields[..fields.len() - 1].iter()) 227 | .map(|(s, name)| { 228 | let mut s = s.clone(); 229 | s.rename(name.name().clone()); 230 | s 231 | }) 232 | .collect::<Vec<_>>(); 233 | fields.push(field_0); 234 | StructChunked::from_series(name, struct_.len(), fields.iter()).map(|ca| ca.into_series()) 235 | } 236 | ``` 237 | 238 | Let's try this out. Put the following in `run.py`: 239 | 240 | ```python 241 | import polars as pl 242 | import minimal_plugin as mp 243 | 244 | df = pl.DataFrame( 245 | { 246 | "a": [1, 3, 8], 247 | "b": [2.0, 3.1, 2.5], 248 | "c": ["3", "7", "3"], 249 | } 250 | ).select(abc=pl.struct("a", "b", "c")) 251 | print(df.with_columns(abc_shifted=mp.shift_struct("abc"))) 252 | ``` 253 | 254 | Compile with `maturin develop` (or `maturin develop --release` if you're 255 | benchmarking), and if you run `python run.py` you'll see: 256 | 257 | ``` 258 | shape: (3, 2) 259 | ┌─────────────┬─────────────┐ 260 | │ abc ┆ abc_shifted │ 261 | │ --- ┆ --- │ 262 | │ struct[3] ┆ struct[3] │ 263 | ╞═════════════╪═════════════╡ 264 | │ {1,2.0,"3"} ┆ {2.0,"3",1} │ 265 | │ {3,3.1,"7"} ┆ {3.1,"7",3} │ 266 | │ {8,2.5,"3"} ┆ {2.5,"3",8} │ 267 | └─────────────┴─────────────┘ 268 | ``` 269 | 270 | The values look right - but is the schema? 271 | Let's take a look 272 | 273 | ```python 274 | import pprint 275 | pprint.pprint(df.with_columns(abc_shifted=mp.shift_struct("abc")).schema) 276 | ``` 277 | 278 | ``` 279 | OrderedDict([('abc', Struct({'a': Int64, 'b': Float64, 'c': String})), 280 | ('abc_shifted', Struct({'a': Float64, 'b': String, 'c': Int64}))]) 281 | ``` 282 | 283 | Looks correct! 284 | -------------------------------------------------------------------------------- /docs/sum.md: -------------------------------------------------------------------------------- 1 | # 3. How to do SUMthing 2 | 3 | So far, the expressions we wrote only operated on a single expression. 4 | 5 | What if we'd like to do something fancier, involving more than one expression? 
6 | Let's try to write an expression which lets us do 7 | 8 | ```python 9 | df.with_columns(mp.sum_i64('a', 'b')) 10 | ``` 11 | 12 | ## Take a ride on the Python side 13 | 14 | First, we need to be able to pass multiple inputs to our Rust function. We'll do that 15 | by using the `args` argument when we register our expression. Add the following to 16 | `minimal_plugin/__init__.py`: 17 | 18 | ```python 19 | def sum_i64(expr: IntoExprColumn, other: IntoExprColumn) -> pl.Expr: 20 | return register_plugin_function( 21 | args=[expr, other], 22 | plugin_path=LIB, 23 | function_name="sum_i64", 24 | is_elementwise=True, 25 | ) 26 | ``` 27 | 28 | ## I’ve got 1100011 problems but binary ain't one 29 | 30 | Time to write a binary function, in the sense that it takes two 31 | columns as input and produces a third. 32 | Polars gives us a handy `broadcast_binary_elementwise` function for computing binary elementwise operations! 33 | 34 | Add the following to `src/expressions.rs`: 35 | 36 | ```Rust 37 | #[polars_expr(output_type=Int64)] 38 | fn sum_i64(inputs: &[Series]) -> PolarsResult<Series> { 39 | let left: &Int64Chunked = inputs[0].i64()?; 40 | let right: &Int64Chunked = inputs[1].i64()?; 41 | // Note: there's a faster way of summing two columns, see 42 | // section 7. 43 | let out: Int64Chunked = broadcast_binary_elementwise( 44 | left, 45 | right, 46 | |left: Option<i64>, right: Option<i64>| match (left, right) { 47 | (Some(left), Some(right)) => Some(left + right), 48 | _ => None, 49 | }, 50 | ); 51 | Ok(out.into_series()) 52 | } 53 | ``` 54 | Note that you'll also need to add 55 | ```Rust 56 | use polars::prelude::arity::broadcast_binary_elementwise; 57 | ``` 58 | to the top of the `src/expressions.rs` file. 59 | 60 | !!! note 61 | 62 | There's a faster way of implementing this particular operation, 63 | which we'll cover later in the tutorial in [Branch mispredictions]. 64 | 65 | The idea is: 66 | 67 | - for each row, if both `left` and `right` are valid (i.e. 
they are both 68 | `Some`), then we sum them; 69 | - if either of them is missing (`None`), then we return `None`. 70 | 71 | To try it out, remember to first compile with `maturin develop` 72 | (or `maturin develop --release` if you're benchmarking). Then 73 | if you make a `run.py` file with 74 | ```python 75 | import polars as pl 76 | import minimal_plugin as mp 77 | 78 | df = pl.DataFrame({'a': [1, 5, 2], 'b': [3, None, -1]}) 79 | print(df.with_columns(a_plus_b=mp.sum_i64('a', 'b'))) 80 | ``` 81 | then `python run.py` should produce 82 | ``` 83 | shape: (3, 3) 84 | ┌─────┬──────┬──────────┐ 85 | │ a ┆ b ┆ a_plus_b │ 86 | │ --- ┆ --- ┆ --- │ 87 | │ i64 ┆ i64 ┆ i64 │ 88 | ╞═════╪══════╪══════════╡ 89 | │ 1 ┆ 3 ┆ 4 │ 90 | │ 5 ┆ null ┆ null │ 91 | │ 2 ┆ -1 ┆ 1 │ 92 | └─────┴──────┴──────────┘ 93 | ``` 94 | 95 | [Branch mispredictions]: ../branch_mispredictions/ 96 | 97 | ## Get over your exercises 98 | 99 | It's widely acknowledged that the best way to learn is by doing. 100 | 101 | Can you make `sum_numeric` (a generic version of `sum_i64`)? 102 | Can you support the case when `left` and `right` are of different 103 | types, e.g. `i8` plus `i16`? 104 | -------------------------------------------------------------------------------- /docs/vec_of_option.md: -------------------------------------------------------------------------------- 1 | 2 | # 13. `Vec<Option<T>>` vs. `Vec<T>` 3 | 4 | > "I got, I got, I got, I got options" – _Pitbull_, before writing his first Polars plugin 5 | 6 | In the plugins we looked at so far, we typically created an iterator of options and let Polars collect it into a `ChunkedArray`. 7 | Sometimes, however, you need to store intermediate values in a `Vec`. You might be tempted to make it a `Vec<Option<T>>`, where 8 | missing values are `None` and present values are `Some`... 9 | 10 | 🛑 BUT WAIT! 11 | 12 | Did you know that `Vec<Option<i32>>` occupies twice as much memory as `Vec<i32>`? 
Let's prove it: 13 | 14 | ```rust 15 | use std::mem::size_of_val; 16 | 17 | fn main() { 18 | let vector: Vec = vec![1, 2, 3]; 19 | println!("{}", size_of_val(&*vector)); 20 | // Output: 12 21 | 22 | let vector: Vec> = vec![Some(1), Some(2), Some(3)]; 23 | println!("{}", size_of_val(&*vector)); 24 | // Output: 24 25 | } 26 | ``` 27 | 28 | So...how can we create an output which includes missing values, without allocating twice as much memory as is necessary? 29 | 30 | ## Validity mask 31 | 32 | Instead of creating a vector of options, we can create a vector of primitive values with zeroes in place of the missing values, and use 33 | a validity mask to indicate which values are missing. One example of this can be seen in Polars' `interpolate_impl`, which does the heavy lifting for the 34 | [`Series.interpolate`](https://docs.pola.rs/api/python/version/0.18/reference/series/api/polars.Series.interpolate.html): 35 | 36 | ```rust 37 | fn interpolate_impl(chunked_arr: &ChunkedArray, interpolation_branch: I) -> ChunkedArray 38 | where 39 | T: PolarsNumericType, 40 | I: Fn(T::Native, T::Native, IdxSize, T::Native, &mut Vec), 41 | { 42 | // This implementation differs from pandas as that boundary None's are not removed. 43 | // This prevents a lot of errors due to expressions leading to different lengths. 44 | if !chunked_arr.has_nulls() || chunked_arr.null_count() == chunked_arr.len() { 45 | return chunked_arr.clone(); 46 | } 47 | 48 | // We first find the first and last so that we can set the null buffer. 49 | let first = chunked_arr.first_non_null().unwrap(); 50 | let last = chunked_arr.last_non_null().unwrap() + 1; 51 | 52 | // Fill out with `first` nulls. 
53 | let mut out = Vec::with_capacity(chunked_arr.len()); 54 | let mut iter = chunked_arr.iter().skip(first); 55 | for _ in 0..first { 56 | out.push(Zero::zero()); 57 | } 58 | 59 | // The next element of `iter` is definitely `Some(Some(v))`, because we skipped the first 60 | // elements `first` and if all values were missing we'd have done an early return. 61 | let mut low = iter.next().unwrap().unwrap(); 62 | out.push(low); 63 | while let Some(next) = iter.next() { 64 | if let Some(v) = next { 65 | out.push(v); 66 | low = v; 67 | } else { 68 | let mut steps = 1 as IdxSize; 69 | for next in iter.by_ref() { 70 | steps += 1; 71 | if let Some(high) = next { 72 | let steps_n: T::Native = NumCast::from(steps).unwrap(); 73 | interpolation_branch(low, high, steps, steps_n, &mut out); 74 | out.push(high); 75 | low = high; 76 | break; 77 | } 78 | } 79 | } 80 | } 81 | if first != 0 || last != chunked_arr.len() { 82 | let mut validity = MutableBitmap::with_capacity(chunked_arr.len()); 83 | validity.extend_constant(chunked_arr.len(), true); 84 | 85 | for i in 0..first { 86 | validity.set(i, false); 87 | } 88 | 89 | for i in last..chunked_arr.len() { 90 | validity.set(i, false); 91 | out.push(Zero::zero()) 92 | } 93 | 94 | let array = PrimitiveArray::new( 95 | T::get_dtype().to_arrow(CompatLevel::newest()), 96 | out.into(), 97 | Some(validity.into()), 98 | ); 99 | ChunkedArray::with_chunk(PlSmallStr::EMPTY, array) 100 | } else { 101 | ChunkedArray::from_vec(chunked_arr.name(), out) 102 | } 103 | } 104 | ``` 105 | 106 | That's a lot to digest at once, so let's take small steps and focus on the core logic. 
107 | At the start, we store the indexes of the first and last non-null values: 108 | 109 | ```rust 110 | let first = chunked_arr.first_non_null().unwrap(); 111 | let last = chunked_arr.last_non_null().unwrap() + 1; 112 | ``` 113 | 114 | We then create a vector `out` to store the result values in, and in places where we'd like 115 | the output to be missing, we push zeroes (we'll see below how we tell Polars that these are 116 | to be considered missing, rather than as ordinary zeroes): 117 | 118 | ```rust 119 | let mut out = Vec::with_capacity(chunked_arr.len()); 120 | for _ in 0..first { 121 | out.push(Zero::zero()); 122 | } 123 | ``` 124 | 125 | We then skip the first `first` elements and start interpolating (note how we write `out.push(low)`, not `out.push(Some(low))` 126 | - we gloss over the rest as it's not related to the main focus of this chapter): 127 | 128 | ```rust 129 | let mut iter = chunked_arr.iter().skip(first); 130 | let mut low = iter.next().unwrap().unwrap(); 131 | out.push(low); 132 | while let Some(next) = iter.next() { 133 | // Interpolation logic 134 | } 135 | ``` 136 | 137 | Now, after _most_ of the work is done and we've filled up most of `out`, 138 | we create a validity mask and set it to `false` for elements which we'd like to declare as missing: 139 | 140 | ```rust 141 | if first != 0 || last != chunked_arr.len() { 142 | // A validity mask is created for the vector, initially all set to true 143 | let mut validity = MutableBitmap::with_capacity(chunked_arr.len()); 144 | validity.extend_constant(chunked_arr.len(), true); 145 | 146 | for i in 0..first { 147 | // The indexes corresponding to the zeroes before the first valid value 148 | // are set to false (invalid) 149 | validity.set(i, false); 150 | } 151 | 152 | for i in last..chunked_arr.len() { 153 | // The indexes corresponding to the values after the last valid value 154 | // are set to false (invalid) 155 | validity.set(i, false); 156 | 157 | out.push(Zero::zero()) // Push zeroes 
after the last valid value, as 158 | // many as there are nulls at the end, just like 159 | // it was done before the first valid value. 160 | } 161 | 162 | let array = PrimitiveArray::new( 163 | T::get_dtype().to_arrow(CompatLevel::newest()), 164 | out.into(), 165 | Some(validity.into()), 166 | ); 167 | ChunkedArray::with_chunk(PlSmallStr::EMPTY, array) 168 | } else { 169 | ChunkedArray::from_vec(chunked_arr.name(), out) 170 | } 171 | ``` 172 | 173 | The `MutableBitmap` only requires one byte per 8 elements, so the total space used is much less than it would've been 174 | if we'd created `out` as a vector of options! 175 | Further, note how the validity mask is only allocated when the output contains nulls - if there are no nulls, we can 176 | save even more memory by not having a validity mask at all! 177 | 178 | ## Sentinel values 179 | 180 | Let's look at another example of where it's possible to avoid allocating a vector of options. This example comes 181 | from the Polars-XDT plugin. There's one function there which creates a temporary `idx` vector in which, for 182 | each element, we store the index of the previous element larger than it. If an element has no previous larger 183 | element, then rather than storing `None` (thus forcing all non-missing elements to be `Some`), we can just 184 | store `-1`. 185 | 186 | Take a look at [this diff from a PR](https://github.com/pola-rs/polars-xdt/pull/79/files#diff-991878a926639bba03bcc36a2790f73181b358f2ff59e0256f9ad76aa707be35) which does exactly that, 187 | in which most changes are along the lines of: 188 | 189 | ```diff 190 | - if i < Some(0) { 191 | - idx.push(None); 192 | + if i < 0 { 193 | + idx.push(-1); 194 | ``` 195 | 196 | There's no functional behaviour change, but we already know the memory benefits! 197 | 198 | ## Conclusion 199 | 200 | In general, _if you can avoid allocating `Vec>` instead of `Vec`,_ __do it!__! 
201 | 202 | !!!note 203 | 204 | This advice only applies if you're creating a vector to store results in. If you're collecting 205 | an iterator of options into a chunked array, then Polars already optimises this for you. 206 | -------------------------------------------------------------------------------- /docs/where_to_go.md: -------------------------------------------------------------------------------- 1 | # Where to go from here? 2 | 3 | What now? 4 | 5 | If this material was a bit overwhelming for you, I'd suggest taking a step back 6 | and reading [The Rust Programming Language](https://doc.rust-lang.org/book). 7 | Or at least, the first 10 chapters. 8 | 9 | Next, you may be interested in looking at existing plugins for inspiration. 10 | There's a nice list of them in the official user guide: https://docs.pola.rs/user-guide/plugins/your-first-polars-plugin/#community-plugins. 11 | 12 | Finally, you should definitely join the Discord Server, where there's a channel 13 | dedicated to plugins: https://discord.gg/4UfP5cfBE7. 
"""Python entry points for the plugin's Rust expressions.

Every function below registers a call to the Rust function of the same name
(see ``src/expressions.rs`` and ``src/arrays.rs``) through
``register_plugin_function`` and returns the resulting ``pl.Expr``.
"""
from __future__ import annotations
from typing import TYPE_CHECKING

import polars as pl
from pathlib import Path

from polars.plugins import register_plugin_function


# Directory containing this file; passed to Polars as the plugin location.
LIB = Path(__file__).parent

if TYPE_CHECKING:
    from minimal_plugin.typing import IntoExprColumn


def noop(expr: IntoExprColumn) -> pl.Expr:
    """Return ``expr`` unchanged (the Rust side clones its input series)."""
    return register_plugin_function(
        args=[expr],
        plugin_path=LIB,
        function_name="noop",
        is_elementwise=True,
    )


def abs_i64(expr: IntoExprColumn) -> pl.Expr:
    """Absolute value of an ``Int64`` column; nulls stay null."""
    return register_plugin_function(
        args=[expr],
        plugin_path=LIB,
        function_name="abs_i64",
        is_elementwise=True,
    )


def abs_numeric(expr: IntoExprColumn) -> pl.Expr:
    """Absolute value of an ``Int32``/``Int64``/``Float32``/``Float64`` column.

    The Rust implementation rejects every other dtype with an
    ``InvalidOperation`` error. Nulls stay null.
    """
    return register_plugin_function(
        args=[expr],
        plugin_path=LIB,
        function_name="abs_numeric",
        is_elementwise=True,
    )


def sum_i64(expr: IntoExprColumn, other: IntoExprColumn) -> pl.Expr:
    """Element-wise sum of two ``Int64`` columns; null if either side is null."""
    return register_plugin_function(
        args=[expr, other],
        plugin_path=LIB,
        function_name="sum_i64",
        is_elementwise=True,
    )


def cum_sum(expr: IntoExprColumn) -> pl.Expr:
    """Cumulative sum of an ``Int64`` column.

    Null inputs produce null outputs and leave the running total untouched.
    """
    return register_plugin_function(
        args=[expr],
        plugin_path=LIB,
        function_name="cum_sum",
        # Each output depends on all preceding rows, so not element-wise.
        is_elementwise=False,
    )


def pig_latinnify(expr: IntoExprColumn) -> pl.Expr:
    """Move each string's first character to the end and append ``"ay"``.

    Empty strings are returned unchanged.
    """
    return register_plugin_function(
        args=[expr],
        plugin_path=LIB,
        function_name="pig_latinnify",
        is_elementwise=True,
    )


def remove_extension(expr: IntoExprColumn) -> pl.Expr:
    """Drop everything from the last ``.`` onwards in each string.

    Strings without a ``.`` are returned unchanged.
    """
    return register_plugin_function(
        args=[expr],
        plugin_path=LIB,
        function_name="remove_extension",
        is_elementwise=True,
    )


def abs_i64_fast(expr: IntoExprColumn) -> pl.Expr:
    """Absolute value of an ``Int64`` column, like :func:`abs_i64`.

    The Rust side maps over the raw values of each chunk and re-attaches the
    chunk's existing validity mask instead of handling ``Option`` per element.
    """
    return register_plugin_function(
        args=[expr],
        plugin_path=LIB,
        function_name="abs_i64_fast",
        is_elementwise=True,
    )


def add_suffix(expr: IntoExprColumn, *, suffix: str) -> pl.Expr:
    """Append ``suffix`` to every string in ``expr``.

    ``suffix`` is keyword-only and is forwarded to Rust through ``kwargs``.
    """
    return register_plugin_function(
        args=[expr],
        plugin_path=LIB,
        function_name="add_suffix",
        is_elementwise=True,
        kwargs={"suffix": suffix},
    )


def snowball_stem(expr: IntoExprColumn) -> pl.Expr:
    """Register a call to the ``snowball_stem`` plugin function.

    NOTE(review): the Rust ``snowball_stem`` in ``src/expressions.rs`` (an
    English Snowball stemmer) is commented out, so evaluating this expression
    presumably fails until it is re-enabled -- confirm.
    """
    return register_plugin_function(
        args=[expr],
        plugin_path=LIB,
        function_name="snowball_stem",
        is_elementwise=True,
    )


def weighted_mean(expr: IntoExprColumn, weights: IntoExprColumn) -> pl.Expr:
    """Row-wise weighted mean of a list column.

    ``expr`` must be ``List(Int64)`` and ``weights`` ``List(Float64)``; the
    Rust side raises a ``ComputeError`` otherwise. A row is null when either
    list is null or the values list is empty. Value/weight pairs in which
    either element is null are skipped.
    """
    return register_plugin_function(
        args=[expr, weights],
        plugin_path=LIB,
        function_name="weighted_mean",
        is_elementwise=True,
    )


def print_struct_fields(expr: IntoExprColumn) -> pl.Expr:
    """Print every field of a struct column and return the struct."""
    return register_plugin_function(
        args=[expr],
        plugin_path=LIB,
        function_name="print_struct_fields",
        is_elementwise=True,
    )


def shift_struct(expr: IntoExprColumn) -> pl.Expr:
    """Rotate the values of a struct's fields one position to the left.

    Field names keep their positions: each field receives the values of the
    field after it, and the last field receives the first field's values.
    """
    return register_plugin_function(
        args=[expr],
        plugin_path=LIB,
        function_name="shift_struct",
        is_elementwise=True,
    )


def reverse_geocode(lat: IntoExprColumn, long: IntoExprColumn) -> pl.Expr:
    """Name of the record the reverse geocoder finds for each coordinate pair.

    Both ``lat`` and ``long`` must be ``Float64`` columns.
    """
    return register_plugin_function(
        args=[lat, long],
        plugin_path=LIB,
        function_name="reverse_geocode",
        is_elementwise=True,
    )


def non_zero_indices(expr: IntoExprColumn) -> pl.Expr:
    """For each ``List(Int64)`` row, list the positions whose value is not 0.

    The Rust filter keeps everything that is not ``Some(0)``, so null
    elements are reported as well.
    """
    return register_plugin_function(
        args=[expr],
        plugin_path=LIB,
        function_name="non_zero_indices",
        is_elementwise=True,
    )


def vertical_weighted_mean(values: IntoExprColumn, weights: IntoExprColumn) -> pl.Expr:
    """Weighted mean over whole ``Float64`` columns, producing a single value.

    Value/weight pairs in which either element is null are skipped.
    """
    return register_plugin_function(
        args=[values, weights],
        plugin_path=LIB,
        function_name="vertical_weighted_mean",
        # Aggregation: consumes the whole column and yields one value.
        is_elementwise=False,
        returns_scalar=True,
    )


def interpolate(expr: IntoExprColumn) -> pl.Expr:
    """Linearly interpolate the nulls between non-null ``Int64`` values.

    Nulls before the first and after the last non-null value stay null.
    """
    return register_plugin_function(
        args=[expr],
        plugin_path=LIB,
        function_name="interpolate",
        # Each gap is filled from its neighbouring rows, so not element-wise.
        is_elementwise=False,
    )


def life_step(
    left: IntoExprColumn, mid: IntoExprColumn, right: IntoExprColumn
) -> pl.Expr:
    """Compute the next Game of Life state of column ``mid``.

    ``left`` and ``right`` are the neighbouring columns. All three must be
    ``Int64``; the Rust side requires each to be contiguous (a single chunk).
    The rows above and below wrap around at the top and bottom.
    """
    return register_plugin_function(
        args=[left, mid, right],
        plugin_path=LIB,
        function_name="life_step",
        # Every cell reads the rows above and below it, so not element-wise.
        is_elementwise=False,
    )


def midpoint_2d(expr: IntoExprColumn, ref_point: tuple[float, float]) -> pl.Expr:
    """Midpoint between each 2D point in ``expr`` and ``ref_point``.

    ``expr`` is an array column of ``Float64`` pairs; ``ref_point`` is
    forwarded to Rust through ``kwargs``.
    """
    return register_plugin_function(
        args=[expr],
        plugin_path=LIB,
        function_name="midpoint_2d",
        is_elementwise=True,
        kwargs={"ref_point": ref_point},
    )
# Extensions
markdown_extensions:
  - abbr
  - admonition
  - attr_list
  - def_list
  - footnotes
  - md_in_html
  - toc:
      permalink: true
  - pymdownx.arithmatex:
      generic: true
  - pymdownx.betterem:
      smart_enable: all
  - pymdownx.caret
  - pymdownx.details
  - pymdownx.emoji:
      emoji_generator: !!python/name:material.extensions.emoji.to_svg
      emoji_index: !!python/name:material.extensions.emoji.twemoji
  - pymdownx.highlight:
      anchor_linenums: true
      line_spans: __span
      pygments_lang_class: true
  - pymdownx.inlinehilite
  - pymdownx.keys
  - pymdownx.magiclink:
      normalize_issue_symbols: true
      repo_url_shorthand: true
      # Default repository for shorthand issue/PR/commit references.
      # Must match `repo_url` above; it previously pointed at
      # squidfunk/mkdocs-material (left over from the theme's sample config).
      user: MarcoGorelli
      repo: polars-plugins-tutorial
  - pymdownx.mark
  - pymdownx.smartsymbols
  - pymdownx.snippets:
      # NOTE(review): no `includes/mkdocs.md` appears in this repository's
      # tree -- confirm whether this auto-append is still wanted.
      auto_append:
        - includes/mkdocs.md
  - pymdownx.superfences:
      custom_fences:
        - name: mermaid
          class: mermaid
          format: !!python/name:pymdownx.superfences.fence_code_format
  - pymdownx.tabbed:
      alternate_style: true
      combine_header_slug: true
      slugify: !!python/object/apply:pymdownx.slugs.slugify
        kwds:
          case: lower
  - pymdownx.tasklist:
      custom_checkbox: true
  - pymdownx.tilde
"""Benchmark the plugin's ``abs`` implementations against ``map_elements``.

Times ``abs_i64_fast``, ``abs_i64`` and a Python-level
``map_elements(lambda x: abs(x))`` on a 10-million-row ``Int64`` column in
which every other value is null, and prints min / max / mean +/- standard
error of the per-call time for each.
"""
import timeit
import warnings

import numpy as np

# Executed once per timer before the timed statement runs.
# The plugin functions are called as ``mp.<name>(column)``: minimal_plugin
# exposes plain functions and registers no ``.mp`` expression namespace.
setup = """
import polars as pl
import minimal_plugin as mp
import numpy as np
rng = np.random.default_rng(12345)
N = 10_000_000

df = pl.DataFrame({'a': rng.integers(low=-100, high=100, size=N)})
df = df.with_row_index().with_columns(
    pl.when(pl.col('index')%2==1).then(pl.lit(None)).otherwise(pl.col('a')).alias('a')
)
"""

# Number of timing samples, and statement executions per sample.
REPEATS = 7
CALLS_PER_REPEAT = 3


def summarise(results):
    """Return the three report lines for an array of per-call timings."""
    return [
        f"min: {min(results)}",
        f"max: {max(results)}",
        f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}",
    ]


def bench(stmt, *, silence_warnings=False):
    """Time ``stmt`` and print its summary.

    Args:
        stmt: Statement to time; it runs in the namespace built by ``setup``.
        silence_warnings: Ignore warnings raised while timing (used for the
            ``map_elements`` baseline).
    """
    with warnings.catch_warnings():
        if silence_warnings:
            warnings.simplefilter("ignore")
        # Each sample is the total for CALLS_PER_REPEAT runs; normalise it
        # to seconds per call.
        results = (
            np.array(
                timeit.Timer(stmt=stmt, setup=setup).repeat(
                    REPEATS, CALLS_PER_REPEAT
                )
            )
            / CALLS_PER_REPEAT
        )
    for line in summarise(results):
        print(line)


def main():
    """Run the three benchmarks in the original order."""
    bench("df.select(mp.abs_i64_fast('a'))")
    bench("df.select(mp.abs_i64('a'))")
    bench(
        "df.select(pl.col('a').map_elements(lambda x: abs(x)))",
        silence_warnings=True,
    )


if __name__ == "__main__":
    main()
12 | df = pl.DataFrame({'a': [rng.integers(low=-100, high=100, size=5) for _ in range(N)]}) 13 | """ 14 | 15 | results = ( 16 | np.array( 17 | timeit.Timer( 18 | stmt="df.select(mp.non_zero_indices('a'))", 19 | setup=setup, 20 | ).repeat(7, 3) 21 | ) 22 | / 3 23 | ) 24 | print(f"min: {min(results)}") 25 | print(f"max: {max(results)}") 26 | print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}") 27 | 28 | results = ( 29 | np.array( 30 | timeit.Timer( 31 | stmt="df.select(pl.col('a').list.eval(pl.arg_where(pl.element() != 0)))", 32 | setup=setup, 33 | ).repeat(7, 3) 34 | ) 35 | / 3 36 | ) 37 | print(f"min: {min(results)}") 38 | print(f"max: {max(results)}") 39 | print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}") 40 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.0,<2.0", "polars>=1.3.0"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "minimal_plugin" # Should match the folder with your code! 
7 | requires-python = ">=3.8" 8 | classifiers = [ 9 | "Programming Language :: Rust", 10 | "Programming Language :: Python :: Implementation :: CPython", 11 | "Programming Language :: Python :: Implementation :: PyPy", 12 | ] 13 | dynamic = ["version"] 14 | 15 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | ruff 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | maturin>=1.4.0 2 | polars>=1.3.0 3 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | 2 | import polars as pl 3 | import minimal_plugin as mp 4 | 5 | 6 | df = pl.DataFrame( 7 | {"values": [[1, 3, 2], [5, 7], []], "weights": [[0.5, 0.3, 0.2], [0.1, 0.9], []]} 8 | ) 9 | print(df.with_columns(weighted_mean=mp.weighted_mean("values", "weights"))) 10 | 11 | df = pl.DataFrame( 12 | { 13 | "english": ["foo", "bar", ""], 14 | } 15 | ) 16 | print(df.with_columns(pig_latin=mp.pig_latinnify("english"))) 17 | 18 | df = pl.DataFrame( 19 | { 20 | "values": [1.0, 3, 2, 5, 7], 21 | "weights": [0.5, 0.3, 0.2, 0.1, 0.9], 22 | "group": ["a", "a", "a", "b", "b"], 23 | } 24 | ) 25 | print( 26 | df.group_by("group").agg( 27 | weighted_mean=mp.vertical_weighted_mean("values", "weights") 28 | ) 29 | ) 30 | 31 | df = pl.DataFrame( 32 | { 33 | "a": [None, None, 3, None, None, 9, 11, None], 34 | } 35 | ) 36 | result = df.with_columns(interpolate=mp.interpolate("a")) 37 | print(result) 38 | 39 | 40 | df = pl.DataFrame({ 41 | 'filename': [ 42 | "requirements.txt", "Makefile", "pkg.tar.gz", "tmp.d" 43 | ], 44 | }) 45 | print(df.with_columns(without_ext=mp.remove_extension('filename'))) 46 | 47 | points = pl.Series( 48 | 
"points", 49 | [ 50 | [6.63, 8.35], 51 | [7.19, 4.85], 52 | [2.1, 4.21], 53 | [3.4, 6.13], 54 | [2.48, 9.26], 55 | [9.41, 7.26], 56 | [7.45, 8.85], 57 | [6.58, 5.22], 58 | [6.05, 5.77], 59 | [8.57, 4.16], 60 | [3.22, 4.98], 61 | [6.62, 6.62], 62 | [9.36, 7.44], 63 | [8.34, 3.43], 64 | [4.47, 7.61], 65 | [4.34, 5.05], 66 | [5.0, 5.05], 67 | [5.0, 5.0], 68 | [2.07, 7.8], 69 | [9.45, 9.6], 70 | [3.1, 3.26], 71 | [4.37, 5.72], 72 | ], 73 | dtype=pl.Array(pl.Float64, 2), 74 | ) 75 | df = pl.DataFrame(points) 76 | result = df.with_columns(midpoints=mp.midpoint_2d("points", ref_point=(5.0, 5.0))) 77 | print(result) 78 | -------------------------------------------------------------------------------- /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | channel = "nightly" 3 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | group_imports = "StdExternalCrate" 2 | imports_granularity = "Module" 3 | match_block_trailing_comma = true 4 | -------------------------------------------------------------------------------- /src/arrays.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::unused_unit)] 2 | use polars::prelude::*; 3 | use polars_core::utils::CustomIterTools; 4 | use pyo3_polars::derive::polars_expr; 5 | use serde::Deserialize; 6 | 7 | pub fn point_2d_output(_: &[Field]) -> PolarsResult { 8 | Ok(Field::new( 9 | PlSmallStr::from_static("point_2d"), 10 | DataType::Array(Box::new(DataType::Float64), 2), 11 | )) 12 | } 13 | 14 | #[derive(Deserialize)] 15 | struct MidPoint2DKwargs { 16 | ref_point: [f64; 2], 17 | } 18 | 19 | #[polars_expr(output_type_func=point_2d_output)] 20 | fn midpoint_2d(inputs: &[Series], kwargs: MidPoint2DKwargs) -> PolarsResult { 21 | let ca: &ArrayChunked = inputs[0].array()?; 22 | let ref_point = 
kwargs.ref_point; 23 | 24 | let out: ArrayChunked = unsafe { 25 | ca.try_apply_amortized_same_type(|row| { 26 | let s = row.as_ref(); 27 | let ca = s.f64()?; 28 | let out_inner: Float64Chunked = ca 29 | .iter() 30 | .enumerate() 31 | .map(|(idx, opt_val)| opt_val.map(|val| (val + ref_point[idx]) / 2.0f64)) 32 | .collect_trusted(); 33 | Ok(out_inner.into_series()) 34 | }) 35 | }?; 36 | 37 | Ok(out.into_series()) 38 | } 39 | -------------------------------------------------------------------------------- /src/expressions.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::unused_unit)] 2 | use std::ops::{Add, Div, Mul, Sub}; 3 | 4 | use num_traits::{NumCast, Zero, Signed}; 5 | use polars::prelude::arity::{ 6 | binary_elementwise_into_string_amortized, broadcast_binary_elementwise, 7 | }; 8 | use polars::prelude::*; 9 | use polars_arrow::bitmap::MutableBitmap; 10 | use polars_core::series::amortized_iter::AmortSeries; 11 | use polars_core::utils::align_chunks_binary; 12 | use pyo3_polars::derive::polars_expr; 13 | use pyo3_polars::export::polars_core::utils::arrow::array::PrimitiveArray; 14 | use pyo3_polars::export::polars_core::utils::CustomIterTools; 15 | use serde::Deserialize; 16 | 17 | fn same_output_type(input_fields: &[Field]) -> PolarsResult { 18 | let field = &input_fields[0]; 19 | Ok(field.clone()) 20 | } 21 | 22 | #[polars_expr(output_type_func=same_output_type)] 23 | fn noop(inputs: &[Series]) -> PolarsResult { 24 | let s = &inputs[0]; 25 | Ok(s.clone()) 26 | } 27 | 28 | #[polars_expr(output_type=Int64)] 29 | fn abs_i64(inputs: &[Series]) -> PolarsResult { 30 | let s = &inputs[0]; 31 | let ca: &Int64Chunked = s.i64()?; 32 | // NOTE: there's a faster way of implementing `abs_i64`, which we'll 33 | // cover in section 7. 
34 | let out: Int64Chunked = ca.apply(|opt_v: Option| opt_v.map(|v: i64| v.abs())); 35 | Ok(out.into_series()) 36 | } 37 | 38 | fn impl_abs_numeric(ca: &ChunkedArray) -> ChunkedArray 39 | where 40 | T: PolarsNumericType, 41 | T::Native: Signed, 42 | { 43 | ca.apply(|opt_v: Option| opt_v.map(|v: T::Native| v.abs())) 44 | } 45 | 46 | #[polars_expr(output_type_func=same_output_type)] 47 | fn abs_numeric(inputs: &[Series]) -> PolarsResult { 48 | let s = &inputs[0]; 49 | match s.dtype() { 50 | DataType::Int32 => Ok(impl_abs_numeric(s.i32().unwrap()).into_series()), 51 | DataType::Int64 => Ok(impl_abs_numeric(s.i64().unwrap()).into_series()), 52 | DataType::Float32 => Ok(impl_abs_numeric(s.f32().unwrap()).into_series()), 53 | DataType::Float64 => Ok(impl_abs_numeric(s.f64().unwrap()).into_series()), 54 | dtype => { 55 | polars_bail!(InvalidOperation:format!("dtype {dtype} not \ 56 | supported for abs_numeric, expected Int32, Int64, Float32, Float64.")) 57 | }, 58 | } 59 | } 60 | 61 | #[polars_expr(output_type=Int64)] 62 | fn sum_i64(inputs: &[Series]) -> PolarsResult { 63 | let left: &Int64Chunked = inputs[0].i64()?; 64 | let right: &Int64Chunked = inputs[1].i64()?; 65 | // Note: there's a faster way of summing two columns, see 66 | // section 7. 
67 | let out: Int64Chunked = 68 | broadcast_binary_elementwise(left, right, |left: Option, right: Option| match ( 69 | left, right, 70 | ) { 71 | (Some(left), Some(right)) => Some(left + right), 72 | _ => None, 73 | }); 74 | Ok(out.into_series()) 75 | } 76 | 77 | #[polars_expr(output_type_func=same_output_type)] 78 | fn cum_sum(inputs: &[Series]) -> PolarsResult { 79 | let s = &inputs[0]; 80 | let ca: &Int64Chunked = s.i64()?; 81 | let out: Int64Chunked = ca 82 | .iter() 83 | .scan(0_i64, |state: &mut i64, x: Option| match x { 84 | Some(x) => { 85 | *state += x; 86 | Some(Some(*state)) 87 | }, 88 | None => Some(None), 89 | }) 90 | .collect_trusted(); 91 | Ok(out.into_series()) 92 | } 93 | 94 | use std::borrow::Cow; 95 | use std::fmt::Write; 96 | 97 | #[polars_expr(output_type=String)] 98 | fn pig_latinnify(inputs: &[Series]) -> PolarsResult { 99 | let s = &inputs[0]; 100 | let ca = s.str()?; 101 | let out: StringChunked = ca.apply(|opt_v: Option<&str>| { 102 | opt_v.map(|value: &str| { 103 | // Not the recommended way to do it, 104 | // see below for a better way! 
105 | if let Some(first_char) = value.chars().next() { 106 | Cow::Owned(format!("{}{}ay", &value[1..], first_char)) 107 | } else { 108 | Cow::Borrowed(value) 109 | } 110 | }) 111 | }); 112 | Ok(out.into_series()) 113 | } 114 | 115 | fn remove_last_extension(s: &str) -> &str { 116 | match s.rfind('.') { 117 | Some(pos) => &s[..pos], 118 | None => s, 119 | } 120 | } 121 | 122 | #[polars_expr(output_type=String)] 123 | fn remove_extension(inputs: &[Series]) -> PolarsResult { 124 | let s = &inputs[0]; 125 | let ca = s.str()?; 126 | let out: StringChunked = ca.apply_values(|val| { 127 | let res = Cow::Borrowed(remove_last_extension(val)); 128 | res 129 | }); 130 | Ok(out.into_series()) 131 | } 132 | 133 | #[polars_expr(output_type=Int64)] 134 | fn abs_i64_fast(inputs: &[Series]) -> PolarsResult { 135 | let s = &inputs[0]; 136 | let ca = s.i64()?; 137 | let chunks = ca 138 | .downcast_iter() 139 | .map(|arr| arr.values().as_slice()) 140 | .zip(ca.iter_validities()) 141 | .map(|(slice, validity)| { 142 | let arr: PrimitiveArray = slice.iter().copied().map(|x| x.abs()).collect_arr(); 143 | arr.with_validity(validity.cloned()) 144 | }); 145 | let out = Int64Chunked::from_chunk_iter(PlSmallStr::EMPTY, chunks); 146 | Ok(out.into_series()) 147 | } 148 | 149 | #[derive(Deserialize)] 150 | struct AddSuffixKwargs { 151 | suffix: String, 152 | } 153 | 154 | #[polars_expr(output_type=String)] 155 | fn add_suffix(inputs: &[Series], kwargs: AddSuffixKwargs) -> PolarsResult { 156 | let s = &inputs[0]; 157 | let ca = s.str()?; 158 | let out = ca.apply_into_string_amortized(|value, output| { 159 | write!(output, "{}{}", value, kwargs.suffix).unwrap(); 160 | }); 161 | Ok(out.into_series()) 162 | } 163 | 164 | // use rust_stemmers::{Algorithm, Stemmer}; 165 | 166 | // #[polars_expr(output_type=String)] 167 | // fn snowball_stem(inputs: &[Series]) -> PolarsResult { 168 | // let ca: &StringChunked = inputs[0].str()?; 169 | // let en_stemmer = Stemmer::create(Algorithm::English); 170 | // 
let out: StringChunked = ca.apply_into_string_amortized(|value: &str, output: &mut String| { 171 | // write!(output, "{}", en_stemmer.stem(value)).unwrap() 172 | // }); 173 | // Ok(out.into_series()) 174 | // } 175 | 176 | fn binary_amortized_elementwise<'a, T, K, F>( 177 | lhs: &'a ListChunked, 178 | rhs: &'a ListChunked, 179 | mut f: F, 180 | ) -> ChunkedArray 181 | where 182 | T: PolarsDataType, 183 | T::Array: ArrayFromIter>, 184 | F: FnMut(&AmortSeries, &AmortSeries) -> Option + Copy, 185 | { 186 | { 187 | let (lhs, rhs) = align_chunks_binary(lhs, rhs); 188 | lhs.amortized_iter() 189 | .zip(rhs.amortized_iter()) 190 | .map(|(lhs, rhs)| match (lhs, rhs) { 191 | (Some(lhs), Some(rhs)) => f(&lhs, &rhs), 192 | _ => None, 193 | }) 194 | .collect_ca(PlSmallStr::EMPTY) 195 | } 196 | } 197 | 198 | #[polars_expr(output_type=Float64)] 199 | fn weighted_mean(inputs: &[Series]) -> PolarsResult { 200 | let values = inputs[0].list()?; 201 | let weights = &inputs[1].list()?; 202 | polars_ensure!( 203 | values.dtype() == &DataType::List(Box::new(DataType::Int64)), 204 | ComputeError: "Expected `values` to be of type `List(Int64)`, got: {}", values.dtype() 205 | ); 206 | polars_ensure!( 207 | weights.dtype() == &DataType::List(Box::new(DataType::Float64)), 208 | ComputeError: "Expected `weights` to be of type `List(Float64)`, got: {}", weights.dtype() 209 | ); 210 | 211 | let out: Float64Chunked = binary_amortized_elementwise( 212 | values, 213 | weights, 214 | |values_inner: &AmortSeries, weights_inner: &AmortSeries| -> Option { 215 | let values_inner = values_inner.as_ref().i64().unwrap(); 216 | let weights_inner = weights_inner.as_ref().f64().unwrap(); 217 | if values_inner.is_empty() { 218 | // Mirror Polars, and return None for empty mean. 
219 | return None; 220 | } 221 | let mut numerator: f64 = 0.; 222 | let mut denominator: f64 = 0.; 223 | values_inner 224 | .iter() 225 | .zip(weights_inner.iter()) 226 | .for_each(|(v, w)| { 227 | if let (Some(v), Some(w)) = (v, w) { 228 | numerator += v as f64 * w; 229 | denominator += w; 230 | } 231 | }); 232 | Some(numerator / denominator) 233 | }, 234 | ); 235 | Ok(out.into_series()) 236 | } 237 | 238 | fn struct_point_2d_output(input_fields: &[Field]) -> PolarsResult { 239 | let field = &input_fields[0]; 240 | match field.dtype() { 241 | DataType::Struct(fields) => Ok(Field::new( 242 | "struct_point_2d".into(), 243 | DataType::Struct(fields.clone()), 244 | )), 245 | dtype => polars_bail!(InvalidOperation: "expected Struct dtype, got {}", dtype), 246 | } 247 | } 248 | 249 | #[polars_expr(output_type_func=struct_point_2d_output)] 250 | fn print_struct_fields(inputs: &[Series]) -> PolarsResult { 251 | let struct_ = inputs[0].struct_()?; 252 | let fields = struct_.fields_as_series(); 253 | 254 | if fields.is_empty() { 255 | return Ok(inputs[0].clone()); 256 | } 257 | 258 | let fields = fields 259 | .iter() 260 | .map(|s| { 261 | let s = s.clone(); 262 | println!("{:?}", s); 263 | s 264 | }) 265 | .collect::>(); 266 | 267 | StructChunked::from_series(struct_.name().clone(), struct_.len(), fields.iter()) 268 | .map(|ca| ca.into_series()) 269 | } 270 | 271 | fn shifted_struct(input_fields: &[Field]) -> PolarsResult { 272 | let field = &input_fields[0]; 273 | match field.dtype() { 274 | DataType::Struct(fields) => { 275 | let mut field_0 = fields[0].clone(); 276 | let name = field_0.name.clone(); 277 | field_0.set_name(fields[fields.len() - 1].name().clone()); 278 | let mut fields = fields[1..] 
279 | .iter() 280 | .zip(fields[0..fields.len() - 1].iter()) 281 | .map(|(fld, name)| Field::new(name.name().clone(), fld.dtype().clone())) 282 | .collect::>(); 283 | fields.push(field_0); 284 | Ok(Field::new(name, DataType::Struct(fields))) 285 | }, 286 | _ => unreachable!(), 287 | } 288 | } 289 | 290 | #[polars_expr(output_type_func=shifted_struct)] 291 | fn shift_struct(inputs: &[Series]) -> PolarsResult { 292 | let struct_ = inputs[0].struct_()?; 293 | let fields = struct_.fields_as_series(); 294 | if fields.is_empty() { 295 | return Ok(inputs[0].clone()); 296 | } 297 | let mut field_0 = fields[0].clone(); 298 | let name = field_0.name().clone(); 299 | field_0.rename(fields[fields.len() - 1].name().clone()); 300 | let mut fields = fields[1..] 301 | .iter() 302 | .zip(fields[..fields.len() - 1].iter()) 303 | .map(|(s, name)| { 304 | let mut s = s.clone(); 305 | s.rename(name.name().clone()); 306 | s 307 | }) 308 | .collect::>(); 309 | fields.push(field_0); 310 | StructChunked::from_series(name, struct_.len(), fields.iter()).map(|ca| ca.into_series()) 311 | } 312 | 313 | use reverse_geocoder::ReverseGeocoder; 314 | 315 | #[polars_expr(output_type=String)] 316 | fn reverse_geocode(inputs: &[Series]) -> PolarsResult { 317 | let latitude = inputs[0].f64()?; 318 | let longitude = inputs[1].f64()?; 319 | let geocoder = ReverseGeocoder::new(); 320 | let out = binary_elementwise_into_string_amortized(latitude, longitude, |lhs, rhs, out| { 321 | let search_result = geocoder.search((lhs, rhs)); 322 | write!(out, "{}", search_result.record.name).unwrap(); 323 | }); 324 | Ok(out.into_series()) 325 | } 326 | 327 | fn list_idx_dtype(input_fields: &[Field]) -> PolarsResult { 328 | let field = Field::new( 329 | input_fields[0].name.clone(), 330 | DataType::List(Box::new(IDX_DTYPE)), 331 | ); 332 | Ok(field.clone()) 333 | } 334 | 335 | #[polars_expr(output_type_func=list_idx_dtype)] 336 | fn non_zero_indices(inputs: &[Series]) -> PolarsResult { 337 | let ca = inputs[0].list()?; 
338 | polars_ensure!( 339 | ca.dtype() == &DataType::List(Box::new(DataType::Int64)), 340 | ComputeError: "Expected `List(Int64)`, got: {}", ca.dtype() 341 | ); 342 | 343 | let out: ListChunked = ca.apply_amortized(|s| { 344 | let s: &Series = s.as_ref(); 345 | let ca: &Int64Chunked = s.i64().unwrap(); 346 | let out: IdxCa = ca 347 | .iter() 348 | .enumerate() 349 | .filter(|(_idx, opt_val)| opt_val != &Some(0)) 350 | .map(|(idx, _opt_val)| Some(idx as IdxSize)) 351 | .collect_ca(PlSmallStr::EMPTY); 352 | out.into_series() 353 | }); 354 | Ok(out.into_series()) 355 | } 356 | 357 | #[polars_expr(output_type=Float64)] 358 | fn vertical_weighted_mean(inputs: &[Series]) -> PolarsResult { 359 | let values = &inputs[0].f64()?; 360 | let weights = &inputs[1].f64()?; 361 | let mut numerator = 0.; 362 | let mut denominator = 0.; 363 | values.iter().zip(weights.iter()).for_each(|(v, w)| { 364 | if let (Some(v), Some(w)) = (v, w) { 365 | numerator += v * w; 366 | denominator += w; 367 | } 368 | }); 369 | let result = numerator / denominator; 370 | Ok(Series::new(PlSmallStr::EMPTY, vec![result])) 371 | } 372 | 373 | fn linear_itp(low: T, step: T, slope: T) -> T 374 | where 375 | T: Sub + Mul + Add + Div, 376 | { 377 | low + step * slope 378 | } 379 | 380 | #[inline] 381 | fn signed_interp(low: T, high: T, steps: IdxSize, steps_n: T, out: &mut Vec) 382 | where 383 | T: Sub + Mul + Add + Div + NumCast + Copy, 384 | { 385 | let slope = (high - low) / steps_n; 386 | for step_i in 1..steps { 387 | let step_i: T = NumCast::from(step_i).unwrap(); 388 | let v = linear_itp(low, step_i, slope); 389 | out.push(v) 390 | } 391 | } 392 | 393 | fn interpolate_impl(chunked_arr: &ChunkedArray, interpolation_branch: I) -> ChunkedArray 394 | where 395 | T: PolarsNumericType, 396 | I: Fn(T::Native, T::Native, IdxSize, T::Native, &mut Vec), 397 | { 398 | // This implementation differs from pandas as that boundary None's are not removed. 
399 | // This prevents a lot of errors due to expressions leading to different lengths. 400 | if chunked_arr.null_count() == 0 || chunked_arr.null_count() == chunked_arr.len() { 401 | return chunked_arr.clone(); 402 | } 403 | 404 | // We first find the first and last so that we can set the null buffer. 405 | let first = chunked_arr.first_non_null().unwrap(); 406 | let last = chunked_arr.last_non_null().unwrap() + 1; 407 | 408 | // Fill out with `first` nulls. 409 | let mut out = Vec::with_capacity(chunked_arr.len()); 410 | let mut iter = chunked_arr.iter().skip(first); 411 | for _ in 0..first { 412 | out.push(Zero::zero()); 413 | } 414 | 415 | // The next element of `iter` is definitely `Some(Some(v))`, because we skipped the first 416 | // elements `first` and if all values were missing we'd have done an early return. 417 | let mut low = iter.next().unwrap().unwrap(); 418 | out.push(low); 419 | while let Some(next) = iter.next() { 420 | if let Some(v) = next { 421 | out.push(v); 422 | low = v; 423 | } else { 424 | let mut steps = 1 as IdxSize; 425 | for next in iter.by_ref() { 426 | steps += 1; 427 | if let Some(high) = next { 428 | let steps_n: T::Native = NumCast::from(steps).unwrap(); 429 | interpolation_branch(low, high, steps, steps_n, &mut out); 430 | out.push(high); 431 | low = high; 432 | break; 433 | } 434 | } 435 | } 436 | } 437 | if first != 0 || last != chunked_arr.len() { 438 | let mut validity = MutableBitmap::with_capacity(chunked_arr.len()); 439 | validity.extend_constant(chunked_arr.len(), true); 440 | 441 | for i in 0..first { 442 | validity.set(i, false); 443 | } 444 | 445 | for i in last..chunked_arr.len() { 446 | validity.set(i, false); 447 | out.push(Zero::zero()) 448 | } 449 | 450 | let array = PrimitiveArray::new( 451 | T::get_dtype().to_arrow(CompatLevel::newest()), 452 | out.into(), 453 | Some(validity.into()), 454 | ); 455 | ChunkedArray::with_chunk(PlSmallStr::EMPTY, array) 456 | } else { 457 | ChunkedArray::from_vec(PlSmallStr::EMPTY, 
out) 458 | } 459 | } 460 | 461 | #[polars_expr(output_type=Int64)] 462 | fn interpolate(inputs: &[Series]) -> PolarsResult { 463 | let s = &inputs[0]; 464 | let ca = s.i64()?; 465 | let out: Int64Chunked = interpolate_impl(ca, signed_interp::); 466 | Ok(out.into_series()) 467 | } 468 | 469 | #[polars_expr(output_type=Int64)] 470 | fn life_step(inputs: &[Series]) -> PolarsResult { 471 | let (ca_lf, ca_curr, ca_rt) = (inputs[0].i64()?, inputs[1].i64()?, inputs[2].i64()?); 472 | 473 | let lf = ca_lf 474 | .cont_slice() 475 | .expect("Expected input to be contiguous (in a single chunk)"); 476 | let mid = ca_curr 477 | .cont_slice() 478 | .expect("Expected input to be contiguous (in a single chunk)"); 479 | let rt = ca_rt 480 | .cont_slice() 481 | .expect("Expected input to be contiguous (in a single chunk)"); 482 | 483 | let len = lf.len(); 484 | 485 | let out: Int64Chunked = mid 486 | .iter() 487 | .enumerate() 488 | .map(|(idx, val)| { 489 | // Neighbours above 490 | let prev_row = if 0 == idx { 491 | lf[len - 1] + mid[len - 1] + rt[len - 1] 492 | } else { 493 | lf[idx - 1] + mid[idx - 1] + rt[idx - 1] 494 | }; 495 | 496 | // Curr row does not include cell in the middle, 497 | // a cell is not a neighbour of itself 498 | let curr_row = lf[idx] + rt[idx]; 499 | 500 | // Neighbours below 501 | let next_row = if len - 1 == idx { 502 | lf[0] + mid[0] + rt[0] 503 | } else { 504 | lf[idx + 1] + mid[idx + 1] + rt[idx + 1] 505 | }; 506 | 507 | // Life logic 508 | Some(match (val, prev_row + curr_row + next_row) { 509 | (1, 2) | (1, 3) => 1, 510 | (0, 3) => 1, 511 | _ => 0, 512 | }) 513 | }) 514 | .collect_trusted(); 515 | Ok(out.into_series()) 516 | } 517 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | mod arrays; 2 | mod expressions; 3 | 4 | use pyo3_polars::PolarsAllocator; 5 | 6 | #[global_allocator] 7 | static ALLOC: PolarsAllocator = 
PolarsAllocator::new(); 8 | -------------------------------------------------------------------------------- /test_plugin.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | import minimal_plugin as mp 3 | from polars.testing import assert_frame_equal 4 | 5 | 6 | def test_noop(): 7 | df = pl.DataFrame( 8 | {"a": [1, 1, None], "b": [4.1, 5.2, 6.3], "c": ["hello", "everybody!", "!"]} 9 | ) 10 | result = df.with_columns(mp.noop(pl.all()).name.suffix("_noop")) 11 | expected = pl.DataFrame( 12 | { 13 | "a": [1, 1, None], 14 | "b": [4.1, 5.2, 6.3], 15 | "c": ["hello", "everybody!", "!"], 16 | "a_noop": [1, 1, None], 17 | "b_noop": [4.1, 5.2, 6.3], 18 | "c_noop": ["hello", "everybody!", "!"], 19 | } 20 | ) 21 | assert_frame_equal(result, expected) 22 | 23 | 24 | def test_abs_i64(): 25 | df = pl.DataFrame( 26 | {"a": [1, -1, None], "b": [4.1, 5.2, -6.3], "c": ["hello", "everybody!", "!"]} 27 | ) 28 | result = df.with_columns(mp.abs_i64("a").name.suffix("_abs")) 29 | expected = pl.DataFrame( 30 | { 31 | "a": [1, -1, None], 32 | "b": [4.1, 5.2, -6.3], 33 | "c": ["hello", "everybody!", "!"], 34 | "a_abs": [1, 1, None], 35 | } 36 | ) 37 | assert_frame_equal(result, expected) 38 | 39 | 40 | def test_abs_numeric(): 41 | df = pl.DataFrame( 42 | {"a": [1, -1, None], "b": [4.1, 5.2, -6.3], "c": ["hello", "everybody!", "!"]} 43 | ) 44 | result = df.with_columns(mp.abs_numeric(pl.col("a", "b")).name.suffix("_abs")) 45 | expected = pl.DataFrame( 46 | { 47 | "a": [1, -1, None], 48 | "b": [4.1, 5.2, -6.3], 49 | "c": ["hello", "everybody!", "!"], 50 | "a_abs": [1, 1, None], 51 | "b_abs": [4.1, 5.2, 6.3], 52 | } 53 | ) 54 | assert_frame_equal(result, expected) 55 | 56 | 57 | def test_sum_i64(): 58 | df = pl.DataFrame({"a": [1, 5, 2], "b": [3, None, -1]}) 59 | result = df.with_columns(a_plus_b=mp.sum_i64("a", "b")) 60 | expected = pl.DataFrame( 61 | {"a": [1, 5, 2], "b": [3, None, -1], "a_plus_b": [4, None, 1]} 62 | ) 63 | 
assert_frame_equal(result, expected) 64 | 65 | 66 | def test_cum_sum(): 67 | df = pl.DataFrame( 68 | { 69 | "a": [1, 2, 3, 4, None, 5], 70 | "b": [1, 1, 1, 2, 2, 2], 71 | } 72 | ) 73 | result = df.with_columns(a_cum_sum=mp.cum_sum("a")) 74 | expected = pl.DataFrame( 75 | { 76 | "a": [1, 2, 3, 4, None, 5], 77 | "b": [1, 1, 1, 2, 2, 2], 78 | "a_cum_sum": [1, 3, 6, 10, None, 15], 79 | } 80 | ) 81 | assert_frame_equal(result, expected) 82 | result = df.with_columns(a_cum_sum=mp.cum_sum("a").over("b")) 83 | expected = pl.DataFrame( 84 | { 85 | "a": [1, 2, 3, 4, None, 5], 86 | "b": [1, 1, 1, 2, 2, 2], 87 | "a_cum_sum": [1, 3, 6, 4, None, 9], 88 | } 89 | ) 90 | assert_frame_equal(result, expected) 91 | 92 | 93 | def test_pig_latinnify(): 94 | df = pl.DataFrame({"a": ["I", "love", "pig", "latin"]}) 95 | result = df.with_columns(a_pig_latin=mp.pig_latinnify("a")) 96 | expected = pl.DataFrame( 97 | { 98 | "a": ["I", "love", "pig", "latin"], 99 | "a_pig_latin": ["Iay", "ovelay", "igpay", "atinlay"], 100 | } 101 | ) 102 | assert_frame_equal(result, expected) 103 | 104 | 105 | def test_add_suffix(): 106 | df = pl.DataFrame({"a": ["bob", "billy"]}) 107 | result = df.with_columns(mp.add_suffix("a", suffix="-billy")) 108 | expected = pl.DataFrame({"a": ["bob-billy", "billy-billy"]}) 109 | assert_frame_equal(result, expected) 110 | 111 | 112 | def test_weighted_mean(): 113 | df = pl.DataFrame( 114 | { 115 | "values": [[1, 3, 2], [5, 7], None, [5, 7], []], 116 | "weights": [[0.5, 0.3, 0.2], [0.1, 0.9], [0.1, 0.9], None, []], 117 | } 118 | ) 119 | result = df.with_columns(weighted_mean=mp.weighted_mean("values", "weights")) 120 | expected = pl.DataFrame( 121 | { 122 | "values": [[1, 3, 2], [5, 7], None, [5, 7], []], 123 | "weights": [[0.5, 0.3, 0.2], [0.1, 0.9], [0.1, 0.9], None, []], 124 | "weighted_mean": [1.7999999999999998, 6.8, None, None, None], 125 | } 126 | ) 127 | assert_frame_equal(result, expected) 128 | 129 | 130 | def test_non_zero_indices(): 131 | df = 
pl.DataFrame({"dense": [[0, 9], [8, 6, 0, 9], None, [3, 3]]}) 132 | result = df.with_columns(indices=mp.non_zero_indices("dense")) 133 | expected = pl.DataFrame( 134 | { 135 | "dense": [[0, 9], [8, 6, 0, 9], None, [3, 3]], 136 | "indices": [[1], [0, 1, 3], None, [0, 1]], 137 | }, 138 | schema_overrides={"indices": pl.List(pl.UInt32)}, 139 | ) 140 | assert_frame_equal(result, expected) 141 | 142 | 143 | def test_print_struct_fields(): 144 | df = pl.DataFrame( 145 | { 146 | "x": [1.0, 1.25, 1.5, 1.75], 147 | "y": [3.0, 2.75, 2.5, 2.25], 148 | "rgba": [0x00FF7FFF, 0xFF7F00FF, 0x7F7F7FFF, 0xD8D8D8FF], 149 | } 150 | ).select( 151 | point_2d_s=pl.struct( 152 | "x", 153 | "y", 154 | "rgba", 155 | schema={ 156 | "x": pl.Float64, 157 | "y": pl.Float64, 158 | "rgba": pl.UInt32, 159 | }, 160 | ) 161 | ) 162 | result = df.with_columns(point_2d_s=mp.print_struct_fields("point_2d_s")) 163 | assert_frame_equal(result, df) 164 | 165 | 166 | def test_shift_struct(): 167 | df = pl.DataFrame( 168 | { 169 | "a": [1, 3, 8], 170 | "b": [2.0, 3.1, 2.5], 171 | "c": ["3", "7", "3"], 172 | } 173 | ).select(abc=pl.struct("a", "b", "c")) 174 | result = df.with_columns(abc_shifted=mp.shift_struct("abc")) 175 | expected = pl.DataFrame( 176 | { 177 | "abc": [ 178 | {"a": 1, "b": 2.0, "c": "3"}, 179 | {"a": 3, "b": 3.1, "c": "7"}, 180 | {"a": 8, "b": 2.5, "c": "3"}, 181 | ], 182 | "abc_shifted": [ 183 | {"a": 2.0, "b": "3", "c": 1}, 184 | {"a": 3.1, "b": "7", "c": 3}, 185 | {"a": 2.5, "b": "3", "c": 8}, 186 | ], 187 | } 188 | ) 189 | assert_frame_equal(result, expected) 190 | 191 | 192 | def test_reverse_geocode(): 193 | df = pl.DataFrame({"lat": [37.7749, 51.01, 52.5], "lon": [-122.4194, -3.9, -0.91]}) 194 | result = df.with_columns(city=mp.reverse_geocode("lat", "lon")) 195 | expected = pl.DataFrame( 196 | { 197 | "lat": [37.7749, 51.01, 52.5], 198 | "lon": [-122.4194, -3.9, -0.91], 199 | "city": ["San Francisco", "South Molton", "Market Harborough"], 200 | } 201 | ) 202 | 
assert_frame_equal(result, expected) 203 | 204 | 205 | def test_vertical_weighted_mean(): 206 | df = pl.DataFrame( 207 | { 208 | "values": [1.0, 3, 2, 5, 7], 209 | "weights": [0.5, 0.3, 0.2, 0.1, 0.9], 210 | "group": ["a", "a", "a", "b", "b"], 211 | } 212 | ) 213 | result = ( 214 | df.group_by("group") 215 | .agg(weighted_mean=mp.vertical_weighted_mean("values", "weights")) 216 | .sort("group", descending=True) 217 | ) 218 | expected = pl.DataFrame( 219 | {"group": ["b", "a"], "weighted_mean": [6.8, 1.7999999999999998]} 220 | ) 221 | assert_frame_equal(result, expected) 222 | 223 | 224 | def test_midpoint_2d(): 225 | df = pl.DataFrame( 226 | pl.Series( 227 | "points", 228 | [ 229 | [6.63, 8.35], 230 | [7.19, 4.85], 231 | [2.1, 4.21], 232 | [3.4, 6.13], 233 | [2.48, 9.26], 234 | ], 235 | dtype=pl.Array(pl.Float64, 2), 236 | ) 237 | ) 238 | result = df.select(midpoints=mp.midpoint_2d("points", ref_point=(5.0, 5.0))) 239 | expected = pl.DataFrame( 240 | pl.Series( 241 | "midpoints", 242 | [[5.815, 6.675], [6.095, 4.925], [3.55, 4.605], [4.2, 5.565], [3.74, 7.13]], 243 | dtype=pl.Array(pl.Float64, 2), 244 | ) 245 | ) 246 | assert_frame_equal(result, expected) 247 | --------------------------------------------------------------------------------