├── .github
│   └── workflows
│       ├── check_remote_polars_version.yml
│       ├── ci.yml
│       └── pytest.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CODE_OF_CONDUCT.md
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── Makefile
├── README.md
├── check_polars_version.py
├── docs
│   ├── abs.md
│   ├── aggregate.md
│   ├── arguments.md
│   ├── arrays.md
│   ├── assets
│   │   ├── array00.png
│   │   ├── array01.png
│   │   ├── image.png
│   │   ├── life_toad.gif
│   │   ├── life_toad_df.gif
│   │   ├── list_chunked_memory_layout.png
│   │   ├── struct_array_memory_layout.png
│   │   ├── struct_example_Point2D.png
│   │   ├── structchunked_fields_memory_layout.png
│   │   └── timings.png
│   ├── branch_mispredictions.md
│   ├── cum_sum.md
│   ├── index.md
│   ├── life_pt1.md
│   ├── life_pt2.md
│   ├── lists.md
│   ├── lists_in_lists_out.md
│   ├── lost_in_space.md
│   ├── noop.md
│   ├── prerequisites.md
│   ├── publishing.md
│   ├── requirements-docs.txt
│   ├── stem.md
│   ├── stringify.md
│   ├── struct.md
│   ├── sum.md
│   ├── vec_of_option.md
│   └── where_to_go.md
├── minimal_plugin
│   ├── __init__.py
│   └── typing.py
├── mkdocs.yml
├── perf.py
├── perf_list.py
├── pyproject.toml
├── requirements-dev.txt
├── requirements.txt
├── run.py
├── rust-toolchain.toml
├── rustfmt.toml
├── src
│   ├── arrays.rs
│   ├── expressions.rs
│   └── lib.rs
└── test_plugin.py
/.github/workflows/check_remote_polars_version.yml:
--------------------------------------------------------------------------------
1 | name: Remote polars version check
2 |
3 | on:
4 |   push:
5 |     branches:
6 |       - main
7 |   pull_request:
8 |     branches:
9 |       - main
10 |
11 | concurrency:
12 |   group: ${{ github.workflow }}-${{ github.ref }}
13 |   cancel-in-progress: true
14 |
15 | permissions:
16 |   contents: read
17 |
18 | env:
19 |   RUSTFLAGS: "-Dwarnings"
20 |
21 | jobs:
22 |   polars_version_check:
23 |     runs-on: ubuntu-latest
24 |     strategy:
25 |       matrix:
26 |         target: [x86_64]
27 |         python-version: ["3.11", "3.12"]
28 |     steps:
29 |       - uses: actions/checkout@v3
30 |       - uses: actions/setup-python@v4
31 |         with:
32 |           python-version: ${{ matrix.python-version }}
33 |
34 |       - name: Check polars version used by the remote cookiecutter template
35 |         run: python check_polars_version.py
36 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: ci
2 | on:
3 |   push:
4 |     branches:
5 |       - master
6 |       - main
7 | permissions:
8 |   contents: write
9 | jobs:
10 |   deploy:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - uses: actions/checkout@v4
14 |       - name: Configure Git Credentials
15 |         run: |
16 |           git config user.name github-actions[bot]
17 |           git config user.email 41898282+github-actions[bot]@users.noreply.github.com
18 |       - uses: actions/setup-python@v4
19 |         with:
20 |           python-version: 3.x
21 |       - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
22 |       - uses: actions/cache@v3
23 |         with:
24 |           key: mkdocs-material-${{ env.cache_id }}
25 |           path: .cache
26 |           restore-keys: |
27 |             mkdocs-material-
28 |       - run: pip install mkdocs-material
29 |       - run: mkdocs gh-deploy --force
30 |
--------------------------------------------------------------------------------
/.github/workflows/pytest.yml:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by maturin v1.4.0
2 | # To update, run
3 | #
4 | #    maturin generate-ci github --pytest
5 | #
6 | name: CI
7 |
8 | on:
9 |   push:
10 |     branches:
11 |       - main
12 |       - master
13 |     tags:
14 |       - '*'
15 |   pull_request:
16 |   workflow_dispatch:
17 |
18 | concurrency:
19 |   group: ${{ github.workflow }}-${{ github.ref }}
20 |   cancel-in-progress: true
21 |
22 | permissions:
23 |   contents: read
24 |
25 | # Make sure CI fails on all warnings, including Clippy lints
26 | env:
27 |   RUSTFLAGS: "-Dwarnings"
28 |
29 | jobs:
30 |   linux_tests:
31 |     runs-on: ubuntu-latest
32 |     strategy:
33 |       matrix:
34 |         target: [x86_64]
35 |         python-version: ["3.9", "3.10", "3.11"]
36 |     steps:
37 |       - uses: actions/checkout@v3
38 |       - uses: actions/setup-python@v4
39 |         with:
40 |           python-version: ${{ matrix.python-version }}
41 |
42 |       - name: Set up Rust
43 |         run: rustup show
44 |       - uses: mozilla-actions/sccache-action@v0.0.3
45 |       - run: curl -LsSf https://astral.sh/uv/install.sh | sh
46 |       - run: uv venv --seed
47 |       - run: . .venv/bin/activate && uv pip install -r requirements.txt -r requirements-dev.txt
48 |       - run: make pre-commit
49 |       - run: make install
50 |       - run: make test
51 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/codespell-project/codespell
3 |   rev: v2.2.5
4 |   hooks:
5 |   - id: codespell
6 |     args: [--ignore-words-list=crate]
7 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free experience for everyone, regardless of age, body
7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
8 | identity and expression, level of experience, education, socio-economic status,
9 | nationality, personal appearance, race, religion, or sexual identity
10 | and orientation.
11 |
12 | We pledge to act and interact in ways that contribute to an open, welcoming,
13 | diverse, inclusive, and healthy community.
14 |
15 | ## Our Standards
16 |
17 | Examples of behavior that contributes to a positive environment for our
18 | community include:
19 |
20 | * Demonstrating empathy and kindness toward other people
21 | * Being respectful of differing opinions, viewpoints, and experiences
22 | * Giving and gracefully accepting constructive feedback
23 | * Accepting responsibility and apologizing to those affected by our mistakes,
24 | and learning from the experience
25 | * Focusing on what is best not just for us as individuals, but for the
26 | overall community
27 |
28 | Examples of unacceptable behavior include:
29 |
30 | * The use of sexualized language or imagery, and sexual attention or
31 | advances of any kind
32 | * Trolling, insulting or derogatory comments, and personal or political attacks
33 | * Public or private harassment
34 | * Publishing others' private information, such as a physical or email
35 | address, without their explicit permission
36 | * Other conduct which could reasonably be considered inappropriate in a
37 | professional setting
38 |
39 | ## Enforcement Responsibilities
40 |
41 | Community leaders are responsible for clarifying and enforcing our standards of
42 | acceptable behavior and will take appropriate and fair corrective action in
43 | response to any behavior that they deem inappropriate, threatening, offensive,
44 | or harmful.
45 |
46 | Community leaders have the right and responsibility to remove, edit, or reject
47 | comments, commits, code, wiki edits, issues, and other contributions that are
48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
49 | decisions when appropriate.
50 |
51 | ## Scope
52 |
53 | This Code of Conduct applies within all community spaces, and also applies when
54 | an individual is officially representing the community in public spaces.
55 | Examples of representing our community include using an official e-mail address,
56 | posting via an official social media account, or acting as an appointed
57 | representative at an online or offline event.
58 |
59 | ## Enforcement
60 |
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported to the community leaders responsible for enforcement at
63 | https://www.linkedin.com/in/marcogorelli/.
64 | All complaints will be reviewed and investigated promptly and fairly.
65 |
66 | All community leaders are obligated to respect the privacy and security of the
67 | reporter of any incident.
68 |
69 | ## Enforcement Guidelines
70 |
71 | Community leaders will follow these Community Impact Guidelines in determining
72 | the consequences for any action they deem in violation of this Code of Conduct:
73 |
74 | ### 1. Correction
75 |
76 | **Community Impact**: Use of inappropriate language or other behavior deemed
77 | unprofessional or unwelcome in the community.
78 |
79 | **Consequence**: A private, written warning from community leaders, providing
80 | clarity around the nature of the violation and an explanation of why the
81 | behavior was inappropriate. A public apology may be requested.
82 |
83 | ### 2. Warning
84 |
85 | **Community Impact**: A violation through a single incident or series
86 | of actions.
87 |
88 | **Consequence**: A warning with consequences for continued behavior. No
89 | interaction with the people involved, including unsolicited interaction with
90 | those enforcing the Code of Conduct, for a specified period of time. This
91 | includes avoiding interactions in community spaces as well as external channels
92 | like social media. Violating these terms may lead to a temporary or
93 | permanent ban.
94 |
95 | ### 3. Temporary Ban
96 |
97 | **Community Impact**: A serious violation of community standards, including
98 | sustained inappropriate behavior.
99 |
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 |
106 | ### 4. Permanent Ban
107 |
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 |
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 |
115 | ## Attribution
116 |
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120 |
121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
122 | enforcement ladder](https://github.com/mozilla/diversity).
123 |
124 | [homepage]: https://www.contributor-covenant.org
125 |
126 | For answers to common questions about this code of conduct, see the FAQ at
127 | https://www.contributor-covenant.org/faq. Translations are available at
128 | https://www.contributor-covenant.org/translations.
129 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | # Name of the project goes here
3 | # Note - it should be the same as the folder which you store your code in!
4 | name = "minimal_plugin"
5 | version = "0.1.0"
6 | edition = "2021"
7 |
8 | [lib]
9 | # Name of the project goes here
10 | # Note - it should be the same as the folder which you store your code in!
11 | name = "minimal_plugin"
12 | crate-type = ["cdylib"]
13 |
14 | [dependencies]
15 | pyo3 = { version = "0.23", features = ["extension-module", "abi3-py38"] }
16 | pyo3-polars = { version = "0.20", features = ["derive", "dtype-struct", "dtype-decimal", "dtype-array"] }
17 | serde = { version = "1", features = ["derive"] }
18 | polars = { version = "0.46.0", features = ["dtype-struct"], default-features = false }
19 | polars-arrow = { version = "0.46.0", default-features = false }
20 | polars-core = { version = "0.46.0", features = ["dtype-array"], default-features = false }
21 | polars-sql = { version = "0.46.0", default-features = false }
22 | reverse_geocoder = "4.1.1"
23 | num-traits = "0.2.19"
24 | # rust-stemmers = "1.2.0"
25 |
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Marco Edward Gorelli
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 |
2 | SHELL=/bin/bash
3 |
4 | install:
5 | 	unset CONDA_PREFIX && \
6 | 	source .venv/bin/activate && maturin develop -m Cargo.toml
7 |
8 | install-release:
9 | 	unset CONDA_PREFIX && \
10 | 	source .venv/bin/activate && maturin develop --release -m Cargo.toml
11 |
12 | pre-commit:
13 | 	cargo +nightly fmt --all --manifest-path Cargo.toml && cargo clippy --all-features --manifest-path Cargo.toml
14 | 	.venv/bin/python -m ruff format minimal_plugin test_plugin.py
15 | 	.venv/bin/python -m ruff check minimal_plugin test_plugin.py
16 |
17 | run: install
18 | 	source .venv/bin/activate && python run.py
19 |
20 | run-release: install-release
21 | 	source .venv/bin/activate && python run.py
22 |
23 | test:
24 | 	source .venv/bin/activate && pytest test_plugin.py
25 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # How you (yes, you!) can write a Polars Plugin
2 |
3 |
4 |
8 |
9 |
10 | - ✅ Unlock super-high performance
11 | - ✅ Have a tonne of fun
12 | - ✅ Impress everybody with your superpowers
13 |
14 | This repository is meant to accompany the tutorial from https://marcogorelli.github.io/polars-plugins-tutorial/.
15 |
16 | Logo
17 | ----
18 |
19 | Thanks to [Olha Urdeichuk](https://www.fiverr.com/olhaurdeichuk) for the illustration.
20 |
21 | Funding
22 | -------
23 |
24 | Thank you to [Quansight Labs](https://labs.quansight.org/) for having provided support to this project during their internships
25 | programme, during which [Bruno Conde](https://github.com/condekind) made very significant contributions to chapters on Strings,
26 | performance (`Vec<Option<T>>` vs. `Vec<T>`), nested data types (Array, List, Struct), the CI process, and the "can we run doom?" extra.
27 |
--------------------------------------------------------------------------------
/check_polars_version.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import sys
3 | import tomllib
4 | import re
5 | from typing import Any
6 |
7 |
8 | # Remote Cargo.toml from which we extract the version of 'polars'
9 | remote_cargo_url = r"https://raw.githubusercontent.com/MarcoGorelli/cookiecutter-polars-plugins/main/%7B%7B%20cookiecutter.project_slug%20%7D%7D/Cargo.toml"
10 | # Packages that should have the same version as the 'polars' package from above
11 | pinned_packages = ["polars", "polars-arrow", "polars-core"]
12 |
13 |
14 | def fetch_url_content(url: str) -> str | None:
15 |     try:
16 |         res = subprocess.run(
17 |             ["curl", "-s", url],
18 |             check=True,
19 |             stdout=subprocess.PIPE,
20 |             stderr=subprocess.PIPE,
21 |             text=True,
22 |         )
23 |         return res.stdout
24 |     except subprocess.CalledProcessError as e:
25 |         print(f"Error fetching URL: {e.stderr}")
26 |         return None
27 |
28 |
29 | # Fetch contents of remote (template) Cargo.toml
30 | raw_content = fetch_url_content(remote_cargo_url)
31 | if not raw_content:
32 |     print("Fetched template Cargo.toml is empty, try again")
33 |     sys.exit(1)
34 |
35 | # Load it as a dict with tomllib
36 | try:
37 |     template_cargo_toml = tomllib.loads(raw_content)
38 | except tomllib.TOMLDecodeError as e:
39 |     print(f"Error decoding template Cargo.toml: {e}")
40 |     sys.exit(1)
41 |
42 | # Store remote (template) polars version
43 | template_polars_version: str = template_cargo_toml["dependencies"]["polars"]["version"]
44 |
45 | # Load local Cargo.toml as a dict
46 | local_cargo_toml: dict[str, Any]
47 | with open("Cargo.toml", mode="rb") as local_cargo_toml_file:
48 |     try:
49 |         local_cargo_toml = tomllib.load(local_cargo_toml_file)
50 |     except tomllib.TOMLDecodeError as e:
51 |         print(f"Error decoding local Cargo.toml: {e}")
52 |         sys.exit(1)
53 |
54 | # Check each local pkg that should be pinned with the same ver. as the remote
55 | for pkg in pinned_packages:
56 |     version = local_cargo_toml["dependencies"][pkg]["version"]
57 |     if version != template_polars_version:
58 |         print(f"{pkg=} {version=} does not match {template_polars_version=}")
59 |         sys.exit(1)
60 |
61 |
62 | # Additionally, check other locations that reference polars version
63 | def find_local_polars_reference() -> str | None:
64 |     """
65 |     This will output a string with the format:
66 |
67 |         <filename>\0<line contents>
68 |         ...
69 |     """
70 |     try:
71 |         res = subprocess.run(
72 |             ["grep", "-rEZ", "--exclude-dir=target", r"^[+-]?polars = ", "."],
73 |             check=True,
74 |             stdout=subprocess.PIPE,
75 |             stderr=subprocess.PIPE,
76 |             text=True,
77 |         )
78 |         return res.stdout
79 |     except subprocess.CalledProcessError:
80 |         print("Error running `grep -rEZ --exclude-dir=target '^[+-]?polars = ' .`")
81 |         return None
82 |
83 |
84 | grep_result = find_local_polars_reference()
85 | if not grep_result:
86 |     print("Error running grep, try again")
87 |     sys.exit(1)
88 |
89 | # Iterate each non-empty line of the grep result
90 | for line in [ln for ln in grep_result.split("\n") if ln.strip()]:
91 |     # File name and line contents are separated with a null byte
92 |     filename, line = line.split("\0")
93 |
94 |     # Use a group to capture the exact version present in the line
95 |     m = re.search(r'\bversion = "([^"]+)"', line)
96 |     if not m:
97 |         print(f"Error extracting version from package in {filename}: {line}")
98 |         sys.exit(1)
99 |     if (ver := m.group(1)) != template_polars_version:
100 |         print(
101 |             f"Error in {filename}: local {ver=} does not "
102 |             f"match {template_polars_version=}: {line=}"
103 |         )
104 |         sys.exit(1)
105 |
--------------------------------------------------------------------------------
/docs/abs.md:
--------------------------------------------------------------------------------
1 | # 2. How to do ABSolutely nothing
2 |
3 | OK, the title's misleading. We won't do "nothing", we'll make an `abs` function
4 | which will work on numeric data.
5 |
6 | We'll do this in phases:
7 |
8 | - `abs_i64` will take the absolute value of each row of an `Int64` column
9 | - `abs_numeric` will take the absolute value of each row in any numeric column
10 |
11 | ## `abs_i64`
12 |
13 | Let's start with the Python side - this is almost the same as what
14 | we did for `noop`, we'll just change the names. Please add this to
15 | `minimal_plugin/__init__.py`, right below the definition of `noop`:
16 | ```python
17 | def abs_i64(expr: IntoExprColumn) -> pl.Expr:
18 |     return register_plugin_function(
19 |         args=[expr],
20 |         plugin_path=LIB,
21 |         function_name="abs_i64",
22 |         is_elementwise=True,
23 |     )
24 | ```
25 |
26 | Then, please add this to `src/expressions.rs`, right below the Rust
27 | definition of `noop`:
28 |
29 | ```Rust
30 | #[polars_expr(output_type=Int64)]
31 | fn abs_i64(inputs: &[Series]) -> PolarsResult<Series> {
32 |     let s = &inputs[0];
33 |     let ca: &Int64Chunked = s.i64()?;
34 |     // NOTE: there's a faster way of implementing `abs_i64`, which we'll
35 |     // cover in section 7.
36 |     let out: Int64Chunked = ca.apply(|opt_v: Option<i64>| opt_v.map(|v: i64| v.abs()));
37 |     Ok(out.into_series())
38 | }
39 | ```
40 |
41 | The general idea here is:
42 |
43 | - Each element `opt_v` can either be `Some(i64)`, or `None`.
44 | If it's `None`, we return `None`, whereas if it's `Some(i64)`,
45 | then we return `Some` of the absolute value of the `i64` value.
46 |
47 | !!!note
48 |
49 |     There's a faster way of implementing `abs_i64`, which you'll learn
50 |     about in [Branch mispredictions].
51 |
52 | - We produce a new ChunkedArray, convert it to Series, and return it.
53 |
54 | Let's try this out. Make a Python file `run.py` with the following:
55 | ```python
56 | import polars as pl
57 | import minimal_plugin as mp
58 |
59 | df = pl.DataFrame({
60 |     'a': [1, -1, None],
61 |     'b': [4.1, 5.2, -6.3],
62 |     'c': ['hello', 'everybody!', '!']
63 | })
64 | print(df.with_columns(mp.abs_i64('a').name.suffix('_abs')))
65 | ```
66 | Compile it with `maturin develop` (or `maturin develop --release` if you're benchmarking), and run it with `python run.py`.
67 | If it outputs
68 | ```
69 | shape: (3, 4)
70 | ┌──────┬──────┬────────────┬───────┐
71 | │ a    ┆ b    ┆ c          ┆ a_abs │
72 | │ ---  ┆ ---  ┆ ---        ┆ ---   │
73 | │ i64  ┆ f64  ┆ str        ┆ i64   │
74 | ╞══════╪══════╪════════════╪═══════╡
75 | │ 1    ┆ 4.1  ┆ hello      ┆ 1     │
76 | │ -1   ┆ 5.2  ┆ everybody! ┆ 1     │
77 | │ null ┆ -6.3 ┆ !          ┆ null  │
78 | └──────┴──────┴────────────┴───────┘
79 | ```
80 | then you did everything correctly!
81 |
82 | [Branch mispredictions]: ../branch_mispredictions/
83 |
84 | ## `abs_numeric`
85 |
86 | The code above unfortunately only supports `Int64` columns. Let's try to
87 | generalise it a bit, so that it can accept any signed numeric column.
88 |
89 | First, add the following definition to `minimal_plugin/__init__.py`:
90 |
91 | ```python
92 | def abs_numeric(expr: IntoExprColumn) -> pl.Expr:
93 |     return register_plugin_function(
94 |         args=[expr],
95 |         plugin_path=LIB,
96 |         function_name="abs_numeric",
97 |         is_elementwise=True,
98 |     )
99 | ```
100 |
101 | Then, we'll go back to `src/expressions.rs`.
102 | Paste in the following:
103 |
104 | ```Rust
105 | fn impl_abs_numeric(ca: &Int64Chunked) -> Int64Chunked {
106 |     // NOTE: there's a faster way of implementing `abs`, which we'll
107 |     // cover in section 7.
108 |     ca.apply(|opt_v: Option<i64>| opt_v.map(|v: i64| v.abs()))
109 | }
110 |
111 | #[polars_expr(output_type=Int64)]
112 | fn abs_numeric(inputs: &[Series]) -> PolarsResult<Series> {
113 |     let s = &inputs[0];
114 |     let ca: &Int64Chunked = s.i64()?;
115 |     let out = impl_abs_numeric(ca);
116 |     Ok(out.into_series())
117 | }
118 | ```
119 |
120 | Note how it's exactly like `abs_i64`, but `impl_abs_numeric` was
121 | factored out of the `abs_numeric` function. It's not yet generic,
122 | we need to do a bit more work.
123 | The general idea is:
124 |
125 | - each `ChunkedArray` is of some Polars Type `T` (e.g. `Int64`);
126 | - to each Polars Type `T`, there corresponds a Rust native type `T::Native` (e.g. `i64`).
127 |
128 | Change `impl_abs_numeric` to:
129 |
130 | ```Rust
131 | fn impl_abs_numeric<T>(ca: &ChunkedArray<T>) -> ChunkedArray<T>
132 | where
133 |     T: PolarsNumericType,
134 |     T::Native: Signed,
135 | {
136 |     // NOTE: there's a faster way of implementing `abs`, which we'll
137 |     // cover in section 7.
138 |     ca.apply(|opt_v: Option<T::Native>| opt_v.map(|v: T::Native| v.abs()))
139 | }
140 | ```
141 | Make sure to add
142 | ```Rust
143 | use pyo3_polars::export::polars_core::export::num::Signed;
144 | ```
145 | to the top of the `src/expressions.rs` file.
146 |
147 | We then need to modify `abs_numeric` as follows:
148 | ```Rust
149 | #[polars_expr(output_type_func=same_output_type)]
150 | fn abs_numeric(inputs: &[Series]) -> PolarsResult<Series> {
151 |     let s = &inputs[0];
152 |     match s.dtype() {
153 |         DataType::Int32 => Ok(impl_abs_numeric(s.i32().unwrap()).into_series()),
154 |         DataType::Int64 => Ok(impl_abs_numeric(s.i64().unwrap()).into_series()),
155 |         DataType::Float32 => Ok(impl_abs_numeric(s.f32().unwrap()).into_series()),
156 |         DataType::Float64 => Ok(impl_abs_numeric(s.f64().unwrap()).into_series()),
157 |         dtype => {
158 |             polars_bail!(InvalidOperation:format!("dtype {dtype} not \
159 |             supported for abs_numeric, expected Int32, Int64, Float32, Float64."))
160 |         }
161 |     }
162 | }
163 | ```
164 | That's it! Our function is now generic over signed numeric types,
165 | instead of only accepting the `Int64` type.
166 |
167 | Finally, modify the `print` line of `run.py` to be
168 | ```python
169 | print(df.with_columns(mp.abs_numeric(pl.col('a', 'b')).name.suffix('_abs')))
170 | ```
171 |
172 | Compile with `maturin develop` (or `maturin develop --release`
173 | if you're benchmarking) and then run with `python run.py`. You should
174 | see:
175 | ```
176 | shape: (3, 5)
177 | ┌──────┬──────┬────────────┬───────┬───────┐
178 | │ a    ┆ b    ┆ c          ┆ a_abs ┆ b_abs │
179 | │ ---  ┆ ---  ┆ ---        ┆ ---   ┆ ---   │
180 | │ i64  ┆ f64  ┆ str        ┆ i64   ┆ f64   │
181 | ╞══════╪══════╪════════════╪═══════╪═══════╡
182 | │ 1    ┆ 4.1  ┆ hello      ┆ 1     ┆ 4.1   │
183 | │ -1   ┆ 5.2  ┆ everybody! ┆ 1     ┆ 5.2   │
184 | │ null ┆ -6.3 ┆ !          ┆ null  ┆ 6.3   │
185 | └──────┴──────┴────────────┴───────┴───────┘
186 | ```
187 | Note how we were able to take the absolute value of both `b` (`f64`)
188 | and `a` (`i64`) columns with `abs_numeric`!
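
If you ever need more signed dtypes, the same pattern scales up. Here's a hedged sketch (the `abs_signed_ints` name and the `Int8`/`Int16` arms are illustrative assumptions, not part of this plugin's code, and the smaller integer dtypes may require the `dtype-i8`/`dtype-i16` features on the `polars` crate):

```Rust
// Hypothetical variant of `abs_numeric` covering all signed integers.
#[polars_expr(output_type_func=same_output_type)]
fn abs_signed_ints(inputs: &[Series]) -> PolarsResult<Series> {
    let s = &inputs[0];
    match s.dtype() {
        DataType::Int8 => Ok(impl_abs_numeric(s.i8().unwrap()).into_series()),
        DataType::Int16 => Ok(impl_abs_numeric(s.i16().unwrap()).into_series()),
        DataType::Int32 => Ok(impl_abs_numeric(s.i32().unwrap()).into_series()),
        DataType::Int64 => Ok(impl_abs_numeric(s.i64().unwrap()).into_series()),
        dtype => {
            polars_bail!(InvalidOperation:format!("dtype {dtype} not \
            supported for abs_signed_ints, expected a signed integer dtype."))
        }
    }
}
```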
189 |
--------------------------------------------------------------------------------
/docs/aggregate.md:
--------------------------------------------------------------------------------
1 | # 15. In (the) aggregate
2 |
3 | Enough transforming columns! Let's aggregate them instead.
4 |
5 | A Polars expression is a function from a Dataframe to a Series. So,
6 | how can we possibly write an expression which produces a scalar?
7 |
8 | Simple:
9 |
10 | - write an expression which returns a 1-row Series
11 | - when you register the expression, pass `returns_scalar = True`
12 |
13 | As an example, let's compute the weighted mean of a column, where
14 | the weights are given by a second column.
15 |
16 | ## Hello Python my old friend
17 |
18 | Nothing fancy here:
19 |
20 | ```python
21 | def vertical_weighted_mean(values: IntoExprColumn, weights: IntoExprColumn) -> pl.Expr:
22 |     return register_plugin_function(
23 |         args=[values, weights],
24 |         plugin_path=LIB,
25 |         function_name="vertical_weighted_mean",
26 |         is_elementwise=False,
27 |         returns_scalar=True,
28 |     )
29 | ```
30 |
31 | ## Rust
32 |
33 | To keep this example's complexity down, let's just limit it to `Float64` columns.
34 |
35 | ```rust
36 | #[polars_expr(output_type=Float64)]
37 | fn vertical_weighted_mean(inputs: &[Series]) -> PolarsResult<Series> {
38 |     let values = &inputs[0].f64()?;
39 |     let weights = &inputs[1].f64()?;
40 |     let mut numerator = 0.;
41 |     let mut denominator = 0.;
42 |     values.iter().zip(weights.iter()).for_each(|(v, w)| {
43 |         if let (Some(v), Some(w)) = (v, w) {
44 |             numerator += v * w;
45 |             denominator += w;
46 |         }
47 |     });
48 |     let result = numerator / denominator;
49 |     Ok(Series::new(PlSmallStr::EMPTY, vec![result]))
50 | }
51 | ```
52 |
53 | ## Run it!
54 |
55 | Put the following in `run.py`:
56 |
57 | ```python
58 | df = pl.DataFrame({
59 |     'values': [1., 3, 2, 5, 7],
60 |     'weights': [.5, .3, .2, .1, .9],
61 |     'group': ['a', 'a', 'a', 'b', 'b'],
62 | })
63 | print(df.group_by('group').agg(weighted_mean = mp.vertical_weighted_mean('values', 'weights')))
64 | ```
65 |
66 | If you compile with `maturin develop` (or `maturin develop --release` if benchmarking), you'll see:
67 |
68 | ```
69 | shape: (2, 2)
70 | ┌───────┬───────────────┐
71 | │ group ┆ weighted_mean │
72 | │ ---   ┆ ---           │
73 | │ str   ┆ f64           │
74 | ╞═══════╪═══════════════╡
75 | │ b     ┆ 6.8           │
76 | │ a     ┆ 1.8           │
77 | └───────┴───────────────┘
78 | ```
79 |
80 | Try omitting `returns_scalar=True` when registering the expression - what changes?
81 |
--------------------------------------------------------------------------------
/docs/arguments.md:
--------------------------------------------------------------------------------
1 | # 8. I'd like to have an argument, please
2 |
3 | Say you want to rewrite
4 | ```python
5 | def add_suffix(s, *, suffix):
6 |     return s + suffix
7 |
8 | s.map_elements(lambda x: add_suffix(x, suffix='-billy'))
9 | ```
10 | as a plugin. How can you do that?
11 |
12 | We've covered passing in extra columns, but...how about passing extra
13 | keyword arguments?
14 |
15 | We'll do this with `kwargs`. In `minimal_plugin/__init__.py`, add the
16 | following:
17 |
18 | ```python
19 | def add_suffix(expr: IntoExprColumn, *, suffix: str) -> pl.Expr:
20 |     return register_plugin_function(
21 |         args=[expr],
22 |         plugin_path=LIB,
23 |         function_name="add_suffix",
24 |         is_elementwise=True,
25 |         kwargs={"suffix": suffix},
26 |     )
27 | ```
28 |
29 | In `src/expressions.rs`, we'll then first have to define a struct to hold
30 | our keyword-arguments:
31 |
32 | ```rust
33 | #[derive(Deserialize)]
34 | struct AddSuffixKwargs {
35 |     suffix: String,
36 | }
37 | ```
38 | Make sure to also add
39 | ```rust
40 | use serde::Deserialize;
41 | ```
42 | to the top of the file.
43 |
44 | Then, we can just pass an argument of this type to a `add_suffix` function,
45 | which is going to be very similar to the good version of `pig_latinnify`:
46 |
47 | ```rust
48 | #[polars_expr(output_type=String)]
49 | fn add_suffix(inputs: &[Series], kwargs: AddSuffixKwargs) -> PolarsResult<Series> {
50 |     let s = &inputs[0];
51 |     let ca = s.str()?;
52 |     let out = ca.apply_into_string_amortized(|value, output| {
53 |         write!(output, "{}{}", value, kwargs.suffix).unwrap();
54 |     });
55 |     Ok(out.into_series())
56 | }
57 | ```
58 |
59 | To see it in action, compile with `maturin develop` (or `maturin develop --release` if you're
60 | benchmarking), and then you should be able to put
61 | ```python
62 | import polars as pl
63 | import minimal_plugin as mp
64 |
65 | df = pl.DataFrame({'a': ['bob', 'billy']})
66 | print(df.with_columns(mp.add_suffix('a', suffix='-billy')))
67 | ```
68 | into `run.py`, and run it to get
69 | ```
70 | shape: (2, 1)
71 | ┌─────────────┐
72 | │ a           │
73 | │ ---         │
74 | │ str         │
75 | ╞═════════════╡
76 | │ bob-billy   │
77 | │ billy-billy │
78 | └─────────────┘
79 | ```
80 | You can add multiple keyword-arguments in the same function, just make sure to
81 | include them in the struct which you define on the Rust side.
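
For instance, here's a hedged sketch of a function taking two keyword-arguments (the `prefix` kwarg and the `add_affixes` name are hypothetical, not part of this plugin):

```rust
// Hypothetical: two keyword arguments handled by a single struct.
#[derive(Deserialize)]
struct AddAffixesKwargs {
    prefix: String,
    suffix: String,
}

#[polars_expr(output_type=String)]
fn add_affixes(inputs: &[Series], kwargs: AddAffixesKwargs) -> PolarsResult<Series> {
    let ca = inputs[0].str()?;
    let out = ca.apply_into_string_amortized(|value, output| {
        write!(output, "{}{}{}", kwargs.prefix, value, kwargs.suffix).unwrap();
    });
    Ok(out.into_series())
}
```

On the Python side, you'd then pass both of them when registering: `kwargs={"prefix": prefix, "suffix": suffix}`.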
82 |
--------------------------------------------------------------------------------
/docs/arrays.md:
--------------------------------------------------------------------------------
1 |
2 | # 11. ARRAY, captain!
3 |
4 | We've talked about lists, structs, but what about arrays?
5 |
6 | In this section we're gonna cover how to deal with fixed-size arrays, e.g. the x and y coordinates of 2D points *in the same column*:
7 |
8 | ```python
9 | points = pl.Series(
10 | "points",
11 | [
12 | [6.63, 8.35],
13 | [7.19, 4.85],
14 | [2.1, 4.21],
15 | [3.4, 6.13],
16 | ],
17 | dtype=pl.Array(pl.Float64, 2),
18 | )
19 | df = pl.DataFrame(points)
20 |
21 | print(df)
22 | ```
23 |
24 | ```
25 | shape: (4, 1)
26 | ┌───────────────┐
27 | │ points        │
28 | │ ---           │
29 | │ array[f64, 2] │
30 | ╞═══════════════╡
31 | │ [6.63, 8.35]  │
32 | │ [7.19, 4.85]  │
33 | │ [2.1, 4.21]   │
34 | │ [3.4, 6.13]   │
35 | └───────────────┘
36 | ```
37 |
38 | Let's get to work - what if we wanted to make a plugin that takes a Series like `points` above, and, likewise, returned a Series of arrays?
39 | Turns out we _can_ do it! But it's a little bit tricky.
40 |
41 | __First of all__, we need to include `features = ["dtype-array"]` in both `pyo3-polars` and `polars-core` in our `Cargo.toml`.
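
Concretely, mirroring this repository's own `Cargo.toml`:

```toml
[dependencies]
pyo3-polars = { version = "0.20", features = ["derive", "dtype-struct", "dtype-decimal", "dtype-array"] }
polars-core = { version = "0.46.0", features = ["dtype-array"], default-features = false }
```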
42 |
43 | Now let's create a plugin that calculates the midpoint between a reference point and each point in a Series like the one above.
44 | This should illustrate both how to unpack an array inside our Rust code and also return a Series of the same type.
45 |
46 | We'll start by registering our plugin:
47 |
48 | ```python
49 | def midpoint_2d(expr: IntoExprColumn, ref_point: tuple[float, float]) -> pl.Expr:
50 |     return register_plugin_function(
51 |         args=[expr],
52 |         plugin_path=Path(__file__).parent,
53 |         function_name="midpoint_2d",
54 |         is_elementwise=True,
55 |         kwargs={"ref_point": ref_point},
56 |     )
57 | ```
58 |
59 | As you can see, we included an additional kwarg, `ref_point`, which we annotated with the type `tuple[float, float]`.
60 | In our Rust code we won't receive it as a tuple, though: it'll arrive as a fixed-size array.
61 | This isn't crucial for this example, so just accept it for now.
62 | As you saw in the __arguments__ chapter, we take kwargs by defining a struct for them:
63 |
64 | ```rust
65 | #[derive(Deserialize)]
66 | struct MidPoint2DKwargs {
67 |     ref_point: [f64; 2],
68 | }
69 | ```
70 |
71 | And we can finally move to the actual plugin code:
72 |
73 | ```rust
74 | // We need this to ensure the output is of dtype array.
75 | // Unfortunately, polars plugins do not support something similar to:
76 | // #[polars_expr(output_type=Array)]
77 | pub fn point_2d_output(_: &[Field]) -> PolarsResult<Field> {
78 |     Ok(Field::new(
79 |         PlSmallStr::from_static("point_2d"),
80 |         DataType::Array(Box::new(DataType::Float64), 2),
81 |     ))
82 | }
83 |
84 | #[polars_expr(output_type_func=point_2d_output)]
85 | fn midpoint_2d(inputs: &[Series], kwargs: MidPoint2DKwargs) -> PolarsResult<Series> {
86 |     let ca: &ArrayChunked = inputs[0].array()?;
87 |     let ref_point = kwargs.ref_point;
88 |
89 |     let out: ArrayChunked = unsafe {
90 |         ca.try_apply_amortized_same_type(|row| {
91 |             let s = row.as_ref();
92 |             let ca = s.f64()?;
93 |             let out_inner: Float64Chunked = ca
94 |                 .iter()
95 |                 .enumerate()
96 |                 .map(|(idx, opt_val)| {
97 |                     opt_val.map(|val| {
98 |                         (val + ref_point[idx]) / 2.0f64
99 |                     })
100 |                 }).collect_trusted();
101 |             Ok(out_inner.into_series())
102 |         })}?;
103 |
104 |     Ok(out.into_series())
105 | }
106 | ```
107 |
108 | Uh-oh, unsafe, we're doomed!
109 |
110 | Hold on a moment - it's true that we need unsafe here, but let's not freak out.
111 | If we read the docs of `try_apply_amortized_same_type`, we see the following:
112 |
113 | > ```rust
114 | > /// Try apply a closure `F` to each array.
115 | > ///
116 | > /// # Safety
117 | > /// Return series of `F` must has the same dtype and number of elements as input if it is Ok.
118 | > pub unsafe fn try_apply_amortized_same_type<F>(&self, mut f: F) -> PolarsResult<Self>
119 | > where
120 | >     F: FnMut(AmortSeries) -> PolarsResult<Series>,
121 | > ```
122 |
123 |
124 | In this example, we can uphold that contract - we know we're returning a Series with the same number of elements and same dtype as the input!
125 |
126 | Still, the code looks a bit scary, doesn't it? So let's break it down:
127 |
128 | ```rust
129 | let out: ArrayChunked = unsafe {
130 |
131 |     // This is similar to apply_values, but it's amortized and made specifically
132 |     // for arrays.
133 |     ca.try_apply_amortized_same_type(|row| {
134 |         let s = row.as_ref();
135 |         // `s` is a Series which contains two elements.
136 |         // We unpack it similarly to the way we've been unpacking Series in the
137 |         // previous chapters:
138 |         //
139 |         // Previously we've been doing this to unpack a column we had behind a
140 |         // Series - this time, inside this closure, the Series contains the two
141 |         // elements composing the "row" (x and y):
142 |         let ca = s.f64()?;
143 |
144 |         // There are many ways to extract the x and y coordinates from ca.
145 |         // Here, we remain idiomatic and consistent with what we've been doing
146 |         // in the past - iterate, enumerate and map:
147 |         let out_inner: Float64Chunked = ca
148 |             .iter()
149 |             .enumerate()
150 |             .map(|(idx, opt_val)| {
151 |
152 |                 // We only use map here because opt_val is an `Option<f64>`
153 |                 opt_val.map(|val| {
154 |
155 |                     // Here's where the simple logic of calculating a
156 |                     // midpoint happens. We take the coordinate (`val`) at
157 |                     // index `idx`, add it to the `idx`-th entry of our
158 |                     // reference point (which is a coordinate of our point),
159 |                     // then divide it by two, since we're dealing with 2d
160 |                     // points only.
161 |                     (val + ref_point[idx]) / 2.0f64
162 |                 })
163 |                 // Our map already returns Some or None, so we don't have to
164 |                 // worry about wrapping the result in, e.g., Some()
165 |             }).collect_trusted();
166 |
167 |         // At last, we convert out_inner (which is a Float64Chunked) back to a
168 |         // Series
169 |         Ok(out_inner.into_series())
170 |     })}?;
171 |
172 | // And finally, we convert our ArrayChunked into a Series, ready to ship to
173 | // Python-land:
174 | Ok(out.into_series())
175 | ```
176 |
177 | That's it. What does the result look like?
178 | In `run.py`, we have:
179 |
180 | ```python
181 | import polars as pl
182 | from minimal_plugin import midpoint_2d
183 |
184 | points = pl.Series(
185 | "points",
186 | [
187 | [6.63, 8.35],
188 | [7.19, 4.85],
189 | [2.1, 4.21],
190 | [3.4, 6.13],
191 | [2.48, 9.26],
192 | [9.41, 7.26],
193 | [7.45, 8.85],
194 | [6.58, 5.22],
195 | [6.05, 5.77],
196 | [8.57, 4.16],
197 | [3.22, 4.98],
198 | [6.62, 6.62],
199 | [9.36, 7.44],
200 | [8.34, 3.43],
201 | [4.47, 7.61],
202 | [4.34, 5.05],
203 | [5.0, 5.05],
204 | [5.0, 5.0],
205 | [2.07, 7.8],
206 | [9.45, 9.6],
207 | [3.1, 3.26],
208 | [4.37, 5.72],
209 | ],
210 | dtype=pl.Array(pl.Float64, 2),
211 | )
212 | df = pl.DataFrame(points)
213 |
214 | # Now we call our plugin:
215 | result = df.with_columns(midpoints=midpoint_2d("points", ref_point=(5.0, 5.0)))
216 | print(result)
217 | ```
218 |
219 | Let's compile and run it:
220 | ```shell
221 | maturin develop
222 |
223 | python run.py
224 | ```
225 |
226 | 🥁:
227 | ```
228 | shape: (22, 2)
229 | ┌───────────────┬────────────────┐
230 | │ points        ┆ midpoints      │
231 | │ ---           ┆ ---            │
232 | │ array[f64, 2] ┆ array[f64, 2]  │
233 | ╞═══════════════╪════════════════╡
234 | │ [6.63, 8.35]  ┆ [5.815, 6.675] │
235 | │ [7.19, 4.85]  ┆ [6.095, 4.925] │
236 | │ [2.1, 4.21]   ┆ [3.55, 4.605]  │
237 | │ [3.4, 6.13]   ┆ [4.2, 5.565]   │
238 | │ [2.48, 9.26]  ┆ [3.74, 7.13]   │
239 | │ …             ┆ …              │
240 | │ [5.0, 5.0]    ┆ [5.0, 5.0]     │
241 | │ [2.07, 7.8]   ┆ [3.535, 6.4]   │
242 | │ [9.45, 9.6]   ┆ [7.225, 7.3]   │
243 | │ [3.1, 3.26]   ┆ [4.05, 4.13]   │
244 | │ [4.37, 5.72]  ┆ [4.685, 5.36]  │
245 | └───────────────┴────────────────┘
246 | ```
247 |
248 |
249 | !!!note
250 |     Notice how the dtype remains the same.
251 |     As an exercise, try to achieve the same in pure-Python (without Rust plugins)
252 |     without explicitly casting the type of the Series.
253 |
254 | Hurray, we did it!
255 | And why exactly go through all this trouble instead of just doing the same thing in pure Python?
256 | For performance of course!
257 |
258 | _Spoilers ahead if you haven't tried the exercise from the note above_
259 |
260 | With the following implementation in Python, we can take some measurements:
261 |
262 | ```python
263 | ref_point = (5.0, 5.0)
264 |
265 | def using_plugin(df=df, ref_point=ref_point):
266 |     result = df.with_columns(midpoints=midpoint_2d("points", ref_point=ref_point))
267 |     return result
268 |
269 | def midpoint(points: pl.Series) -> pl.Series:
270 |     result = []
271 |     for point in points:
272 |         result.append([(point[0] + ref_point[0]) / 2, (point[1] + ref_point[1]) / 2])
273 |     return pl.Series(result, dtype=pl.Array(pl.Float64, 2))
274 |
275 | def using_python(df=df, ref_point=ref_point):
276 |     result = (
277 |         df.with_columns(
278 |             midpoints=pl.col('points').map_batches(midpoint, return_dtype=pl.Array(pl.Float64, 2))
279 |         )
280 |     )
281 |     return result
282 | ```
283 |
284 | For the sake of brevity, some extra methods to generate and parse an input file were left out of the code above, as well as the `timeit` bits.
285 | By measuring both versions with 1,000,000 points a few times and taking the average, we got the following results:
286 |
287 | ```
288 | Using plugin:
289 | min: 0.5307095803339811
290 | max: 0.5741689523274545
291 | mean +/- stderr: 0.5524565599986263 +/- 0.0064489015434971925
292 |
293 | Using python:
294 | min: 6.682447870339577
295 | max: 6.99253460233255
296 | mean +/- stderr: 6.808615755191394 +/- 0.03757884107880601
297 | ```
298 |
299 | A speedup of __12x__, that's a __big win__!
300 |
301 | !!!note
302 |     When benchmarking Rust code, remember to use `maturin develop --release`, otherwise the timings will be much slower!
303 |
--------------------------------------------------------------------------------
/docs/assets/array00.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/array00.png
--------------------------------------------------------------------------------
/docs/assets/array01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/array01.png
--------------------------------------------------------------------------------
/docs/assets/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/image.png
--------------------------------------------------------------------------------
/docs/assets/life_toad.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/life_toad.gif
--------------------------------------------------------------------------------
/docs/assets/life_toad_df.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/life_toad_df.gif
--------------------------------------------------------------------------------
/docs/assets/list_chunked_memory_layout.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/list_chunked_memory_layout.png
--------------------------------------------------------------------------------
/docs/assets/struct_array_memory_layout.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/struct_array_memory_layout.png
--------------------------------------------------------------------------------
/docs/assets/struct_example_Point2D.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/struct_example_Point2D.png
--------------------------------------------------------------------------------
/docs/assets/structchunked_fields_memory_layout.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/structchunked_fields_memory_layout.png
--------------------------------------------------------------------------------
/docs/assets/timings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarcoGorelli/polars-plugins-tutorial/c132a11de495fe1b88eac767db14bb8337a88df2/docs/assets/timings.png
--------------------------------------------------------------------------------
/docs/branch_mispredictions.md:
--------------------------------------------------------------------------------
1 | # 7. Branch mispredictions
2 |
3 | Time to go back to the past. In Section 2, I told you that the
4 | implementation we had of `abs_i64` wasn't the most efficient one
5 | you could possibly write. Time to see how to improve it!
6 |
7 | Which algorithm do you think would win?
8 |
9 | 1. for each row:
10 |     - check if it's null or not
11 |     - if it's not null, calculate its absolute value
12 | 2. for each row:
13 |     - calculate its absolute value, even if we don't need it
14 |       because it's a null row
15 |
16 | If you've not come across the concept of branch mispredictions
17 | before, then the answer may surprise you, because the second
18 | one is faster here. This is because `.abs` is a very fast
19 | operation, and wasting time checking whether each element is null
20 | or not actually slows us down!
21 |
22 | Here's how you can make `abs_i64` faster:
23 |
24 | ```Rust
25 | #[polars_expr(output_type=Int64)]
26 | fn abs_i64(inputs: &[Series]) -> PolarsResult<Series> {
27 |     let s = &inputs[0];
28 |     let ca = s.i64()?;
29 |     let out = ca.apply_values(|x| x.abs());
30 |     Ok(out.into_series())
31 | }
32 | ```
33 |
34 | For operations more complex than `.abs`, it may be that computing the operation
35 | for only the non-null values is cheaper. In general, you should
36 | measure, not guess.
37 | If you're just starting out with plugins and only need to beat
38 | `.map_elements`, then either of these solutions will blow it out
39 | of the water.
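
If you'd like to build intuition outside of Polars, here's a minimal, self-contained sketch (plain Rust, assuming an Arrow-style layout where the values and the validity mask live in separate buffers; the exact timings will vary by machine and optimisation level):

```Rust
use std::time::Instant;

fn main() {
    // Arrow-style column: a values buffer plus a separate validity mask.
    let n: i64 = 10_000_000;
    let values: Vec<i64> = (0..n).map(|i| 2 - (i % 5)).collect();
    let validity: Vec<bool> = (0..n).map(|i| i % 7 != 0).collect();

    // 1. Branchy: consult the mask for every single element.
    let t = Instant::now();
    let branchy: Vec<i64> = values
        .iter()
        .zip(&validity)
        .map(|(v, &ok)| if ok { v.abs() } else { 0 })
        .collect();
    println!("branchy:     {:?}", t.elapsed());

    // 2. Branch-free: take `abs` of everything; nulls stay masked out anyway.
    let t = Instant::now();
    let branchless: Vec<i64> = values.iter().map(|v| v.abs()).collect();
    println!("branch-free: {:?}", t.elapsed());

    assert_eq!(branchy.len(), branchless.len());
}
```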
40 |
41 | ![](assets/timings.png)
42 |
43 | ## Practice!
44 |
45 | Can you go back and make a faster version of `sum_i64`?
46 |
--------------------------------------------------------------------------------
/docs/cum_sum.md:
--------------------------------------------------------------------------------
1 | # 4. Yes we SCAN
2 |
3 | The operations we've seen so far have all been elementwise, e.g.:
4 |
5 | - for each row, we calculated the absolute value
6 | - for each row, we summed the respective values in two columns
7 |
8 | Let's do something (completely) different - instead of working with
9 | each row in isolation, we'll calculate a quantity which depends on the
10 | rows which precede it.
11 |
12 | We're going to implement `cum_sum`.
13 |
14 | ## Python side
15 |
16 | Add this to `minimal_plugin/__init__.py`:
17 | ```python
18 | def cum_sum(expr: IntoExprColumn) -> pl.Expr:
19 |     return register_plugin_function(
20 |         args=[expr],
21 |         plugin_path=LIB,
22 |         function_name="cum_sum",
23 |         is_elementwise=False,
24 |     )
25 | ```
26 | Note how, unlike in previous examples, we set `is_elementwise=False`.
27 | You'll see why this is so important at the end of this page.
28 |
29 | ## Rust
30 |
31 | Time to learn a new Rust function: `scan`.
32 | If you're not familiar with it, please take a little break from this tutorial
33 | and [read the scan docs](https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.scan).
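
If you'd like to see `scan` in isolation first, here's a tiny, self-contained example (plain Rust, no Polars) of the exact pattern we're about to use - a running sum which propagates missing values:

```Rust
fn main() {
    let xs = vec![Some(1_i64), Some(5), None, Some(2)];
    let cum: Vec<Option<i64>> = xs
        .into_iter()
        .scan(0_i64, |state, x| match x {
            // Add to the running total and emit it...
            Some(v) => {
                *state += v;
                Some(Some(*state))
            }
            // ...or leave the total untouched and emit a "null".
            None => Some(None),
        })
        .collect();
    assert_eq!(cum, vec![Some(1), Some(6), None, Some(8)]);
}
```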
34 |
35 | Welcome back! Let's use our newfound scan-superpowers to implement `cum_sum`. Here's what goes into `src/expressions.rs`:
36 | ```Rust
37 | #[polars_expr(output_type_func=same_output_type)]
38 | fn cum_sum(inputs: &[Series]) -> PolarsResult<Series> {
39 |     let s = &inputs[0];
40 |     let ca: &Int64Chunked = s.i64()?;
41 |     let out: Int64Chunked = ca
42 |         .iter()
43 |         .scan(0_i64, |state: &mut i64, x: Option<i64>| {
44 |             match x {
45 |                 Some(x) => {
46 |                     *state += x;
47 |                     Some(Some(*state))
48 |                 },
49 |                 None => Some(None),
50 |             }
51 |         })
52 |         .collect_trusted();
53 |     Ok(out.into_series())
54 | }
55 | ```
56 | Make sure to also add
57 | ```Rust
58 | use pyo3_polars::export::polars_core::utils::CustomIterTools;
59 | ```
60 | to the top of the file.
61 |
62 | The `cum_sum` definition may look complex, but it's not too bad once we
63 | break it down:
64 |
65 | - we hold the running sum in `state`
66 | - we iterate over rows, initialising `state` to be `0`
67 | - if the current row is `Some`, then add the current row's value to `state` and emit the current value of `state`
68 | - if the current row is `None`, then don't modify `state` and emit `None`
69 |
70 | Note how we use `collect_trusted` at the end, rather than `collect`.
71 | `collect` would work as well, but if we know the length of the output
72 | (and we do in this case, `cum_sum` doesn't change the column's length)
73 | then we can safely use `collect_trusted` and save some precious time.
74 |
75 | Let's compile with `maturin develop` (or `maturin develop --release`
76 | if you're benchmarking), change the last line of `run.py` to
77 | ```python
78 | print(df.with_columns(a_cum_sum=mp.cum_sum('a')))
79 | ```
80 | and then run `python run.py`:
81 |
82 | ```
83 | shape: (3, 3)
84 | ┌─────┬──────┬───────────┐
85 | │ a   ┆ b    ┆ a_cum_sum │
86 | │ --- ┆ ---  ┆ ---       │
87 | │ i64 ┆ i64  ┆ i64       │
88 | ╞═════╪══════╪═══════════╡
89 | │ 1   ┆ 3    ┆ 1         │
90 | │ 5   ┆ null ┆ 6         │
91 | │ 2   ┆ -1   ┆ 8         │
92 | └─────┴──────┴───────────┘
93 | ```
94 |
95 | ## Elementwise, my dear Watson
96 |
97 | Why was it so important to set `is_elementwise` correctly? Let's see
98 | with an example.
99 |
100 | Put the following in `run.py`:
101 | ```python
102 | import polars as pl
103 | import minimal_plugin as mp
104 |
105 | df = pl.DataFrame({
106 |     'a': [1, 2, 3, 4, None, 5],
107 |     'b': [1, 1, 1, 2, 2, 2],
108 | })
109 | print(df.with_columns(a_cum_sum=mp.cum_sum('a')))
110 | ```
111 |
112 | Then, run `python run.py`.
113 |
114 | Finally, go to `minimal_plugin/__init__.py` and change `is_elementwise`
115 | from `False` to `True`, and run `python run.py` again.
116 |
117 | In both cases, you should see the following output:
118 | ```
119 | shape: (6, 3)
120 | ┌──────┬─────┬───────────┐
121 | │ a    ┆ b   ┆ a_cum_sum │
122 | │ ---  ┆ --- ┆ ---       │
123 | │ i64  ┆ i64 ┆ i64       │
124 | ╞══════╪═════╪═══════════╡
125 | │ 1    ┆ 1   ┆ 1         │
126 | │ 2    ┆ 1   ┆ 3         │
127 | │ 3    ┆ 1   ┆ 6         │
128 | │ 4    ┆ 2   ┆ 10        │
129 | │ null ┆ 2   ┆ null      │
130 | │ 5    ┆ 2   ┆ 15        │
131 | └──────┴─────┴───────────┘
132 | ```
133 | which looks correct. So, what's the deal with `is_elementwise`?
134 |
135 | The deal is that we need it in order for window functions / `group_by`s
136 | to be correct. Change the last line of `run.py` to
137 | ```python
138 | print(df.with_columns(a_cum_sum=mp.cum_sum('a').over('b')))
139 | ```
140 |
141 | Now, we get:
142 |
143 | - with `is_elementwise=True`:
144 |
145 | ```
146 | shape: (6, 3)
147 | ┌──────┬─────┬───────────┐
148 | │ a    ┆ b   ┆ a_cum_sum │
149 | │ ---  ┆ --- ┆ ---       │
150 | │ i64  ┆ i64 ┆ i64       │
151 | ╞══════╪═════╪═══════════╡
152 | │ 1    ┆ 1   ┆ 1         │
153 | │ 2    ┆ 1   ┆ 3         │
154 | │ 3    ┆ 1   ┆ 6         │
155 | │ 4    ┆ 2   ┆ 10        │
156 | │ null ┆ 2   ┆ null      │
157 | │ 5    ┆ 2   ┆ 15        │
158 | └──────┴─────┴───────────┘
159 | ```
160 |
161 | - with `is_elementwise=False`:
162 |
163 | ```
164 | shape: (6, 3)
165 | ┌──────┬─────┬───────────┐
166 | │ a    ┆ b   ┆ a_cum_sum │
167 | │ ---  ┆ --- ┆ ---       │
168 | │ i64  ┆ i64 ┆ i64       │
169 | ╞══════╪═════╪═══════════╡
170 | │ 1    ┆ 1   ┆ 1         │
171 | │ 2    ┆ 1   ┆ 3         │
172 | │ 3    ┆ 1   ┆ 6         │
173 | │ 4    ┆ 2   ┆ 4         │
174 | │ null ┆ 2   ┆ null      │
175 | │ 5    ┆ 2   ┆ 9         │
176 | └──────┴─────┴───────────┘
177 | ```
178 |
179 | Only `is_elementwise=False` actually respected the window! This is why
180 | it's important to set `is_elementwise` correctly.
181 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # How you (yes, you!) can write a Polars Plugin
2 |
3 | - ✅ Unlock super-high performance
4 | - ✅ Have a tonne of fun
5 | - ✅ Impress everybody with your superpowers
6 |
7 | ![](assets/image.png){: style="width:400px"}
8 |
9 | ## Why?
10 |
11 | Polars is an incredible and groundbreaking Dataframe library, and its expressions API
12 | is simply amazing. Sometimes, however, you need to express really custom business logic
13 | which just isn't in scope for the Polars API. In that situation, people tend to use
14 | `map_elements`, which lets you express anything but also kills most of Polars' benefits.
15 |
16 | But it doesn't have to be that way - with just basic Rust knowledge and this tutorial,
17 | I postulate that you'll be able to address at least 99% of inefficient `map_elements` tasks!
18 |
19 | ## What will you learn
20 |
21 | - Writing simple single-column elementwise expressions
22 | - Writing complex multi-column non-elementwise expressions which use third-party Rust packages
23 | - How to share your plugin superpowers with others
24 |
25 | ## What are people saying?
26 |
27 | **Nelson Griffiths**, Engineering & ML Lead at Double River Investments | Core Maintainer Functime
28 |
29 | > this was an awesome intro. I am no rust expert, though I have written a few plugins. And I learned quite a bit from this! Having my team read it now as well. Thanks for putting this together. I think more content like this for people who don’t know how to write optimal polars code on the rust side will be really useful for people like me who want to work on plugins!
30 |
31 | **Barak David**, Software Engineer
32 |
33 | > Amazing tutorial! I just created nltk plugin, and experienced X50 speedup!
34 |
--------------------------------------------------------------------------------
/docs/life_pt1.md:
--------------------------------------------------------------------------------
1 |
2 | # Extra.1 Well...
3 |
4 | > "No." - _Doom Slayer_
5 |
6 |
7 |
8 | !!!note
9 | This section is completely optional, and is provided for a bit
10 | of nerdy fun. It is by no means essential, feel free to skip
11 | it if it doesn't interest you!
12 |
13 | Well, someone can, probably. But Doom in a dataframe would be kinda hard to play, so let's try something simpler.
14 | [Conway's Game of Life](https://en.wikipedia.org/wiki/Conway%27s_Game_of_Life) is a famous cellular automaton that we could perhaps implement with a plugin.
15 | For science, of course.
16 |
17 | 
18 |
19 | Jokes aside, Life allows us to show how a plugin can, for each element, access its neighbours in both adjacent rows and adjacent columns.
20 | With a little bit of extra Python, we can display things in an almost pretty manner.
21 |
22 | !!!note
23 | For this tutorial, we'll assume you created a new plugin from the
24 | cookiecutter template and named it `game_of_life`
25 | (these steps aren't shown here, since they were already covered at the
26 | very beginning of this series).
27 |
28 | In this section we'll cover the developer side of the plugin (both Python and Rust).
29 | In the next section we'll show how a user can import and use what we developed here.
30 |
31 | ## The Python side
32 |
33 | Let's take a look at what we'll implement first, in `game_of_life/__init__.py`:
34 |
35 | ```python
36 | import fileinput
37 | from collections import OrderedDict
38 | from itertools import tee, islice
39 | from os import PathLike
40 | from pathlib import Path
41 | from typing import Iterable, Any
42 |
43 | import polars as pl
44 | from polars._typing import IntoExprColumn
45 | from polars.plugins import register_plugin_function
46 |
47 |
48 | # Parse a board from a file or stdin
49 | def parse_board(ifile: str | ...) -> list[list[int]]: ...
50 |
51 | # Transpose a list of lists
52 | def _transpose(board: list[list[int]]) -> list[list[int]]: ...
53 |
54 | # Creates a DataFrame from a list of lists
55 | def board_to_df(board: list[list[int]]) -> pl.DataFrame: ...
56 |
57 | # Helper function to help us deal with corner cases
58 | def _nwise_wrapping(iterable: Iterable[Any], n: int): ...
59 |
60 | # Advance the simulation by n steps
61 | def step(df: pl.DataFrame, n: int = 1): ...
62 |
63 | # Register our plugin
64 | def life_step(left: IntoExprColumn, mid: IntoExprColumn, right: IntoExprColumn) -> pl.Expr: ...
65 | ```
66 |
67 | Starting with the function to parse a board from a file or stdin:
68 |
69 | ```python
70 | def parse_board(
71 | ifile: (
72 | str
73 | | bytes
74 | | PathLike[str]
75 | | PathLike[bytes]
76 | | Iterable[str | bytes | PathLike[str] | PathLike[bytes]]
77 | ),
78 | ) -> list[list[int]]:
79 | """
80 | Converts a board in a file containing only 0s and 1s, e.g.::
81 |
82 | 0010
83 | 0100
84 |
85 | into:
86 |     [[0, 0, 1, 0], [0, 1, 0, 0]]
87 | """
88 | return [
89 | [c for ch in ln if (c := int(ch)) in [0, 1]]
90 | for line in fileinput.input(ifile)
91 | if len(ln := line.strip()) > 0
92 | ]
93 | ```
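94 | 
95 | For example, given a hypothetical `board.txt` containing the two rows from the docstring above:
96 | 
97 | ```python
98 | board = parse_board('board.txt')
99 | # [[0, 0, 1, 0], [0, 1, 0, 0]]
100 | ```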
94 |
95 | Next, we have `_transpose`. Why do we need it? Because when we construct a DataFrame from a dict comprehension, each inner list becomes a _column_ - which is counter-intuitive if we want the DataFrame to visually match the input file.
96 | If we start with an input board like:
97 |
98 | ```
99 | 0000
100 | 1111
101 | ```
102 |
103 | without transpose, we'd end up with:
104 |
105 | ```
106 | >>> import polars as pl
107 | >>> board = [[0,0,0,0],[1,1,1,1]]
108 | >>> pl.DataFrame({f"c{idx}": row for idx, row in enumerate(board)})
109 | shape: (4, 2)
110 | ┌─────┬─────┐
111 | │ c0 ┆ c1 │
112 | │ --- ┆ --- │
113 | │ i64 ┆ i64 │
114 | ╞═════╪═════╡
115 | │ 0 ┆ 1 │
116 | │ 0 ┆ 1 │
117 | │ 0 ┆ 1 │
118 | │ 0 ┆ 1 │
119 | └─────┴─────┘
120 | ```
121 |
122 | Not what we expected _visually_, so we transpose the initial board to have the resulting dataframe match it.
123 |
124 | ```python
125 | def _transpose(board: list[list[int]]) -> list[list[int]]:
126 | return [[row[idx] for row in board] for idx in range(len(board[0]))]
127 | ```
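128 | 
129 | Here's what it does to the board from the example above:
130 | 
131 | ```python
132 | _transpose([[0, 0, 0, 0], [1, 1, 1, 1]])
133 | # [[0, 1], [0, 1], [0, 1], [0, 1]]
134 | ```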
128 |
129 | Next one is `board_to_df`, which calls `_transpose` and constructs the DataFrame in a similar way to the example above.
130 | The padding detail just keeps all the column names the same length, feel free to ignore it:
131 |
132 | ```python
133 | def board_to_df(board: list[list[int]]) -> pl.DataFrame:
134 | """
135 | Converts a list of lists of integers (0s and 1s) to a Polars DataFrame.
136 | The inner lists must have the same length.
137 | """
138 |
139 | # This is done because each row will become a column - the user likely
140 | # expects a dataframe that *visually* matches the input file
141 | board = _transpose(board)
142 |
143 | padding_len = len(str(len(board) - 1))
144 | board_t_dict = {f"{idx:0{padding_len}}": row for idx, row in enumerate(board)}
145 | return pl.DataFrame(
146 | board_t_dict,
147 | )
148 | ```
149 |
150 | Let's skip `_nwise_wrapping` and `step` for now and jump straight to the last function - we'll return to the two we skipped soon:
151 |
152 | !!!note
153 | Don't forget to read the comments!
154 |
155 | ```python
156 | def life_step(left: IntoExprColumn, mid: IntoExprColumn, right: IntoExprColumn) -> pl.Expr:
157 | """
158 | This is the function that registers the polars plugin. To use it directly,
159 | data must be in the correct format. An interesting way to do so is to use
160 | the same column names as the original data frame, so the resulting df will
161 | have the same shape. See how this is done in the `step(df, n)` function.
162 | """
163 | return register_plugin_function(
164 | args=[left, mid, right],
165 | plugin_path=LIB,
166 | function_name="life_step",
167 | is_elementwise=False,
168 | )
169 | ```
170 |
171 | Ok, plugin registered. How do we use it? We create columns in `step` with `with_columns`.
172 | And we do so in a way that the new columns have the exact same names as the previously existing ones, so they're overwritten.
173 |
174 | But wait, there's something we didn't talk about.
175 | What happens at the border of the board (both vertically and horizontally)?
176 | Do we stop the simulation from propagating there, do we wrap around, or something else?
177 | Many implementations stop the simulation at the border, so let's do it differently, let's wrap around!
178 |
179 | Wait, why are we talking about this here - isn't this a concern to be solved by our plugin in Rust?
180 | Yes, but Python-land is where we name our columns.
181 | So in order to have that nice overriding behavior, we need to address it here.
182 | This is also a hint at what the mysterious `_nwise_wrapping` function does:
183 |
184 | ```python
185 | def _nwise_wrapping(iterable: Iterable[Any], n: int):
186 | """
187 | Returns overlapping n-tuples from an iterable, wrapping around. This means
188 | the result will have the same length as `iterable`. It also means the first
189 | element(s) will include elements from the end of the iterable, and
190 | likewise, the last element(s) will include elements from the start, e.g.::
191 |
192 | fn('ABCDE', 3) -> 'EAB', 'ABC', 'BCD', 'CDE', 'DEA'
193 | """
194 | elements = list(iterable)
195 | to_be_wrapped = elements[-(n - 2) :] + elements + elements[: n - 2]
196 | iterators = tee(to_be_wrapped, n)
197 | return [
198 | list(z) for z in zip(*(islice(it, i, None) for i, it in enumerate(iterators)))
199 | ]
200 | ```
201 |
202 | The implementation might look a bit complicated, but the docstring should clarify its goal.
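203 | 
204 | For example, with the 3-wide windows we'll actually use:
205 | 
206 | ```python
207 | print(_nwise_wrapping(['00', '01', '02', '03'], 3))
208 | # [['03', '00', '01'], ['00', '01', '02'], ['01', '02', '03'], ['02', '03', '00']]
209 | ```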
203 |
204 | Now we're only missing `step`, which takes a DataFrame already in the expected format and returns another DataFrame with our plugin applied `n` times to it:
205 |
206 | ```python
207 | def step(df: pl.DataFrame, n: int = 1):
208 | """
209 | Takes a df and returns df.with_columns(...) corresponding to `n` advanced
210 | steps in the simulation
211 | """
212 | padding_len = len(str(df.width - 1))
213 |
214 | # colnums: [['{n-1}', '00', '01'], ['00', '01', '02'], ['01', '02', '03'], ... ]
215 | colnums = _nwise_wrapping([f"{idx:0{padding_len}}" for idx in range(df.width)], 3)
216 |
217 | # colnames: ['00', '01', '02', '03', ... , '{n-1}']
218 | colnames = [cols[1] for cols in colnums]
219 |
220 |     # colvalues: [<Expr>, <Expr>, ... ]
221 | colvalues = [life_step(*tuple(cols)) for cols in colnums]
222 |
223 | for _ in range(n):
224 | df = df.with_columns(**OrderedDict(zip(colnames, colvalues)))
225 | return df
226 | ```
227 |
228 | We're done with the Python side of things.
229 | And if you're wondering: "what plugin did we actually register with `life_step`?" -
230 | you're totally right to be confused, we didn't touch Rust yet!
231 | Why did we leave it for last?
232 | Because surprisingly, it's much simpler than the Python side, and much shorter too.
233 |
234 | ## Let's get rusty
235 |
236 | What do we need to do?
237 | For each element, we need to look at the sum of its 8 neighbours, then apply the rule to decide whether the element will be dead or alive in the next iteration.
238 | Here's what our entire `src/expressions.rs` looks like:
239 |
240 | ```rust
241 | #![allow(clippy::unused_unit)]
242 | use polars::export::arrow::legacy::utils::CustomIterTools;
243 | use polars::prelude::*;
244 | use pyo3_polars::derive::polars_expr;
245 |
246 | #[polars_expr(output_type=Int64)]
247 | fn life_step(inputs: &[Series]) -> PolarsResult<Series> {
248 | let (ca_lf, ca_curr, ca_rt) = (inputs[0].i64()?, inputs[1].i64()?, inputs[2].i64()?);
249 |
250 | /*
251 | We're "counting" on the user not to append or modify the DataFrame created
252 | from the board file.
253 |
254 | In general, this might sound insane, but for our Game of Life, this is not
255 | so unreasonable.
256 | */
257 | let lf = ca_lf
258 | .cont_slice()
259 | .expect("Expected input to be contiguous (in a single chunk)");
260 | let mid = ca_curr
261 | .cont_slice()
262 | .expect("Expected input to be contiguous (in a single chunk)");
263 | let rt = ca_rt
264 | .cont_slice()
265 | .expect("Expected input to be contiguous (in a single chunk)");
266 |
267 | let len = lf.len();
268 |
269 | let out: Int64Chunked = mid
270 | .iter()
271 | .enumerate()
272 | .map(|(idx, val)| {
273 | // Neighbours above
274 | let prev_row = if 0 == idx {
275 | lf[len - 1] + mid[len - 1] + rt[len - 1]
276 | } else {
277 | lf[idx - 1] + mid[idx - 1] + rt[idx - 1]
278 | };
279 |
280 | // Curr row does not include cell in the middle,
281 | // a cell is not a neighbour of itself
282 | let curr_row = lf[idx] + rt[idx];
283 |
284 | // Neighbours below
285 | let next_row = if len - 1 == idx {
286 | lf[0] + mid[0] + rt[0]
287 | } else {
288 | lf[idx + 1] + mid[idx + 1] + rt[idx + 1]
289 | };
290 |
291 | // Life logic
292 | Some(match (val, prev_row + curr_row + next_row) {
293 | (1, 2) | (1, 3) => 1,
294 | (0, 3) => 1,
295 | _ => 0,
296 | })
297 | })
298 | .collect_trusted();
299 | Ok(out.into_series())
300 | }
301 | ```
302 |
303 | Awesome, now what? If we ignore tests, _as plugin developers_, we could say we're done.
304 | Nothing's happened yet, so how could we be done?
305 | In the next section we'll take a look at how the plugin _user_ would call the functions we made available.
306 |
--------------------------------------------------------------------------------
/docs/life_pt2.md:
--------------------------------------------------------------------------------
1 |
2 | # Extra.2 Plugin user
3 |
4 | In the last section we saw what the plugin developers made available for a plugin user.
5 | Now we put on the user's hat and demonstrate that _usage_.
6 | For this, we'll implement a CLI app that will parse a board file provided as an argument, then run a step of the simulation every `delay` seconds (also provided as an argument).
7 |
8 | > Tip: place the code from this section in a separate file, e.g., `run.py`.
9 |
10 | Just like what we did previously, let's look at an overview of what's to come:
11 |
12 | ```python
13 | import argparse
14 | import contextlib
15 | import io
16 | import sys
17 | from time import sleep
18 |
19 | from game_of_life import parse_board, board_to_df, step
20 | import polars as pl
21 |
22 |
23 | class Application:
24 |
25 | # Initialize the board
26 | def __init__(self): ...
27 |
28 | # Printing the application object prints the board
29 | def __str__(self) -> str: ...
30 |
31 |     # Run a step of the simulation every `delay` seconds, for at most `n` steps
32 | def start(self, n, delay, print_df): ...
33 | ```
34 |
35 | Notice how we're importing `parse_board`, `board_to_df` and `step` from our fully-developed plugin.
36 | This could've been installed with pip! Check the [publishing chapter](publishing.md) for more on this.
37 |
38 | So first things first: `__init__`.
39 | Here we use the stdlib `argparse` module to capture the command line arguments we mentioned above.
40 | Then, we call `board_to_df` with the result of `parse_board`, storing the resulting DataFrame in the `Application` object itself.
41 |
42 | ```python
43 | class Application:
44 |
45 | def __init__(self):
46 | self._args = argparse.Namespace()
47 | cli = argparse.ArgumentParser(
48 | prog="python -m game_of_life", description="Options"
49 | )
50 | cli.add_argument("-i", "--input", type=str, required=True)
51 | cli.add_argument("-d", "--delay", type=float, default=0.2)
52 | cli.add_argument("-n", "--num-steps", type=int, default=sys.maxsize)
53 |
54 | cli.parse_args(namespace=self._args)
55 |
56 | # [-i]
57 | self.ifile: str = self._args.input
58 |
59 | # [-d]
60 | self.delay: float = self._args.delay
61 |
62 | # [-n]
63 | self.steps: int = self._args.num_steps
64 |
65 | # Creates a pl.DataFrame from the provided file
66 | self.df = board_to_df(parse_board(self.ifile))
67 | ```
68 |
69 | Next, an optional but handy detail - we implement `__str__` for `Application` in a way that printing an `Application` object will actually print the DataFrame stored internally:
70 |
71 | ```python
72 | class Application:
73 |
74 | # ...
75 |
76 | def __str__(self) -> str:
77 | res = io.StringIO()
78 | with (
79 | pl.Config(tbl_rows=-1, tbl_cols=-1),
80 | contextlib.redirect_stdout(res),
81 | ):
82 | print(self.df)
83 | return res.getvalue()
84 | ```
85 |
86 | The `pl.Config` part just removes the default row and column limits when displaying a DataFrame - otherwise we'd see ellipses (`...`) instead of `1`s and `0`s.
87 |
88 | Finally, `start` is where we display the DataFrame and call `step` to advance the simulation, over and over:
89 |
90 | ```python
91 | class Application:
92 |
93 | # ...
94 |
95 | def start(
96 | self,
97 | n: int | None = None,
98 | delay: float | None = None,
99 | print_df: bool = True,
100 | ):
101 | if n is None:
102 | n = self.steps
103 |
104 | if delay is None:
105 | delay = self.delay
106 |
107 | if print_df:
108 | print(self)
109 |
110 | iteration_cnt = 0
111 | try:
112 | for _ in range(n):
113 | self.df = step(self.df)
114 | iteration_cnt += 1
115 | if print_df:
116 | # Clear screen
117 | print("\033[2J")
118 | print(self)
119 | sleep(delay)
120 |
121 | except KeyboardInterrupt:
122 | print(
123 | f"\nKeyboard Interrupt: ran for {iteration_cnt} iterations. Aborting..."
124 | )
125 | print(f"max_num_steps={self._args.num_steps}\ndelay={self._args.delay}")
126 | ```
127 |
128 | To run the program, we only need two more things - an entry point and an input file.
129 | Create a `toad.txt` in an `input` folder, containing:
130 |
131 | ```
132 | 00000000000
133 | 00000000000
134 | 00000000000
135 | 00001110000
136 | 00011100000
137 | 00000000000
138 | 00000000000
139 | 00000000000
140 | ```
141 |
142 | and add this entry point at the end of `run.py`:
143 |
144 | ```python
145 | if __name__ == "__main__":
146 | app = Application()
147 | app.start()
148 | ```
149 |
150 | Now we can see the results of our work, at last:
151 |
152 | ```shell
153 | # Compile the rust code
154 | maturin develop --release
155 |
156 | # Run the application
157 | python run.py -i input/toad.txt -d 0.3
158 | ```
159 |
160 | 
161 |
162 | __Victory!__
163 |
164 | ## Reference
165 |
166 | The entire code for this plugin, including the user's side, can be found on [GitHub](https://github.com/condekind/life_polars_plugin).
167 |
--------------------------------------------------------------------------------
/docs/lists.md:
--------------------------------------------------------------------------------
1 | # 9.0 Weighted-mean watchers
2 |
3 | According to [one YouTube talk](https://youtu.be/u5mIDz5ldmI?si=4AtnyyAwdVk33bYu),
4 | the `list` namespace is one of Polars' main selling points.
5 | If you're also a fan of it, this section will teach you how to extend it even further.
6 |
7 | ## Motivation
8 |
9 | Say you have
10 | ```python
11 | In [10]: df = pl.DataFrame({
12 | ...: 'values': [[1, 3, 2], [5, 7]],
13 | ...: 'weights': [[.5, .3, .2], [.1, .9]]
14 | ...: })
15 |
16 | In [11]: df
17 | Out[11]:
18 | shape: (2, 2)
19 | ┌───────────┬─────────────────┐
20 | │ values ┆ weights │
21 | │ --- ┆ --- │
22 | │ list[i64] ┆ list[f64] │
23 | ╞═══════════╪═════════════════╡
24 | │ [1, 3, 2] ┆ [0.5, 0.3, 0.2] │
25 | │ [5, 7] ┆ [0.1, 0.9] │
26 | └───────────┴─────────────────┘
27 | ```
28 |
29 | Can you calculate the mean of the values in `'values'`, weighted by the values in `'weights'`?
30 |
31 | So:
32 |
33 | - `.5*1 + .3*3 + .2*2 = 1.8`
34 | - `5*.1 + 7*.9 = 6.8`
35 |
36 | I don't know of an easy way to do this with Polars expressions. There probably is a way - but
37 | as you'll see here, it's not that hard to write a plugin, and it's probably faster too.
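38 | 
39 | As a baseline (and to double-check the arithmetic above), here's the kind of slow `map_elements` solution a plugin would replace:
40 | 
41 | ```python
42 | out = df.with_columns(
43 |     weighted_mean=pl.struct('values', 'weights').map_elements(
44 |         lambda row: sum(v * w for v, w in zip(row['values'], row['weights']))
45 |         / sum(row['weights']),
46 |         return_dtype=pl.Float64,
47 |     )
48 | )
49 | print(out)  # 1.8 and 6.8, matching the hand calculation
50 | ```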
38 |
39 | ## Weighted mean
40 |
41 | On the Python side, this'll be similar to `sum_i64`:
42 |
43 | ```python
44 | def weighted_mean(expr: IntoExprColumn, weights: IntoExprColumn) -> pl.Expr:
45 | return register_plugin_function(
46 | args=[expr, weights],
47 | plugin_path=LIB,
48 | function_name="weighted_mean",
49 | is_elementwise=True,
50 | )
51 | ```
52 |
53 | On the Rust side, we'll define a helper function which will let us work with
54 | pairs of list chunked arrays:
55 |
56 | ```rust
57 | fn binary_amortized_elementwise<'a, T, K, F>(
58 | lhs: &'a ListChunked,
59 | rhs: &'a ListChunked,
60 | mut f: F,
61 | ) -> ChunkedArray<T>
62 | where
63 |     T: PolarsDataType,
64 |     T::Array: ArrayFromIter<Option<K>>,
65 |     F: FnMut(&AmortSeries, &AmortSeries) -> Option<K> + Copy,
66 | {
67 | {
68 | let (lhs, rhs) = align_chunks_binary(lhs, rhs);
69 | lhs.amortized_iter()
70 | .zip(rhs.amortized_iter())
71 | .map(|(lhs, rhs)| match (lhs, rhs) {
72 | (Some(lhs), Some(rhs)) => f(&lhs, &rhs),
73 | _ => None,
74 | })
75 | .collect_ca(PlSmallStr::EMPTY)
76 | }
77 | }
78 | ```
79 |
80 | That's a bit of a mouthful, so let's try to make sense of it.
81 |
82 | - As we learned in [Prerequisites], Polars Series are backed by chunked arrays.
83 | `align_chunks_binary` just ensures that the chunks have the same lengths. It may need
84 | to rechunk under the hood for us;
85 | - `amortized_iter` returns an iterator of `AmortSeries`, each of which corresponds
86 | to a row from our input.
87 |
88 | We'll explain more about `AmortSeries` in a future iteration of this tutorial.
89 | For now, let's just look at how to use this utility:
90 |
91 | - we pass it two `ListChunked`s as inputs;
92 | - we also pass a function which takes two `AmortSeries` and produces a scalar
93 | value.
94 |
95 | ```rust
96 | #[polars_expr(output_type=Float64)]
97 | fn weighted_mean(inputs: &[Series]) -> PolarsResult<Series> {
98 |     let values = inputs[0].list()?;
99 |     let weights = inputs[1].list()?;
100 | polars_ensure!(
101 | values.dtype() == &DataType::List(Box::new(DataType::Int64)),
102 | ComputeError: "Expected `values` to be of type `List(Int64)`, got: {}", values.dtype()
103 | );
104 | polars_ensure!(
105 | weights.dtype() == &DataType::List(Box::new(DataType::Float64)),
106 | ComputeError: "Expected `weights` to be of type `List(Float64)`, got: {}", weights.dtype()
107 | );
108 |
109 | let out: Float64Chunked = binary_amortized_elementwise(
110 | values,
111 | weights,
112 |         |values_inner: &AmortSeries, weights_inner: &AmortSeries| -> Option<f64> {
113 | let values_inner = values_inner.as_ref().i64().unwrap();
114 | let weights_inner = weights_inner.as_ref().f64().unwrap();
115 | if values_inner.len() == 0 {
116 | // Mirror Polars, and return None for empty mean.
117 | return None
118 | }
119 | let mut numerator: f64 = 0.;
120 | let mut denominator: f64 = 0.;
121 | values_inner
122 | .iter()
123 | .zip(weights_inner.iter())
124 | .for_each(|(v, w)| {
125 | if let (Some(v), Some(w)) = (v, w) {
126 | numerator += v as f64 * w;
127 | denominator += w;
128 | }
129 | });
130 | Some(numerator / denominator)
131 | },
132 | );
133 | Ok(out.into_series())
134 | }
135 | ```
136 |
137 | If you just need to get a problem solved, this function works! But let's note its
138 | limitations:
139 |
140 | - it assumes that each inner element of `values` and `weights` has the same
141 | length - it would be better to raise an error if this assumption is not met
142 | - it only accepts `Int64` `values` and `Float64` `weights`
143 | (see section 2 for how you could make it more generic).
144 |
145 | To try it out, we compile with `maturin develop` (or `maturin develop --release` if you're
146 | benchmarking), and then we should be able to run `run.py`:
147 |
148 | ```python
149 | import polars as pl
150 | import minimal_plugin as mp
151 |
152 | df = pl.DataFrame({
153 | 'values': [[1, 3, 2], [5, 7]],
154 | 'weights': [[.5, .3, .2], [.1, .9]]
155 | })
156 | print(df.with_columns(weighted_mean = mp.weighted_mean('values', 'weights')))
157 | ```
158 | to see
159 | ```
160 | shape: (2, 3)
161 | ┌───────────┬─────────────────┬───────────────┐
162 | │ values ┆ weights ┆ weighted_mean │
163 | │ --- ┆ --- ┆ --- │
164 | │ list[i64] ┆ list[f64] ┆ f64 │
165 | ╞═══════════╪═════════════════╪═══════════════╡
166 | │ [1, 3, 2] ┆ [0.5, 0.3, 0.2] ┆ 1.8 │
167 | │ [5, 7] ┆ [0.1, 0.9] ┆ 6.8 │
168 | └───────────┴─────────────────┴───────────────┘
169 | ```
170 |
171 | [Prerequisites]: ../prerequisites/
172 |
173 | ## Gimme ~~chocolate~~ challenge
174 |
175 | Could you implement a weighted standard deviation calculator?
176 |
--------------------------------------------------------------------------------
/docs/lists_in_lists_out.md:
--------------------------------------------------------------------------------
1 | # 9.1 Lists in, lists out, lists all about
2 |
3 | Chapter 9.0 ([Weighted-mean watchers]) was fun. Let's do it all over again!
4 |
5 | Or rather, let's do another list operation. We're going to start with
6 | a dataframe such as:
7 |
8 | ```python
9 | shape: (4, 1)
10 | ┌──────────────┐
11 | │ dense │
12 | │ --- │
13 | │ list[i64] │
14 | ╞══════════════╡
15 | │ [0, 9] │
16 | │ [8, 6, 0, 9] │
17 | │ null │
18 | │ [3, 3] │
19 | └──────────────┘
20 | ```
21 |
22 | Before we start, however, let's take a look into how Polars stores lists in memory.
23 | As we saw, lists are backed by chunks.
24 | Inside each chunk, Polars stores all the lists ("rows") as one single list, while keeping track of where each row starts, and how many elements they have.
25 | This is consistent with Apache Arrow's columnar format.
26 | It looks something like this:
27 |
28 | 
29 |
30 | Back to where we were - we're going to find the indices of the non-zero elements in each row.
31 |
32 | !!! note
33 |
34 | You don't really need a plugin to do this, you can just do
35 |
36 | ```python
37 | df.with_columns(sparse_indices=pl.col('dense').list.eval(pl.arg_where(pl.element() != 0)))
38 | ```
39 |
40 | But `eval` won't cover every need you ever have ever, so...it's good
41 | to learn how to do this as a plugin so you can then customize it according to your needs.
42 |
43 | ---
44 |
45 | Polars has a helper function built-in for dealing with this: `apply_amortized`. We can use it to apply
46 | a function to each element of a List Series. In this case, we just want to find the indices of non-zero
47 | elements, so we'll do:
48 |
49 | ```rust
50 | fn list_idx_dtype(input_fields: &[Field]) -> PolarsResult<Field> {
51 |     let field = Field::new(input_fields[0].name.clone(), DataType::List(Box::new(IDX_DTYPE)));
52 |     Ok(field)
53 | }
54 |
55 | #[polars_expr(output_type_func=list_idx_dtype)]
56 | fn non_zero_indices(inputs: &[Series]) -> PolarsResult<Series> {
57 | let ca = inputs[0].list()?;
58 | polars_ensure!(
59 | ca.dtype() == &DataType::List(Box::new(DataType::Int64)),
60 | ComputeError: "Expected `List(Int64)`, got: {}", ca.dtype()
61 | );
62 |
63 | let out: ListChunked = ca.apply_amortized(|s| {
64 | let s: &Series = s.as_ref();
65 | let ca: &Int64Chunked = s.i64().unwrap();
66 | let out: IdxCa = ca
67 | .iter()
68 | .enumerate()
69 | .filter(|(_idx, opt_val)| opt_val != &Some(0))
70 | .map(|(idx, _opt_val)| Some(idx as IdxSize))
71 | .collect_ca(PlSmallStr::EMPTY);
72 | out.into_series()
73 | });
74 | Ok(out.into_series())
75 | }
76 | ```
77 | `apply_amortized` is a bit like the `apply_into_string_amortized` function we used in [How to STRING something together],
78 | in that it makes a big allocation upfront to amortize the allocation costs. Think of it as a list version
79 | of `apply_values`, where each element is itself a `Series`.
80 |
81 | Something new in this example is:
82 |
83 | - `IdxSize`
84 | - `IdxCa`
85 | - `IDX_DTYPE`
86 |
87 | `IdxSize` is either `u32` or `u64`, depending on your platform, and is what Polars generally uses
88 | for counting-related operations. `IdxCa` is the associated `ChunkedArray`, and `IDX_DTYPE` the associated
89 | Polars dtype.
90 |
91 | [Weighted-mean watchers]: ../lists/
92 | [How to STRING something together]: ../stringify/
93 |
94 | To finish this off, the Python side is bog-standard:
95 |
96 | ```python
97 | def non_zero_indices(expr: IntoExprColumn) -> pl.Expr:
98 | return register_plugin_function(
99 | args=[expr], plugin_path=LIB, function_name="non_zero_indices", is_elementwise=True
100 | )
101 | ```
102 |
103 | If we then make `run.py` with
104 |
105 | ```python
106 | import polars as pl
107 | import minimal_plugin as mp
108 |
109 | pl.Config().set_fmt_table_cell_list_len(10)
110 |
111 | df = pl.DataFrame({'dense': [[0, 9], [8, 6, 0, 9], None, [3, 3]]})
112 | print(df)
113 | print(df.with_columns(indices=mp.non_zero_indices('dense')))
114 | ```
115 | and compile with `maturin develop` (or `maturin develop --release` if you're benchmarking!)
116 | then we'll see
117 |
118 | ```
119 | shape: (4, 2)
120 | ┌──────────────┬───────────┐
121 | │ dense ┆ indices │
122 | │ --- ┆ --- │
123 | │ list[i64] ┆ list[u32] │
124 | ╞══════════════╪═══════════╡
125 | │ [0, 9] ┆ [1] │
126 | │ [8, 6, 0, 9] ┆ [0, 1, 3] │
127 | │ null ┆ null │
128 | │ [3, 3] ┆ [0, 1] │
129 | └──────────────┴───────────┘
130 | ```
131 |
132 | Yay, it worked! And not only that, but it's about 1.5x as fast as the `list.eval` solution
133 | noted above!
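134 | 
135 | If you'd like to check that claim yourself, here's a rough sketch (the data size is arbitrary, and timings will vary by machine):
136 | 
137 | ```python
138 | import random
139 | import timeit
140 | 
141 | random.seed(0)
142 | big = pl.DataFrame({'dense': [[random.randint(0, 3) for _ in range(10)] for _ in range(100_000)]})
143 | 
144 | t_plugin = timeit.timeit(lambda: big.select(mp.non_zero_indices('dense')), number=10)
145 | t_eval = timeit.timeit(
146 |     lambda: big.select(pl.col('dense').list.eval(pl.arg_where(pl.element() != 0))),
147 |     number=10,
148 | )
149 | print(f"plugin: {t_plugin:.3f}s, list.eval: {t_eval:.3f}s")
150 | ```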
134 |
--------------------------------------------------------------------------------
/docs/lost_in_space.md:
--------------------------------------------------------------------------------
1 | # 12. Lost in space
2 |
3 | Suppose, hypothetically speaking, that you're lost somewhere and only have access
4 | to your latitude, your longitude, and a laptop on which you can write a Polars Plugin.
5 | How can you find out what the closest city to you is?
6 |
7 | ## Reverse geocoding
8 |
9 | The practice of starting with a (latitude, longitude) pair and finding out which
10 | city it corresponds to is known as reverse geocoding.
11 | We're not going to implement a reverse geocoder from scratch - instead, we'll
12 | use the `reverse-geocoder` crate and make a plugin out of it!
13 |
14 | ## Cargo here, cargo there, cargo everywhere
15 |
16 | Let's add that crate to our project by running `cargo add reverse-geocoder`.
17 | You'll need to activate the nightly Rust channel, which you can do by making
18 | a file `rust-toolchain.toml` in your root directory:
19 | ```toml
20 | [toolchain]
21 | channel = "nightly"
22 | ```
23 | You'll also need to add `polars-arrow` and `polars-core` to `Cargo.toml`
24 | and pin them to the same version that you pin `polars` to.
25 | Yes, this example is getting a bit heavier...
26 |
27 | The way the `reverse-geocoder` crate works is:
28 |
29 | - you instantiate a `ReverseGeocoder` instance
30 | - you pass a (latitude, longitude) pair to `search`
31 | - you get the city name out
32 |
33 | So our plugin will work by taking two `Float64` columns (one for latitude, one
34 | for longitude) and producing a String output column.
35 |
36 | ## Binary elementwise apply to buffer
37 |
38 | In [How to STRING something together], we learned how to use `StringChunked.apply_into_string_amortized`
39 | to run an elementwise function on a String column. Does Polars have a binary version of that one
40 | which allows us to start from any data type?
41 |
42 | [Prerequisites]: ../prerequisites/
43 | [How to STRING something together]: ../stringify/
44 |
45 | Unfortunately not. But this is a good chance to learn about a few new concepts!
46 |
47 | We'll start easy by dealing with the Python side. Add the following to `minimal_plugin/__init__.py`:
48 |
49 | ```python
50 | def reverse_geocode(lat: IntoExprColumn, long: IntoExprColumn) -> pl.Expr:
51 | return register_plugin_function(
52 | args=[lat, long], plugin_path=LIB, function_name="reverse_geocode", is_elementwise=True
53 | )
54 | ```
55 |
56 | On the Rust side, in `src/expressions.rs`, get ready for it, we're going to add:
57 |
58 | ```Rust
59 | use polars_arrow::array::MutablePlString;
60 | use polars_core::utils::align_chunks_binary;
61 | // for the utility function we use below:
62 | use polars::prelude::arity::binary_elementwise_into_string_amortized;
63 | use reverse_geocoder::ReverseGeocoder;
62 |
63 | #[polars_expr(output_type=String)]
64 | fn reverse_geocode(inputs: &[Series]) -> PolarsResult<Series> {
65 | let latitude = inputs[0].f64()?;
66 | let longitude = inputs[1].f64()?;
67 | let geocoder = ReverseGeocoder::new();
68 | let out = binary_elementwise_into_string_amortized(latitude, longitude, |lhs, rhs, out| {
69 | let search_result = geocoder.search((lhs, rhs));
70 | write!(out, "{}", search_result.record.name).unwrap();
71 | });
72 | Ok(out.into_series())
73 | }
74 | ```
75 |
76 | We use the utility function `binary_elementwise_into_string_amortized`,
77 | which is a binary version of `apply_into_string_amortized` which we learned
78 | about in the [Stringify] chapter.
79 |
80 | [Stringify]: ../stringify/
81 |
82 | To run it, put the following in `run.py`:
83 | ```python
84 | import polars as pl
85 | import minimal_plugin as mp
86 |
87 | df = pl.DataFrame({
88 | 'lat': [37.7749, 51.01, 52.5],
89 | 'lon': [-122.4194, -3.9, -.91]
90 | })
91 | print(df.with_columns(city=mp.reverse_geocode('lat', 'lon')))
92 | ```
93 | then compile with `maturin develop` (or `maturin develop --release` if you're benchmarking)
94 | and you should see
95 | ```
96 | shape: (3, 3)
97 | ┌─────────┬───────────┬───────────────────┐
98 | │ lat ┆ lon ┆ city │
99 | │ --- ┆ --- ┆ --- │
100 | │ f64 ┆ f64 ┆ str │
101 | ╞═════════╪═══════════╪═══════════════════╡
102 | │ 37.7749 ┆ -122.4194 ┆ San Francisco │
103 | │ 51.01 ┆ -3.9 ┆ South Molton │
104 | │ 52.5 ┆ -0.91 ┆ Market Harborough │
105 | └─────────┴───────────┴───────────────────┘
106 | ```
107 | in the output!
108 |
109 | Great, now in our hypothetical scenario, you're probably still lost, but
110 | at least you know which city you're closest to.
111 |
--------------------------------------------------------------------------------
/docs/noop.md:
--------------------------------------------------------------------------------
1 | # 1. How to do nothing
2 |
3 | That's right - this section is about how to do _nothing_.
4 |
5 | We'll write a Polars plugin which takes an expression, and returns it exactly
6 | as it is. Nothing more, nothing less. This will just be an exercise in setting
7 | everything up!
8 |
9 | If you followed the instructions in [Prerequisites], then your working directory
10 | should look a bit like the following:
11 | ```
12 | .
13 | ├── Cargo.toml
14 | ├── minimal_plugin
15 | │ ├── __init__.py
16 | │ └── typing.py
17 | ├── pyproject.toml
18 | ├── run.py
19 | ├── src
20 | │ ├── expressions.rs
21 | │ └── lib.rs
22 | └── tests
23 | ```
24 | The cookiecutter command you ran earlier set up a Polars plugin project with a
25 | sample function called `pig_latinnify` already implemented. The [Polars Plugins Cookiecutter](https://github.com/MarcoGorelli/cookiecutter-polars-plugins)
26 | helps you quickly start a Polars plugin project, skipping the boilerplate setup.
27 | Check it out for more details!
28 |
29 | [Prerequisites]: ../prerequisites/
30 |
31 | ## The Python side
32 |
33 | Let's start by getting the Python side ready. It won't run until we
34 | implement the Rust side too, but it's a necessary step.
35 | Start by adding the following to `minimal_plugin/__init__.py`:
36 |
37 | ```python
38 | def noop(expr: IntoExprColumn) -> pl.Expr:
39 | return register_plugin_function(
40 | args=[expr],
41 | plugin_path=LIB,
42 | function_name="noop",
43 | is_elementwise=True,
44 | )
45 | ```
46 | Let's go through this line-by-line:
47 |
48 | - when we compile Rust, it generates a Shared Object file.
49 | The `LIB` variable holds its filepath;
50 | - We'll cover `is_elementwise` in [Yes we SCAN], for now don't pay attention to it;
51 | - We use the Polars utility function [register_plugin_function](https://docs.pola.rs/py-polars/html/reference/plugins.html#polars.plugins.register_plugin_function) to extend its functionality with our own.
52 |
53 |
54 | Note that string literals are parsed as expressions, so that if somebody
55 | calls `noop('a')`, it gets interpreted as `noop(pl.col('a'))`.
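56 | 
57 | In other words, once everything below is compiled, these two calls build the same expression:
58 | 
59 | ```python
60 | mp.noop('a')          # string parsed as a column name...
61 | mp.noop(pl.col('a'))  # ...so this is equivalent
62 | ```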
56 |
57 | [Yes we SCAN]: ../cum_sum/
58 |
59 | ## Let's get Rusty
60 |
61 | Let's leave `src/lib.rs` as it is, and add the following to `src/expressions.rs`:
62 |
63 | ``` rust
64 | fn same_output_type(input_fields: &[Field]) -> PolarsResult<Field> {
65 | let field = &input_fields[0];
66 | Ok(field.clone())
67 | }
68 |
69 | #[polars_expr(output_type_func=same_output_type)]
70 | fn noop(inputs: &[Series]) -> PolarsResult<Series> {
71 | let s = &inputs[0];
72 | Ok(s.clone())
73 | }
74 | ```
75 |
76 | There's a lot to cover here so we'll break it down below.
77 |
78 | ### Defining `noop`'s schema
79 |
80 | Polars needs to know the schema/dtypes resulting from operations to make good
81 | optimization decisions. The way we tell Polars what to expect from our custom
82 | function is with the `polars_expr` attribute.
83 |
84 | Our beautiful `noop` doesn't change the data type (in fact, it doesn't change anything...)
85 | so we'll just write a function which returns the same input type:
86 |
87 | ```Rust
88 | fn same_output_type(input_fields: &[Field]) -> PolarsResult<Field> {
89 | let field = &input_fields[0];
90 | Ok(field.clone())
91 | }
92 | ```
93 | and use that to define the function output's schema. Just like
94 | `noop`, this function takes a reference to its only input and
95 | clones it.
96 |
97 | ### Defining `noop`'s body
98 |
99 | The input is an iterable of `Series`. In our case, `noop` just
100 | receives a single Series as input, but as we'll see in later
101 | sections, it's possible to pass multiple Series.
102 |
103 | We said we wanted our function to do nothing, so let's implement
104 | that: take a reference to the first (and only) input Series,
105 | and return a (cheap!) clone of it.
106 |
107 | ## Putting it all together
108 |
109 | Right, does this all work? Let's edit the Python file `run.py`,
110 | which we will use for testing. We'll just make a toy dataframe
111 | and apply `noop` to each column!
112 | ```python
113 | import polars as pl
114 | import minimal_plugin as mp
115 |
116 | df = pl.DataFrame({
117 | 'a': [1, 1, None],
118 | 'b': [4.1, 5.2, 6.3],
119 | 'c': ['hello', 'everybody!', '!']
120 | })
121 | print(df.with_columns(mp.noop(pl.all()).name.suffix('_noop')))
122 | ```
123 |
124 | Let's compile! Please run `maturin develop` (or `maturin develop --release` if benchmarking).
125 | You'll need to do this every time you change any of your Rust code.
126 | It may take a while the first time, but subsequent executions will
127 | be significantly faster as the build process is incremental.
128 |
129 | Finally, you can run your code! If you run `python run.py` and get
130 | the following output:
131 | ```
132 | shape: (3, 6)
133 | ┌──────┬─────┬────────────┬────────┬────────┬────────────┐
134 | │ a ┆ b ┆ c ┆ a_noop ┆ b_noop ┆ c_noop │
135 | │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
136 | │ i64 ┆ f64 ┆ str ┆ i64 ┆ f64 ┆ str │
137 | ╞══════╪═════╪════════════╪════════╪════════╪════════════╡
138 | │ 1 ┆ 4.1 ┆ hello ┆ 1 ┆ 4.1 ┆ hello │
139 | │ 1 ┆ 5.2 ┆ everybody! ┆ 1 ┆ 5.2 ┆ everybody! │
140 | │ null ┆ 6.3 ┆ ! ┆ null ┆ 6.3 ┆ ! │
141 | └──────┴─────┴────────────┴────────┴────────┴────────────┘
142 | ```
143 | then it means everything worked correctly. Congrats!
144 |
145 | You're now ready to learn how to do ABSolutely nothing.
146 |
--------------------------------------------------------------------------------
/docs/prerequisites.md:
--------------------------------------------------------------------------------
1 | # 0. Prerequisites
2 |
3 | ## Knowledge
4 |
5 | > "But you know what I like more than materialistic things? Knowledge." - _Tai Lopez_
6 |
7 | How much Rust do you need to know to write your own Polars plugin? Less than
8 | you think.
9 |
10 | I'd suggest starting out with the [Rustlings](https://github.com/rust-lang/rustlings)
11 | course, which provides some fun and interactive exercises designed to make you familiar
12 | with the language. I'd suggest starting with the following sections:
13 |
14 | - 00 intro
15 | - 01 variables
16 | - 02 functions
17 | - 03 if
18 | - 05 vecs
19 | - 12 options
20 | - 13 error handling
21 |
22 | You'll also need basic Python knowledge: classes, decorators, and functions.
23 |
24 | Alternatively, you could just clone this repo and then hack away
25 | at the examples trial-and-error style until you get what you're looking
26 | for - the compiler will probably help you more than you're expecting.
27 |
28 | ## Software
29 |
30 | To get started, please [install cookiecutter](https://cookiecutter.readthedocs.io/en/stable/README.html#installation).
31 |
32 | Then, from your home directory (or wherever you store your Python projects) please run
33 | ```
34 | cookiecutter https://github.com/MarcoGorelli/cookiecutter-polars-plugins
35 | ```
36 | When prompted, please enter (let's suppose your name is "Maja Anima", but replace that
37 | with your preferred name):
38 | ```
39 | [1/3] plugin_name (Polars Cookiecutter): Minimal Plugin
40 | [2/3] project_slug (polars_minimal_plugin):
41 | [3/3] author (anonymous): Maja Anima
42 | ```
43 | This will create a folder called `minimal_plugin`.
44 | Please navigate to it with `cd minimal_plugin`.
45 |
46 | Next, [create a Python 3.8+ virtual environment](https://docs.python.org/3/library/venv.html), and install:
47 |
48 | - `polars>=1.3.0`
49 | - `maturin>=1.4.0`
50 |
51 | Finally, you'll also need to [install Rust](https://rustup.rs/).
52 |
53 | That's it! However, you are highly encouraged to also install
54 | [rust-analyzer](https://rust-analyzer.github.io/manual.html) if you want to
55 | improve your Rust-writing experience by exactly 120%.
56 |
57 | ## What's in a Series?
58 |
59 | If you take a look at a Series such as
60 | ```python
61 | In [9]: s = pl.Series([None, 2, 3]) + 42
62 |
63 | In [10]: s
64 | Out[10]:
65 | shape: (3,)
66 | Series: '' [i64]
67 | [
68 | null
69 | 44
70 | 45
71 | ]
72 | ```
73 | you may be tempted to conclude that it contains three values: `[null, 44, 45]`.
74 |
75 | However, if you print out `s._get_buffers()`, you'll see
76 | something different:
77 |
78 | - `s._get_buffers()["values"]`: `[42, 44, 45]`. These are the _values_.
79 | - `s._get_buffers()["validity"]`: `[False, True, True]`. These are the _validities_.
80 |
81 | So we don't really have integers and `null` mixed together into a single array - we
82 | have a pair of arrays, one holding values and another one holding booleans indicating
83 | whether each value is valid or not.
84 | If a value appears as `null` to you, then there's no guarantee about what physical number
85 | is behind it! It was `42` here, but it could well be `43`, or any other number,
86 | in another example.
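87 | 
88 | You can check this yourself (note that `_get_buffers` is a private API, so its exact output may vary between Polars versions):
89 | 
90 | ```python
91 | bufs = s._get_buffers()
92 | print(bufs["values"].to_list())    # the physical values, e.g. [42, 44, 45]
93 | print(bufs["validity"].to_list())  # [False, True, True]
94 | ```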
87 |
88 | ## What's a chunk?
89 |
90 | A Series is backed by chunked arrays, each of which holds data which is contiguous in
91 | memory.
92 |
93 | Here's an example of a Series backed by multiple chunks:
94 | ```python
95 | In [27]: s = pl.Series([1,2,3])
96 |
97 | In [28]: s = s.append(pl.Series([99, 11]))
98 |
99 | In [29]: s
100 | Out[29]:
101 | shape: (5,)
102 | Series: '' [i64]
103 | [
104 | 1
105 | 2
106 | 3
107 | 99
108 | 11
109 | ]
110 |
111 | In [30]: s.get_chunks()
112 | Out[30]:
113 | [shape: (3,)
114 | Series: '' [i64]
115 | [
116 | 1
117 | 2
118 | 3
119 | ],
120 | shape: (2,)
121 | Series: '' [i64]
122 | [
123 | 99
124 | 11
125 | ]]
126 | ```
127 | Chunked arrays will come up in several examples in this tutorial.
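128 | 
129 | A small sketch for inspecting and collapsing chunks - `rechunk` copies everything into a single contiguous chunk, which is sometimes exactly what a plugin needs:
130 | 
131 | ```python
132 | print(s.n_chunks())  # 2
133 | s = s.rechunk()
134 | print(s.n_chunks())  # 1
135 | ```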
128 |
--------------------------------------------------------------------------------
/docs/publishing.md:
--------------------------------------------------------------------------------
1 | # 14. Publishing your plugin to PyPI and becoming famous
2 |
3 | Here are the steps you should follow:
4 |
5 | 1. publish plugin to PyPI
6 | 2. ???
7 | 3. profit
8 |
9 | This section deals with step 1, and assumes your project lives on GitHub.
10 |
11 | ## Set up trusted publishing
12 |
13 | If you followed the [Prerequisites] steps, you should have `.github/workflows/publish_to_pypi.yml`,
14 | `Makefile`, and `requirements.txt` files. If not, go back and follow the cookiecutter step.
15 |
16 | Next, set up an account on PyPI.org - you can't do much without that.
17 |
18 | Third, on PyPI, you'll want to (note: this is taken almost verbatim from [PyPA](https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/#configuring-trusted-publishing)):
19 |
20 | 1. Go to https://pypi.org/manage/account/publishing/.
21 | 2. Fill in the name you wish to publish your new PyPI project under (the name value in your pyproject.toml), the GitHub repository owner’s name (org or user), and repository name, and the name of the release workflow file under the .github/ folder, see Creating a workflow definition. Finally, add the name of the GitHub Environment (pypi) we’re going to set up under your repository. Register the trusted publisher.
22 |
23 | Finally, if you make a commit and tag it, and then push, then a release should be triggered! It will then be
24 | available for install across different platforms, which would be really hard (impossible?) to do if you were building
25 | the wheel manually and uploading to PyPI yourself.
26 |
27 | ## PYPI_API_TOKEN
28 |
29 | You'll also need a repository secret called `PYPI_API_TOKEN`. In PyPI,
30 | create an API token scoped just to your project, and then save it in your
31 | repository's secrets using the name `PYPI_API_TOKEN`.
32 |
33 | [Prerequisites]: ../prerequisites/
34 |
--------------------------------------------------------------------------------
/docs/requirements-docs.txt:
--------------------------------------------------------------------------------
1 | mkdocs
2 | mkdocs-material
--------------------------------------------------------------------------------
/docs/stem.md:
--------------------------------------------------------------------------------
1 | # 6. How to CRATE something else entirely
2 |
3 | Take a look at [crates.io](https://crates.io/) - there's _so_ much good stuff there!
4 | There's probably a package for practically any use case.
5 |
6 | For example, this looks like a fun one: [rust_stemmers](https://crates.io/crates/rust-stemmers).
7 | It lets us input a word, and stem it (i.e. reduce it to a simpler version, e.g. 'fearlessly' ->
8 | 'fearless').
9 | Can we make a plugin out of it?
10 |
11 | ## Cargo this, cargo that
12 |
13 | If we're going to use `rust_stemmers`, we're going to need to take it on as a dependency.
14 | The easiest way to do this is probably to run `cargo add rust_stemmers` - run this, and
15 | watch how `Cargo.toml` changes!
16 | You should see the line
17 | ```toml
18 | rust-stemmers = "1.2.0"
19 | ```
20 | somewhere in there.
21 |
22 | ## Writing a Snowball Stemmer
23 |
24 | Let's write a function which:
25 |
26 | - takes a `Utf8` column as input;
27 | - produces a `Utf8` column as output.
28 |
29 | We'd like to be able to call it as follows:
30 |
31 | ```python
32 | df.with_columns(stemmed_word=mp.snowball_stem('word'))
33 | ```
34 |
35 | On the Python side, let's add the following function to `minimal_plugin/__init__.py`:
36 |
37 | ```python
38 | def snowball_stem(expr: IntoExprColumn) -> pl.Expr:
39 | return register_plugin_function(
40 | args=[expr],
41 | plugin_path=LIB,
42 | function_name="snowball_stem",
43 | is_elementwise=True,
44 | )
45 | ```
46 |
47 | Then, we can define the function like this in `src/expressions.rs`:
48 |
49 | ```Rust
50 | use rust_stemmers::{Algorithm, Stemmer};
51 |
52 | #[polars_expr(output_type=String)]
53 | fn snowball_stem(inputs: &[Series]) -> PolarsResult {
54 | let ca: &StringChunked = inputs[0].str()?;
55 | let en_stemmer = Stemmer::create(Algorithm::English);
56 | let out: StringChunked = ca.apply_into_string_amortized(|value: &str, output: &mut String| {
57 | write!(output, "{}", en_stemmer.stem(value)).unwrap()
58 | });
59 | Ok(out.into_series())
60 | }
61 | ```
62 |
63 | Let's try it out! Put the following in `run.py`:
64 | ```python
65 | import polars as pl
66 | import minimal_plugin as mp
67 |
68 | df = pl.DataFrame({'word': ["fearlessly", "littleness", "lovingly", "devoted"]})
69 | print(df.with_columns(b=mp.snowball_stem('word')))
70 | ```
71 |
72 | If you then compile with `maturin develop` (or `maturin develop --release`
73 | if you're benchmarking), and run it with `python run.py`, you'll see:
74 | ```
75 | shape: (4, 2)
76 | ┌────────────┬──────────┐
77 | │ word       ┆ b        │
78 | │ --- ┆ --- │
79 | │ str ┆ str │
80 | ╞════════════╪══════════╡
81 | │ fearlessly ┆ fearless │
82 | │ littleness ┆ littl │
83 | │ lovingly ┆ love │
84 | │ devoted ┆ devot │
85 | └────────────┴──────────┘
86 | ```
87 |
88 | In this example, we took on an extra dependency, which increased
89 | the size of the package. By using plugins, we have a way of accessing
90 | extra functionality without having to bloat up the size of the main
91 | Polars install too much!
92 |
93 | ## Stretch goal
94 |
95 | Browse through `crates.io` - is there any other crate you could use
96 | to make your own plugin out of?
97 |
--------------------------------------------------------------------------------
/docs/stringify.md:
--------------------------------------------------------------------------------
1 | # 5. How to STRING something together
2 |
3 | Tired of examples which only include numeric data? Me neither.
4 | But we need to address the elephant in the room: strings.
5 |
6 | We're going to start by re-implementing a pig-latinnifier.
7 | This example is already part of the `pyo3-polars` repo examples,
8 | but we'll tackle it with a different spin here by first doing it
9 | the wrong way 😈.
10 |
11 | ## Pig-latinnify - take 1
12 |
13 | Let's start by doing this the wrong way.
14 | We'll use our `abs` example, and adapt it to the
15 | string case. We'll follow the same strategy:
16 |
17 | - iterate over arrow arrays;
18 | - for each element in each array, create a new output value.
19 |
20 | Put the following in `src/expressions.rs`:
21 |
22 | ```Rust
23 | use std::borrow::Cow;
24 | use std::fmt::Write;
25 |
26 | #[polars_expr(output_type=String)]
27 | fn pig_latinnify(inputs: &[Series]) -> PolarsResult<Series> {
28 | let s = &inputs[0];
29 | let ca = s.str()?;
30 | let out: StringChunked = ca.apply(|opt_v: Option<&str>| {
31 | opt_v.map(|value: &str| {
32 | // Not the recommended way to do it,
33 | // see below for a better way!
34 | if let Some(first_char) = value.chars().next() {
35 | Cow::Owned(format!("{}{}ay", &value[1..], first_char))
36 | } else {
37 | Cow::Borrowed(value)
38 | }
39 | })
40 | });
41 | Ok(out.into_series())
42 | }
43 | ```
44 | If you're not familiar with [clone-on-write](https://doc.rust-lang.org/std/borrow/enum.Cow.html),
45 | don't worry about it - we're about to see a simpler and better way to do this anyway.
46 | What I'd like you to focus on is that for every row, we're creating a new `String`.
47 |
48 | If you combine this with a Python definition (which you should put
49 | in `minimal_plugin/__init__.py`):
50 |
51 | ```python
52 | def pig_latinnify(expr: IntoExprColumn) -> pl.Expr:
53 | return register_plugin_function(
54 | args=[expr],
55 | plugin_path=LIB,
56 | function_name="pig_latinnify",
57 | is_elementwise=True,
58 | )
59 | ```
60 | then you'll be able to pig-latinnify a column of strings! To see it
61 | in action, compile with `maturin develop` (or `maturin develop --release`
62 | if you're benchmarking) and put the following in `run.py`:
63 |
64 | ```python
65 | import polars as pl
66 | import minimal_plugin as mp
67 |
68 | df = pl.DataFrame({'a': ["I", "love", "pig", "latin"]})
69 | print(df.with_columns(a_pig_latin=mp.pig_latinnify('a')))
70 | ```
71 | ```
72 | shape: (4, 2)
73 | ┌───────┬─────────────┐
74 | │ a ┆ a_pig_latin │
75 | │ --- ┆ --- │
76 | │ str ┆ str │
77 | ╞═══════╪═════════════╡
78 | │ I ┆ Iay │
79 | │ love ┆ ovelay │
80 | │ pig ┆ igpay │
81 | │ latin ┆ atinlay │
82 | └───────┴─────────────┘
83 | ```
84 |
85 | This will already be an order of magnitude faster than using `map_elements`.
86 | But as mentioned earlier, we're creating a new string for every single row.
87 |
88 | Can we do better?
89 |
90 | ## Pig-latinnify - take 2
91 |
92 | Yes! `StringChunked` has a utility `apply_into_string_amortized` method which amortises
93 | the cost of creating new strings for each row by creating a string upfront,
94 | clearing it, and repeatedly writing to it.
95 | This gives a 4x speedup! All you need to do is change `pig_latinnify` to:
96 |
97 | ```Rust
98 | #[polars_expr(output_type=String)]
99 | fn pig_latinnify(inputs: &[Series]) -> PolarsResult<Series> {
100 | let ca: &StringChunked = inputs[0].str()?;
101 | let out: StringChunked = ca.apply_into_string_amortized(|value: &str, output: &mut String| {
102 | if let Some(first_char) = value.chars().next() {
103 | write!(output, "{}{}ay", &value[1..], first_char).unwrap()
104 | }
105 | });
106 | Ok(out.into_series())
107 | }
108 | ```
109 |
110 | Simpler, faster, and more memory-efficient.
111 | _Thinking about allocations_ can really make a difference!
112 |
113 | ## So let's think about allocations!
114 |
115 | If you have an elementwise function which produces `String` output, then chances are it does one of the following:
116 |
117 | - Creates a new string. In this case, you can use `apply_into_string_amortized` to amortise the cost of allocating a new string for each input row,
118 | as we did above in `pig_latinnify`. This works by allocating a `String` upfront and then repeatedly re-writing to it.
119 | - Slices the original string. In this case, you can use `apply_values` with `Cow::Borrowed`, for example:
120 |
121 | ```rust
122 | fn remove_last_extension(s: &str) -> &str {
123 | match s.rfind('.') {
124 | Some(pos) => &s[..pos],
125 | None => s,
126 | }
127 | }
128 |
129 | #[polars_expr(output_type=String)]
130 | fn remove_extension(inputs: &[Series]) -> PolarsResult<Series> {
131 | let s = &inputs[0];
132 | let ca = s.str()?;
133 | let out: StringChunked = ca.apply_values(|val| {
134 | let res = Cow::Borrowed(remove_last_extension(val));
135 | res
136 | });
137 | Ok(out.into_series())
138 | }
139 | ```
140 |
141 | There are low-level optimisations you can do to take things further, but - if in doubt - `apply_into_string_amortized` / `binary_elementwise_into_string_amortized` are probably good enough.
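142 | 
143 | For example, assuming you also register `remove_extension` on the Python side just like `pig_latinnify` (not shown here), you could then call:
144 | 
145 | ```python
146 | df = pl.DataFrame({'fname': ['archive.tar.gz', 'notes.txt', 'README']})
147 | print(df.with_columns(stem=mp.remove_extension('fname')))
148 | # 'archive.tar', 'notes', 'README'
149 | ```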
142 |
--------------------------------------------------------------------------------
/docs/struct.md:
--------------------------------------------------------------------------------
1 | # 10. STRUCTin'
2 |
3 | > "Day one, I'm in love with your struct" Thumpasaurus (kinda)
4 |
5 | ---
6 |
7 | For this chapter, we need to start by activating the necessary feature - in `Cargo.toml`, please make this change:
8 |
9 | ```diff
10 | -polars = { version = "0.46.0", default-features = false }
11 | +polars = { version = "0.46.0", features=["dtype-struct"], default-features = false }
12 | ```
13 |
14 | ---
15 |
16 | How do we consume structs, and how do we return them?
17 |
18 | Let's try creating a Polars DataFrame in Python that stores a struct similar to this one:
19 |
20 | ```rust
21 | struct Point2D {
22 | x: f64,
23 | y: f64,
24 |     rgba: u32,
25 | }
26 | ```
27 |
28 | { style="display: block; margin: 0 auto;" }
29 |
30 | There are different ways of doing that, but that doesn't matter now. Here's one way:
31 | ```python
32 | df = pl.DataFrame(
33 | {
34 | "x": [1.0, 1.25, 1.5, 1.75],
35 | "y": [3.0, 2.75, 2.5, 2.25],
36 | "rgba": [0x00FF7FFF, 0xFF7F00FF, 0x7F7F7FFF, 0xD8D8D8FF],
37 | }
38 | ).select(
39 | point_2d_s=pl.struct(
40 | "x", "y", "rgba",
41 | schema={
42 | "x": pl.Float64,
43 | "y": pl.Float64,
44 | "rgba": pl.UInt32,
45 | }
46 | )
47 | )
48 | ```
49 |
50 | If we `print(df)`, here's what we have:
51 |
52 | ```
53 | shape: (4, 1)
54 | ┌────────────────────────┐
55 | │ point_2d_s │
56 | │ --- │
57 | │ struct[3] │
58 | ╞════════════════════════╡
59 | │ {1.0,3.0,16744447} │
60 | │ {1.25,2.75,4286513407} │
61 | │ {1.5,2.5,2139062271} │
62 | │ {1.75,2.25,3638089983} │
63 | └────────────────────────┘
64 | ```
65 |
66 | Now's an excellent time to ask: how's that stored in memory? Before we get to that answer, consider this other scenario, in Rust:
67 |
68 | ```rust
69 | let v: [Point2D; 4] = [
70 | Point2D {
71 | x: 1.0,
72 | y: 3.0,
73 |         rgba: 0x00FF7FFFu32,
74 | },
75 | Point2D {
76 | x: 1.25,
77 | y: 2.75,
78 |         rgba: 0xFF7F00FFu32,
79 | },
80 | Point2D {
81 | x: 1.5,
82 | y: 2.5,
83 |         rgba: 0x7F7F7FFFu32,
84 | },
85 | Point2D {
86 | x: 1.75,
87 | y: 2.25,
88 |         rgba: 0xD8D8D8FFu32,
89 | },
90 | ];
91 | ```
92 |
93 | How's this one stored in memory? You might find that answer easier: it's an array of struct instances, so we have the `x`, `y` and `rgba` fields contiguously in memory, like this:
94 |
95 | { style="display: block; margin: 0 auto;" }
96 |
97 | This is consistent with how C, C++, and many other languages store structs in memory.
98 | How's our struct-in-a-DataFrame different?
99 |
100 | Polars follows the Arrow protocol for structs, which means each field of the struct is stored in a Series, backed by chunks. Each chunk is contiguous in memory.
101 | In a scenario in which we have a single chunk for each field, this is how things would look:
102 |
103 | { style="display: block; margin: 0 auto;" }
104 |
105 | Since we never modified the DataFrame after creating it, this is exactly the situation of our initial example: no extra chunks were allocated.
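106 | 
107 | You can observe this struct-of-arrays layout from Python, since each field can be pulled out as its own Series:
108 | 
109 | ```python
110 | print(df.select(pl.col('point_2d_s').struct.field('rgba')))
111 | ```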
106 |
107 | ---
108 |
109 | Now that we have a better idea of how things work under the hood, let's jump to a practical plugin that takes a struct, and just prints the Series corresponding to each field - it'll return the same struct passed as input, with no alteration.
110 |
111 | First things first - this time we're gonna see something new. Polars does not allow us to write:
112 |
113 | ```rust
114 | #[polars_expr(output_type=Struct)]
115 | fn print_struct_fields(inputs: &[Series]) -> PolarsResult<Series> { ... }
116 | ```
117 |
118 | The way we inform Polars that a struct Series is being returned is a bit cumbersome - we do so by defining a separate function:
119 |
120 | ```rust
121 | fn struct_point_2d_output(input_fields: &[Field]) -> PolarsResult<Field> {
122 | let field = &input_fields[0];
123 | match field.dtype() {
124 | DataType::Struct(fields) => {
125 | Ok(Field::new("struct_point_2d".into(), DataType::Struct(fields.clone())))
126 | }
127 | dtype => polars_bail!(InvalidOperation: "expected Struct dtype, got {}", dtype),
128 | }
129 | }
130 | ```
131 |
132 | Then we use that function in our `polars_expr`, with a different "kwarg":
133 |
134 | ```rust
135 | #[polars_expr(output_type_func=struct_point_2d_output)]
136 | fn print_struct_fields(inputs: &[Series]) -> PolarsResult<Series> {
137 |
138 | let struct_ = inputs[0].struct_()?;
139 | let fields = struct_.fields_as_series();
140 |
141 | if fields.is_empty() {
142 | return Ok(inputs[0].clone());
143 | }
144 |
145 | let fields = fields
146 | .iter()
147 | .map(|s| {
148 | let s = s.clone();
149 | println!("{:?}", s);
150 | s
151 | })
152 |         .collect::<Vec<_>>();
153 |
154 | StructChunked::from_series(struct_.name().clone(), struct_.len(), fields.iter())
155 | .map(|ca| ca.into_series())
156 | }
157 | ```
158 |
159 | This is a very basic, "do-nothing" example. For this reason, we're not gonna spend too much time here.
160 | Still, you're encouraged to register the plugin and try it for yourself, as an exercise.
161 |
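162 | In case you want to try it: the Python-side registration follows the same pattern as always (this is exactly what `minimal_plugin/__init__.py` does):
163 |
164 | ```python
165 | def print_struct_fields(expr: IntoExprColumn) -> pl.Expr:
166 |     return register_plugin_function(
167 |         args=[expr],
168 |         plugin_path=LIB,
169 |         function_name="print_struct_fields",
170 |         is_elementwise=True,
171 |     )
172 | ```
173 |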
162 | Now, let's look at something more interesting.
163 |
164 | ---
165 |
166 | We'll write a plugin which takes a `Struct` as
167 | input, and shifts all values forwards by one key. So, for example, if
168 | the input is `{'a': 1, 'b': 2., 'c': '3'}`, then the output will be
169 | `{'a': 2., 'b': '3', 'c': 1}`.
170 |
171 | On the Python side, usual business:
172 |
173 | ```python
174 | def shift_struct(expr: IntoExprColumn) -> pl.Expr:
175 | return register_plugin_function(
176 | args=[expr],
177 | plugin_path=LIB,
178 | function_name="shift_struct",
179 | is_elementwise=True,
180 | )
181 | ```
182 |
183 | Then, we need to get the schema right.
184 |
185 | ```rust
186 | fn shifted_struct(input_fields: &[Field]) -> PolarsResult<Field> {
187 | let field = &input_fields[0];
188 | match field.dtype() {
189 | DataType::Struct(fields) => {
190 | let mut field_0 = fields[0].clone();
191 | let name = field_0.name.clone();
192 | field_0.set_name(fields[fields.len() - 1].name().clone());
193 | let mut fields = fields[1..]
194 | .iter()
195 | .zip(fields[0..fields.len() - 1].iter())
196 | .map(|(fld, name)| Field::new(name.name().clone(), fld.dtype().clone()))
197 |         .collect::<Vec<_>>();
198 | fields.push(field_0);
199 | Ok(Field::new(name, DataType::Struct(fields)))
200 | }
201 | _ => unreachable!(),
202 | }
203 | }
204 | ```
205 |
206 | In this case, I put the first field's name as the output struct's name, but it doesn't
207 | really matter what we put, as Polars doesn't allow us to rename expressions within
208 | plugins. You can always rename on the Python side if you really want to, but I'd suggest
209 | just letting Polars follow its usual "left-hand rule".
210 |
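211 | For instance, renaming on the Python side is just an `.alias` on the expression (`"abc_rotated"` is an arbitrary name):
212 |
213 | ```python
214 | df.with_columns(mp.shift_struct("abc").alias("abc_rotated"))
215 | ```
216 |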
211 | The function definition is going to follow a similar logic:
212 |
213 | ```rust
214 | #[polars_expr(output_type_func=shifted_struct)]
215 | fn shift_struct(inputs: &[Series]) -> PolarsResult<Series> {
216 | let struct_ = inputs[0].struct_()?;
217 | let fields = struct_.fields_as_series();
218 | if fields.is_empty() {
219 | return Ok(inputs[0].clone());
220 | }
221 | let mut field_0 = fields[0].clone();
222 | let name = field_0.name().clone();
223 | field_0.rename(fields[fields.len() - 1].name().clone());
224 | let mut fields = fields[1..]
225 | .iter()
226 | .zip(fields[..fields.len() - 1].iter())
227 | .map(|(s, name)| {
228 | let mut s = s.clone();
229 | s.rename(name.name().clone());
230 | s
231 | })
232 |         .collect::<Vec<_>>();
233 | fields.push(field_0);
234 | StructChunked::from_series(name, struct_.len(), fields.iter()).map(|ca| ca.into_series())
235 | }
236 | ```
237 |
238 | Let's try this out. Put the following in `run.py`:
239 |
240 | ```python
241 | import polars as pl
242 | import minimal_plugin as mp
243 |
244 | df = pl.DataFrame(
245 | {
246 | "a": [1, 3, 8],
247 | "b": [2.0, 3.1, 2.5],
248 | "c": ["3", "7", "3"],
249 | }
250 | ).select(abc=pl.struct("a", "b", "c"))
251 | print(df.with_columns(abc_shifted=mp.shift_struct("abc")))
252 | ```
253 |
254 | Compile with `maturin develop` (or `maturin develop --release` if you're
255 | benchmarking), and if you run `python run.py` you'll see:
256 |
257 | ```
258 | shape: (3, 2)
259 | ┌─────────────┬─────────────┐
260 | │ abc ┆ abc_shifted │
261 | │ --- ┆ --- │
262 | │ struct[3] ┆ struct[3] │
263 | ╞═════════════╪═════════════╡
264 | │ {1,2.0,"3"} ┆ {2.0,"3",1} │
265 | │ {3,3.1,"7"} ┆ {3.1,"7",3} │
266 | │ {8,2.5,"3"} ┆ {2.5,"3",8} │
267 | └─────────────┴─────────────┘
268 | ```
269 |
270 | The values look right - but is the schema?
271 | Let's take a look:
272 |
273 | ```python
274 | import pprint
275 | pprint.pprint(df.with_columns(abc_shifted=mp.shift_struct("abc")).schema)
276 | ```
277 |
278 | ```
279 | OrderedDict([('abc', Struct({'a': Int64, 'b': Float64, 'c': String})),
280 | ('abc_shifted', Struct({'a': Float64, 'b': String, 'c': Int64}))])
281 | ```
282 |
283 | Looks correct!
284 |
--------------------------------------------------------------------------------
/docs/sum.md:
--------------------------------------------------------------------------------
1 | # 3. How to do SUMthing
2 |
3 | So far, the expressions we wrote only took a single expression as input.
4 |
5 | What if we'd like to do something fancier, involving more than one expression?
6 | Let's try to write an expression which lets us do
7 |
8 | ```python
9 | df.with_columns(mp.sum_i64('a', 'b'))
10 | ```
11 |
12 | ## Take a ride on the Python side
13 |
14 | First, we need to be able to pass multiple inputs to our Rust function. We'll do that
15 | by using the `args` argument when we register our expression. Add the following to
16 | `minimal_plugin/__init__.py`:
17 |
18 | ```python
19 | def sum_i64(expr: IntoExprColumn, other: IntoExprColumn) -> pl.Expr:
20 | return register_plugin_function(
21 | args=[expr, other],
22 | plugin_path=LIB,
23 | function_name="sum_i64",
24 | is_elementwise=True,
25 | )
26 | ```
27 |
28 | ## I’ve got 1100011 problems but binary ain't one
29 |
30 | Time to write a binary function, in the sense that it takes two
31 | columns as input and produces a third.
32 | Polars gives us a handy `broadcast_binary_elementwise` function for computing binary elementwise operations!
33 |
34 | Add the following to `src/expressions.rs`:
35 |
36 | ```rust
37 | #[polars_expr(output_type=Int64)]
38 | fn sum_i64(inputs: &[Series]) -> PolarsResult<Series> {
39 | let left: &Int64Chunked = inputs[0].i64()?;
40 | let right: &Int64Chunked = inputs[1].i64()?;
41 | // Note: there's a faster way of summing two columns, see
42 | // section 7.
43 | let out: Int64Chunked = broadcast_binary_elementwise(
44 | left,
45 | right,
46 |         |left: Option<i64>, right: Option<i64>| match (left, right) {
47 | (Some(left), Some(right)) => Some(left + right),
48 | _ => None,
49 | },
50 | );
51 | Ok(out.into_series())
52 | }
53 | ```
54 | Note that you'll also need to add
55 | ```rust
56 | use polars::prelude::arity::broadcast_binary_elementwise;
57 | ```
58 | to the top of the `src/expressions.rs` file.
59 |
60 | !!! note
61 |
62 | There's a faster way of implementing this particular operation,
63 | which we'll cover later in the tutorial in [Branch mispredictions].
64 |
65 | The idea is:
66 |
67 | - for each row, if both `left` and `right` are valid (i.e. they are both
68 | `Some`), then we sum them;
69 | - if either of them is missing (`None`), then we return `None`.
70 |
71 | To try it out, remember to first compile with `maturin develop`
72 | (or `maturin develop --release` if you're benchmarking). Then
73 | if you make a `run.py` file with
74 | ```python
75 | import polars as pl
76 | import minimal_plugin as mp
77 |
78 | df = pl.DataFrame({'a': [1, 5, 2], 'b': [3, None, -1]})
79 | print(df.with_columns(a_plus_b=mp.sum_i64('a', 'b')))
80 | ```
81 | then `python run.py` should produce
82 | ```
83 | shape: (3, 3)
84 | ┌─────┬──────┬──────────┐
85 | │ a ┆ b ┆ a_plus_b │
86 | │ --- ┆ --- ┆ --- │
87 | │ i64 ┆ i64 ┆ i64 │
88 | ╞═════╪══════╪══════════╡
89 | │ 1 ┆ 3 ┆ 4 │
90 | │ 5 ┆ null ┆ null │
91 | │ 2 ┆ -1 ┆ 1 │
92 | └─────┴──────┴──────────┘
93 | ```
94 |
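95 | One bonus of `broadcast_binary_elementwise` (as opposed to zipping the two iterators ourselves) is that it also handles the case where one input has length 1, broadcasting it across the other. A sketch, reusing the `df` from above (we pass an `Int64` literal explicitly, since `pl.lit(10)` would default to `Int32`):
96 |
97 | ```python
98 | print(df.with_columns(a_plus_ten=mp.sum_i64('a', pl.lit(10, dtype=pl.Int64))))
99 | ```
100 |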
95 | [Branch mispredictions]: ../branch_mispredictions/
96 |
97 | ## Get over your exercises
98 |
99 | It's widely acknowledged that the best way to learn is by doing.
100 |
101 | Can you make `sum_numeric` (a generic version of `sum_i64`)?
102 | Can you support the case when `left` and `right` are of different
103 | types, e.g. `i8` plus `i16`?
104 |
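105 | If you'd like a hint for the first exercise: it's the same dtype-dispatch trick as `abs_numeric`. A possible skeleton - just a sketch covering two dtypes; extending it, and handling mixed dtypes by casting first, is left to you:
106 |
107 | ```rust
108 | #[polars_expr(output_type_func=same_output_type)]
109 | fn sum_numeric(inputs: &[Series]) -> PolarsResult<Series> {
110 |     match (inputs[0].dtype(), inputs[1].dtype()) {
111 |         (DataType::Int32, DataType::Int32) => {
112 |             let out: Int32Chunked = broadcast_binary_elementwise(
113 |                 inputs[0].i32()?,
114 |                 inputs[1].i32()?,
115 |                 |l: Option<i32>, r: Option<i32>| match (l, r) {
116 |                     (Some(l), Some(r)) => Some(l + r),
117 |                     _ => None,
118 |                 },
119 |             );
120 |             Ok(out.into_series())
121 |         },
122 |         (DataType::Int64, DataType::Int64) => {
123 |             let out: Int64Chunked = broadcast_binary_elementwise(
124 |                 inputs[0].i64()?,
125 |                 inputs[1].i64()?,
126 |                 |l: Option<i64>, r: Option<i64>| match (l, r) {
127 |                     (Some(l), Some(r)) => Some(l + r),
128 |                     _ => None,
129 |                 },
130 |             );
131 |             Ok(out.into_series())
132 |         },
133 |         (ldt, rdt) => polars_bail!(InvalidOperation: "dtypes {} and {} not supported", ldt, rdt),
134 |     }
135 | }
136 | ```
137 |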
--------------------------------------------------------------------------------
/docs/vec_of_option.md:
--------------------------------------------------------------------------------
1 |
2 | # 13. `Vec<Option<T>>` vs. `Vec<T>`
3 |
4 | > "I got, I got, I got, I got options" – _Pitbull_, before writing his first Polars plugin
5 |
6 | In the plugins we've looked at so far, we typically created an iterator of options and let Polars collect it into a `ChunkedArray`.
7 | Sometimes, however, you need to store intermediate values in a `Vec`. You might be tempted to make it a `Vec<Option<T>>`, where
8 | missing values are `None` and present values are `Some`...
9 |
10 | 🛑 BUT WAIT!
11 |
12 | Did you know that `Vec<Option<i32>>` occupies twice as much memory as `Vec<i32>`? Let's prove it:
13 |
14 | ```rust
15 | use std::mem::size_of_val;
16 |
17 | fn main() {
18 |     let vector: Vec<i32> = vec![1, 2, 3];
19 | println!("{}", size_of_val(&*vector));
20 | // Output: 12
21 |
22 |     let vector: Vec<Option<i32>> = vec![Some(1), Some(2), Some(3)];
23 | println!("{}", size_of_val(&*vector));
24 | // Output: 24
25 | }
26 | ```
27 |
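28 | The reason is alignment: `Option<i32>` carries a discriminant, and padding rounds the whole thing up to 8 bytes. (If the payload type has a spare "niche", like `NonZeroI32`, the compiler can do better - but plain integers and floats have none.) You can check this for yourself:
29 |
30 | ```rust
31 | use std::mem::size_of;
32 |
33 | fn main() {
34 |     println!("{}", size_of::<i32>()); // 4
35 |     println!("{}", size_of::<Option<i32>>()); // 8: one-byte tag, padded to alignment
36 |     println!("{}", size_of::<Option<std::num::NonZeroI32>>()); // 4: zero is the niche
37 | }
38 | ```
39 |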
28 | So...how can we create an output which includes missing values, without allocating twice as much memory as is necessary?
29 |
30 | ## Validity mask
31 |
32 | Instead of creating a vector of options, we can create a vector of primitive values with zeroes in place of the missing values, and use
33 | a validity mask to indicate which values are missing. One example of this can be seen in Polars' `interpolate_impl`, which does the heavy lifting for
34 | [`Series.interpolate`](https://docs.pola.rs/api/python/version/0.18/reference/series/api/polars.Series.interpolate.html):
35 |
36 | ```rust
37 | fn interpolate_impl<T, I>(chunked_arr: &ChunkedArray<T>, interpolation_branch: I) -> ChunkedArray<T>
38 | where
39 | T: PolarsNumericType,
40 |     I: Fn(T::Native, T::Native, IdxSize, T::Native, &mut Vec<T::Native>),
41 | {
42 | // This implementation differs from pandas as that boundary None's are not removed.
43 | // This prevents a lot of errors due to expressions leading to different lengths.
44 | if !chunked_arr.has_nulls() || chunked_arr.null_count() == chunked_arr.len() {
45 | return chunked_arr.clone();
46 | }
47 |
48 | // We first find the first and last so that we can set the null buffer.
49 | let first = chunked_arr.first_non_null().unwrap();
50 | let last = chunked_arr.last_non_null().unwrap() + 1;
51 |
52 | // Fill out with `first` nulls.
53 | let mut out = Vec::with_capacity(chunked_arr.len());
54 | let mut iter = chunked_arr.iter().skip(first);
55 | for _ in 0..first {
56 | out.push(Zero::zero());
57 | }
58 |
59 | // The next element of `iter` is definitely `Some(Some(v))`, because we skipped the first
60 | // elements `first` and if all values were missing we'd have done an early return.
61 | let mut low = iter.next().unwrap().unwrap();
62 | out.push(low);
63 | while let Some(next) = iter.next() {
64 | if let Some(v) = next {
65 | out.push(v);
66 | low = v;
67 | } else {
68 | let mut steps = 1 as IdxSize;
69 | for next in iter.by_ref() {
70 | steps += 1;
71 | if let Some(high) = next {
72 | let steps_n: T::Native = NumCast::from(steps).unwrap();
73 | interpolation_branch(low, high, steps, steps_n, &mut out);
74 | out.push(high);
75 | low = high;
76 | break;
77 | }
78 | }
79 | }
80 | }
81 | if first != 0 || last != chunked_arr.len() {
82 | let mut validity = MutableBitmap::with_capacity(chunked_arr.len());
83 | validity.extend_constant(chunked_arr.len(), true);
84 |
85 | for i in 0..first {
86 | validity.set(i, false);
87 | }
88 |
89 | for i in last..chunked_arr.len() {
90 | validity.set(i, false);
91 | out.push(Zero::zero())
92 | }
93 |
94 | let array = PrimitiveArray::new(
95 | T::get_dtype().to_arrow(CompatLevel::newest()),
96 | out.into(),
97 | Some(validity.into()),
98 | );
99 | ChunkedArray::with_chunk(PlSmallStr::EMPTY, array)
100 | } else {
101 |         ChunkedArray::from_vec(chunked_arr.name().clone(), out)
102 | }
103 | }
104 | ```
105 |
106 | That's a lot to digest at once, so let's take small steps and focus on the core logic.
107 | At the start, we store the indexes of the first and last non-null values:
108 |
109 | ```rust
110 | let first = chunked_arr.first_non_null().unwrap();
111 | let last = chunked_arr.last_non_null().unwrap() + 1;
112 | ```
113 |
114 | We then create a vector `out` to store the result values in, and in places where we'd like
115 | the output to be missing, we push zeroes (we'll see below how we tell Polars that these are
116 | to be considered missing, rather than as ordinary zeroes):
117 |
118 | ```rust
119 | let mut out = Vec::with_capacity(chunked_arr.len());
120 | for _ in 0..first {
121 | out.push(Zero::zero());
122 | }
123 | ```
124 |
125 | We then skip the first `first` elements and start interpolating (note how we write `out.push(low)`,
126 | not `out.push(Some(low))`; we gloss over the rest, as it's not related to the main focus of this chapter):
127 |
128 | ```rust
129 | let mut iter = chunked_arr.iter().skip(first);
130 | let mut low = iter.next().unwrap().unwrap();
131 | out.push(low);
132 | while let Some(next) = iter.next() {
133 | // Interpolation logic
134 | }
135 | ```
136 |
137 | Now, after _most_ of the work is done and we've filled up most of `out`,
138 | we create a validity mask and set it to `false` for elements which we'd like to declare as missing:
139 |
140 | ```rust
141 | if first != 0 || last != chunked_arr.len() {
142 | // A validity mask is created for the vector, initially all set to true
143 | let mut validity = MutableBitmap::with_capacity(chunked_arr.len());
144 | validity.extend_constant(chunked_arr.len(), true);
145 |
146 | for i in 0..first {
147 | // The indexes corresponding to the zeroes before the first valid value
148 | // are set to false (invalid)
149 | validity.set(i, false);
150 | }
151 |
152 | for i in last..chunked_arr.len() {
153 | // The indexes corresponding to the values after the last valid value
154 | // are set to false (invalid)
155 | validity.set(i, false);
156 |
157 | out.push(Zero::zero()) // Push zeroes after the last valid value, as
158 | // many as there are nulls at the end, just like
159 | // it was done before the first valid value.
160 | }
161 |
162 | let array = PrimitiveArray::new(
163 | T::get_dtype().to_arrow(CompatLevel::newest()),
164 | out.into(),
165 | Some(validity.into()),
166 | );
167 | ChunkedArray::with_chunk(PlSmallStr::EMPTY, array)
168 | } else {
169 |     ChunkedArray::from_vec(chunked_arr.name().clone(), out)
170 | }
171 | ```
172 |
173 | The `MutableBitmap` only requires one byte per 8 elements, so the total space used is much less than it would've been
174 | if we'd created `out` as a vector of options!
175 | Further, note how the validity mask is only allocated when the output contains nulls - if there are no nulls, we can
176 | save even more memory by not having a validity mask at all!
177 |
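178 | To see the whole technique in isolation, here's a minimal self-contained sketch (assuming the `polars` and `polars-arrow` crates as dependencies) which builds a 5-element `Int64Chunked` whose first and last elements are missing:
179 |
180 | ```rust
181 | use polars::prelude::*;
182 | use polars_arrow::array::PrimitiveArray;
183 | use polars_arrow::bitmap::MutableBitmap;
184 | use polars_arrow::datatypes::ArrowDataType;
185 |
186 | fn main() {
187 |     // Zeroes stand in for the missing first and last elements.
188 |     let out: Vec<i64> = vec![0, 1, 2, 3, 0];
189 |
190 |     // Validity mask: all true, then mark positions 0 and 4 as missing.
191 |     let mut validity = MutableBitmap::with_capacity(out.len());
192 |     validity.extend_constant(out.len(), true);
193 |     validity.set(0, false);
194 |     validity.set(4, false);
195 |
196 |     let array = PrimitiveArray::new(ArrowDataType::Int64, out.into(), Some(validity.into()));
197 |     let ca: Int64Chunked = ChunkedArray::with_chunk(PlSmallStr::EMPTY, array);
198 |     println!("{:?}", ca.into_series()); // [null, 1, 2, 3, null]
199 | }
200 | ```
201 |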
178 | ## Sentinel values
179 |
180 | Let's look at another example of where it's possible to avoid allocating a vector of options. This example comes
181 | from the Polars-XDT plugin. There's one function there which creates a temporary `idx` vector in which, for
182 | each element, we store the index of the previous element larger than it. If an element has no previous larger
183 | element, then rather than storing `None` (thus forcing all non-missing elements to be `Some`), we can just
184 | store `-1`.
185 |
186 | Take a look at [this diff from a PR](https://github.com/pola-rs/polars-xdt/pull/79/files#diff-991878a926639bba03bcc36a2790f73181b358f2ff59e0256f9ad76aa707be35) which does exactly that,
187 | in which most changes are along the lines of:
188 |
189 | ```diff
190 | - if i < Some(0) {
191 | - idx.push(None);
192 | + if i < 0 {
193 | + idx.push(-1);
194 | ```
195 |
196 | There's no functional behaviour change, but we already know the memory benefits!
197 |
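198 | To make the idea concrete without digging through the diff, here's a small hypothetical sketch (not the actual Polars-XDT code) of computing previous-larger-element indices with `-1` as the sentinel, using a monotonic stack:
199 |
200 | ```rust
201 | // For each element, find the index of the previous strictly-larger element;
202 | // -1 means "no such element" (instead of storing Option<usize>).
203 | fn previous_larger_indices(values: &[i64]) -> Vec<i64> {
204 |     let mut idx: Vec<i64> = Vec::with_capacity(values.len());
205 |     let mut stack: Vec<usize> = Vec::new(); // indices with strictly decreasing values
206 |     for (i, v) in values.iter().enumerate() {
207 |         while let Some(&top) = stack.last() {
208 |             if values[top] > *v {
209 |                 break;
210 |             }
211 |             stack.pop();
212 |         }
213 |         idx.push(stack.last().map_or(-1, |&j| j as i64)); // -1 instead of None
214 |         stack.push(i);
215 |     }
216 |     idx
217 | }
218 |
219 | fn main() {
220 |     assert_eq!(previous_larger_indices(&[3, 1, 4, 1, 5]), vec![-1, 0, -1, 2, -1]);
221 | }
222 | ```
223 |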
198 | ## Conclusion
199 |
200 | In general, _if you can allocate `Vec<T>` instead of `Vec<Option<T>>`,_ __do it__!
201 |
202 | !!! note
203 |
204 | This advice only applies if you're creating a vector to store results in. If you're collecting
205 | an iterator of options into a chunked array, then Polars already optimises this for you.
206 |
--------------------------------------------------------------------------------
/docs/where_to_go.md:
--------------------------------------------------------------------------------
1 | # Where to go from here?
2 |
3 | What now?
4 |
5 | If this material was a bit overwhelming for you, I'd suggest taking a step back
6 | and reading [The Rust Programming Language](https://doc.rust-lang.org/book).
7 | Or at least, the first 10 chapters.
8 |
9 | Next, you may be interested in looking at existing plugins for inspiration.
10 | There's a nice list of them in the official user guide: https://docs.pola.rs/user-guide/plugins/your-first-polars-plugin/#community-plugins.
11 |
12 | Finally, you should definitely join the Discord Server, where there's a channel
13 | dedicated to plugins: https://discord.gg/4UfP5cfBE7.
14 |
--------------------------------------------------------------------------------
/minimal_plugin/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from typing import TYPE_CHECKING
3 |
4 | import polars as pl
5 | from pathlib import Path
6 |
7 | from polars.plugins import register_plugin_function
8 |
9 |
10 | LIB = Path(__file__).parent
11 |
12 | if TYPE_CHECKING:
13 | from minimal_plugin.typing import IntoExprColumn
14 |
15 |
16 | def noop(expr: IntoExprColumn) -> pl.Expr:
17 | return register_plugin_function(
18 | args=[expr],
19 | plugin_path=LIB,
20 | function_name="noop",
21 | is_elementwise=True,
22 | )
23 |
24 |
25 | def abs_i64(expr: IntoExprColumn) -> pl.Expr:
26 | return register_plugin_function(
27 | args=[expr],
28 | plugin_path=LIB,
29 | function_name="abs_i64",
30 | is_elementwise=True,
31 | )
32 |
33 |
34 | def abs_numeric(expr: IntoExprColumn) -> pl.Expr:
35 | return register_plugin_function(
36 | args=[expr],
37 | plugin_path=LIB,
38 | function_name="abs_numeric",
39 | is_elementwise=True,
40 | )
41 |
42 |
43 | def sum_i64(expr: IntoExprColumn, other: IntoExprColumn) -> pl.Expr:
44 | return register_plugin_function(
45 | args=[expr, other],
46 | plugin_path=LIB,
47 | function_name="sum_i64",
48 | is_elementwise=True,
49 | )
50 |
51 |
52 | def cum_sum(expr: IntoExprColumn) -> pl.Expr:
53 | return register_plugin_function(
54 | args=[expr],
55 | plugin_path=LIB,
56 | function_name="cum_sum",
57 | is_elementwise=False,
58 | )
59 |
60 |
61 | def pig_latinnify(expr: IntoExprColumn) -> pl.Expr:
62 | return register_plugin_function(
63 | args=[expr],
64 | plugin_path=LIB,
65 | function_name="pig_latinnify",
66 | is_elementwise=True,
67 | )
68 |
69 |
70 | def remove_extension(expr: IntoExprColumn) -> pl.Expr:
71 | return register_plugin_function(
72 | args=[expr],
73 | plugin_path=LIB,
74 | function_name="remove_extension",
75 | is_elementwise=True,
76 | )
77 |
78 |
79 | def abs_i64_fast(expr: IntoExprColumn) -> pl.Expr:
80 | return register_plugin_function(
81 | args=[expr],
82 | plugin_path=LIB,
83 | function_name="abs_i64_fast",
84 | is_elementwise=True,
85 | )
86 |
87 |
88 | def add_suffix(expr: IntoExprColumn, *, suffix: str) -> pl.Expr:
89 | return register_plugin_function(
90 | args=[expr],
91 | plugin_path=LIB,
92 | function_name="add_suffix",
93 | is_elementwise=True,
94 | kwargs={"suffix": suffix},
95 | )
96 |
97 |
98 | def snowball_stem(expr: IntoExprColumn) -> pl.Expr:
99 | return register_plugin_function(
100 | args=[expr],
101 | plugin_path=LIB,
102 | function_name="snowball_stem",
103 | is_elementwise=True,
104 | )
105 |
106 |
107 | def weighted_mean(expr: IntoExprColumn, weights: IntoExprColumn) -> pl.Expr:
108 | return register_plugin_function(
109 | args=[expr, weights],
110 | plugin_path=LIB,
111 | function_name="weighted_mean",
112 | is_elementwise=True,
113 | )
114 |
115 |
116 | def print_struct_fields(expr: IntoExprColumn) -> pl.Expr:
117 | return register_plugin_function(
118 | args=[expr],
119 | plugin_path=LIB,
120 | function_name="print_struct_fields",
121 | is_elementwise=True,
122 | )
123 |
124 |
125 | def shift_struct(expr: IntoExprColumn) -> pl.Expr:
126 | return register_plugin_function(
127 | args=[expr],
128 | plugin_path=LIB,
129 | function_name="shift_struct",
130 | is_elementwise=True,
131 | )
132 |
133 |
134 | def reverse_geocode(lat: IntoExprColumn, long: IntoExprColumn) -> pl.Expr:
135 | return register_plugin_function(
136 | args=[lat, long],
137 | plugin_path=LIB,
138 | function_name="reverse_geocode",
139 | is_elementwise=True,
140 | )
141 |
142 |
143 | def non_zero_indices(expr: IntoExprColumn) -> pl.Expr:
144 | return register_plugin_function(
145 | args=[expr],
146 | plugin_path=LIB,
147 | function_name="non_zero_indices",
148 | is_elementwise=True,
149 | )
150 |
151 |
152 | def vertical_weighted_mean(values: IntoExprColumn, weights: IntoExprColumn) -> pl.Expr:
153 | return register_plugin_function(
154 | args=[values, weights],
155 | plugin_path=LIB,
156 | function_name="vertical_weighted_mean",
157 | is_elementwise=False,
158 | returns_scalar=True,
159 | )
160 |
161 |
162 | def interpolate(expr: IntoExprColumn) -> pl.Expr:
163 | return register_plugin_function(
164 | args=[expr],
165 | plugin_path=LIB,
166 | function_name="interpolate",
167 | is_elementwise=False,
168 | )
169 |
170 |
171 | def life_step(
172 | left: IntoExprColumn, mid: IntoExprColumn, right: IntoExprColumn
173 | ) -> pl.Expr:
174 | return register_plugin_function(
175 | args=[left, mid, right],
176 | plugin_path=LIB,
177 | function_name="life_step",
178 | is_elementwise=False,
179 | )
180 |
181 |
182 | def midpoint_2d(expr: IntoExprColumn, ref_point: tuple[float, float]) -> pl.Expr:
183 | return register_plugin_function(
184 | args=[expr],
185 | plugin_path=LIB,
186 | function_name="midpoint_2d",
187 | is_elementwise=True,
188 | kwargs={"ref_point": ref_point},
189 | )
190 |
--------------------------------------------------------------------------------
/minimal_plugin/typing.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING, Union
2 |
3 | if TYPE_CHECKING:
4 | import sys
5 | import polars as pl
6 |
7 | if sys.version_info >= (3, 10):
8 | from typing import TypeAlias
9 | else:
10 | from typing_extensions import TypeAlias
11 | from polars.datatypes import DataType, DataTypeClass
12 |
13 | IntoExprColumn: TypeAlias = Union[pl.Expr, str, pl.Series]
14 | PolarsDataType: TypeAlias = Union[DataType, DataTypeClass]
15 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Polars plugins tutorial
2 | repo_url: https://github.com/MarcoGorelli/polars-plugins-tutorial
3 |
4 | theme:
5 | name: material
6 | font: false
7 | features:
8 | - content.code.copy
9 | - content.code.annotate
10 | - navigation.footer
11 |
12 | nav:
13 | - Home: index.md
14 | - prerequisites.md
15 | - noop.md
16 | - abs.md
17 | - sum.md
18 | - cum_sum.md
19 | - stringify.md
20 | - stem.md
21 | - branch_mispredictions.md
22 | - arguments.md
23 | - 9. Lists at last:
24 | - lists.md
25 | - lists_in_lists_out.md
26 | - struct.md
27 | - arrays.md
28 | - lost_in_space.md
29 | - vec_of_option.md
30 | - publishing.md
31 | - aggregate.md
32 | - "Extra: Can we run Doom?":
33 | - life_pt1.md
34 | - life_pt2.md
35 | - where_to_go.md
36 |
37 | plugins:
38 | - search
39 |
40 | # Extensions
41 | markdown_extensions:
42 | - abbr
43 | - admonition
44 | - attr_list
45 | - def_list
46 | - footnotes
47 | - md_in_html
48 | - toc:
49 | permalink: true
50 | - pymdownx.arithmatex:
51 | generic: true
52 | - pymdownx.betterem:
53 | smart_enable: all
54 | - pymdownx.caret
55 | - pymdownx.details
56 | - pymdownx.emoji:
57 | emoji_generator: !!python/name:material.extensions.emoji.to_svg
58 | emoji_index: !!python/name:material.extensions.emoji.twemoji
59 | - pymdownx.highlight:
60 | anchor_linenums: true
61 | line_spans: __span
62 | pygments_lang_class: true
63 | - pymdownx.inlinehilite
64 | - pymdownx.keys
65 | - pymdownx.magiclink:
66 | normalize_issue_symbols: true
67 | repo_url_shorthand: true
68 | user: squidfunk
69 | repo: mkdocs-material
70 | - pymdownx.mark
71 | - pymdownx.smartsymbols
72 | - pymdownx.snippets:
73 | auto_append:
74 | - includes/mkdocs.md
75 | - pymdownx.superfences:
76 | custom_fences:
77 | - name: mermaid
78 | class: mermaid
79 | format: !!python/name:pymdownx.superfences.fence_code_format
80 | - pymdownx.tabbed:
81 | alternate_style: true
82 | combine_header_slug: true
83 | slugify: !!python/object/apply:pymdownx.slugs.slugify
84 | kwds:
85 | case: lower
86 | - pymdownx.tasklist:
87 | custom_checkbox: true
88 | - pymdownx.tilde
89 |
--------------------------------------------------------------------------------
/perf.py:
--------------------------------------------------------------------------------
1 | import timeit
2 | import warnings
3 | import numpy as np
4 |
5 | setup = """
6 | import pandas as pd
7 | import polars as pl
8 | import minimal_plugin # noqa: F401
9 | import numpy as np
10 | rng = np.random.default_rng(12345)
11 | N = 10_000_000
12 |
13 | df = pl.DataFrame({'a': rng.integers(low=-100, high=100, size=N)})
14 | df = df.with_row_index().with_columns(
15 | pl.when(pl.col('index')%2==1).then(pl.lit(None)).otherwise(pl.col('a')).alias('a')
16 | )
17 | """
18 |
19 | results = (
20 | np.array(
21 | timeit.Timer(
22 | stmt="df.select(pl.col('a').mp.abs_i64_fast())",
23 | setup=setup,
24 | ).repeat(7, 3)
25 | )
26 | / 3
27 | )
28 | print(f"min: {min(results)}")
29 | print(f"max: {max(results)}")
30 | print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}")
31 |
32 | results = (
33 | np.array(
34 | timeit.Timer(
35 | stmt="df.select(pl.col('a').mp.abs_i64())",
36 | setup=setup,
37 | ).repeat(7, 3)
38 | )
39 | / 3
40 | )
41 | print(f"min: {min(results)}")
42 | print(f"max: {max(results)}")
43 | print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}")
44 |
45 | with warnings.catch_warnings():
46 | warnings.simplefilter("ignore")
47 | results = (
48 | np.array(
49 | timeit.Timer(
50 | stmt="df.select(pl.col('a').map_elements(lambda x: abs(x)))",
51 | setup=setup,
52 | ).repeat(7, 3)
53 | )
54 | / 3
55 | )
56 | print(f"min: {min(results)}")
57 | print(f"max: {max(results)}")
58 | print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}")
59 |
--------------------------------------------------------------------------------
/perf_list.py:
--------------------------------------------------------------------------------
1 | import timeit
2 | import warnings
3 | import numpy as np
4 |
5 | setup = """
6 | import polars as pl
7 | import minimal_plugin as mp
8 | import numpy as np
9 | rng = np.random.default_rng(12345)
10 | N = 100_000
11 |
12 | df = pl.DataFrame({'a': [rng.integers(low=-100, high=100, size=5) for _ in range(N)]})
13 | """
14 |
15 | results = (
16 | np.array(
17 | timeit.Timer(
18 | stmt="df.select(mp.non_zero_indices('a'))",
19 | setup=setup,
20 | ).repeat(7, 3)
21 | )
22 | / 3
23 | )
24 | print(f"min: {min(results)}")
25 | print(f"max: {max(results)}")
26 | print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}")
27 |
28 | results = (
29 | np.array(
30 | timeit.Timer(
31 | stmt="df.select(pl.col('a').list.eval(pl.arg_where(pl.element() != 0)))",
32 | setup=setup,
33 | ).repeat(7, 3)
34 | )
35 | / 3
36 | )
37 | print(f"min: {min(results)}")
38 | print(f"max: {max(results)}")
39 | print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}")
40 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["maturin>=1.0,<2.0", "polars>=1.3.0"]
3 | build-backend = "maturin"
4 |
5 | [project]
6 | name = "minimal_plugin" # Should match the folder with your code!
7 | requires-python = ">=3.8"
8 | classifiers = [
9 | "Programming Language :: Rust",
10 | "Programming Language :: Python :: Implementation :: CPython",
11 | "Programming Language :: Python :: Implementation :: PyPy",
12 | ]
13 | dynamic = ["version"]
14 |
15 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | ruff
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | maturin>=1.4.0
2 | polars>=1.3.0
3 |
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 |
2 | import polars as pl
3 | import minimal_plugin as mp
4 |
5 |
6 | df = pl.DataFrame(
7 | {"values": [[1, 3, 2], [5, 7], []], "weights": [[0.5, 0.3, 0.2], [0.1, 0.9], []]}
8 | )
9 | print(df.with_columns(weighted_mean=mp.weighted_mean("values", "weights")))
10 |
11 | df = pl.DataFrame(
12 | {
13 | "english": ["foo", "bar", ""],
14 | }
15 | )
16 | print(df.with_columns(pig_latin=mp.pig_latinnify("english")))
17 |
18 | df = pl.DataFrame(
19 | {
20 | "values": [1.0, 3, 2, 5, 7],
21 | "weights": [0.5, 0.3, 0.2, 0.1, 0.9],
22 | "group": ["a", "a", "a", "b", "b"],
23 | }
24 | )
25 | print(
26 | df.group_by("group").agg(
27 | weighted_mean=mp.vertical_weighted_mean("values", "weights")
28 | )
29 | )
30 |
31 | df = pl.DataFrame(
32 | {
33 | "a": [None, None, 3, None, None, 9, 11, None],
34 | }
35 | )
36 | result = df.with_columns(interpolate=mp.interpolate("a"))
37 | print(result)
38 |
39 |
40 | df = pl.DataFrame({
41 | 'filename': [
42 | "requirements.txt", "Makefile", "pkg.tar.gz", "tmp.d"
43 | ],
44 | })
45 | print(df.with_columns(without_ext=mp.remove_extension('filename')))
46 |
47 | points = pl.Series(
48 | "points",
49 | [
50 | [6.63, 8.35],
51 | [7.19, 4.85],
52 | [2.1, 4.21],
53 | [3.4, 6.13],
54 | [2.48, 9.26],
55 | [9.41, 7.26],
56 | [7.45, 8.85],
57 | [6.58, 5.22],
58 | [6.05, 5.77],
59 | [8.57, 4.16],
60 | [3.22, 4.98],
61 | [6.62, 6.62],
62 | [9.36, 7.44],
63 | [8.34, 3.43],
64 | [4.47, 7.61],
65 | [4.34, 5.05],
66 | [5.0, 5.05],
67 | [5.0, 5.0],
68 | [2.07, 7.8],
69 | [9.45, 9.6],
70 | [3.1, 3.26],
71 | [4.37, 5.72],
72 | ],
73 | dtype=pl.Array(pl.Float64, 2),
74 | )
75 | df = pl.DataFrame(points)
76 | result = df.with_columns(midpoints=mp.midpoint_2d("points", ref_point=(5.0, 5.0)))
77 | print(result)
78 |
--------------------------------------------------------------------------------
/rust-toolchain.toml:
--------------------------------------------------------------------------------
1 | [toolchain]
2 | channel = "nightly"
3 |
--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | group_imports = "StdExternalCrate"
2 | imports_granularity = "Module"
3 | match_block_trailing_comma = true
4 |
--------------------------------------------------------------------------------
/src/arrays.rs:
--------------------------------------------------------------------------------
1 | #![allow(clippy::unused_unit)]
2 | use polars::prelude::*;
3 | use polars_core::utils::CustomIterTools;
4 | use pyo3_polars::derive::polars_expr;
5 | use serde::Deserialize;
6 |
7 | pub fn point_2d_output(_: &[Field]) -> PolarsResult<Field> {
8 | Ok(Field::new(
9 | PlSmallStr::from_static("point_2d"),
10 | DataType::Array(Box::new(DataType::Float64), 2),
11 | ))
12 | }
13 |
14 | #[derive(Deserialize)]
15 | struct MidPoint2DKwargs {
16 | ref_point: [f64; 2],
17 | }
18 |
19 | #[polars_expr(output_type_func=point_2d_output)]
20 | fn midpoint_2d(inputs: &[Series], kwargs: MidPoint2DKwargs) -> PolarsResult<Series> {
21 | let ca: &ArrayChunked = inputs[0].array()?;
22 | let ref_point = kwargs.ref_point;
23 |
24 | let out: ArrayChunked = unsafe {
25 | ca.try_apply_amortized_same_type(|row| {
26 | let s = row.as_ref();
27 | let ca = s.f64()?;
28 | let out_inner: Float64Chunked = ca
29 | .iter()
30 | .enumerate()
31 | .map(|(idx, opt_val)| opt_val.map(|val| (val + ref_point[idx]) / 2.0f64))
32 | .collect_trusted();
33 | Ok(out_inner.into_series())
34 | })
35 | }?;
36 |
37 | Ok(out.into_series())
38 | }
39 |
--------------------------------------------------------------------------------
/src/expressions.rs:
--------------------------------------------------------------------------------
1 | #![allow(clippy::unused_unit)]
2 | use std::ops::{Add, Div, Mul, Sub};
3 |
4 | use num_traits::{NumCast, Zero, Signed};
5 | use polars::prelude::arity::{
6 | binary_elementwise_into_string_amortized, broadcast_binary_elementwise,
7 | };
8 | use polars::prelude::*;
9 | use polars_arrow::bitmap::MutableBitmap;
10 | use polars_core::series::amortized_iter::AmortSeries;
11 | use polars_core::utils::align_chunks_binary;
12 | use pyo3_polars::derive::polars_expr;
13 | use pyo3_polars::export::polars_core::utils::arrow::array::PrimitiveArray;
14 | use pyo3_polars::export::polars_core::utils::CustomIterTools;
15 | use serde::Deserialize;
16 |
17 | fn same_output_type(input_fields: &[Field]) -> PolarsResult<Field> {
18 | let field = &input_fields[0];
19 | Ok(field.clone())
20 | }
21 |
22 | #[polars_expr(output_type_func=same_output_type)]
23 | fn noop(inputs: &[Series]) -> PolarsResult<Series> {
24 | let s = &inputs[0];
25 | Ok(s.clone())
26 | }
27 |
28 | #[polars_expr(output_type=Int64)]
29 | fn abs_i64(inputs: &[Series]) -> PolarsResult<Series> {
30 | let s = &inputs[0];
31 | let ca: &Int64Chunked = s.i64()?;
32 | // NOTE: there's a faster way of implementing `abs_i64`, which we'll
33 | // cover in section 7.
34 |     let out: Int64Chunked = ca.apply(|opt_v: Option<i64>| opt_v.map(|v: i64| v.abs()));
35 | Ok(out.into_series())
36 | }
37 |
38 | fn impl_abs_numeric<T>(ca: &ChunkedArray<T>) -> ChunkedArray<T>
39 | where
40 | T: PolarsNumericType,
41 | T::Native: Signed,
42 | {
43 |     ca.apply(|opt_v: Option<T::Native>| opt_v.map(|v: T::Native| v.abs()))
44 | }
45 |
46 | #[polars_expr(output_type_func=same_output_type)]
47 | fn abs_numeric(inputs: &[Series]) -> PolarsResult<Series> {
48 | let s = &inputs[0];
49 | match s.dtype() {
50 | DataType::Int32 => Ok(impl_abs_numeric(s.i32().unwrap()).into_series()),
51 | DataType::Int64 => Ok(impl_abs_numeric(s.i64().unwrap()).into_series()),
52 | DataType::Float32 => Ok(impl_abs_numeric(s.f32().unwrap()).into_series()),
53 | DataType::Float64 => Ok(impl_abs_numeric(s.f64().unwrap()).into_series()),
54 | dtype => {
55 | polars_bail!(InvalidOperation:format!("dtype {dtype} not \
56 | supported for abs_numeric, expected Int32, Int64, Float32, Float64."))
57 | },
58 | }
59 | }
60 |
61 | #[polars_expr(output_type=Int64)]
62 | fn sum_i64(inputs: &[Series]) -> PolarsResult<Series> {
63 | let left: &Int64Chunked = inputs[0].i64()?;
64 | let right: &Int64Chunked = inputs[1].i64()?;
65 | // Note: there's a faster way of summing two columns, see
66 | // section 7.
67 | let out: Int64Chunked =
68 |         broadcast_binary_elementwise(left, right, |left: Option<i64>, right: Option<i64>| match (
69 | left, right,
70 | ) {
71 | (Some(left), Some(right)) => Some(left + right),
72 | _ => None,
73 | });
74 | Ok(out.into_series())
75 | }
76 |
77 | #[polars_expr(output_type_func=same_output_type)]
78 | fn cum_sum(inputs: &[Series]) -> PolarsResult<Series> {
79 | let s = &inputs[0];
80 | let ca: &Int64Chunked = s.i64()?;
81 | let out: Int64Chunked = ca
82 | .iter()
83 |         .scan(0_i64, |state: &mut i64, x: Option<i64>| match x {
84 | Some(x) => {
85 | *state += x;
86 | Some(Some(*state))
87 | },
88 | None => Some(None),
89 | })
90 | .collect_trusted();
91 | Ok(out.into_series())
92 | }
93 |
94 | use std::borrow::Cow;
95 | use std::fmt::Write;
96 |
97 | #[polars_expr(output_type=String)]
98 | fn pig_latinnify(inputs: &[Series]) -> PolarsResult<Series> {
99 | let s = &inputs[0];
100 | let ca = s.str()?;
101 | let out: StringChunked = ca.apply(|opt_v: Option<&str>| {
102 | opt_v.map(|value: &str| {
103 | // Not the recommended way to do it,
104 | // see below for a better way!
105 | if let Some(first_char) = value.chars().next() {
106 | Cow::Owned(format!("{}{}ay", &value[1..], first_char))
107 | } else {
108 | Cow::Borrowed(value)
109 | }
110 | })
111 | });
112 | Ok(out.into_series())
113 | }
114 |
115 | fn remove_last_extension(s: &str) -> &str {
116 | match s.rfind('.') {
117 | Some(pos) => &s[..pos],
118 | None => s,
119 | }
120 | }
121 |
122 | #[polars_expr(output_type=String)]
123 | fn remove_extension(inputs: &[Series]) -> PolarsResult<Series> {
124 | let s = &inputs[0];
125 | let ca = s.str()?;
126 | let out: StringChunked = ca.apply_values(|val| {
127 | let res = Cow::Borrowed(remove_last_extension(val));
128 | res
129 | });
130 | Ok(out.into_series())
131 | }
132 |
133 | #[polars_expr(output_type=Int64)]
134 | fn abs_i64_fast(inputs: &[Series]) -> PolarsResult<Series> {
135 | let s = &inputs[0];
136 | let ca = s.i64()?;
137 | let chunks = ca
138 | .downcast_iter()
139 | .map(|arr| arr.values().as_slice())
140 | .zip(ca.iter_validities())
141 | .map(|(slice, validity)| {
142 |             let arr: PrimitiveArray<i64> = slice.iter().copied().map(|x| x.abs()).collect_arr();
143 | arr.with_validity(validity.cloned())
144 | });
145 | let out = Int64Chunked::from_chunk_iter(PlSmallStr::EMPTY, chunks);
146 | Ok(out.into_series())
147 | }
148 |
149 | #[derive(Deserialize)]
150 | struct AddSuffixKwargs {
151 | suffix: String,
152 | }
153 |
154 | #[polars_expr(output_type=String)]
155 | fn add_suffix(inputs: &[Series], kwargs: AddSuffixKwargs) -> PolarsResult<Series> {
156 | let s = &inputs[0];
157 | let ca = s.str()?;
158 | let out = ca.apply_into_string_amortized(|value, output| {
159 | write!(output, "{}{}", value, kwargs.suffix).unwrap();
160 | });
161 | Ok(out.into_series())
162 | }
163 |
164 | // use rust_stemmers::{Algorithm, Stemmer};
165 |
166 | // #[polars_expr(output_type=String)]
167 | // fn snowball_stem(inputs: &[Series]) -> PolarsResult<Series> {
168 | // let ca: &StringChunked = inputs[0].str()?;
169 | // let en_stemmer = Stemmer::create(Algorithm::English);
170 | // let out: StringChunked = ca.apply_into_string_amortized(|value: &str, output: &mut String| {
171 | // write!(output, "{}", en_stemmer.stem(value)).unwrap()
172 | // });
173 | // Ok(out.into_series())
174 | // }
175 |
176 | fn binary_amortized_elementwise<'a, T, K, F>(
177 | lhs: &'a ListChunked,
178 | rhs: &'a ListChunked,
179 | mut f: F,
180 | ) -> ChunkedArray<T>
181 | where
182 |     T: PolarsDataType,
183 |     T::Array: ArrayFromIter<Option<K>>,
184 |     F: FnMut(&AmortSeries, &AmortSeries) -> Option<K> + Copy,
185 | {
186 | {
187 | let (lhs, rhs) = align_chunks_binary(lhs, rhs);
188 | lhs.amortized_iter()
189 | .zip(rhs.amortized_iter())
190 | .map(|(lhs, rhs)| match (lhs, rhs) {
191 | (Some(lhs), Some(rhs)) => f(&lhs, &rhs),
192 | _ => None,
193 | })
194 | .collect_ca(PlSmallStr::EMPTY)
195 | }
196 | }
197 |
198 | #[polars_expr(output_type=Float64)]
199 | fn weighted_mean(inputs: &[Series]) -> PolarsResult<Series> {
200 | let values = inputs[0].list()?;
201 | let weights = &inputs[1].list()?;
202 | polars_ensure!(
203 | values.dtype() == &DataType::List(Box::new(DataType::Int64)),
204 | ComputeError: "Expected `values` to be of type `List(Int64)`, got: {}", values.dtype()
205 | );
206 | polars_ensure!(
207 | weights.dtype() == &DataType::List(Box::new(DataType::Float64)),
208 | ComputeError: "Expected `weights` to be of type `List(Float64)`, got: {}", weights.dtype()
209 | );
210 |
211 | let out: Float64Chunked = binary_amortized_elementwise(
212 | values,
213 | weights,
214 |         |values_inner: &AmortSeries, weights_inner: &AmortSeries| -> Option<f64> {
215 | let values_inner = values_inner.as_ref().i64().unwrap();
216 | let weights_inner = weights_inner.as_ref().f64().unwrap();
217 | if values_inner.is_empty() {
218 | // Mirror Polars, and return None for empty mean.
219 | return None;
220 | }
221 | let mut numerator: f64 = 0.;
222 | let mut denominator: f64 = 0.;
223 | values_inner
224 | .iter()
225 | .zip(weights_inner.iter())
226 | .for_each(|(v, w)| {
227 | if let (Some(v), Some(w)) = (v, w) {
228 | numerator += v as f64 * w;
229 | denominator += w;
230 | }
231 | });
232 | Some(numerator / denominator)
233 | },
234 | );
235 | Ok(out.into_series())
236 | }
237 |
238 | fn struct_point_2d_output(input_fields: &[Field]) -> PolarsResult<Field> {
239 | let field = &input_fields[0];
240 | match field.dtype() {
241 | DataType::Struct(fields) => Ok(Field::new(
242 | "struct_point_2d".into(),
243 | DataType::Struct(fields.clone()),
244 | )),
245 | dtype => polars_bail!(InvalidOperation: "expected Struct dtype, got {}", dtype),
246 | }
247 | }
248 |
249 | #[polars_expr(output_type_func=struct_point_2d_output)]
250 | fn print_struct_fields(inputs: &[Series]) -> PolarsResult<Series> {
251 | let struct_ = inputs[0].struct_()?;
252 | let fields = struct_.fields_as_series();
253 |
254 | if fields.is_empty() {
255 | return Ok(inputs[0].clone());
256 | }
257 |
258 | let fields = fields
259 | .iter()
260 | .map(|s| {
261 | let s = s.clone();
262 | println!("{:?}", s);
263 | s
264 | })
265 |         .collect::<Vec<_>>();
266 |
267 | StructChunked::from_series(struct_.name().clone(), struct_.len(), fields.iter())
268 | .map(|ca| ca.into_series())
269 | }
270 |
271 | fn shifted_struct(input_fields: &[Field]) -> PolarsResult<Field> {
272 | let field = &input_fields[0];
273 | match field.dtype() {
274 | DataType::Struct(fields) => {
275 | let mut field_0 = fields[0].clone();
276 | let name = field_0.name.clone();
277 | field_0.set_name(fields[fields.len() - 1].name().clone());
278 | let mut fields = fields[1..]
279 | .iter()
280 | .zip(fields[0..fields.len() - 1].iter())
281 | .map(|(fld, name)| Field::new(name.name().clone(), fld.dtype().clone()))
282 |                 .collect::<Vec<_>>();
283 | fields.push(field_0);
284 | Ok(Field::new(name, DataType::Struct(fields)))
285 | },
286 | _ => unreachable!(),
287 | }
288 | }
289 |
290 | #[polars_expr(output_type_func=shifted_struct)]
291 | fn shift_struct(inputs: &[Series]) -> PolarsResult<Series> {
292 | let struct_ = inputs[0].struct_()?;
293 | let fields = struct_.fields_as_series();
294 | if fields.is_empty() {
295 | return Ok(inputs[0].clone());
296 | }
297 | let mut field_0 = fields[0].clone();
298 | let name = field_0.name().clone();
299 | field_0.rename(fields[fields.len() - 1].name().clone());
300 | let mut fields = fields[1..]
301 | .iter()
302 | .zip(fields[..fields.len() - 1].iter())
303 | .map(|(s, name)| {
304 | let mut s = s.clone();
305 | s.rename(name.name().clone());
306 | s
307 | })
308 |         .collect::<Vec<_>>();
309 | fields.push(field_0);
310 | StructChunked::from_series(name, struct_.len(), fields.iter()).map(|ca| ca.into_series())
311 | }
312 |
313 | use reverse_geocoder::ReverseGeocoder;
314 |
315 | #[polars_expr(output_type=String)]
316 | fn reverse_geocode(inputs: &[Series]) -> PolarsResult<Series> {
317 | let latitude = inputs[0].f64()?;
318 | let longitude = inputs[1].f64()?;
319 | let geocoder = ReverseGeocoder::new();
320 | let out = binary_elementwise_into_string_amortized(latitude, longitude, |lhs, rhs, out| {
321 | let search_result = geocoder.search((lhs, rhs));
322 | write!(out, "{}", search_result.record.name).unwrap();
323 | });
324 | Ok(out.into_series())
325 | }
326 |
327 | fn list_idx_dtype(input_fields: &[Field]) -> PolarsResult<Field> {
328 | let field = Field::new(
329 | input_fields[0].name.clone(),
330 | DataType::List(Box::new(IDX_DTYPE)),
331 | );
332 | Ok(field.clone())
333 | }
334 |
335 | #[polars_expr(output_type_func=list_idx_dtype)]
336 | fn non_zero_indices(inputs: &[Series]) -> PolarsResult<Series> {
337 | let ca = inputs[0].list()?;
338 | polars_ensure!(
339 | ca.dtype() == &DataType::List(Box::new(DataType::Int64)),
340 | ComputeError: "Expected `List(Int64)`, got: {}", ca.dtype()
341 | );
342 |
343 | let out: ListChunked = ca.apply_amortized(|s| {
344 | let s: &Series = s.as_ref();
345 | let ca: &Int64Chunked = s.i64().unwrap();
346 | let out: IdxCa = ca
347 | .iter()
348 | .enumerate()
349 | .filter(|(_idx, opt_val)| opt_val != &Some(0))
350 | .map(|(idx, _opt_val)| Some(idx as IdxSize))
351 | .collect_ca(PlSmallStr::EMPTY);
352 | out.into_series()
353 | });
354 | Ok(out.into_series())
355 | }
356 |
357 | #[polars_expr(output_type=Float64)]
358 | fn vertical_weighted_mean(inputs: &[Series]) -> PolarsResult<Series> {
359 | let values = &inputs[0].f64()?;
360 | let weights = &inputs[1].f64()?;
361 | let mut numerator = 0.;
362 | let mut denominator = 0.;
363 | values.iter().zip(weights.iter()).for_each(|(v, w)| {
364 | if let (Some(v), Some(w)) = (v, w) {
365 | numerator += v * w;
366 | denominator += w;
367 | }
368 | });
369 | let result = numerator / denominator;
370 | Ok(Series::new(PlSmallStr::EMPTY, vec![result]))
371 | }
372 |
373 | fn linear_itp<T>(low: T, step: T, slope: T) -> T
374 | where
375 |     T: Sub<Output = T> + Mul<Output = T> + Add<Output = T> + Div<Output = T>,
376 | {
377 | low + step * slope
378 | }
379 |
380 | #[inline]
381 | fn signed_interp<T>(low: T, high: T, steps: IdxSize, steps_n: T, out: &mut Vec<T>)
382 | where
383 |     T: Sub<Output = T> + Mul<Output = T> + Add<Output = T> + Div<Output = T> + NumCast + Copy,
384 | {
385 | let slope = (high - low) / steps_n;
386 | for step_i in 1..steps {
387 | let step_i: T = NumCast::from(step_i).unwrap();
388 | let v = linear_itp(low, step_i, slope);
389 | out.push(v)
390 | }
391 | }
392 |
393 | fn interpolate_impl<T, I>(chunked_arr: &ChunkedArray<T>, interpolation_branch: I) -> ChunkedArray<T>
394 | where
395 |     T: PolarsNumericType,
396 |     I: Fn(T::Native, T::Native, IdxSize, T::Native, &mut Vec<T::Native>),
397 | {
398 | // This implementation differs from pandas as that boundary None's are not removed.
399 | // This prevents a lot of errors due to expressions leading to different lengths.
400 | if chunked_arr.null_count() == 0 || chunked_arr.null_count() == chunked_arr.len() {
401 | return chunked_arr.clone();
402 | }
403 |
404 | // We first find the first and last so that we can set the null buffer.
405 | let first = chunked_arr.first_non_null().unwrap();
406 | let last = chunked_arr.last_non_null().unwrap() + 1;
407 |
408 | // Fill out with `first` nulls.
409 | let mut out = Vec::with_capacity(chunked_arr.len());
410 | let mut iter = chunked_arr.iter().skip(first);
411 | for _ in 0..first {
412 | out.push(Zero::zero());
413 | }
414 |
415 | // The next element of `iter` is definitely `Some(Some(v))`, because we skipped the first
416 | // elements `first` and if all values were missing we'd have done an early return.
417 | let mut low = iter.next().unwrap().unwrap();
418 | out.push(low);
419 | while let Some(next) = iter.next() {
420 | if let Some(v) = next {
421 | out.push(v);
422 | low = v;
423 | } else {
424 | let mut steps = 1 as IdxSize;
425 | for next in iter.by_ref() {
426 | steps += 1;
427 | if let Some(high) = next {
428 | let steps_n: T::Native = NumCast::from(steps).unwrap();
429 | interpolation_branch(low, high, steps, steps_n, &mut out);
430 | out.push(high);
431 | low = high;
432 | break;
433 | }
434 | }
435 | }
436 | }
437 | if first != 0 || last != chunked_arr.len() {
438 | let mut validity = MutableBitmap::with_capacity(chunked_arr.len());
439 | validity.extend_constant(chunked_arr.len(), true);
440 |
441 | for i in 0..first {
442 | validity.set(i, false);
443 | }
444 |
445 | for i in last..chunked_arr.len() {
446 | validity.set(i, false);
447 | out.push(Zero::zero())
448 | }
449 |
450 | let array = PrimitiveArray::new(
451 | T::get_dtype().to_arrow(CompatLevel::newest()),
452 | out.into(),
453 | Some(validity.into()),
454 | );
455 | ChunkedArray::with_chunk(PlSmallStr::EMPTY, array)
456 | } else {
457 | ChunkedArray::from_vec(PlSmallStr::EMPTY, out)
458 | }
459 | }
460 |
461 | #[polars_expr(output_type=Int64)]
462 | fn interpolate(inputs: &[Series]) -> PolarsResult<Series> {
463 | let s = &inputs[0];
464 | let ca = s.i64()?;
465 |     let out: Int64Chunked = interpolate_impl(ca, signed_interp::<i64>);
466 | Ok(out.into_series())
467 | }
468 |
469 | #[polars_expr(output_type=Int64)]
470 | fn life_step(inputs: &[Series]) -> PolarsResult<Series> {
471 | let (ca_lf, ca_curr, ca_rt) = (inputs[0].i64()?, inputs[1].i64()?, inputs[2].i64()?);
472 |
473 | let lf = ca_lf
474 | .cont_slice()
475 | .expect("Expected input to be contiguous (in a single chunk)");
476 | let mid = ca_curr
477 | .cont_slice()
478 | .expect("Expected input to be contiguous (in a single chunk)");
479 | let rt = ca_rt
480 | .cont_slice()
481 | .expect("Expected input to be contiguous (in a single chunk)");
482 |
483 | let len = lf.len();
484 |
485 | let out: Int64Chunked = mid
486 | .iter()
487 | .enumerate()
488 | .map(|(idx, val)| {
489 | // Neighbours above
490 | let prev_row = if 0 == idx {
491 | lf[len - 1] + mid[len - 1] + rt[len - 1]
492 | } else {
493 | lf[idx - 1] + mid[idx - 1] + rt[idx - 1]
494 | };
495 |
496 | // Curr row does not include cell in the middle,
497 | // a cell is not a neighbour of itself
498 | let curr_row = lf[idx] + rt[idx];
499 |
500 | // Neighbours below
501 | let next_row = if len - 1 == idx {
502 | lf[0] + mid[0] + rt[0]
503 | } else {
504 | lf[idx + 1] + mid[idx + 1] + rt[idx + 1]
505 | };
506 |
507 | // Life logic
508 | Some(match (val, prev_row + curr_row + next_row) {
509 | (1, 2) | (1, 3) => 1,
510 | (0, 3) => 1,
511 | _ => 0,
512 | })
513 | })
514 | .collect_trusted();
515 | Ok(out.into_series())
516 | }
517 |
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | mod arrays;
2 | mod expressions;
3 |
4 | use pyo3_polars::PolarsAllocator;
5 |
6 | #[global_allocator]
7 | static ALLOC: PolarsAllocator = PolarsAllocator::new();
8 |
--------------------------------------------------------------------------------
/test_plugin.py:
--------------------------------------------------------------------------------
1 | import polars as pl
2 | import minimal_plugin as mp
3 | from polars.testing import assert_frame_equal
4 |
5 |
6 | def test_noop():
7 | df = pl.DataFrame(
8 | {"a": [1, 1, None], "b": [4.1, 5.2, 6.3], "c": ["hello", "everybody!", "!"]}
9 | )
10 | result = df.with_columns(mp.noop(pl.all()).name.suffix("_noop"))
11 | expected = pl.DataFrame(
12 | {
13 | "a": [1, 1, None],
14 | "b": [4.1, 5.2, 6.3],
15 | "c": ["hello", "everybody!", "!"],
16 | "a_noop": [1, 1, None],
17 | "b_noop": [4.1, 5.2, 6.3],
18 | "c_noop": ["hello", "everybody!", "!"],
19 | }
20 | )
21 | assert_frame_equal(result, expected)
22 |
23 |
24 | def test_abs_i64():
25 | df = pl.DataFrame(
26 | {"a": [1, -1, None], "b": [4.1, 5.2, -6.3], "c": ["hello", "everybody!", "!"]}
27 | )
28 | result = df.with_columns(mp.abs_i64("a").name.suffix("_abs"))
29 | expected = pl.DataFrame(
30 | {
31 | "a": [1, -1, None],
32 | "b": [4.1, 5.2, -6.3],
33 | "c": ["hello", "everybody!", "!"],
34 | "a_abs": [1, 1, None],
35 | }
36 | )
37 | assert_frame_equal(result, expected)
38 |
39 |
40 | def test_abs_numeric():
41 | df = pl.DataFrame(
42 | {"a": [1, -1, None], "b": [4.1, 5.2, -6.3], "c": ["hello", "everybody!", "!"]}
43 | )
44 | result = df.with_columns(mp.abs_numeric(pl.col("a", "b")).name.suffix("_abs"))
45 | expected = pl.DataFrame(
46 | {
47 | "a": [1, -1, None],
48 | "b": [4.1, 5.2, -6.3],
49 | "c": ["hello", "everybody!", "!"],
50 | "a_abs": [1, 1, None],
51 | "b_abs": [4.1, 5.2, 6.3],
52 | }
53 | )
54 | assert_frame_equal(result, expected)
55 |
56 |
57 | def test_sum_i64():
58 | df = pl.DataFrame({"a": [1, 5, 2], "b": [3, None, -1]})
59 | result = df.with_columns(a_plus_b=mp.sum_i64("a", "b"))
60 | expected = pl.DataFrame(
61 | {"a": [1, 5, 2], "b": [3, None, -1], "a_plus_b": [4, None, 1]}
62 | )
63 | assert_frame_equal(result, expected)
64 |
65 |
66 | def test_cum_sum():
67 | df = pl.DataFrame(
68 | {
69 | "a": [1, 2, 3, 4, None, 5],
70 | "b": [1, 1, 1, 2, 2, 2],
71 | }
72 | )
73 | result = df.with_columns(a_cum_sum=mp.cum_sum("a"))
74 | expected = pl.DataFrame(
75 | {
76 | "a": [1, 2, 3, 4, None, 5],
77 | "b": [1, 1, 1, 2, 2, 2],
78 | "a_cum_sum": [1, 3, 6, 10, None, 15],
79 | }
80 | )
81 | assert_frame_equal(result, expected)
82 | result = df.with_columns(a_cum_sum=mp.cum_sum("a").over("b"))
83 | expected = pl.DataFrame(
84 | {
85 | "a": [1, 2, 3, 4, None, 5],
86 | "b": [1, 1, 1, 2, 2, 2],
87 | "a_cum_sum": [1, 3, 6, 4, None, 9],
88 | }
89 | )
90 | assert_frame_equal(result, expected)
91 |
92 |
93 | def test_pig_latinnify():
94 | df = pl.DataFrame({"a": ["I", "love", "pig", "latin"]})
95 | result = df.with_columns(a_pig_latin=mp.pig_latinnify("a"))
96 | expected = pl.DataFrame(
97 | {
98 | "a": ["I", "love", "pig", "latin"],
99 | "a_pig_latin": ["Iay", "ovelay", "igpay", "atinlay"],
100 | }
101 | )
102 | assert_frame_equal(result, expected)
103 |
104 |
105 | def test_add_suffix():
106 | df = pl.DataFrame({"a": ["bob", "billy"]})
107 | result = df.with_columns(mp.add_suffix("a", suffix="-billy"))
108 | expected = pl.DataFrame({"a": ["bob-billy", "billy-billy"]})
109 | assert_frame_equal(result, expected)
110 |
111 |
112 | def test_weighted_mean():
113 | df = pl.DataFrame(
114 | {
115 | "values": [[1, 3, 2], [5, 7], None, [5, 7], []],
116 | "weights": [[0.5, 0.3, 0.2], [0.1, 0.9], [0.1, 0.9], None, []],
117 | }
118 | )
119 | result = df.with_columns(weighted_mean=mp.weighted_mean("values", "weights"))
120 | expected = pl.DataFrame(
121 | {
122 | "values": [[1, 3, 2], [5, 7], None, [5, 7], []],
123 | "weights": [[0.5, 0.3, 0.2], [0.1, 0.9], [0.1, 0.9], None, []],
124 | "weighted_mean": [1.7999999999999998, 6.8, None, None, None],
125 | }
126 | )
127 | assert_frame_equal(result, expected)
128 |
129 |
130 | def test_non_zero_indices():
131 | df = pl.DataFrame({"dense": [[0, 9], [8, 6, 0, 9], None, [3, 3]]})
132 | result = df.with_columns(indices=mp.non_zero_indices("dense"))
133 | expected = pl.DataFrame(
134 | {
135 | "dense": [[0, 9], [8, 6, 0, 9], None, [3, 3]],
136 | "indices": [[1], [0, 1, 3], None, [0, 1]],
137 | },
138 | schema_overrides={"indices": pl.List(pl.UInt32)},
139 | )
140 | assert_frame_equal(result, expected)
141 |
142 |
143 | def test_print_struct_fields():
144 | df = pl.DataFrame(
145 | {
146 | "x": [1.0, 1.25, 1.5, 1.75],
147 | "y": [3.0, 2.75, 2.5, 2.25],
148 | "rgba": [0x00FF7FFF, 0xFF7F00FF, 0x7F7F7FFF, 0xD8D8D8FF],
149 | }
150 | ).select(
151 | point_2d_s=pl.struct(
152 | "x",
153 | "y",
154 | "rgba",
155 | schema={
156 | "x": pl.Float64,
157 | "y": pl.Float64,
158 | "rgba": pl.UInt32,
159 | },
160 | )
161 | )
162 | result = df.with_columns(point_2d_s=mp.print_struct_fields("point_2d_s"))
163 | assert_frame_equal(result, df)
164 |
165 |
166 | def test_shift_struct():
167 | df = pl.DataFrame(
168 | {
169 | "a": [1, 3, 8],
170 | "b": [2.0, 3.1, 2.5],
171 | "c": ["3", "7", "3"],
172 | }
173 | ).select(abc=pl.struct("a", "b", "c"))
174 | result = df.with_columns(abc_shifted=mp.shift_struct("abc"))
175 | expected = pl.DataFrame(
176 | {
177 | "abc": [
178 | {"a": 1, "b": 2.0, "c": "3"},
179 | {"a": 3, "b": 3.1, "c": "7"},
180 | {"a": 8, "b": 2.5, "c": "3"},
181 | ],
182 | "abc_shifted": [
183 | {"a": 2.0, "b": "3", "c": 1},
184 | {"a": 3.1, "b": "7", "c": 3},
185 | {"a": 2.5, "b": "3", "c": 8},
186 | ],
187 | }
188 | )
189 | assert_frame_equal(result, expected)
190 |
191 |
192 | def test_reverse_geocode():
193 | df = pl.DataFrame({"lat": [37.7749, 51.01, 52.5], "lon": [-122.4194, -3.9, -0.91]})
194 | result = df.with_columns(city=mp.reverse_geocode("lat", "lon"))
195 | expected = pl.DataFrame(
196 | {
197 | "lat": [37.7749, 51.01, 52.5],
198 | "lon": [-122.4194, -3.9, -0.91],
199 | "city": ["San Francisco", "South Molton", "Market Harborough"],
200 | }
201 | )
202 | assert_frame_equal(result, expected)
203 |
204 |
205 | def test_vertical_weighted_mean():
206 | df = pl.DataFrame(
207 | {
208 | "values": [1.0, 3, 2, 5, 7],
209 | "weights": [0.5, 0.3, 0.2, 0.1, 0.9],
210 | "group": ["a", "a", "a", "b", "b"],
211 | }
212 | )
213 | result = (
214 | df.group_by("group")
215 | .agg(weighted_mean=mp.vertical_weighted_mean("values", "weights"))
216 | .sort("group", descending=True)
217 | )
218 | expected = pl.DataFrame(
219 | {"group": ["b", "a"], "weighted_mean": [6.8, 1.7999999999999998]}
220 | )
221 | assert_frame_equal(result, expected)
222 |
223 |
224 | def test_midpoint_2d():
225 | df = pl.DataFrame(
226 | pl.Series(
227 | "points",
228 | [
229 | [6.63, 8.35],
230 | [7.19, 4.85],
231 | [2.1, 4.21],
232 | [3.4, 6.13],
233 | [2.48, 9.26],
234 | ],
235 | dtype=pl.Array(pl.Float64, 2),
236 | )
237 | )
238 | result = df.select(midpoints=mp.midpoint_2d("points", ref_point=(5.0, 5.0)))
239 | expected = pl.DataFrame(
240 | pl.Series(
241 | "midpoints",
242 | [[5.815, 6.675], [6.095, 4.925], [3.55, 4.605], [4.2, 5.565], [3.74, 7.13]],
243 | dtype=pl.Array(pl.Float64, 2),
244 | )
245 | )
246 | assert_frame_equal(result, expected)
247 |
--------------------------------------------------------------------------------