├── .cargo
│   └── config
├── .dockerignore
├── .github
│   └── workflows
│       ├── build.yml
│       └── test.yaml
├── .gitignore
├── CHANGELOG.md
├── Cargo.lock
├── Cargo.toml
├── LICENSE.txt
├── README.md
├── datafusion
│   ├── __init__.py
│   ├── functions.py
│   └── tests
│       ├── __init__.py
│       ├── conftest.py
│       ├── generic.py
│       ├── test_aggregation.py
│       ├── test_catalog.py
│       ├── test_context.py
│       ├── test_dataframe.py
│       ├── test_functions.py
│       ├── test_imports.py
│       ├── test_indexing.py
│       ├── test_sql.py
│       └── test_udaf.py
├── dev
│   └── create_license.py
├── pyproject.toml
├── requirements-310.txt
├── requirements-37.txt
├── requirements.in
└── src
    ├── catalog.rs
    ├── context.rs
    ├── dataframe.rs
    ├── errors.rs
    ├── expression.rs
    ├── functions.rs
    ├── lib.rs
    ├── udaf.rs
    ├── udf.rs
    └── utils.rs

/.cargo/config: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | [target.x86_64-apple-darwin] 19 | rustflags = [ 20 | "-C", "link-arg=-undefined", 21 | "-C", "link-arg=dynamic_lookup", 22 | ] 23 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | target 19 | venv 20 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License.
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | name: Python Release Build 19 | on: 20 | push: 21 | tags: ["*-rc*"] 22 | branches: ["main"] 23 | 24 | jobs: 25 | generate-license: 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@v2 29 | - uses: actions-rs/toolchain@v1 30 | with: 31 | profile: minimal 32 | toolchain: stable 33 | override: true 34 | - name: Generate license file 35 | run: python ./dev/create_license.py 36 | - uses: actions/upload-artifact@v2 37 | with: 38 | name: python-wheel-license 39 | path: LICENSE.txt 40 | 41 | build-python-mac-win: 42 | needs: [generate-license] 43 | name: Mac/Win 44 | runs-on: ${{ matrix.os }} 45 | strategy: 46 | fail-fast: false 47 | matrix: 48 | python-version: ["3.10"] 49 | os: [macos-latest, windows-latest] 50 | steps: 51 | - uses: actions/checkout@v2 52 | 53 | - uses: actions/setup-python@v2 54 | with: 55 | python-version: ${{ matrix.python-version }} 56 | 57 | - uses: actions-rs/toolchain@v1 58 | with: 59 | toolchain: stable 60 | 61 | - name: Install dependencies 62 | run: | 63 | python -m pip install --upgrade pip 64 | pip install maturin==0.12.10 65 | 66 | - run: rm LICENSE.txt 67 | - name: Download LICENSE.txt 68 | uses: actions/download-artifact@v2 69 | with: 70 | name: python-wheel-license 71 | path: . 72 | 73 | - name: Build Python package 74 | run: maturin build --release --strip --cargo-extra-args="--locked" 75 | 76 | - name: List Windows wheels 77 | if: matrix.os == 'windows-latest' 78 | run: dir target\wheels\ 79 | 80 | - name: List Mac wheels 81 | if: matrix.os != 'windows-latest' 82 | run: find target/wheels/ 83 | 84 | - name: Archive wheels 85 | uses: actions/upload-artifact@v2 86 | with: 87 | name: dist 88 | path: target/wheels/* 89 | 90 | build-manylinux: 91 | needs: [generate-license] 92 | name: Manylinux 93 | runs-on: ubuntu-latest 94 | steps: 95 | - uses: actions/checkout@v2 96 | - run: rm LICENSE.txt 97 | - name: Download LICENSE.txt 98 | uses: actions/download-artifact@v2 99 | with: 100 | name: python-wheel-license 101 | path: . 
102 | - run: cat LICENSE.txt 103 | - name: Build wheels 104 | run: | 105 | export RUSTFLAGS='-C target-cpu=skylake' 106 | docker run --rm -v $(pwd):/io \ 107 | --workdir /io \ 108 | konstin2/maturin:v0.12.10 \ 109 | build --release --manylinux 2010 --cargo-extra-args="--locked" 110 | - name: Archive wheels 111 | uses: actions/upload-artifact@v2 112 | with: 113 | name: dist 114 | path: target/wheels/* 115 | 116 | # NOTE: publishing to PyPI needs to be done manually for now, after the release has passed the vote 117 | # release: 118 | # name: Publish in PyPI 119 | # needs: [build-manylinux, build-python-mac-win] 120 | # runs-on: ubuntu-latest 121 | # steps: 122 | # - uses: actions/download-artifact@v2 123 | # - name: Publish to PyPI 124 | # uses: pypa/gh-action-pypi-publish@master 125 | # with: 126 | # user: __token__ 127 | # password: ${{ secrets.pypi_password }} 128 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License.
17 | 18 | name: Python test 19 | on: 20 | push: 21 | branches: [main] 22 | pull_request: 23 | branches: [main] 24 | 25 | concurrency: 26 | group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} 27 | cancel-in-progress: true 28 | 29 | jobs: 30 | test-matrix: 31 | runs-on: ubuntu-latest 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | python-version: 36 | - "3.10" 37 | toolchain: 38 | - "stable" 39 | - "beta" 40 | # we are not so eager to walk on the edge yet 41 | # - nightly 42 | # additionally, build with the stable toolchain for Python 3.7 only 43 | include: 44 | - python-version: "3.7" 45 | toolchain: "stable" 46 | steps: 47 | - uses: actions/checkout@v2 48 | 49 | - name: Setup Rust Toolchain 50 | uses: actions-rs/toolchain@v1 51 | id: rust-toolchain 52 | with: 53 | toolchain: ${{ matrix.toolchain }} 54 | override: true 55 | 56 | - name: Setup Python 57 | uses: actions/setup-python@v2 58 | with: 59 | python-version: ${{ matrix.python-version }} 60 | 61 | - name: Cache Cargo 62 | uses: actions/cache@v2 63 | with: 64 | path: ~/.cargo 65 | key: cargo-cache-${{ steps.rust-toolchain.outputs.rustc_hash }}-${{ hashFiles('Cargo.lock') }} 66 | 67 | - name: Check Formatting 68 | uses: actions-rs/cargo@v1 69 | if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }} 70 | with: 71 | command: fmt 72 | args: -- --check 73 | 74 | - name: Run Clippy 75 | uses: actions-rs/cargo@v1 76 | if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }} 77 | with: 78 | command: clippy 79 | args: --all-targets --all-features -- -D clippy::all 80 | 81 | - name: Create Virtualenv (3.10) 82 | if: ${{ matrix.python-version == '3.10' }} 83 | run: | 84 | python -m venv venv 85 | source venv/bin/activate 86 | pip install -r requirements-310.txt 87 | 88 | - name: Create Virtualenv (3.7) 89 | if: ${{ matrix.python-version == '3.7' }} 90 | run: | 91 | python -m venv venv 92 | source venv/bin/activate 93 | pip install -r requirements-37.txt 94 | 95 | - name: Run Python Linters 96 | if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }} 97 | run: | 98 | source venv/bin/activate 99 | flake8 --exclude venv --ignore=E501 100 | black --line-length 79 --diff --check . 101 | 102 | - name: Run tests 103 | run: | 104 | source venv/bin/activate 105 | maturin develop --cargo-extra-args='--locked' 106 | RUST_BACKTRACE=1 pytest -v . 107 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License.
17 | 18 | 19 | # Created by https://www.toptal.com/developers/gitignore/api/python,rust 20 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,rust 21 | 22 | ### Python ### 23 | # Byte-compiled / optimized / DLL files 24 | __pycache__/ 25 | *.py[cod] 26 | *$py.class 27 | 28 | # C extensions 29 | *.so 30 | 31 | # Distribution / packaging 32 | .Python 33 | build/ 34 | develop-eggs/ 35 | dist/ 36 | downloads/ 37 | eggs/ 38 | .eggs/ 39 | lib/ 40 | lib64/ 41 | parts/ 42 | sdist/ 43 | var/ 44 | wheels/ 45 | share/python-wheels/ 46 | *.egg-info/ 47 | .installed.cfg 48 | *.egg 49 | MANIFEST 50 | 51 | # PyInstaller 52 | # Usually these files are written by a python script from a template 53 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 54 | *.manifest 55 | *.spec 56 | 57 | # Installer logs 58 | pip-log.txt 59 | pip-delete-this-directory.txt 60 | 61 | # Unit test / coverage reports 62 | htmlcov/ 63 | .tox/ 64 | .nox/ 65 | .coverage 66 | .coverage.* 67 | .cache 68 | nosetests.xml 69 | coverage.xml 70 | *.cover 71 | *.py,cover 72 | .hypothesis/ 73 | .pytest_cache/ 74 | cover/ 75 | 76 | # Translations 77 | *.mo 78 | *.pot 79 | 80 | # Django stuff: 81 | *.log 82 | local_settings.py 83 | db.sqlite3 84 | db.sqlite3-journal 85 | 86 | # Flask stuff: 87 | instance/ 88 | .webassets-cache 89 | 90 | # Scrapy stuff: 91 | .scrapy 92 | 93 | # Sphinx documentation 94 | docs/_build/ 95 | 96 | # PyBuilder 97 | .pybuilder/ 98 | target/ 99 | 100 | # Jupyter Notebook 101 | .ipynb_checkpoints 102 | 103 | # IPython 104 | profile_default/ 105 | ipython_config.py 106 | 107 | # pyenv 108 | # For a library or package, you might want to ignore these files since the code is 109 | # intended to run in multiple environments; otherwise, check them in: 110 | # .python-version 111 | 112 | # pipenv 113 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 114 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 115 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 116 | # install all needed dependencies. 117 | #Pipfile.lock 118 | 119 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 120 | __pypackages__/ 121 | 122 | # Celery stuff 123 | celerybeat-schedule 124 | celerybeat.pid 125 | 126 | # SageMath parsed files 127 | *.sage.py 128 | 129 | # Environments 130 | .env 131 | .venv 132 | env/ 133 | venv/ 134 | ENV/ 135 | env.bak/ 136 | venv.bak/ 137 | 138 | # Spyder project settings 139 | .spyderproject 140 | .spyproject 141 | 142 | # Rope project settings 143 | .ropeproject 144 | 145 | # mkdocs documentation 146 | /site 147 | 148 | # mypy 149 | .mypy_cache/ 150 | .dmypy.json 151 | dmypy.json 152 | 153 | # Pyre type checker 154 | .pyre/ 155 | 156 | # pytype static type analyzer 157 | .pytype/ 158 | 159 | # Cython debug symbols 160 | cython_debug/ 161 | 162 | ### Rust ### 163 | # Generated by Cargo 164 | # will have compiled files and executables 165 | debug/ 166 | 167 | # These are backup files generated by rustfmt 168 | **/*.rs.bk 169 | 170 | # MSVC Windows builds of rustc generate these, which store debugging information 171 | *.pdb 172 | 173 | # End of https://www.toptal.com/developers/gitignore/api/python,rust 174 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | <!-- 2 | Licensed to the Apache Software Foundation (ASF) under one 3 | or more contributor license agreements. See the NOTICE file 4 | distributed with this work for additional information 5 | regarding copyright ownership. The ASF licenses this file 6 | to you under the Apache License, Version 2.0 (the 7 | "License"); you may not use this file except in compliance 8 | with the License. You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, 13 | software distributed under the License is distributed on an 14 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | KIND, either express or implied. See the License for the 16 | specific language governing permissions and limitations 17 | under the License. 18 | --> 19 | 20 | # Changelog 21 | 22 | ## [Unreleased](https://github.com/datafusion-contrib/datafusion-python/tree/HEAD) 23 | 24 | [Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.1...HEAD) 25 | 26 | **Merged pull requests:** 27 | 28 | - use \_\_getitem\_\_ for df column selection [\#41](https://github.com/datafusion-contrib/datafusion-python/pull/41) ([Jimexist](https://github.com/Jimexist)) 29 | - fix demo in readme [\#40](https://github.com/datafusion-contrib/datafusion-python/pull/40) ([Jimexist](https://github.com/Jimexist)) 30 | - Implement select_columns [\#39](https://github.com/datafusion-contrib/datafusion-python/pull/39) ([andygrove](https://github.com/andygrove)) 31 | - update readme and changelog [\#38](https://github.com/datafusion-contrib/datafusion-python/pull/38) ([Jimexist](https://github.com/Jimexist)) 32 | - Add PyDataFrame.explain [\#36](https://github.com/datafusion-contrib/datafusion-python/pull/36) ([andygrove](https://github.com/andygrove)) 33 | - Release 0.5.0 [\#34](https://github.com/datafusion-contrib/datafusion-python/pull/34) ([Jimexist](https://github.com/Jimexist)) 34 | - disable nightly in workflow [\#33](https://github.com/datafusion-contrib/datafusion-python/pull/33) ([Jimexist](https://github.com/Jimexist)) 35 | - update requirements to 37 and 310, update readme [\#32](https://github.com/datafusion-contrib/datafusion-python/pull/32) ([Jimexist](https://github.com/Jimexist)) 36 | - Add custom global allocator [\#30](https://github.com/datafusion-contrib/datafusion-python/pull/30) ([matthewmturner](https://github.com/matthewmturner)) 37 | - Remove pandas dependency [\#25](https://github.com/datafusion-contrib/datafusion-python/pull/25) ([matthewmturner](https://github.com/matthewmturner)) 38 | - upgrade datafusion and pyo3 [\#20](https://github.com/datafusion-contrib/datafusion-python/pull/20) ([Jimexist](https://github.com/Jimexist)) 39 | - update maturin 0.12+ [\#17](https://github.com/datafusion-contrib/datafusion-python/pull/17) ([Jimexist](https://github.com/Jimexist)) 40 | - Update README.md [\#16](https://github.com/datafusion-contrib/datafusion-python/pull/16) ([Jimexist](https://github.com/Jimexist)) 41 | - apply cargo clippy --fix
[\#15](https://github.com/datafusion-contrib/datafusion-python/pull/15) ([Jimexist](https://github.com/Jimexist)) 42 | - update test workflow to include rust clippy and check [\#14](https://github.com/datafusion-contrib/datafusion-python/pull/14) ([Jimexist](https://github.com/Jimexist)) 43 | - use maturin 0.12.6 [\#13](https://github.com/datafusion-contrib/datafusion-python/pull/13) ([Jimexist](https://github.com/Jimexist)) 44 | - apply cargo fmt [\#12](https://github.com/datafusion-contrib/datafusion-python/pull/12) ([Jimexist](https://github.com/Jimexist)) 45 | - use stable not nightly [\#11](https://github.com/datafusion-contrib/datafusion-python/pull/11) ([Jimexist](https://github.com/Jimexist)) 46 | - ci: test against more compilers, setup clippy and fix clippy lints [\#9](https://github.com/datafusion-contrib/datafusion-python/pull/9) ([cpcloud](https://github.com/cpcloud)) 47 | - Fix use of importlib.metadata and unify requirements.txt [\#8](https://github.com/datafusion-contrib/datafusion-python/pull/8) ([cpcloud](https://github.com/cpcloud)) 48 | - Ship the Cargo.lock file in the source distribution [\#7](https://github.com/datafusion-contrib/datafusion-python/pull/7) ([cpcloud](https://github.com/cpcloud)) 49 | - add \_\_version\_\_ attribute to datafusion object [\#3](https://github.com/datafusion-contrib/datafusion-python/pull/3) ([tfeda](https://github.com/tfeda)) 50 | - fix ci by fixing directories [\#2](https://github.com/datafusion-contrib/datafusion-python/pull/2) ([Jimexist](https://github.com/Jimexist)) 51 | - setup workflow [\#1](https://github.com/datafusion-contrib/datafusion-python/pull/1) ([Jimexist](https://github.com/Jimexist)) 52 | 53 | ## [0.5.1](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.1) (2022-03-15) 54 | 55 | [Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.1-rc1...0.5.1) 56 | 57 | ## [0.5.1-rc1](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.1-rc1) (2022-03-15) 58 | 59 | [Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0...0.5.1-rc1) 60 | 61 | ## [0.5.0](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0) (2022-03-10) 62 | 63 | [Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0-rc2...0.5.0) 64 | 65 | ## [0.5.0-rc2](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0-rc2) (2022-03-10) 66 | 67 | [Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0-rc1...0.5.0-rc2) 68 | 69 | **Closed issues:** 70 | 71 | - Add support for Ballista [\#37](https://github.com/datafusion-contrib/datafusion-python/issues/37) 72 | - Implement DataFrame.explain [\#35](https://github.com/datafusion-contrib/datafusion-python/issues/35) 73 | 74 | ## [0.5.0-rc1](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0-rc1) (2022-03-09) 75 | 76 | [Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/4c98b8e9c3c3f8e2e6a8f2d1ffcfefda344c4680...0.5.0-rc1) 77 | 78 | **Closed issues:** 79 | 80 | - Investigate exposing additional optimizations [\#28](https://github.com/datafusion-contrib/datafusion-python/issues/28) 81 | - Use custom allocator in Python build [\#27](https://github.com/datafusion-contrib/datafusion-python/issues/27) 82 | - Why is pandas a requirement? 
[\#24](https://github.com/datafusion-contrib/datafusion-python/issues/24) 83 | - Unable to build [\#18](https://github.com/datafusion-contrib/datafusion-python/issues/18) 84 | - Setup CI against multiple Python version [\#6](https://github.com/datafusion-contrib/datafusion-python/issues/6) 85 | 86 | \* _This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)_ 87 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | [package] 19 | name = "datafusion-python" 20 | version = "0.6.0" 21 | homepage = "https://github.com/apache/arrow" 22 | repository = "https://github.com/apache/arrow" 23 | authors = ["Apache Arrow <dev@arrow.apache.org>"] 24 | description = "Build and run queries against data" 25 | readme = "README.md" 26 | license = "Apache-2.0" 27 | edition = "2021" 28 | rust-version = "1.57" 29 | 30 | [features] 31 | default = ["mimalloc"] 32 | 33 | [dependencies] 34 | tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } 35 | rand = "0.7" 36 | pyo3 = { version = "~0.16.5", features = ["extension-module", "abi3", "abi3-py37"] } 37 | datafusion = { version = "^10.0.0", features = ["pyarrow"] } 38 | datafusion-expr = { version = "^10.0.0" } 39 | datafusion-common = { version = "^10.0.0", features = ["pyarrow"] } 40 | uuid = { version = "0.8", features = ["v4"] } 41 | mimalloc = { version = "*", optional = true, default-features = false } 42 | 43 | [lib] 44 | name = "datafusion_python" 45 | crate-type = ["cdylib", "rlib"] 46 | 47 | [package.metadata.maturin] 48 | name = "datafusion._internal" 49 | 50 | [profile.release] 51 | lto = true 52 | codegen-units = 1 53 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity.
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 
194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | <!-- 2 | Licensed to the Apache Software Foundation (ASF) under one 3 | or more contributor license agreements. See the NOTICE file 4 | distributed with this work for additional information 5 | regarding copyright ownership. The ASF licenses this file 6 | to you under the Apache License, Version 2.0 (the 7 | "License"); you may not use this file except in compliance 8 | with the License. You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, 13 | software distributed under the License is distributed on an 14 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | KIND, either express or implied. See the License for the 16 | specific language governing permissions and limitations 17 | under the License. 18 | --> 19 | 20 | # DataFusion Python Bindings Have Now Moved 21 | 22 | Development of the DataFusion Python bindings has now moved to https://github.com/apache/arrow-datafusion-python 23 | 24 | # DataFusion in Python 25 | 26 | [![Python test](https://github.com/datafusion-contrib/datafusion-python/actions/workflows/test.yaml/badge.svg)](https://github.com/datafusion-contrib/datafusion-python/actions/workflows/test.yaml) 27 | [![Python Release Build](https://github.com/datafusion-contrib/datafusion-python/actions/workflows/build.yml/badge.svg)](https://github.com/datafusion-contrib/datafusion-python/actions/workflows/build.yml) 28 | 29 | This is a Python library that binds to the [Apache Arrow](https://arrow.apache.org/)-native in-memory query engine [DataFusion](https://github.com/apache/arrow-datafusion). 30 | 31 | Like PySpark, it allows you to build a plan through SQL or a DataFrame API against in-memory data, Parquet, or CSV files, run it in a multi-threaded environment, and obtain the result back in Python. 32 | 33 | It also allows you to use UDFs and UDAFs for complex operations. 34 | 35 | The major advantage of this library over other execution engines is that it achieves zero-copy data transfer between Python and its execution engine: using UDFs and UDAFs and collecting results into Python costs nothing beyond locking the GIL while those operations run. 36 | 37 | Its query engine, DataFusion, is written in [Rust](https://www.rust-lang.org/), which enforces strong guarantees around thread safety and the absence of memory leaks. 38 | 39 | Technically, zero-copy is achieved via the [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html).
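To make the zero-copy claim concrete, here is a minimal sketch of the same handoff done in pure pyarrow, using its private but long-standing `_export_to_c`/`_import_from_c` helpers; datafusion performs an equivalent exchange internally, so treat this as an illustration of the mechanism rather than as part of this library's API:

```python
import pyarrow as pa
from pyarrow.cffi import ffi

# Allocate the two C ABI structs defined by the Arrow C Data Interface.
c_schema = ffi.new("struct ArrowSchema*")
c_array = ffi.new("struct ArrowArray*")
schema_ptr = int(ffi.cast("uintptr_t", c_schema))
array_ptr = int(ffi.cast("uintptr_t", c_array))

arr = pa.array([1, 2, 3])
arr._export_to_c(array_ptr, schema_ptr)  # hands over buffer pointers; no data is copied

roundtripped = pa.Array._import_from_c(array_ptr, schema_ptr)
assert roundtripped.equals(arr)  # rebuilt around the same underlying buffers
```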
40 | 41 | ## How to use it 42 | 43 | Simple usage: 44 | 45 | ```python 46 | import datafusion 47 | from datafusion import functions as f 48 | from datafusion import col 49 | import pyarrow 50 | 51 | # create a context 52 | ctx = datafusion.SessionContext() 53 | 54 | # create a RecordBatch and a new DataFrame from it 55 | batch = pyarrow.RecordBatch.from_arrays( 56 | [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], 57 | names=["a", "b"], 58 | ) 59 | df = ctx.create_dataframe([[batch]]) 60 | 61 | # create a new statement 62 | df = df.select( 63 | col("a") + col("b"), 64 | col("a") - col("b"), 65 | ) 66 | 67 | # execute and collect the first (and only) batch 68 | result = df.collect()[0] 69 | 70 | assert result.column(0) == pyarrow.array([5, 7, 9]) 71 | assert result.column(1) == pyarrow.array([-3, -3, -3]) 72 | ``` 73 | 74 | ### UDFs 75 | 76 | ```python 77 | from datafusion import udf 78 | 79 | def is_null(array: pyarrow.Array) -> pyarrow.Array: 80 | return array.is_null() 81 | 82 | is_null_arr = udf(is_null, [pyarrow.int64()], pyarrow.bool_(), 'stable') 83 | 84 | df = df.select(is_null_arr(col("a"))) 85 | 86 | result = df.collect()[0]  # collect returns a list of batches; take the first 87 | 88 | assert result.column(0) == pyarrow.array([False] * 3) 89 | ``` 90 | 91 | ### UDAF 92 | 93 | ```python 94 | import pyarrow 95 | import pyarrow.compute 96 | from datafusion import udaf, Accumulator 97 | 98 | 99 | class MyAccumulator(Accumulator): 100 | """ 101 | Interface of a user-defined accumulation. 102 | """ 103 | def __init__(self): 104 | self._sum = pyarrow.scalar(0.0) 105 | 106 | def update(self, values: pyarrow.Array) -> None: 107 | # not nice since pyarrow scalars can't be summed yet. This breaks on `None` 108 | self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(values).as_py()) 109 | 110 | def merge(self, states: pyarrow.Array) -> None: 111 | # not nice since pyarrow scalars can't be summed yet. This breaks on `None` 112 | self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(states).as_py()) 113 | 114 | def state(self) -> pyarrow.Array: 115 | return pyarrow.array([self._sum.as_py()]) 116 | 117 | def evaluate(self) -> pyarrow.Scalar: 118 | return self._sum 119 | 120 | 121 | df = ctx.create_dataframe([[batch]]) 122 | 123 | my_udaf = udaf(MyAccumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()], 'stable') 124 | 125 | df = df.aggregate( 126 | [], 127 | [my_udaf(col("a"))] 128 | ) 129 | 130 | result = df.collect()[0] 131 | 132 | assert result.column(0) == pyarrow.array([6.0]) 133 | ``` 134 | 135 | ## How to install (from pip) 136 | 137 | ```bash 138 | pip install datafusion 139 | # or 140 | python -m pip install datafusion 141 | ``` 142 | 143 | You can verify the installation by running: 144 | 145 | ```python 146 | >>> import datafusion 147 | >>> datafusion.__version__ 148 | '0.6.0' 149 | ``` 150 | 151 | ## How to develop 152 | 153 | This assumes that you have Rust and Cargo installed. We use the workflow recommended by [pyo3](https://github.com/PyO3/pyo3) and [maturin](https://github.com/PyO3/maturin).
154 | 155 | Bootstrap: 156 | 157 | ```bash 158 | # fetch this repo 159 | git clone git@github.com:datafusion-contrib/datafusion-python.git 160 | # prepare development environment (used to build wheel / install in development) 161 | python3 -m venv venv 162 | # activate the venv 163 | source venv/bin/activate 164 | # update pip itself if necessary 165 | python -m pip install -U pip 166 | # install dependencies (for Python 3.10; use requirements-37.txt on Python 3.7) 167 | python -m pip install -r requirements-310.txt 168 | ``` 169 | 170 | Whenever the Rust code changes (your changes or via `git pull`): 171 | 172 | ```bash 173 | # make sure you activate the venv using "source venv/bin/activate" first 174 | maturin develop 175 | python -m pytest 176 | ``` 177 | 178 | ## How to update dependencies 179 | 180 | To change test dependencies, edit `requirements.in` and run 181 | 182 | ```bash 183 | # install pip-tools (needed only once); consider running this inside the venv 184 | python -m pip install pip-tools 185 | python -m piptools compile --generate-hashes -o requirements-310.txt 186 | ``` 187 | 188 | To update dependencies, run with `-U` 189 | 190 | ```bash 191 | python -m piptools compile -U --generate-hashes -o requirements-310.txt 192 | ``` 193 | 194 | More details [here](https://github.com/jazzband/pip-tools) 195 | -------------------------------------------------------------------------------- /datafusion/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License.
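# Overview: the module below re-exports the pyo3-compiled classes from
# `._internal` and layers pure-Python conveniences on top: the `Accumulator`
# ABC that user-defined aggregates subclass, the `column`/`literal` (aliased
# `col`/`lit`) expression shorthands, and the `udf`/`udaf` factories that
# validate their arguments before handing them to the Rust side.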
17 | 18 | from abc import ABCMeta, abstractmethod 19 | from typing import List 20 | 21 | try: 22 | import importlib.metadata as importlib_metadata 23 | except ImportError: 24 | import importlib_metadata 25 | 26 | 27 | import pyarrow as pa 28 | 29 | from ._internal import ( 30 | AggregateUDF, 31 | DataFrame, 32 | SessionContext, 33 | Expression, 34 | ScalarUDF, 35 | ) 36 | 37 | 38 | __version__ = importlib_metadata.version(__name__) 39 | 40 | 41 | __all__ = [ 42 | "DataFrame", 43 | "SessionContext", 44 | "Expression", 45 | "AggregateUDF", 46 | "ScalarUDF", 47 | "column", 48 | "literal", 49 | ] 50 | 51 | 52 | class Accumulator(metaclass=ABCMeta): 53 | @abstractmethod 54 | def state(self) -> List[pa.Scalar]: 55 | pass 56 | 57 | @abstractmethod 58 | def update(self, values: pa.Array) -> None: 59 | pass 60 | 61 | @abstractmethod 62 | def merge(self, states: pa.Array) -> None: 63 | pass 64 | 65 | @abstractmethod 66 | def evaluate(self) -> pa.Scalar: 67 | pass 68 | 69 | 70 | def column(value): 71 | return Expression.column(value) 72 | 73 | 74 | col = column 75 | 76 | 77 | def literal(value): 78 | if not isinstance(value, pa.Scalar): 79 | value = pa.scalar(value) 80 | return Expression.literal(value) 81 | 82 | 83 | lit = literal 84 | 85 | 86 | def udf(func, input_types, return_type, volatility, name=None): 87 | """ 88 | Create a new User Defined Function 89 | """ 90 | if not callable(func): 91 | raise TypeError("`func` argument must be callable") 92 | if name is None: 93 | name = func.__qualname__ 94 | return ScalarUDF( 95 | name=name, 96 | func=func, 97 | input_types=input_types, 98 | return_type=return_type, 99 | volatility=volatility, 100 | ) 101 | 102 | 103 | def udaf(accum, input_type, return_type, state_type, volatility, name=None): 104 | """ 105 | Create a new User Defined Aggregate Function 106 | """ 107 | if not issubclass(accum, Accumulator): 108 | raise TypeError( 109 | "`accum` must implement the abstract base class Accumulator" 110 | ) 111 | if name is None: 112 | name = accum.__qualname__ 113 | return AggregateUDF( 114 | name=name, 115 | accumulator=accum, 116 | input_type=input_type, 117 | return_type=return_type, 118 | state_type=state_type, 119 | volatility=volatility, 120 | ) 121 | -------------------------------------------------------------------------------- /datafusion/functions.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
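# This module is intentionally tiny: the PEP 562 module-level `__getattr__`
# defined below forwards every attribute lookup to the compiled `functions`
# module, so calls such as `f.max(...)` or `f.window(...)` resolve against
# the Rust-backed implementations without maintaining a hand-written export
# list here.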
17 | 18 | 19 | from ._internal import functions 20 | 21 | 22 | def __getattr__(name): 23 | return getattr(functions, name) 24 | -------------------------------------------------------------------------------- /datafusion/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | -------------------------------------------------------------------------------- /datafusion/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from datafusion import SessionContext 3 | import pyarrow as pa 4 | import pyarrow.csv  # pa.csv is a submodule; import it explicitly before pa.csv.write_csv below 5 | 6 | 7 | @pytest.fixture 8 | def ctx(): 9 | return SessionContext() 10 | 11 | 12 | @pytest.fixture 13 | def database(ctx, tmp_path): 14 | path = tmp_path / "test.csv" 15 | 16 | table = pa.Table.from_arrays( 17 | [ 18 | [1, 2, 3, 4], 19 | ["a", "b", "c", "d"], 20 | [1.1, 2.2, 3.3, 4.4], 21 | ], 22 | names=["int", "str", "float"], 23 | ) 24 | pa.csv.write_csv(table, path) 25 | 26 | ctx.register_csv("csv", path) 27 | ctx.register_csv("csv1", str(path)) 28 | ctx.register_csv( 29 | "csv2", 30 | path, 31 | has_header=True, 32 | delimiter=",", 33 | schema_infer_max_records=10, 34 | ) 35 | -------------------------------------------------------------------------------- /datafusion/tests/generic.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License.
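# Shared helpers for the test-suite: the functions below construct small
# pyarrow arrays (normally distributed floats, NaN-masked data, timestamps,
# dates, and durations) and write single-column parquet files for the other
# test modules to read back.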
17 | 18 | import datetime 19 | 20 | import numpy as np 21 | import pyarrow as pa 22 | import pyarrow.csv 23 | 24 | # used to write parquet files 25 | import pyarrow.parquet as pq 26 | 27 | 28 | def data(): 29 | np.random.seed(1) 30 | data = np.concatenate( 31 | [ 32 | np.random.normal(0, 0.01, size=50), 33 | np.random.normal(50, 0.01, size=50), 34 | ] 35 | ) 36 | return pa.array(data) 37 | 38 | 39 | def data_with_nans(): 40 | np.random.seed(0) 41 | data = np.random.normal(0, 0.01, size=50) 42 | mask = np.random.randint(0, 2, size=50) 43 | data[mask == 0] = np.NaN 44 | return data 45 | 46 | 47 | def data_datetime(f): 48 | data = [ 49 | datetime.datetime.now(), 50 | datetime.datetime.now() - datetime.timedelta(days=1), 51 | datetime.datetime.now() + datetime.timedelta(days=1), 52 | ] 53 | return pa.array( 54 | data, type=pa.timestamp(f), mask=np.array([False, True, False]) 55 | ) 56 | 57 | 58 | def data_date32(): 59 | data = [ 60 | datetime.date(2000, 1, 1), 61 | datetime.date(1980, 1, 1), 62 | datetime.date(2030, 1, 1), 63 | ] 64 | return pa.array( 65 | data, type=pa.date32(), mask=np.array([False, True, False]) 66 | ) 67 | 68 | 69 | def data_timedelta(f): 70 | data = [ 71 | datetime.timedelta(days=100), 72 | datetime.timedelta(days=1), 73 | datetime.timedelta(seconds=1), 74 | ] 75 | return pa.array( 76 | data, type=pa.duration(f), mask=np.array([False, True, False]) 77 | ) 78 | 79 | 80 | def data_binary_other(): 81 | return np.array([1, 0, 0], dtype="u4") 82 | 83 | 84 | def write_parquet(path, data): 85 | table = pa.Table.from_arrays([data], names=["a"]) 86 | pq.write_table(table, path) 87 | return str(path) 88 | -------------------------------------------------------------------------------- /datafusion/tests/test_aggregation.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | import pyarrow as pa 19 | import pytest 20 | 21 | from datafusion import SessionContext, column 22 | from datafusion import functions as f 23 | 24 | 25 | @pytest.fixture 26 | def df(): 27 | ctx = SessionContext() 28 | 29 | # create a RecordBatch and a new DataFrame from it 30 | batch = pa.RecordBatch.from_arrays( 31 | [pa.array([1, 2, 3]), pa.array([4, 4, 6])], 32 | names=["a", "b"], 33 | ) 34 | return ctx.create_dataframe([[batch]]) 35 | 36 | 37 | def test_built_in_aggregation(df): 38 | col_a = column("a") 39 | col_b = column("b") 40 | df = df.aggregate( 41 | [], 42 | [f.max(col_a), f.min(col_a), f.count(col_a), f.approx_distinct(col_b)], 43 | ) 44 | result = df.collect()[0] 45 | assert result.column(0) == pa.array([3]) 46 | assert result.column(1) == pa.array([1]) 47 | assert result.column(2) == pa.array([3], type=pa.int64()) 48 | assert result.column(3) == pa.array([2], type=pa.uint64()) 49 | -------------------------------------------------------------------------------- /datafusion/tests/test_catalog.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import pyarrow as pa 19 | import pytest 20 | 21 | 22 | def test_basic(ctx, database): 23 | with pytest.raises(KeyError): 24 | ctx.catalog("non-existent") 25 | 26 | default = ctx.catalog() 27 | assert default.names() == ["public"] 28 | 29 | for database in [default.database("public"), default.database()]: 30 | assert database.names() == {"csv1", "csv", "csv2"} 31 | 32 | table = database.table("csv") 33 | assert table.kind == "physical" 34 | assert table.schema == pa.schema( 35 | [ 36 | pa.field("int", pa.int64(), nullable=False), 37 | pa.field("str", pa.string(), nullable=False), 38 | pa.field("float", pa.float64(), nullable=False), 39 | ] 40 | ) 41 | -------------------------------------------------------------------------------- /datafusion/tests/test_context.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import pyarrow as pa 19 | 20 | 21 | def test_register_record_batches(ctx): 22 | # create a RecordBatch and register it as memtable 23 | batch = pa.RecordBatch.from_arrays( 24 | [pa.array([1, 2, 3]), pa.array([4, 5, 6])], 25 | names=["a", "b"], 26 | ) 27 | 28 | ctx.register_record_batches("t", [[batch]]) 29 | 30 | assert ctx.tables() == {"t"} 31 | 32 | result = ctx.sql("SELECT a+b, a-b FROM t").collect() 33 | 34 | assert result[0].column(0) == pa.array([5, 7, 9]) 35 | assert result[0].column(1) == pa.array([-3, -3, -3]) 36 | 37 | 38 | def test_create_dataframe_registers_unique_table_name(ctx): 39 | # create a RecordBatch and register it as memtable 40 | batch = pa.RecordBatch.from_arrays( 41 | [pa.array([1, 2, 3]), pa.array([4, 5, 6])], 42 | names=["a", "b"], 43 | ) 44 | 45 | df = ctx.create_dataframe([[batch]]) 46 | tables = list(ctx.tables()) 47 | 48 | assert df 49 | assert len(tables) == 1 50 | assert len(tables[0]) == 33 51 | assert tables[0].startswith("c") 52 | # ensure that the rest of the table name contains 53 | # only hexadecimal numbers 54 | for c in tables[0][1:]: 55 | assert c in "0123456789abcdef" 56 | 57 | 58 | def test_register_table(ctx, database): 59 | default = ctx.catalog() 60 | public = default.database("public") 61 | assert public.names() == {"csv", "csv1", "csv2"} 62 | table = public.table("csv") 63 | 64 | ctx.register_table("csv3", table) 65 | assert public.names() == {"csv", "csv1", "csv2", "csv3"} 66 | 67 | 68 | def test_deregister_table(ctx, database): 69 | default = ctx.catalog() 70 | public = default.database("public") 71 | assert public.names() == {"csv", "csv1", "csv2"} 72 | 73 | ctx.deregister_table("csv") 74 | assert public.names() == {"csv1", "csv2"} 75 | -------------------------------------------------------------------------------- /datafusion/tests/test_dataframe.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | import pyarrow as pa 19 | import pytest 20 | 21 | from datafusion import functions as f 22 | from datafusion import DataFrame, SessionContext, column, literal, udf 23 | 24 | 25 | @pytest.fixture 26 | def df(): 27 | ctx = SessionContext() 28 | 29 | # create a RecordBatch and a new DataFrame from it 30 | batch = pa.RecordBatch.from_arrays( 31 | [pa.array([1, 2, 3]), pa.array([4, 5, 6])], 32 | names=["a", "b"], 33 | ) 34 | 35 | return ctx.create_dataframe([[batch]]) 36 | 37 | 38 | @pytest.fixture 39 | def struct_df(): 40 | ctx = SessionContext() 41 | 42 | # create a RecordBatch and a new DataFrame from it 43 | batch = pa.RecordBatch.from_arrays( 44 | [pa.array([{"c": 1}, {"c": 2}, {"c": 3}]), pa.array([4, 5, 6])], 45 | names=["a", "b"], 46 | ) 47 | 48 | return ctx.create_dataframe([[batch]]) 49 | 50 | 51 | def test_select(df): 52 | df = df.select( 53 | column("a") + column("b"), 54 | column("a") - column("b"), 55 | ) 56 | 57 | # execute and collect the first (and only) batch 58 | result = df.collect()[0] 59 | 60 | assert result.column(0) == pa.array([5, 7, 9]) 61 | assert result.column(1) == pa.array([-3, -3, -3]) 62 | 63 | 64 | def test_select_columns(df): 65 | df = df.select_columns("b", "a") 66 | 67 | # execute and collect the first (and only) batch 68 | result = df.collect()[0] 69 | 70 | assert result.column(0) == pa.array([4, 5, 6]) 71 | assert result.column(1) == pa.array([1, 2, 3]) 72 | 73 | 74 | def test_filter(df): 75 | df = df.select( 76 | column("a") + column("b"), 77 | column("a") - column("b"), 78 | ).filter(column("a") > literal(2)) 79 | 80 | # execute and collect the first (and only) batch 81 | result = df.collect()[0] 82 | 83 | assert result.column(0) == pa.array([9]) 84 | assert result.column(1) == pa.array([-3]) 85 | 86 | 87 | def test_sort(df): 88 | df = df.sort(column("b").sort(ascending=False)) 89 | 90 | table = pa.Table.from_batches(df.collect()) 91 | expected = {"a": [3, 2, 1], "b": [6, 5, 4]} 92 | 93 | assert table.to_pydict() == expected 94 | 95 | 96 | def test_limit(df): 97 | df = df.limit(1) 98 | 99 | # execute and collect the first (and only) batch 100 | result = df.collect()[0] 101 | 102 | assert len(result.column(0)) == 1 103 | assert len(result.column(1)) == 1 104 | 105 | 106 | def test_udf(df): 107 | # is_null is a pa function over arrays 108 | is_null = udf( 109 | lambda x: x.is_null(), 110 | [pa.int64()], 111 | pa.bool_(), 112 | volatility="immutable", 113 | ) 114 | 115 | df = df.select(is_null(column("a"))) 116 | result = df.collect()[0].column(0) 117 | 118 | assert result == pa.array([False, False, False]) 119 | 120 | 121 | def test_join(): 122 | ctx = SessionContext() 123 | 124 | batch = pa.RecordBatch.from_arrays( 125 | [pa.array([1, 2, 3]), pa.array([4, 5, 6])], 126 | names=["a", "b"], 127 | ) 128 | df = ctx.create_dataframe([[batch]]) 129 | 130 | batch = pa.RecordBatch.from_arrays( 131 | [pa.array([1, 2]), pa.array([8, 10])], 132 | names=["a", "c"], 133 | ) 134 | df1 = ctx.create_dataframe([[batch]]) 135 | 136 | df = df.join(df1, join_keys=(["a"], ["a"]), how="inner") 137 | df = df.sort(column("a").sort(ascending=True)) 138 | table = pa.Table.from_batches(df.collect()) 139 | 140 | expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]} 141 | assert table.to_pydict() == expected 142 | 143 | 144 | def test_window_lead(df): 145 | df = df.select( 146 | column("a"), 147 | f.alias( 148 | f.window( 149 | "lead", [column("b")], order_by=[f.order_by(column("b"))] 150 | ), 151 | "a_next", 152 | ), 153 | ) 154 | 155 | table = 
pa.Table.from_batches(df.collect()) 156 | 157 | expected = {"a": [1, 2, 3], "a_next": [5, 6, None]} 158 | assert table.to_pydict() == expected 159 | 160 | 161 | def test_get_dataframe(tmp_path): 162 | ctx = SessionContext() 163 | 164 | path = tmp_path / "test.csv" 165 | table = pa.Table.from_arrays( 166 | [ 167 | [1, 2, 3, 4], 168 | ["a", "b", "c", "d"], 169 | [1.1, 2.2, 3.3, 4.4], 170 | ], 171 | names=["int", "str", "float"], 172 | ) 173 | pa.csv.write_csv(table, path) 174 | 175 | ctx.register_csv("csv", path) 176 | 177 | df = ctx.table("csv") 178 | assert isinstance(df, DataFrame) 179 | 180 | 181 | def test_struct_select(struct_df): 182 | df = struct_df.select( 183 | column("a")["c"] + column("b"), 184 | column("a")["c"] - column("b"), 185 | ) 186 | 187 | # execute and collect the first (and only) batch 188 | result = df.collect()[0] 189 | 190 | assert result.column(0) == pa.array([5, 7, 9]) 191 | assert result.column(1) == pa.array([-3, -3, -3]) 192 | 193 | 194 | def test_explain(df): 195 | df = df.select( 196 | column("a") + column("b"), 197 | column("a") - column("b"), 198 | ) 199 | df.explain() 200 | -------------------------------------------------------------------------------- /datafusion/tests/test_functions.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
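# A hedged sketch (not part of the suite): test_window_lead in
# test_dataframe.py above shows that f.window takes a window-function name,
# its arguments, and an ordering. By analogy, the mirror image of "lead"
# would use the "lag" window function (assumed available in DataFusion):
#
#     df.select(
#         column("a"),
#         f.alias(
#             f.window(
#                 "lag", [column("b")], order_by=[f.order_by(column("b"))]
#             ),
#             "a_prev",
#         ),
#     )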
17 | 18 | import numpy as np 19 | import pyarrow as pa 20 | import pytest 21 | 22 | from datafusion import SessionContext, column 23 | from datafusion import functions as f 24 | from datafusion import literal 25 | 26 | 27 | @pytest.fixture 28 | def df(): 29 | ctx = SessionContext() 30 | # create a RecordBatch and a new DataFrame from it 31 | batch = pa.RecordBatch.from_arrays( 32 | [pa.array(["Hello", "World", "!"]), pa.array([4, 5, 6])], 33 | names=["a", "b"], 34 | ) 35 | return ctx.create_dataframe([[batch]]) 36 | 37 | 38 | def test_literal(df): 39 | df = df.select( 40 | literal(1), 41 | literal("1"), 42 | literal("OK"), 43 | literal(3.14), 44 | literal(True), 45 | literal(b"hello world"), 46 | ) 47 | result = df.collect() 48 | assert len(result) == 1 49 | result = result[0] 50 | assert result.column(0) == pa.array([1] * 3) 51 | assert result.column(1) == pa.array(["1"] * 3) 52 | assert result.column(2) == pa.array(["OK"] * 3) 53 | assert result.column(3) == pa.array([3.14] * 3) 54 | assert result.column(4) == pa.array([True] * 3) 55 | assert result.column(5) == pa.array([b"hello world"] * 3) 56 | 57 | 58 | def test_lit_arith(df): 59 | """ 60 | Test literals with arithmetic operations 61 | """ 62 | df = df.select( 63 | literal(1) + column("b"), f.concat(column("a"), literal("!")) 64 | ) 65 | result = df.collect() 66 | assert len(result) == 1 67 | result = result[0] 68 | assert result.column(0) == pa.array([5, 6, 7]) 69 | assert result.column(1) == pa.array(["Hello!", "World!", "!!"]) 70 | 71 | 72 | def test_math_functions(): 73 | ctx = SessionContext() 74 | # create a RecordBatch and a new DataFrame from it 75 | batch = pa.RecordBatch.from_arrays( 76 | [pa.array([0.1, -0.7, 0.55])], names=["value"] 77 | ) 78 | df = ctx.create_dataframe([[batch]]) 79 | 80 | values = np.array([0.1, -0.7, 0.55]) 81 | col_v = column("value") 82 | df = df.select( 83 | f.abs(col_v), 84 | f.sin(col_v), 85 | f.cos(col_v), 86 | f.tan(col_v), 87 | f.asin(col_v), 88 | f.acos(col_v), 89 | f.exp(col_v), 90 | f.ln(col_v + literal(pa.scalar(1))), 91 | f.log2(col_v + literal(pa.scalar(1))), 92 | f.log10(col_v + literal(pa.scalar(1))), 93 | f.random(), 94 | ) 95 | batches = df.collect() 96 | assert len(batches) == 1 97 | result = batches[0] 98 | 99 | np.testing.assert_array_almost_equal(result.column(0), np.abs(values)) 100 | np.testing.assert_array_almost_equal(result.column(1), np.sin(values)) 101 | np.testing.assert_array_almost_equal(result.column(2), np.cos(values)) 102 | np.testing.assert_array_almost_equal(result.column(3), np.tan(values)) 103 | np.testing.assert_array_almost_equal(result.column(4), np.arcsin(values)) 104 | np.testing.assert_array_almost_equal(result.column(5), np.arccos(values)) 105 | np.testing.assert_array_almost_equal(result.column(6), np.exp(values)) 106 | np.testing.assert_array_almost_equal( 107 | result.column(7), np.log(values + 1.0) 108 | ) 109 | np.testing.assert_array_almost_equal( 110 | result.column(8), np.log2(values + 1.0) 111 | ) 112 | np.testing.assert_array_almost_equal( 113 | result.column(9), np.log10(values + 1.0) 114 | ) 115 | np.testing.assert_array_less(result.column(10), np.ones_like(values)) 116 | 117 | 118 | def test_string_functions(df): 119 | df = df.select(f.md5(column("a")), f.lower(column("a"))) 120 | result = df.collect() 121 | assert len(result) == 1 122 | result = result[0] 123 | assert result.column(0) == pa.array( 124 | [ 125 | "8b1a9953c4611296a827abf8c47804d7", 126 | "f5a7924e621e84c9280a9a27e1bcb7f6", 127 | "9033e0e305f247c0c3c80d0c7848c8b3", 128 | ] 129 | 
) 130 | assert result.column(1) == pa.array(["hello", "world", "!"]) 131 | 132 | 133 | def test_hash_functions(df): 134 | exprs = [ 135 | f.digest(column("a"), literal(m)) 136 | for m in ("md5", "sha256", "sha512", "blake2s", "blake3") 137 | ] 138 | df = df.select(*exprs) 139 | result = df.collect() 140 | assert len(result) == 1 141 | result = result[0] 142 | b = bytearray.fromhex 143 | assert result.column(0) == pa.array( 144 | [ 145 | b("8B1A9953C4611296A827ABF8C47804D7"), 146 | b("F5A7924E621E84C9280A9A27E1BCB7F6"), 147 | b("9033E0E305F247C0C3C80D0C7848C8B3"), 148 | ] 149 | ) 150 | assert result.column(1) == pa.array( 151 | [ 152 | b( 153 | "185F8DB32271FE25F561A6FC938B2E26" 154 | "4306EC304EDA518007D1764826381969" 155 | ), 156 | b( 157 | "78AE647DC5544D227130A0682A51E30B" 158 | "C7777FBB6D8A8F17007463A3ECD1D524" 159 | ), 160 | b( 161 | "BB7208BC9B5D7C04F1236A82A0093A5E" 162 | "33F40423D5BA8D4266F7092C3BA43B62" 163 | ), 164 | ] 165 | ) 166 | assert result.column(2) == pa.array( 167 | [ 168 | b( 169 | "3615F80C9D293ED7402687F94B22D58E" 170 | "529B8CC7916F8FAC7FDDF7FBD5AF4CF7" 171 | "77D3D795A7A00A16BF7E7F3FB9561EE9" 172 | "BAAE480DA9FE7A18769E71886B03F315" 173 | ), 174 | b( 175 | "8EA77393A42AB8FA92500FB077A9509C" 176 | "C32BC95E72712EFA116EDAF2EDFAE34F" 177 | "BB682EFDD6C5DD13C117E08BD4AAEF71" 178 | "291D8AACE2F890273081D0677C16DF0F" 179 | ), 180 | b( 181 | "3831A6A6155E509DEE59A7F451EB3532" 182 | "4D8F8F2DF6E3708894740F98FDEE2388" 183 | "9F4DE5ADB0C5010DFB555CDA77C8AB5D" 184 | "C902094C52DE3278F35A75EBC25F093A" 185 | ), 186 | ] 187 | ) 188 | assert result.column(3) == pa.array( 189 | [ 190 | b( 191 | "F73A5FBF881F89B814871F46E26AD3FA" 192 | "37CB2921C5E8561618639015B3CCBB71" 193 | ), 194 | b( 195 | "B792A0383FB9E7A189EC150686579532" 196 | "854E44B71AC394831DAED169BA85CCC5" 197 | ), 198 | b( 199 | "27988A0E51812297C77A433F63523334" 200 | "6AEE29A829DCF4F46E0F58F402C6CFCB" 201 | ), 202 | ] 203 | ) 204 | assert result.column(4) == pa.array( 205 | [ 206 | b( 207 | "FBC2B0516EE8744D293B980779178A35" 208 | "08850FDCFE965985782C39601B65794F" 209 | ), 210 | b( 211 | "BF73D18575A736E4037D45F9E316085B" 212 | "86C19BE6363DE6AA789E13DEAACC1C4E" 213 | ), 214 | b( 215 | "C8D11B9F7237E4034ADBCD2005735F9B" 216 | "C4C597C75AD89F4492BEC8F77D15F7EB" 217 | ), 218 | ] 219 | ) 220 | -------------------------------------------------------------------------------- /datafusion/tests/test_imports.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | import pytest 19 | 20 | import datafusion 21 | from datafusion import ( 22 | AggregateUDF, 23 | DataFrame, 24 | SessionContext, 25 | Expression, 26 | ScalarUDF, 27 | functions, 28 | ) 29 | 30 | 31 | def test_import_datafusion(): 32 | assert datafusion.__name__ == "datafusion" 33 | 34 | 35 | def test_datafusion_python_version(): 36 | assert datafusion.__version__ is not None 37 | 38 | 39 | def test_class_module_is_datafusion(): 40 | for klass in [ 41 | SessionContext, 42 | Expression, 43 | DataFrame, 44 | ScalarUDF, 45 | AggregateUDF, 46 | ]: 47 | assert klass.__module__ == "datafusion" 48 | 49 | 50 | def test_import_from_functions_submodule(): 51 | from datafusion.functions import abs, sin # noqa 52 | 53 | assert functions.abs is abs 54 | assert functions.sin is sin 55 | 56 | msg = "cannot import name 'foobar' from 'datafusion.functions'" 57 | with pytest.raises(ImportError, match=msg): 58 | from datafusion.functions import foobar # noqa 59 | 60 | 61 | def test_classes_are_inheritable(): 62 | class MyExecContext(SessionContext): 63 | pass 64 | 65 | class MyExpression(Expression): 66 | pass 67 | 68 | class MyDataFrame(DataFrame): 69 | pass 70 | -------------------------------------------------------------------------------- /datafusion/tests/test_indexing.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import pyarrow as pa 19 | import pytest 20 | 21 | from datafusion import SessionContext 22 | 23 | 24 | @pytest.fixture 25 | def df(): 26 | ctx = SessionContext() 27 | 28 | # create a RecordBatch and a new DataFrame from it 29 | batch = pa.RecordBatch.from_arrays( 30 | [pa.array([1, 2, 3]), pa.array([4, 4, 6])], 31 | names=["a", "b"], 32 | ) 33 | return ctx.create_dataframe([[batch]]) 34 | 35 | 36 | def test_indexing(df): 37 | assert df["a"] is not None 38 | assert df["a", "b"] is not None 39 | assert df[("a", "b")] is not None 40 | assert df[["a"]] is not None 41 | 42 | 43 | def test_err(df): 44 | with pytest.raises(Exception) as e_info: 45 | df["c"] 46 | 47 | assert "Schema error: No field named 'c'" in e_info.value.args[0] 48 | 49 | with pytest.raises(Exception) as e_info: 50 | df[1] 51 | 52 | assert ( 53 | "DataFrame can only be indexed by string index or indices" 54 | in e_info.value.args[0] 55 | ) 56 | -------------------------------------------------------------------------------- /datafusion/tests/test_sql.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. 
See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import numpy as np 19 | import pyarrow as pa 20 | import pytest 21 | 22 | from datafusion import udf 23 | 24 | from . import generic as helpers 25 | 26 | 27 | def test_no_table(ctx): 28 | with pytest.raises(Exception, match="DataFusion error"): 29 | ctx.sql("SELECT a FROM b").collect() 30 | 31 | 32 | def test_register_csv(ctx, tmp_path): 33 | path = tmp_path / "test.csv" 34 | 35 | table = pa.Table.from_arrays( 36 | [ 37 | [1, 2, 3, 4], 38 | ["a", "b", "c", "d"], 39 | [1.1, 2.2, 3.3, 4.4], 40 | ], 41 | names=["int", "str", "float"], 42 | ) 43 | pa.csv.write_csv(table, path) 44 | 45 | ctx.register_csv("csv", path) 46 | ctx.register_csv("csv1", str(path)) 47 | ctx.register_csv( 48 | "csv2", 49 | path, 50 | has_header=True, 51 | delimiter=",", 52 | schema_infer_max_records=10, 53 | ) 54 | alternative_schema = pa.schema( 55 | [ 56 | ("some_int", pa.int16()), 57 | ("some_bytes", pa.string()), 58 | ("some_floats", pa.float32()), 59 | ] 60 | ) 61 | ctx.register_csv("csv3", path, schema=alternative_schema) 62 | 63 | assert ctx.tables() == {"csv", "csv1", "csv2", "csv3"} 64 | 65 | for table in ["csv", "csv1", "csv2"]: 66 | result = ctx.sql(f"SELECT COUNT(int) AS cnt FROM {table}").collect() 67 | result = pa.Table.from_batches(result) 68 | assert result.to_pydict() == {"cnt": [4]} 69 | 70 | result = ctx.sql("SELECT * FROM csv3").collect() 71 | result = pa.Table.from_batches(result) 72 | assert result.schema == alternative_schema 73 | 74 | with pytest.raises( 75 | ValueError, match="Delimiter must be a single character" 76 | ): 77 | ctx.register_csv("csv4", path, delimiter="wrong") 78 | 79 | 80 | def test_register_parquet(ctx, tmp_path): 81 | path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) 82 | ctx.register_parquet("t", path) 83 | assert ctx.tables() == {"t"} 84 | 85 | result = ctx.sql("SELECT COUNT(a) AS cnt FROM t").collect() 86 | result = pa.Table.from_batches(result) 87 | assert result.to_pydict() == {"cnt": [100]} 88 | 89 | 90 | def test_register_parquet_partitioned(ctx, tmp_path): 91 | dir_root = tmp_path / "dataset_parquet_partitioned" 92 | dir_root.mkdir(exist_ok=False) 93 | (dir_root / "grp=a").mkdir(exist_ok=False) 94 | (dir_root / "grp=b").mkdir(exist_ok=False) 95 | 96 | table = pa.Table.from_arrays( 97 | [ 98 | [1, 2, 3, 4], 99 | ["a", "b", "c", "d"], 100 | [1.1, 2.2, 3.3, 4.4], 101 | ], 102 | names=["int", "str", "float"], 103 | ) 104 | pa.parquet.write_table(table.slice(0, 3), dir_root / "grp=a/file.parquet") 105 | pa.parquet.write_table(table.slice(3, 4), dir_root / "grp=b/file.parquet") 106 | 107 | ctx.register_parquet( 108 | "datapp", 109 | str(dir_root), 110 | table_partition_cols=["grp"], 111 | parquet_pruning=True, 112 | file_extension=".parquet", 113 | ) 114 | assert ctx.tables() == {"datapp"} 115 | 116 | result = ctx.sql( 
117 | "SELECT grp, COUNT(*) AS cnt FROM datapp GROUP BY grp" 118 | ).collect() 119 | result = pa.Table.from_batches(result) 120 | 121 | rd = result.to_pydict() 122 | assert dict(zip(rd["grp"], rd["cnt"])) == {"a": 3, "b": 1} 123 | 124 | 125 | def test_execute(ctx, tmp_path): 126 | data = [1, 1, 2, 2, 3, 11, 12] 127 | 128 | # single column, "a" 129 | path = helpers.write_parquet(tmp_path / "a.parquet", pa.array(data)) 130 | ctx.register_parquet("t", path) 131 | 132 | assert ctx.tables() == {"t"} 133 | 134 | # count 135 | result = ctx.sql("SELECT COUNT(a) AS cnt FROM t").collect() 136 | 137 | expected = pa.array([7], pa.int64()) 138 | expected = [pa.RecordBatch.from_arrays([expected], ["cnt"])] 139 | assert result == expected 140 | 141 | # where 142 | expected = pa.array([2], pa.int64()) 143 | expected = [pa.RecordBatch.from_arrays([expected], ["cnt"])] 144 | result = ctx.sql("SELECT COUNT(a) AS cnt FROM t WHERE a > 10").collect() 145 | assert result == expected 146 | 147 | # group by 148 | results = ctx.sql( 149 | "SELECT CAST(a as int) AS a, COUNT(a) AS cnt FROM t GROUP BY a" 150 | ).collect() 151 | 152 | # group by returns batches 153 | result_keys = [] 154 | result_values = [] 155 | for result in results: 156 | pydict = result.to_pydict() 157 | result_keys.extend(pydict["a"]) 158 | result_values.extend(pydict["cnt"]) 159 | 160 | result_keys, result_values = ( 161 | list(t) for t in zip(*sorted(zip(result_keys, result_values))) 162 | ) 163 | 164 | assert result_keys == [1, 2, 3, 11, 12] 165 | assert result_values == [2, 2, 1, 1, 1] 166 | 167 | # order by 168 | result = ctx.sql( 169 | "SELECT a, CAST(a AS int) AS a_int FROM t ORDER BY a DESC LIMIT 2" 170 | ).collect() 171 | expected_a = pa.array([50.0219, 50.0152], pa.float64()) 172 | expected_cast = pa.array([50, 50], pa.int32()) 173 | expected = [ 174 | pa.RecordBatch.from_arrays([expected_a, expected_cast], ["a", "a_int"]) 175 | ] 176 | np.testing.assert_equal(expected[0].column(1), expected[0].column(1)) 177 | 178 | 179 | def test_cast(ctx, tmp_path): 180 | """ 181 | Verify that we can cast 182 | """ 183 | path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) 184 | ctx.register_parquet("t", path) 185 | 186 | valid_types = [ 187 | "smallint", 188 | "int", 189 | "bigint", 190 | "float(32)", 191 | "float(64)", 192 | "float", 193 | ] 194 | 195 | select = ", ".join( 196 | [f"CAST(9 AS {t}) AS A{i}" for i, t in enumerate(valid_types)] 197 | ) 198 | 199 | # can execute, which implies that we can cast 200 | ctx.sql(f"SELECT {select} FROM t").collect() 201 | 202 | 203 | @pytest.mark.parametrize( 204 | ("fn", "input_types", "output_type", "input_values", "expected_values"), 205 | [ 206 | ( 207 | lambda x: x, 208 | [pa.float64()], 209 | pa.float64(), 210 | [-1.2, None, 1.2], 211 | [-1.2, None, 1.2], 212 | ), 213 | ( 214 | lambda x: x.is_null(), 215 | [pa.float64()], 216 | pa.bool_(), 217 | [-1.2, None, 1.2], 218 | [False, True, False], 219 | ), 220 | ], 221 | ) 222 | def test_udf( 223 | ctx, tmp_path, fn, input_types, output_type, input_values, expected_values 224 | ): 225 | # write to disk 226 | path = helpers.write_parquet( 227 | tmp_path / "a.parquet", pa.array(input_values) 228 | ) 229 | ctx.register_parquet("t", path) 230 | 231 | func = udf( 232 | fn, input_types, output_type, name="func", volatility="immutable" 233 | ) 234 | ctx.register_udf(func) 235 | 236 | batches = ctx.sql("SELECT func(a) AS tt FROM t").collect() 237 | result = batches[0].column(0) 238 | 239 | assert result == pa.array(expected_values) 240 | 241 | 242 | 
_null_mask = np.array([False, True, False]) 243 | 244 | 245 | @pytest.mark.parametrize( 246 | "arr", 247 | [ 248 | pa.array(["a", "b", "c"], pa.utf8(), _null_mask), 249 | pa.array(["a", "b", "c"], pa.large_utf8(), _null_mask), 250 | pa.array([b"1", b"2", b"3"], pa.binary(), _null_mask), 251 | pa.array([b"1111", b"2222", b"3333"], pa.large_binary(), _null_mask), 252 | pa.array([False, True, True], None, _null_mask), 253 | pa.array([0, 1, 2], None), 254 | helpers.data_binary_other(), 255 | helpers.data_date32(), 256 | helpers.data_with_nans(), 257 | # C data interface missing 258 | pytest.param( 259 | pa.array([b"1111", b"2222", b"3333"], pa.binary(4), _null_mask), 260 | marks=pytest.mark.xfail, 261 | ), 262 | pytest.param(helpers.data_datetime("s"), marks=pytest.mark.xfail), 263 | pytest.param(helpers.data_datetime("ms"), marks=pytest.mark.xfail), 264 | pytest.param(helpers.data_datetime("us"), marks=pytest.mark.xfail), 265 | pytest.param(helpers.data_datetime("ns"), marks=pytest.mark.xfail), 266 | # Not writable to parquet 267 | pytest.param(helpers.data_timedelta("s"), marks=pytest.mark.xfail), 268 | pytest.param(helpers.data_timedelta("ms"), marks=pytest.mark.xfail), 269 | pytest.param(helpers.data_timedelta("us"), marks=pytest.mark.xfail), 270 | pytest.param(helpers.data_timedelta("ns"), marks=pytest.mark.xfail), 271 | ], 272 | ) 273 | def test_simple_select(ctx, tmp_path, arr): 274 | path = helpers.write_parquet(tmp_path / "a.parquet", arr) 275 | ctx.register_parquet("t", path) 276 | 277 | batches = ctx.sql("SELECT a AS tt FROM t").collect() 278 | result = batches[0].column(0) 279 | 280 | np.testing.assert_equal(result, arr) 281 | -------------------------------------------------------------------------------- /datafusion/tests/test_udaf.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | from typing import List 19 | 20 | import pyarrow as pa 21 | import pyarrow.compute as pc 22 | import pytest 23 | 24 | from datafusion import Accumulator, SessionContext, column, udaf 25 | 26 | 27 | class Summarize(Accumulator): 28 | """ 29 | A user-defined accumulator that sums its input column. 30 | """ 31 | 32 | def __init__(self): 33 | self._sum = pa.scalar(0.0) 34 | 35 | def state(self) -> List[pa.Scalar]: 36 | return [self._sum] 37 | 38 | def update(self, values: pa.Array) -> None: 39 | # Not nice since pyarrow scalars can't be summed yet. 40 | # This breaks on `None` 41 | self._sum = pa.scalar(self._sum.as_py() + pc.sum(values).as_py()) 42 | 43 | def merge(self, states: pa.Array) -> None: 44 | # Not nice since pyarrow scalars can't be summed yet.
45 | # This breaks on `None` 46 | self._sum = pa.scalar(self._sum.as_py() + pc.sum(states).as_py()) 47 | 48 | def evaluate(self) -> pa.Scalar: 49 | return self._sum 50 | 51 | 52 | class NotSubclassOfAccumulator: 53 | pass 54 | 55 | 56 | class MissingMethods(Accumulator): 57 | def __init__(self): 58 | self._sum = pa.scalar(0) 59 | 60 | def state(self) -> List[pa.Scalar]: 61 | return [self._sum] 62 | 63 | 64 | @pytest.fixture 65 | def df(): 66 | ctx = SessionContext() 67 | 68 | # create a RecordBatch and a new DataFrame from it 69 | batch = pa.RecordBatch.from_arrays( 70 | [pa.array([1, 2, 3]), pa.array([4, 4, 6])], 71 | names=["a", "b"], 72 | ) 73 | return ctx.create_dataframe([[batch]]) 74 | 75 | 76 | @pytest.mark.skip(reason="df.collect() will hang; needs more investigation") 77 | def test_errors(df): 78 | with pytest.raises(TypeError): 79 | udaf( 80 | NotSubclassOfAccumulator, 81 | pa.float64(), 82 | pa.float64(), 83 | [pa.float64()], 84 | volatility="immutable", 85 | ) 86 | 87 | accum = udaf( 88 | MissingMethods, 89 | pa.int64(), 90 | pa.int64(), 91 | [pa.int64()], 92 | volatility="immutable", 93 | ) 94 | df = df.aggregate([], [accum(column("a"))]) 95 | 96 | msg = ( 97 | "Can't instantiate abstract class MissingMethods with abstract " 98 | "methods evaluate, merge, update" 99 | ) 100 | with pytest.raises(Exception, match=msg): 101 | df.collect() 102 | 103 | 104 | def test_aggregate(df): 105 | summarize = udaf( 106 | Summarize, 107 | pa.float64(), 108 | pa.float64(), 109 | [pa.float64()], 110 | volatility="immutable", 111 | ) 112 | 113 | df = df.aggregate([], [summarize(column("a"))]) 114 | 115 | # execute and collect the first (and only) batch 116 | result = df.collect()[0] 117 | 118 | assert result.column(0) == pa.array([1.0 + 2.0 + 3.0]) 119 | 120 | 121 | def test_group_by(df): 122 | summarize = udaf( 123 | Summarize, 124 | pa.float64(), 125 | pa.float64(), 126 | [pa.float64()], 127 | volatility="immutable", 128 | ) 129 | 130 | df = df.aggregate([column("b")], [summarize(column("a"))]) 131 | 132 | batches = df.collect() 133 | 134 | arrays = [batch.column(1) for batch in batches] 135 | joined = pa.concat_arrays(arrays) 136 | assert joined == pa.array([1.0 + 2.0, 3.0]) 137 | -------------------------------------------------------------------------------- /dev/create_license.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License.
17 | # 18 | 19 | # This file is a mirror of https://github.com/apache/arrow-datafusion/blob/master/dev/create_license.py 20 | 21 | import json 22 | import subprocess 23 | 24 | subprocess.check_output(["cargo", "install", "cargo-license"]) 25 | data = subprocess.check_output( 26 | [ 27 | "cargo", 28 | "license", 29 | "--avoid-build-deps", 30 | "--avoid-dev-deps", 31 | "--do-not-bundle", 32 | "--json", 33 | ] 34 | ) 35 | data = json.loads(data) 36 | 37 | result = """ 38 | Apache License 39 | Version 2.0, January 2004 40 | http://www.apache.org/licenses/ 41 | 42 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 43 | 44 | 1. Definitions. 45 | 46 | "License" shall mean the terms and conditions for use, reproduction, 47 | and distribution as defined by Sections 1 through 9 of this document. 48 | 49 | "Licensor" shall mean the copyright owner or entity authorized by 50 | the copyright owner that is granting the License. 51 | 52 | "Legal Entity" shall mean the union of the acting entity and all 53 | other entities that control, are controlled by, or are under common 54 | control with that entity. For the purposes of this definition, 55 | "control" means (i) the power, direct or indirect, to cause the 56 | direction or management of such entity, whether by contract or 57 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 58 | outstanding shares, or (iii) beneficial ownership of such entity. 59 | 60 | "You" (or "Your") shall mean an individual or Legal Entity 61 | exercising permissions granted by this License. 62 | 63 | "Source" form shall mean the preferred form for making modifications, 64 | including but not limited to software source code, documentation 65 | source, and configuration files. 66 | 67 | "Object" form shall mean any form resulting from mechanical 68 | transformation or translation of a Source form, including but 69 | not limited to compiled object code, generated documentation, 70 | and conversions to other media types. 71 | 72 | "Work" shall mean the work of authorship, whether in Source or 73 | Object form, made available under the License, as indicated by a 74 | copyright notice that is included in or attached to the work 75 | (an example is provided in the Appendix below). 76 | 77 | "Derivative Works" shall mean any work, whether in Source or Object 78 | form, that is based on (or derived from) the Work and for which the 79 | editorial revisions, annotations, elaborations, or other modifications 80 | represent, as a whole, an original work of authorship. For the purposes 81 | of this License, Derivative Works shall not include works that remain 82 | separable from, or merely link (or bind by name) to the interfaces of, 83 | the Work and Derivative Works thereof. 84 | 85 | "Contribution" shall mean any work of authorship, including 86 | the original version of the Work and any modifications or additions 87 | to that Work or Derivative Works thereof, that is intentionally 88 | submitted to Licensor for inclusion in the Work by the copyright owner 89 | or by an individual or Legal Entity authorized to submit on behalf of 90 | the copyright owner. 
For the purposes of this definition, "submitted" 91 | means any form of electronic, verbal, or written communication sent 92 | to the Licensor or its representatives, including but not limited to 93 | communication on electronic mailing lists, source code control systems, 94 | and issue tracking systems that are managed by, or on behalf of, the 95 | Licensor for the purpose of discussing and improving the Work, but 96 | excluding communication that is conspicuously marked or otherwise 97 | designated in writing by the copyright owner as "Not a Contribution." 98 | 99 | "Contributor" shall mean Licensor and any individual or Legal Entity 100 | on behalf of whom a Contribution has been received by Licensor and 101 | subsequently incorporated within the Work. 102 | 103 | 2. Grant of Copyright License. Subject to the terms and conditions of 104 | this License, each Contributor hereby grants to You a perpetual, 105 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 106 | copyright license to reproduce, prepare Derivative Works of, 107 | publicly display, publicly perform, sublicense, and distribute the 108 | Work and such Derivative Works in Source or Object form. 109 | 110 | 3. Grant of Patent License. Subject to the terms and conditions of 111 | this License, each Contributor hereby grants to You a perpetual, 112 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 113 | (except as stated in this section) patent license to make, have made, 114 | use, offer to sell, sell, import, and otherwise transfer the Work, 115 | where such license applies only to those patent claims licensable 116 | by such Contributor that are necessarily infringed by their 117 | Contribution(s) alone or by combination of their Contribution(s) 118 | with the Work to which such Contribution(s) was submitted. If You 119 | institute patent litigation against any entity (including a 120 | cross-claim or counterclaim in a lawsuit) alleging that the Work 121 | or a Contribution incorporated within the Work constitutes direct 122 | or contributory patent infringement, then any patent licenses 123 | granted to You under this License for that Work shall terminate 124 | as of the date such litigation is filed. 125 | 126 | 4. Redistribution. 
You may reproduce and distribute copies of the 127 | Work or Derivative Works thereof in any medium, with or without 128 | modifications, and in Source or Object form, provided that You 129 | meet the following conditions: 130 | 131 | (a) You must give any other recipients of the Work or 132 | Derivative Works a copy of this License; and 133 | 134 | (b) You must cause any modified files to carry prominent notices 135 | stating that You changed the files; and 136 | 137 | (c) You must retain, in the Source form of any Derivative Works 138 | that You distribute, all copyright, patent, trademark, and 139 | attribution notices from the Source form of the Work, 140 | excluding those notices that do not pertain to any part of 141 | the Derivative Works; and 142 | 143 | (d) If the Work includes a "NOTICE" text file as part of its 144 | distribution, then any Derivative Works that You distribute must 145 | include a readable copy of the attribution notices contained 146 | within such NOTICE file, excluding those notices that do not 147 | pertain to any part of the Derivative Works, in at least one 148 | of the following places: within a NOTICE text file distributed 149 | as part of the Derivative Works; within the Source form or 150 | documentation, if provided along with the Derivative Works; or, 151 | within a display generated by the Derivative Works, if and 152 | wherever such third-party notices normally appear. The contents 153 | of the NOTICE file are for informational purposes only and 154 | do not modify the License. You may add Your own attribution 155 | notices within Derivative Works that You distribute, alongside 156 | or as an addendum to the NOTICE text from the Work, provided 157 | that such additional attribution notices cannot be construed 158 | as modifying the License. 159 | 160 | You may add Your own copyright statement to Your modifications and 161 | may provide additional or different license terms and conditions 162 | for use, reproduction, or distribution of Your modifications, or 163 | for any such Derivative Works as a whole, provided Your use, 164 | reproduction, and distribution of the Work otherwise complies with 165 | the conditions stated in this License. 166 | 167 | 5. Submission of Contributions. Unless You explicitly state otherwise, 168 | any Contribution intentionally submitted for inclusion in the Work 169 | by You to the Licensor shall be under the terms and conditions of 170 | this License, without any additional terms or conditions. 171 | Notwithstanding the above, nothing herein shall supersede or modify 172 | the terms of any separate license agreement you may have executed 173 | with Licensor regarding such Contributions. 174 | 175 | 6. Trademarks. This License does not grant permission to use the trade 176 | names, trademarks, service marks, or product names of the Licensor, 177 | except as required for reasonable and customary use in describing the 178 | origin of the Work and reproducing the content of the NOTICE file. 179 | 180 | 7. Disclaimer of Warranty. Unless required by applicable law or 181 | agreed to in writing, Licensor provides the Work (and each 182 | Contributor provides its Contributions) on an "AS IS" BASIS, 183 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 184 | implied, including, without limitation, any warranties or conditions 185 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 186 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 187 | appropriateness of using or redistributing the Work and assume any 188 | risks associated with Your exercise of permissions under this License. 189 | 190 | 8. Limitation of Liability. In no event and under no legal theory, 191 | whether in tort (including negligence), contract, or otherwise, 192 | unless required by applicable law (such as deliberate and grossly 193 | negligent acts) or agreed to in writing, shall any Contributor be 194 | liable to You for damages, including any direct, indirect, special, 195 | incidental, or consequential damages of any character arising as a 196 | result of this License or out of the use or inability to use the 197 | Work (including but not limited to damages for loss of goodwill, 198 | work stoppage, computer failure or malfunction, or any and all 199 | other commercial damages or losses), even if such Contributor 200 | has been advised of the possibility of such damages. 201 | 202 | 9. Accepting Warranty or Additional Liability. While redistributing 203 | the Work or Derivative Works thereof, You may choose to offer, 204 | and charge a fee for, acceptance of support, warranty, indemnity, 205 | or other liability obligations and/or rights consistent with this 206 | License. However, in accepting such obligations, You may act only 207 | on Your own behalf and on Your sole responsibility, not on behalf 208 | of any other Contributor, and only if You agree to indemnify, 209 | defend, and hold each Contributor harmless for any liability 210 | incurred by, or claims asserted against, such Contributor by reason 211 | of your accepting any such warranty or additional liability. 212 | 213 | END OF TERMS AND CONDITIONS 214 | 215 | APPENDIX: How to apply the Apache License to your work. 216 | 217 | To apply the Apache License to your work, attach the following 218 | boilerplate notice, with the fields enclosed by brackets "[]" 219 | replaced with your own identifying information. (Don't include 220 | the brackets!) The text should be enclosed in the appropriate 221 | comment syntax for the file format. We also recommend that a 222 | file or class name and description of purpose be included on the 223 | same "printed page" as the copyright notice for easier 224 | identification within third-party archives. 225 | 226 | Copyright [yyyy] [name of copyright owner] 227 | 228 | Licensed under the Apache License, Version 2.0 (the "License"); 229 | you may not use this file except in compliance with the License. 230 | You may obtain a copy of the License at 231 | 232 | http://www.apache.org/licenses/LICENSE-2.0 233 | 234 | Unless required by applicable law or agreed to in writing, software 235 | distributed under the License is distributed on an "AS IS" BASIS, 236 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 237 | See the License for the specific language governing permissions and 238 | limitations under the License. 
239 | """ 240 | result += "\n------------------\n\n" 241 | result += "This software is built and contains the following software:\n\n" 242 | result += "(automatically generated via [cargo-license](https://crates.io/crates/cargo-license))\n\n" 243 | for item in data: 244 | license = item["license"] 245 | name = item["name"] 246 | version = item["version"] 247 | repository = item["repository"] 248 | result += "------------------\n\n" 249 | result += f"### {name} {version}\n* source: [{repository}]({repository})\n* license: {license}\n\n" 250 | 251 | with open("LICENSE.txt", "w") as f: 252 | f.write(result) 253 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | [build-system] 19 | requires = ["maturin>=0.11,<0.13"] 20 | build-backend = "maturin" 21 | 22 | [project] 23 | name = "datafusion" 24 | description = "Build and run queries against data" 25 | readme = "README.md" 26 | license = {file = "LICENSE.txt"} 27 | requires-python = ">=3.6" 28 | keywords = ["datafusion", "dataframe", "rust", "query-engine"] 29 | classifier = [ 30 | "Development Status :: 2 - Pre-Alpha", 31 | "Intended Audience :: Developers", 32 | "License :: OSI Approved :: Apache Software License", 33 | "License :: OSI Approved", 34 | "Operating System :: MacOS", 35 | "Operating System :: Microsoft :: Windows", 36 | "Operating System :: POSIX :: Linux", 37 | "Programming Language :: Python :: 3", 38 | "Programming Language :: Python :: 3.7", 39 | "Programming Language :: Python :: 3.8", 40 | "Programming Language :: Python :: 3.9", 41 | "Programming Language :: Python :: 3.10", 42 | "Programming Language :: Python", 43 | "Programming Language :: Rust", 44 | ] 45 | dependencies = [ 46 | "pyarrow>=1", 47 | ] 48 | 49 | [project.urls] 50 | documentation = "https://arrow.apache.org/datafusion/python" 51 | repository = "https://github.com/apache/arrow-datafusion" 52 | 53 | [tool.isort] 54 | profile = "black" 55 | 56 | [tool.maturin] 57 | sdist-include = ["Cargo.lock"] 58 | -------------------------------------------------------------------------------- /requirements-310.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.10 3 | # To update, run: 4 | # 5 | # pip-compile --generate-hashes 6 | # 7 | attrs==21.4.0 \ 8 | --hash=sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4 \ 9 | --hash=sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd 10 | # via pytest 11 | black==22.3.0 \ 12 | 
--hash=sha256:06f9d8846f2340dfac80ceb20200ea5d1b3f181dd0556b47af4e8e0b24fa0a6b \ 13 | --hash=sha256:10dbe6e6d2988049b4655b2b739f98785a884d4d6b85bc35133a8fb9a2233176 \ 14 | --hash=sha256:2497f9c2386572e28921fa8bec7be3e51de6801f7459dffd6e62492531c47e09 \ 15 | --hash=sha256:30d78ba6bf080eeaf0b7b875d924b15cd46fec5fd044ddfbad38c8ea9171043a \ 16 | --hash=sha256:328efc0cc70ccb23429d6be184a15ce613f676bdfc85e5fe8ea2a9354b4e9015 \ 17 | --hash=sha256:35020b8886c022ced9282b51b5a875b6d1ab0c387b31a065b84db7c33085ca79 \ 18 | --hash=sha256:5795a0375eb87bfe902e80e0c8cfaedf8af4d49694d69161e5bd3206c18618bb \ 19 | --hash=sha256:5891ef8abc06576985de8fa88e95ab70641de6c1fca97e2a15820a9b69e51b20 \ 20 | --hash=sha256:637a4014c63fbf42a692d22b55d8ad6968a946b4a6ebc385c5505d9625b6a464 \ 21 | --hash=sha256:67c8301ec94e3bcc8906740fe071391bce40a862b7be0b86fb5382beefecd968 \ 22 | --hash=sha256:6d2fc92002d44746d3e7db7cf9313cf4452f43e9ea77a2c939defce3b10b5c82 \ 23 | --hash=sha256:6ee227b696ca60dd1c507be80a6bc849a5a6ab57ac7352aad1ffec9e8b805f21 \ 24 | --hash=sha256:863714200ada56cbc366dc9ae5291ceb936573155f8bf8e9de92aef51f3ad0f0 \ 25 | --hash=sha256:9b542ced1ec0ceeff5b37d69838106a6348e60db7b8fdd245294dc1d26136265 \ 26 | --hash=sha256:a6342964b43a99dbc72f72812bf88cad8f0217ae9acb47c0d4f141a6416d2d7b \ 27 | --hash=sha256:ad4efa5fad66b903b4a5f96d91461d90b9507a812b3c5de657d544215bb7877a \ 28 | --hash=sha256:bc58025940a896d7e5356952228b68f793cf5fcb342be703c3a2669a1488cb72 \ 29 | --hash=sha256:cc1e1de68c8e5444e8f94c3670bb48a2beef0e91dddfd4fcc29595ebd90bb9ce \ 30 | --hash=sha256:cee3e11161dde1b2a33a904b850b0899e0424cc331b7295f2a9698e79f9a69a0 \ 31 | --hash=sha256:e3556168e2e5c49629f7b0f377070240bd5511e45e25a4497bb0073d9dda776a \ 32 | --hash=sha256:e8477ec6bbfe0312c128e74644ac8a02ca06bcdb8982d4ee06f209be28cdf163 \ 33 | --hash=sha256:ee8f1f7228cce7dffc2b464f07ce769f478968bfb3dd1254a4c2eeed84928aad \ 34 | --hash=sha256:fd57160949179ec517d32ac2ac898b5f20d68ed1a9c977346efbac9c2f1e779d 35 | # via -r requirements.in 36 | click==8.1.3 \ 37 | --hash=sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e \ 38 | --hash=sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48 39 | # via black 40 | flake8==4.0.1 \ 41 | --hash=sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d \ 42 | --hash=sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d 43 | # via -r requirements.in 44 | iniconfig==1.1.1 \ 45 | --hash=sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3 \ 46 | --hash=sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32 47 | # via pytest 48 | isort==5.10.1 \ 49 | --hash=sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7 \ 50 | --hash=sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951 51 | # via -r requirements.in 52 | maturin==0.12.16 \ 53 | --hash=sha256:29a635699db1f4981b891a4ee51ddcae8c410136ed40103232aea3b5f62e8504 \ 54 | --hash=sha256:4b5fe1de8b8e7ba5a9f52002b24a2d8148a23d1260c7bd59291c319ccc5b31f1 \ 55 | --hash=sha256:54ecff17c64cf5c5dc59ff22745517ea56b791995e70008d1dcd1623ce609f78 \ 56 | --hash=sha256:641c8ed8452cb8a288baf953be78d03e27e60189a64f00cc7bcc1731d158e8f6 \ 57 | --hash=sha256:70a042197fdcb726c911146a1c875f65f596de122a01eeb58af10faf3bd3a2c5 \ 58 | --hash=sha256:781abebb255061b5eda0413ecbac22b88a7ab50ecaee607fe5d8e3c55ab48e52 \ 59 | --hash=sha256:83a8f9378c320e981412f8d367e181af22f145d489a7da0a0c3aea86cf23f048 \ 60 | 
--hash=sha256:8aeb62a328bf4d9439758b59ccf5360a5f3127bbe58bedbcb6c64e888de3eb36 \ 61 | --hash=sha256:8d11e801216f4c91b2ba9da4bad615ffc3638f80a7ba973245a0154dcfdbee64 \ 62 | --hash=sha256:917f77382cdff55d2f290d0f58b7e6f4a7aaa74b58e2b61e4c67b37786d8a965 \ 63 | --hash=sha256:97756ad5ff113478de237b029add91def0a40af0dc5e120c25e1595addd9c151 \ 64 | --hash=sha256:9ce67a844d63d1ba8cdcf903ee2e6e0b21c0b0461b97c8737751d74002ded4c4 \ 65 | --hash=sha256:a026515e39fd48ee5318de57ddc6841a5fbcd5169b3860fb9ac9ea9521cc6027 \ 66 | --hash=sha256:bc4da52ef0c7e975396e7e6fb90da8858c518b4dccb810ceabec9db7ecedde57 \ 67 | --hash=sha256:d63f60dd5dddb165f824b2d8e593dcb31300d832eb6cbc6288dd484e29dfbd89 \ 68 | --hash=sha256:e5e4e3bfcf209ea1a6d20cade2de1ea716e17ea491a7a8b3fee0e45a10aa1e98 \ 69 | --hash=sha256:e7e3fa53c5207c05d4148ecbc0ce7463b7757989dadebcd8ab3a61c67b874157 70 | # via -r requirements.in 71 | mccabe==0.6.1 \ 72 | --hash=sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42 \ 73 | --hash=sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f 74 | # via flake8 75 | mypy==0.950 \ 76 | --hash=sha256:0112752a6ff07230f9ec2f71b0d3d4e088a910fdce454fdb6553e83ed0eced7d \ 77 | --hash=sha256:0384d9f3af49837baa92f559d3fa673e6d2652a16550a9ee07fc08c736f5e6f8 \ 78 | --hash=sha256:1b333cfbca1762ff15808a0ef4f71b5d3eed8528b23ea1c3fb50543c867d68de \ 79 | --hash=sha256:1fdeb0a0f64f2a874a4c1f5271f06e40e1e9779bf55f9567f149466fc7a55038 \ 80 | --hash=sha256:4c653e4846f287051599ed8f4b3c044b80e540e88feec76b11044ddc5612ffed \ 81 | --hash=sha256:563514c7dc504698fb66bb1cf897657a173a496406f1866afae73ab5b3cdb334 \ 82 | --hash=sha256:5b231afd6a6e951381b9ef09a1223b1feabe13625388db48a8690f8daa9b71ff \ 83 | --hash=sha256:5ce6a09042b6da16d773d2110e44f169683d8cc8687e79ec6d1181a72cb028d2 \ 84 | --hash=sha256:5e7647df0f8fc947388e6251d728189cfadb3b1e558407f93254e35abc026e22 \ 85 | --hash=sha256:6003de687c13196e8a1243a5e4bcce617d79b88f83ee6625437e335d89dfebe2 \ 86 | --hash=sha256:61504b9a5ae166ba5ecfed9e93357fd51aa693d3d434b582a925338a2ff57fd2 \ 87 | --hash=sha256:77423570c04aca807508a492037abbd72b12a1fb25a385847d191cd50b2c9605 \ 88 | --hash=sha256:a4d9898f46446bfb6405383b57b96737dcfd0a7f25b748e78ef3e8c576bba3cb \ 89 | --hash=sha256:a952b8bc0ae278fc6316e6384f67bb9a396eb30aced6ad034d3a76120ebcc519 \ 90 | --hash=sha256:b5b5bd0ffb11b4aba2bb6d31b8643902c48f990cc92fda4e21afac658044f0c0 \ 91 | --hash=sha256:ca75ecf2783395ca3016a5e455cb322ba26b6d33b4b413fcdedfc632e67941dc \ 92 | --hash=sha256:cf9c261958a769a3bd38c3e133801ebcd284ffb734ea12d01457cb09eacf7d7b \ 93 | --hash=sha256:dd4d670eee9610bf61c25c940e9ade2d0ed05eb44227275cce88701fee014b1f \ 94 | --hash=sha256:e19736af56947addedce4674c0971e5dceef1b5ec7d667fe86bcd2b07f8f9075 \ 95 | --hash=sha256:eaea21d150fb26d7b4856766e7addcf929119dd19fc832b22e71d942835201ef \ 96 | --hash=sha256:eaff8156016487c1af5ffa5304c3e3fd183edcb412f3e9c72db349faf3f6e0eb \ 97 | --hash=sha256:ee0a36edd332ed2c5208565ae6e3a7afc0eabb53f5327e281f2ef03a6bc7687a \ 98 | --hash=sha256:ef7beb2a3582eb7a9f37beaf38a28acfd801988cde688760aea9e6cc4832b10b 99 | # via -r requirements.in 100 | mypy-extensions==0.4.3 \ 101 | --hash=sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d \ 102 | --hash=sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8 103 | # via 104 | # black 105 | # mypy 106 | numpy==1.22.3 \ 107 | --hash=sha256:07a8c89a04997625236c5ecb7afe35a02af3896c8aa01890a849913a2309c676 \ 108 | --hash=sha256:08d9b008d0156c70dc392bb3ab3abb6e7a711383c3247b410b39962263576cd4 \ 
109 | --hash=sha256:201b4d0552831f7250a08d3b38de0d989d6f6e4658b709a02a73c524ccc6ffce \ 110 | --hash=sha256:2c10a93606e0b4b95c9b04b77dc349b398fdfbda382d2a39ba5a822f669a0123 \ 111 | --hash=sha256:3ca688e1b9b95d80250bca34b11a05e389b1420d00e87a0d12dc45f131f704a1 \ 112 | --hash=sha256:48a3aecd3b997bf452a2dedb11f4e79bc5bfd21a1d4cc760e703c31d57c84b3e \ 113 | --hash=sha256:568dfd16224abddafb1cbcce2ff14f522abe037268514dd7e42c6776a1c3f8e5 \ 114 | --hash=sha256:5bfb1bb598e8229c2d5d48db1860bcf4311337864ea3efdbe1171fb0c5da515d \ 115 | --hash=sha256:639b54cdf6aa4f82fe37ebf70401bbb74b8508fddcf4797f9fe59615b8c5813a \ 116 | --hash=sha256:8251ed96f38b47b4295b1ae51631de7ffa8260b5b087808ef09a39a9d66c97ab \ 117 | --hash=sha256:92bfa69cfbdf7dfc3040978ad09a48091143cffb778ec3b03fa170c494118d75 \ 118 | --hash=sha256:97098b95aa4e418529099c26558eeb8486e66bd1e53a6b606d684d0c3616b168 \ 119 | --hash=sha256:a3bae1a2ed00e90b3ba5f7bd0a7c7999b55d609e0c54ceb2b076a25e345fa9f4 \ 120 | --hash=sha256:c34ea7e9d13a70bf2ab64a2532fe149a9aced424cd05a2c4ba662fd989e3e45f \ 121 | --hash=sha256:dbc7601a3b7472d559dc7b933b18b4b66f9aa7452c120e87dfb33d02008c8a18 \ 122 | --hash=sha256:e7927a589df200c5e23c57970bafbd0cd322459aa7b1ff73b7c2e84d6e3eae62 \ 123 | --hash=sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe \ 124 | --hash=sha256:f950f8845b480cffe522913d35567e29dd381b0dc7e4ce6a4a9f9156417d2430 \ 125 | --hash=sha256:fade0d4f4d292b6f39951b6836d7a3c7ef5b2347f3c420cd9820a1d90d794802 \ 126 | --hash=sha256:fdf3c08bce27132395d3c3ba1503cac12e17282358cb4bddc25cc46b0aca07aa 127 | # via 128 | # -r requirements.in 129 | # pyarrow 130 | packaging==21.3 \ 131 | --hash=sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb \ 132 | --hash=sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522 133 | # via pytest 134 | pathspec==0.9.0 \ 135 | --hash=sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a \ 136 | --hash=sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1 137 | # via black 138 | platformdirs==2.5.2 \ 139 | --hash=sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788 \ 140 | --hash=sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19 141 | # via black 142 | pluggy==1.0.0 \ 143 | --hash=sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159 \ 144 | --hash=sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3 145 | # via pytest 146 | py==1.11.0 \ 147 | --hash=sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719 \ 148 | --hash=sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378 149 | # via pytest 150 | pyarrow==8.0.0 \ 151 | --hash=sha256:03a10daad957970e914920b793f6a49416699e791f4c827927fd4e4d892a5d16 \ 152 | --hash=sha256:15511ce2f50343f3fd5e9f7c30e4d004da9134e9597e93e9c96c3985928cbe82 \ 153 | --hash=sha256:1dd482ccb07c96188947ad94d7536ab696afde23ad172df8e18944ec79f55055 \ 154 | --hash=sha256:25a5f7c7f36df520b0b7363ba9f51c3070799d4b05d587c60c0adaba57763479 \ 155 | --hash=sha256:3bd201af6e01f475f02be88cf1f6ee9856ab98c11d8bbb6f58347c58cd07be00 \ 156 | --hash=sha256:3fee786259d986f8c046100ced54d63b0c8c9f7cdb7d1bbe07dc69e0f928141c \ 157 | --hash=sha256:42b7982301a9ccd06e1dd4fabd2e8e5df74b93ce4c6b87b81eb9e2d86dc79871 \ 158 | --hash=sha256:4a18a211ed888f1ac0b0ebcb99e2d9a3e913a481120ee9b1fe33d3fedb945d4e \ 159 | --hash=sha256:51e58778fcb8829fca37fbfaea7f208d5ce7ea89ea133dd13d8ce745278ee6f0 \ 160 | 
--hash=sha256:541e7845ce5f27a861eb5b88ee165d931943347eec17b9ff1e308663531c9647 \ 161 | --hash=sha256:65c7f4cc2be195e3db09296d31a654bb6d8786deebcab00f0e2455fd109d7456 \ 162 | --hash=sha256:69b043a3fce064ebd9fbae6abc30e885680296e5bd5e6f7353e6a87966cf2ad7 \ 163 | --hash=sha256:6ea2c54e6b5ecd64e8299d2abb40770fe83a718f5ddc3825ddd5cd28e352cce1 \ 164 | --hash=sha256:78a6ac39cd793582998dac88ab5c1c1dd1e6503df6672f064f33a21937ec1d8d \ 165 | --hash=sha256:81b87b782a1366279411f7b235deab07c8c016e13f9af9f7c7b0ee564fedcc8f \ 166 | --hash=sha256:8392b9a1e837230090fe916415ed4c3433b2ddb1a798e3f6438303c70fbabcfc \ 167 | --hash=sha256:863be6bad6c53797129610930794a3e797cb7d41c0a30e6794a2ac0e42ce41b8 \ 168 | --hash=sha256:8cd86e04a899bef43e25184f4b934584861d787cf7519851a8c031803d45c6d8 \ 169 | --hash=sha256:95c7822eb37663e073da9892f3499fe28e84f3464711a3e555e0c5463fd53a19 \ 170 | --hash=sha256:98c13b2e28a91b0fbf24b483df54a8d7814c074c2623ecef40dce1fa52f6539b \ 171 | --hash=sha256:ba2b7aa7efb59156b87987a06f5241932914e4d5bbb74a465306b00a6c808849 \ 172 | --hash=sha256:c9c97c8e288847e091dfbcdf8ce51160e638346f51919a9e74fe038b2e8aee62 \ 173 | --hash=sha256:cb06cacc19f3b426681f2f6803cc06ff481e7fe5b3a533b406bc5b2138843d4f \ 174 | --hash=sha256:ce64bc1da3109ef5ab9e4c60316945a7239c798098a631358e9ab39f6e5529e9 \ 175 | --hash=sha256:d5ef4372559b191cafe7db8932801eee252bfc35e983304e7d60b6954576a071 \ 176 | --hash=sha256:d6f1e1040413651819074ef5b500835c6c42e6c446532a1ddef8bc5054e8dba5 \ 177 | --hash=sha256:deb400df8f19a90b662babceb6dd12daddda6bb357c216e558b207c0770c7654 \ 178 | --hash=sha256:ea132067ec712d1b1116a841db1c95861508862b21eddbcafefbce8e4b96b867 \ 179 | --hash=sha256:ece333706a94c1221ced8b299042f85fd88b5db802d71be70024433ddf3aecab \ 180 | --hash=sha256:edad25522ad509e534400d6ab98cf1872d30c31bc5e947712bfd57def7af15bb 181 | # via -r requirements.in 182 | pycodestyle==2.8.0 \ 183 | --hash=sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20 \ 184 | --hash=sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f 185 | # via flake8 186 | pyflakes==2.4.0 \ 187 | --hash=sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c \ 188 | --hash=sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e 189 | # via flake8 190 | pyparsing==3.0.9 \ 191 | --hash=sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb \ 192 | --hash=sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc 193 | # via packaging 194 | pytest==7.1.2 \ 195 | --hash=sha256:13d0e3ccfc2b6e26be000cb6568c832ba67ba32e719443bfe725814d3c42433c \ 196 | --hash=sha256:a06a0425453864a270bc45e71f783330a7428defb4230fb5e6a731fde06ecd45 197 | # via -r requirements.in 198 | toml==0.10.2 \ 199 | --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ 200 | --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f 201 | # via -r requirements.in 202 | tomli==2.0.1 \ 203 | --hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \ 204 | --hash=sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f 205 | # via 206 | # black 207 | # maturin 208 | # mypy 209 | # pytest 210 | typing-extensions==4.2.0 \ 211 | --hash=sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708 \ 212 | --hash=sha256:f1c24655a0da0d1b67f07e17a5e6b2a105894e6824b92096378bb3668ef02376 213 | # via mypy 214 | -------------------------------------------------------------------------------- /requirements-37.txt: 
-------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.7 3 | # To update, run: 4 | # 5 | # pip-compile --generate-hashes 6 | # 7 | attrs==21.4.0 \ 8 | --hash=sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4 \ 9 | --hash=sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd 10 | # via pytest 11 | black==22.3.0 \ 12 | --hash=sha256:06f9d8846f2340dfac80ceb20200ea5d1b3f181dd0556b47af4e8e0b24fa0a6b \ 13 | --hash=sha256:10dbe6e6d2988049b4655b2b739f98785a884d4d6b85bc35133a8fb9a2233176 \ 14 | --hash=sha256:2497f9c2386572e28921fa8bec7be3e51de6801f7459dffd6e62492531c47e09 \ 15 | --hash=sha256:30d78ba6bf080eeaf0b7b875d924b15cd46fec5fd044ddfbad38c8ea9171043a \ 16 | --hash=sha256:328efc0cc70ccb23429d6be184a15ce613f676bdfc85e5fe8ea2a9354b4e9015 \ 17 | --hash=sha256:35020b8886c022ced9282b51b5a875b6d1ab0c387b31a065b84db7c33085ca79 \ 18 | --hash=sha256:5795a0375eb87bfe902e80e0c8cfaedf8af4d49694d69161e5bd3206c18618bb \ 19 | --hash=sha256:5891ef8abc06576985de8fa88e95ab70641de6c1fca97e2a15820a9b69e51b20 \ 20 | --hash=sha256:637a4014c63fbf42a692d22b55d8ad6968a946b4a6ebc385c5505d9625b6a464 \ 21 | --hash=sha256:67c8301ec94e3bcc8906740fe071391bce40a862b7be0b86fb5382beefecd968 \ 22 | --hash=sha256:6d2fc92002d44746d3e7db7cf9313cf4452f43e9ea77a2c939defce3b10b5c82 \ 23 | --hash=sha256:6ee227b696ca60dd1c507be80a6bc849a5a6ab57ac7352aad1ffec9e8b805f21 \ 24 | --hash=sha256:863714200ada56cbc366dc9ae5291ceb936573155f8bf8e9de92aef51f3ad0f0 \ 25 | --hash=sha256:9b542ced1ec0ceeff5b37d69838106a6348e60db7b8fdd245294dc1d26136265 \ 26 | --hash=sha256:a6342964b43a99dbc72f72812bf88cad8f0217ae9acb47c0d4f141a6416d2d7b \ 27 | --hash=sha256:ad4efa5fad66b903b4a5f96d91461d90b9507a812b3c5de657d544215bb7877a \ 28 | --hash=sha256:bc58025940a896d7e5356952228b68f793cf5fcb342be703c3a2669a1488cb72 \ 29 | --hash=sha256:cc1e1de68c8e5444e8f94c3670bb48a2beef0e91dddfd4fcc29595ebd90bb9ce \ 30 | --hash=sha256:cee3e11161dde1b2a33a904b850b0899e0424cc331b7295f2a9698e79f9a69a0 \ 31 | --hash=sha256:e3556168e2e5c49629f7b0f377070240bd5511e45e25a4497bb0073d9dda776a \ 32 | --hash=sha256:e8477ec6bbfe0312c128e74644ac8a02ca06bcdb8982d4ee06f209be28cdf163 \ 33 | --hash=sha256:ee8f1f7228cce7dffc2b464f07ce769f478968bfb3dd1254a4c2eeed84928aad \ 34 | --hash=sha256:fd57160949179ec517d32ac2ac898b5f20d68ed1a9c977346efbac9c2f1e779d 35 | # via -r requirements.in 36 | click==8.1.3 \ 37 | --hash=sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e \ 38 | --hash=sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48 39 | # via black 40 | flake8==4.0.1 \ 41 | --hash=sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d \ 42 | --hash=sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d 43 | # via -r requirements.in 44 | importlib-metadata==4.2.0 ; python_version < "3.8" \ 45 | --hash=sha256:057e92c15bc8d9e8109738a48db0ccb31b4d9d5cfbee5a8670879a30be66304b \ 46 | --hash=sha256:b7e52a1f8dec14a75ea73e0891f3060099ca1d8e6a462a4dff11c3e119ea1b31 47 | # via 48 | # -r requirements.in 49 | # click 50 | # flake8 51 | # pluggy 52 | # pytest 53 | iniconfig==1.1.1 \ 54 | --hash=sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3 \ 55 | --hash=sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32 56 | # via pytest 57 | isort==5.10.1 \ 58 | --hash=sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7 \ 59 | 
--hash=sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951 60 | # via -r requirements.in 61 | maturin==0.12.16 \ 62 | --hash=sha256:29a635699db1f4981b891a4ee51ddcae8c410136ed40103232aea3b5f62e8504 \ 63 | --hash=sha256:4b5fe1de8b8e7ba5a9f52002b24a2d8148a23d1260c7bd59291c319ccc5b31f1 \ 64 | --hash=sha256:54ecff17c64cf5c5dc59ff22745517ea56b791995e70008d1dcd1623ce609f78 \ 65 | --hash=sha256:641c8ed8452cb8a288baf953be78d03e27e60189a64f00cc7bcc1731d158e8f6 \ 66 | --hash=sha256:70a042197fdcb726c911146a1c875f65f596de122a01eeb58af10faf3bd3a2c5 \ 67 | --hash=sha256:781abebb255061b5eda0413ecbac22b88a7ab50ecaee607fe5d8e3c55ab48e52 \ 68 | --hash=sha256:83a8f9378c320e981412f8d367e181af22f145d489a7da0a0c3aea86cf23f048 \ 69 | --hash=sha256:8aeb62a328bf4d9439758b59ccf5360a5f3127bbe58bedbcb6c64e888de3eb36 \ 70 | --hash=sha256:8d11e801216f4c91b2ba9da4bad615ffc3638f80a7ba973245a0154dcfdbee64 \ 71 | --hash=sha256:917f77382cdff55d2f290d0f58b7e6f4a7aaa74b58e2b61e4c67b37786d8a965 \ 72 | --hash=sha256:97756ad5ff113478de237b029add91def0a40af0dc5e120c25e1595addd9c151 \ 73 | --hash=sha256:9ce67a844d63d1ba8cdcf903ee2e6e0b21c0b0461b97c8737751d74002ded4c4 \ 74 | --hash=sha256:a026515e39fd48ee5318de57ddc6841a5fbcd5169b3860fb9ac9ea9521cc6027 \ 75 | --hash=sha256:bc4da52ef0c7e975396e7e6fb90da8858c518b4dccb810ceabec9db7ecedde57 \ 76 | --hash=sha256:d63f60dd5dddb165f824b2d8e593dcb31300d832eb6cbc6288dd484e29dfbd89 \ 77 | --hash=sha256:e5e4e3bfcf209ea1a6d20cade2de1ea716e17ea491a7a8b3fee0e45a10aa1e98 \ 78 | --hash=sha256:e7e3fa53c5207c05d4148ecbc0ce7463b7757989dadebcd8ab3a61c67b874157 79 | # via -r requirements.in 80 | mccabe==0.6.1 \ 81 | --hash=sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42 \ 82 | --hash=sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f 83 | # via flake8 84 | mypy==0.950 \ 85 | --hash=sha256:0112752a6ff07230f9ec2f71b0d3d4e088a910fdce454fdb6553e83ed0eced7d \ 86 | --hash=sha256:0384d9f3af49837baa92f559d3fa673e6d2652a16550a9ee07fc08c736f5e6f8 \ 87 | --hash=sha256:1b333cfbca1762ff15808a0ef4f71b5d3eed8528b23ea1c3fb50543c867d68de \ 88 | --hash=sha256:1fdeb0a0f64f2a874a4c1f5271f06e40e1e9779bf55f9567f149466fc7a55038 \ 89 | --hash=sha256:4c653e4846f287051599ed8f4b3c044b80e540e88feec76b11044ddc5612ffed \ 90 | --hash=sha256:563514c7dc504698fb66bb1cf897657a173a496406f1866afae73ab5b3cdb334 \ 91 | --hash=sha256:5b231afd6a6e951381b9ef09a1223b1feabe13625388db48a8690f8daa9b71ff \ 92 | --hash=sha256:5ce6a09042b6da16d773d2110e44f169683d8cc8687e79ec6d1181a72cb028d2 \ 93 | --hash=sha256:5e7647df0f8fc947388e6251d728189cfadb3b1e558407f93254e35abc026e22 \ 94 | --hash=sha256:6003de687c13196e8a1243a5e4bcce617d79b88f83ee6625437e335d89dfebe2 \ 95 | --hash=sha256:61504b9a5ae166ba5ecfed9e93357fd51aa693d3d434b582a925338a2ff57fd2 \ 96 | --hash=sha256:77423570c04aca807508a492037abbd72b12a1fb25a385847d191cd50b2c9605 \ 97 | --hash=sha256:a4d9898f46446bfb6405383b57b96737dcfd0a7f25b748e78ef3e8c576bba3cb \ 98 | --hash=sha256:a952b8bc0ae278fc6316e6384f67bb9a396eb30aced6ad034d3a76120ebcc519 \ 99 | --hash=sha256:b5b5bd0ffb11b4aba2bb6d31b8643902c48f990cc92fda4e21afac658044f0c0 \ 100 | --hash=sha256:ca75ecf2783395ca3016a5e455cb322ba26b6d33b4b413fcdedfc632e67941dc \ 101 | --hash=sha256:cf9c261958a769a3bd38c3e133801ebcd284ffb734ea12d01457cb09eacf7d7b \ 102 | --hash=sha256:dd4d670eee9610bf61c25c940e9ade2d0ed05eb44227275cce88701fee014b1f \ 103 | --hash=sha256:e19736af56947addedce4674c0971e5dceef1b5ec7d667fe86bcd2b07f8f9075 \ 104 | 
--hash=sha256:eaea21d150fb26d7b4856766e7addcf929119dd19fc832b22e71d942835201ef \ 105 | --hash=sha256:eaff8156016487c1af5ffa5304c3e3fd183edcb412f3e9c72db349faf3f6e0eb \ 106 | --hash=sha256:ee0a36edd332ed2c5208565ae6e3a7afc0eabb53f5327e281f2ef03a6bc7687a \ 107 | --hash=sha256:ef7beb2a3582eb7a9f37beaf38a28acfd801988cde688760aea9e6cc4832b10b 108 | # via -r requirements.in 109 | mypy-extensions==0.4.3 \ 110 | --hash=sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d \ 111 | --hash=sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8 112 | # via 113 | # black 114 | # mypy 115 | numpy==1.21.6 \ 116 | --hash=sha256:1dbe1c91269f880e364526649a52eff93ac30035507ae980d2fed33aaee633ac \ 117 | --hash=sha256:357768c2e4451ac241465157a3e929b265dfac85d9214074985b1786244f2ef3 \ 118 | --hash=sha256:3820724272f9913b597ccd13a467cc492a0da6b05df26ea09e78b171a0bb9da6 \ 119 | --hash=sha256:4391bd07606be175aafd267ef9bea87cf1b8210c787666ce82073b05f202add1 \ 120 | --hash=sha256:4aa48afdce4660b0076a00d80afa54e8a97cd49f457d68a4342d188a09451c1a \ 121 | --hash=sha256:58459d3bad03343ac4b1b42ed14d571b8743dc80ccbf27444f266729df1d6f5b \ 122 | --hash=sha256:5c3c8def4230e1b959671eb959083661b4a0d2e9af93ee339c7dada6759a9470 \ 123 | --hash=sha256:5f30427731561ce75d7048ac254dbe47a2ba576229250fb60f0fb74db96501a1 \ 124 | --hash=sha256:643843bcc1c50526b3a71cd2ee561cf0d8773f062c8cbaf9ffac9fdf573f83ab \ 125 | --hash=sha256:67c261d6c0a9981820c3a149d255a76918278a6b03b6a036800359aba1256d46 \ 126 | --hash=sha256:67f21981ba2f9d7ba9ade60c9e8cbaa8cf8e9ae51673934480e45cf55e953673 \ 127 | --hash=sha256:6aaf96c7f8cebc220cdfc03f1d5a31952f027dda050e5a703a0d1c396075e3e7 \ 128 | --hash=sha256:7c4068a8c44014b2d55f3c3f574c376b2494ca9cc73d2f1bd692382b6dffe3db \ 129 | --hash=sha256:7c7e5fa88d9ff656e067876e4736379cc962d185d5cd808014a8a928d529ef4e \ 130 | --hash=sha256:7f5ae4f304257569ef3b948810816bc87c9146e8c446053539947eedeaa32786 \ 131 | --hash=sha256:82691fda7c3f77c90e62da69ae60b5ac08e87e775b09813559f8901a88266552 \ 132 | --hash=sha256:8737609c3bbdd48e380d463134a35ffad3b22dc56295eff6f79fd85bd0eeeb25 \ 133 | --hash=sha256:9f411b2c3f3d76bba0865b35a425157c5dcf54937f82bbeb3d3c180789dd66a6 \ 134 | --hash=sha256:a6be4cb0ef3b8c9250c19cc122267263093eee7edd4e3fa75395dfda8c17a8e2 \ 135 | --hash=sha256:bcb238c9c96c00d3085b264e5c1a1207672577b93fa666c3b14a45240b14123a \ 136 | --hash=sha256:bf2ec4b75d0e9356edea834d1de42b31fe11f726a81dfb2c2112bc1eaa508fcf \ 137 | --hash=sha256:d136337ae3cc69aa5e447e78d8e1514be8c3ec9b54264e680cf0b4bd9011574f \ 138 | --hash=sha256:d4bf4d43077db55589ffc9009c0ba0a94fa4908b9586d6ccce2e0b164c86303c \ 139 | --hash=sha256:d6a96eef20f639e6a97d23e57dd0c1b1069a7b4fd7027482a4c5c451cd7732f4 \ 140 | --hash=sha256:d9caa9d5e682102453d96a0ee10c7241b72859b01a941a397fd965f23b3e016b \ 141 | --hash=sha256:dd1c8f6bd65d07d3810b90d02eba7997e32abbdf1277a481d698969e921a3be0 \ 142 | --hash=sha256:e31f0bb5928b793169b87e3d1e070f2342b22d5245c755e2b81caa29756246c3 \ 143 | --hash=sha256:ecb55251139706669fdec2ff073c98ef8e9a84473e51e716211b41aa0f18e656 \ 144 | --hash=sha256:ee5ec40fdd06d62fe5d4084bef4fd50fd4bb6bfd2bf519365f569dc470163ab0 \ 145 | --hash=sha256:f17e562de9edf691a42ddb1eb4a5541c20dd3f9e65b09ded2beb0799c0cf29bb \ 146 | --hash=sha256:fdffbfb6832cd0b300995a2b08b8f6fa9f6e856d562800fea9182316d99c4e8e 147 | # via 148 | # -r requirements.in 149 | # pyarrow 150 | packaging==21.3 \ 151 | --hash=sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb \ 152 | 
--hash=sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522 153 | # via pytest 154 | pathspec==0.9.0 \ 155 | --hash=sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a \ 156 | --hash=sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1 157 | # via black 158 | platformdirs==2.5.2 \ 159 | --hash=sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788 \ 160 | --hash=sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19 161 | # via black 162 | pluggy==1.0.0 \ 163 | --hash=sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159 \ 164 | --hash=sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3 165 | # via pytest 166 | py==1.11.0 \ 167 | --hash=sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719 \ 168 | --hash=sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378 169 | # via pytest 170 | pyarrow==8.0.0 \ 171 | --hash=sha256:03a10daad957970e914920b793f6a49416699e791f4c827927fd4e4d892a5d16 \ 172 | --hash=sha256:15511ce2f50343f3fd5e9f7c30e4d004da9134e9597e93e9c96c3985928cbe82 \ 173 | --hash=sha256:1dd482ccb07c96188947ad94d7536ab696afde23ad172df8e18944ec79f55055 \ 174 | --hash=sha256:25a5f7c7f36df520b0b7363ba9f51c3070799d4b05d587c60c0adaba57763479 \ 175 | --hash=sha256:3bd201af6e01f475f02be88cf1f6ee9856ab98c11d8bbb6f58347c58cd07be00 \ 176 | --hash=sha256:3fee786259d986f8c046100ced54d63b0c8c9f7cdb7d1bbe07dc69e0f928141c \ 177 | --hash=sha256:42b7982301a9ccd06e1dd4fabd2e8e5df74b93ce4c6b87b81eb9e2d86dc79871 \ 178 | --hash=sha256:4a18a211ed888f1ac0b0ebcb99e2d9a3e913a481120ee9b1fe33d3fedb945d4e \ 179 | --hash=sha256:51e58778fcb8829fca37fbfaea7f208d5ce7ea89ea133dd13d8ce745278ee6f0 \ 180 | --hash=sha256:541e7845ce5f27a861eb5b88ee165d931943347eec17b9ff1e308663531c9647 \ 181 | --hash=sha256:65c7f4cc2be195e3db09296d31a654bb6d8786deebcab00f0e2455fd109d7456 \ 182 | --hash=sha256:69b043a3fce064ebd9fbae6abc30e885680296e5bd5e6f7353e6a87966cf2ad7 \ 183 | --hash=sha256:6ea2c54e6b5ecd64e8299d2abb40770fe83a718f5ddc3825ddd5cd28e352cce1 \ 184 | --hash=sha256:78a6ac39cd793582998dac88ab5c1c1dd1e6503df6672f064f33a21937ec1d8d \ 185 | --hash=sha256:81b87b782a1366279411f7b235deab07c8c016e13f9af9f7c7b0ee564fedcc8f \ 186 | --hash=sha256:8392b9a1e837230090fe916415ed4c3433b2ddb1a798e3f6438303c70fbabcfc \ 187 | --hash=sha256:863be6bad6c53797129610930794a3e797cb7d41c0a30e6794a2ac0e42ce41b8 \ 188 | --hash=sha256:8cd86e04a899bef43e25184f4b934584861d787cf7519851a8c031803d45c6d8 \ 189 | --hash=sha256:95c7822eb37663e073da9892f3499fe28e84f3464711a3e555e0c5463fd53a19 \ 190 | --hash=sha256:98c13b2e28a91b0fbf24b483df54a8d7814c074c2623ecef40dce1fa52f6539b \ 191 | --hash=sha256:ba2b7aa7efb59156b87987a06f5241932914e4d5bbb74a465306b00a6c808849 \ 192 | --hash=sha256:c9c97c8e288847e091dfbcdf8ce51160e638346f51919a9e74fe038b2e8aee62 \ 193 | --hash=sha256:cb06cacc19f3b426681f2f6803cc06ff481e7fe5b3a533b406bc5b2138843d4f \ 194 | --hash=sha256:ce64bc1da3109ef5ab9e4c60316945a7239c798098a631358e9ab39f6e5529e9 \ 195 | --hash=sha256:d5ef4372559b191cafe7db8932801eee252bfc35e983304e7d60b6954576a071 \ 196 | --hash=sha256:d6f1e1040413651819074ef5b500835c6c42e6c446532a1ddef8bc5054e8dba5 \ 197 | --hash=sha256:deb400df8f19a90b662babceb6dd12daddda6bb357c216e558b207c0770c7654 \ 198 | --hash=sha256:ea132067ec712d1b1116a841db1c95861508862b21eddbcafefbce8e4b96b867 \ 199 | --hash=sha256:ece333706a94c1221ced8b299042f85fd88b5db802d71be70024433ddf3aecab \ 200 | 
--hash=sha256:edad25522ad509e534400d6ab98cf1872d30c31bc5e947712bfd57def7af15bb 201 | # via -r requirements.in 202 | pycodestyle==2.8.0 \ 203 | --hash=sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20 \ 204 | --hash=sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f 205 | # via flake8 206 | pyflakes==2.4.0 \ 207 | --hash=sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c \ 208 | --hash=sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e 209 | # via flake8 210 | pyparsing==3.0.9 \ 211 | --hash=sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb \ 212 | --hash=sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc 213 | # via packaging 214 | pytest==7.1.2 \ 215 | --hash=sha256:13d0e3ccfc2b6e26be000cb6568c832ba67ba32e719443bfe725814d3c42433c \ 216 | --hash=sha256:a06a0425453864a270bc45e71f783330a7428defb4230fb5e6a731fde06ecd45 217 | # via -r requirements.in 218 | toml==0.10.2 \ 219 | --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ 220 | --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f 221 | # via -r requirements.in 222 | tomli==2.0.1 \ 223 | --hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \ 224 | --hash=sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f 225 | # via 226 | # black 227 | # maturin 228 | # mypy 229 | # pytest 230 | typed-ast==1.5.3 \ 231 | --hash=sha256:20d5118e494478ef2d3a2702d964dae830aedd7b4d3b626d003eea526be18718 \ 232 | --hash=sha256:27e46cdd01d6c3a0dd8f728b6a938a6751f7bd324817501c15fb056307f918c6 \ 233 | --hash=sha256:27f25232e2dd0edfe1f019d6bfaaf11e86e657d9bdb7b0956db95f560cceb2b3 \ 234 | --hash=sha256:3042bfc9ca118712c9809201f55355479cfcdc17449f9f8db5e744e9625c6805 \ 235 | --hash=sha256:37e5349d1d5de2f4763d534ccb26809d1c24b180a477659a12c4bde9dd677d74 \ 236 | --hash=sha256:4fff9fdcce59dc61ec1b317bdb319f8f4e6b69ebbe61193ae0a60c5f9333dc49 \ 237 | --hash=sha256:542cd732351ba8235f20faa0fc7398946fe1a57f2cdb289e5497e1e7f48cfedb \ 238 | --hash=sha256:5dc2c11ae59003d4a26dda637222d9ae924387f96acae9492df663843aefad55 \ 239 | --hash=sha256:8831479695eadc8b5ffed06fdfb3e424adc37962a75925668deeb503f446c0a3 \ 240 | --hash=sha256:8cdf91b0c466a6c43f36c1964772918a2c04cfa83df8001ff32a89e357f8eb06 \ 241 | --hash=sha256:8e0b8528838ffd426fea8d18bde4c73bcb4167218998cc8b9ee0a0f2bfe678a6 \ 242 | --hash=sha256:8ef1d96ad05a291f5c36895d86d1375c0ee70595b90f6bb5f5fdbee749b146db \ 243 | --hash=sha256:9ad3b48cf2b487be140072fb86feff36801487d4abb7382bb1929aaac80638ea \ 244 | --hash=sha256:9cc9e1457e1feb06b075c8ef8aeb046a28ec351b1958b42c7c31c989c841403a \ 245 | --hash=sha256:9e237e74fd321a55c90eee9bc5d44be976979ad38a29bbd734148295c1ce7617 \ 246 | --hash=sha256:c9f1a27592fac87daa4e3f16538713d705599b0a27dfe25518b80b6b017f0a6d \ 247 | --hash=sha256:d64dabc6336ddc10373922a146fa2256043b3b43e61f28961caec2a5207c56d5 \ 248 | --hash=sha256:e20d196815eeffb3d76b75223e8ffed124e65ee62097e4e73afb5fec6b993e7a \ 249 | --hash=sha256:e34f9b9e61333ecb0f7d79c21c28aa5cd63bec15cb7e1310d7d3da6ce886bc9b \ 250 | --hash=sha256:ed44e81517364cb5ba367e4f68fca01fba42a7a4690d40c07886586ac267d9b9 \ 251 | --hash=sha256:ee852185964744987609b40aee1d2eb81502ae63ee8eef614558f96a56c1902d \ 252 | --hash=sha256:f60d9de0d087454c91b3999a296d0c4558c1666771e3460621875021bf899af9 \ 253 | --hash=sha256:f818c5b81966d4728fec14caa338e30a70dfc3da577984d38f97816c4b3071ec \ 254 | 
--hash=sha256:fd5df1313915dbd70eaaa88c19030b441742e8b05e6103c631c83b75e0435ccc 255 | # via 256 | # black 257 | # mypy 258 | typing-extensions==4.2.0 \ 259 | --hash=sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708 \ 260 | --hash=sha256:f1c24655a0da0d1b67f07e17a5e6b2a105894e6824b92096378bb3668ef02376 261 | # via 262 | # black 263 | # importlib-metadata 264 | # mypy 265 | zipp==3.8.0 \ 266 | --hash=sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad \ 267 | --hash=sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099 268 | # via importlib-metadata 269 | -------------------------------------------------------------------------------- /requirements.in: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | black 19 | flake8 20 | isort 21 | maturin 22 | mypy 23 | numpy 24 | pyarrow 25 | pytest 26 | toml 27 | importlib_metadata; python_version < "3.8" 28 | -------------------------------------------------------------------------------- /src/catalog.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
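// Illustrative sketch: how these wrappers are meant to surface on the Python
// side, judging from the #[pyclass(name = ...)] attributes and methods below.
// The import path assumes the pure-Python layer in the datafusion directory
// re-exports SessionContext.
//
//     from datafusion import SessionContext
//
//     ctx = SessionContext()
//     catalog = ctx.catalog("datafusion")    # default catalog name (see context.rs)
//     database = catalog.database("public")  # default schema name
//     print(database.names())                # set of table names
//     table = database.table("my_table")     # raises KeyError if missing
//     print(table.schema, table.kind)        # pyarrow schema; "physical"/"view"/"temporary"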
17 | 18 | use std::collections::HashSet; 19 | use std::sync::Arc; 20 | 21 | use pyo3::exceptions::PyKeyError; 22 | use pyo3::prelude::*; 23 | 24 | use datafusion::{ 25 | arrow::pyarrow::PyArrowConvert, 26 | catalog::{catalog::CatalogProvider, schema::SchemaProvider}, 27 | datasource::{TableProvider, TableType}, 28 | }; 29 | 30 | #[pyclass(name = "Catalog", module = "datafusion", subclass)] 31 | pub(crate) struct PyCatalog { 32 | catalog: Arc<dyn CatalogProvider>, 33 | } 34 | 35 | #[pyclass(name = "Database", module = "datafusion", subclass)] 36 | pub(crate) struct PyDatabase { 37 | database: Arc<dyn SchemaProvider>, 38 | } 39 | 40 | #[pyclass(name = "Table", module = "datafusion", subclass)] 41 | pub struct PyTable { 42 | table: Arc<dyn TableProvider>, 43 | } 44 | 45 | impl PyCatalog { 46 | pub fn new(catalog: Arc<dyn CatalogProvider>) -> Self { 47 | Self { catalog } 48 | } 49 | } 50 | 51 | impl PyDatabase { 52 | pub fn new(database: Arc<dyn SchemaProvider>) -> Self { 53 | Self { database } 54 | } 55 | } 56 | 57 | impl PyTable { 58 | pub fn new(table: Arc<dyn TableProvider>) -> Self { 59 | Self { table } 60 | } 61 | 62 | pub fn table(&self) -> Arc<dyn TableProvider> { 63 | self.table.clone() 64 | } 65 | } 66 | 67 | #[pymethods] 68 | impl PyCatalog { 69 | fn names(&self) -> Vec<String> { 70 | self.catalog.schema_names() 71 | } 72 | 73 | #[args(name = "\"public\"")] 74 | fn database(&self, name: &str) -> PyResult<PyDatabase> { 75 | match self.catalog.schema(name) { 76 | Some(database) => Ok(PyDatabase::new(database)), 77 | None => Err(PyKeyError::new_err(format!( 78 | "Database with name {} doesn't exist.", 79 | name 80 | ))), 81 | } 82 | } 83 | } 84 | 85 | #[pymethods] 86 | impl PyDatabase { 87 | fn names(&self) -> HashSet<String> { 88 | self.database.table_names().into_iter().collect() 89 | } 90 | 91 | fn table(&self, name: &str) -> PyResult<PyTable> { 92 | match self.database.table(name) { 93 | Some(table) => Ok(PyTable::new(table)), 94 | None => Err(PyKeyError::new_err(format!( 95 | "Table with name {} doesn't exist.", 96 | name 97 | ))), 98 | } 99 | } 100 | 101 | // register_table 102 | // deregister_table 103 | } 104 | 105 | #[pymethods] 106 | impl PyTable { 107 | /// Get a reference to the schema for this table 108 | #[getter] 109 | fn schema(&self, py: Python) -> PyResult<PyObject> { 110 | self.table.schema().to_pyarrow(py) 111 | } 112 | 113 | /// Get the type of this table for metadata/catalog purposes. 114 | #[getter] 115 | fn kind(&self) -> &str { 116 | match self.table.table_type() { 117 | TableType::Base => "physical", 118 | TableType::View => "view", 119 | TableType::Temporary => "temporary", 120 | } 121 | } 122 | 123 | // fn scan 124 | // fn statistics 125 | // fn has_exact_statistics 126 | // fn supports_filter_pushdown 127 | } 128 | -------------------------------------------------------------------------------- /src/context.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied.
See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::path::PathBuf; 19 | use std::{collections::HashSet, sync::Arc}; 20 | 21 | use uuid::Uuid; 22 | 23 | use pyo3::exceptions::{PyKeyError, PyValueError}; 24 | use pyo3::prelude::*; 25 | 26 | use datafusion::arrow::datatypes::Schema; 27 | use datafusion::arrow::record_batch::RecordBatch; 28 | use datafusion::datasource::MemTable; 29 | use datafusion::execution::context::SessionContext; 30 | use datafusion::prelude::{CsvReadOptions, ParquetReadOptions}; 31 | 32 | use crate::catalog::{PyCatalog, PyTable}; 33 | use crate::dataframe::PyDataFrame; 34 | use crate::errors::DataFusionError; 35 | use crate::udf::PyScalarUDF; 36 | use crate::utils::wait_for_future; 37 | 38 | /// `PySessionContext` is able to plan and execute DataFusion plans. 39 | /// It has a powerful optimizer, a physical planner for local execution, and a 40 | /// multi-threaded execution engine. 41 | #[pyclass(name = "SessionContext", module = "datafusion", subclass, unsendable)] 42 | pub(crate) struct PySessionContext { 43 | ctx: SessionContext, 44 | } 45 | 46 | #[pymethods] 47 | impl PySessionContext { 48 | // TODO(kszucs): should expose the configuration options as keyword arguments 49 | #[new] 50 | fn new() -> Self { 51 | PySessionContext { 52 | ctx: SessionContext::new(), 53 | } 54 | } 55 | 56 | /// Returns a PyDataFrame whose plan corresponds to the SQL statement. 57 | fn sql(&mut self, query: &str, py: Python) -> PyResult<PyDataFrame> { 58 | let result = self.ctx.sql(query); 59 | let df = wait_for_future(py, result).map_err(DataFusionError::from)?; 60 | Ok(PyDataFrame::new(df)) 61 | } 62 | 63 | fn create_dataframe(&mut self, partitions: Vec<Vec<RecordBatch>>) -> PyResult<PyDataFrame> { 64 | let table = MemTable::try_new(partitions[0][0].schema(), partitions) 65 | .map_err(DataFusionError::from)?; 66 | 67 | // generate a random (unique) name for this table; 68 | // a table name cannot start with a numeric digit 69 | let name = "c".to_owned() 70 | + Uuid::new_v4() 71 | .to_simple() 72 | .encode_lower(&mut Uuid::encode_buffer()); 73 | 74 | self.ctx 75 | .register_table(&*name, Arc::new(table)) 76 | .map_err(DataFusionError::from)?; 77 | let table = self.ctx.table(&*name).map_err(DataFusionError::from)?; 78 | 79 | let df = PyDataFrame::new(table); 80 | Ok(df) 81 | } 82 | 83 | fn register_table(&mut self, name: &str, table: &PyTable) -> PyResult<()> { 84 | self.ctx 85 | .register_table(name, table.table()) 86 | .map_err(DataFusionError::from)?; 87 | Ok(()) 88 | } 89 | 90 | fn deregister_table(&mut self, name: &str) -> PyResult<()> { 91 | self.ctx 92 | .deregister_table(name) 93 | .map_err(DataFusionError::from)?; 94 | Ok(()) 95 | } 96 | 97 | fn register_record_batches( 98 | &mut self, 99 | name: &str, 100 | partitions: Vec<Vec<RecordBatch>>, 101 | ) -> PyResult<()> { 102 | let schema = partitions[0][0].schema(); 103 | let table = MemTable::try_new(schema, partitions)?; 104 | self.ctx 105 | .register_table(name, Arc::new(table)) 106 | .map_err(DataFusionError::from)?; 107 | Ok(()) 108 | } 109 | 110 | #[allow(clippy::too_many_arguments)] 111 | #[args( 112 | table_partition_cols = "vec![]", 113 | parquet_pruning = "true", 114 | file_extension = "\".parquet\"" 115 | )] 116 | fn register_parquet( 117 | &mut self, 118 | name: &str, 119 | path: &str, 120 | table_partition_cols: Vec<String>, 121 | parquet_pruning: bool, 122 | file_extension: &str, 123 | py: Python, 124 | ) -> PyResult<()> { 125 | let mut options = ParquetReadOptions::default() 126 |
.table_partition_cols(table_partition_cols) 127 | .parquet_pruning(parquet_pruning); 128 | options.file_extension = file_extension; 129 | let result = self.ctx.register_parquet(name, path, options); 130 | wait_for_future(py, result).map_err(DataFusionError::from)?; 131 | Ok(()) 132 | } 133 | 134 | #[allow(clippy::too_many_arguments)] 135 | #[args( 136 | schema = "None", 137 | has_header = "true", 138 | delimiter = "\",\"", 139 | schema_infer_max_records = "1000", 140 | file_extension = "\".csv\"" 141 | )] 142 | fn register_csv( 143 | &mut self, 144 | name: &str, 145 | path: PathBuf, 146 | schema: Option<Schema>, 147 | has_header: bool, 148 | delimiter: &str, 149 | schema_infer_max_records: usize, 150 | file_extension: &str, 151 | py: Python, 152 | ) -> PyResult<()> { 153 | let path = path 154 | .to_str() 155 | .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; 156 | let delimiter = delimiter.as_bytes(); 157 | if delimiter.len() != 1 { 158 | return Err(PyValueError::new_err( 159 | "Delimiter must be a single character", 160 | )); 161 | } 162 | 163 | let mut options = CsvReadOptions::new() 164 | .has_header(has_header) 165 | .delimiter(delimiter[0]) 166 | .schema_infer_max_records(schema_infer_max_records) 167 | .file_extension(file_extension); 168 | options.schema = schema.as_ref(); 169 | 170 | let result = self.ctx.register_csv(name, path, options); 171 | wait_for_future(py, result).map_err(DataFusionError::from)?; 172 | 173 | Ok(()) 174 | } 175 | 176 | fn register_udf(&mut self, udf: PyScalarUDF) -> PyResult<()> { 177 | self.ctx.register_udf(udf.function); 178 | Ok(()) 179 | } 180 | 181 | #[args(name = "\"datafusion\"")] 182 | fn catalog(&self, name: &str) -> PyResult<PyCatalog> { 183 | match self.ctx.catalog(name) { 184 | Some(catalog) => Ok(PyCatalog::new(catalog)), 185 | None => Err(PyKeyError::new_err(format!( 186 | "Catalog with name {} doesn't exist.", 187 | &name 188 | ))), 189 | } 190 | } 191 | 192 | fn tables(&self) -> HashSet<String> { 193 | self.ctx.tables().unwrap() 194 | } 195 | 196 | fn table(&self, name: &str) -> PyResult<PyDataFrame> { 197 | Ok(PyDataFrame::new(self.ctx.table(name)?)) 198 | } 199 | 200 | fn empty_table(&self) -> PyResult<PyDataFrame> { 201 | Ok(PyDataFrame::new(self.ctx.read_empty()?)) 202 | } 203 | } 204 | -------------------------------------------------------------------------------- /src/dataframe.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License.
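// Illustrative sketch of the intended Python-side flow for the DataFrame API
// defined below; SessionContext.sql comes from context.rs, and the import path
// assumes the pure-Python wrapper re-exports it:
//
//     from datafusion import SessionContext
//
//     ctx = SessionContext()
//     df = ctx.sql("SELECT 1 AS a")  # lazy: builds a logical plan, nothing runs yet
//     df = df.limit(10)              # still lazy
//     df.show()                      # executes the plan and pretty-prints the batches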
17 | 18 | use crate::utils::wait_for_future; 19 | use crate::{errors::DataFusionError, expression::PyExpr}; 20 | use datafusion::arrow::datatypes::Schema; 21 | use datafusion::arrow::pyarrow::PyArrowConvert; 22 | use datafusion::arrow::util::pretty; 23 | use datafusion::dataframe::DataFrame; 24 | use datafusion::logical_plan::JoinType; 25 | use pyo3::exceptions::PyTypeError; 26 | use pyo3::prelude::*; 27 | use pyo3::types::PyTuple; 28 | use std::sync::Arc; 29 | 30 | /// A PyDataFrame is a representation of a logical plan and an API to compose statements. 31 | /// Use it to build a plan and `.collect()` to execute the plan and collect the result. 32 | /// The actual execution of a plan runs natively on Rust and Arrow in a multi-threaded environment. 33 | #[pyclass(name = "DataFrame", module = "datafusion", subclass)] 34 | #[derive(Clone)] 35 | pub(crate) struct PyDataFrame { 36 | df: Arc<DataFrame>, 37 | } 38 | 39 | impl PyDataFrame { 40 | /// creates a new PyDataFrame 41 | pub fn new(df: Arc<DataFrame>) -> Self { 42 | Self { df } 43 | } 44 | } 45 | 46 | #[pymethods] 47 | impl PyDataFrame { 48 | fn __getitem__(&self, key: PyObject) -> PyResult<Self> { 49 | Python::with_gil(|py| { 50 | if let Ok(key) = key.extract::<&str>(py) { 51 | self.select_columns(vec![key]) 52 | } else if let Ok(tuple) = key.extract::<&PyTuple>(py) { 53 | let keys = tuple 54 | .iter() 55 | .map(|item| item.extract::<&str>()) 56 | .collect::<PyResult<Vec<&str>>>()?; 57 | self.select_columns(keys) 58 | } else if let Ok(keys) = key.extract::<Vec<&str>>(py) { 59 | self.select_columns(keys) 60 | } else { 61 | let message = "DataFrame can only be indexed by string index or indices"; 62 | Err(PyTypeError::new_err(message)) 63 | } 64 | }) 65 | } 66 | 67 | /// Returns the schema from the logical plan 68 | fn schema(&self) -> Schema { 69 | self.df.schema().into() 70 | } 71 | 72 | #[args(args = "*")] 73 | fn select_columns(&self, args: Vec<&str>) -> PyResult<Self> { 74 | let df = self.df.select_columns(&args)?; 75 | Ok(Self::new(df)) 76 | } 77 | 78 | #[args(args = "*")] 79 | fn select(&self, args: Vec<PyExpr>) -> PyResult<Self> { 80 | let expr = args.into_iter().map(|e| e.into()).collect(); 81 | let df = self.df.select(expr)?; 82 | Ok(Self::new(df)) 83 | } 84 | 85 | fn filter(&self, predicate: PyExpr) -> PyResult<Self> { 86 | let df = self.df.filter(predicate.into())?; 87 | Ok(Self::new(df)) 88 | } 89 | 90 | fn aggregate(&self, group_by: Vec<PyExpr>, aggs: Vec<PyExpr>) -> PyResult<Self> { 91 | let group_by = group_by.into_iter().map(|e| e.into()).collect(); 92 | let aggs = aggs.into_iter().map(|e| e.into()).collect(); 93 | let df = self.df.aggregate(group_by, aggs)?; 94 | Ok(Self::new(df)) 95 | } 96 | 97 | #[args(exprs = "*")] 98 | fn sort(&self, exprs: Vec<PyExpr>) -> PyResult<Self> { 99 | let exprs = exprs.into_iter().map(|e| e.into()).collect(); 100 | let df = self.df.sort(exprs)?; 101 | Ok(Self::new(df)) 102 | } 103 | 104 | fn limit(&self, count: usize) -> PyResult<Self> { 105 | let df = self.df.limit(None, Some(count))?; 106 | Ok(Self::new(df)) 107 | } 108 | 109 | /// Executes the plan, returning a list of `RecordBatch`es. 110 | /// Unless some order is specified in the plan, there is no 111 | /// guarantee of the order of the result.
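/// A minimal usage sketch from Python (illustrative; assumes the pure-Python
/// wrapper re-exports `SessionContext`):
///
/// ```python
/// ctx = SessionContext()
/// batches = ctx.sql("SELECT 1").collect()  # list of pyarrow.RecordBatch
/// ```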
112 | fn collect(&self, py: Python) -> PyResult<Vec<PyObject>> { 113 | let batches = wait_for_future(py, self.df.collect())?; 114 | // cannot use PyResult<Vec<RecordBatch>> return type due to 115 | // https://github.com/PyO3/pyo3/issues/1813 116 | batches.into_iter().map(|rb| rb.to_pyarrow(py)).collect() 117 | } 118 | 119 | /// Print the result, 20 lines by default 120 | #[args(num = "20")] 121 | fn show(&self, py: Python, num: usize) -> PyResult<()> { 122 | let df = self.df.limit(None, Some(num))?; 123 | let batches = wait_for_future(py, df.collect())?; 124 | Ok(pretty::print_batches(&batches)?) 125 | } 126 | 127 | fn join( 128 | &self, 129 | right: PyDataFrame, 130 | join_keys: (Vec<&str>, Vec<&str>), 131 | how: &str, 132 | ) -> PyResult<Self> { 133 | let join_type = match how { 134 | "inner" => JoinType::Inner, 135 | "left" => JoinType::Left, 136 | "right" => JoinType::Right, 137 | "full" => JoinType::Full, 138 | "semi" => JoinType::Semi, 139 | "anti" => JoinType::Anti, 140 | how => { 141 | return Err(DataFusionError::Common(format!( 142 | "The join type {} does not exist or is not implemented", 143 | how 144 | )) 145 | .into()) 146 | } 147 | }; 148 | 149 | let df = self 150 | .df 151 | .join(right.df, join_type, &join_keys.0, &join_keys.1, None)?; 152 | Ok(Self::new(df)) 153 | } 154 | 155 | /// Print the query plan 156 | #[args(verbose = false, analyze = false)] 157 | fn explain(&self, py: Python, verbose: bool, analyze: bool) -> PyResult<()> { 158 | let df = self.df.explain(verbose, analyze)?; 159 | let batches = wait_for_future(py, df.collect())?; 160 | Ok(pretty::print_batches(&batches)?) 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/errors.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License.
17 | 18 | use core::fmt; 19 | 20 | use datafusion::arrow::error::ArrowError; 21 | use datafusion::error::DataFusionError as InnerDataFusionError; 22 | use pyo3::{exceptions::PyException, PyErr}; 23 | 24 | #[derive(Debug)] 25 | pub enum DataFusionError { 26 | ExecutionError(InnerDataFusionError), 27 | ArrowError(ArrowError), 28 | Common(String), 29 | } 30 | 31 | impl fmt::Display for DataFusionError { 32 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 33 | match self { 34 | DataFusionError::ExecutionError(e) => write!(f, "DataFusion error: {:?}", e), 35 | DataFusionError::ArrowError(e) => write!(f, "Arrow error: {:?}", e), 36 | DataFusionError::Common(e) => write!(f, "{}", e), 37 | } 38 | } 39 | } 40 | 41 | impl From<ArrowError> for DataFusionError { 42 | fn from(err: ArrowError) -> DataFusionError { 43 | DataFusionError::ArrowError(err) 44 | } 45 | } 46 | 47 | impl From<InnerDataFusionError> for DataFusionError { 48 | fn from(err: InnerDataFusionError) -> DataFusionError { 49 | DataFusionError::ExecutionError(err) 50 | } 51 | } 52 | 53 | impl From<DataFusionError> for PyErr { 54 | fn from(err: DataFusionError) -> PyErr { 55 | PyException::new_err(err.to_string()) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/expression.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License.
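// Illustrative sketch: the operator overloads below are what make Python-side
// expression composition work. Assuming the pure-Python layer exposes col/lit
// wrappers over PyExpr.column/PyExpr.literal:
//
//     (col("a") + col("b")) > lit(0)  # __add__ then __richcmp__; builds Expr nodes only
//     ~(col("flag"))                  # __invert__ maps to a NOT expression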
17 | 18 | use pyo3::{basic::CompareOp, prelude::*}; 19 | use std::convert::{From, Into}; 20 | 21 | use datafusion::arrow::datatypes::DataType; 22 | use datafusion::logical_plan::{col, lit, Expr}; 23 | 24 | use datafusion::scalar::ScalarValue; 25 | 26 | /// A PyExpr that can be used on a DataFrame 27 | #[pyclass(name = "Expression", module = "datafusion", subclass)] 28 | #[derive(Debug, Clone)] 29 | pub(crate) struct PyExpr { 30 | pub(crate) expr: Expr, 31 | } 32 | 33 | impl From<PyExpr> for Expr { 34 | fn from(expr: PyExpr) -> Expr { 35 | expr.expr 36 | } 37 | } 38 | 39 | impl From<Expr> for PyExpr { 40 | fn from(expr: Expr) -> PyExpr { 41 | PyExpr { expr } 42 | } 43 | } 44 | 45 | #[pymethods] 46 | impl PyExpr { 47 | fn __richcmp__(&self, other: PyExpr, op: CompareOp) -> PyExpr { 48 | let expr = match op { 49 | CompareOp::Lt => self.expr.clone().lt(other.expr), 50 | CompareOp::Le => self.expr.clone().lt_eq(other.expr), 51 | CompareOp::Eq => self.expr.clone().eq(other.expr), 52 | CompareOp::Ne => self.expr.clone().not_eq(other.expr), 53 | CompareOp::Gt => self.expr.clone().gt(other.expr), 54 | CompareOp::Ge => self.expr.clone().gt_eq(other.expr), 55 | }; 56 | expr.into() 57 | } 58 | 59 | fn __str__(&self) -> PyResult<String> { 60 | Ok(format!("{}", self.expr)) 61 | } 62 | 63 | fn __add__(&self, rhs: PyExpr) -> PyResult<PyExpr> { 64 | Ok((self.expr.clone() + rhs.expr).into()) 65 | } 66 | 67 | fn __sub__(&self, rhs: PyExpr) -> PyResult<PyExpr> { 68 | Ok((self.expr.clone() - rhs.expr).into()) 69 | } 70 | 71 | fn __truediv__(&self, rhs: PyExpr) -> PyResult<PyExpr> { 72 | Ok((self.expr.clone() / rhs.expr).into()) 73 | } 74 | 75 | fn __mul__(&self, rhs: PyExpr) -> PyResult<PyExpr> { 76 | Ok((self.expr.clone() * rhs.expr).into()) 77 | } 78 | 79 | fn __mod__(&self, rhs: PyExpr) -> PyResult<PyExpr> { 80 | Ok(self.expr.clone().modulus(rhs.expr).into()) 81 | } 82 | 83 | fn __and__(&self, rhs: PyExpr) -> PyResult<PyExpr> { 84 | Ok(self.expr.clone().and(rhs.expr).into()) 85 | } 86 | 87 | fn __or__(&self, rhs: PyExpr) -> PyResult<PyExpr> { 88 | Ok(self.expr.clone().or(rhs.expr).into()) 89 | } 90 | 91 | fn __invert__(&self) -> PyResult<PyExpr> { 92 | Ok(self.expr.clone().not().into()) 93 | } 94 | 95 | fn __getitem__(&self, key: &str) -> PyResult<PyExpr> { 96 | Ok(Expr::GetIndexedField { 97 | expr: Box::new(self.expr.clone()), 98 | key: ScalarValue::Utf8(Some(key.to_string())), 99 | } 100 | .into()) 101 | } 102 | 103 | #[staticmethod] 104 | pub fn literal(value: ScalarValue) -> PyExpr { 105 | lit(value).into() 106 | } 107 | 108 | #[staticmethod] 109 | pub fn column(value: &str) -> PyExpr { 110 | col(value).into() 111 | } 112 | 113 | /// assign a name to the PyExpr 114 | pub fn alias(&self, name: &str) -> PyExpr { 115 | self.expr.clone().alias(name).into() 116 | } 117 | 118 | /// Create a sort PyExpr from an existing PyExpr.
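/// From Python this might look like the following (illustrative; `col` is
/// assumed to come from the pure-Python layer):
///
/// ```python
/// df.sort(col("ts").sort(ascending=False, nulls_first=True))
/// ```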
119 | #[args(ascending = true, nulls_first = true)] 120 | pub fn sort(&self, ascending: bool, nulls_first: bool) -> PyExpr { 121 | self.expr.clone().sort(ascending, nulls_first).into() 122 | } 123 | 124 | pub fn is_null(&self) -> PyExpr { 125 | self.expr.clone().is_null().into() 126 | } 127 | 128 | pub fn cast(&self, to: DataType) -> PyExpr { 129 | // self.expr.cast_to() requires DFSchema to validate that the cast 130 | // is supported, omit that for now 131 | let expr = Expr::Cast { 132 | expr: Box::new(self.expr.clone()), 133 | data_type: to, 134 | }; 135 | expr.into() 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/functions.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use pyo3::{prelude::*, wrap_pyfunction}; 19 | 20 | use datafusion::logical_plan; 21 | use datafusion::physical_plan::aggregates::AggregateFunction; 22 | use datafusion_expr::BuiltinScalarFunction; 23 | 24 | use crate::errors; 25 | use crate::expression::PyExpr; 26 | 27 | #[pyfunction] 28 | fn array(value: Vec<PyExpr>) -> PyExpr { 29 | PyExpr { 30 | expr: logical_plan::array(value.into_iter().map(|x| x.expr).collect::<Vec<_>>()), 31 | } 32 | } 33 | 34 | #[pyfunction] 35 | fn in_list(expr: PyExpr, value: Vec<PyExpr>, negated: bool) -> PyExpr { 36 | logical_plan::in_list( 37 | expr.expr, 38 | value.into_iter().map(|x| x.expr).collect::<Vec<_>>(), 39 | negated, 40 | ) 41 | .into() 42 | } 43 | 44 | /// Current date and time 45 | #[pyfunction] 46 | fn now() -> PyExpr { 47 | PyExpr { 48 | // here lit(0) is a stub to conform to the function's arity 49 | expr: logical_plan::now(logical_plan::lit(0)), 50 | } 51 | } 52 | 53 | /// Returns a random value in the range 0.0 <= x < 1.0 54 | #[pyfunction] 55 | fn random() -> PyExpr { 56 | PyExpr { 57 | expr: logical_plan::random(), 58 | } 59 | } 60 | 61 | /// Computes a binary hash of the given data. `method` is the algorithm to use. 62 | /// Standard algorithms are md5, sha224, sha256, sha384, sha512, blake2s, blake2b, and blake3. 63 | #[pyfunction(value, method)] 64 | fn digest(value: PyExpr, method: PyExpr) -> PyExpr { 65 | PyExpr { 66 | expr: logical_plan::digest(value.expr, method.expr), 67 | } 68 | } 69 | 70 | /// Concatenates the text representations of all the arguments. 71 | /// NULL arguments are ignored. 72 | #[pyfunction(args = "*")] 73 | fn concat(args: Vec<PyExpr>) -> PyResult<PyExpr> { 74 | let args = args.into_iter().map(|e| e.expr).collect::<Vec<_>>(); 75 | Ok(logical_plan::concat(&args).into()) 76 | } 77 | 78 | /// Concatenates all but the first argument, with separators. 79 | /// The first argument is used as the separator string, and should not be NULL.
80 | /// Other NULL arguments are ignored. 81 | #[pyfunction(sep, args = "*")] 82 | fn concat_ws(sep: String, args: Vec<PyExpr>) -> PyResult<PyExpr> { 83 | let args = args.into_iter().map(|e| e.expr).collect::<Vec<_>>(); 84 | Ok(logical_plan::concat_ws(sep, &args).into()) 85 | } 86 | 87 | /// Creates a new Sort expression 88 | #[pyfunction] 89 | fn order_by(expr: PyExpr, asc: Option<bool>, nulls_first: Option<bool>) -> PyResult<PyExpr> { 90 | Ok(PyExpr { 91 | expr: datafusion::logical_plan::Expr::Sort { 92 | expr: Box::new(expr.expr), 93 | asc: asc.unwrap_or(true), 94 | nulls_first: nulls_first.unwrap_or(true), 95 | }, 96 | }) 97 | } 98 | 99 | /// Creates a new Alias expression 100 | #[pyfunction] 101 | fn alias(expr: PyExpr, name: &str) -> PyResult<PyExpr> { 102 | Ok(PyExpr { 103 | expr: datafusion::logical_plan::Expr::Alias(Box::new(expr.expr), String::from(name)), 104 | }) 105 | } 106 | 107 | /// Creates a new Window function expression 108 | #[pyfunction] 109 | fn window( 110 | name: &str, 111 | args: Vec<PyExpr>, 112 | partition_by: Option<Vec<PyExpr>>, 113 | order_by: Option<Vec<PyExpr>>, 114 | ) -> PyResult<PyExpr> { 115 | use std::str::FromStr; 116 | let fun = datafusion_expr::window_function::WindowFunction::from_str(name) 117 | .map_err(|e| -> errors::DataFusionError { e.into() })?; 118 | Ok(PyExpr { 119 | expr: datafusion::logical_plan::Expr::WindowFunction { 120 | fun, 121 | args: args.into_iter().map(|x| x.expr).collect::<Vec<_>>(), 122 | partition_by: partition_by 123 | .unwrap_or_default() 124 | .into_iter() 125 | .map(|x| x.expr) 126 | .collect::<Vec<_>>(), 127 | order_by: order_by 128 | .unwrap_or_default() 129 | .into_iter() 130 | .map(|x| x.expr) 131 | .collect::<Vec<_>>(), 132 | window_frame: None, 133 | }, 134 | }) 135 | } 136 | 137 | macro_rules! scalar_function { 138 | ($NAME: ident, $FUNC: ident) => { 139 | scalar_function!($NAME, $FUNC, stringify!($NAME)); 140 | }; 141 | ($NAME: ident, $FUNC: ident, $DOC: expr) => { 142 | #[doc = $DOC] 143 | #[pyfunction(args = "*")] 144 | fn $NAME(args: Vec<PyExpr>) -> PyExpr { 145 | let expr = logical_plan::Expr::ScalarFunction { 146 | fun: BuiltinScalarFunction::$FUNC, 147 | args: args.into_iter().map(|e| e.into()).collect(), 148 | }; 149 | expr.into() 150 | } 151 | }; 152 | } 153 | 154 | macro_rules! aggregate_function { 155 | ($NAME: ident, $FUNC: ident) => { 156 | aggregate_function!($NAME, $FUNC, stringify!($NAME)); 157 | }; 158 | ($NAME: ident, $FUNC: ident, $DOC: expr) => { 159 | #[doc = $DOC] 160 | #[pyfunction(args = "*", distinct = "false")] 161 | fn $NAME(args: Vec<PyExpr>, distinct: bool) -> PyExpr { 162 | let expr = logical_plan::Expr::AggregateFunction { 163 | fun: AggregateFunction::$FUNC, 164 | args: args.into_iter().map(|e| e.into()).collect(), 165 | distinct, 166 | }; 167 | expr.into() 168 | } 169 | }; 170 | } 171 | 172 | scalar_function!(abs, Abs); 173 | scalar_function!(acos, Acos); 174 | scalar_function!(ascii, Ascii, "Returns the numeric code of the first character of the argument. In UTF8 encoding, returns the Unicode code point of the character. In other multibyte encodings, the argument must be an ASCII character."); 175 | scalar_function!(asin, Asin); 176 | scalar_function!(atan, Atan); 177 | scalar_function!( 178 | bit_length, 179 | BitLength, 180 | "Returns number of bits in the string (8 times the octet_length)."
181 | ); 182 | scalar_function!(btrim, Btrim, "Removes the longest string containing only characters in characters (a space by default) from the start and end of string."); 183 | scalar_function!(ceil, Ceil); 184 | scalar_function!( 185 | character_length, 186 | CharacterLength, 187 | "Returns number of characters in the string." 188 | ); 189 | scalar_function!(chr, Chr, "Returns the character with the given code."); 190 | scalar_function!(cos, Cos); 191 | scalar_function!(exp, Exp); 192 | scalar_function!(floor, Floor); 193 | scalar_function!(initcap, InitCap, "Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters."); 194 | scalar_function!(left, Left, "Returns first n characters in the string, or when n is negative, returns all but last |n| characters."); 195 | scalar_function!(ln, Ln); 196 | scalar_function!(log10, Log10); 197 | scalar_function!(log2, Log2); 198 | scalar_function!(lower, Lower, "Converts the string to all lower case"); 199 | scalar_function!(lpad, Lpad, "Extends the string to length length by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right)."); 200 | scalar_function!(ltrim, Ltrim, "Removes the longest string containing only characters in characters (a space by default) from the start of string."); 201 | scalar_function!( 202 | md5, 203 | MD5, 204 | "Computes the MD5 hash of the argument, with the result written in hexadecimal." 205 | ); 206 | scalar_function!(octet_length, OctetLength, "Returns number of bytes in the string. Since this version of the function accepts type character directly, it will not strip trailing spaces."); 207 | scalar_function!(regexp_match, RegexpMatch); 208 | scalar_function!( 209 | regexp_replace, 210 | RegexpReplace, 211 | "Replaces substring(s) matching a POSIX regular expression" 212 | ); 213 | scalar_function!( 214 | repeat, 215 | Repeat, 216 | "Repeats string the specified number of times." 217 | ); 218 | scalar_function!( 219 | replace, 220 | Replace, 221 | "Replaces all occurrences in string of substring from with substring to." 222 | ); 223 | scalar_function!( 224 | reverse, 225 | Reverse, 226 | "Reverses the order of the characters in the string." 227 | ); 228 | scalar_function!(right, Right, "Returns last n characters in the string, or when n is negative, returns all but first |n| characters."); 229 | scalar_function!(round, Round); 230 | scalar_function!(rpad, Rpad, "Extends the string to length length by appending the characters fill (a space by default). If the string is already longer than length then it is truncated."); 231 | scalar_function!(rtrim, Rtrim, "Removes the longest string containing only characters in characters (a space by default) from the end of string."); 232 | scalar_function!(sha224, SHA224); 233 | scalar_function!(sha256, SHA256); 234 | scalar_function!(sha384, SHA384); 235 | scalar_function!(sha512, SHA512); 236 | scalar_function!(signum, Signum); 237 | scalar_function!(sin, Sin); 238 | scalar_function!( 239 | split_part, 240 | SplitPart, 241 | "Splits string at occurrences of delimiter and returns the n'th field (counting from one)." 242 | ); 243 | scalar_function!(sqrt, Sqrt); 244 | scalar_function!( 245 | starts_with, 246 | StartsWith, 247 | "Returns true if string starts with prefix." 
248 | ); 249 | scalar_function!(strpos, Strpos, "Returns starting index of specified substring within string, or zero if it's not present. (Same as position(substring in string), but note the reversed argument order.)"); 250 | scalar_function!(substr, Substr); 251 | scalar_function!(tan, Tan); 252 | scalar_function!( 253 | to_hex, 254 | ToHex, 255 | "Converts the number to its equivalent hexadecimal representation." 256 | ); 257 | scalar_function!(to_timestamp, ToTimestamp); 258 | scalar_function!(translate, Translate, "Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted."); 259 | scalar_function!(trim, Trim, "Removes the longest string containing only characters in characters (a space by default) from the start, end, or both ends (BOTH is the default) of string."); 260 | scalar_function!(trunc, Trunc); 261 | scalar_function!(upper, Upper, "Converts the string to all upper case."); 262 | 263 | aggregate_function!(avg, Avg); 264 | aggregate_function!(count, Count); 265 | aggregate_function!(max, Max); 266 | aggregate_function!(min, Min); 267 | aggregate_function!(sum, Sum); 268 | aggregate_function!(approx_distinct, ApproxDistinct); 269 | 270 | pub(crate) fn init_module(m: &PyModule) -> PyResult<()> { 271 | m.add_wrapped(wrap_pyfunction!(abs))?; 272 | m.add_wrapped(wrap_pyfunction!(acos))?; 273 | m.add_wrapped(wrap_pyfunction!(approx_distinct))?; 274 | m.add_wrapped(wrap_pyfunction!(alias))?; 275 | m.add_wrapped(wrap_pyfunction!(array))?; 276 | m.add_wrapped(wrap_pyfunction!(ascii))?; 277 | m.add_wrapped(wrap_pyfunction!(asin))?; 278 | m.add_wrapped(wrap_pyfunction!(atan))?; 279 | m.add_wrapped(wrap_pyfunction!(avg))?; 280 | m.add_wrapped(wrap_pyfunction!(bit_length))?; 281 | m.add_wrapped(wrap_pyfunction!(btrim))?; 282 | m.add_wrapped(wrap_pyfunction!(ceil))?; 283 | m.add_wrapped(wrap_pyfunction!(character_length))?; 284 | m.add_wrapped(wrap_pyfunction!(chr))?; 285 | m.add_wrapped(wrap_pyfunction!(concat_ws))?; 286 | m.add_wrapped(wrap_pyfunction!(concat))?; 287 | m.add_wrapped(wrap_pyfunction!(cos))?; 288 | m.add_wrapped(wrap_pyfunction!(count))?; 289 | m.add_wrapped(wrap_pyfunction!(digest))?; 290 | m.add_wrapped(wrap_pyfunction!(exp))?; 291 | m.add_wrapped(wrap_pyfunction!(floor))?; 292 | m.add_wrapped(wrap_pyfunction!(in_list))?; 293 | m.add_wrapped(wrap_pyfunction!(initcap))?; 294 | m.add_wrapped(wrap_pyfunction!(left))?; 295 | m.add_wrapped(wrap_pyfunction!(ln))?; 296 | m.add_wrapped(wrap_pyfunction!(log10))?; 297 | m.add_wrapped(wrap_pyfunction!(log2))?; 298 | m.add_wrapped(wrap_pyfunction!(lower))?; 299 | m.add_wrapped(wrap_pyfunction!(lpad))?; 300 | m.add_wrapped(wrap_pyfunction!(ltrim))?; 301 | m.add_wrapped(wrap_pyfunction!(max))?; 302 | m.add_wrapped(wrap_pyfunction!(md5))?; 303 | m.add_wrapped(wrap_pyfunction!(min))?; 304 | m.add_wrapped(wrap_pyfunction!(now))?; 305 | m.add_wrapped(wrap_pyfunction!(octet_length))?; 306 | m.add_wrapped(wrap_pyfunction!(order_by))?; 307 | m.add_wrapped(wrap_pyfunction!(random))?; 308 | m.add_wrapped(wrap_pyfunction!(regexp_match))?; 309 | m.add_wrapped(wrap_pyfunction!(regexp_replace))?; 310 | m.add_wrapped(wrap_pyfunction!(repeat))?; 311 | m.add_wrapped(wrap_pyfunction!(replace))?; 312 | m.add_wrapped(wrap_pyfunction!(reverse))?; 313 | m.add_wrapped(wrap_pyfunction!(right))?; 314 | m.add_wrapped(wrap_pyfunction!(round))?; 315 | m.add_wrapped(wrap_pyfunction!(rpad))?; 316 | 
m.add_wrapped(wrap_pyfunction!(rtrim))?; 317 | m.add_wrapped(wrap_pyfunction!(sha224))?; 318 | m.add_wrapped(wrap_pyfunction!(sha256))?; 319 | m.add_wrapped(wrap_pyfunction!(sha384))?; 320 | m.add_wrapped(wrap_pyfunction!(sha512))?; 321 | m.add_wrapped(wrap_pyfunction!(signum))?; 322 | m.add_wrapped(wrap_pyfunction!(sin))?; 323 | m.add_wrapped(wrap_pyfunction!(split_part))?; 324 | m.add_wrapped(wrap_pyfunction!(sqrt))?; 325 | m.add_wrapped(wrap_pyfunction!(starts_with))?; 326 | m.add_wrapped(wrap_pyfunction!(strpos))?; 327 | m.add_wrapped(wrap_pyfunction!(substr))?; 328 | m.add_wrapped(wrap_pyfunction!(sum))?; 329 | m.add_wrapped(wrap_pyfunction!(tan))?; 330 | m.add_wrapped(wrap_pyfunction!(to_hex))?; 331 | m.add_wrapped(wrap_pyfunction!(to_timestamp))?; 332 | m.add_wrapped(wrap_pyfunction!(translate))?; 333 | m.add_wrapped(wrap_pyfunction!(trim))?; 334 | m.add_wrapped(wrap_pyfunction!(trunc))?; 335 | m.add_wrapped(wrap_pyfunction!(upper))?; 336 | m.add_wrapped(wrap_pyfunction!(window))?; 337 | Ok(()) 338 | } 339 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | #[cfg(feature = "mimalloc")] 19 | use mimalloc::MiMalloc; 20 | use pyo3::prelude::*; 21 | 22 | pub mod catalog; 23 | mod context; 24 | mod dataframe; 25 | pub mod errors; 26 | mod expression; 27 | mod functions; 28 | mod udaf; 29 | mod udf; 30 | pub mod utils; 31 | 32 | #[cfg(feature = "mimalloc")] 33 | #[global_allocator] 34 | static GLOBAL: MiMalloc = MiMalloc; 35 | 36 | /// Low-level DataFusion internal package. 37 | /// 38 | /// The higher-level public API is defined in pure python files under the 39 | /// datafusion directory. 40 | #[pymodule] 41 | fn _internal(py: Python, m: &PyModule) -> PyResult<()> { 42 | // Register the python classes 43 | m.add_class::<catalog::PyCatalog>()?; 44 | m.add_class::<catalog::PyDatabase>()?; 45 | m.add_class::<catalog::PyTable>()?; 46 | m.add_class::<context::PySessionContext>()?; 47 | m.add_class::<dataframe::PyDataFrame>()?; 48 | m.add_class::<expression::PyExpr>()?; 49 | m.add_class::<udf::PyScalarUDF>()?; 50 | m.add_class::<udaf::PyAggregateUDF>()?; 51 | 52 | // Register the functions as a submodule 53 | let funcs = PyModule::new(py, "functions")?; 54 | functions::init_module(funcs)?; 55 | m.add_submodule(funcs)?; 56 | 57 | Ok(()) 58 | } 59 | -------------------------------------------------------------------------------- /src/udaf.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership.
The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::sync::Arc; 19 | 20 | use pyo3::{prelude::*, types::PyTuple}; 21 | 22 | use datafusion::arrow::array::ArrayRef; 23 | use datafusion::arrow::datatypes::DataType; 24 | use datafusion::arrow::pyarrow::PyArrowConvert; 25 | use datafusion::error::{DataFusionError, Result}; 26 | use datafusion::logical_plan; 27 | use datafusion_common::ScalarValue; 28 | use datafusion_expr::Accumulator; 29 | use datafusion_expr::AccumulatorFunctionImplementation; 30 | use datafusion_expr::AggregateUDF; 31 | 32 | use crate::expression::PyExpr; 33 | use crate::utils::parse_volatility; 34 | 35 | #[derive(Debug)] 36 | struct RustAccumulator { 37 | accum: PyObject, 38 | } 39 | 40 | impl RustAccumulator { 41 | fn new(accum: PyObject) -> Self { 42 | Self { accum } 43 | } 44 | } 45 | 46 | impl Accumulator for RustAccumulator { 47 | fn state(&self) -> Result<Vec<ScalarValue>> { 48 | Python::with_gil(|py| self.accum.as_ref(py).call_method0("state")?.extract()) 49 | .map_err(|e| DataFusionError::Execution(format!("{}", e))) 50 | } 51 | 52 | fn evaluate(&self) -> Result<ScalarValue> { 53 | Python::with_gil(|py| self.accum.as_ref(py).call_method0("evaluate")?.extract()) 54 | .map_err(|e| DataFusionError::Execution(format!("{}", e))) 55 | } 56 | 57 | fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { 58 | Python::with_gil(|py| { 59 | // 1. cast args to Pyarrow array 60 | let py_args = values 61 | .iter() 62 | .map(|arg| arg.data().to_owned().to_pyarrow(py).unwrap()) 63 | .collect::<Vec<_>>(); 64 | let py_args = PyTuple::new(py, py_args); 65 | 66 | // 2. call function 67 | self.accum 68 | .as_ref(py) 69 | .call_method1("update", py_args) 70 | .map_err(|e| DataFusionError::Execution(format!("{}", e)))?; 71 | 72 | Ok(()) 73 | }) 74 | } 75 | 76 | fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { 77 | Python::with_gil(|py| { 78 | let state = &states[0]; 79 | 80 | // 1. cast states to Pyarrow array 81 | let state = state 82 | .to_pyarrow(py) 83 | .map_err(|e| DataFusionError::Execution(format!("{}", e)))?; 84 | 85 | // 2.
call merge 86 | self.accum 87 | .as_ref(py) 88 | .call_method1("merge", (state,)) 89 | .map_err(|e| DataFusionError::Execution(format!("{}", e)))?; 90 | 91 | Ok(()) 92 | }) 93 | } 94 | } 95 | 96 | pub fn to_rust_accumulator(accum: PyObject) -> AccumulatorFunctionImplementation { 97 | Arc::new(move || -> Result<Box<dyn Accumulator>> { 98 | let accum = Python::with_gil(|py| { 99 | accum 100 | .call0(py) 101 | .map_err(|e| DataFusionError::Execution(format!("{}", e))) 102 | })?; 103 | Ok(Box::new(RustAccumulator::new(accum))) 104 | }) 105 | } 106 | 107 | /// Represents an AggregateUDF 108 | #[pyclass(name = "AggregateUDF", module = "datafusion", subclass)] 109 | #[derive(Debug, Clone)] 110 | pub struct PyAggregateUDF { 111 | pub(crate) function: AggregateUDF, 112 | } 113 | 114 | #[pymethods] 115 | impl PyAggregateUDF { 116 | #[new(name, accumulator, input_type, return_type, state_type, volatility)] 117 | fn new( 118 | name: &str, 119 | accumulator: PyObject, 120 | input_type: DataType, 121 | return_type: DataType, 122 | state_type: Vec<DataType>, 123 | volatility: &str, 124 | ) -> PyResult<Self> { 125 | let function = logical_plan::create_udaf( 126 | name, 127 | input_type, 128 | Arc::new(return_type), 129 | parse_volatility(volatility)?, 130 | to_rust_accumulator(accumulator), 131 | Arc::new(state_type), 132 | ); 133 | Ok(Self { function }) 134 | } 135 | 136 | /// Creates a new PyExpr with the call of the UDF 137 | #[args(args = "*")] 138 | fn __call__(&self, args: Vec<PyExpr>) -> PyResult<PyExpr> { 139 | let args = args.iter().map(|e| e.expr.clone()).collect(); 140 | Ok(self.function.call(args).into()) 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /src/udf.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::sync::Arc; 19 | 20 | use pyo3::{prelude::*, types::PyTuple}; 21 | 22 | use datafusion::arrow::array::ArrayRef; 23 | use datafusion::arrow::datatypes::DataType; 24 | use datafusion::arrow::pyarrow::PyArrowConvert; 25 | use datafusion::error::DataFusionError; 26 | use datafusion::logical_plan; 27 | use datafusion::physical_plan::functions::make_scalar_function; 28 | use datafusion::physical_plan::udf::ScalarUDF; 29 | use datafusion_expr::function::ScalarFunctionImplementation; 30 | 31 | use crate::expression::PyExpr; 32 | use crate::utils::parse_volatility; 33 | 34 | /// Create a DataFusion UDF implementation from a Python function 35 | /// that expects pyarrow arrays. This is more efficient as the array 36 | /// contents are passed with zero copies.
37 | fn to_rust_function(func: PyObject) -> ScalarFunctionImplementation { 38 | make_scalar_function( 39 | move |args: &[ArrayRef]| -> Result<ArrayRef, DataFusionError> { 40 | Python::with_gil(|py| { 41 | // 1. cast args to Pyarrow arrays 42 | let py_args = args 43 | .iter() 44 | .map(|arg| arg.data().to_owned().to_pyarrow(py).unwrap()) 45 | .collect::<Vec<_>>(); 46 | let py_args = PyTuple::new(py, py_args); 47 | 48 | // 2. call function 49 | let value = func.as_ref(py).call(py_args, None); 50 | let value = match value { 51 | Ok(n) => Ok(n), 52 | Err(error) => Err(DataFusionError::Execution(format!("{:?}", error))), 53 | }?; 54 | 55 | // 3. cast to arrow::array::Array 56 | let array = ArrayRef::from_pyarrow(value).unwrap(); 57 | Ok(array) 58 | }) 59 | }, 60 | ) 61 | } 62 | 63 | /// Represents a PyScalarUDF 64 | #[pyclass(name = "ScalarUDF", module = "datafusion", subclass)] 65 | #[derive(Debug, Clone)] 66 | pub struct PyScalarUDF { 67 | pub(crate) function: ScalarUDF, 68 | } 69 | 70 | #[pymethods] 71 | impl PyScalarUDF { 72 | #[new(name, func, input_types, return_type, volatility)] 73 | fn new( 74 | name: &str, 75 | func: PyObject, 76 | input_types: Vec<DataType>, 77 | return_type: DataType, 78 | volatility: &str, 79 | ) -> PyResult<Self> { 80 | let function = logical_plan::create_udf( 81 | name, 82 | input_types, 83 | Arc::new(return_type), 84 | parse_volatility(volatility)?, 85 | to_rust_function(func), 86 | ); 87 | Ok(Self { function }) 88 | } 89 | 90 | /// Creates a new PyExpr with the call of the UDF 91 | #[args(args = "*")] 92 | fn __call__(&self, args: Vec<PyExpr>) -> PyResult<PyExpr> { 93 | let args = args.iter().map(|e| e.expr.clone()).collect(); 94 | Ok(self.function.call(args).into()) 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/utils.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License.
17 | 18 | use crate::errors::DataFusionError; 19 | use datafusion_expr::Volatility; 20 | use pyo3::prelude::*; 21 | use std::future::Future; 22 | use tokio::runtime::Runtime; 23 | 24 | /// Utility to collect Rust futures with the GIL released 25 | pub fn wait_for_future<F: Future>(py: Python, f: F) -> F::Output 26 | where 27 | F: Send, 28 | F::Output: Send, 29 | { 30 | let rt = Runtime::new().unwrap(); 31 | py.allow_threads(|| rt.block_on(f)) 32 | } 33 | 34 | pub(crate) fn parse_volatility(value: &str) -> Result<Volatility, DataFusionError> { 35 | Ok(match value { 36 | "immutable" => Volatility::Immutable, 37 | "stable" => Volatility::Stable, 38 | "volatile" => Volatility::Volatile, 39 | value => { 40 | return Err(DataFusionError::Common(format!( 41 | "Unsupported volatility type: `{}`, supported \ 42 | values are: immutable, stable and volatile.", 43 | value 44 | ))) 45 | } 46 | }) 47 | } 48 | --------------------------------------------------------------------------------
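Taken together, src/udf.rs, src/udaf.rs, and src/utils.rs define a small Python-side protocol: to_rust_function calls the wrapped Python callable with its arguments converted to pyarrow arrays and expects a pyarrow array back, while RustAccumulator drives an aggregate by calling the update, merge, state, and evaluate methods of a user-supplied accumulator object, which to_rust_accumulator instantiates by calling the class with no arguments. The sketch below illustrates that protocol from the Python side. The function and class names are hypothetical, and the commented-out registration calls assume udf/udaf wrappers in the pure-Python datafusion package, which are not part of this listing.

from typing import List

import pyarrow as pa
import pyarrow.compute as pc

# Scalar UDF: receives one pyarrow array per argument and must
# return a pyarrow array (see to_rust_function in src/udf.rs).
def is_null(values: pa.Array) -> pa.Array:
    return values.is_null()

# Aggregate UDF: a class exposing the four methods that
# RustAccumulator invokes through the GIL (see src/udaf.rs).
class Summarize:
    def __init__(self) -> None:
        self._sum = 0.0

    def update(self, values: pa.Array) -> None:
        # update_batch passes the input column as a pyarrow array
        total = pc.sum(values).as_py()
        if total is not None:
            self._sum += total

    def merge(self, states: pa.Array) -> None:
        # merge_batch passes the first state array of other accumulators
        total = pc.sum(states).as_py()
        if total is not None:
            self._sum += total

    def state(self) -> List[pa.Scalar]:
        # serialized intermediate state: one scalar per state_type entry
        return [pa.scalar(self._sum)]

    def evaluate(self) -> pa.Scalar:
        return pa.scalar(self._sum)

# Hypothetical registration through the pure-Python layer; "stable" is
# one of the three strings parse_volatility in src/utils.rs accepts
# (immutable, stable, volatile):
# is_null_udf = udf(is_null, [pa.float64()], pa.bool_(), "stable")
# summarize_udaf = udaf(Summarize, pa.float64(), pa.float64(), [pa.float64()], "stable")

Note the cost this design implies: every batch crosses the Rust/Python boundary under Python::with_gil, so a slow Python accumulator serializes execution on that thread. Keeping the per-batch work inside pyarrow.compute, as above, keeps most of the computation in native code.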