├── .asf.yaml ├── .cargo └── config.toml ├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── build.yml │ ├── dev.yml │ ├── docs.yaml │ ├── take.yml │ └── test.yaml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE.txt ├── README.md ├── benchmarks ├── db-benchmark │ ├── README.md │ ├── db-benchmark.dockerfile │ ├── groupby-datafusion.py │ ├── join-datafusion.py │ └── run-bench.sh └── tpch │ ├── .gitignore │ ├── README.md │ ├── create_tables.sql │ ├── queries │ ├── q1.sql │ ├── q10.sql │ ├── q11.sql │ ├── q12.sql │ ├── q13.sql │ ├── q14.sql │ ├── q15.sql │ ├── q16.sql │ ├── q17.sql │ ├── q18.sql │ ├── q19.sql │ ├── q2.sql │ ├── q20.sql │ ├── q21.sql │ ├── q22.sql │ ├── q3.sql │ ├── q4.sql │ ├── q5.sql │ ├── q6.sql │ ├── q7.sql │ ├── q8.sql │ └── q9.sql │ ├── tpch-gen.sh │ └── tpch.py ├── build.rs ├── ci └── scripts │ ├── python_lint.sh │ ├── rust_clippy.sh │ ├── rust_fmt.sh │ └── rust_toml_fmt.sh ├── dev ├── build-set-env.sh ├── changelog │ ├── 43.0.0.md │ ├── 44.0.0.md │ ├── 45.0.0.md │ ├── 46.0.0.md │ ├── 47.0.0.md │ └── pre-43.0.0.md ├── clean.sh ├── create_license.py ├── python_lint.sh ├── release │ ├── README.md │ ├── check-rat-report.py │ ├── create-tarball.sh │ ├── generate-changelog.py │ ├── rat_exclude_files.txt │ ├── release-tarball.sh │ ├── run-rat.sh │ └── verify-release-candidate.sh └── rust_lint.sh ├── docs ├── .gitignore ├── Makefile ├── README.md ├── build.sh ├── make.bat ├── mdbook │ ├── README.md │ ├── book.toml │ └── src │ │ ├── SUMMARY.md │ │ ├── images │ │ ├── datafusion-jupyterlab.png │ │ └── plan.svg │ │ ├── index.md │ │ ├── installation.md │ │ ├── quickstart.md │ │ └── usage │ │ ├── create-table.md │ │ ├── index.md │ │ ├── query-plans.md │ │ └── query-table.md └── source │ ├── _static │ ├── images │ │ ├── 2x_bgwhite_original.png │ │ ├── original.png │ │ ├── original.svg │ │ └── original2x.png │ └── theme_overrides.css │ ├── _templates │ ├── docs-sidebar.html │ └── layout.html │ ├── conf.py │ ├── contributor-guide │ ├── ffi.rst │ └── introduction.rst │ ├── images │ └── jupyter_lab_df_view.png │ ├── index.rst │ └── user-guide │ ├── basics.rst │ ├── common-operations │ ├── aggregations.rst │ ├── basic-info.rst │ ├── expressions.rst │ ├── functions.rst │ ├── index.rst │ ├── joins.rst │ ├── select-and-filter.rst │ ├── udf-and-udfa.rst │ ├── views.rst │ └── windows.rst │ ├── configuration.rst │ ├── data-sources.rst │ ├── dataframe.rst │ ├── introduction.rst │ ├── io │ ├── arrow.rst │ ├── avro.rst │ ├── csv.rst │ ├── index.rst │ ├── json.rst │ ├── parquet.rst │ └── table_provider.rst │ └── sql.rst ├── examples ├── README.md ├── chart.png ├── create-context.py ├── dataframe-parquet.py ├── datafusion-ffi-example │ ├── .cargo │ │ └── config.toml │ ├── Cargo.lock │ ├── Cargo.toml │ ├── build.rs │ ├── pyproject.toml │ ├── python │ │ └── tests │ │ │ ├── _test_table_function.py │ │ │ └── _test_table_provider.py │ └── src │ │ ├── lib.rs │ │ ├── table_function.rs │ │ └── table_provider.rs ├── export.py ├── import.py ├── python-udaf.py ├── python-udf-comparisons.py ├── python-udf.py ├── python-udwf.py ├── query-pyarrow-data.py ├── sql-parquet-s3.py ├── sql-parquet.py ├── sql-to-pandas.py ├── sql-using-python-udaf.py ├── sql-using-python-udf.py ├── substrait.py └── tpch │ ├── .gitignore │ ├── README.md │ ├── _tests.py │ ├── convert_data_to_parquet.py │ ├── q01_pricing_summary_report.py │ ├── 
q02_minimum_cost_supplier.py │ ├── q03_shipping_priority.py │ ├── q04_order_priority_checking.py │ ├── q05_local_supplier_volume.py │ ├── q06_forecasting_revenue_change.py │ ├── q07_volume_shipping.py │ ├── q08_market_share.py │ ├── q09_product_type_profit_measure.py │ ├── q10_returned_item_reporting.py │ ├── q11_important_stock_identification.py │ ├── q12_ship_mode_order_priority.py │ ├── q13_customer_distribution.py │ ├── q14_promotion_effect.py │ ├── q15_top_supplier.py │ ├── q16_part_supplier_relationship.py │ ├── q17_small_quantity_order.py │ ├── q18_large_volume_customer.py │ ├── q19_discounted_revenue.py │ ├── q20_potential_part_promotion.py │ ├── q21_suppliers_kept_orders_waiting.py │ ├── q22_global_sales_opportunity.py │ └── util.py ├── pyproject.toml ├── python ├── datafusion │ ├── __init__.py │ ├── catalog.py │ ├── col.py │ ├── common.py │ ├── context.py │ ├── dataframe.py │ ├── expr.py │ ├── functions.py │ ├── html_formatter.py │ ├── input │ │ ├── __init__.py │ │ ├── base.py │ │ └── location.py │ ├── io.py │ ├── object_store.py │ ├── plan.py │ ├── py.typed │ ├── record_batch.py │ ├── substrait.py │ ├── udf.py │ ├── unparser.py │ └── user_defined.py └── tests │ ├── __init__.py │ ├── conftest.py │ ├── data_test_context │ └── data.json │ ├── generic.py │ ├── test_aggregation.py │ ├── test_catalog.py │ ├── test_config.py │ ├── test_context.py │ ├── test_dataframe.py │ ├── test_expr.py │ ├── test_functions.py │ ├── test_imports.py │ ├── test_indexing.py │ ├── test_input.py │ ├── test_io.py │ ├── test_plans.py │ ├── test_sql.py │ ├── test_store.py │ ├── test_substrait.py │ ├── test_udaf.py │ ├── test_udf.py │ ├── test_udwf.py │ ├── test_unparser.py │ ├── test_view.py │ └── test_wrapper_coverage.py ├── src ├── catalog.rs ├── common.rs ├── common │ ├── data_type.rs │ ├── df_schema.rs │ ├── function.rs │ └── schema.rs ├── config.rs ├── context.rs ├── dataframe.rs ├── dataset.rs ├── dataset_exec.rs ├── errors.rs ├── expr.rs ├── expr │ ├── aggregate.rs │ ├── aggregate_expr.rs │ ├── alias.rs │ ├── analyze.rs │ ├── between.rs │ ├── binary_expr.rs │ ├── bool_expr.rs │ ├── case.rs │ ├── cast.rs │ ├── column.rs │ ├── conditional_expr.rs │ ├── copy_to.rs │ ├── create_catalog.rs │ ├── create_catalog_schema.rs │ ├── create_external_table.rs │ ├── create_function.rs │ ├── create_index.rs │ ├── create_memory_table.rs │ ├── create_view.rs │ ├── describe_table.rs │ ├── distinct.rs │ ├── dml.rs │ ├── drop_catalog_schema.rs │ ├── drop_function.rs │ ├── drop_table.rs │ ├── drop_view.rs │ ├── empty_relation.rs │ ├── exists.rs │ ├── explain.rs │ ├── extension.rs │ ├── filter.rs │ ├── grouping_set.rs │ ├── in_list.rs │ ├── in_subquery.rs │ ├── indexed_field.rs │ ├── join.rs │ ├── like.rs │ ├── limit.rs │ ├── literal.rs │ ├── logical_node.rs │ ├── placeholder.rs │ ├── projection.rs │ ├── recursive_query.rs │ ├── repartition.rs │ ├── scalar_subquery.rs │ ├── scalar_variable.rs │ ├── signature.rs │ ├── sort.rs │ ├── sort_expr.rs │ ├── statement.rs │ ├── subquery.rs │ ├── subquery_alias.rs │ ├── table_scan.rs │ ├── union.rs │ ├── unnest.rs │ ├── unnest_expr.rs │ ├── values.rs │ └── window.rs ├── functions.rs ├── lib.rs ├── physical_plan.rs ├── pyarrow_filter_expression.rs ├── pyarrow_util.rs ├── record_batch.rs ├── sql.rs ├── sql │ ├── exceptions.rs │ └── logical.rs ├── store.rs ├── substrait.rs ├── udaf.rs ├── udf.rs ├── udtf.rs ├── udwf.rs ├── unparser │ ├── dialect.rs │ └── mod.rs └── utils.rs └── uv.lock /.asf.yaml: -------------------------------------------------------------------------------- 1 | # 
Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | notifications: 19 | commits: commits@datafusion.apache.org 20 | issues: github@datafusion.apache.org 21 | pullrequests: github@datafusion.apache.org 22 | jira_options: link label worklog 23 | github: 24 | description: "Apache DataFusion Python Bindings" 25 | homepage: https://datafusion.apache.org/python 26 | enabled_merge_buttons: 27 | squash: true 28 | merge: false 29 | rebase: false 30 | features: 31 | issues: true 32 | protected_branches: 33 | main: 34 | required_pull_request_reviews: 35 | required_approving_review_count: 1 36 | 37 | staging: 38 | whoami: asf-staging 39 | subdir: python 40 | 41 | publish: 42 | whoami: asf-site 43 | subdir: python 44 | -------------------------------------------------------------------------------- /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.x86_64-apple-darwin] 2 | rustflags = [ 3 | "-C", "link-arg=-undefined", 4 | "-C", "link-arg=dynamic_lookup", 5 | ] 6 | 7 | [target.aarch64-apple-darwin] 8 | rustflags = [ 9 | "-C", "link-arg=-undefined", 10 | "-C", "link-arg=dynamic_lookup", 11 | ] 12 | 13 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .cargo 2 | .github 3 | .pytest_cache 4 | ci 5 | conda 6 | dev 7 | docs 8 | examples 9 | parquet 10 | target 11 | testing 12 | venv -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Additional context** 20 | Add any other context about the problem here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem or challenge? Please describe what you are trying to do.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 
12 | (This section helps Arrow developers understand the context and *why* for this feature, in addition to the *what*) 13 | 14 | **Describe the solution you'd like** 15 | A clear and concise description of what you want to happen. 16 | 17 | **Describe alternatives you've considered** 18 | A clear and concise description of any alternative solutions or features you've considered. 19 | 20 | **Additional context** 21 | Add any other context or screenshots about the feature request here. 22 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | # 19 | version: 2 20 | updates: 21 | 22 | - package-ecosystem: "cargo" 23 | directory: "/" 24 | schedule: 25 | interval: "weekly" 26 | day: "saturday" 27 | open-pull-requests-limit: 20 28 | target-branch: main 29 | 30 | - package-ecosystem: "github-actions" 31 | directory: "/" 32 | schedule: 33 | interval: "weekly" 34 | day: "sunday" 35 | open-pull-requests-limit: 20 36 | target-branch: main 37 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Which issue does this PR close? 2 | 3 | 6 | 7 | Closes #. 8 | 9 | # Rationale for this change 10 | 14 | 15 | # What changes are included in this PR? 16 | 19 | 20 | # Are there any user-facing changes? 21 | 24 | 25 | -------------------------------------------------------------------------------- /.github/workflows/dev.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
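# Runs the Apache Release Audit Tool (RAT) license check on every push and pull request.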
17 | 18 | name: Dev 19 | on: [push, pull_request] 20 | 21 | jobs: 22 | 23 | rat: 24 | name: Release Audit Tool (RAT) 25 | runs-on: ubuntu-latest 26 | steps: 27 | - name: Checkout 28 | uses: actions/checkout@v4 29 | - name: Setup Python 30 | uses: actions/setup-python@v5 31 | with: 32 | python-version: "3.10" 33 | - name: Audit licenses 34 | run: ./dev/release/run-rat.sh . 35 | -------------------------------------------------------------------------------- /.github/workflows/take.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | name: Assign the issue via a `take` comment 19 | on: 20 | issue_comment: 21 | types: created 22 | 23 | permissions: 24 | issues: write 25 | 26 | jobs: 27 | issue_assign: 28 | runs-on: ubuntu-latest 29 | if: (!github.event.issue.pull_request) && github.event.comment.body == 'take' 30 | concurrency: 31 | group: ${{ github.actor }}-issue-assign 32 | steps: 33 | - run: | 34 | CODE=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -LI https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees/${{ github.event.comment.user.login }} -o /dev/null -w '%{http_code}\n' -s) 35 | if [ "$CODE" -eq "204" ] 36 | then 37 | echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" 38 | curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees 39 | else 40 | echo "Cannot assign issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" 41 | fi -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | /venv 3 | .idea 4 | /docs/temp 5 | /docs/build 6 | .DS_Store 7 | .vscode 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # Python dist ignore 15 | dist 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Python dist 21 | dist 22 | 23 | # pyenv 24 | # For a library or package, you might want to ignore these files since the code is 25 | # intended to run in multiple environments; otherwise, check them in: 26 | .python-version 27 | venv 28 | .venv 29 | 30 | apache-rat-*.jar 31 | *rat.txt 32 | .env 33 | CHANGELOG.md.bak 34 | 35 | docs/mdbook/book 36 | 37 | .pyo3_build_config 38 | 39 | -------------------------------------------------------------------------------- /.gitmodules: 
-------------------------------------------------------------------------------- 1 | [submodule "testing"] 2 | path = testing 3 | url = https://github.com/apache/arrow-testing.git 4 | [submodule "parquet"] 5 | path = parquet 6 | url = https://github.com/apache/parquet-testing.git 7 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | repos: 19 | - repo: https://github.com/rhysd/actionlint 20 | rev: v1.7.6 21 | hooks: 22 | - id: actionlint-docker 23 | - repo: https://github.com/astral-sh/ruff-pre-commit 24 | # Ruff version. 25 | rev: v0.9.10 26 | hooks: 27 | # Run the linter. 28 | - id: ruff 29 | # Run the formatter. 30 | - id: ruff-format 31 | - repo: local 32 | hooks: 33 | - id: rust-fmt 34 | name: Rust fmt 35 | description: Run cargo fmt on files included in the commit. rustfmt should be installed before-hand. 36 | entry: cargo fmt --all -- 37 | pass_filenames: true 38 | types: [file, rust] 39 | language: system 40 | - id: rust-clippy 41 | name: Rust clippy 42 | description: Run cargo clippy on files included in the commit. clippy should be installed before-hand. 43 | entry: cargo clippy --all-targets --all-features -- -Dclippy::all -D warnings -Aclippy::redundant_closure 44 | pass_filenames: false 45 | types: [file, rust] 46 | language: system 47 | 48 | default_language_version: 49 | python: python3 50 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # DataFusion Python Changelog 21 | 22 | The changelogs have now moved [here](./dev/changelog). 23 | -------------------------------------------------------------------------------- /benchmarks/db-benchmark/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # DataFusion Implementation of db-benchmark 21 | 22 | This directory contains scripts for running [db-benchmark](https://github.com/duckdblabs/db-benchmark) with 23 | DataFusion's Python bindings. 24 | 25 | ## Directions 26 | 27 | Run the following from root of this project. 28 | 29 | ```bash 30 | docker build -t db-benchmark -f benchmarks/db-benchmark/db-benchmark.dockerfile . 
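# then run the benchmarks in an interactive, privileged container; run-bench.sh in this
# directory is the benchmark driver and prints time.csv when it finishes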
31 | docker run --privileged -it db-benchmark 32 | ``` 33 | -------------------------------------------------------------------------------- /benchmarks/db-benchmark/run-bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | set -e 19 | 20 | #SRC_DATANAME=G1_1e7_1e2_0_0 python3 /db-benchmark/polars/groupby-polars.py 21 | SRC_DATANAME=G1_1e7_1e2_0_0 python3 /db-benchmark/datafusion-python/groupby-datafusion.py 22 | 23 | # joins need more work still 24 | #SRC_DATANAME=G1_1e7_1e2_0_0 python3 /db-benchmark/datafusion-python/join-datafusion.py 25 | #SRC_DATANAME=G1_1e7_1e2_0_0 python3 /db-benchmark/polars/join-polars.py 26 | 27 | cat time.csv 28 | -------------------------------------------------------------------------------- /benchmarks/tpch/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | results.csv -------------------------------------------------------------------------------- /benchmarks/tpch/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # DataFusion Python Benchmarks Derived from TPC-H 21 | 22 | ## Create Release Build 23 | 24 | From repo root: 25 | 26 | ```bash 27 | maturin develop --release 28 | ``` 29 | 30 | Note that release builds take a really long time, so you may want to temporarily comment out this section of the 31 | root Cargo.toml when frequently building. 32 | 33 | ```toml 34 | [profile.release] 35 | lto = true 36 | codegen-units = 1 37 | ``` 38 | 39 | ## Generate Data 40 | 41 | ```bash 42 | ./tpch-gen.sh 1 43 | ``` 44 | 45 | ## Run Benchmarks 46 | 47 | ```bash 48 | python tpch.py ./data ./queries 49 | ``` 50 | 51 | A summary of the benchmark timings will be written to `results.csv`. For example: 52 | 53 | ```csv 54 | setup,1.4 55 | q1,2978.6 56 | q2,679.7 57 | q3,2943.7 58 | q4,2894.9 59 | q5,3592.3 60 | q6,1691.4 61 | q7,3003.9 62 | q8,3818.7 63 | q9,4237.9 64 | q10,2344.7 65 | q11,526.1 66 | q12,2284.6 67 | q13,1009.2 68 | q14,1738.4 69 | q15,1942.1 70 | q16,499.8 71 | q17,5178.9 72 | q18,4127.7 73 | q19,2056.6 74 | q20,2162.5 75 | q21,8046.5 76 | q22,754.9 77 | total,58513.2 78 | ``` -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q1.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 1 derived from TPC-H query 1 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
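-- Pricing summary report: totals and averages of quantity, price, discount and charge for lineitems, grouped by return flag and line status.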
3 | select 4 | l_returnflag, 5 | l_linestatus, 6 | sum(l_quantity) as sum_qty, 7 | sum(l_extendedprice) as sum_base_price, 8 | sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, 9 | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, 10 | avg(l_quantity) as avg_qty, 11 | avg(l_extendedprice) as avg_price, 12 | avg(l_discount) as avg_disc, 13 | count(*) as count_order 14 | from 15 | lineitem 16 | where 17 | l_shipdate <= date '1998-12-01' - interval '68 days' 18 | group by 19 | l_returnflag, 20 | l_linestatus 21 | order by 22 | l_returnflag, 23 | l_linestatus; 24 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q10.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 10 derived from TPC-H query 10 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 3 | select 4 | c_custkey, 5 | c_name, 6 | sum(l_extendedprice * (1 - l_discount)) as revenue, 7 | c_acctbal, 8 | n_name, 9 | c_address, 10 | c_phone, 11 | c_comment 12 | from 13 | customer, 14 | orders, 15 | lineitem, 16 | nation 17 | where 18 | c_custkey = o_custkey 19 | and l_orderkey = o_orderkey 20 | and o_orderdate >= date '1993-07-01' 21 | and o_orderdate < date '1993-07-01' + interval '3' month 22 | and l_returnflag = 'R' 23 | and c_nationkey = n_nationkey 24 | group by 25 | c_custkey, 26 | c_name, 27 | c_acctbal, 28 | c_phone, 29 | n_name, 30 | c_address, 31 | c_comment 32 | order by 33 | revenue desc limit 20; 34 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q11.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 11 derived from TPC-H query 11 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 3 | select 4 | ps_partkey, 5 | sum(ps_supplycost * ps_availqty) as value 6 | from 7 | partsupp, 8 | supplier, 9 | nation 10 | where 11 | ps_suppkey = s_suppkey 12 | and s_nationkey = n_nationkey 13 | and n_name = 'ALGERIA' 14 | group by 15 | ps_partkey having 16 | sum(ps_supplycost * ps_availqty) > ( 17 | select 18 | sum(ps_supplycost * ps_availqty) * 0.0001000000 19 | from 20 | partsupp, 21 | supplier, 22 | nation 23 | where 24 | ps_suppkey = s_suppkey 25 | and s_nationkey = n_nationkey 26 | and n_name = 'ALGERIA' 27 | ) 28 | order by 29 | value desc; 30 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q12.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 12 derived from TPC-H query 12 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
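-- Ship mode and order priority: for 'FOB' and 'SHIP' lineitems received during 1995 after their commit date, counts urgent/high-priority versus other orders per ship mode.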
3 | select 4 | l_shipmode, 5 | sum(case 6 | when o_orderpriority = '1-URGENT' 7 | or o_orderpriority = '2-HIGH' 8 | then 1 9 | else 0 10 | end) as high_line_count, 11 | sum(case 12 | when o_orderpriority <> '1-URGENT' 13 | and o_orderpriority <> '2-HIGH' 14 | then 1 15 | else 0 16 | end) as low_line_count 17 | from 18 | orders, 19 | lineitem 20 | where 21 | o_orderkey = l_orderkey 22 | and l_shipmode in ('FOB', 'SHIP') 23 | and l_commitdate < l_receiptdate 24 | and l_shipdate < l_commitdate 25 | and l_receiptdate >= date '1995-01-01' 26 | and l_receiptdate < date '1995-01-01' + interval '1' year 27 | group by 28 | l_shipmode 29 | order by 30 | l_shipmode; 31 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q13.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 13 derived from TPC-H query 13 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 3 | select 4 | c_count, 5 | count(*) as custdist 6 | from 7 | ( 8 | select 9 | c_custkey, 10 | count(o_orderkey) 11 | from 12 | customer left outer join orders on 13 | c_custkey = o_custkey 14 | and o_comment not like '%express%requests%' 15 | group by 16 | c_custkey 17 | ) as c_orders (c_custkey, c_count) 18 | group by 19 | c_count 20 | order by 21 | custdist desc, 22 | c_count desc; 23 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q14.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 14 derived from TPC-H query 14 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 3 | select 4 | 100.00 * sum(case 5 | when p_type like 'PROMO%' 6 | then l_extendedprice * (1 - l_discount) 7 | else 0 8 | end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue 9 | from 10 | lineitem, 11 | part 12 | where 13 | l_partkey = p_partkey 14 | and l_shipdate >= date '1995-02-01' 15 | and l_shipdate < date '1995-02-01' + interval '1' month; 16 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q15.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 15 derived from TPC-H query 15 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 3 | create view revenue0 (supplier_no, total_revenue) as 4 | select 5 | l_suppkey, 6 | sum(l_extendedprice * (1 - l_discount)) 7 | from 8 | lineitem 9 | where 10 | l_shipdate >= date '1996-08-01' 11 | and l_shipdate < date '1996-08-01' + interval '3' month 12 | group by 13 | l_suppkey; 14 | select 15 | s_suppkey, 16 | s_name, 17 | s_address, 18 | s_phone, 19 | total_revenue 20 | from 21 | supplier, 22 | revenue0 23 | where 24 | s_suppkey = supplier_no 25 | and total_revenue = ( 26 | select 27 | max(total_revenue) 28 | from 29 | revenue0 30 | ) 31 | order by 32 | s_suppkey; 33 | drop view revenue0; 34 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q16.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 16 derived from TPC-H query 16 under the terms of the TPC Fair Use Policy. 
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 3 | select 4 | p_brand, 5 | p_type, 6 | p_size, 7 | count(distinct ps_suppkey) as supplier_cnt 8 | from 9 | partsupp, 10 | part 11 | where 12 | p_partkey = ps_partkey 13 | and p_brand <> 'Brand#14' 14 | and p_type not like 'SMALL PLATED%' 15 | and p_size in (14, 6, 5, 31, 49, 15, 41, 47) 16 | and ps_suppkey not in ( 17 | select 18 | s_suppkey 19 | from 20 | supplier 21 | where 22 | s_comment like '%Customer%Complaints%' 23 | ) 24 | group by 25 | p_brand, 26 | p_type, 27 | p_size 28 | order by 29 | supplier_cnt desc, 30 | p_brand, 31 | p_type, 32 | p_size; 33 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q17.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 17 derived from TPC-H query 17 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 3 | select 4 | sum(l_extendedprice) / 7.0 as avg_yearly 5 | from 6 | lineitem, 7 | part 8 | where 9 | p_partkey = l_partkey 10 | and p_brand = 'Brand#42' 11 | and p_container = 'LG BAG' 12 | and l_quantity < ( 13 | select 14 | 0.2 * avg(l_quantity) 15 | from 16 | lineitem 17 | where 18 | l_partkey = p_partkey 19 | ); 20 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q18.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 18 derived from TPC-H query 18 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 3 | select 4 | c_name, 5 | c_custkey, 6 | o_orderkey, 7 | o_orderdate, 8 | o_totalprice, 9 | sum(l_quantity) 10 | from 11 | customer, 12 | orders, 13 | lineitem 14 | where 15 | o_orderkey in ( 16 | select 17 | l_orderkey 18 | from 19 | lineitem 20 | group by 21 | l_orderkey having 22 | sum(l_quantity) > 313 23 | ) 24 | and c_custkey = o_custkey 25 | and o_orderkey = l_orderkey 26 | group by 27 | c_name, 28 | c_custkey, 29 | o_orderkey, 30 | o_orderdate, 31 | o_totalprice 32 | order by 33 | o_totalprice desc, 34 | o_orderdate limit 100; 35 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q19.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 19 derived from TPC-H query 19 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
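-- Discounted revenue: revenue from air-shipped, deliver-in-person lineitems of three specific brands in small, medium, or large containers, each within its own quantity and part-size range.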
3 | select 4 | sum(l_extendedprice* (1 - l_discount)) as revenue 5 | from 6 | lineitem, 7 | part 8 | where 9 | ( 10 | p_partkey = l_partkey 11 | and p_brand = 'Brand#21' 12 | and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') 13 | and l_quantity >= 8 and l_quantity <= 8 + 10 14 | and p_size between 1 and 5 15 | and l_shipmode in ('AIR', 'AIR REG') 16 | and l_shipinstruct = 'DELIVER IN PERSON' 17 | ) 18 | or 19 | ( 20 | p_partkey = l_partkey 21 | and p_brand = 'Brand#13' 22 | and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') 23 | and l_quantity >= 20 and l_quantity <= 20 + 10 24 | and p_size between 1 and 10 25 | and l_shipmode in ('AIR', 'AIR REG') 26 | and l_shipinstruct = 'DELIVER IN PERSON' 27 | ) 28 | or 29 | ( 30 | p_partkey = l_partkey 31 | and p_brand = 'Brand#52' 32 | and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') 33 | and l_quantity >= 30 and l_quantity <= 30 + 10 34 | and p_size between 1 and 15 35 | and l_shipmode in ('AIR', 'AIR REG') 36 | and l_shipinstruct = 'DELIVER IN PERSON' 37 | ); 38 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q2.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 2 derived from TPC-H query 2 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 3 | select 4 | s_acctbal, 5 | s_name, 6 | n_name, 7 | p_partkey, 8 | p_mfgr, 9 | s_address, 10 | s_phone, 11 | s_comment 12 | from 13 | part, 14 | supplier, 15 | partsupp, 16 | nation, 17 | region 18 | where 19 | p_partkey = ps_partkey 20 | and s_suppkey = ps_suppkey 21 | and p_size = 48 22 | and p_type like '%TIN' 23 | and s_nationkey = n_nationkey 24 | and n_regionkey = r_regionkey 25 | and r_name = 'ASIA' 26 | and ps_supplycost = ( 27 | select 28 | min(ps_supplycost) 29 | from 30 | partsupp, 31 | supplier, 32 | nation, 33 | region 34 | where 35 | p_partkey = ps_partkey 36 | and s_suppkey = ps_suppkey 37 | and s_nationkey = n_nationkey 38 | and n_regionkey = r_regionkey 39 | and r_name = 'ASIA' 40 | ) 41 | order by 42 | s_acctbal desc, 43 | n_name, 44 | s_name, 45 | p_partkey limit 100; 46 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q20.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 20 derived from TPC-H query 20 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
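-- Potential part promotion: suppliers in KENYA whose available stock of 'blanched%' parts exceeds half the quantity they shipped during 1993.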
3 | select 4 | s_name, 5 | s_address 6 | from 7 | supplier, 8 | nation 9 | where 10 | s_suppkey in ( 11 | select 12 | ps_suppkey 13 | from 14 | partsupp 15 | where 16 | ps_partkey in ( 17 | select 18 | p_partkey 19 | from 20 | part 21 | where 22 | p_name like 'blanched%' 23 | ) 24 | and ps_availqty > ( 25 | select 26 | 0.5 * sum(l_quantity) 27 | from 28 | lineitem 29 | where 30 | l_partkey = ps_partkey 31 | and l_suppkey = ps_suppkey 32 | and l_shipdate >= date '1993-01-01' 33 | and l_shipdate < date '1993-01-01' + interval '1' year 34 | ) 35 | ) 36 | and s_nationkey = n_nationkey 37 | and n_name = 'KENYA' 38 | order by 39 | s_name; 40 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q21.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 21 derived from TPC-H query 21 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 3 | select 4 | s_name, 5 | count(*) as numwait 6 | from 7 | supplier, 8 | lineitem l1, 9 | orders, 10 | nation 11 | where 12 | s_suppkey = l1.l_suppkey 13 | and o_orderkey = l1.l_orderkey 14 | and o_orderstatus = 'F' 15 | and l1.l_receiptdate > l1.l_commitdate 16 | and exists ( 17 | select 18 | * 19 | from 20 | lineitem l2 21 | where 22 | l2.l_orderkey = l1.l_orderkey 23 | and l2.l_suppkey <> l1.l_suppkey 24 | ) 25 | and not exists ( 26 | select 27 | * 28 | from 29 | lineitem l3 30 | where 31 | l3.l_orderkey = l1.l_orderkey 32 | and l3.l_suppkey <> l1.l_suppkey 33 | and l3.l_receiptdate > l3.l_commitdate 34 | ) 35 | and s_nationkey = n_nationkey 36 | and n_name = 'ARGENTINA' 37 | group by 38 | s_name 39 | order by 40 | numwait desc, 41 | s_name limit 100; 42 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q22.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 22 derived from TPC-H query 22 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 3 | select 4 | cntrycode, 5 | count(*) as numcust, 6 | sum(c_acctbal) as totacctbal 7 | from 8 | ( 9 | select 10 | substring(c_phone from 1 for 2) as cntrycode, 11 | c_acctbal 12 | from 13 | customer 14 | where 15 | substring(c_phone from 1 for 2) in 16 | ('24', '34', '16', '30', '33', '14', '13') 17 | and c_acctbal > ( 18 | select 19 | avg(c_acctbal) 20 | from 21 | customer 22 | where 23 | c_acctbal > 0.00 24 | and substring(c_phone from 1 for 2) in 25 | ('24', '34', '16', '30', '33', '14', '13') 26 | ) 27 | and not exists ( 28 | select 29 | * 30 | from 31 | orders 32 | where 33 | o_custkey = c_custkey 34 | ) 35 | ) as custsale 36 | group by 37 | cntrycode 38 | order by 39 | cntrycode; 40 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q3.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 3 derived from TPC-H query 3 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
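-- Shipping priority: the ten highest-revenue 'BUILDING'-segment orders placed before 1995-03-15 with lineitems shipped after that date.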
3 | select 4 | l_orderkey, 5 | sum(l_extendedprice * (1 - l_discount)) as revenue, 6 | o_orderdate, 7 | o_shippriority 8 | from 9 | customer, 10 | orders, 11 | lineitem 12 | where 13 | c_mktsegment = 'BUILDING' 14 | and c_custkey = o_custkey 15 | and l_orderkey = o_orderkey 16 | and o_orderdate < date '1995-03-15' 17 | and l_shipdate > date '1995-03-15' 18 | group by 19 | l_orderkey, 20 | o_orderdate, 21 | o_shippriority 22 | order by 23 | revenue desc, 24 | o_orderdate limit 10; 25 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q4.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 4 derived from TPC-H query 4 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 3 | select 4 | o_orderpriority, 5 | count(*) as order_count 6 | from 7 | orders 8 | where 9 | o_orderdate >= date '1995-04-01' 10 | and o_orderdate < date '1995-04-01' + interval '3' month 11 | and exists ( 12 | select 13 | * 14 | from 15 | lineitem 16 | where 17 | l_orderkey = o_orderkey 18 | and l_commitdate < l_receiptdate 19 | ) 20 | group by 21 | o_orderpriority 22 | order by 23 | o_orderpriority; 24 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q5.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 5 derived from TPC-H query 5 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 3 | select 4 | n_name, 5 | sum(l_extendedprice * (1 - l_discount)) as revenue 6 | from 7 | customer, 8 | orders, 9 | lineitem, 10 | supplier, 11 | nation, 12 | region 13 | where 14 | c_custkey = o_custkey 15 | and l_orderkey = o_orderkey 16 | and l_suppkey = s_suppkey 17 | and c_nationkey = s_nationkey 18 | and s_nationkey = n_nationkey 19 | and n_regionkey = r_regionkey 20 | and r_name = 'AFRICA' 21 | and o_orderdate >= date '1994-01-01' 22 | and o_orderdate < date '1994-01-01' + interval '1' year 23 | group by 24 | n_name 25 | order by 26 | revenue desc; 27 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q6.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 6 derived from TPC-H query 6 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 3 | select 4 | sum(l_extendedprice * l_discount) as revenue 5 | from 6 | lineitem 7 | where 8 | l_shipdate >= date '1994-01-01' 9 | and l_shipdate < date '1994-01-01' + interval '1' year 10 | and l_discount between 0.04 - 0.01 and 0.04 + 0.01 11 | and l_quantity < 24; 12 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q7.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 7 derived from TPC-H query 7 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
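-- Volume shipping: yearly revenue from lineitems shipped between GERMANY and IRAQ (in either direction) during 1995-1996, grouped by supplier nation and customer nation.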
3 | select 4 | supp_nation, 5 | cust_nation, 6 | l_year, 7 | sum(volume) as revenue 8 | from 9 | ( 10 | select 11 | n1.n_name as supp_nation, 12 | n2.n_name as cust_nation, 13 | extract(year from l_shipdate) as l_year, 14 | l_extendedprice * (1 - l_discount) as volume 15 | from 16 | supplier, 17 | lineitem, 18 | orders, 19 | customer, 20 | nation n1, 21 | nation n2 22 | where 23 | s_suppkey = l_suppkey 24 | and o_orderkey = l_orderkey 25 | and c_custkey = o_custkey 26 | and s_nationkey = n1.n_nationkey 27 | and c_nationkey = n2.n_nationkey 28 | and ( 29 | (n1.n_name = 'GERMANY' and n2.n_name = 'IRAQ') 30 | or (n1.n_name = 'IRAQ' and n2.n_name = 'GERMANY') 31 | ) 32 | and l_shipdate between date '1995-01-01' and date '1996-12-31' 33 | ) as shipping 34 | group by 35 | supp_nation, 36 | cust_nation, 37 | l_year 38 | order by 39 | supp_nation, 40 | cust_nation, 41 | l_year; 42 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q8.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 8 derived from TPC-H query 8 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 3 | select 4 | o_year, 5 | sum(case 6 | when nation = 'IRAQ' then volume 7 | else 0 8 | end) / sum(volume) as mkt_share 9 | from 10 | ( 11 | select 12 | extract(year from o_orderdate) as o_year, 13 | l_extendedprice * (1 - l_discount) as volume, 14 | n2.n_name as nation 15 | from 16 | part, 17 | supplier, 18 | lineitem, 19 | orders, 20 | customer, 21 | nation n1, 22 | nation n2, 23 | region 24 | where 25 | p_partkey = l_partkey 26 | and s_suppkey = l_suppkey 27 | and l_orderkey = o_orderkey 28 | and o_custkey = c_custkey 29 | and c_nationkey = n1.n_nationkey 30 | and n1.n_regionkey = r_regionkey 31 | and r_name = 'MIDDLE EAST' 32 | and s_nationkey = n2.n_nationkey 33 | and o_orderdate between date '1995-01-01' and date '1996-12-31' 34 | and p_type = 'LARGE PLATED STEEL' 35 | ) as all_nations 36 | group by 37 | o_year 38 | order by 39 | o_year; 40 | -------------------------------------------------------------------------------- /benchmarks/tpch/queries/q9.sql: -------------------------------------------------------------------------------- 1 | -- Benchmark Query 9 derived from TPC-H query 9 under the terms of the TPC Fair Use Policy. 2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 3 | select 4 | nation, 5 | o_year, 6 | sum(amount) as sum_profit 7 | from 8 | ( 9 | select 10 | n_name as nation, 11 | extract(year from o_orderdate) as o_year, 12 | l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount 13 | from 14 | part, 15 | supplier, 16 | lineitem, 17 | partsupp, 18 | orders, 19 | nation 20 | where 21 | s_suppkey = l_suppkey 22 | and ps_suppkey = l_suppkey 23 | and ps_partkey = l_partkey 24 | and p_partkey = l_partkey 25 | and o_orderkey = l_orderkey 26 | and s_nationkey = n_nationkey 27 | and p_name like '%moccasin%' 28 | ) as profit 29 | group by 30 | nation, 31 | o_year 32 | order by 33 | nation, 34 | o_year desc; 35 | -------------------------------------------------------------------------------- /benchmarks/tpch/tpch-gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. 
See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | mkdir -p data/answers 2>/dev/null 20 | 21 | set -e 22 | 23 | # If RUN_IN_CI is set, then do not produce verbose output or use an interactive terminal 24 | if [[ -z "${RUN_IN_CI}" ]]; then 25 | TERMINAL_FLAG="-it" 26 | VERBOSE_OUTPUT="-vf" 27 | else 28 | TERMINAL_FLAG="" 29 | VERBOSE_OUTPUT="-f" 30 | fi 31 | 32 | #pushd .. 33 | #. ./dev/build-set-env.sh 34 | #popd 35 | 36 | # Generate data into the ./data directory if it does not already exist 37 | FILE=./data/supplier.tbl 38 | if test -f "$FILE"; then 39 | echo "$FILE exists." 40 | else 41 | docker run -v `pwd`/data:/data $TERMINAL_FLAG --rm ghcr.io/scalytics/tpch-docker:main $VERBOSE_OUTPUT -s $1 42 | 43 | # workaround for https://github.com/apache/arrow-datafusion/issues/6147 44 | mv data/customer.tbl data/customer.csv 45 | mv data/lineitem.tbl data/lineitem.csv 46 | mv data/nation.tbl data/nation.csv 47 | mv data/orders.tbl data/orders.csv 48 | mv data/part.tbl data/part.csv 49 | mv data/partsupp.tbl data/partsupp.csv 50 | mv data/region.tbl data/region.csv 51 | mv data/supplier.tbl data/supplier.csv 52 | 53 | ls -l data 54 | fi 55 | 56 | # Copy expected answers (at SF=1) into the ./data/answers directory if it does not already exist 57 | FILE=./data/answers/q1.out 58 | if test -f "$FILE"; then 59 | echo "$FILE exists." 60 | else 61 | docker run -v `pwd`/data:/data $TERMINAL_FLAG --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/" 62 | fi 63 | -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
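// add_extension_module_link_args() emits the linker flags needed to build this crate as a
// Python extension module; on macOS these are the same `-undefined dynamic_lookup` arguments
// configured in .cargo/config.toml.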
17 | 18 | fn main() { 19 | pyo3_build_config::add_extension_module_link_args(); 20 | } 21 | -------------------------------------------------------------------------------- /ci/scripts/python_lint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | set -ex 21 | ruff format datafusion 22 | ruff check datafusion -------------------------------------------------------------------------------- /ci/scripts/rust_clippy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | set -ex 21 | cargo clippy --all-targets --workspace --features default -- -D warnings 22 | -------------------------------------------------------------------------------- /ci/scripts/rust_fmt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
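# Formatting check only: `cargo fmt --all -- --check` exits non-zero if any Rust source would be reformatted.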
19 | 20 | set -ex 21 | cargo fmt --all -- --check 22 | -------------------------------------------------------------------------------- /ci/scripts/rust_toml_fmt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | set -ex 21 | find . -mindepth 2 -name 'Cargo.toml' -exec cargo tomlfmt -p {} \; 22 | -------------------------------------------------------------------------------- /dev/build-set-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | export PY_DATAFUSION_VERSION=$(awk -F'[ ="]+' '$1 == "version" { print $2 }' Cargo.toml) 21 | -------------------------------------------------------------------------------- /dev/changelog/45.0.0.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Apache DataFusion Python 45.0.0 Changelog 21 | 22 | This release consists of 2 commits from 2 contributors. See credits at the end of this changelog for more information. 23 | 24 | **Fixed bugs:** 25 | 26 | - fix: add to_timestamp_nanos [#1020](https://github.com/apache/datafusion-python/pull/1020) (chenkovsky) 27 | 28 | **Other:** 29 | 30 | - Chore/upgrade datafusion 45 [#1010](https://github.com/apache/datafusion-python/pull/1010) (kevinjqliu) 31 | 32 | ## Credits 33 | 34 | Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. 35 | 36 | ``` 37 | 1 Kevin Liu 38 | 1 Tim Saucer 39 | ``` 40 | 41 | Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. 
42 | 43 | -------------------------------------------------------------------------------- /dev/clean.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # 20 | 21 | # This cleans up the project by removing build artifacts and other generated files. 22 | 23 | # Function to remove a directory and print the action 24 | remove_dir() { 25 | if [ -d "$1" ]; then 26 | echo "Removing directory: $1" 27 | rm -rf "$1" 28 | fi 29 | } 30 | 31 | # Function to remove a file and print the action 32 | remove_file() { 33 | if [ -f "$1" ]; then 34 | echo "Removing file: $1" 35 | rm -f "$1" 36 | fi 37 | } 38 | 39 | # Remove .pytest_cache directory 40 | remove_dir .pytest_cache/ 41 | 42 | # Remove target directory 43 | remove_dir target/ 44 | 45 | # Remove any __pycache__ directories 46 | find python/ -type d -name "__pycache__" -print | while read -r dir; do 47 | remove_dir "$dir" 48 | done 49 | 50 | # Remove pytest-coverage.lcov file 51 | # remove_file .coverage 52 | # remove_file pytest-coverage.lcov 53 | 54 | # Remove rust-coverage.lcov file 55 | # remove_file rust-coverage.lcov 56 | 57 | # Remove pyo3 files 58 | find python/ -type f -name '_internal.*.so' -print | while read -r file; do 59 | remove_file "$file" 60 | done 61 | 62 | echo "Cleanup complete." -------------------------------------------------------------------------------- /dev/python_lint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # This script runs all the Rust lints locally the same way the 21 | # DataFusion CI does 22 | 23 | set -e 24 | source .venv/bin/activate 25 | flake8 --exclude venv,benchmarks/db-benchmark --ignore=E501,W503 26 | black --line-length 79 . 
27 | -------------------------------------------------------------------------------- /dev/release/check-rat-report.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ############################################################################## 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | ############################################################################## 20 | import fnmatch 21 | import re 22 | import sys 23 | import xml.etree.ElementTree as ET 24 | 25 | if len(sys.argv) != 3: 26 | sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" % sys.argv[0]) 27 | sys.exit(1) 28 | 29 | exclude_globs_filename = sys.argv[1] 30 | xml_filename = sys.argv[2] 31 | 32 | globs = [line.strip() for line in open(exclude_globs_filename)] 33 | 34 | tree = ET.parse(xml_filename) 35 | root = tree.getroot() 36 | resources = root.findall("resource") 37 | 38 | all_ok = True 39 | for r in resources: 40 | approvals = r.findall("license-approval") 41 | if not approvals or approvals[0].attrib["name"] == "true": 42 | continue 43 | clean_name = re.sub("^[^/]+/", "", r.attrib["name"]) 44 | excluded = False 45 | for g in globs: 46 | if fnmatch.fnmatch(clean_name, g): 47 | excluded = True 48 | break 49 | if not excluded: 50 | sys.stdout.write( 51 | "NOT APPROVED: %s (%s): %s\n" 52 | % (clean_name, r.attrib["name"], approvals[0].attrib["name"]) 53 | ) 54 | all_ok = False 55 | 56 | if not all_ok: 57 | sys.exit(1) 58 | 59 | print("OK") 60 | sys.exit(0) 61 | -------------------------------------------------------------------------------- /dev/release/rat_exclude_files.txt: -------------------------------------------------------------------------------- 1 | *.npmrc 2 | *.gitignore 3 | *.dockerignore 4 | .gitmodules 5 | *_generated.h 6 | *_generated.js 7 | *_generated.ts 8 | *.csv 9 | *.json 10 | *.snap 11 | .github/ISSUE_TEMPLATE/*.md 12 | .github/pull_request_template.md 13 | CHANGELOG.md 14 | dev/release/rat_exclude_files.txt 15 | MANIFEST.in 16 | __init__.pxd 17 | __init__.py 18 | *.html 19 | *.sgml 20 | *.css 21 | *.png 22 | *.ico 23 | *.svg 24 | *.devhelp2 25 | *.scss 26 | .gitattributes 27 | requirements.txt 28 | *requirements*.txt 29 | **/testdata/* 30 | ci/* 31 | **/*.svg 32 | **/*.csv 33 | **/*.json 34 | **/*.sql 35 | venv/* 36 | parquet/* 37 | testing/* 38 | target/* 39 | **/target/* 40 | Cargo.lock 41 | **/Cargo.lock 42 | .history 43 | *rat.txt 44 | */.git 45 | .github/* 46 | benchmarks/tpch/queries/q*.sql 47 | benchmarks/tpch/create_tables.sql 48 | .cargo/config.toml 49 | **/.cargo/config.toml 50 | uv.lock -------------------------------------------------------------------------------- /dev/release/release-tarball.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # 20 | 21 | # Adapted from https://github.com/apache/arrow-rs/tree/master/dev/release/release-tarball.sh 22 | 23 | # This script copies a tarball from the "dev" area of the 24 | # dist.apache.arrow repository to the "release" area 25 | # 26 | # This script should only be run after the release has been approved 27 | # by the arrow PMC committee. 28 | # 29 | # See release/README.md for full release instructions 30 | # 31 | # Based in part on post-01-upload.sh from apache/arrow 32 | 33 | 34 | set -e 35 | set -u 36 | 37 | if [ "$#" -ne 2 ]; then 38 | echo "Usage: $0 <version> <rc>" 39 | echo "ex. $0 4.1.0 2" 40 | exit 41 | fi 42 | 43 | version=$1 44 | rc=$2 45 | 46 | tmp_dir=tmp-apache-datafusion-python-dist 47 | 48 | echo "Recreate temporary directory: ${tmp_dir}" 49 | rm -rf ${tmp_dir} 50 | mkdir -p ${tmp_dir} 51 | 52 | echo "Clone dev dist repository" 53 | svn \ 54 | co \ 55 | https://dist.apache.org/repos/dist/dev/datafusion/apache-datafusion-python-${version}-rc${rc} \ 56 | ${tmp_dir}/dev 57 | 58 | echo "Clone release dist repository" 59 | svn co https://dist.apache.org/repos/dist/release/datafusion ${tmp_dir}/release 60 | 61 | echo "Copy ${version}-rc${rc} to release working copy" 62 | release_version=datafusion-python-${version} 63 | mkdir -p ${tmp_dir}/release/${release_version} 64 | cp -r ${tmp_dir}/dev/* ${tmp_dir}/release/${release_version}/ 65 | svn add ${tmp_dir}/release/${release_version} 66 | 67 | echo "Commit release" 68 | svn ci -m "Apache DataFusion Python ${version}" ${tmp_dir}/release 69 | 70 | echo "Clean up" 71 | rm -rf ${tmp_dir} 72 | 73 | echo "Success! The release is available here:" 74 | echo " https://dist.apache.org/repos/dist/release/datafusion/${release_version}" 75 | -------------------------------------------------------------------------------- /dev/release/run-rat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License.
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # 20 | 21 | RAT_VERSION=0.13 22 | 23 | # download apache rat 24 | if [ ! -f apache-rat-${RAT_VERSION}.jar ]; then 25 | curl -s https://repo1.maven.org/maven2/org/apache/rat/apache-rat/${RAT_VERSION}/apache-rat-${RAT_VERSION}.jar > apache-rat-${RAT_VERSION}.jar 26 | fi 27 | 28 | RAT="java -jar apache-rat-${RAT_VERSION}.jar -x " 29 | 30 | RELEASE_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) 31 | 32 | # generate the rat report 33 | $RAT $1 > rat.txt 34 | python $RELEASE_DIR/check-rat-report.py $RELEASE_DIR/rat_exclude_files.txt rat.txt > filtered_rat.txt 35 | cat filtered_rat.txt 36 | UNAPPROVED=`cat filtered_rat.txt | grep "NOT APPROVED" | wc -l` 37 | 38 | if [ "0" -eq "${UNAPPROVED}" ]; then 39 | echo "No unapproved licenses" 40 | else 41 | echo "${UNAPPROVED} unapproved licences. Check rat report: rat.txt" 42 | exit 1 43 | fi 44 | -------------------------------------------------------------------------------- /dev/rust_lint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # This script runs all the Rust lints locally the same way the 21 | # DataFusion CI does 22 | 23 | set -e 24 | if ! command -v cargo-tomlfmt &> /dev/null; then 25 | echo "Installing cargo-tomlfmt using cargo" 26 | cargo install cargo-tomlfmt 27 | fi 28 | 29 | ci/scripts/rust_fmt.sh 30 | ci/scripts/rust_clippy.sh 31 | ci/scripts/rust_toml_fmt.sh 32 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | pokemon.csv 2 | yellow_trip_data.parquet 3 | yellow_tripdata_2021-01.parquet 4 | 5 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. 
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # 19 | # Minimal makefile for Sphinx documentation 20 | # 21 | 22 | # You can set these variables from the command line, and also 23 | # from the environment for the first two. 24 | SPHINXOPTS ?= 25 | SPHINXBUILD ?= sphinx-build 26 | SOURCEDIR = source 27 | BUILDDIR = build 28 | 29 | # Put it first so that "make" without argument is like "make help". 30 | help: 31 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 32 | 33 | .PHONY: help Makefile 34 | 35 | # Catch-all target: route all unknown targets to Sphinx using the new 36 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 37 | %: Makefile 38 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # 20 | 21 | set -e 22 | 23 | original_dir=$(pwd) 24 | script_dir=$(dirname "$(realpath "$0")") 25 | cd "$script_dir" || exit 26 | 27 | if [ ! -f pokemon.csv ]; then 28 | curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv 29 | fi 30 | 31 | if [ ! -f yellow_tripdata_2021-01.parquet ]; then 32 | curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet 33 | fi 34 | 35 | rm -rf build 2> /dev/null 36 | rm -rf temp 2> /dev/null 37 | mkdir temp 38 | cp -rf source/* temp/ 39 | make SOURCEDIR=`pwd`/temp html 40 | 41 | cd "$original_dir" || exit 42 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @rem Licensed to the Apache Software Foundation (ASF) under one 2 | @rem or more contributor license agreements. See the NOTICE file 3 | @rem distributed with this work for additional information 4 | @rem regarding copyright ownership. 
The ASF licenses this file 5 | @rem to you under the Apache License, Version 2.0 (the 6 | @rem "License"); you may not use this file except in compliance 7 | @rem with the License. You may obtain a copy of the License at 8 | @rem 9 | @rem http://www.apache.org/licenses/LICENSE-2.0 10 | @rem 11 | @rem Unless required by applicable law or agreed to in writing, 12 | @rem software distributed under the License is distributed on an 13 | @rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | @rem KIND, either express or implied. See the License for the 15 | @rem specific language governing permissions and limitations 16 | @rem under the License. 17 | 18 | @ECHO OFF 19 | 20 | pushd %~dp0 21 | 22 | REM Command file for Sphinx documentation 23 | 24 | if "%SPHINXBUILD%" == "" ( 25 | set SPHINXBUILD=sphinx-build 26 | ) 27 | set SOURCEDIR=source 28 | set BUILDDIR=build 29 | 30 | if "%1" == "" goto help 31 | 32 | %SPHINXBUILD% >NUL 2>NUL 33 | if errorlevel 9009 ( 34 | echo. 35 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 36 | echo.installed, then set the SPHINXBUILD environment variable to point 37 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 38 | echo.may add the Sphinx directory to PATH. 39 | echo. 40 | echo.If you don't have Sphinx installed, grab it from 41 | echo.http://sphinx-doc.org/ 42 | exit /b 1 43 | ) 44 | 45 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 46 | goto end 47 | 48 | :help 49 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 50 | 51 | :end 52 | popd -------------------------------------------------------------------------------- /docs/mdbook/README.md: -------------------------------------------------------------------------------- 1 | 17 | # DataFusion Book 18 | 19 | This folder builds a DataFusion user guide using [mdBook](https://github.com/rust-lang/mdBook). 20 | 21 | ## Build and run book locally 22 | 23 | Build the latest files with `mdbook build`. 24 | 25 | Open the book locally by running `open book/index.html`. 26 | 27 | ## Install mdBook 28 | 29 | Download the `mdbook` binary or run `cargo install mdbook`. 30 | 31 | Then manually open it, so you have permissions to run it on your Mac. 32 | 33 | Add it to your path with a command like this so you can easily run the commands: `mv ~/Downloads/mdbook /Users/matthew.powers/.local/bin`. 34 | -------------------------------------------------------------------------------- /docs/mdbook/book.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | [book] 19 | authors = ["Apache Arrow "] 20 | language = "en" 21 | multilingual = false 22 | src = "src" 23 | title = "DataFusion Book" 24 | -------------------------------------------------------------------------------- /docs/mdbook/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | 17 | # Summary 18 | 19 | - [Index](./index.md) 20 | - [Installation](./installation.md) 21 | - [Quickstart](./quickstart.md) 22 | - [Usage](./usage/index.md) 23 | - [Create a table](./usage/create-table.md) 24 | - [Query a table](./usage/query-table.md) 25 | - [Viewing Query Plans](./usage/query-plans.md) -------------------------------------------------------------------------------- /docs/mdbook/src/images/datafusion-jupyterlab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-python/0cc9b0a513e784597bc6e35f883fefa9e2d3210b/docs/mdbook/src/images/datafusion-jupyterlab.png -------------------------------------------------------------------------------- /docs/mdbook/src/index.md: -------------------------------------------------------------------------------- 1 | 17 | # DataFusion Book 18 | 19 | DataFusion is a blazing fast query engine that lets you run data analyses quickly and reliably. 20 | 21 | DataFusion is written in Rust, but also exposes Python and SQL bindings, so you can easily query data in your language of choice. You don't need to know any Rust to be a happy and productive user of DataFusion. 22 | 23 | DataFusion lets you run queries faster than pandas. Let's compare query runtimes for a 5GB CSV file with 100 million rows of data. 24 | 25 | Take a look at a few rows of the data: 26 | 27 | ``` 28 | +-------+-------+--------------+-----+-----+-------+----+----+-----------+ 29 | | id1 | id2 | id3 | id4 | id5 | id6 | v1 | v2 | v3 | 30 | +-------+-------+--------------+-----+-----+-------+----+----+-----------+ 31 | | id016 | id016 | id0000042202 | 15 | 24 | 5971 | 5 | 11 | 37.211254 | 32 | | id039 | id045 | id0000029558 | 40 | 49 | 39457 | 5 | 4 | 48.951141 | 33 | | id047 | id023 | id0000071286 | 68 | 20 | 74463 | 2 | 14 | 60.469241 | 34 | +-------+-------+--------------+-----+-----+-------+----+----+-----------+ 35 | ``` 36 | 37 | Suppose you'd like to run the following query: `SELECT id1, sum(v1) AS v1 from the_table GROUP BY id1`. 38 | 39 | If you use pandas, then this query will take 43.6 seconds to execute. 40 | 41 | It only takes DataFusion 9.8 seconds to execute the same query. 42 | 43 | DataFusion is easy to use, powerful, and fast. Let's learn more! 44 | -------------------------------------------------------------------------------- /docs/mdbook/src/installation.md: -------------------------------------------------------------------------------- 1 | 17 | # Installation 18 | 19 | DataFusion is easy to install, just like any other Python library. 20 | 21 | ## Using uv 22 | 23 | If you do not yet have a virtual environment, create one: 24 | 25 | ```bash 26 | uv venv 27 | ``` 28 | 29 | You can add datafusion to your virtual environment with the usual: 30 | 31 | ```bash 32 | uv pip install datafusion 33 | ``` 34 | 35 | Or, to add to a project: 36 | 37 | ```bash 38 | uv add datafusion 39 | ``` 40 | 41 | ## Using pip 42 | 43 | ``` bash 44 | pip install datafusion 45 | ``` 46 | 47 | ## uv & JupyterLab setup 48 | 49 | This section explains how to install DataFusion in a uv environment with other libraries that allow for a nice Jupyter workflow. 
This setup is completely optional. These steps are only needed if you'd like to run DataFusion in a Jupyter notebook and have an interface like this: 50 | 51 | ![DataFusion in Jupyter](https://github.com/MrPowers/datafusion-book/raw/main/src/images/datafusion-jupyterlab.png) 52 | 53 | Create a virtual environment with DataFusion, Jupyter, and other useful dependencies and start the desktop application. 54 | 55 | ```bash 56 | uv venv 57 | uv pip install datafusion jupyterlab jupyterlab_code_formatter 58 | uv run jupyter lab 59 | ``` 60 | 61 | ## Examples 62 | 63 | See the [DataFusion Python Examples](https://github.com/apache/arrow-datafusion-python/tree/main/examples) for a variety of Python scripts that show DataFusion in action! 64 | -------------------------------------------------------------------------------- /docs/mdbook/src/quickstart.md: -------------------------------------------------------------------------------- 1 | 17 | # DataFusion Quickstart 18 | 19 | You can easily query a DataFusion table with the Python API or with pure SQL. 20 | 21 | Let's create a small DataFrame and then run some queries with both APIs. 22 | 23 | Start by creating a DataFrame with four rows of data and two columns: `a` and `b`. 24 | 25 | ```python 26 | from datafusion import SessionContext 27 | 28 | ctx = SessionContext() 29 | 30 | df = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [4, 5, 6, 7]}, name="my_table") 31 | ``` 32 | 33 | Let's append a column to this DataFrame that adds columns `a` and `b` with the SQL API. 34 | 35 | ``` 36 | ctx.sql("select a, b, a + b as sum_a_b from my_table") 37 | 38 | +---+---+---------+ 39 | | a | b | sum_a_b | 40 | +---+---+---------+ 41 | | 1 | 4 | 5 | 42 | | 2 | 5 | 7 | 43 | | 3 | 6 | 9 | 44 | | 1 | 7 | 8 | 45 | +---+---+---------+ 46 | ``` 47 | 48 | DataFusion makes it easy to run SQL queries on DataFrames. 49 | 50 | Now let's run the same query with the DataFusion Python API: 51 | 52 | ```python 53 | from datafusion import col 54 | 55 | df.select( 56 | col("a"), 57 | col("b"), 58 | col("a") + col("b"), 59 | ) 60 | ``` 61 | 62 | We get the same result as before: 63 | 64 | ``` 65 | +---+---+-------------------------+ 66 | | a | b | my_table.a + my_table.b | 67 | +---+---+-------------------------+ 68 | | 1 | 4 | 5 | 69 | | 2 | 5 | 7 | 70 | | 3 | 6 | 9 | 71 | | 1 | 7 | 8 | 72 | +---+---+-------------------------+ 73 | ``` 74 | 75 | DataFusion also allows you to query data with a well-designed Python interface. 76 | 77 | Python users have two great ways to query DataFusion tables. 78 | -------------------------------------------------------------------------------- /docs/mdbook/src/usage/create-table.md: -------------------------------------------------------------------------------- 1 | 17 | # DataFusion Create Table 18 | 19 | It's easy to create DataFusion tables from a variety of data sources. 20 | 21 | ## Create Table from Python Dictionary 22 | 23 | Here's how to create a DataFusion table from a Python dictionary: 24 | 25 | ```python 26 | from datafusion import SessionContext 27 | 28 | ctx = SessionContext() 29 | 30 | df = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [4, 5, 6, 7]}, name="my_table") 31 | ``` 32 | 33 | Supplying the `name` parameter is optional. You only need to name the table if you'd like to query it with the SQL API. 
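For example (a minimal sketch that reuses the `my_table` registration shown above; `show()` simply prints the result), the named table can be referenced directly from the SQL API:

```python
# "my_table" was registered with a name, so the SQL engine can see it
ctx.sql("SELECT a, b FROM my_table").show()
```

If you skip the `name` parameter, the data is still available through the DataFrame (Python) API, as shown next.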
34 | 35 | You can also create a DataFrame without a name that can be queried with the Python API: 36 | 37 | ```python 38 | from datafusion import SessionContext 39 | 40 | ctx = SessionContext() 41 | 42 | df = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [4, 5, 6, 7]}) 43 | ``` 44 | 45 | ## Create Table from CSV 46 | 47 | You can read a CSV into a DataFusion DataFrame. Here's how to read the `G1_1e8_1e2_0_0.csv` file into a table named `csv_1e8`: 48 | 49 | ```python 50 | ctx.register_csv("csv_1e8", "G1_1e8_1e2_0_0.csv") 51 | ``` 52 | 53 | ## Create Table from Parquet 54 | 55 | You can read a Parquet file into a DataFusion DataFrame. Here's how to read the `yellow_tripdata_2021-01.parquet` file into a table named `taxi`: 56 | 57 | ```python 58 | ctx.register_parquet("taxi", "yellow_tripdata_2021-01.parquet") 59 | ``` 60 | -------------------------------------------------------------------------------- /docs/mdbook/src/usage/index.md: -------------------------------------------------------------------------------- 1 | 17 | # Usage 18 | 19 | This section shows how to create DataFusion DataFrames from a variety of data sources like CSV files and Parquet files. 20 | 21 | You'll learn more about the SQL statements that are supported by DataFusion. 22 | 23 | You'll also learn about DataFusion's Python API for querying data. 24 | 25 | The documentation will wrap up with a variety of real-world data processing tasks that are well suited for DataFusion. The lightning-fast speed and reliable execution make DataFusion the best technology for a variety of data processing tasks. 26 | -------------------------------------------------------------------------------- /docs/source/_static/images/2x_bgwhite_original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-python/0cc9b0a513e784597bc6e35f883fefa9e2d3210b/docs/source/_static/images/2x_bgwhite_original.png -------------------------------------------------------------------------------- /docs/source/_static/images/original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-python/0cc9b0a513e784597bc6e35f883fefa9e2d3210b/docs/source/_static/images/original.png -------------------------------------------------------------------------------- /docs/source/_static/images/original2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-python/0cc9b0a513e784597bc6e35f883fefa9e2d3210b/docs/source/_static/images/original2x.png -------------------------------------------------------------------------------- /docs/source/_templates/docs-sidebar.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 10 | 11 | 20 | -------------------------------------------------------------------------------- /docs/source/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "pydata_sphinx_theme/layout.html" %} 2 | 3 | {# Silence the navbar #} 4 | {% block docs_navbar %} 5 | {% endblock %} 6 | 7 | 10 | {% block footer %} 11 | 12 |
13 |
14 | {% for footer_item in theme_footer_items %} 15 | 18 | {% endfor %} 19 | 23 |
24 |
25 | 26 | {% endblock %} 27 | -------------------------------------------------------------------------------- /docs/source/images/jupyter_lab_df_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-python/0cc9b0a513e784597bc6e35f883fefa9e2d3210b/docs/source/images/jupyter_lab_df_view.png -------------------------------------------------------------------------------- /docs/source/user-guide/common-operations/basic-info.rst: -------------------------------------------------------------------------------- 1 | .. Licensed to the Apache Software Foundation (ASF) under one 2 | .. or more contributor license agreements. See the NOTICE file 3 | .. distributed with this work for additional information 4 | .. regarding copyright ownership. The ASF licenses this file 5 | .. to you under the Apache License, Version 2.0 (the 6 | .. "License"); you may not use this file except in compliance 7 | .. with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | .. software distributed under the License is distributed on an 13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | .. KIND, either express or implied. See the License for the 15 | .. specific language governing permissions and limitations 16 | .. under the License. 17 | 18 | Basic Operations 19 | ================ 20 | 21 | In this section, you will learn how to display essential details of DataFrames using specific functions. 22 | 23 | .. ipython:: python 24 | 25 | from datafusion import SessionContext 26 | import random 27 | 28 | ctx = SessionContext() 29 | df = ctx.from_pydict({ 30 | "nrs": [1, 2, 3, 4, 5], 31 | "names": ["python", "ruby", "java", "haskell", "go"], 32 | "random": random.sample(range(1000), 5), 33 | "groups": ["A", "A", "B", "C", "B"], 34 | }) 35 | df 36 | 37 | Use :py:func:`~datafusion.dataframe.DataFrame.limit` to view the top rows of the frame: 38 | 39 | .. ipython:: python 40 | 41 | df.limit(2) 42 | 43 | Display the columns of the DataFrame using :py:func:`~datafusion.dataframe.DataFrame.schema`: 44 | 45 | .. ipython:: python 46 | 47 | df.schema() 48 | 49 | The method :py:func:`~datafusion.dataframe.DataFrame.to_pandas` uses pyarrow to convert to pandas DataFrame, by collecting the batches, 50 | passing them to an Arrow table, and then converting them to a pandas DataFrame. 51 | 52 | .. ipython:: python 53 | 54 | df.to_pandas() 55 | 56 | :py:func:`~datafusion.dataframe.DataFrame.describe` shows a quick statistic summary of your data: 57 | 58 | .. ipython:: python 59 | 60 | df.describe() 61 | 62 | -------------------------------------------------------------------------------- /docs/source/user-guide/common-operations/index.rst: -------------------------------------------------------------------------------- 1 | .. Licensed to the Apache Software Foundation (ASF) under one 2 | .. or more contributor license agreements. See the NOTICE file 3 | .. distributed with this work for additional information 4 | .. regarding copyright ownership. The ASF licenses this file 5 | .. to you under the Apache License, Version 2.0 (the 6 | .. "License"); you may not use this file except in compliance 7 | .. with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | .. 
software distributed under the License is distributed on an 13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | .. KIND, either express or implied. See the License for the 15 | .. specific language governing permissions and limitations 16 | .. under the License. 17 | 18 | Common Operations 19 | ================= 20 | 21 | The contents of this section are designed to guide a new user through how to use DataFusion. 22 | 23 | .. toctree:: 24 | :maxdepth: 2 25 | 26 | views 27 | basic-info 28 | select-and-filter 29 | expressions 30 | joins 31 | functions 32 | aggregations 33 | windows 34 | udf-and-udfa 35 | -------------------------------------------------------------------------------- /docs/source/user-guide/common-operations/views.rst: -------------------------------------------------------------------------------- 1 | .. Licensed to the Apache Software Foundation (ASF) under one 2 | .. or more contributor license agreements. See the NOTICE file 3 | .. distributed with this work for additional information 4 | .. regarding copyright ownership. The ASF licenses this file 5 | .. to you under the Apache License, Version 2.0 (the 6 | .. "License"); you may not use this file except in compliance 7 | .. with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | .. software distributed under the License is distributed on an 13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | .. KIND, either express or implied. See the License for the 15 | .. specific language governing permissions and limitations 16 | .. under the License. 17 | 18 | ====================== 19 | Registering Views 20 | ====================== 21 | 22 | You can use the context's ``register_view`` method to register a DataFrame as a view 23 | 24 | .. code-block:: python 25 | 26 | from datafusion import SessionContext, col, literal 27 | 28 | # Create a DataFusion context 29 | ctx = SessionContext() 30 | 31 | # Create sample data 32 | data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]} 33 | 34 | # Create a DataFrame from the dictionary 35 | df = ctx.from_pydict(data, "my_table") 36 | 37 | # Filter the DataFrame (for example, keep rows where a > 2) 38 | df_filtered = df.filter(col("a") > literal(2)) 39 | 40 | # Register the dataframe as a view with the context 41 | ctx.register_view("view1", df_filtered) 42 | 43 | # Now run a SQL query against the registered view 44 | df_view = ctx.sql("SELECT * FROM view1") 45 | 46 | # Collect the results 47 | results = df_view.collect() 48 | 49 | # Convert results to a list of dictionaries for display 50 | result_dicts = [batch.to_pydict() for batch in results] 51 | 52 | print(result_dicts) 53 | 54 | This will output: 55 | 56 | .. code-block:: python 57 | 58 | [{'a': [3, 4, 5], 'b': [30, 40, 50]}] 59 | -------------------------------------------------------------------------------- /docs/source/user-guide/configuration.rst: -------------------------------------------------------------------------------- 1 | .. Licensed to the Apache Software Foundation (ASF) under one 2 | .. or more contributor license agreements. See the NOTICE file 3 | .. distributed with this work for additional information 4 | .. regarding copyright ownership. The ASF licenses this file 5 | .. to you under the Apache License, Version 2.0 (the 6 | .. "License"); you may not use this file except in compliance 7 | .. with the License. 
You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | .. software distributed under the License is distributed on an 13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | .. KIND, either express or implied. See the License for the 15 | .. specific language governing permissions and limitations 16 | .. under the License. 17 | 18 | Configuration 19 | ============= 20 | 21 | Let's look at how we can configure DataFusion. When creating a :py:class:`~datafusion.context.SessionContext`, you can pass in 22 | a :py:class:`~datafusion.context.SessionConfig` and :py:class:`~datafusion.context.RuntimeEnvBuilder` object. These two cover a wide range of options. 23 | 24 | .. code-block:: python 25 | 26 | from datafusion import RuntimeEnvBuilder, SessionConfig, SessionContext 27 | 28 | # create a session context with default settings 29 | ctx = SessionContext() 30 | print(ctx) 31 | 32 | # create a session context with explicit runtime and config settings 33 | runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000) 34 | config = ( 35 | SessionConfig() 36 | .with_create_default_catalog_and_schema(True) 37 | .with_default_catalog_and_schema("foo", "bar") 38 | .with_target_partitions(8) 39 | .with_information_schema(True) 40 | .with_repartition_joins(False) 41 | .with_repartition_aggregations(False) 42 | .with_repartition_windows(False) 43 | .with_parquet_pruning(False) 44 | .set("datafusion.execution.parquet.pushdown_filters", "true") 45 | ) 46 | ctx = SessionContext(config, runtime) 47 | print(ctx) 48 | 49 | 50 | You can read more about available :py:class:`~datafusion.context.SessionConfig` options in the `rust DataFusion Configuration guide `_, 51 | and about :code:`RuntimeEnvBuilder` options in the rust `online API documentation `_. 52 | -------------------------------------------------------------------------------- /docs/source/user-guide/io/avro.rst: -------------------------------------------------------------------------------- 1 | .. Licensed to the Apache Software Foundation (ASF) under one 2 | .. or more contributor license agreements. See the NOTICE file 3 | .. distributed with this work for additional information 4 | .. regarding copyright ownership. The ASF licenses this file 5 | .. to you under the Apache License, Version 2.0 (the 6 | .. "License"); you may not use this file except in compliance 7 | .. with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | .. software distributed under the License is distributed on an 13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | .. KIND, either express or implied. See the License for the 15 | .. specific language governing permissions and limitations 16 | .. under the License. 17 | 18 | .. _io_avro: 19 | 20 | Avro 21 | ==== 22 | 23 | `Avro `_ is a serialization format for record data. Reading an avro file is very straightforward 24 | with :py:func:`~datafusion.context.SessionContext.read_avro` 25 | 26 | .. code-block:: python 27 | 28 | 29 | from datafusion import SessionContext 30 | 31 | ctx = SessionContext() 32 | df = ctx.read_avro("file.avro") -------------------------------------------------------------------------------- /docs/source/user-guide/io/csv.rst: -------------------------------------------------------------------------------- 1 | .. 
Licensed to the Apache Software Foundation (ASF) under one 2 | .. or more contributor license agreements. See the NOTICE file 3 | .. distributed with this work for additional information 4 | .. regarding copyright ownership. The ASF licenses this file 5 | .. to you under the Apache License, Version 2.0 (the 6 | .. "License"); you may not use this file except in compliance 7 | .. with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | .. software distributed under the License is distributed on an 13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | .. KIND, either express or implied. See the License for the 15 | .. specific language governing permissions and limitations 16 | .. under the License. 17 | 18 | .. _io_csv: 19 | 20 | CSV 21 | === 22 | 23 | Reading a csv is very straightforward with :py:func:`~datafusion.context.SessionContext.read_csv` 24 | 25 | .. code-block:: python 26 | 27 | 28 | from datafusion import SessionContext 29 | 30 | ctx = SessionContext() 31 | df = ctx.read_csv("file.csv") 32 | 33 | An alternative is to use :py:func:`~datafusion.context.SessionContext.register_csv` 34 | 35 | .. code-block:: python 36 | 37 | ctx.register_csv("file", "file.csv") 38 | df = ctx.table("file") 39 | -------------------------------------------------------------------------------- /docs/source/user-guide/io/index.rst: -------------------------------------------------------------------------------- 1 | .. Licensed to the Apache Software Foundation (ASF) under one 2 | .. or more contributor license agreements. See the NOTICE file 3 | .. distributed with this work for additional information 4 | .. regarding copyright ownership. The ASF licenses this file 5 | .. to you under the Apache License, Version 2.0 (the 6 | .. "License"); you may not use this file except in compliance 7 | .. with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | .. software distributed under the License is distributed on an 13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | .. KIND, either express or implied. See the License for the 15 | .. specific language governing permissions and limitations 16 | .. under the License. 17 | 18 | IO 19 | == 20 | 21 | .. toctree:: 22 | :maxdepth: 2 23 | 24 | arrow 25 | avro 26 | csv 27 | json 28 | parquet 29 | table_provider 30 | -------------------------------------------------------------------------------- /docs/source/user-guide/io/json.rst: -------------------------------------------------------------------------------- 1 | .. Licensed to the Apache Software Foundation (ASF) under one 2 | .. or more contributor license agreements. See the NOTICE file 3 | .. distributed with this work for additional information 4 | .. regarding copyright ownership. The ASF licenses this file 5 | .. to you under the Apache License, Version 2.0 (the 6 | .. "License"); you may not use this file except in compliance 7 | .. with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | .. software distributed under the License is distributed on an 13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | .. KIND, either express or implied. See the License for the 15 | .. 
specific language governing permissions and limitations 16 | .. under the License. 17 | 18 | .. _io_json: 19 | 20 | JSON 21 | ==== 22 | `JSON `_ (JavaScript Object Notation) is a lightweight data-interchange format. 23 | When it comes to reading a JSON file, using :py:func:`~datafusion.context.SessionContext.read_json` is simple and easy: 24 | 25 | .. code-block:: python 26 | 27 | 28 | from datafusion import SessionContext 29 | 30 | ctx = SessionContext() 31 | df = ctx.read_json("file.json") 32 | -------------------------------------------------------------------------------- /docs/source/user-guide/io/parquet.rst: -------------------------------------------------------------------------------- 1 | .. Licensed to the Apache Software Foundation (ASF) under one 2 | .. or more contributor license agreements. See the NOTICE file 3 | .. distributed with this work for additional information 4 | .. regarding copyright ownership. The ASF licenses this file 5 | .. to you under the Apache License, Version 2.0 (the 6 | .. "License"); you may not use this file except in compliance 7 | .. with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | .. software distributed under the License is distributed on an 13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | .. KIND, either express or implied. See the License for the 15 | .. specific language governing permissions and limitations 16 | .. under the License. 17 | 18 | .. _io_parquet: 19 | 20 | Parquet 21 | ======= 22 | 23 | It is quite simple to read a parquet file using the :py:func:`~datafusion.context.SessionContext.read_parquet` function. 24 | 25 | .. code-block:: python 26 | 27 | from datafusion import SessionContext 28 | 29 | ctx = SessionContext() 30 | df = ctx.read_parquet("file.parquet") 31 | 32 | An alternative is to use :py:func:`~datafusion.context.SessionContext.register_parquet` 33 | 34 | .. code-block:: python 35 | 36 | ctx.register_parquet("file", "file.parquet") 37 | df = ctx.table("file") 38 | -------------------------------------------------------------------------------- /docs/source/user-guide/io/table_provider.rst: -------------------------------------------------------------------------------- 1 | .. Licensed to the Apache Software Foundation (ASF) under one 2 | .. or more contributor license agreements. See the NOTICE file 3 | .. distributed with this work for additional information 4 | .. regarding copyright ownership. The ASF licenses this file 5 | .. to you under the Apache License, Version 2.0 (the 6 | .. "License"); you may not use this file except in compliance 7 | .. with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | .. software distributed under the License is distributed on an 13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | .. KIND, either express or implied. See the License for the 15 | .. specific language governing permissions and limitations 16 | .. under the License. 17 | 18 | .. _io_custom_table_provider: 19 | 20 | Custom Table Provider 21 | ===================== 22 | 23 | If you have a custom data source that you want to integrate with DataFusion, you can do so by 24 | implementing the `TableProvider `_ 25 | interface in Rust and then exposing it in Python.
To do so, 26 | you must use DataFusion 43.0.0 or later and expose a `FFI_TableProvider `_ 27 | via `PyCapsule `_. 28 | 29 | A complete example can be found in the `examples folder `_. 30 | 31 | .. code-block:: rust 32 | 33 | #[pymethods] 34 | impl MyTableProvider { 35 | 36 | fn __datafusion_table_provider__<'py>( 37 | &self, 38 | py: Python<'py>, 39 | ) -> PyResult> { 40 | let name = CString::new("datafusion_table_provider").unwrap(); 41 | 42 | let provider = Arc::new(self.clone()) 43 | .map_err(|e| PyRuntimeError::new_err(e.to_string()))?; 44 | let provider = FFI_TableProvider::new(Arc::new(provider), false); 45 | 46 | PyCapsule::new_bound(py, provider, Some(name.clone())) 47 | } 48 | } 49 | 50 | Once you have this library available, in python you can register your table provider 51 | to the ``SessionContext``. 52 | 53 | .. code-block:: python 54 | 55 | provider = MyTableProvider() 56 | ctx.register_table_provider("my_table", provider) 57 | 58 | ctx.table("my_table").show() 59 | -------------------------------------------------------------------------------- /docs/source/user-guide/sql.rst: -------------------------------------------------------------------------------- 1 | .. Licensed to the Apache Software Foundation (ASF) under one 2 | .. or more contributor license agreements. See the NOTICE file 3 | .. distributed with this work for additional information 4 | .. regarding copyright ownership. The ASF licenses this file 5 | .. to you under the Apache License, Version 2.0 (the 6 | .. "License"); you may not use this file except in compliance 7 | .. with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | .. software distributed under the License is distributed on an 13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | .. KIND, either express or implied. See the License for the 15 | .. specific language governing permissions and limitations 16 | .. under the License. 17 | 18 | SQL 19 | === 20 | 21 | DataFusion also offers a SQL API, read the full reference `here `_ 22 | 23 | .. ipython:: python 24 | 25 | import datafusion 26 | from datafusion import col 27 | import pyarrow 28 | 29 | # create a context 30 | ctx = datafusion.SessionContext() 31 | 32 | # register a CSV 33 | ctx.register_csv('pokemon', 'pokemon.csv') 34 | 35 | # create a new statement via SQL 36 | df = ctx.sql('SELECT "Attack"+"Defense", "Attack"-"Defense" FROM pokemon') 37 | 38 | # collect and convert to pandas DataFrame 39 | df.to_pandas() -------------------------------------------------------------------------------- /examples/chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-python/0cc9b0a513e784597bc6e35f883fefa9e2d3210b/examples/chart.png -------------------------------------------------------------------------------- /examples/create-context.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | from datafusion import RuntimeEnvBuilder, SessionConfig, SessionContext 19 | 20 | # create a session context with default settings 21 | ctx = SessionContext() 22 | print(ctx) 23 | 24 | # create a session context with explicit runtime and config settings 25 | runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000) 26 | config = ( 27 | SessionConfig() 28 | .with_create_default_catalog_and_schema(enabled=True) 29 | .with_default_catalog_and_schema("foo", "bar") 30 | .with_target_partitions(8) 31 | .with_information_schema(enabled=True) 32 | .with_repartition_joins(enabled=False) 33 | .with_repartition_aggregations(enabled=False) 34 | .with_repartition_windows(enabled=False) 35 | .with_parquet_pruning(enabled=False) 36 | .set("datafusion.execution.parquet.pushdown_filters", "true") 37 | ) 38 | ctx = SessionContext(config, runtime) 39 | print(ctx) 40 | 41 | ctx = ctx.enable_url_table() 42 | print(ctx) 43 | -------------------------------------------------------------------------------- /examples/dataframe-parquet.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | from datafusion import SessionContext 19 | from datafusion import functions as f 20 | 21 | ctx = SessionContext() 22 | df = ctx.read_parquet("yellow_tripdata_2021-01.parquet").aggregate( 23 | [f.col("passenger_count")], [f.count_star()] 24 | ) 25 | df.show() 26 | -------------------------------------------------------------------------------- /examples/datafusion-ffi-example/.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.x86_64-apple-darwin] 2 | rustflags = [ 3 | "-C", "link-arg=-undefined", 4 | "-C", "link-arg=dynamic_lookup", 5 | ] 6 | 7 | [target.aarch64-apple-darwin] 8 | rustflags = [ 9 | "-C", "link-arg=-undefined", 10 | "-C", "link-arg=dynamic_lookup", 11 | ] 12 | 13 | -------------------------------------------------------------------------------- /examples/datafusion-ffi-example/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. 
See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | [package] 19 | name = "ffi-table-provider" 20 | version = "0.1.0" 21 | edition = "2021" 22 | 23 | [dependencies] 24 | datafusion = { version = "47.0.0" } 25 | datafusion-ffi = { version = "47.0.0" } 26 | pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] } 27 | arrow = { version = "55.0.0" } 28 | arrow-array = { version = "55.0.0" } 29 | arrow-schema = { version = "55.0.0" } 30 | 31 | [build-dependencies] 32 | pyo3-build-config = "0.23" 33 | 34 | [lib] 35 | name = "datafusion_ffi_example" 36 | crate-type = ["cdylib", "rlib"] 37 | -------------------------------------------------------------------------------- /examples/datafusion-ffi-example/build.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | fn main() { 19 | pyo3_build_config::add_extension_module_link_args(); 20 | } 21 | -------------------------------------------------------------------------------- /examples/datafusion-ffi-example/pyproject.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | [build-system] 19 | requires = ["maturin>=1.6,<2.0"] 20 | build-backend = "maturin" 21 | 22 | [project] 23 | name = "datafusion_ffi_example" 24 | requires-python = ">=3.9" 25 | classifiers = [ 26 | "Programming Language :: Rust", 27 | "Programming Language :: Python :: Implementation :: CPython", 28 | "Programming Language :: Python :: Implementation :: PyPy", 29 | ] 30 | dynamic = ["version"] 31 | 32 | [tool.maturin] 33 | features = ["pyo3/extension-module"] 34 | -------------------------------------------------------------------------------- /examples/datafusion-ffi-example/python/tests/_test_table_provider.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | from __future__ import annotations 19 | 20 | import pyarrow as pa 21 | from datafusion import SessionContext 22 | from datafusion_ffi_example import MyTableProvider 23 | 24 | 25 | def test_table_loading(): 26 | ctx = SessionContext() 27 | table = MyTableProvider(3, 2, 4) 28 | ctx.register_table_provider("t", table) 29 | result = ctx.table("t").collect() 30 | 31 | assert len(result) == 4 32 | assert result[0].num_columns == 3 33 | 34 | result = [r.column(0) for r in result] 35 | expected = [ 36 | pa.array([0, 1], type=pa.int32()), 37 | pa.array([2, 3, 4], type=pa.int32()), 38 | pa.array([4, 5, 6, 7], type=pa.int32()), 39 | pa.array([6, 7, 8, 9, 10], type=pa.int32()), 40 | ] 41 | 42 | assert result == expected 43 | -------------------------------------------------------------------------------- /examples/datafusion-ffi-example/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | use crate::table_function::MyTableFunction; 19 | use crate::table_provider::MyTableProvider; 20 | use pyo3::prelude::*; 21 | 22 | pub(crate) mod table_function; 23 | pub(crate) mod table_provider; 24 | 25 | #[pymodule] 26 | fn datafusion_ffi_example(m: &Bound<'_, PyModule>) -> PyResult<()> { 27 | m.add_class::<MyTableProvider>()?; 28 | m.add_class::<MyTableFunction>()?; 29 | Ok(()) 30 | } 31 | -------------------------------------------------------------------------------- /examples/datafusion-ffi-example/src/table_function.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use crate::table_provider::MyTableProvider; 19 | use datafusion::catalog::{TableFunctionImpl, TableProvider}; 20 | use datafusion::error::Result as DataFusionResult; 21 | use datafusion::prelude::Expr; 22 | use datafusion_ffi::udtf::FFI_TableFunction; 23 | use pyo3::types::PyCapsule; 24 | use pyo3::{pyclass, pymethods, Bound, PyResult, Python}; 25 | use std::sync::Arc; 26 | 27 | #[pyclass(name = "MyTableFunction", module = "datafusion_ffi_example", subclass)] 28 | #[derive(Debug, Clone)] 29 | pub(crate) struct MyTableFunction {} 30 | 31 | #[pymethods] 32 | impl MyTableFunction { 33 | #[new] 34 | fn new() -> Self { 35 | Self {} 36 | } 37 | 38 | fn __datafusion_table_function__<'py>( 39 | &self, 40 | py: Python<'py>, 41 | ) -> PyResult<Bound<'py, PyCapsule>> { 42 | let name = cr"datafusion_table_function".into(); 43 | 44 | let func = self.clone(); 45 | let provider = FFI_TableFunction::new(Arc::new(func), None); 46 | 47 | PyCapsule::new(py, provider, Some(name)) 48 | } 49 | } 50 | 51 | impl TableFunctionImpl for MyTableFunction { 52 | fn call(&self, _args: &[Expr]) -> DataFusionResult<Arc<dyn TableProvider>> { 53 | let provider = MyTableProvider::new(4, 3, 2).create_table()?; 54 | Ok(Arc::new(provider)) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /examples/export.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License.
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import datafusion 19 | 20 | # create a context 21 | ctx = datafusion.SessionContext() 22 | 23 | # create a new datafusion DataFrame 24 | df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]}) 25 | # Dataframe: 26 | # +---+---+ 27 | # | a | b | 28 | # +---+---+ 29 | # | 1 | 4 | 30 | # | 2 | 5 | 31 | # | 3 | 6 | 32 | # +---+---+ 33 | 34 | # export to pandas dataframe 35 | pandas_df = df.to_pandas() 36 | assert pandas_df.shape == (3, 2) 37 | 38 | # export to PyArrow table 39 | arrow_table = df.to_arrow_table() 40 | assert arrow_table.shape == (3, 2) 41 | 42 | # export to Polars dataframe 43 | polars_df = df.to_polars() 44 | assert polars_df.shape == (3, 2) 45 | 46 | # export to Python list of rows 47 | pylist = df.to_pylist() 48 | assert pylist == [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}] 49 | 50 | # export to Python dictionary of columns 51 | pydict = df.to_pydict() 52 | assert pydict == {"a": [1, 2, 3], "b": [4, 5, 6]} 53 | -------------------------------------------------------------------------------- /examples/import.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
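# The export paths above (export.py) and the import paths in this file are
# complementary and round-trip cleanly; a minimal sketch using only APIs shown
# in these two examples (the leading-underscore names are illustrative):

import datafusion

_ctx = datafusion.SessionContext()
_df = _ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})

# DataFrame -> Arrow table -> DataFrame should preserve schema and values
_roundtrip = _ctx.from_arrow(_df.to_arrow_table())
assert _roundtrip.to_pydict() == {"a": [1, 2, 3], "b": [4, 5, 6]}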
17 | 18 | import datafusion 19 | import pandas as pd 20 | import polars as pl 21 | import pyarrow as pa 22 | 23 | # Create a context 24 | ctx = datafusion.SessionContext() 25 | 26 | # Create a datafusion DataFrame from a Python dictionary 27 | # The dictionary keys represent column names and the dictionary values 28 | # represent column values 29 | df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]}) 30 | assert type(df) is datafusion.DataFrame 31 | # Dataframe: 32 | # +---+---+ 33 | # | a | b | 34 | # +---+---+ 35 | # | 1 | 4 | 36 | # | 2 | 5 | 37 | # | 3 | 6 | 38 | # +---+---+ 39 | 40 | # Create a datafusion DataFrame from a Python list of rows 41 | df = ctx.from_pylist([{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}]) 42 | assert type(df) is datafusion.DataFrame 43 | 44 | # Convert pandas DataFrame to datafusion DataFrame 45 | pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) 46 | df = ctx.from_pandas(pandas_df) 47 | assert type(df) is datafusion.DataFrame 48 | 49 | # Convert polars DataFrame to datafusion DataFrame 50 | polars_df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) 51 | df = ctx.from_polars(polars_df) 52 | assert type(df) is datafusion.DataFrame 53 | 54 | # Convert Arrow Table to datafusion DataFrame 55 | arrow_table = pa.Table.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]}) 56 | df = ctx.from_arrow(arrow_table) 57 | assert type(df) is datafusion.DataFrame 58 | -------------------------------------------------------------------------------- /examples/python-udaf.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import datafusion 19 | import pyarrow as pa 20 | import pyarrow.compute 21 | from datafusion import Accumulator, col, udaf 22 | 23 | 24 | class MyAccumulator(Accumulator): 25 | """ 26 | Interface of a user-defined accumulation. 27 | """ 28 | 29 | def __init__(self) -> None: 30 | self._sum = pa.scalar(0.0) 31 | 32 | def update(self, values: pa.Array) -> None: 33 | # not nice since pyarrow scalars can't be summed yet. This breaks on `None` 34 | self._sum = pa.scalar(self._sum.as_py() + pa.compute.sum(values).as_py()) 35 | 36 | def merge(self, states: pa.Array) -> None: 37 | # not nice since pyarrow scalars can't be summed yet. 
This breaks on `None` 38 | self._sum = pa.scalar(self._sum.as_py() + pa.compute.sum(states).as_py()) 39 | 40 | def state(self) -> pa.Array: 41 | return pa.array([self._sum.as_py()]) 42 | 43 | def evaluate(self) -> pa.Scalar: 44 | return self._sum 45 | 46 | 47 | # create a context 48 | ctx = datafusion.SessionContext() 49 | 50 | # create a RecordBatch and a new DataFrame from it 51 | batch = pa.RecordBatch.from_arrays( 52 | [pa.array([1, 2, 3]), pa.array([4, 5, 6])], 53 | names=["a", "b"], 54 | ) 55 | df = ctx.create_dataframe([[batch]]) 56 | 57 | my_udaf = udaf( 58 | MyAccumulator, 59 | pa.float64(), 60 | pa.float64(), 61 | [pa.float64()], 62 | "stable", 63 | ) 64 | 65 | df = df.aggregate([], [my_udaf(col("a"))]) 66 | 67 | result = df.collect()[0] 68 | 69 | assert result.column(0) == pa.array([6.0]) 70 | -------------------------------------------------------------------------------- /examples/python-udf.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import pyarrow as pa 19 | from datafusion import SessionContext, udf 20 | from datafusion import functions as f 21 | 22 | 23 | def is_null(array: pa.Array) -> pa.Array: 24 | return array.is_null() 25 | 26 | 27 | is_null_arr = udf(is_null, [pa.int64()], pa.bool_(), "stable") 28 | 29 | # create a context 30 | ctx = SessionContext() 31 | 32 | # create a RecordBatch and a new DataFrame from it 33 | batch = pa.RecordBatch.from_arrays( 34 | [pa.array([1, 2, 3]), pa.array([4, 5, 6])], 35 | names=["a", "b"], 36 | ) 37 | df = ctx.create_dataframe([[batch]]) 38 | 39 | df = df.select(is_null_arr(f.col("a"))) 40 | 41 | result = df.collect()[0] 42 | 43 | assert result.column(0) == pa.array([False] * 3) 44 | -------------------------------------------------------------------------------- /examples/query-pyarrow-data.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. 
See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import datafusion 19 | import pyarrow as pa 20 | from datafusion import col 21 | 22 | # create a context 23 | ctx = datafusion.SessionContext() 24 | 25 | # create a RecordBatch and a new DataFrame from it 26 | batch = pa.RecordBatch.from_arrays( 27 | [pa.array([1, 2, 3]), pa.array([4, 5, 6])], 28 | names=["a", "b"], 29 | ) 30 | df = ctx.create_dataframe([[batch]]) 31 | 32 | # create a new statement 33 | df = df.select( 34 | col("a") + col("b"), 35 | col("a") - col("b"), 36 | ) 37 | 38 | # execute and collect the first (and only) batch 39 | result = df.collect()[0] 40 | 41 | assert result.column(0) == pa.array([5, 7, 9]) 42 | assert result.column(1) == pa.array([-3, -3, -3]) 43 | -------------------------------------------------------------------------------- /examples/sql-parquet-s3.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import os 19 | 20 | import datafusion 21 | from datafusion.object_store import AmazonS3 22 | 23 | region = "us-east-1" 24 | bucket_name = "yellow-trips" 25 | 26 | s3 = AmazonS3( 27 | bucket_name=bucket_name, 28 | region=region, 29 | access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), 30 | secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), 31 | ) 32 | 33 | ctx = datafusion.SessionContext() 34 | path = f"s3://{bucket_name}/" 35 | ctx.register_object_store("s3://", s3, None) 36 | 37 | ctx.register_parquet("trips", path) 38 | 39 | df = ctx.sql("select count(passenger_count) from trips") 40 | df.show() 41 | -------------------------------------------------------------------------------- /examples/sql-parquet.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
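# The SQL aggregation used in this file also has a DataFrame-API spelling. A
# rough sketch (it assumes the same local parquet file and that `count` is
# available in `datafusion.functions`; the null filter from the SQL is omitted
# here, and the SQL form below is what this example actually runs):

from datafusion import SessionContext, col
from datafusion import functions as f

_ctx = SessionContext()
_ctx.register_parquet("taxi", "yellow_tripdata_2021-01.parquet")

# Group by passenger_count and count rows, mirroring the SQL statement below.
_counts = _ctx.table("taxi").aggregate(
    [col("passenger_count")], [f.count(col("passenger_count"))]
)
_counts.show()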
17 | 18 | from datafusion import SessionContext 19 | 20 | ctx = SessionContext() 21 | ctx.register_parquet("taxi", "yellow_tripdata_2021-01.parquet") 22 | df = ctx.sql( 23 | "select passenger_count, count(*) from taxi where passenger_count is not null group by passenger_count order by passenger_count" 24 | ) 25 | df.show() 26 | -------------------------------------------------------------------------------- /examples/sql-to-pandas.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | from datafusion import SessionContext 19 | 20 | # Create a DataFusion context 21 | ctx = SessionContext() 22 | 23 | # Register table with context 24 | ctx.register_parquet("taxi", "yellow_tripdata_2021-01.parquet") 25 | 26 | # Execute SQL 27 | df = ctx.sql( 28 | "select passenger_count, count(*) " 29 | "from taxi " 30 | "where passenger_count is not null " 31 | "group by passenger_count " 32 | "order by passenger_count" 33 | ) 34 | 35 | # convert to Pandas 36 | pandas_df = df.to_pandas() 37 | 38 | # create a chart 39 | fig = pandas_df.plot( 40 | kind="bar", title="Trip Count by Number of Passengers" 41 | ).get_figure() 42 | fig.savefig("chart.png") 43 | -------------------------------------------------------------------------------- /examples/sql-using-python-udf.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
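# The pandas/matplotlib step in sql-to-pandas.py is optional; the same query
# result can be pulled into plain Python structures through the export helpers
# shown in export.py. A minimal sketch (the column alias and variable names are
# illustrative):

from datafusion import SessionContext

_ctx = SessionContext()
_ctx.register_parquet("taxi", "yellow_tripdata_2021-01.parquet")
_df = _ctx.sql(
    "select passenger_count, count(*) as trip_count "
    "from taxi where passenger_count is not null "
    "group by passenger_count order by passenger_count"
)

_columns = _df.to_pydict()  # {"passenger_count": [...], "trip_count": [...]}
_rows = _df.to_pylist()     # [{"passenger_count": ..., "trip_count": ...}, ...]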
17 | 18 | import pyarrow as pa 19 | from datafusion import SessionContext, udf 20 | 21 | 22 | # Define a user-defined function (UDF) 23 | def is_null(array: pa.Array) -> pa.Array: 24 | return array.is_null() 25 | 26 | 27 | is_null_arr = udf( 28 | is_null, 29 | [pa.int64()], 30 | pa.bool_(), 31 | "stable", 32 | # This will be the name of the UDF in SQL 33 | # If not specified it will by default the same as Python function name 34 | name="is_null", 35 | ) 36 | 37 | # Create a context 38 | ctx = SessionContext() 39 | 40 | # Create a datafusion DataFrame from a Python dictionary 41 | ctx.from_pydict({"a": [1, 2, 3], "b": [4, None, 6]}, name="t") 42 | # Dataframe: 43 | # +---+---+ 44 | # | a | b | 45 | # +---+---+ 46 | # | 1 | 4 | 47 | # | 2 | | 48 | # | 3 | 6 | 49 | # +---+---+ 50 | 51 | # Register UDF for use in SQL 52 | ctx.register_udf(is_null_arr) 53 | 54 | # Query the DataFrame using SQL 55 | result_df = ctx.sql("select a, is_null(b) as b_is_null from t") 56 | # Dataframe: 57 | # +---+-----------+ 58 | # | a | b_is_null | 59 | # +---+-----------+ 60 | # | 1 | false | 61 | # | 2 | true | 62 | # | 3 | false | 63 | # +---+-----------+ 64 | assert result_df.to_pydict()["b_is_null"] == [False, True, False] 65 | -------------------------------------------------------------------------------- /examples/substrait.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | from datafusion import SessionContext 19 | from datafusion import substrait as ss 20 | 21 | # Create a DataFusion context 22 | ctx = SessionContext() 23 | 24 | # Register table with context 25 | ctx.register_csv("aggregate_test_data", "./testing/data/csv/aggregate_test_100.csv") 26 | 27 | substrait_plan = ss.Serde.serialize_to_plan("SELECT * FROM aggregate_test_data", ctx) 28 | # type(substrait_plan) -> 29 | 30 | # Encode it to bytes 31 | substrait_bytes = substrait_plan.encode() 32 | # type(substrait_bytes) -> , at this point the bytes can be distributed to file, network, etc safely 33 | # where they could subsequently be deserialized on the receiving end. 34 | 35 | # Alternative serialization approaches 36 | # type(substrait_bytes) -> , at this point the bytes can be distributed to file, network, etc safely 37 | # where they could subsequently be deserialized on the receiving end. 38 | substrait_bytes = ss.Serde.serialize_bytes("SELECT * FROM aggregate_test_data", ctx) 39 | 40 | # Imagine here bytes would be read from network, file, etc ... 
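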
for brevity in this example the read is omitted and the variable is simply reused 41 | # type(substrait_plan) -> 42 | substrait_plan = ss.Serde.deserialize_bytes(substrait_bytes) 43 | 44 | # type(df_logical_plan) -> 45 | df_logical_plan = ss.Consumer.from_substrait_plan(ctx, substrait_plan) 46 | 47 | # Back to Substrait Plan just for demonstration purposes 48 | # type(substrait_plan) -> 49 | substrait_plan = ss.Producer.to_substrait_plan(df_logical_plan, ctx) 50 | -------------------------------------------------------------------------------- /examples/tpch/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | 3 | -------------------------------------------------------------------------------- /examples/tpch/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # DataFusion Python Examples for TPC-H 21 | 22 | These examples reproduce the problems defined in the TPC-H benchmark published 23 | by the Transaction Processing Performance Council (TPC). They are intended to 24 | demonstrate how to use different aspects of DataFusion rather than to show the 25 | most performant queries possible. Within each example is a description of the 26 | problem. Users who are familiar with SQL can compare the approaches in these 27 | examples with the queries listed in the specification. 28 | 29 | - https://www.tpc.org/tpch/ 30 | 31 | The examples provided are based on version 2.18.0 of the TPC-H specification. 32 | 33 | ## Data Setup 34 | 35 | To run these examples, you must first generate a dataset. The `dbgen` tool 36 | provided by TPC can create datasets of arbitrary scale. For testing, it is 37 | typically sufficient to create a 1 gigabyte dataset. For convenience, this 38 | repository has a script which uses Docker to create this dataset. From the 39 | `benchmarks/tpch` directory, execute the following script. 40 | 41 | ```bash 42 | ./tpch-gen.sh 1 43 | ``` 44 | 45 | The examples provided use parquet files for the tables generated by `dbgen`. 46 | A Python script is provided to convert the text files from `dbgen` into the 47 | parquet files expected by the examples. From the `examples/tpch` directory, you 48 | can execute the following command to create the necessary parquet files. 49 | 50 | ```bash 51 | python convert_data_to_parquet.py 52 | ``` 53 | 54 | ## Description of Examples 55 | 56 | A description of the techniques demonstrated in each file is available in the 57 | README.md file in the `examples` directory. 58 | -------------------------------------------------------------------------------- /examples/tpch/util.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied.
See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | """ 19 | Common utilities for running TPC-H examples. 20 | """ 21 | 22 | import os 23 | 24 | 25 | def get_data_path(filename: str) -> str: 26 | path = os.path.dirname(os.path.abspath(__file__)) 27 | 28 | return os.path.join(path, "data", filename) 29 | 30 | 31 | def get_answer_file(answer_file: str) -> str: 32 | path = os.path.dirname(os.path.abspath(__file__)) 33 | 34 | return os.path.join( 35 | path, "../../benchmarks/tpch/data/answers", f"{answer_file}.out" 36 | ) 37 | -------------------------------------------------------------------------------- /python/datafusion/col.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | """Col class.""" 19 | 20 | from datafusion.expr import Expr 21 | 22 | 23 | class Col: 24 | """Create a column expression. 25 | 26 | This helper class allows an extra syntax of creating columns using the __getattr__ 27 | method. 28 | """ 29 | 30 | def __call__(self, value: str) -> Expr: 31 | """Create a column expression.""" 32 | return Expr.column(value) 33 | 34 | def __getattr__(self, value: str) -> Expr: 35 | """Create a column using attribute syntax.""" 36 | # For autocomplete to work with IPython 37 | if value.startswith("__wrapped__"): 38 | return getattr(type(self), value) 39 | 40 | return Expr.column(value) 41 | 42 | 43 | col: Col = Col() 44 | column: Col = Col() 45 | __all__ = ["col", "column"] 46 | -------------------------------------------------------------------------------- /python/datafusion/common.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
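# The Col helper defined in col.py above gives two equivalent spellings for
# building column expressions; a short illustration (not executed anywhere):
#
#     from datafusion import SessionContext, col
#
#     ctx = SessionContext()
#     df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
#     df.select(col("a"))   # call syntax via Col.__call__
#     df.select(col.a)      # attribute syntax via Col.__getattr__
#
# Both forms return the same Expr, so they can be mixed freely in select() and
# filter() calls.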
17 | """Common data types used throughout the DataFusion project.""" 18 | 19 | from enum import Enum 20 | 21 | from ._internal import common as common_internal 22 | 23 | # TODO: these should all have proper wrapper classes 24 | 25 | DFSchema = common_internal.DFSchema 26 | DataType = common_internal.DataType 27 | DataTypeMap = common_internal.DataTypeMap 28 | PythonType = common_internal.PythonType 29 | RexType = common_internal.RexType 30 | SqlFunction = common_internal.SqlFunction 31 | SqlSchema = common_internal.SqlSchema 32 | SqlStatistics = common_internal.SqlStatistics 33 | SqlTable = common_internal.SqlTable 34 | SqlType = common_internal.SqlType 35 | SqlView = common_internal.SqlView 36 | TableType = common_internal.TableType 37 | TableSource = common_internal.TableSource 38 | Constraints = common_internal.Constraints 39 | 40 | __all__ = [ 41 | "Constraints", 42 | "DFSchema", 43 | "DataType", 44 | "DataTypeMap", 45 | "NullTreatment", 46 | "PythonType", 47 | "RexType", 48 | "SqlFunction", 49 | "SqlSchema", 50 | "SqlStatistics", 51 | "SqlTable", 52 | "SqlType", 53 | "SqlView", 54 | "TableSource", 55 | "TableType", 56 | ] 57 | 58 | 59 | class NullTreatment(Enum): 60 | """Describe how null values are to be treated by functions. 61 | 62 | This is used primarily by aggregate and window functions. It can be set on 63 | these functions using the builder approach described in 64 | ref:`_window_functions` and ref:`_aggregation` in the online documentation. 65 | 66 | """ 67 | 68 | RESPECT_NULLS = common_internal.NullTreatment.RESPECT_NULLS 69 | IGNORE_NULLS = common_internal.NullTreatment.IGNORE_NULLS 70 | -------------------------------------------------------------------------------- /python/datafusion/input/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | """This package provides for input sources. 19 | 20 | The primary class used within DataFusion is ``LocationInputPlugin``. 21 | """ 22 | 23 | from .location import LocationInputPlugin 24 | 25 | __all__ = [ 26 | "LocationInputPlugin", 27 | ] 28 | -------------------------------------------------------------------------------- /python/datafusion/input/base.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | """This module provides ``BaseInputSource``. 19 | 20 | A user can extend this to provide a custom input source. 21 | """ 22 | 23 | from abc import ABC, abstractmethod 24 | from typing import Any 25 | 26 | from datafusion.common import SqlTable 27 | 28 | 29 | class BaseInputSource(ABC): 30 | """Base Input Source class. 31 | 32 | If a consuming library would like to provide its own InputSource, this is 33 | the class it should extend. 34 | 35 | Once implemented, the plugin InputSource can be registered with the 36 | SessionContext so that it is used to obtain the SqlTable information 37 | from the custom data source. 38 | """ 39 | 40 | @abstractmethod 41 | def is_correct_input(self, input_item: Any, table_name: str, **kwargs: Any) -> bool: 42 | """Return `True` if the input is valid.""" 43 | 44 | @abstractmethod 45 | def build_table(self, input_item: Any, table_name: str, **kwargs: Any) -> SqlTable: # type: ignore[invalid-type-form] 46 | """Create a table from the input source.""" 47 | -------------------------------------------------------------------------------- /python/datafusion/object_store.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | """Object store functionality.""" 18 | 19 | from ._internal import object_store 20 | 21 | AmazonS3 = object_store.AmazonS3 22 | GoogleCloud = object_store.GoogleCloud 23 | LocalFileSystem = object_store.LocalFileSystem 24 | MicrosoftAzure = object_store.MicrosoftAzure 25 | Http = object_store.Http 26 | 27 | __all__ = ["AmazonS3", "GoogleCloud", "Http", "LocalFileSystem", "MicrosoftAzure"] 28 | -------------------------------------------------------------------------------- /python/datafusion/py.typed: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License.
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. -------------------------------------------------------------------------------- /python/datafusion/udf.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | """Deprecated module for user defined functions.""" 19 | 20 | import warnings 21 | 22 | from datafusion.user_defined import * # noqa: F403 23 | 24 | warnings.warn( 25 | "The module 'udf' is deprecated and will be removed in the next release. " 26 | "Please use 'user_defined' instead.", 27 | DeprecationWarning, 28 | stacklevel=2, 29 | ) 30 | -------------------------------------------------------------------------------- /python/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | -------------------------------------------------------------------------------- /python/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import pyarrow as pa 19 | import pytest 20 | from datafusion import SessionContext 21 | from pyarrow.csv import write_csv 22 | 23 | 24 | @pytest.fixture 25 | def ctx(): 26 | return SessionContext() 27 | 28 | 29 | @pytest.fixture 30 | def database(ctx, tmp_path): 31 | path = tmp_path / "test.csv" 32 | 33 | table = pa.Table.from_arrays( 34 | [ 35 | [1, 2, 3, 4], 36 | ["a", "b", "c", "d"], 37 | [1.1, 2.2, 3.3, 4.4], 38 | ], 39 | names=["int", "str", "float"], 40 | ) 41 | write_csv(table, path) 42 | 43 | ctx.register_csv("csv", path) 44 | ctx.register_csv("csv1", str(path)) 45 | ctx.register_csv( 46 | "csv2", 47 | path, 48 | has_header=True, 49 | delimiter=",", 50 | schema_infer_max_records=10, 51 | ) 52 | -------------------------------------------------------------------------------- /python/tests/data_test_context/data.json: -------------------------------------------------------------------------------- 1 | {"A": "a", "B": 1} 2 | {"A": "b", "B": 2} 3 | {"A": "c", "B": 3} 4 | -------------------------------------------------------------------------------- /python/tests/test_catalog.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import pyarrow as pa 19 | import pytest 20 | 21 | 22 | # Note we take in `database` as a variable even though we don't use 23 | # it because that will cause the fixture to set up the context with 24 | # the tables we need. 
25 | def test_basic(ctx, database): 26 | with pytest.raises(KeyError): 27 | ctx.catalog("non-existent") 28 | 29 | default = ctx.catalog() 30 | assert default.names() == ["public"] 31 | 32 | for db in [default.database("public"), default.database()]: 33 | assert db.names() == {"csv1", "csv", "csv2"} 34 | 35 | table = db.table("csv") 36 | assert table.kind == "physical" 37 | assert table.schema == pa.schema( 38 | [ 39 | pa.field("int", pa.int64(), nullable=True), 40 | pa.field("str", pa.string(), nullable=True), 41 | pa.field("float", pa.float64(), nullable=True), 42 | ] 43 | ) 44 | -------------------------------------------------------------------------------- /python/tests/test_config.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import pytest 19 | from datafusion import Config 20 | 21 | 22 | @pytest.fixture 23 | def config(): 24 | return Config() 25 | 26 | 27 | def test_get_then_set(config): 28 | config_key = "datafusion.optimizer.filter_null_join_keys" 29 | 30 | assert config.get(config_key) == "false" 31 | 32 | config.set(config_key, "true") 33 | assert config.get(config_key) == "true" 34 | 35 | 36 | def test_get_all(config): 37 | config_dict = config.get_all() 38 | assert config_dict["datafusion.catalog.create_default_catalog_and_schema"] == "true" 39 | 40 | 41 | def test_get_invalid_config(config): 42 | assert config.get("not.valid.key") is None 43 | -------------------------------------------------------------------------------- /python/tests/test_indexing.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
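# The indexing tests in this file rely on DataFrame.__getitem__ selecting
# columns by name; a small sketch of the expected behaviour, written against
# the same toy data as the fixture below (the assertions are an assumption
# about the result shape, mirroring what the tests exercise):

from datafusion import SessionContext

_ctx = SessionContext()
_df = _ctx.from_pydict({"a": [1, 2, 3], "b": [4, 4, 6]})

# a single string keeps one column; a tuple or list keeps several
assert _df["a"].to_pydict() == {"a": [1, 2, 3]}
assert _df["a", "b"].to_pydict() == {"a": [1, 2, 3], "b": [4, 4, 6]}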
17 | 18 | import pyarrow as pa 19 | import pytest 20 | from datafusion import SessionContext 21 | 22 | 23 | @pytest.fixture 24 | def df(): 25 | ctx = SessionContext() 26 | 27 | # create a RecordBatch and a new DataFrame from it 28 | batch = pa.RecordBatch.from_arrays( 29 | [pa.array([1, 2, 3]), pa.array([4, 4, 6])], 30 | names=["a", "b"], 31 | ) 32 | return ctx.create_dataframe([[batch]]) 33 | 34 | 35 | def test_indexing(df): 36 | assert df["a"] is not None 37 | assert df["a", "b"] is not None 38 | assert df[("a", "b")] is not None 39 | assert df[["a"]] is not None 40 | 41 | 42 | def test_err(df): 43 | with pytest.raises(Exception) as e_info: 44 | df["c"] 45 | 46 | for e in ["SchemaError", "FieldNotFound", 'name: "c"']: 47 | assert e in e_info.value.args[0] 48 | 49 | with pytest.raises(Exception) as e_info: 50 | df[1] 51 | 52 | assert ( 53 | "DataFrame can only be indexed by string index or indices" 54 | in e_info.value.args[0] 55 | ) 56 | -------------------------------------------------------------------------------- /python/tests/test_input.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import pathlib 19 | 20 | from datafusion.input.location import LocationInputPlugin 21 | 22 | 23 | def test_location_input(): 24 | location_input = LocationInputPlugin() 25 | 26 | cwd = pathlib.Path.cwd() 27 | input_file = cwd / "testing/data/parquet/generated_simple_numerics/blogs.parquet" 28 | table_name = "blog" 29 | tbl = location_input.build_table(str(input_file), table_name) 30 | assert tbl.name == "blog" 31 | assert len(tbl.columns) == 3 32 | assert "blogs.parquet" in tbl.filepaths[0] 33 | -------------------------------------------------------------------------------- /python/tests/test_plans.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
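# The LocationInputPlugin exercised in test_input.py above is also a convenient
# building block for the BaseInputSource interface from datafusion.input.base:
# a custom source can do its own validation and delegate table building to the
# built-in plugin. A sketch (the class name and the .parquet-only rule are
# illustrative, not part of the library):

from typing import Any

from datafusion.common import SqlTable
from datafusion.input.base import BaseInputSource
from datafusion.input.location import LocationInputPlugin


class ParquetOnlyInput(BaseInputSource):
    """Accept only .parquet paths and delegate the work to LocationInputPlugin."""

    def __init__(self) -> None:
        self._inner = LocationInputPlugin()

    def is_correct_input(self, input_item: Any, table_name: str, **kwargs: Any) -> bool:
        return isinstance(input_item, str) and input_item.endswith(".parquet")

    def build_table(self, input_item: Any, table_name: str, **kwargs: Any) -> SqlTable:
        return self._inner.build_table(input_item, table_name, **kwargs)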
17 | 18 | import pytest 19 | from datafusion import ExecutionPlan, LogicalPlan, SessionContext 20 | 21 | 22 | # Note: We must use CSV because memory tables are currently not supported for 23 | # conversion to/from protobuf. 24 | @pytest.fixture 25 | def df(): 26 | ctx = SessionContext() 27 | return ctx.read_csv(path="testing/data/csv/aggregate_test_100.csv").select("c1") 28 | 29 | 30 | def test_logical_plan_to_proto(ctx, df) -> None: 31 | logical_plan_bytes = df.logical_plan().to_proto() 32 | logical_plan = LogicalPlan.from_proto(ctx, logical_plan_bytes) 33 | 34 | df_round_trip = ctx.create_dataframe_from_logical_plan(logical_plan) 35 | 36 | assert df.collect() == df_round_trip.collect() 37 | 38 | original_execution_plan = df.execution_plan() 39 | execution_plan_bytes = original_execution_plan.to_proto() 40 | execution_plan = ExecutionPlan.from_proto(ctx, execution_plan_bytes) 41 | 42 | assert str(original_execution_plan) == str(execution_plan) 43 | -------------------------------------------------------------------------------- /python/tests/test_store.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | from pathlib import Path 19 | 20 | import pytest 21 | from datafusion import SessionContext 22 | 23 | 24 | @pytest.fixture 25 | def ctx(): 26 | return SessionContext() 27 | 28 | 29 | def test_read_parquet(ctx): 30 | ctx.register_parquet( 31 | "test", 32 | f"file://{Path.cwd()}/parquet/data/alltypes_plain.parquet", 33 | table_partition_cols=[], 34 | parquet_pruning=True, 35 | file_extension=".parquet", 36 | ) 37 | df = ctx.sql("SELECT * FROM test") 38 | assert isinstance(df.collect(), list) 39 | -------------------------------------------------------------------------------- /python/tests/test_unparser.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. 
See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | from datafusion.context import SessionContext 19 | from datafusion.unparser import Dialect, Unparser 20 | 21 | 22 | def test_unparser(): 23 | ctx = SessionContext() 24 | df = ctx.sql("SELECT 1") 25 | for dialect in [ 26 | Dialect.mysql(), 27 | Dialect.postgres(), 28 | Dialect.sqlite(), 29 | Dialect.duckdb(), 30 | ]: 31 | unparser = Unparser(dialect) 32 | sql = unparser.plan_to_sql(df.logical_plan()) 33 | assert sql == "SELECT 1" 34 | -------------------------------------------------------------------------------- /python/tests/test_view.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | 19 | from datafusion import SessionContext, col, literal 20 | 21 | 22 | def test_register_filtered_dataframe(): 23 | ctx = SessionContext() 24 | 25 | data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]} 26 | 27 | df = ctx.from_pydict(data, "my_table") 28 | 29 | df_filtered = df.filter(col("a") > literal(2)) 30 | 31 | ctx.register_view("view1", df_filtered) 32 | 33 | df_view = ctx.sql("SELECT * FROM view1") 34 | 35 | filtered_results = df_view.collect() 36 | 37 | result_dicts = [batch.to_pydict() for batch in filtered_results] 38 | 39 | expected_results = [{"a": [3, 4, 5], "b": [30, 40, 50]}] 40 | 41 | assert result_dicts == expected_results 42 | 43 | df_results = df.collect() 44 | 45 | df_result_dicts = [batch.to_pydict() for batch in df_results] 46 | 47 | expected_df_results = [{"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}] 48 | 49 | assert df_result_dicts == expected_df_results 50 | -------------------------------------------------------------------------------- /src/common.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. 
See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use pyo3::prelude::*; 19 | 20 | pub mod data_type; 21 | pub mod df_schema; 22 | pub mod function; 23 | pub mod schema; 24 | 25 | /// Initializes the `common` module to match the pattern of `datafusion-common` https://docs.rs/datafusion-common/18.0.0/datafusion_common/index.html 26 | pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { 27 | m.add_class::()?; 28 | m.add_class::()?; 29 | m.add_class::()?; 30 | m.add_class::()?; 31 | m.add_class::()?; 32 | m.add_class::()?; 33 | m.add_class::()?; 34 | m.add_class::()?; 35 | m.add_class::()?; 36 | m.add_class::()?; 37 | m.add_class::()?; 38 | m.add_class::()?; 39 | m.add_class::()?; 40 | m.add_class::()?; 41 | m.add_class::()?; 42 | Ok(()) 43 | } 44 | -------------------------------------------------------------------------------- /src/common/df_schema.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::sync::Arc; 19 | 20 | use datafusion::common::DFSchema; 21 | use pyo3::prelude::*; 22 | 23 | #[derive(Debug, Clone)] 24 | #[pyclass(name = "DFSchema", module = "datafusion.common", subclass)] 25 | pub struct PyDFSchema { 26 | schema: Arc, 27 | } 28 | 29 | impl From for DFSchema { 30 | fn from(schema: PyDFSchema) -> DFSchema { 31 | (*schema.schema).clone() 32 | } 33 | } 34 | 35 | impl From for PyDFSchema { 36 | fn from(schema: DFSchema) -> PyDFSchema { 37 | PyDFSchema { 38 | schema: Arc::new(schema), 39 | } 40 | } 41 | } 42 | 43 | #[pymethods] 44 | impl PyDFSchema { 45 | #[pyo3(name = "empty")] 46 | #[staticmethod] 47 | fn py_empty() -> PyResult { 48 | Ok(Self { 49 | schema: Arc::new(DFSchema::empty()), 50 | }) 51 | } 52 | 53 | #[pyo3(name = "field_names")] 54 | fn py_field_names(&self) -> PyResult> { 55 | Ok(self.schema.field_names()) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/common/function.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::collections::HashMap; 19 | 20 | use datafusion::arrow::datatypes::DataType; 21 | use pyo3::prelude::*; 22 | 23 | use super::data_type::PyDataType; 24 | 25 | #[pyclass(name = "SqlFunction", module = "datafusion.common", subclass)] 26 | #[derive(Debug, Clone)] 27 | pub struct SqlFunction { 28 | pub name: String, 29 | pub return_types: HashMap<Vec<DataType>, DataType>, 30 | pub aggregation: bool, 31 | } 32 | 33 | impl SqlFunction { 34 | pub fn new( 35 | function_name: String, 36 | input_types: Vec<PyDataType>, 37 | return_type: PyDataType, 38 | aggregation_bool: bool, 39 | ) -> Self { 40 | let mut func = Self { 41 | name: function_name, 42 | return_types: HashMap::new(), 43 | aggregation: aggregation_bool, 44 | }; 45 | func.add_type_mapping(input_types, return_type); 46 | func 47 | } 48 | 49 | pub fn add_type_mapping(&mut self, input_types: Vec<PyDataType>, return_type: PyDataType) { 50 | self.return_types.insert( 51 | input_types.iter().map(|t| t.clone().into()).collect(), 52 | return_type.into(), 53 | ); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/expr/alias.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License.
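The `datafusion.common` wrappers above (PyDFSchema, SqlFunction) are exposed to Python under the module name given in their `#[pyclass]` attributes. A minimal Python sketch, assuming the built `datafusion` wheel re-exports DFSchema from `datafusion.common`:

from datafusion.common import DFSchema

# DFSchema.empty() and field_names() mirror PyDFSchema::py_empty and py_field_names above
schema = DFSchema.empty()
assert schema.field_names() == []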
17 | 18 | use crate::expr::PyExpr; 19 | use pyo3::prelude::*; 20 | use std::fmt::{self, Display, Formatter}; 21 | 22 | use datafusion::logical_expr::expr::Alias; 23 | 24 | #[pyclass(name = "Alias", module = "datafusion.expr", subclass)] 25 | #[derive(Clone)] 26 | pub struct PyAlias { 27 | alias: Alias, 28 | } 29 | 30 | impl From for PyAlias { 31 | fn from(alias: Alias) -> Self { 32 | Self { alias } 33 | } 34 | } 35 | 36 | impl From for Alias { 37 | fn from(py_alias: PyAlias) -> Self { 38 | py_alias.alias 39 | } 40 | } 41 | 42 | impl Display for PyAlias { 43 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { 44 | write!( 45 | f, 46 | "Alias 47 | \nExpr: `{:?}` 48 | \nAlias Name: `{}`", 49 | &self.alias.expr, &self.alias.name 50 | ) 51 | } 52 | } 53 | 54 | #[pymethods] 55 | impl PyAlias { 56 | /// Retrieve the "name" of the alias 57 | fn alias(&self) -> PyResult { 58 | Ok(self.alias.name.clone()) 59 | } 60 | 61 | fn expr(&self) -> PyResult { 62 | Ok((*self.alias.expr.clone()).into()) 63 | } 64 | 65 | /// Get a String representation of this column 66 | fn __repr__(&self) -> String { 67 | format!("{}", self) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/expr/between.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
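The PyAlias wrapper above backs aliased expressions built from Python. A minimal sketch, assuming the usual DataFrame API (`select`, `Expr.alias`) and that `LogicalPlan.display_indent()` is available for printing the plan:

from datafusion import SessionContext, col, literal

ctx = SessionContext()
df = ctx.from_pydict({"a": [1, 2, 3]}, "t")

# The aliased projection below is represented in the logical plan by the Alias node.
df2 = df.select((col("a") + literal(1)).alias("a_plus_one"))
print(df2.logical_plan().display_indent())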
17 | 18 | use crate::expr::PyExpr; 19 | use datafusion::logical_expr::expr::Between; 20 | use pyo3::prelude::*; 21 | use std::fmt::{self, Display, Formatter}; 22 | 23 | #[pyclass(name = "Between", module = "datafusion.expr", subclass)] 24 | #[derive(Clone)] 25 | pub struct PyBetween { 26 | between: Between, 27 | } 28 | 29 | impl From for Between { 30 | fn from(between: PyBetween) -> Self { 31 | between.between 32 | } 33 | } 34 | 35 | impl From for PyBetween { 36 | fn from(between: Between) -> PyBetween { 37 | PyBetween { between } 38 | } 39 | } 40 | 41 | impl Display for PyBetween { 42 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { 43 | write!( 44 | f, 45 | "Between 46 | Expr: {:?} 47 | Negated: {:?} 48 | Low: {:?} 49 | High: {:?}", 50 | &self.between.expr, &self.between.negated, &self.between.low, &self.between.high 51 | ) 52 | } 53 | } 54 | 55 | #[pymethods] 56 | impl PyBetween { 57 | fn expr(&self) -> PyResult { 58 | Ok((*self.between.expr).clone().into()) 59 | } 60 | 61 | fn negated(&self) -> PyResult { 62 | Ok(self.between.negated) 63 | } 64 | 65 | fn low(&self) -> PyResult { 66 | Ok((*self.between.low).clone().into()) 67 | } 68 | 69 | fn high(&self) -> PyResult { 70 | Ok((*self.between.high).clone().into()) 71 | } 72 | 73 | fn __repr__(&self) -> String { 74 | format!("{}", self) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/expr/binary_expr.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
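A SQL BETWEEN predicate is what the PyBetween accessors above (expr / negated / low / high) expose from the plan. A minimal sketch, assuming `from_pydict(data, name)` registers the table for SQL:

from datafusion import SessionContext

ctx = SessionContext()
ctx.from_pydict({"a": [1, 2, 3, 4, 5]}, "t")

# WHERE a BETWEEN 2 AND 4 is planned as a Between expression
df = ctx.sql("SELECT a FROM t WHERE a BETWEEN 2 AND 4")
print(df.collect()[0].to_pydict())  # expected: {'a': [2, 3, 4]}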
17 | 18 | use crate::expr::PyExpr; 19 | use datafusion::logical_expr::BinaryExpr; 20 | use pyo3::prelude::*; 21 | 22 | #[pyclass(name = "BinaryExpr", module = "datafusion.expr", subclass)] 23 | #[derive(Clone)] 24 | pub struct PyBinaryExpr { 25 | expr: BinaryExpr, 26 | } 27 | 28 | impl From for BinaryExpr { 29 | fn from(expr: PyBinaryExpr) -> Self { 30 | expr.expr 31 | } 32 | } 33 | 34 | impl From for PyBinaryExpr { 35 | fn from(expr: BinaryExpr) -> PyBinaryExpr { 36 | PyBinaryExpr { expr } 37 | } 38 | } 39 | 40 | #[pymethods] 41 | impl PyBinaryExpr { 42 | fn left(&self) -> PyExpr { 43 | self.expr.left.as_ref().clone().into() 44 | } 45 | 46 | fn right(&self) -> PyExpr { 47 | self.expr.right.as_ref().clone().into() 48 | } 49 | 50 | fn op(&self) -> String { 51 | format!("{}", self.expr.op) 52 | } 53 | 54 | fn __repr__(&self) -> PyResult { 55 | Ok(format!("{}", self.expr)) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/expr/case.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use crate::expr::PyExpr; 19 | use datafusion::logical_expr::Case; 20 | use pyo3::prelude::*; 21 | 22 | #[pyclass(name = "Case", module = "datafusion.expr", subclass)] 23 | #[derive(Clone)] 24 | pub struct PyCase { 25 | case: Case, 26 | } 27 | 28 | impl From for Case { 29 | fn from(case: PyCase) -> Self { 30 | case.case 31 | } 32 | } 33 | 34 | impl From for PyCase { 35 | fn from(case: Case) -> PyCase { 36 | PyCase { case } 37 | } 38 | } 39 | 40 | #[pymethods] 41 | impl PyCase { 42 | fn expr(&self) -> Option { 43 | self.case.expr.as_ref().map(|e| (**e).clone().into()) 44 | } 45 | 46 | fn when_then_expr(&self) -> Vec<(PyExpr, PyExpr)> { 47 | self.case 48 | .when_then_expr 49 | .iter() 50 | .map(|e| ((*e.0).clone().into(), (*e.1).clone().into())) 51 | .collect() 52 | } 53 | 54 | fn else_expr(&self) -> Option { 55 | self.case.else_expr.as_ref().map(|e| (**e).clone().into()) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/expr/cast.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use crate::{common::data_type::PyDataType, expr::PyExpr}; 19 | use datafusion::logical_expr::{Cast, TryCast}; 20 | use pyo3::prelude::*; 21 | 22 | #[pyclass(name = "Cast", module = "datafusion.expr", subclass)] 23 | #[derive(Clone)] 24 | pub struct PyCast { 25 | cast: Cast, 26 | } 27 | 28 | impl From for Cast { 29 | fn from(cast: PyCast) -> Self { 30 | cast.cast 31 | } 32 | } 33 | 34 | impl From for PyCast { 35 | fn from(cast: Cast) -> PyCast { 36 | PyCast { cast } 37 | } 38 | } 39 | 40 | #[pymethods] 41 | impl PyCast { 42 | fn expr(&self) -> PyResult { 43 | Ok((*self.cast.expr).clone().into()) 44 | } 45 | 46 | fn data_type(&self) -> PyResult { 47 | Ok(self.cast.data_type.clone().into()) 48 | } 49 | } 50 | 51 | #[pyclass(name = "TryCast", module = "datafusion.expr", subclass)] 52 | #[derive(Clone)] 53 | pub struct PyTryCast { 54 | try_cast: TryCast, 55 | } 56 | 57 | impl From for TryCast { 58 | fn from(try_cast: PyTryCast) -> Self { 59 | try_cast.try_cast 60 | } 61 | } 62 | 63 | impl From for PyTryCast { 64 | fn from(try_cast: TryCast) -> PyTryCast { 65 | PyTryCast { try_cast } 66 | } 67 | } 68 | 69 | #[pymethods] 70 | impl PyTryCast { 71 | fn expr(&self) -> PyResult { 72 | Ok((*self.try_cast.expr).clone().into()) 73 | } 74 | 75 | fn data_type(&self) -> PyResult { 76 | Ok(self.try_cast.data_type.clone().into()) 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/expr/column.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
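CAST and TRY_CAST in SQL are planned as the Cast / TryCast expressions wrapped above. A minimal sketch, assuming `from_pydict(data, name)` registers the table for SQL:

from datafusion import SessionContext

ctx = SessionContext()
ctx.from_pydict({"a": [1, 2, 3]}, "t")

# CAST maps to PyCast, TRY_CAST to PyTryCast in the logical plan
df = ctx.sql("SELECT CAST(a AS DOUBLE) AS a_f64, TRY_CAST(a AS VARCHAR) AS a_str FROM t")
print(df.collect()[0].to_pydict())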
17 | 18 | use datafusion::common::Column; 19 | use pyo3::prelude::*; 20 | 21 | #[pyclass(name = "Column", module = "datafusion.expr", subclass)] 22 | #[derive(Clone)] 23 | pub struct PyColumn { 24 | pub col: Column, 25 | } 26 | 27 | impl PyColumn { 28 | pub fn new(col: Column) -> Self { 29 | Self { col } 30 | } 31 | } 32 | 33 | impl From for PyColumn { 34 | fn from(col: Column) -> PyColumn { 35 | PyColumn { col } 36 | } 37 | } 38 | 39 | #[pymethods] 40 | impl PyColumn { 41 | /// Get the column name 42 | fn name(&self) -> String { 43 | self.col.name.clone() 44 | } 45 | 46 | /// Get the column relation 47 | fn relation(&self) -> Option { 48 | self.col.relation.as_ref().map(|r| format!("{}", r)) 49 | } 50 | 51 | /// Get the fully-qualified column name 52 | fn qualified_name(&self) -> String { 53 | self.col.flat_name() 54 | } 55 | 56 | /// Get a String representation of this column 57 | fn __repr__(&self) -> String { 58 | self.qualified_name() 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/expr/conditional_expr.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use crate::{errors::PyDataFusionResult, expr::PyExpr}; 19 | use datafusion::logical_expr::conditional_expressions::CaseBuilder; 20 | use pyo3::prelude::*; 21 | 22 | #[pyclass(name = "CaseBuilder", module = "datafusion.expr", subclass)] 23 | pub struct PyCaseBuilder { 24 | pub case_builder: CaseBuilder, 25 | } 26 | 27 | impl From for CaseBuilder { 28 | fn from(case_builder: PyCaseBuilder) -> Self { 29 | case_builder.case_builder 30 | } 31 | } 32 | 33 | impl From for PyCaseBuilder { 34 | fn from(case_builder: CaseBuilder) -> PyCaseBuilder { 35 | PyCaseBuilder { case_builder } 36 | } 37 | } 38 | 39 | #[pymethods] 40 | impl PyCaseBuilder { 41 | fn when(&mut self, when: PyExpr, then: PyExpr) -> PyCaseBuilder { 42 | PyCaseBuilder { 43 | case_builder: self.case_builder.when(when.expr, then.expr), 44 | } 45 | } 46 | 47 | fn otherwise(&mut self, else_expr: PyExpr) -> PyDataFusionResult { 48 | Ok(self.case_builder.otherwise(else_expr.expr)?.clone().into()) 49 | } 50 | 51 | fn end(&mut self) -> PyDataFusionResult { 52 | Ok(self.case_builder.end()?.clone().into()) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/expr/exists.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. 
The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use datafusion::logical_expr::expr::Exists; 19 | use pyo3::prelude::*; 20 | 21 | use super::subquery::PySubquery; 22 | 23 | #[pyclass(name = "Exists", module = "datafusion.expr", subclass)] 24 | #[derive(Clone)] 25 | pub struct PyExists { 26 | exists: Exists, 27 | } 28 | 29 | impl From for PyExists { 30 | fn from(exists: Exists) -> Self { 31 | PyExists { exists } 32 | } 33 | } 34 | 35 | #[pymethods] 36 | impl PyExists { 37 | fn subquery(&self) -> PySubquery { 38 | self.exists.subquery.clone().into() 39 | } 40 | 41 | fn negated(&self) -> bool { 42 | self.exists.negated 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/expr/extension.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use datafusion::logical_expr::Extension; 19 | use pyo3::{prelude::*, IntoPyObjectExt}; 20 | 21 | use crate::sql::logical::PyLogicalPlan; 22 | 23 | use super::logical_node::LogicalNode; 24 | 25 | #[pyclass(name = "Extension", module = "datafusion.expr", subclass)] 26 | #[derive(Clone)] 27 | pub struct PyExtension { 28 | pub node: Extension, 29 | } 30 | 31 | impl From for PyExtension { 32 | fn from(node: Extension) -> PyExtension { 33 | PyExtension { node } 34 | } 35 | } 36 | 37 | #[pymethods] 38 | impl PyExtension { 39 | fn name(&self) -> PyResult { 40 | Ok(self.node.node.name().to_string()) 41 | } 42 | } 43 | 44 | impl LogicalNode for PyExtension { 45 | fn inputs(&self) -> Vec { 46 | vec![] 47 | } 48 | 49 | fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { 50 | self.clone().into_bound_py_any(py) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/expr/grouping_set.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. 
The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use datafusion::logical_expr::GroupingSet; 19 | use pyo3::prelude::*; 20 | 21 | #[pyclass(name = "GroupingSet", module = "datafusion.expr", subclass)] 22 | #[derive(Clone)] 23 | pub struct PyGroupingSet { 24 | grouping_set: GroupingSet, 25 | } 26 | 27 | impl From for GroupingSet { 28 | fn from(grouping_set: PyGroupingSet) -> Self { 29 | grouping_set.grouping_set 30 | } 31 | } 32 | 33 | impl From for PyGroupingSet { 34 | fn from(grouping_set: GroupingSet) -> PyGroupingSet { 35 | PyGroupingSet { grouping_set } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/expr/in_list.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use crate::expr::PyExpr; 19 | use datafusion::logical_expr::expr::InList; 20 | use pyo3::prelude::*; 21 | 22 | #[pyclass(name = "InList", module = "datafusion.expr", subclass)] 23 | #[derive(Clone)] 24 | pub struct PyInList { 25 | in_list: InList, 26 | } 27 | 28 | impl From for PyInList { 29 | fn from(in_list: InList) -> Self { 30 | PyInList { in_list } 31 | } 32 | } 33 | 34 | #[pymethods] 35 | impl PyInList { 36 | fn expr(&self) -> PyExpr { 37 | (*self.in_list.expr).clone().into() 38 | } 39 | 40 | fn list(&self) -> Vec { 41 | self.in_list.list.iter().map(|e| e.clone().into()).collect() 42 | } 43 | 44 | fn negated(&self) -> bool { 45 | self.in_list.negated 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/expr/in_subquery.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use datafusion::logical_expr::expr::InSubquery; 19 | use pyo3::prelude::*; 20 | 21 | use super::{subquery::PySubquery, PyExpr}; 22 | 23 | #[pyclass(name = "InSubquery", module = "datafusion.expr", subclass)] 24 | #[derive(Clone)] 25 | pub struct PyInSubquery { 26 | in_subquery: InSubquery, 27 | } 28 | 29 | impl From for PyInSubquery { 30 | fn from(in_subquery: InSubquery) -> Self { 31 | PyInSubquery { in_subquery } 32 | } 33 | } 34 | 35 | #[pymethods] 36 | impl PyInSubquery { 37 | fn expr(&self) -> PyExpr { 38 | (*self.in_subquery.expr).clone().into() 39 | } 40 | 41 | fn subquery(&self) -> PySubquery { 42 | self.in_subquery.subquery.clone().into() 43 | } 44 | 45 | fn negated(&self) -> bool { 46 | self.in_subquery.negated 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/expr/indexed_field.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use crate::expr::PyExpr; 19 | use datafusion::logical_expr::expr::{GetFieldAccess, GetIndexedField}; 20 | use pyo3::prelude::*; 21 | use std::fmt::{Display, Formatter}; 22 | 23 | use super::literal::PyLiteral; 24 | 25 | #[pyclass(name = "GetIndexedField", module = "datafusion.expr", subclass)] 26 | #[derive(Clone)] 27 | pub struct PyGetIndexedField { 28 | indexed_field: GetIndexedField, 29 | } 30 | 31 | impl From for GetIndexedField { 32 | fn from(indexed_field: PyGetIndexedField) -> Self { 33 | indexed_field.indexed_field 34 | } 35 | } 36 | 37 | impl From for PyGetIndexedField { 38 | fn from(indexed_field: GetIndexedField) -> PyGetIndexedField { 39 | PyGetIndexedField { indexed_field } 40 | } 41 | } 42 | 43 | impl Display for PyGetIndexedField { 44 | fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { 45 | write!( 46 | f, 47 | "GetIndexedField 48 | Expr: {:?} 49 | Key: {:?}", 50 | &self.indexed_field.expr, &self.indexed_field.field 51 | ) 52 | } 53 | } 54 | 55 | #[pymethods] 56 | impl PyGetIndexedField { 57 | fn expr(&self) -> PyResult { 58 | Ok((*self.indexed_field.expr).clone().into()) 59 | } 60 | 61 | fn key(&self) -> PyResult { 62 | match &self.indexed_field.field { 63 | GetFieldAccess::NamedStructField { name, .. 
} => Ok(name.clone().into()), 64 | _ => todo!(), 65 | } 66 | } 67 | 68 | /// Get a String representation of this indexed field 69 | fn __repr__(&self) -> String { 70 | format!("{}", self) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/expr/logical_node.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use pyo3::{Bound, PyAny, PyResult, Python}; 19 | 20 | use crate::sql::logical::PyLogicalPlan; 21 | 22 | /// Representation of a `LogicalNode` in the overall `LogicalPlan`; 23 | /// every plan node shares these common traits. 24 | pub trait LogicalNode { 25 | /// The input plans to the current logical node instance. 26 | fn inputs(&self) -> Vec<PyLogicalPlan>; 27 | 28 | fn to_variant<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>>; 29 | } 30 | -------------------------------------------------------------------------------- /src/expr/placeholder.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License.
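The LogicalNode trait above (inputs / to_variant) is what drives plan traversal from Python. A minimal sketch, assuming `LogicalPlan.to_variant()`, `LogicalPlan.inputs()` and `display_indent()` are exposed as the wrappers above suggest:

from datafusion import SessionContext, col, literal

ctx = SessionContext()
df = ctx.from_pydict({"a": [1, 2, 3]}, "t").filter(col("a") > literal(1))

plan = df.logical_plan()
node = plan.to_variant()          # concrete node class, e.g. Filter
print(type(node).__name__)
for child in plan.inputs():       # children, mirroring LogicalNode::inputs
    print(child.display_indent())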
17 | 18 | use datafusion::logical_expr::expr::Placeholder; 19 | use pyo3::prelude::*; 20 | 21 | use crate::common::data_type::PyDataType; 22 | 23 | #[pyclass(name = "Placeholder", module = "datafusion.expr", subclass)] 24 | #[derive(Clone)] 25 | pub struct PyPlaceholder { 26 | placeholder: Placeholder, 27 | } 28 | 29 | impl From for PyPlaceholder { 30 | fn from(placeholder: Placeholder) -> Self { 31 | PyPlaceholder { placeholder } 32 | } 33 | } 34 | 35 | #[pymethods] 36 | impl PyPlaceholder { 37 | fn id(&self) -> String { 38 | self.placeholder.id.clone() 39 | } 40 | 41 | fn data_type(&self) -> Option { 42 | self.placeholder 43 | .data_type 44 | .as_ref() 45 | .map(|e| e.clone().into()) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/expr/scalar_subquery.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use datafusion::logical_expr::Subquery; 19 | use pyo3::prelude::*; 20 | 21 | use super::subquery::PySubquery; 22 | 23 | #[pyclass(name = "ScalarSubquery", module = "datafusion.expr", subclass)] 24 | #[derive(Clone)] 25 | pub struct PyScalarSubquery { 26 | subquery: Subquery, 27 | } 28 | 29 | impl From for Subquery { 30 | fn from(subquery: PyScalarSubquery) -> Self { 31 | subquery.subquery 32 | } 33 | } 34 | 35 | impl From for PyScalarSubquery { 36 | fn from(subquery: Subquery) -> PyScalarSubquery { 37 | PyScalarSubquery { subquery } 38 | } 39 | } 40 | 41 | #[pymethods] 42 | impl PyScalarSubquery { 43 | fn subquery(&self) -> PySubquery { 44 | self.subquery.clone().into() 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/expr/scalar_variable.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
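A scalar subquery in SQL is represented by the ScalarSubquery / Subquery wrappers above. A minimal sketch, assuming `from_pydict(data, name)` registers the table:

from datafusion import SessionContext

ctx = SessionContext()
ctx.from_pydict({"a": [1, 2, 3, 4]}, "t")

# The (SELECT MAX(a) FROM t) scalar subquery appears as a ScalarSubquery expression
df = ctx.sql("SELECT a FROM t WHERE a = (SELECT MAX(a) FROM t)")
print(df.collect()[0].to_pydict())  # expected: {'a': [4]}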
17 | 18 | use datafusion::arrow::datatypes::DataType; 19 | use pyo3::prelude::*; 20 | 21 | use crate::common::data_type::PyDataType; 22 | 23 | #[pyclass(name = "ScalarVariable", module = "datafusion.expr", subclass)] 24 | #[derive(Clone)] 25 | pub struct PyScalarVariable { 26 | data_type: DataType, 27 | variables: Vec, 28 | } 29 | 30 | impl PyScalarVariable { 31 | pub fn new(data_type: &DataType, variables: &[String]) -> Self { 32 | Self { 33 | data_type: data_type.to_owned(), 34 | variables: variables.to_vec(), 35 | } 36 | } 37 | } 38 | 39 | #[pymethods] 40 | impl PyScalarVariable { 41 | /// Get the data type 42 | fn data_type(&self) -> PyResult { 43 | Ok(self.data_type.clone().into()) 44 | } 45 | 46 | fn variables(&self) -> PyResult> { 47 | Ok(self.variables.clone()) 48 | } 49 | 50 | fn __repr__(&self) -> PyResult { 51 | Ok(format!("{}{:?}", self.data_type, self.variables)) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/expr/signature.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use datafusion::logical_expr::{TypeSignature, Volatility}; 19 | use pyo3::prelude::*; 20 | 21 | #[allow(dead_code)] 22 | #[pyclass(name = "Signature", module = "datafusion.expr", subclass)] 23 | #[derive(Clone)] 24 | pub struct PySignature { 25 | type_signature: TypeSignature, 26 | volatility: Volatility, 27 | } 28 | 29 | impl PySignature { 30 | pub fn new(type_signature: TypeSignature, volatility: Volatility) -> Self { 31 | Self { 32 | type_signature, 33 | volatility, 34 | } 35 | } 36 | } 37 | 38 | #[pymethods] 39 | impl PySignature {} 40 | -------------------------------------------------------------------------------- /src/expr/subquery.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. 
See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::fmt::{self, Display, Formatter}; 19 | 20 | use datafusion::logical_expr::Subquery; 21 | use pyo3::{prelude::*, IntoPyObjectExt}; 22 | 23 | use crate::sql::logical::PyLogicalPlan; 24 | 25 | use super::logical_node::LogicalNode; 26 | 27 | #[pyclass(name = "Subquery", module = "datafusion.expr", subclass)] 28 | #[derive(Clone)] 29 | pub struct PySubquery { 30 | subquery: Subquery, 31 | } 32 | 33 | impl From for Subquery { 34 | fn from(subquery: PySubquery) -> Self { 35 | subquery.subquery 36 | } 37 | } 38 | 39 | impl From for PySubquery { 40 | fn from(subquery: Subquery) -> PySubquery { 41 | PySubquery { subquery } 42 | } 43 | } 44 | 45 | impl Display for PySubquery { 46 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { 47 | write!( 48 | f, 49 | "Subquery 50 | Subquery: {:?} 51 | outer_ref_columns: {:?}", 52 | self.subquery.subquery, self.subquery.outer_ref_columns, 53 | ) 54 | } 55 | } 56 | 57 | #[pymethods] 58 | impl PySubquery { 59 | /// Retrieves the input `LogicalPlan` to this `Projection` node 60 | fn input(&self) -> PyResult> { 61 | Ok(Self::inputs(self)) 62 | } 63 | 64 | fn __repr__(&self) -> PyResult { 65 | Ok(format!("Subquery({})", self)) 66 | } 67 | 68 | fn __name__(&self) -> PyResult { 69 | Ok("Subquery".to_string()) 70 | } 71 | } 72 | 73 | impl LogicalNode for PySubquery { 74 | fn inputs(&self) -> Vec { 75 | vec![] 76 | } 77 | 78 | fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { 79 | self.clone().into_bound_py_any(py) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/expr/unnest_expr.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
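Correlated EXISTS predicates are planned as an Exists expression holding the Subquery wrapped above. A minimal sketch, assuming `from_pydict(data, name)` registers the tables:

from datafusion import SessionContext

ctx = SessionContext()
ctx.from_pydict({"a": [1, 2, 3]}, "t")
ctx.from_pydict({"a": [2, 3, 4]}, "u")

df = ctx.sql("SELECT a FROM t WHERE EXISTS (SELECT 1 FROM u WHERE u.a = t.a)")
print(df.collect()[0].to_pydict())  # expected: rows 2 and 3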
17 | 18 | use datafusion::logical_expr::expr::Unnest; 19 | use pyo3::prelude::*; 20 | use std::fmt::{self, Display, Formatter}; 21 | 22 | use super::PyExpr; 23 | 24 | #[pyclass(name = "UnnestExpr", module = "datafusion.expr", subclass)] 25 | #[derive(Clone)] 26 | pub struct PyUnnestExpr { 27 | unnest: Unnest, 28 | } 29 | 30 | impl From for PyUnnestExpr { 31 | fn from(unnest: Unnest) -> PyUnnestExpr { 32 | PyUnnestExpr { unnest } 33 | } 34 | } 35 | 36 | impl From for Unnest { 37 | fn from(unnest: PyUnnestExpr) -> Self { 38 | unnest.unnest 39 | } 40 | } 41 | 42 | impl Display for PyUnnestExpr { 43 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { 44 | write!( 45 | f, 46 | "Unnest 47 | Expr: {:?}", 48 | &self.unnest.expr, 49 | ) 50 | } 51 | } 52 | 53 | #[pymethods] 54 | impl PyUnnestExpr { 55 | /// Retrieves the expression that is being unnested 56 | fn expr(&self) -> PyResult { 57 | Ok((*self.unnest.expr).clone().into()) 58 | } 59 | 60 | fn __repr__(&self) -> PyResult { 61 | Ok(format!("UnnestExpr({})", self)) 62 | } 63 | 64 | fn __name__(&self) -> PyResult { 65 | Ok("UnnestExpr".to_string()) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/pyarrow_util.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | //! 
Conversions between PyArrow and DataFusion types 19 | 20 | use arrow::array::{Array, ArrayData}; 21 | use arrow::pyarrow::{FromPyArrow, ToPyArrow}; 22 | use datafusion::scalar::ScalarValue; 23 | use pyo3::types::{PyAnyMethods, PyList}; 24 | use pyo3::{Bound, FromPyObject, PyAny, PyObject, PyResult, Python}; 25 | 26 | use crate::common::data_type::PyScalarValue; 27 | use crate::errors::PyDataFusionError; 28 | 29 | impl FromPyArrow for PyScalarValue { 30 | fn from_pyarrow_bound(value: &Bound<'_, PyAny>) -> PyResult { 31 | let py = value.py(); 32 | let typ = value.getattr("type")?; 33 | let val = value.call_method0("as_py")?; 34 | 35 | // construct pyarrow array from the python value and pyarrow type 36 | let factory = py.import("pyarrow")?.getattr("array")?; 37 | let args = PyList::new(py, [val])?; 38 | let array = factory.call1((args, typ))?; 39 | 40 | // convert the pyarrow array to rust array using C data interface 41 | let array = arrow::array::make_array(ArrayData::from_pyarrow_bound(&array)?); 42 | let scalar = ScalarValue::try_from_array(&array, 0).map_err(PyDataFusionError::from)?; 43 | 44 | Ok(PyScalarValue(scalar)) 45 | } 46 | } 47 | 48 | impl<'source> FromPyObject<'source> for PyScalarValue { 49 | fn extract_bound(value: &Bound<'source, PyAny>) -> PyResult { 50 | Self::from_pyarrow_bound(value) 51 | } 52 | } 53 | 54 | pub fn scalar_to_pyarrow(scalar: &ScalarValue, py: Python) -> PyResult { 55 | let array = scalar.to_array().map_err(PyDataFusionError::from)?; 56 | // convert to pyarrow array using C data interface 57 | let pyarray = array.to_data().to_pyarrow(py)?; 58 | let pyscalar = pyarray.call_method1(py, "__getitem__", (0,))?; 59 | 60 | Ok(pyscalar) 61 | } 62 | -------------------------------------------------------------------------------- /src/sql.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | pub mod exceptions; 19 | pub mod logical; 20 | -------------------------------------------------------------------------------- /src/sql/exceptions.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::fmt::{Debug, Display}; 19 | 20 | use pyo3::PyErr; 21 | 22 | pub fn py_type_err(e: impl Debug + Display) -> PyErr { 23 | PyErr::new::<pyo3::exceptions::PyTypeError, _>(format!("{e}")) 24 | } 25 | 26 | pub fn py_runtime_err(e: impl Debug + Display) -> PyErr { 27 | PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(format!("{e}")) 28 | } 29 | 30 | pub fn py_value_err(e: impl Debug + Display) -> PyErr { 31 | PyErr::new::<pyo3::exceptions::PyValueError, _>(format!("{e}")) 32 | } 33 | -------------------------------------------------------------------------------- /src/unparser/dialect.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::sync::Arc; 19 | 20 | use datafusion::sql::unparser::dialect::{ 21 | DefaultDialect, Dialect, DuckDBDialect, MySqlDialect, PostgreSqlDialect, SqliteDialect, 22 | }; 23 | use pyo3::prelude::*; 24 | 25 | #[pyclass(name = "Dialect", module = "datafusion.unparser", subclass)] 26 | #[derive(Clone)] 27 | pub struct PyDialect { 28 | pub dialect: Arc<dyn Dialect + Send + Sync>, 29 | } 30 | 31 | #[pymethods] 32 | impl PyDialect { 33 | #[staticmethod] 34 | pub fn default() -> Self { 35 | Self { 36 | dialect: Arc::new(DefaultDialect {}), 37 | } 38 | } 39 | #[staticmethod] 40 | pub fn postgres() -> Self { 41 | Self { 42 | dialect: Arc::new(PostgreSqlDialect {}), 43 | } 44 | } 45 | #[staticmethod] 46 | pub fn mysql() -> Self { 47 | Self { 48 | dialect: Arc::new(MySqlDialect {}), 49 | } 50 | } 51 | #[staticmethod] 52 | pub fn sqlite() -> Self { 53 | Self { 54 | dialect: Arc::new(SqliteDialect {}), 55 | } 56 | } 57 | #[staticmethod] 58 | pub fn duckdb() -> Self { 59 | Self { 60 | dialect: Arc::new(DuckDBDialect::new()), 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/unparser/mod.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License.
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | mod dialect; 19 | 20 | use std::sync::Arc; 21 | 22 | use datafusion::sql::unparser::{dialect::Dialect, Unparser}; 23 | use dialect::PyDialect; 24 | use pyo3::{exceptions::PyValueError, prelude::*}; 25 | 26 | use crate::sql::logical::PyLogicalPlan; 27 | 28 | #[pyclass(name = "Unparser", module = "datafusion.unparser", subclass)] 29 | #[derive(Clone)] 30 | pub struct PyUnparser { 31 | dialect: Arc<dyn Dialect + Send + Sync>, 32 | pretty: bool, 33 | } 34 | 35 | #[pymethods] 36 | impl PyUnparser { 37 | #[new] 38 | pub fn new(dialect: PyDialect) -> Self { 39 | Self { 40 | dialect: dialect.dialect.clone(), 41 | pretty: false, 42 | } 43 | } 44 | 45 | pub fn plan_to_sql(&self, plan: &PyLogicalPlan) -> PyResult<String> { 46 | let mut unparser = Unparser::new(self.dialect.as_ref()); 47 | unparser = unparser.with_pretty(self.pretty); 48 | let sql = unparser 49 | .plan_to_sql(&plan.plan()) 50 | .map_err(|e| PyValueError::new_err(e.to_string()))?; 51 | Ok(sql.to_string()) 52 | } 53 | 54 | pub fn with_pretty(&self, pretty: bool) -> Self { 55 | Self { 56 | dialect: self.dialect.clone(), 57 | pretty, 58 | } 59 | } 60 | } 61 | 62 | pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { 63 | m.add_class::<PyUnparser>()?; 64 | m.add_class::<PyDialect>()?; 65 | Ok(()) 66 | } 67 | --------------------------------------------------------------------------------
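End-to-end, the unparser module above round-trips a logical plan back to dialect-specific SQL. A minimal sketch, assuming the Python `datafusion.unparser` wrapper forwards `Dialect.default()` and `Unparser.with_pretty()` from the pyclasses above:

from datafusion import SessionContext
from datafusion.unparser import Dialect, Unparser

ctx = SessionContext()
ctx.from_pydict({"a": [1, 2, 3], "b": [10, 20, 30]}, "t")
df = ctx.sql("SELECT a, b FROM t WHERE a > 1")

for dialect in [Dialect.default(), Dialect.postgres(), Dialect.duckdb()]:
    # with_pretty mirrors PyUnparser::with_pretty; plan_to_sql mirrors PyUnparser::plan_to_sql
    sql = Unparser(dialect).with_pretty(True).plan_to_sql(df.logical_plan())
    print(sql)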