├── examples
├── tpch
│ ├── .gitignore
│ ├── util.py
│ └── README.md
├── chart.png
├── datafusion-ffi-example
│ ├── .cargo
│ │ └── config.toml
│ ├── build.rs
│ ├── pyproject.toml
│ ├── Cargo.toml
│ ├── src
│ │ ├── lib.rs
│ │ └── table_function.rs
│ └── python
│ │ └── tests
│ │ ├── _test_table_provider.py
│ │ ├── _test_scalar_udf.py
│ │ └── _test_catalog_provider.py
├── dataframe-parquet.py
├── sql-parquet.py
├── sql-parquet-s3.py
├── sql-to-pandas.py
├── query-pyarrow-data.py
├── python-udf.py
├── export.py
├── create-context.py
├── sql-using-python-udf.py
├── import.py
├── python-udaf.py
└── substrait.py
├── benchmarks
├── tpch
│ ├── .gitignore
│ ├── queries
│ │ ├── q6.sql
│ │ ├── q17.sql
│ │ ├── q14.sql
│ │ ├── q13.sql
│ │ ├── q4.sql
│ │ ├── q3.sql
│ │ ├── q5.sql
│ │ ├── q18.sql
│ │ ├── q16.sql
│ │ ├── q1.sql
│ │ ├── q11.sql
│ │ ├── q15.sql
│ │ ├── q10.sql
│ │ ├── q12.sql
│ │ ├── q9.sql
│ │ ├── q20.sql
│ │ ├── q22.sql
│ │ ├── q21.sql
│ │ ├── q2.sql
│ │ ├── q8.sql
│ │ ├── q7.sql
│ │ └── q19.sql
│ ├── README.md
│ └── tpch-gen.sh
└── db-benchmark
│ ├── run-bench.sh
│ └── README.md
├── docs
├── .gitignore
├── source
│ ├── _static
│ │ └── images
│ │ │ ├── original.png
│ │ │ ├── original2x.png
│ │ │ └── 2x_bgwhite_original.png
│ ├── images
│ │ └── jupyter_lab_df_view.png
│ ├── _templates
│ │ ├── layout.html
│ │ └── docs-sidebar.html
│ └── user-guide
│ │ ├── io
│ │ ├── index.rst
│ │ ├── avro.rst
│ │ ├── json.rst
│ │ ├── csv.rst
│ │ ├── parquet.rst
│ │ └── table_provider.rst
│ │ └── common-operations
│ │ ├── index.rst
│ │ ├── views.rst
│ │ └── basic-info.rst
├── mdbook
│ ├── src
│ │ ├── images
│ │ │ └── datafusion-jupyterlab.png
│ │ ├── SUMMARY.md
│ │ ├── usage
│ │ │ ├── index.md
│ │ │ └── create-table.md
│ │ ├── installation.md
│ │ ├── index.md
│ │ └── quickstart.md
│ ├── book.toml
│ └── README.md
├── build.sh
├── Makefile
└── make.bat
├── python
├── tests
│ ├── data_test_context
│ │ └── data.json
│ ├── __init__.py
│ ├── test_unparser.py
│ ├── test_input.py
│ ├── test_store.py
│ ├── test_config.py
│ ├── test_view.py
│ ├── test_plans.py
│ ├── test_indexing.py
│ ├── conftest.py
│ └── utils.py
└── datafusion
│ ├── py.typed
│ ├── input
│ ├── __init__.py
│ └── base.py
│ ├── udf.py
│ ├── html_formatter.py
│ ├── object_store.py
│ ├── col.py
│ └── common.py
├── .dockerignore
├── .gitmodules
├── .cargo
└── config.toml
├── .github
├── ISSUE_TEMPLATE
│ ├── bug_report.md
│ └── feature_request.md
├── pull_request_template.md
├── workflows
│ ├── dev.yml
│ └── take.yml
└── dependabot.yml
├── .gitignore
├── dev
├── release
│ ├── rat_exclude_files.txt
│ ├── run-rat.sh
│ ├── check-rat-report.py
│ └── release-tarball.sh
├── build-set-env.sh
├── python_lint.sh
├── rust_lint.sh
├── changelog
│ └── 45.0.0.md
└── clean.sh
├── rustfmt.toml
├── src
├── sql.rs
├── expr
│ ├── logical_node.rs
│ ├── grouping_set.rs
│ ├── signature.rs
│ ├── exists.rs
│ ├── in_list.rs
│ ├── scalar_subquery.rs
│ ├── placeholder.rs
│ ├── in_subquery.rs
│ ├── extension.rs
│ ├── binary_expr.rs
│ ├── scalar_variable.rs
│ ├── case.rs
│ ├── column.rs
│ ├── unnest_expr.rs
│ ├── alias.rs
│ ├── cast.rs
│ ├── between.rs
│ ├── indexed_field.rs
│ └── subquery.rs
├── sql
│ └── exceptions.rs
├── common
│ ├── df_schema.rs
│ └── function.rs
├── common.rs
├── unparser
│ ├── dialect.rs
│ └── mod.rs
└── pyarrow_util.rs
├── ci
└── scripts
│ ├── rust_fmt.sh
│ ├── python_lint.sh
│ ├── rust_toml_fmt.sh
│ └── rust_clippy.sh
├── CHANGELOG.md
├── .asf.yaml
└── .pre-commit-config.yaml
/examples/tpch/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 |
3 |
--------------------------------------------------------------------------------
/benchmarks/tpch/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 | results.csv
--------------------------------------------------------------------------------
/examples/chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apache/datafusion-python/HEAD/examples/chart.png
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | pokemon.csv
2 | yellow_trip_data.parquet
3 | yellow_tripdata_2021-01.parquet
4 |
5 |
--------------------------------------------------------------------------------
/python/tests/data_test_context/data.json:
--------------------------------------------------------------------------------
1 | {"A": "a", "B": 1}
2 | {"A": "b", "B": 2}
3 | {"A": "c", "B": 3}
4 |
--------------------------------------------------------------------------------
/docs/source/_static/images/original.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apache/datafusion-python/HEAD/docs/source/_static/images/original.png
--------------------------------------------------------------------------------
/docs/source/_static/images/original2x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apache/datafusion-python/HEAD/docs/source/_static/images/original2x.png
--------------------------------------------------------------------------------
/docs/source/images/jupyter_lab_df_view.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apache/datafusion-python/HEAD/docs/source/images/jupyter_lab_df_view.png
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | .cargo
2 | .github
3 | .pytest_cache
4 | ci
5 | conda
6 | dev
7 | docs
8 | examples
9 | parquet
10 | target
11 | testing
12 | venv
--------------------------------------------------------------------------------
/docs/mdbook/src/images/datafusion-jupyterlab.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apache/datafusion-python/HEAD/docs/mdbook/src/images/datafusion-jupyterlab.png
--------------------------------------------------------------------------------
/docs/source/_static/images/2x_bgwhite_original.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apache/datafusion-python/HEAD/docs/source/_static/images/2x_bgwhite_original.png
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "testing"]
2 | path = testing
3 | url = https://github.com/apache/arrow-testing.git
4 | [submodule "parquet"]
5 | path = parquet
6 | url = https://github.com/apache/parquet-testing.git
7 |
--------------------------------------------------------------------------------
/.cargo/config.toml:
--------------------------------------------------------------------------------
1 | [target.x86_64-apple-darwin]
2 | rustflags = [
3 | "-C", "link-arg=-undefined",
4 | "-C", "link-arg=dynamic_lookup",
5 | ]
6 |
7 | [target.aarch64-apple-darwin]
8 | rustflags = [
9 | "-C", "link-arg=-undefined",
10 | "-C", "link-arg=dynamic_lookup",
11 | ]
12 |
13 |
--------------------------------------------------------------------------------
/examples/datafusion-ffi-example/.cargo/config.toml:
--------------------------------------------------------------------------------
1 | [target.x86_64-apple-darwin]
2 | rustflags = [
3 | "-C", "link-arg=-undefined",
4 | "-C", "link-arg=dynamic_lookup",
5 | ]
6 |
7 | [target.aarch64-apple-darwin]
8 | rustflags = [
9 | "-C", "link-arg=-undefined",
10 | "-C", "link-arg=dynamic_lookup",
11 | ]
12 |
13 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q6.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 6 derived from TPC-H query 6 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | sum(l_extendedprice * l_discount) as revenue
5 | from
6 | lineitem
7 | where
8 | l_shipdate >= date '1994-01-01'
9 | and l_shipdate < date '1994-01-01' + interval '1' year
10 | and l_discount between 0.04 - 0.01 and 0.04 + 0.01
11 | and l_quantity < 24;
12 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 |
16 | **Expected behavior**
17 | A clear and concise description of what you expected to happen.
18 |
19 | **Additional context**
20 | Add any other context about the problem here.
21 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q17.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 17 derived from TPC-H query 17 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | sum(l_extendedprice) / 7.0 as avg_yearly
5 | from
6 | lineitem,
7 | part
8 | where
9 | p_partkey = l_partkey
10 | and p_brand = 'Brand#42'
11 | and p_container = 'LG BAG'
12 | and l_quantity < (
13 | select
14 | 0.2 * avg(l_quantity)
15 | from
16 | lineitem
17 | where
18 | l_partkey = p_partkey
19 | );
20 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q14.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 14 derived from TPC-H query 14 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | 100.00 * sum(case
5 | when p_type like 'PROMO%'
6 | then l_extendedprice * (1 - l_discount)
7 | else 0
8 | end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue
9 | from
10 | lineitem,
11 | part
12 | where
13 | l_partkey = p_partkey
14 | and l_shipdate >= date '1995-02-01'
15 | and l_shipdate < date '1995-02-01' + interval '1' month;
16 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q13.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 13 derived from TPC-H query 13 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | c_count,
5 | count(*) as custdist
6 | from
7 | (
8 | select
9 | c_custkey,
10 | count(o_orderkey)
11 | from
12 | customer left outer join orders on
13 | c_custkey = o_custkey
14 | and o_comment not like '%express%requests%'
15 | group by
16 | c_custkey
17 | ) as c_orders (c_custkey, c_count)
18 | group by
19 | c_count
20 | order by
21 | custdist desc,
22 | c_count desc;
23 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q4.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 4 derived from TPC-H query 4 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | o_orderpriority,
5 | count(*) as order_count
6 | from
7 | orders
8 | where
9 | o_orderdate >= date '1995-04-01'
10 | and o_orderdate < date '1995-04-01' + interval '3' month
11 | and exists (
12 | select
13 | *
14 | from
15 | lineitem
16 | where
17 | l_orderkey = o_orderkey
18 | and l_commitdate < l_receiptdate
19 | )
20 | group by
21 | o_orderpriority
22 | order by
23 | o_orderpriority;
24 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | /venv
3 | .idea
4 | /docs/temp
5 | /docs/build
6 | .DS_Store
7 | .vscode
8 |
9 | # Byte-compiled / optimized / DLL files
10 | __pycache__/
11 | *.py[cod]
12 | *$py.class
13 |
14 | # Python dist ignore
15 | dist
16 |
17 | # C extensions
18 | *.so
19 |
20 | # Python dist
21 | dist
22 |
23 | # pyenv
24 | # For a library or package, you might want to ignore these files since the code is
25 | # intended to run in multiple environments; otherwise, check them in:
26 | .python-version
27 | venv
28 | .venv
29 |
30 | apache-rat-*.jar
31 | *rat.txt
32 | .env
33 | CHANGELOG.md.bak
34 |
35 | docs/mdbook/book
36 |
37 | .pyo3_build_config
38 |
39 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q3.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 3 derived from TPC-H query 3 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | l_orderkey,
5 | sum(l_extendedprice * (1 - l_discount)) as revenue,
6 | o_orderdate,
7 | o_shippriority
8 | from
9 | customer,
10 | orders,
11 | lineitem
12 | where
13 | c_mktsegment = 'BUILDING'
14 | and c_custkey = o_custkey
15 | and l_orderkey = o_orderkey
16 | and o_orderdate < date '1995-03-15'
17 | and l_shipdate > date '1995-03-15'
18 | group by
19 | l_orderkey,
20 | o_orderdate,
21 | o_shippriority
22 | order by
23 | revenue desc,
24 | o_orderdate limit 10;
25 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q5.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 5 derived from TPC-H query 5 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | n_name,
5 | sum(l_extendedprice * (1 - l_discount)) as revenue
6 | from
7 | customer,
8 | orders,
9 | lineitem,
10 | supplier,
11 | nation,
12 | region
13 | where
14 | c_custkey = o_custkey
15 | and l_orderkey = o_orderkey
16 | and l_suppkey = s_suppkey
17 | and c_nationkey = s_nationkey
18 | and s_nationkey = n_nationkey
19 | and n_regionkey = r_regionkey
20 | and r_name = 'AFRICA'
21 | and o_orderdate >= date '1994-01-01'
22 | and o_orderdate < date '1994-01-01' + interval '1' year
23 | group by
24 | n_name
25 | order by
26 | revenue desc;
27 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q18.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 18 derived from TPC-H query 18 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | c_name,
5 | c_custkey,
6 | o_orderkey,
7 | o_orderdate,
8 | o_totalprice,
9 | sum(l_quantity)
10 | from
11 | customer,
12 | orders,
13 | lineitem
14 | where
15 | o_orderkey in (
16 | select
17 | l_orderkey
18 | from
19 | lineitem
20 | group by
21 | l_orderkey having
22 | sum(l_quantity) > 313
23 | )
24 | and c_custkey = o_custkey
25 | and o_orderkey = l_orderkey
26 | group by
27 | c_name,
28 | c_custkey,
29 | o_orderkey,
30 | o_orderdate,
31 | o_totalprice
32 | order by
33 | o_totalprice desc,
34 | o_orderdate limit 100;
35 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q16.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 16 derived from TPC-H query 16 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | p_brand,
5 | p_type,
6 | p_size,
7 | count(distinct ps_suppkey) as supplier_cnt
8 | from
9 | partsupp,
10 | part
11 | where
12 | p_partkey = ps_partkey
13 | and p_brand <> 'Brand#14'
14 | and p_type not like 'SMALL PLATED%'
15 | and p_size in (14, 6, 5, 31, 49, 15, 41, 47)
16 | and ps_suppkey not in (
17 | select
18 | s_suppkey
19 | from
20 | supplier
21 | where
22 | s_comment like '%Customer%Complaints%'
23 | )
24 | group by
25 | p_brand,
26 | p_type,
27 | p_size
28 | order by
29 | supplier_cnt desc,
30 | p_brand,
31 | p_type,
32 | p_size;
33 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q1.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 1 derived from TPC-H query 1 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | l_returnflag,
5 | l_linestatus,
6 | sum(l_quantity) as sum_qty,
7 | sum(l_extendedprice) as sum_base_price,
8 | sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
9 | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
10 | avg(l_quantity) as avg_qty,
11 | avg(l_extendedprice) as avg_price,
12 | avg(l_discount) as avg_disc,
13 | count(*) as count_order
14 | from
15 | lineitem
16 | where
17 | l_shipdate <= date '1998-12-01' - interval '68 days'
18 | group by
19 | l_returnflag,
20 | l_linestatus
21 | order by
22 | l_returnflag,
23 | l_linestatus;
24 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q11.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 11 derived from TPC-H query 11 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | ps_partkey,
5 | sum(ps_supplycost * ps_availqty) as value
6 | from
7 | partsupp,
8 | supplier,
9 | nation
10 | where
11 | ps_suppkey = s_suppkey
12 | and s_nationkey = n_nationkey
13 | and n_name = 'ALGERIA'
14 | group by
15 | ps_partkey having
16 | sum(ps_supplycost * ps_availqty) > (
17 | select
18 | sum(ps_supplycost * ps_availqty) * 0.0001000000
19 | from
20 | partsupp,
21 | supplier,
22 | nation
23 | where
24 | ps_suppkey = s_suppkey
25 | and s_nationkey = n_nationkey
26 | and n_name = 'ALGERIA'
27 | )
28 | order by
29 | value desc;
30 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q15.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 15 derived from TPC-H query 15 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | create view revenue0 (supplier_no, total_revenue) as
4 | select
5 | l_suppkey,
6 | sum(l_extendedprice * (1 - l_discount))
7 | from
8 | lineitem
9 | where
10 | l_shipdate >= date '1996-08-01'
11 | and l_shipdate < date '1996-08-01' + interval '3' month
12 | group by
13 | l_suppkey;
14 | select
15 | s_suppkey,
16 | s_name,
17 | s_address,
18 | s_phone,
19 | total_revenue
20 | from
21 | supplier,
22 | revenue0
23 | where
24 | s_suppkey = supplier_no
25 | and total_revenue = (
26 | select
27 | max(total_revenue)
28 | from
29 | revenue0
30 | )
31 | order by
32 | s_suppkey;
33 | drop view revenue0;
34 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q10.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 10 derived from TPC-H query 10 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | c_custkey,
5 | c_name,
6 | sum(l_extendedprice * (1 - l_discount)) as revenue,
7 | c_acctbal,
8 | n_name,
9 | c_address,
10 | c_phone,
11 | c_comment
12 | from
13 | customer,
14 | orders,
15 | lineitem,
16 | nation
17 | where
18 | c_custkey = o_custkey
19 | and l_orderkey = o_orderkey
20 | and o_orderdate >= date '1993-07-01'
21 | and o_orderdate < date '1993-07-01' + interval '3' month
22 | and l_returnflag = 'R'
23 | and c_nationkey = n_nationkey
24 | group by
25 | c_custkey,
26 | c_name,
27 | c_acctbal,
28 | c_phone,
29 | n_name,
30 | c_address,
31 | c_comment
32 | order by
33 | revenue desc limit 20;
34 |
--------------------------------------------------------------------------------
/python/datafusion/py.typed:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
--------------------------------------------------------------------------------
/python/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem or challenge? Please describe what you are trying to do.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | (This section helps Arrow developers understand the context and *why* for this feature, in addition to the *what*)
13 |
14 | **Describe the solution you'd like**
15 | A clear and concise description of what you want to happen.
16 |
17 | **Describe alternatives you've considered**
18 | A clear and concise description of any alternative solutions or features you've considered.
19 |
20 | **Additional context**
21 | Add any other context or screenshots about the feature request here.
22 |
--------------------------------------------------------------------------------
/dev/release/rat_exclude_files.txt:
--------------------------------------------------------------------------------
1 | *.npmrc
2 | *.gitignore
3 | *.dockerignore
4 | .gitmodules
5 | *_generated.h
6 | *_generated.js
7 | *_generated.ts
8 | *.csv
9 | *.json
10 | *.snap
11 | .github/ISSUE_TEMPLATE/*.md
12 | .github/pull_request_template.md
13 | CHANGELOG.md
14 | dev/release/rat_exclude_files.txt
15 | MANIFEST.in
16 | __init__.pxd
17 | __init__.py
18 | *.html
19 | *.sgml
20 | *.css
21 | *.png
22 | *.ico
23 | *.svg
24 | *.devhelp2
25 | *.scss
26 | .gitattributes
27 | requirements.txt
28 | *requirements*.txt
29 | **/testdata/*
30 | ci/*
31 | **/*.svg
32 | **/*.csv
33 | **/*.json
34 | **/*.sql
35 | venv/*
36 | parquet/*
37 | testing/*
38 | target/*
39 | **/target/*
40 | Cargo.lock
41 | **/Cargo.lock
42 | .history
43 | *rat.txt
44 | */.git
45 | .github/*
46 | benchmarks/tpch/queries/q*.sql
47 | benchmarks/tpch/create_tables.sql
48 | .cargo/config.toml
49 | **/.cargo/config.toml
50 | uv.lock
--------------------------------------------------------------------------------
/docs/source/_templates/layout.html:
--------------------------------------------------------------------------------
1 | {% extends "pydata_sphinx_theme/layout.html" %}
2 |
3 | {# Silence the navbar #}
4 | {% block docs_navbar %}
5 | {% endblock %}
6 |
7 |
10 | {% block footer %}
11 |
12 |
25 |
26 | {% endblock %}
27 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q12.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 12 derived from TPC-H query 12 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | l_shipmode,
5 | sum(case
6 | when o_orderpriority = '1-URGENT'
7 | or o_orderpriority = '2-HIGH'
8 | then 1
9 | else 0
10 | end) as high_line_count,
11 | sum(case
12 | when o_orderpriority <> '1-URGENT'
13 | and o_orderpriority <> '2-HIGH'
14 | then 1
15 | else 0
16 | end) as low_line_count
17 | from
18 | orders,
19 | lineitem
20 | where
21 | o_orderkey = l_orderkey
22 | and l_shipmode in ('FOB', 'SHIP')
23 | and l_commitdate < l_receiptdate
24 | and l_shipdate < l_commitdate
25 | and l_receiptdate >= date '1995-01-01'
26 | and l_receiptdate < date '1995-01-01' + interval '1' year
27 | group by
28 | l_shipmode
29 | order by
30 | l_shipmode;
31 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q9.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 9 derived from TPC-H query 9 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | nation,
5 | o_year,
6 | sum(amount) as sum_profit
7 | from
8 | (
9 | select
10 | n_name as nation,
11 | extract(year from o_orderdate) as o_year,
12 | l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
13 | from
14 | part,
15 | supplier,
16 | lineitem,
17 | partsupp,
18 | orders,
19 | nation
20 | where
21 | s_suppkey = l_suppkey
22 | and ps_suppkey = l_suppkey
23 | and ps_partkey = l_partkey
24 | and p_partkey = l_partkey
25 | and o_orderkey = l_orderkey
26 | and s_nationkey = n_nationkey
27 | and p_name like '%moccasin%'
28 | ) as profit
29 | group by
30 | nation,
31 | o_year
32 | order by
33 | nation,
34 | o_year desc;
35 |
--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | group_imports = "StdExternalCrate"
19 | imports_granularity = "Module"
20 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q20.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 20 derived from TPC-H query 20 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | s_name,
5 | s_address
6 | from
7 | supplier,
8 | nation
9 | where
10 | s_suppkey in (
11 | select
12 | ps_suppkey
13 | from
14 | partsupp
15 | where
16 | ps_partkey in (
17 | select
18 | p_partkey
19 | from
20 | part
21 | where
22 | p_name like 'blanched%'
23 | )
24 | and ps_availqty > (
25 | select
26 | 0.5 * sum(l_quantity)
27 | from
28 | lineitem
29 | where
30 | l_partkey = ps_partkey
31 | and l_suppkey = ps_suppkey
32 | and l_shipdate >= date '1993-01-01'
33 | and l_shipdate < date '1993-01-01' + interval '1' year
34 | )
35 | )
36 | and s_nationkey = n_nationkey
37 | and n_name = 'KENYA'
38 | order by
39 | s_name;
40 |
--------------------------------------------------------------------------------
/src/sql.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | pub mod exceptions;
19 | pub mod logical;
20 | pub(crate) mod util;
21 |
--------------------------------------------------------------------------------
/ci/scripts/rust_fmt.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 |
20 | set -ex
21 | cargo +nightly fmt --all -- --check
22 |
--------------------------------------------------------------------------------
/ci/scripts/python_lint.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 |
20 | set -ex
21 | ruff format datafusion
22 | ruff check datafusion
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 |
19 |
20 | # DataFusion Python Changelog
21 |
22 | The changelogs have now moved [here](./dev/changelog).
23 |
--------------------------------------------------------------------------------
/dev/build-set-env.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 |
20 | export PY_DATAFUSION_VERSION=$(awk -F'[ ="]+' '$1 == "version" { print $2 }' Cargo.toml)
21 |
--------------------------------------------------------------------------------
/ci/scripts/rust_toml_fmt.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 |
20 | set -ex
21 | find . -mindepth 2 -name 'Cargo.toml' -exec cargo tomlfmt -p {} \;
22 |
--------------------------------------------------------------------------------
/examples/datafusion-ffi-example/build.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | fn main() {
19 | pyo3_build_config::add_extension_module_link_args();
20 | }
21 |
--------------------------------------------------------------------------------
/ci/scripts/rust_clippy.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 |
20 | set -ex
21 | cargo clippy --all-targets --workspace --features default -- -D warnings
22 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q22.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 22 derived from TPC-H query 22 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | cntrycode,
5 | count(*) as numcust,
6 | sum(c_acctbal) as totacctbal
7 | from
8 | (
9 | select
10 | substring(c_phone from 1 for 2) as cntrycode,
11 | c_acctbal
12 | from
13 | customer
14 | where
15 | substring(c_phone from 1 for 2) in
16 | ('24', '34', '16', '30', '33', '14', '13')
17 | and c_acctbal > (
18 | select
19 | avg(c_acctbal)
20 | from
21 | customer
22 | where
23 | c_acctbal > 0.00
24 | and substring(c_phone from 1 for 2) in
25 | ('24', '34', '16', '30', '33', '14', '13')
26 | )
27 | and not exists (
28 | select
29 | *
30 | from
31 | orders
32 | where
33 | o_custkey = c_custkey
34 | )
35 | ) as custsale
36 | group by
37 | cntrycode
38 | order by
39 | cntrycode;
40 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q21.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 21 derived from TPC-H query 21 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | s_name,
5 | count(*) as numwait
6 | from
7 | supplier,
8 | lineitem l1,
9 | orders,
10 | nation
11 | where
12 | s_suppkey = l1.l_suppkey
13 | and o_orderkey = l1.l_orderkey
14 | and o_orderstatus = 'F'
15 | and l1.l_receiptdate > l1.l_commitdate
16 | and exists (
17 | select
18 | *
19 | from
20 | lineitem l2
21 | where
22 | l2.l_orderkey = l1.l_orderkey
23 | and l2.l_suppkey <> l1.l_suppkey
24 | )
25 | and not exists (
26 | select
27 | *
28 | from
29 | lineitem l3
30 | where
31 | l3.l_orderkey = l1.l_orderkey
32 | and l3.l_suppkey <> l1.l_suppkey
33 | and l3.l_receiptdate > l3.l_commitdate
34 | )
35 | and s_nationkey = n_nationkey
36 | and n_name = 'ARGENTINA'
37 | group by
38 | s_name
39 | order by
40 | numwait desc,
41 | s_name limit 100;
42 |
--------------------------------------------------------------------------------
/docs/mdbook/book.toml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | [book]
19 | authors = ["Apache Arrow "]
20 | language = "en"
21 | multilingual = false
22 | src = "src"
23 | title = "DataFusion Book"
24 |
--------------------------------------------------------------------------------
/docs/source/_templates/docs-sidebar.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
10 |
11 |
20 |
--------------------------------------------------------------------------------
/docs/source/user-guide/io/index.rst:
--------------------------------------------------------------------------------
1 | .. Licensed to the Apache Software Foundation (ASF) under one
2 | .. or more contributor license agreements. See the NOTICE file
3 | .. distributed with this work for additional information
4 | .. regarding copyright ownership. The ASF licenses this file
5 | .. to you under the Apache License, Version 2.0 (the
6 | .. "License"); you may not use this file except in compliance
7 | .. with the License. You may obtain a copy of the License at
8 |
9 | .. http://www.apache.org/licenses/LICENSE-2.0
10 |
11 | .. Unless required by applicable law or agreed to in writing,
12 | .. software distributed under the License is distributed on an
13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | .. KIND, either express or implied. See the License for the
15 | .. specific language governing permissions and limitations
16 | .. under the License.
17 |
18 | IO
19 | ==
20 |
21 | .. toctree::
22 | :maxdepth: 2
23 |
24 | arrow
25 | avro
26 | csv
27 | json
28 | parquet
29 | table_provider
30 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q2.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 2 derived from TPC-H query 2 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | s_acctbal,
5 | s_name,
6 | n_name,
7 | p_partkey,
8 | p_mfgr,
9 | s_address,
10 | s_phone,
11 | s_comment
12 | from
13 | part,
14 | supplier,
15 | partsupp,
16 | nation,
17 | region
18 | where
19 | p_partkey = ps_partkey
20 | and s_suppkey = ps_suppkey
21 | and p_size = 48
22 | and p_type like '%TIN'
23 | and s_nationkey = n_nationkey
24 | and n_regionkey = r_regionkey
25 | and r_name = 'ASIA'
26 | and ps_supplycost = (
27 | select
28 | min(ps_supplycost)
29 | from
30 | partsupp,
31 | supplier,
32 | nation,
33 | region
34 | where
35 | p_partkey = ps_partkey
36 | and s_suppkey = ps_suppkey
37 | and s_nationkey = n_nationkey
38 | and n_regionkey = r_regionkey
39 | and r_name = 'ASIA'
40 | )
41 | order by
42 | s_acctbal desc,
43 | n_name,
44 | s_name,
45 | p_partkey limit 100;
46 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q8.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 8 derived from TPC-H query 8 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | o_year,
5 | sum(case
6 | when nation = 'IRAQ' then volume
7 | else 0
8 | end) / sum(volume) as mkt_share
9 | from
10 | (
11 | select
12 | extract(year from o_orderdate) as o_year,
13 | l_extendedprice * (1 - l_discount) as volume,
14 | n2.n_name as nation
15 | from
16 | part,
17 | supplier,
18 | lineitem,
19 | orders,
20 | customer,
21 | nation n1,
22 | nation n2,
23 | region
24 | where
25 | p_partkey = l_partkey
26 | and s_suppkey = l_suppkey
27 | and l_orderkey = o_orderkey
28 | and o_custkey = c_custkey
29 | and c_nationkey = n1.n_nationkey
30 | and n1.n_regionkey = r_regionkey
31 | and r_name = 'MIDDLE EAST'
32 | and s_nationkey = n2.n_nationkey
33 | and o_orderdate between date '1995-01-01' and date '1996-12-31'
34 | and p_type = 'LARGE PLATED STEEL'
35 | ) as all_nations
36 | group by
37 | o_year
38 | order by
39 | o_year;
40 |
--------------------------------------------------------------------------------
/python/datafusion/input/__init__.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | """This package provides for input sources.
19 |
20 | The primary class used within DataFusion is ``LocationInputPlugin``.
21 | """
22 |
23 | from .location import LocationInputPlugin
24 |
25 | __all__ = [
26 | "LocationInputPlugin",
27 | ]
28 |
--------------------------------------------------------------------------------
/dev/python_lint.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 |
20 | # This script runs all the Python lints locally the same way the
21 | # DataFusion CI does
22 |
23 | set -e
24 | source .venv/bin/activate
25 | flake8 --exclude venv,benchmarks/db-benchmark --ignore=E501,W503
26 | black --line-length 79 .
27 |
--------------------------------------------------------------------------------
/examples/dataframe-parquet.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | from datafusion import SessionContext
19 | from datafusion import functions as f
20 |
21 | ctx = SessionContext()
22 | df = ctx.read_parquet("yellow_tripdata_2021-01.parquet").aggregate(
23 | [f.col("passenger_count")], [f.count_star()]
24 | )
25 | df.show()
26 |
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q7.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 7 derived from TPC-H query 7 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | supp_nation,
5 | cust_nation,
6 | l_year,
7 | sum(volume) as revenue
8 | from
9 | (
10 | select
11 | n1.n_name as supp_nation,
12 | n2.n_name as cust_nation,
13 | extract(year from l_shipdate) as l_year,
14 | l_extendedprice * (1 - l_discount) as volume
15 | from
16 | supplier,
17 | lineitem,
18 | orders,
19 | customer,
20 | nation n1,
21 | nation n2
22 | where
23 | s_suppkey = l_suppkey
24 | and o_orderkey = l_orderkey
25 | and c_custkey = o_custkey
26 | and s_nationkey = n1.n_nationkey
27 | and c_nationkey = n2.n_nationkey
28 | and (
29 | (n1.n_name = 'GERMANY' and n2.n_name = 'IRAQ')
30 | or (n1.n_name = 'IRAQ' and n2.n_name = 'GERMANY')
31 | )
32 | and l_shipdate between date '1995-01-01' and date '1996-12-31'
33 | ) as shipping
34 | group by
35 | supp_nation,
36 | cust_nation,
37 | l_year
38 | order by
39 | supp_nation,
40 | cust_nation,
41 | l_year;
42 |
--------------------------------------------------------------------------------
/docs/mdbook/src/SUMMARY.md:
--------------------------------------------------------------------------------
1 |
17 | # Summary
18 |
19 | - [Index](./index.md)
20 | - [Installation](./installation.md)
21 | - [Quickstart](./quickstart.md)
22 | - [Usage](./usage/index.md)
23 | - [Create a table](./usage/create-table.md)
24 | - [Query a table](./usage/query-table.md)
25 | - [Viewing Query Plans](./usage/query-plans.md)
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | # Which issue does this PR close?
2 |
3 |
6 |
7 | Closes #.
8 |
9 | # Rationale for this change
10 |
14 |
15 | # What changes are included in this PR?
16 |
19 |
20 | # Are there any user-facing changes?
21 |
24 |
25 |
--------------------------------------------------------------------------------
/examples/sql-parquet.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | from datafusion import SessionContext
19 |
20 | ctx = SessionContext()
21 | ctx.register_parquet("taxi", "yellow_tripdata_2021-01.parquet")
22 | df = ctx.sql(
23 | "select passenger_count, count(*) from taxi where passenger_count is not null group by passenger_count order by passenger_count"
24 | )
25 | df.show()
26 |
--------------------------------------------------------------------------------
/python/datafusion/udf.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | """Deprecated module for user defined functions."""
19 |
20 | import warnings
21 |
22 | from datafusion.user_defined import * # noqa: F403
23 |
24 | warnings.warn(
25 | "The module 'udf' is deprecated and will be removed in the next release. "
26 | "Please use 'user_defined' instead.",
27 | DeprecationWarning,
28 | stacklevel=2,
29 | )
30 |
--------------------------------------------------------------------------------
/dev/rust_lint.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 |
20 | # This script runs all the Rust lints locally the same way the
21 | # DataFusion CI does
22 |
23 | set -e
24 | if ! command -v cargo-tomlfmt &> /dev/null; then
25 | echo "Installing cargo-tomlfmt using cargo"
26 | cargo install cargo-tomlfmt
27 | fi
28 |
29 | ci/scripts/rust_fmt.sh
30 | ci/scripts/rust_clippy.sh
31 | ci/scripts/rust_toml_fmt.sh
32 |
--------------------------------------------------------------------------------
/python/datafusion/html_formatter.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | """Deprecated module for dataframe formatting."""
19 |
20 | import warnings
21 |
22 | from datafusion.dataframe_formatter import * # noqa: F403
23 |
24 | warnings.warn(
25 | "The module 'html_formatter' is deprecated and will be removed in the next release."
26 | "Please use 'dataframe_formatter' instead.",
27 | DeprecationWarning,
28 | stacklevel=3,
29 | )
30 |
--------------------------------------------------------------------------------
/python/datafusion/object_store.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | """Object store functionality."""
18 |
19 | from ._internal import object_store
20 |
21 | AmazonS3 = object_store.AmazonS3
22 | GoogleCloud = object_store.GoogleCloud
23 | LocalFileSystem = object_store.LocalFileSystem
24 | MicrosoftAzure = object_store.MicrosoftAzure
25 | Http = object_store.Http
26 |
27 | __all__ = ["AmazonS3", "GoogleCloud", "Http", "LocalFileSystem", "MicrosoftAzure"]
28 |
--------------------------------------------------------------------------------
/docs/source/user-guide/common-operations/index.rst:
--------------------------------------------------------------------------------
1 | .. Licensed to the Apache Software Foundation (ASF) under one
2 | .. or more contributor license agreements. See the NOTICE file
3 | .. distributed with this work for additional information
4 | .. regarding copyright ownership. The ASF licenses this file
5 | .. to you under the Apache License, Version 2.0 (the
6 | .. "License"); you may not use this file except in compliance
7 | .. with the License. You may obtain a copy of the License at
8 |
9 | .. http://www.apache.org/licenses/LICENSE-2.0
10 |
11 | .. Unless required by applicable law or agreed to in writing,
12 | .. software distributed under the License is distributed on an
13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | .. KIND, either express or implied. See the License for the
15 | .. specific language governing permissions and limitations
16 | .. under the License.
17 |
18 | Common Operations
19 | =================
20 |
21 | The contents of this section are designed to guide a new user through how to use DataFusion.
22 |
23 | .. toctree::
24 | :maxdepth: 2
25 |
26 | views
27 | basic-info
28 | select-and-filter
29 | expressions
30 | joins
31 | functions
32 | aggregations
33 | windows
34 | udf-and-udfa
35 |
--------------------------------------------------------------------------------
/docs/source/user-guide/io/avro.rst:
--------------------------------------------------------------------------------
1 | .. Licensed to the Apache Software Foundation (ASF) under one
2 | .. or more contributor license agreements. See the NOTICE file
3 | .. distributed with this work for additional information
4 | .. regarding copyright ownership. The ASF licenses this file
5 | .. to you under the Apache License, Version 2.0 (the
6 | .. "License"); you may not use this file except in compliance
7 | .. with the License. You may obtain a copy of the License at
8 |
9 | .. http://www.apache.org/licenses/LICENSE-2.0
10 |
11 | .. Unless required by applicable law or agreed to in writing,
12 | .. software distributed under the License is distributed on an
13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | .. KIND, either express or implied. See the License for the
15 | .. specific language governing permissions and limitations
16 | .. under the License.
17 |
18 | .. _io_avro:
19 |
20 | Avro
21 | ====
22 |
23 | `Avro <https://avro.apache.org/>`_ is a serialization format for record data. Reading an avro file is very straightforward
24 | with :py:func:`~datafusion.context.SessionContext.read_avro`
25 |
26 | .. code-block:: python
27 |
28 |
29 | from datafusion import SessionContext
30 |
31 | ctx = SessionContext()
32 | df = ctx.read_avro("file.avro")
--------------------------------------------------------------------------------
/benchmarks/tpch/queries/q19.sql:
--------------------------------------------------------------------------------
1 | -- Benchmark Query 19 derived from TPC-H query 19 under the terms of the TPC Fair Use Policy.
2 | -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
3 | select
4 | sum(l_extendedprice* (1 - l_discount)) as revenue
5 | from
6 | lineitem,
7 | part
8 | where
9 | (
10 | p_partkey = l_partkey
11 | and p_brand = 'Brand#21'
12 | and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
13 | and l_quantity >= 8 and l_quantity <= 8 + 10
14 | and p_size between 1 and 5
15 | and l_shipmode in ('AIR', 'AIR REG')
16 | and l_shipinstruct = 'DELIVER IN PERSON'
17 | )
18 | or
19 | (
20 | p_partkey = l_partkey
21 | and p_brand = 'Brand#13'
22 | and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
23 | and l_quantity >= 20 and l_quantity <= 20 + 10
24 | and p_size between 1 and 10
25 | and l_shipmode in ('AIR', 'AIR REG')
26 | and l_shipinstruct = 'DELIVER IN PERSON'
27 | )
28 | or
29 | (
30 | p_partkey = l_partkey
31 | and p_brand = 'Brand#52'
32 | and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
33 | and l_quantity >= 30 and l_quantity <= 30 + 10
34 | and p_size between 1 and 15
35 | and l_shipmode in ('AIR', 'AIR REG')
36 | and l_shipinstruct = 'DELIVER IN PERSON'
37 | );
38 |
--------------------------------------------------------------------------------
/.github/workflows/dev.yml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
# Lightweight development checks, run on every push and pull request.
name: Dev
on: [push, pull_request]

jobs:

  rat:
    # Verifies that files carry the required ASF license header.
    name: Release Audit Tool (RAT)
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v5
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.10"
      - name: Audit licenses
        run: ./dev/release/run-rat.sh .
35 |
--------------------------------------------------------------------------------
/benchmarks/db-benchmark/run-bench.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | set -e
19 |
20 | #SRC_DATANAME=G1_1e7_1e2_0_0 python3 /db-benchmark/polars/groupby-polars.py
21 | SRC_DATANAME=G1_1e7_1e2_0_0 python3 /db-benchmark/datafusion-python/groupby-datafusion.py
22 |
23 | # joins need more work still
24 | #SRC_DATANAME=G1_1e7_1e2_0_0 python3 /db-benchmark/datafusion-python/join-datafusion.py
25 | #SRC_DATANAME=G1_1e7_1e2_0_0 python3 /db-benchmark/polars/join-polars.py
26 |
27 | cat time.csv
28 |
--------------------------------------------------------------------------------
/examples/tpch/util.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | """
19 | Common utilities for running TPC-H examples.
20 | """
21 |
22 | from pathlib import Path
23 |
24 |
def get_data_path(filename: str) -> Path:
    """Return the absolute path of ``filename`` inside this example's data directory."""
    return Path(__file__).resolve().parent / "data" / filename
29 |
30 |
def get_answer_file(answer_file: str) -> Path:
    """Return the expected-answer file for the named TPC-H query (e.g. ``"q1"``)."""
    examples_dir = Path(__file__).resolve().parent
    answers_dir = examples_dir.joinpath(
        "..", "..", "benchmarks", "tpch", "data", "answers"
    )
    return answers_dir / f"{answer_file}.out"
35 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | #
version: 2
updates:

  # Weekly sweep of Rust crate dependencies.
  - package-ecosystem: "cargo"
    directory: "/"
    schedule:
      interval: "weekly"
      day: "saturday"
    open-pull-requests-limit: 20
    target-branch: main

  # Weekly sweep of GitHub Actions workflow dependencies.
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"
      day: "sunday"
    open-pull-requests-limit: 20
    target-branch: main
37 |
--------------------------------------------------------------------------------
/benchmarks/db-benchmark/README.md:
--------------------------------------------------------------------------------
1 |
19 |
20 | # DataFusion Implementation of db-benchmark
21 |
22 | This directory contains scripts for running [db-benchmark](https://github.com/duckdblabs/db-benchmark) with
23 | DataFusion's Python bindings.
24 |
25 | ## Directions
26 |
27 | Run the following from root of this project.
28 |
29 | ```bash
30 | docker build -t db-benchmark -f benchmarks/db-benchmark/db-benchmark.dockerfile .
31 | docker run --privileged -it db-benchmark
32 | ```
33 |
--------------------------------------------------------------------------------
/docs/source/user-guide/io/json.rst:
--------------------------------------------------------------------------------
1 | .. Licensed to the Apache Software Foundation (ASF) under one
2 | .. or more contributor license agreements. See the NOTICE file
3 | .. distributed with this work for additional information
4 | .. regarding copyright ownership. The ASF licenses this file
5 | .. to you under the Apache License, Version 2.0 (the
6 | .. "License"); you may not use this file except in compliance
7 | .. with the License. You may obtain a copy of the License at
8 |
9 | .. http://www.apache.org/licenses/LICENSE-2.0
10 |
11 | .. Unless required by applicable law or agreed to in writing,
12 | .. software distributed under the License is distributed on an
13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | .. KIND, either express or implied. See the License for the
15 | .. specific language governing permissions and limitations
16 | .. under the License.
17 |
18 | .. _io_json:
19 |
20 | JSON
21 | ====
`JSON <https://www.json.org>`_ (JavaScript Object Notation) is a lightweight data-interchange format.
When it comes to reading a JSON file, using :py:func:`~datafusion.context.SessionContext.read_json` is a simple and easy approach.
24 |
25 | .. code-block:: python
26 |
27 |
28 | from datafusion import SessionContext
29 |
30 | ctx = SessionContext()
31 | df = ctx.read_json("file.json")
32 |
--------------------------------------------------------------------------------
/src/expr/logical_node.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use pyo3::{Bound, PyAny, PyResult, Python};
19 |
20 | use crate::sql::logical::PyLogicalPlan;
21 |
22 | /// Representation of a `LogicalNode` in the in overall `LogicalPlan`
23 | /// any "node" shares these common traits in common.
24 | pub trait LogicalNode {
25 | /// The input plan to the current logical node instance.
26 | fn inputs(&self) -> Vec;
27 |
28 | fn to_variant<'py>(&self, py: Python<'py>) -> PyResult>;
29 | }
30 |
--------------------------------------------------------------------------------
/examples/datafusion-ffi-example/pyproject.toml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
[build-system]
# Built with maturin, the Rust <-> Python packaging tool.
requires = ["maturin>=1.6,<2.0"]
build-backend = "maturin"

[project]
name = "datafusion_ffi_example"
requires-python = ">=3.9"
classifiers = [
    "Programming Language :: Rust",
    "Programming Language :: Python :: Implementation :: CPython",
    "Programming Language :: Python :: Implementation :: PyPy",
]
# Version is supplied by the build backend rather than declared here.
dynamic = ["version"]

[tool.maturin]
features = ["pyo3/extension-module"]
34 |
--------------------------------------------------------------------------------
/python/tests/test_unparser.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | from datafusion.context import SessionContext
19 | from datafusion.unparser import Dialect, Unparser
20 |
21 |
def test_unparser():
    """Round-trip a trivial plan through every supported SQL dialect."""
    ctx = SessionContext()
    df = ctx.sql("SELECT 1")
    dialects = [
        Dialect.mysql(),
        Dialect.postgres(),
        Dialect.sqlite(),
        Dialect.duckdb(),
    ]
    for dialect in dialects:
        generated = Unparser(dialect).plan_to_sql(df.logical_plan())
        assert generated == "SELECT 1"
34 |
--------------------------------------------------------------------------------
/src/sql/exceptions.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use std::fmt::{Debug, Display};
19 |
20 | use pyo3::PyErr;
21 |
22 | pub fn py_type_err(e: impl Debug + Display) -> PyErr {
23 | PyErr::new::(format!("{e}"))
24 | }
25 |
26 | pub fn py_runtime_err(e: impl Debug + Display) -> PyErr {
27 | PyErr::new::(format!("{e}"))
28 | }
29 |
30 | pub fn py_value_err(e: impl Debug + Display) -> PyErr {
31 | PyErr::new::(format!("{e}"))
32 | }
33 |
--------------------------------------------------------------------------------
/python/tests/test_input.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | import pathlib
19 |
20 | from datafusion.input.location import LocationInputPlugin
21 |
22 |
def test_location_input():
    """Build a table from a local parquet file and sanity-check its metadata."""
    plugin = LocationInputPlugin()
    parquet_path = (
        pathlib.Path.cwd()
        / "testing/data/parquet/generated_simple_numerics/blogs.parquet"
    )
    table = plugin.build_table(str(parquet_path), "blog")
    assert table.name == "blog"
    assert len(table.columns) == 3
    assert "blogs.parquet" in table.filepaths[0]
33 |
--------------------------------------------------------------------------------
/docs/source/user-guide/io/csv.rst:
--------------------------------------------------------------------------------
1 | .. Licensed to the Apache Software Foundation (ASF) under one
2 | .. or more contributor license agreements. See the NOTICE file
3 | .. distributed with this work for additional information
4 | .. regarding copyright ownership. The ASF licenses this file
5 | .. to you under the Apache License, Version 2.0 (the
6 | .. "License"); you may not use this file except in compliance
7 | .. with the License. You may obtain a copy of the License at
8 |
9 | .. http://www.apache.org/licenses/LICENSE-2.0
10 |
11 | .. Unless required by applicable law or agreed to in writing,
12 | .. software distributed under the License is distributed on an
13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | .. KIND, either express or implied. See the License for the
15 | .. specific language governing permissions and limitations
16 | .. under the License.
17 |
18 | .. _io_csv:
19 |
20 | CSV
21 | ===
22 |
23 | Reading a csv is very straightforward with :py:func:`~datafusion.context.SessionContext.read_csv`
24 |
25 | .. code-block:: python
26 |
27 |
28 | from datafusion import SessionContext
29 |
30 | ctx = SessionContext()
31 | df = ctx.read_csv("file.csv")
32 |
33 | An alternative is to use :py:func:`~datafusion.context.SessionContext.register_csv`
34 |
35 | .. code-block:: python
36 |
37 | ctx.register_csv("file", "file.csv")
38 | df = ctx.table("file")
39 |
--------------------------------------------------------------------------------
/python/tests/test_store.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | from pathlib import Path
19 |
20 | import pytest
21 | from datafusion import SessionContext
22 |
23 |
@pytest.fixture
def ctx():
    # Fresh SessionContext per test so state never leaks between tests.
    return SessionContext()
27 |
28 |
def test_read_parquet(ctx):
    """Register a parquet file against the context and query it via SQL."""
    parquet_uri = f"file://{Path.cwd()}/parquet/data/alltypes_plain.parquet"
    ctx.register_parquet(
        "test",
        parquet_uri,
        table_partition_cols=[],
        parquet_pruning=True,
        file_extension=".parquet",
    )
    batches = ctx.sql("SELECT * FROM test").collect()
    assert isinstance(batches, list)
39 |
--------------------------------------------------------------------------------
/docs/mdbook/src/usage/index.md:
--------------------------------------------------------------------------------
1 |
17 | # Usage
18 |
19 | This section shows how to create DataFusion DataFrames from a variety of data sources like CSV files and Parquet files.
20 |
21 | You'll learn more about the SQL statements that are supported by DataFusion.
22 |
You'll also learn about DataFusion's Python API for querying data.
24 |
25 | The documentation will wrap up with a variety of real-world data processing tasks that are well suited for DataFusion. The lightning-fast speed and reliable execution makes DataFusion the best technology for a variety of data processing tasks.
26 |
--------------------------------------------------------------------------------
/examples/datafusion-ffi-example/Cargo.toml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
[package]
name = "datafusion-ffi-example"
version = "0.2.0"
edition = "2021"

[dependencies]
# DataFusion core and its FFI bridge share the same major version.
datafusion = { version = "50" }
datafusion-ffi = { version = "50" }
pyo3 = { version = "0.25", features = ["extension-module", "abi3", "abi3-py310"] }
# Arrow crates are kept in lockstep on the same major version.
arrow = { version = "56" }
arrow-array = { version = "56" }
arrow-schema = { version = "56" }
async-trait = "0.1.88"

[build-dependencies]
# Must match the pyo3 version above.
pyo3-build-config = "0.25"

[lib]
name = "datafusion_ffi_example"
# cdylib for the Python extension module; rlib so Rust tests can link it.
crate-type = ["cdylib", "rlib"]
38 |
--------------------------------------------------------------------------------
/docs/source/user-guide/io/parquet.rst:
--------------------------------------------------------------------------------
1 | .. Licensed to the Apache Software Foundation (ASF) under one
2 | .. or more contributor license agreements. See the NOTICE file
3 | .. distributed with this work for additional information
4 | .. regarding copyright ownership. The ASF licenses this file
5 | .. to you under the Apache License, Version 2.0 (the
6 | .. "License"); you may not use this file except in compliance
7 | .. with the License. You may obtain a copy of the License at
8 |
9 | .. http://www.apache.org/licenses/LICENSE-2.0
10 |
11 | .. Unless required by applicable law or agreed to in writing,
12 | .. software distributed under the License is distributed on an
13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | .. KIND, either express or implied. See the License for the
15 | .. specific language governing permissions and limitations
16 | .. under the License.
17 |
18 | .. _io_parquet:
19 |
20 | Parquet
21 | =======
22 |
23 | It is quite simple to read a parquet file using the :py:func:`~datafusion.context.SessionContext.read_parquet` function.
24 |
25 | .. code-block:: python
26 |
27 | from datafusion import SessionContext
28 |
29 | ctx = SessionContext()
30 | df = ctx.read_parquet("file.parquet")
31 |
32 | An alternative is to use :py:func:`~datafusion.context.SessionContext.register_parquet`
33 |
34 | .. code-block:: python
35 |
36 | ctx.register_parquet("file", "file.parquet")
37 | df = ctx.table("file")
38 |
--------------------------------------------------------------------------------
/docs/mdbook/README.md:
--------------------------------------------------------------------------------
1 |
17 | # DataFusion Book
18 |
19 | This folder builds a DataFusion user guide using [mdBook](https://github.com/rust-lang/mdBook).
20 |
21 | ## Build and run book locally
22 |
23 | Build the latest files with `mdbook build`.
24 |
25 | Open the book locally by running `open book/index.html`.
26 |
27 | ## Install mdBook
28 |
29 | Download the `mdbook` binary or run `cargo install mdbook`.
30 |
Then open the downloaded binary manually once, so that macOS grants it permission to run on your Mac.
32 |
33 | Add it to your path with a command like this so you can easily run the commands: `mv ~/Downloads/mdbook /Users/matthew.powers/.local/bin`.
34 |
--------------------------------------------------------------------------------
/src/expr/grouping_set.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use datafusion::logical_expr::GroupingSet;
19 | use pyo3::prelude::*;
20 |
/// Python wrapper around DataFusion's `GroupingSet` expression.
#[pyclass(frozen, name = "GroupingSet", module = "datafusion.expr", subclass)]
#[derive(Clone)]
pub struct PyGroupingSet {
    grouping_set: GroupingSet,
}
26 |
27 | impl From for GroupingSet {
28 | fn from(grouping_set: PyGroupingSet) -> Self {
29 | grouping_set.grouping_set
30 | }
31 | }
32 |
33 | impl From for PyGroupingSet {
34 | fn from(grouping_set: GroupingSet) -> PyGroupingSet {
35 | PyGroupingSet { grouping_set }
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/examples/sql-parquet-s3.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
import os

import datafusion
from datafusion.object_store import AmazonS3

# Example: query a parquet dataset stored on Amazon S3.
# Requires AWS credentials in the environment (AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY) and network access to the bucket below.
region = "us-east-1"
bucket_name = "yellow-trips"

s3 = AmazonS3(
    bucket_name=bucket_name,
    region=region,
    access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
)

ctx = datafusion.SessionContext()
path = f"s3://{bucket_name}/"
# Route all "s3://" URLs through the object store configured above.
ctx.register_object_store("s3://", s3, None)

ctx.register_parquet("trips", path)

df = ctx.sql("select count(passenger_count) from trips")
df.show()
41 |
--------------------------------------------------------------------------------
/src/expr/signature.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use datafusion::logical_expr::{TypeSignature, Volatility};
19 | use pyo3::prelude::*;
20 |
/// Python wrapper bundling a function's type signature with its volatility.
#[allow(dead_code)]
#[pyclass(frozen, name = "Signature", module = "datafusion.expr", subclass)]
#[derive(Clone)]
pub struct PySignature {
    // The accepted argument-type signature of the wrapped function.
    type_signature: TypeSignature,
    // The function's volatility classification.
    volatility: Volatility,
}
28 |
impl PySignature {
    /// Construct a new `PySignature` from its component parts.
    pub fn new(type_signature: TypeSignature, volatility: Volatility) -> Self {
        Self {
            type_signature,
            volatility,
        }
    }
}
37 |
// No Python-visible methods are exposed yet; the block registers the class.
#[pymethods]
impl PySignature {}
40 |
--------------------------------------------------------------------------------
/src/expr/exists.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use datafusion::logical_expr::expr::Exists;
19 | use pyo3::prelude::*;
20 |
21 | use super::subquery::PySubquery;
22 |
/// Python wrapper around DataFusion's `Exists` subquery expression.
#[pyclass(frozen, name = "Exists", module = "datafusion.expr", subclass)]
#[derive(Clone)]
pub struct PyExists {
    exists: Exists,
}
28 |
29 | impl From for PyExists {
30 | fn from(exists: Exists) -> Self {
31 | PyExists { exists }
32 | }
33 | }
34 |
#[pymethods]
impl PyExists {
    /// The subquery that the EXISTS predicate evaluates.
    fn subquery(&self) -> PySubquery {
        self.exists.subquery.clone().into()
    }

    /// True when this represents a NOT EXISTS predicate.
    fn negated(&self) -> bool {
        self.exists.negated
    }
}
45 |
--------------------------------------------------------------------------------
/python/tests/test_config.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | import pytest
19 | from datafusion import Config
20 |
21 |
@pytest.fixture
def config():
    # Fresh Config per test so option changes never leak between tests.
    return Config()
25 |
26 |
def test_get_then_set(config):
    """Setting a config key is observable through a subsequent get."""
    key = "datafusion.optimizer.filter_null_join_keys"
    assert config.get(key) == "false"
    config.set(key, "true")
    assert config.get(key) == "true"
33 | assert config.get(config_key) == "true"
34 |
35 |
def test_get_all(config):
    """get_all returns a mapping containing the known default settings."""
    settings = config.get_all()
    assert settings["datafusion.catalog.create_default_catalog_and_schema"] == "true"
38 | assert config_dict["datafusion.catalog.create_default_catalog_and_schema"] == "true"
39 |
40 |
def test_get_invalid_config(config):
    """Looking up an unknown key yields None rather than raising."""
    missing = config.get("not.valid.key")
    assert missing is None
43 |
--------------------------------------------------------------------------------
/examples/sql-to-pandas.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
from datafusion import SessionContext

# Example: aggregate NYC taxi data with SQL, convert the result to pandas,
# and save a bar chart. Expects yellow_tripdata_2021-01.parquet in the CWD.

# Create a DataFusion context
ctx = SessionContext()

# Register table with context
ctx.register_parquet("taxi", "yellow_tripdata_2021-01.parquet")

# Execute SQL: trip counts grouped by passenger count
df = ctx.sql(
    "select passenger_count, count(*) "
    "from taxi "
    "where passenger_count is not null "
    "group by passenger_count "
    "order by passenger_count"
)

# convert to Pandas
pandas_df = df.to_pandas()

# create a chart
fig = pandas_df.plot(
    kind="bar", title="Trip Count by Number of Passengers"
).get_figure()
fig.savefig("chart.png")
43 |
--------------------------------------------------------------------------------
/docs/build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | #
20 |
set -e

# Remember where we were invoked from; run the build from the docs directory.
original_dir=$(pwd)
script_dir=$(dirname "$(realpath "$0")")
cd "$script_dir" || exit

# Fetch the sample datasets referenced by the documentation examples,
# but only if they are not already present locally.
if [ ! -f pokemon.csv ]; then
  curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv
fi

if [ ! -f yellow_tripdata_2021-01.parquet ]; then
  curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet
fi

# Build the Sphinx site from a scratch copy of the sources.
rm -rf build 2> /dev/null
rm -rf temp 2> /dev/null
mkdir temp
cp -rf source/* temp/
# Quote the expansion so the build still works when the path contains spaces
# (the previous unquoted backtick form would word-split such paths).
make SOURCEDIR="$(pwd)/temp" html

cd "$original_dir" || exit
42 |
--------------------------------------------------------------------------------
/examples/query-pyarrow-data.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
import datafusion
import pyarrow as pa
from datafusion import col

# Set up a DataFusion session
ctx = datafusion.SessionContext()

# Build an in-memory RecordBatch and wrap it in a DataFrame
arrays = [pa.array([1, 2, 3]), pa.array([4, 5, 6])]
batch = pa.RecordBatch.from_arrays(arrays, names=["a", "b"])
df = ctx.create_dataframe([[batch]])

# Project the sum and the difference of the two columns
df = df.select(
    col("a") + col("b"),
    col("a") - col("b"),
)

# Execute the plan; all rows fit in a single output batch
result = df.collect()[0]

# Verify the computed columns
assert result.column(0) == pa.array([5, 7, 9])
assert result.column(1) == pa.array([-3, -3, -3])
43 |
--------------------------------------------------------------------------------
/examples/python-udf.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
import pyarrow as pa
from datafusion import SessionContext, udf
from datafusion import functions as f


def is_null(array: pa.Array) -> pa.Array:
    """Vectorized UDF body: True for each element that is null."""
    return array.is_null()


# Wrap the Python function as a DataFusion scalar UDF:
# one int64 argument, boolean result, "stable" volatility.
is_null_arr = udf(is_null, [pa.int64()], pa.bool_(), "stable")

# Set up a session
ctx = SessionContext()

# Build a two-column RecordBatch and expose it as a DataFrame
batch = pa.RecordBatch.from_arrays(
    [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
    names=["a", "b"],
)
df = ctx.create_dataframe([[batch]])

# Apply the UDF to column "a"
df = df.select(is_null_arr(f.col("a")))

result = df.collect()[0]

# None of the input values are null
assert result.column(0) == pa.array([False] * 3)
44 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | #
19 | # Minimal makefile for Sphinx documentation
20 | #
21 |
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
# Documentation sources live in ./source; the rendered site goes to ./build.
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
# --fail-on-warning turns Sphinx warnings into build failures so docs
# regressions are caught in CI (deliberately not applied to `make help`).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) --fail-on-warning
--------------------------------------------------------------------------------
/src/expr/in_list.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use datafusion::logical_expr::expr::InList;
19 | use pyo3::prelude::*;
20 |
21 | use crate::expr::PyExpr;
22 |
23 | #[pyclass(frozen, name = "InList", module = "datafusion.expr", subclass)]
24 | #[derive(Clone)]
25 | pub struct PyInList {
26 | in_list: InList,
27 | }
28 |
29 | impl From for PyInList {
30 | fn from(in_list: InList) -> Self {
31 | PyInList { in_list }
32 | }
33 | }
34 |
35 | #[pymethods]
36 | impl PyInList {
37 | fn expr(&self) -> PyExpr {
38 | (*self.in_list.expr).clone().into()
39 | }
40 |
41 | fn list(&self) -> Vec {
42 | self.in_list.list.iter().map(|e| e.clone().into()).collect()
43 | }
44 |
45 | fn negated(&self) -> bool {
46 | self.in_list.negated
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/python/datafusion/col.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | """Col class."""
19 |
20 | from datafusion.expr import Expr
21 |
22 |
class Col:
    """Helper for constructing column expressions.

    Supports both call syntax (``col("a")``) and attribute syntax
    (``col.a``) for building a column :class:`Expr`.
    """

    def __call__(self, value: str) -> Expr:
        """Return a column expression for the given column name."""
        return Expr.column(value)

    def __getattr__(self, value: str) -> Expr:
        """Return a column expression via attribute access."""
        # IPython autocomplete probes __wrapped__* attributes; defer those
        # to the class so completion keeps working.
        if not value.startswith("__wrapped__"):
            return Expr.column(value)
        return getattr(type(self), value)
41 |
42 |
# Module-level singletons: both names behave identically and may be used
# either as `col("a")` / `column("a")` or as `col.a` / `column.a`.
col: Col = Col()
column: Col = Col()
__all__ = ["col", "column"]
46 |
--------------------------------------------------------------------------------
/src/expr/scalar_subquery.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use datafusion::logical_expr::Subquery;
19 | use pyo3::prelude::*;
20 |
21 | use super::subquery::PySubquery;
22 |
23 | #[pyclass(frozen, name = "ScalarSubquery", module = "datafusion.expr", subclass)]
24 | #[derive(Clone)]
25 | pub struct PyScalarSubquery {
26 | subquery: Subquery,
27 | }
28 |
29 | impl From for Subquery {
30 | fn from(subquery: PyScalarSubquery) -> Self {
31 | subquery.subquery
32 | }
33 | }
34 |
35 | impl From for PyScalarSubquery {
36 | fn from(subquery: Subquery) -> PyScalarSubquery {
37 | PyScalarSubquery { subquery }
38 | }
39 | }
40 |
41 | #[pymethods]
42 | impl PyScalarSubquery {
43 | fn subquery(&self) -> PySubquery {
44 | self.subquery.clone().into()
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/dev/changelog/45.0.0.md:
--------------------------------------------------------------------------------
1 |
19 |
20 | # Apache DataFusion Python 45.0.0 Changelog
21 |
22 | This release consists of 2 commits from 2 contributors. See credits at the end of this changelog for more information.
23 |
24 | **Fixed bugs:**
25 |
26 | - fix: add to_timestamp_nanos [#1020](https://github.com/apache/datafusion-python/pull/1020) (chenkovsky)
27 |
28 | **Other:**
29 |
30 | - Chore/upgrade datafusion 45 [#1010](https://github.com/apache/datafusion-python/pull/1010) (kevinjqliu)
31 |
32 | ## Credits
33 |
34 | Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
35 |
36 | ```
37 | 1 Kevin Liu
38 | 1 Tim Saucer
39 | ```
40 |
41 | Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
42 |
43 |
--------------------------------------------------------------------------------
/.asf.yaml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | notifications:
19 | commits: commits@datafusion.apache.org
20 | issues: github@datafusion.apache.org
21 | pullrequests: github@datafusion.apache.org
22 | jira_options: link label worklog
23 | github:
24 | description: "Apache DataFusion Python Bindings"
25 | homepage: https://datafusion.apache.org/python
26 | enabled_merge_buttons:
27 | squash: true
28 | merge: false
29 | rebase: false
30 | features:
31 | issues: true
32 | protected_branches:
33 | main:
34 | required_status_checks:
35 | # require branches to be up-to-date before merging
36 | strict: true
37 | # don't require any jobs to pass
38 | contexts: []
39 |
40 | staging:
41 | whoami: asf-staging
42 | subdir: python
43 |
44 | publish:
45 | whoami: asf-site
46 | subdir: python
47 |
--------------------------------------------------------------------------------
/src/expr/placeholder.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use datafusion::logical_expr::expr::Placeholder;
19 | use pyo3::prelude::*;
20 |
21 | use crate::common::data_type::PyDataType;
22 |
23 | #[pyclass(frozen, name = "Placeholder", module = "datafusion.expr", subclass)]
24 | #[derive(Clone)]
25 | pub struct PyPlaceholder {
26 | placeholder: Placeholder,
27 | }
28 |
29 | impl From for PyPlaceholder {
30 | fn from(placeholder: Placeholder) -> Self {
31 | PyPlaceholder { placeholder }
32 | }
33 | }
34 |
35 | #[pymethods]
36 | impl PyPlaceholder {
37 | fn id(&self) -> String {
38 | self.placeholder.id.clone()
39 | }
40 |
41 | fn data_type(&self) -> Option {
42 | self.placeholder
43 | .data_type
44 | .as_ref()
45 | .map(|e| e.clone().into())
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/dev/release/run-rat.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | #
20 |
RAT_VERSION=0.13

# Download apache rat if not already cached alongside this script.
if [ ! -f apache-rat-${RAT_VERSION}.jar ]; then
  curl -s https://repo1.maven.org/maven2/org/apache/rat/apache-rat/${RAT_VERSION}/apache-rat-${RAT_VERSION}.jar > apache-rat-${RAT_VERSION}.jar
fi

# Intentionally unquoted below so the command and its -x flag word-split.
RAT="java -jar apache-rat-${RAT_VERSION}.jar -x "

RELEASE_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd)

# generate the rat report; quote the target and paths so directories with
# spaces do not word-split.
$RAT "$1" > rat.txt
python "$RELEASE_DIR/check-rat-report.py" "$RELEASE_DIR/rat_exclude_files.txt" rat.txt > filtered_rat.txt
cat filtered_rat.txt
# grep -c counts matching lines directly (replaces cat | grep | wc -l).
UNAPPROVED=$(grep -c "NOT APPROVED" filtered_rat.txt)

if [ "0" -eq "${UNAPPROVED}" ]; then
  echo "No unapproved licenses"
else
  echo "${UNAPPROVED} unapproved licences. Check rat report: rat.txt"
  exit 1
fi
44 |
--------------------------------------------------------------------------------
/src/expr/in_subquery.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use datafusion::logical_expr::expr::InSubquery;
19 | use pyo3::prelude::*;
20 |
21 | use super::subquery::PySubquery;
22 | use super::PyExpr;
23 |
24 | #[pyclass(frozen, name = "InSubquery", module = "datafusion.expr", subclass)]
25 | #[derive(Clone)]
26 | pub struct PyInSubquery {
27 | in_subquery: InSubquery,
28 | }
29 |
30 | impl From for PyInSubquery {
31 | fn from(in_subquery: InSubquery) -> Self {
32 | PyInSubquery { in_subquery }
33 | }
34 | }
35 |
36 | #[pymethods]
37 | impl PyInSubquery {
38 | fn expr(&self) -> PyExpr {
39 | (*self.in_subquery.expr).clone().into()
40 | }
41 |
42 | fn subquery(&self) -> PySubquery {
43 | self.in_subquery.subquery.clone().into()
44 | }
45 |
46 | fn negated(&self) -> bool {
47 | self.in_subquery.negated
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/examples/datafusion-ffi-example/src/lib.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use crate::aggregate_udf::MySumUDF;
19 | use crate::catalog_provider::MyCatalogProvider;
20 | use crate::scalar_udf::IsNullUDF;
21 | use crate::table_function::MyTableFunction;
22 | use crate::table_provider::MyTableProvider;
23 | use crate::window_udf::MyRankUDF;
24 | use pyo3::prelude::*;
25 |
26 | pub(crate) mod aggregate_udf;
27 | pub(crate) mod catalog_provider;
28 | pub(crate) mod scalar_udf;
29 | pub(crate) mod table_function;
30 | pub(crate) mod table_provider;
31 | pub(crate) mod window_udf;
32 |
33 | #[pymodule]
34 | fn datafusion_ffi_example(m: &Bound<'_, PyModule>) -> PyResult<()> {
35 | m.add_class::()?;
36 | m.add_class::()?;
37 | m.add_class::()?;
38 | m.add_class::()?;
39 | m.add_class::()?;
40 | m.add_class::()?;
41 | Ok(())
42 | }
43 |
--------------------------------------------------------------------------------
/examples/export.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
import datafusion

# Start a session
ctx = datafusion.SessionContext()

# Build a small two-column DataFrame:
# +---+---+
# | a | b |
# +---+---+
# | 1 | 4 |
# | 2 | 5 |
# | 3 | 6 |
# +---+---+
df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})

# Pandas export
pandas_df = df.to_pandas()
assert pandas_df.shape == (3, 2)

# PyArrow table export
arrow_table = df.to_arrow_table()
assert arrow_table.shape == (3, 2)

# Polars export
polars_df = df.to_polars()
assert polars_df.shape == (3, 2)

# Row-oriented Python export (list of dicts)
pylist = df.to_pylist()
assert pylist == [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}]

# Column-oriented Python export (dict of lists)
pydict = df.to_pydict()
assert pydict == {"a": [1, 2, 3], "b": [4, 5, 6]}
53 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @rem Licensed to the Apache Software Foundation (ASF) under one
2 | @rem or more contributor license agreements. See the NOTICE file
3 | @rem distributed with this work for additional information
4 | @rem regarding copyright ownership. The ASF licenses this file
5 | @rem to you under the Apache License, Version 2.0 (the
6 | @rem "License"); you may not use this file except in compliance
7 | @rem with the License. You may obtain a copy of the License at
8 | @rem
9 | @rem http://www.apache.org/licenses/LICENSE-2.0
10 | @rem
11 | @rem Unless required by applicable law or agreed to in writing,
12 | @rem software distributed under the License is distributed on an
13 | @rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | @rem KIND, either express or implied. See the License for the
15 | @rem specific language governing permissions and limitations
16 | @rem under the License.
17 |
@ECHO OFF

@rem Run from the directory containing this script; restored by popd at :end.
pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

@rem No target given: fall through to the help screen.
if "%1" == "" goto help

@rem Probe for sphinx-build; errorlevel 9009 means "command not found".
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

@rem Delegate the requested target to Sphinx "make mode".
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
--------------------------------------------------------------------------------
/examples/datafusion-ffi-example/python/tests/_test_table_provider.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | from __future__ import annotations
19 |
20 | import pyarrow as pa
21 | from datafusion import SessionContext
22 | from datafusion_ffi_example import MyTableProvider
23 |
24 |
def test_table_loading():
    """Round-trip a Rust FFI table provider through registration and read_table."""
    ctx = SessionContext()
    table = MyTableProvider(3, 2, 4)
    ctx.register_table("t", table)
    result = ctx.table("t").collect()

    assert len(result) == 4
    assert result[0].num_columns == 3

    expected = [
        pa.array([0, 1], type=pa.int32()),
        pa.array([2, 3, 4], type=pa.int32()),
        pa.array([4, 5, 6, 7], type=pa.int32()),
        pa.array([6, 7, 8, 9, 10], type=pa.int32()),
    ]
    first_columns = [batch.column(0) for batch in result]
    assert first_columns == expected

    # Reading the provider directly (without registering it) yields the same data.
    direct = [batch.column(0) for batch in ctx.read_table(table).collect()]
    assert direct == expected
47 |
--------------------------------------------------------------------------------
/python/tests/test_view.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 |
19 | from datafusion import SessionContext, col, literal
20 |
21 |
def test_register_filtered_dataframe():
    """A view built from a filtered DataFrame reflects the filter, while the
    original DataFrame remains untouched."""
    ctx = SessionContext()
    data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}
    df = ctx.from_pydict(data, "my_table")

    # Register a view that keeps only rows with a > 2.
    df_filtered = df.filter(col("a") > literal(2))
    ctx.register_view("view1", df_filtered)

    # Query through the view: only the filtered rows come back.
    filtered_results = ctx.sql("SELECT * FROM view1").collect()
    result_dicts = [batch.to_pydict() for batch in filtered_results]
    assert result_dicts == [{"a": [3, 4, 5], "b": [30, 40, 50]}]

    # The source DataFrame is unaffected by the view's filter.
    df_result_dicts = [batch.to_pydict() for batch in df.collect()]
    assert df_result_dicts == [{"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}]
50 |
--------------------------------------------------------------------------------
/examples/create-context.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
from datafusion import RuntimeEnvBuilder, SessionConfig, SessionContext

# A context with all-default settings
ctx = SessionContext()
print(ctx)

# A context with explicit session configuration ...
config = (
    SessionConfig()
    .with_create_default_catalog_and_schema(enabled=True)
    .with_default_catalog_and_schema("foo", "bar")
    .with_target_partitions(8)
    .with_information_schema(enabled=True)
    .with_repartition_joins(enabled=False)
    .with_repartition_aggregations(enabled=False)
    .with_repartition_windows(enabled=False)
    .with_parquet_pruning(enabled=False)
    .set("datafusion.execution.parquet.pushdown_filters", "true")
)
# ... and an explicit runtime environment (OS disk manager, fair spill pool)
runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000)
ctx = SessionContext(config, runtime)
print(ctx)

# Allow querying file URLs/paths directly in SQL
ctx = ctx.enable_url_table()
print(ctx)
43 |
--------------------------------------------------------------------------------
/src/expr/extension.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use datafusion::logical_expr::Extension;
19 | use pyo3::prelude::*;
20 | use pyo3::IntoPyObjectExt;
21 |
22 | use super::logical_node::LogicalNode;
23 | use crate::sql::logical::PyLogicalPlan;
24 |
/// Python wrapper around a DataFusion `Extension` logical plan node.
#[pyclass(frozen, name = "Extension", module = "datafusion.expr", subclass)]
#[derive(Clone)]
pub struct PyExtension {
    pub node: Extension,
}
30 |
31 | impl From for PyExtension {
32 | fn from(node: Extension) -> PyExtension {
33 | PyExtension { node }
34 | }
35 | }
36 |
37 | #[pymethods]
38 | impl PyExtension {
39 | fn name(&self) -> PyResult {
40 | Ok(self.node.node.name().to_string())
41 | }
42 | }
43 |
44 | impl LogicalNode for PyExtension {
45 | fn inputs(&self) -> Vec {
46 | vec![]
47 | }
48 |
49 | fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> {
50 | self.clone().into_bound_py_any(py)
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/python/tests/test_plans.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | import pytest
19 | from datafusion import ExecutionPlan, LogicalPlan, SessionContext
20 |
21 |
# Note: We must use CSV because memory tables are currently not supported for
# conversion to/from protobuf.
@pytest.fixture
def df():
    """A single-column DataFrame backed by the aggregate test CSV."""
    session = SessionContext()
    csv_df = session.read_csv(path="testing/data/csv/aggregate_test_100.csv")
    return csv_df.select("c1")
28 |
29 |
def test_logical_plan_to_proto(ctx, df) -> None:
    """Round-trip logical and execution plans through their protobuf encoding."""
    # Logical plan: serialize, deserialize, and compare query results.
    plan_bytes = df.logical_plan().to_proto()
    restored_plan = LogicalPlan.from_proto(ctx, plan_bytes)
    df_round_trip = ctx.create_dataframe_from_logical_plan(restored_plan)
    assert df.collect() == df_round_trip.collect()

    # Execution plan: serialize, deserialize, and compare the rendered plans.
    exec_plan = df.execution_plan()
    restored_exec_plan = ExecutionPlan.from_proto(ctx, exec_plan.to_proto())
    assert str(exec_plan) == str(restored_exec_plan)
43 |
--------------------------------------------------------------------------------
/src/expr/binary_expr.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use datafusion::logical_expr::BinaryExpr;
19 | use pyo3::prelude::*;
20 |
21 | use crate::expr::PyExpr;
22 |
/// Python wrapper around a DataFusion `BinaryExpr` (left op right).
#[pyclass(frozen, name = "BinaryExpr", module = "datafusion.expr", subclass)]
#[derive(Clone)]
pub struct PyBinaryExpr {
    expr: BinaryExpr,
}
28 |
29 | impl From for BinaryExpr {
30 | fn from(expr: PyBinaryExpr) -> Self {
31 | expr.expr
32 | }
33 | }
34 |
35 | impl From for PyBinaryExpr {
36 | fn from(expr: BinaryExpr) -> PyBinaryExpr {
37 | PyBinaryExpr { expr }
38 | }
39 | }
40 |
41 | #[pymethods]
42 | impl PyBinaryExpr {
43 | fn left(&self) -> PyExpr {
44 | self.expr.left.as_ref().clone().into()
45 | }
46 |
47 | fn right(&self) -> PyExpr {
48 | self.expr.right.as_ref().clone().into()
49 | }
50 |
51 | fn op(&self) -> String {
52 | format!("{}", self.expr.op)
53 | }
54 |
55 | fn __repr__(&self) -> PyResult {
56 | Ok(format!("{}", self.expr))
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/python/tests/test_indexing.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | import pyarrow as pa
19 | import pytest
20 | from datafusion import SessionContext
21 |
22 |
@pytest.fixture
def df():
    """A two-column DataFrame built from an in-memory RecordBatch."""
    ctx = SessionContext()

    col_a = pa.array([1, 2, 3])
    col_b = pa.array([4, 4, 6])
    batch = pa.RecordBatch.from_arrays([col_a, col_b], names=["a", "b"])
    return ctx.create_dataframe([[batch]])
33 |
34 |
def test_indexing(df):
    # Every supported index form should produce a (non-None) projection:
    # single column name, tuple of names (bare and parenthesized), and list.
    assert df["a"] is not None
    assert df["a", "b"] is not None
    assert df[("a", "b")] is not None
    assert df[["a"]] is not None
40 |
41 |
def test_err(df):
    """Invalid indexing must raise with a descriptive message."""
    # Unknown column name surfaces a schema error naming the missing field.
    with pytest.raises(Exception) as e_info:
        df["c"]

    message = e_info.value.args[0]
    for fragment in ["SchemaError", "FieldNotFound", 'name: "c"']:
        assert fragment in message

    # Non-string indices are rejected outright.
    with pytest.raises(Exception) as e_info:
        df[1]

    assert (
        "DataFrame can only be indexed by string index or indices"
        in e_info.value.args[0]
    )
56 |
--------------------------------------------------------------------------------
/src/common/df_schema.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use std::sync::Arc;
19 |
20 | use datafusion::common::DFSchema;
21 | use pyo3::prelude::*;
22 |
23 | #[derive(Debug, Clone)]
24 | #[pyclass(frozen, name = "DFSchema", module = "datafusion.common", subclass)]
25 | pub struct PyDFSchema {
26 | schema: Arc,
27 | }
28 |
29 | impl From for DFSchema {
30 | fn from(schema: PyDFSchema) -> DFSchema {
31 | (*schema.schema).clone()
32 | }
33 | }
34 |
35 | impl From for PyDFSchema {
36 | fn from(schema: DFSchema) -> PyDFSchema {
37 | PyDFSchema {
38 | schema: Arc::new(schema),
39 | }
40 | }
41 | }
42 |
43 | #[pymethods]
44 | impl PyDFSchema {
45 | #[pyo3(name = "empty")]
46 | #[staticmethod]
47 | fn py_empty() -> PyResult {
48 | Ok(Self {
49 | schema: Arc::new(DFSchema::empty()),
50 | })
51 | }
52 |
53 | #[pyo3(name = "field_names")]
54 | fn py_field_names(&self) -> PyResult> {
55 | Ok(self.schema.field_names())
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/src/expr/scalar_variable.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use datafusion::arrow::datatypes::DataType;
19 | use pyo3::prelude::*;
20 |
21 | use crate::common::data_type::PyDataType;
22 |
23 | #[pyclass(frozen, name = "ScalarVariable", module = "datafusion.expr", subclass)]
24 | #[derive(Clone)]
25 | pub struct PyScalarVariable {
26 | data_type: DataType,
27 | variables: Vec,
28 | }
29 |
impl PyScalarVariable {
    /// Construct from a data type and the variable's name segments.
    pub fn new(data_type: &DataType, variables: &[String]) -> Self {
        Self {
            data_type: data_type.to_owned(),
            variables: variables.to_vec(),
        }
    }
}
38 |
39 | #[pymethods]
40 | impl PyScalarVariable {
41 | /// Get the data type
42 | fn data_type(&self) -> PyResult {
43 | Ok(self.data_type.clone().into())
44 | }
45 |
46 | fn variables(&self) -> PyResult> {
47 | Ok(self.variables.clone())
48 | }
49 |
50 | fn __repr__(&self) -> PyResult {
51 | Ok(format!("{}{:?}", self.data_type, self.variables))
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/src/expr/case.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use datafusion::logical_expr::Case;
19 | use pyo3::prelude::*;
20 |
21 | use crate::expr::PyExpr;
22 |
/// Python wrapper around a DataFusion `Case` expression
/// (`CASE [expr] WHEN ... THEN ... [ELSE ...] END`).
#[pyclass(frozen, name = "Case", module = "datafusion.expr", subclass)]
#[derive(Clone)]
pub struct PyCase {
    case: Case,
}
28 |
29 | impl From for Case {
30 | fn from(case: PyCase) -> Self {
31 | case.case
32 | }
33 | }
34 |
35 | impl From for PyCase {
36 | fn from(case: Case) -> PyCase {
37 | PyCase { case }
38 | }
39 | }
40 |
41 | #[pymethods]
42 | impl PyCase {
43 | fn expr(&self) -> Option {
44 | self.case.expr.as_ref().map(|e| (**e).clone().into())
45 | }
46 |
47 | fn when_then_expr(&self) -> Vec<(PyExpr, PyExpr)> {
48 | self.case
49 | .when_then_expr
50 | .iter()
51 | .map(|e| ((*e.0).clone().into(), (*e.1).clone().into()))
52 | .collect()
53 | }
54 |
55 | fn else_expr(&self) -> Option {
56 | self.case.else_expr.as_ref().map(|e| (**e).clone().into())
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/benchmarks/tpch/README.md:
--------------------------------------------------------------------------------
1 |
19 |
20 | # DataFusion Python Benchmarks Derived from TPC-H
21 |
22 | ## Create Release Build
23 |
24 | From repo root:
25 |
26 | ```bash
27 | maturin develop --release
28 | ```
29 |
30 | Note that release builds take a really long time, so you may want to temporarily comment out this section of the
31 | root Cargo.toml when frequently building.
32 |
33 | ```toml
34 | [profile.release]
35 | lto = true
36 | codegen-units = 1
37 | ```
38 |
39 | ## Generate Data
40 |
41 | ```bash
42 | ./tpch-gen.sh 1
43 | ```
44 |
45 | ## Run Benchmarks
46 |
47 | ```bash
48 | python tpch.py ./data ./queries
49 | ```
50 |
51 | A summary of the benchmark timings will be written to `results.csv`. For example:
52 |
53 | ```csv
54 | setup,1.4
55 | q1,2978.6
56 | q2,679.7
57 | q3,2943.7
58 | q4,2894.9
59 | q5,3592.3
60 | q6,1691.4
61 | q7,3003.9
62 | q8,3818.7
63 | q9,4237.9
64 | q10,2344.7
65 | q11,526.1
66 | q12,2284.6
67 | q13,1009.2
68 | q14,1738.4
69 | q15,1942.1
70 | q16,499.8
71 | q17,5178.9
72 | q18,4127.7
73 | q19,2056.6
74 | q20,2162.5
75 | q21,8046.5
76 | q22,754.9
77 | total,58513.2
78 | ```
--------------------------------------------------------------------------------
/python/datafusion/input/base.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | """This module provides ``BaseInputSource``.
19 |
20 | A user can extend this to provide a custom input source.
21 | """
22 |
23 | from abc import ABC, abstractmethod
24 | from typing import Any
25 |
26 | from datafusion.common import SqlTable
27 |
28 |
class BaseInputSource(ABC):
    """Base Input Source class.

    If a consuming library would like to provide their own InputSource this is
    the class they should extend to write their own.

    Once completed the Plugin InputSource can be registered with the
    SessionContext to ensure that it will be used in order
    to obtain the SqlTable information from the custom datasource.
    """

    @abstractmethod
    def is_correct_input(self, input_item: Any, table_name: str, **kwargs: Any) -> bool:
        """Return ``True`` if this source can handle ``input_item``."""

    @abstractmethod
    def build_table(self, input_item: Any, table_name: str, **kwargs: Any) -> SqlTable:  # type: ignore[invalid-type-form]
        """Create a table from the input source."""
47 |
--------------------------------------------------------------------------------
/src/common.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use pyo3::prelude::*;
19 |
20 | pub mod data_type;
21 | pub mod df_schema;
22 | pub mod function;
23 | pub mod schema;
24 |
/// Initializes the `common` module to match the pattern of `datafusion-common` <https://docs.rs/datafusion-common/18.0.0/datafusion_common/index.html>
// NOTE(review): the turbofish type arguments on every `add_class` call below
// appear to have been stripped from this listing (each reads `add_class::()`,
// which cannot compile). The real source registers one concrete `#[pyclass]`
// type per call (presumably from the `data_type`, `df_schema`, `function`,
// and `schema` submodules declared above) — confirm against the repository.
pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_class::()?;
    m.add_class::()?;
    m.add_class::()?;
    m.add_class::()?;
    m.add_class::()?;
    m.add_class::()?;
    m.add_class::()?;
    m.add_class::()?;
    m.add_class::()?;
    m.add_class::()?;
    m.add_class::()?;
    m.add_class::()?;
    m.add_class::()?;
    m.add_class::()?;
    m.add_class::()?;
    Ok(())
}
44 |
--------------------------------------------------------------------------------
/src/expr/column.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use datafusion::common::Column;
19 | use pyo3::prelude::*;
20 |
/// Python wrapper around a DataFusion `Column` reference.
#[pyclass(frozen, name = "Column", module = "datafusion.expr", subclass)]
#[derive(Clone)]
pub struct PyColumn {
    pub col: Column,
}

impl PyColumn {
    /// Wrap an existing DataFusion `Column`.
    pub fn new(col: Column) -> Self {
        Self { col }
    }
}
32 |
33 | impl From for PyColumn {
34 | fn from(col: Column) -> PyColumn {
35 | PyColumn { col }
36 | }
37 | }
38 |
39 | #[pymethods]
40 | impl PyColumn {
41 | /// Get the column name
42 | fn name(&self) -> String {
43 | self.col.name.clone()
44 | }
45 |
46 | /// Get the column relation
47 | fn relation(&self) -> Option {
48 | self.col.relation.as_ref().map(|r| format!("{r}"))
49 | }
50 |
51 | /// Get the fully-qualified column name
52 | fn qualified_name(&self) -> String {
53 | self.col.flat_name()
54 | }
55 |
56 | /// Get a String representation of this column
57 | fn __repr__(&self) -> String {
58 | self.qualified_name()
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/python/tests/conftest.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | import pyarrow as pa
19 | import pytest
20 | from datafusion import DataFrame, SessionContext
21 | from pyarrow.csv import write_csv
22 |
23 |
@pytest.fixture
def ctx():
    # A fresh SessionContext per test so tests cannot share registered tables.
    return SessionContext()
27 |
28 |
@pytest.fixture
def database(ctx, tmp_path):
    """Write a small CSV file and register it under several table names."""
    csv_path = tmp_path / "test.csv"

    int_col = [1, 2, 3, 4]
    str_col = ["a", "b", "c", "d"]
    float_col = [1.1, 2.2, 3.3, 4.4]
    table = pa.Table.from_arrays(
        [int_col, str_col, float_col],
        names=["int", "str", "float"],
    )
    write_csv(table, csv_path)

    # Register the same file three ways: Path, str, and Path with options.
    ctx.register_csv("csv", csv_path)
    ctx.register_csv("csv1", str(csv_path))
    ctx.register_csv(
        "csv2",
        csv_path,
        has_header=True,
        delimiter=",",
        schema_infer_max_records=10,
    )
52 |
53 |
@pytest.fixture
def fail_collect(monkeypatch):
    """Patch DataFrame.collect so any call during the test raises."""

    def _fail_collect(self, *args, **kwargs):  # pragma: no cover - failure path
        raise AssertionError("collect should not be called")

    monkeypatch.setattr(DataFrame, "collect", _fail_collect)
61 |
--------------------------------------------------------------------------------
/dev/clean.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | #
20 |
21 | # This cleans up the project by removing build artifacts and other generated files.
22 |
# Remove a directory (if it exists) and report the action.
remove_dir() {
    local dir="$1"
    if [ -d "$dir" ]; then
        echo "Removing directory: $dir"
        rm -rf "$dir"
    fi
}
30 |
# Remove a regular file (if it exists) and report the action.
remove_file() {
    local file="$1"
    if [ -f "$file" ]; then
        echo "Removing file: $file"
        rm -f "$file"
    fi
}
38 |
# Remove pytest's cache directory
remove_dir .pytest_cache/

# Remove the Rust/Cargo build output directory
remove_dir target/

# Remove any __pycache__ directories under python/
# (piped into `while read` rather than -exec so the shell function is usable)
find python/ -type d -name "__pycache__" -print | while read -r dir; do
    remove_dir "$dir"
done

# Remove pytest-coverage.lcov file
# remove_file .coverage
# remove_file pytest-coverage.lcov

# Remove rust-coverage.lcov file
# remove_file rust-coverage.lcov

# Remove compiled pyo3 extension modules (e.g. _internal.cpython-312-*.so)
find python/ -type f -name '_internal.*.so' -print | while read -r file; do
    remove_file "$file"
done

echo "Cleanup complete."
--------------------------------------------------------------------------------
/src/common/function.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use std::collections::HashMap;
19 |
20 | use datafusion::arrow::datatypes::DataType;
21 | use pyo3::prelude::*;
22 |
23 | use super::data_type::PyDataType;
24 |
25 | #[pyclass(frozen, name = "SqlFunction", module = "datafusion.common", subclass)]
26 | #[derive(Debug, Clone)]
27 | pub struct SqlFunction {
28 | pub name: String,
29 | pub return_types: HashMap, DataType>,
30 | pub aggregation: bool,
31 | }
32 |
33 | impl SqlFunction {
34 | pub fn new(
35 | function_name: String,
36 | input_types: Vec,
37 | return_type: PyDataType,
38 | aggregation_bool: bool,
39 | ) -> Self {
40 | let mut func = Self {
41 | name: function_name,
42 | return_types: HashMap::new(),
43 | aggregation: aggregation_bool,
44 | };
45 | func.add_type_mapping(input_types, return_type);
46 | func
47 | }
48 |
49 | pub fn add_type_mapping(&mut self, input_types: Vec, return_type: PyDataType) {
50 | self.return_types.insert(
51 | input_types.iter().map(|t| t.clone().into()).collect(),
52 | return_type.into(),
53 | );
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/.github/workflows/take.yml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
name: Assign the issue via a `take` comment
on:
  issue_comment:
    types: created

# Assigning issues requires write access to issues.
permissions:
  issues: write

jobs:
  issue_assign:
    runs-on: ubuntu-latest
    # Only plain issues (not PR comment threads), and only when the comment
    # body is exactly 'take'.
    if: (!github.event.issue.pull_request) && github.event.comment.body == 'take'
    # One assignment job at a time per commenter.
    concurrency:
      group: ${{ github.actor }}-issue-assign
    # The HEAD probe of .../assignees/<user> returns 204 when the user may be
    # assigned to issues in this repository; only then is the assignment made.
    steps:
      - run: |
          CODE=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -LI https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees/${{ github.event.comment.user.login }} -o /dev/null -w '%{http_code}\n' -s)
          if [ "$CODE" -eq "204" ]
          then
            echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}"
            curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees
          else
            echo "Cannot assign issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}"
          fi
--------------------------------------------------------------------------------
/src/unparser/dialect.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use std::sync::Arc;
19 |
20 | use datafusion::sql::unparser::dialect::{
21 | DefaultDialect, Dialect, DuckDBDialect, MySqlDialect, PostgreSqlDialect, SqliteDialect,
22 | };
23 | use pyo3::prelude::*;
24 |
25 | #[pyclass(frozen, name = "Dialect", module = "datafusion.unparser", subclass)]
26 | #[derive(Clone)]
27 | pub struct PyDialect {
28 | pub dialect: Arc,
29 | }
30 |
#[pymethods]
impl PyDialect {
    /// DataFusion's default unparsing dialect.
    #[staticmethod]
    pub fn default() -> Self {
        Self {
            dialect: Arc::new(DefaultDialect {}),
        }
    }
    /// PostgreSQL-flavored SQL generation.
    #[staticmethod]
    pub fn postgres() -> Self {
        Self {
            dialect: Arc::new(PostgreSqlDialect {}),
        }
    }
    /// MySQL-flavored SQL generation.
    #[staticmethod]
    pub fn mysql() -> Self {
        Self {
            dialect: Arc::new(MySqlDialect {}),
        }
    }
    /// SQLite-flavored SQL generation.
    #[staticmethod]
    pub fn sqlite() -> Self {
        Self {
            dialect: Arc::new(SqliteDialect {}),
        }
    }
    /// DuckDB-flavored SQL generation.
    #[staticmethod]
    pub fn duckdb() -> Self {
        Self {
            dialect: Arc::new(DuckDBDialect::new()),
        }
    }
}
64 |
--------------------------------------------------------------------------------
/examples/sql-using-python-udf.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
import pyarrow as pa
from datafusion import SessionContext, udf


# A user-defined function (UDF): maps an Arrow array to a boolean mask
# of its null entries.
def is_null(array: pa.Array) -> pa.Array:
    return array.is_null()


# Wrap the Python function so DataFusion can invoke it. The ``name``
# argument sets the identifier usable from SQL; when omitted it defaults
# to the Python function's own name.
is_null_arr = udf(
    is_null,
    [pa.int64()],
    pa.bool_(),
    "stable",
    name="is_null",
)

# Build a session and register an in-memory table named "t":
#   a: 1, 2, 3
#   b: 4, null, 6
ctx = SessionContext()
ctx.from_pydict({"a": [1, 2, 3], "b": [4, None, 6]}, name="t")

# Make the UDF visible to the SQL engine.
ctx.register_udf(is_null_arr)

# Apply the UDF to column "b" via SQL. Expected result:
#   a=1 -> false, a=2 -> true (b is null), a=3 -> false
result_df = ctx.sql("select a, is_null(b) as b_is_null from t")

assert result_df.to_pydict()["b_is_null"] == [False, True, False]
--------------------------------------------------------------------------------
/docs/source/user-guide/common-operations/views.rst:
--------------------------------------------------------------------------------
1 | .. Licensed to the Apache Software Foundation (ASF) under one
2 | .. or more contributor license agreements. See the NOTICE file
3 | .. distributed with this work for additional information
4 | .. regarding copyright ownership. The ASF licenses this file
5 | .. to you under the Apache License, Version 2.0 (the
6 | .. "License"); you may not use this file except in compliance
7 | .. with the License. You may obtain a copy of the License at
8 |
9 | .. http://www.apache.org/licenses/LICENSE-2.0
10 |
11 | .. Unless required by applicable law or agreed to in writing,
12 | .. software distributed under the License is distributed on an
13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | .. KIND, either express or implied. See the License for the
15 | .. specific language governing permissions and limitations
16 | .. under the License.
17 |
18 | ======================
19 | Registering Views
20 | ======================
21 |
You can use the context's ``register_view`` method to register a DataFrame as a view:
23 |
24 | .. code-block:: python
25 |
26 | from datafusion import SessionContext, col, literal
27 |
28 | # Create a DataFusion context
29 | ctx = SessionContext()
30 |
31 | # Create sample data
32 | data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}
33 |
34 | # Create a DataFrame from the dictionary
35 | df = ctx.from_pydict(data, "my_table")
36 |
37 | # Filter the DataFrame (for example, keep rows where a > 2)
38 | df_filtered = df.filter(col("a") > literal(2))
39 |
40 | # Register the dataframe as a view with the context
41 | ctx.register_view("view1", df_filtered)
42 |
43 | # Now run a SQL query against the registered view
44 | df_view = ctx.sql("SELECT * FROM view1")
45 |
46 | # Collect the results
47 | results = df_view.collect()
48 |
49 | # Convert results to a list of dictionaries for display
50 | result_dicts = [batch.to_pydict() for batch in results]
51 |
52 | print(result_dicts)
53 |
54 | This will output:
55 |
56 | .. code-block:: python
57 |
58 | [{'a': [3, 4, 5], 'b': [30, 40, 50]}]
59 |
--------------------------------------------------------------------------------
/src/expr/unnest_expr.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use std::fmt::{self, Display, Formatter};
19 |
20 | use datafusion::logical_expr::expr::Unnest;
21 | use pyo3::prelude::*;
22 |
23 | use super::PyExpr;
24 |
/// Python-facing wrapper around DataFusion's logical `Unnest` expression node.
#[pyclass(frozen, name = "UnnestExpr", module = "datafusion.expr", subclass)]
#[derive(Clone)]
pub struct PyUnnestExpr {
    // The wrapped logical-plan expression.
    unnest: Unnest,
}
30 |
31 | impl From for PyUnnestExpr {
32 | fn from(unnest: Unnest) -> PyUnnestExpr {
33 | PyUnnestExpr { unnest }
34 | }
35 | }
36 |
37 | impl From for Unnest {
38 | fn from(unnest: PyUnnestExpr) -> Self {
39 | unnest.unnest
40 | }
41 | }
42 |
impl Display for PyUnnestExpr {
    // Human-readable form; the multi-line string literal intentionally
    // embeds a newline between "Unnest" and the expression.
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        write!(
            f,
            "Unnest
Expr: {:?}",
            &self.unnest.expr,
        )
    }
}
53 |
54 | #[pymethods]
55 | impl PyUnnestExpr {
56 | /// Retrieves the expression that is being unnested
57 | fn expr(&self) -> PyResult {
58 | Ok((*self.unnest.expr).clone().into())
59 | }
60 |
61 | fn __repr__(&self) -> PyResult {
62 | Ok(format!("UnnestExpr({self})"))
63 | }
64 |
65 | fn __name__(&self) -> PyResult {
66 | Ok("UnnestExpr".to_string())
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/src/expr/alias.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use std::fmt::{self, Display, Formatter};
19 |
20 | use datafusion::logical_expr::expr::Alias;
21 | use pyo3::prelude::*;
22 |
23 | use crate::expr::PyExpr;
24 |
/// Python-facing wrapper around DataFusion's logical `Alias` expression node.
#[pyclass(frozen, name = "Alias", module = "datafusion.expr", subclass)]
#[derive(Clone)]
pub struct PyAlias {
    // The wrapped logical-plan alias (inner expression + alias name).
    alias: Alias,
}
30 |
31 | impl From for PyAlias {
32 | fn from(alias: Alias) -> Self {
33 | Self { alias }
34 | }
35 | }
36 |
37 | impl From for Alias {
38 | fn from(py_alias: PyAlias) -> Self {
39 | py_alias.alias
40 | }
41 | }
42 |
impl Display for PyAlias {
    // Human-readable form; the multi-line literal plus `\n` escapes
    // reproduce the original wire format exactly.
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        write!(
            f,
            "Alias
\nExpr: `{:?}`
\nAlias Name: `{}`",
            &self.alias.expr, &self.alias.name
        )
    }
}
54 |
55 | #[pymethods]
56 | impl PyAlias {
57 | /// Retrieve the "name" of the alias
58 | fn alias(&self) -> PyResult {
59 | Ok(self.alias.name.clone())
60 | }
61 |
62 | fn expr(&self) -> PyResult {
63 | Ok((*self.alias.expr.clone()).into())
64 | }
65 |
66 | /// Get a String representation of this column
67 | fn __repr__(&self) -> String {
68 | format!("{self}")
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/examples/datafusion-ffi-example/src/table_function.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use crate::table_provider::MyTableProvider;
19 | use datafusion::catalog::{TableFunctionImpl, TableProvider};
20 | use datafusion::error::Result as DataFusionResult;
21 | use datafusion::prelude::Expr;
22 | use datafusion_ffi::udtf::FFI_TableFunction;
23 | use pyo3::types::PyCapsule;
24 | use pyo3::{pyclass, pymethods, Bound, PyResult, Python};
25 | use std::sync::Arc;
26 |
/// Example table function exported to Python over the DataFusion FFI.
/// Stateless: every call produces a fresh example table provider.
#[pyclass(name = "MyTableFunction", module = "datafusion_ffi_example", subclass)]
#[derive(Debug, Clone)]
pub(crate) struct MyTableFunction {}
30 |
31 | #[pymethods]
32 | impl MyTableFunction {
33 | #[new]
34 | fn new() -> Self {
35 | Self {}
36 | }
37 |
38 | fn __datafusion_table_function__<'py>(
39 | &self,
40 | py: Python<'py>,
41 | ) -> PyResult> {
42 | let name = cr"datafusion_table_function".into();
43 |
44 | let func = self.clone();
45 | let provider = FFI_TableFunction::new(Arc::new(func), None);
46 |
47 | PyCapsule::new(py, provider, Some(name))
48 | }
49 | }
50 |
51 | impl TableFunctionImpl for MyTableFunction {
52 | fn call(&self, _args: &[Expr]) -> DataFusionResult> {
53 | let provider = MyTableProvider::new(4, 3, 2).create_table()?;
54 | Ok(Arc::new(provider))
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/examples/import.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
import datafusion
import pandas as pd
import polars as pl
import pyarrow as pa

# Session used for every conversion below.
ctx = datafusion.SessionContext()

# From a Python dictionary: keys are column names, values are columns.
#   a: 1, 2, 3
#   b: 4, 5, 6
df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
assert type(df) is datafusion.DataFrame

# Same table, built from a list of row dictionaries instead.
df = ctx.from_pylist([{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}])
assert type(df) is datafusion.DataFrame

# pandas DataFrame -> datafusion DataFrame
pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df = ctx.from_pandas(pandas_df)
assert type(df) is datafusion.DataFrame

# polars DataFrame -> datafusion DataFrame
polars_df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df = ctx.from_polars(polars_df)
assert type(df) is datafusion.DataFrame

# Arrow Table -> datafusion DataFrame
arrow_table = pa.Table.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
df = ctx.from_arrow(arrow_table)
assert type(df) is datafusion.DataFrame
58 |
--------------------------------------------------------------------------------
/docs/source/user-guide/common-operations/basic-info.rst:
--------------------------------------------------------------------------------
1 | .. Licensed to the Apache Software Foundation (ASF) under one
2 | .. or more contributor license agreements. See the NOTICE file
3 | .. distributed with this work for additional information
4 | .. regarding copyright ownership. The ASF licenses this file
5 | .. to you under the Apache License, Version 2.0 (the
6 | .. "License"); you may not use this file except in compliance
7 | .. with the License. You may obtain a copy of the License at
8 |
9 | .. http://www.apache.org/licenses/LICENSE-2.0
10 |
11 | .. Unless required by applicable law or agreed to in writing,
12 | .. software distributed under the License is distributed on an
13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | .. KIND, either express or implied. See the License for the
15 | .. specific language governing permissions and limitations
16 | .. under the License.
17 |
18 | Basic Operations
19 | ================
20 |
21 | In this section, you will learn how to display essential details of DataFrames using specific functions.
22 |
23 | .. ipython:: python
24 |
25 | from datafusion import SessionContext
26 | import random
27 |
28 | ctx = SessionContext()
29 | df = ctx.from_pydict({
30 | "nrs": [1, 2, 3, 4, 5],
31 | "names": ["python", "ruby", "java", "haskell", "go"],
32 | "random": random.sample(range(1000), 5),
33 | "groups": ["A", "A", "B", "C", "B"],
34 | })
35 | df
36 |
37 | Use :py:func:`~datafusion.dataframe.DataFrame.limit` to view the top rows of the frame:
38 |
39 | .. ipython:: python
40 |
41 | df.limit(2)
42 |
Inspect the schema of the DataFrame — its column names and data types — using :py:func:`~datafusion.dataframe.DataFrame.schema`:
44 |
45 | .. ipython:: python
46 |
47 | df.schema()
48 |
49 | The method :py:func:`~datafusion.dataframe.DataFrame.to_pandas` uses pyarrow to convert to pandas DataFrame, by collecting the batches,
50 | passing them to an Arrow table, and then converting them to a pandas DataFrame.
51 |
52 | .. ipython:: python
53 |
54 | df.to_pandas()
55 |
56 | :py:func:`~datafusion.dataframe.DataFrame.describe` shows a quick statistic summary of your data:
57 |
58 | .. ipython:: python
59 |
60 | df.describe()
61 |
62 |
--------------------------------------------------------------------------------
/docs/mdbook/src/usage/create-table.md:
--------------------------------------------------------------------------------
1 |
17 | # DataFusion Create Table
18 |
19 | It's easy to create DataFusion tables from a variety of data sources.
20 |
21 | ## Create Table from Python Dictionary
22 |
23 | Here's how to create a DataFusion table from a Python dictionary:
24 |
25 | ```python
26 | from datafusion import SessionContext
27 |
28 | ctx = SessionContext()
29 |
30 | df = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [4, 5, 6, 7]}, name="my_table")
31 | ```
32 |
33 | Supplying the `name` parameter is optional. You only need to name the table if you'd like to query it with the SQL API.
34 |
35 | You can also create a DataFrame without a name that can be queried with the Python API:
36 |
37 | ```python
38 | from datafusion import SessionContext
39 |
40 | ctx = SessionContext()
41 |
42 | df = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [4, 5, 6, 7]})
43 | ```
44 |
45 | ## Create Table from CSV
46 |
47 | You can read a CSV into a DataFusion DataFrame. Here's how to read the `G1_1e8_1e2_0_0.csv` file into a table named `csv_1e8`:
48 |
49 | ```python
50 | ctx.register_csv("csv_1e8", "G1_1e8_1e2_0_0.csv")
51 | ```
52 |
53 | ## Create Table from Parquet
54 |
55 | You can read a Parquet file into a DataFusion DataFrame. Here's how to read the `yellow_tripdata_2021-01.parquet` file into a table named `taxi`.
56 |
57 | ```python
ctx.register_parquet("taxi", "yellow_tripdata_2021-01.parquet")
59 | ```
60 |
--------------------------------------------------------------------------------
/dev/release/check-rat-report.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | ##############################################################################
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | ##############################################################################
import fnmatch
import re
import sys
import xml.etree.ElementTree as ET
from pathlib import Path

# Validate an Apache RAT license report: every resource RAT did not
# approve must match one of the exclude globs, otherwise exit non-zero.
if len(sys.argv) != 3:
    sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" % sys.argv[0])
    sys.exit(1)

exclude_globs_filename = sys.argv[1]
xml_filename = sys.argv[2]

# One glob pattern per line. Bug fix: `Path.open(filename)` called the
# unbound method with a plain string as `self` and crashed at runtime;
# a Path instance must be constructed first. The context manager also
# ensures the handle is closed.
with Path(exclude_globs_filename).open() as globs_file:
    globs = [line.strip() for line in globs_file]

tree = ET.parse(xml_filename)
root = tree.getroot()
resources = root.findall("resource")

all_ok = True
for r in resources:
    approvals = r.findall("license-approval")
    # Resources without an approval element, or explicitly approved ones,
    # need no further checking.
    if not approvals or approvals[0].attrib["name"] == "true":
        continue
    # Strip the leading archive directory component before glob matching.
    clean_name = re.sub("^[^/]+/", "", r.attrib["name"])
    excluded = any(fnmatch.fnmatch(clean_name, g) for g in globs)
    if not excluded:
        sys.stdout.write(
            "NOT APPROVED: %s (%s): %s\n"
            % (clean_name, r.attrib["name"], approvals[0].attrib["name"])
        )
        all_ok = False

if not all_ok:
    sys.exit(1)

print("OK")
sys.exit(0)
62 |
--------------------------------------------------------------------------------
/docs/mdbook/src/installation.md:
--------------------------------------------------------------------------------
1 |
17 | # Installation
18 |
19 | DataFusion is easy to install, just like any other Python library.
20 |
21 | ## Using uv
22 |
23 | If you do not yet have a virtual environment, create one:
24 |
25 | ```bash
26 | uv venv
27 | ```
28 |
29 | You can add datafusion to your virtual environment with the usual:
30 |
31 | ```bash
32 | uv pip install datafusion
33 | ```
34 |
35 | Or, to add to a project:
36 |
37 | ```bash
38 | uv add datafusion
39 | ```
40 |
41 | ## Using pip
42 |
43 | ``` bash
44 | pip install datafusion
45 | ```
46 |
47 | ## uv & JupyterLab setup
48 |
49 | This section explains how to install DataFusion in a uv environment with other libraries that allow for a nice Jupyter workflow. This setup is completely optional. These steps are only needed if you'd like to run DataFusion in a Jupyter notebook and have an interface like this:
50 |
51 | 
52 |
53 | Create a virtual environment with DataFusion, Jupyter, and other useful dependencies and start the desktop application.
54 |
55 | ```bash
56 | uv venv
57 | uv pip install datafusion jupyterlab jupyterlab_code_formatter
58 | uv run jupyter lab
59 | ```
60 |
61 | ## Examples
62 |
63 | See the [DataFusion Python Examples](https://github.com/apache/arrow-datafusion-python/tree/main/examples) for a variety of Python scripts that show DataFusion in action!
64 |
--------------------------------------------------------------------------------
/docs/mdbook/src/index.md:
--------------------------------------------------------------------------------
1 |
17 | # DataFusion Book
18 |
19 | DataFusion is a blazing fast query engine that lets you run data analyses quickly and reliably.
20 |
21 | DataFusion is written in Rust, but also exposes Python and SQL bindings, so you can easily query data in your language of choice. You don't need to know any Rust to be a happy and productive user of DataFusion.
22 |
23 | DataFusion lets you run queries faster than pandas. Let's compare query runtimes for a 5GB CSV file with 100 million rows of data.
24 |
25 | Take a look at a few rows of the data:
26 |
27 | ```
28 | +-------+-------+--------------+-----+-----+-------+----+----+-----------+
29 | | id1 | id2 | id3 | id4 | id5 | id6 | v1 | v2 | v3 |
30 | +-------+-------+--------------+-----+-----+-------+----+----+-----------+
31 | | id016 | id016 | id0000042202 | 15 | 24 | 5971 | 5 | 11 | 37.211254 |
32 | | id039 | id045 | id0000029558 | 40 | 49 | 39457 | 5 | 4 | 48.951141 |
33 | | id047 | id023 | id0000071286 | 68 | 20 | 74463 | 2 | 14 | 60.469241 |
34 | +-------+-------+--------------+-----+-----+-------+----+----+-----------+
35 | ```
36 |
37 | Suppose you'd like to run the following query: `SELECT id1, sum(v1) AS v1 from the_table GROUP BY id1`.
38 |
39 | If you use pandas, then this query will take 43.6 seconds to execute.
40 |
41 | It only takes DataFusion 9.8 seconds to execute the same query.
42 |
43 | DataFusion is easy to use, powerful, and fast. Let's learn more!
44 |
--------------------------------------------------------------------------------
/src/unparser/mod.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | mod dialect;
19 |
20 | use std::sync::Arc;
21 |
22 | use datafusion::sql::unparser::dialect::Dialect;
23 | use datafusion::sql::unparser::Unparser;
24 | use dialect::PyDialect;
25 | use pyo3::exceptions::PyValueError;
26 | use pyo3::prelude::*;
27 |
28 | use crate::sql::logical::PyLogicalPlan;
29 |
30 | #[pyclass(frozen, name = "Unparser", module = "datafusion.unparser", subclass)]
31 | #[derive(Clone)]
32 | pub struct PyUnparser {
33 | dialect: Arc,
34 | pretty: bool,
35 | }
36 |
37 | #[pymethods]
38 | impl PyUnparser {
39 | #[new]
40 | pub fn new(dialect: PyDialect) -> Self {
41 | Self {
42 | dialect: dialect.dialect.clone(),
43 | pretty: false,
44 | }
45 | }
46 |
47 | pub fn plan_to_sql(&self, plan: &PyLogicalPlan) -> PyResult {
48 | let mut unparser = Unparser::new(self.dialect.as_ref());
49 | unparser = unparser.with_pretty(self.pretty);
50 | let sql = unparser
51 | .plan_to_sql(&plan.plan())
52 | .map_err(|e| PyValueError::new_err(e.to_string()))?;
53 | Ok(sql.to_string())
54 | }
55 |
56 | pub fn with_pretty(&self, pretty: bool) -> Self {
57 | Self {
58 | dialect: self.dialect.clone(),
59 | pretty,
60 | }
61 | }
62 | }
63 |
64 | pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
65 | m.add_class::()?;
66 | m.add_class::()?;
67 | Ok(())
68 | }
69 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | repos:
19 | - repo: https://github.com/rhysd/actionlint
20 | rev: v1.7.6
21 | hooks:
22 | - id: actionlint-docker
23 | - repo: https://github.com/astral-sh/ruff-pre-commit
24 | # Ruff version.
25 | rev: v0.9.10
26 | hooks:
27 | # Run the linter.
28 | - id: ruff
29 | # Run the formatter.
30 | - id: ruff-format
31 | - repo: local
32 | hooks:
33 | - id: rust-fmt
34 | name: Rust fmt
35 | description: Run cargo fmt on files included in the commit. rustfmt should be installed before-hand.
36 | entry: cargo +nightly fmt --all --
37 | pass_filenames: true
38 | types: [file, rust]
39 | language: system
40 | - id: rust-clippy
41 | name: Rust clippy
42 | description: Run cargo clippy on files included in the commit. clippy should be installed before-hand.
43 | entry: cargo clippy --all-targets --all-features -- -Dclippy::all -D warnings -Aclippy::redundant_closure
44 | pass_filenames: false
45 | types: [file, rust]
46 | language: system
47 |
48 | - repo: https://github.com/codespell-project/codespell
49 | rev: v2.4.1
50 | hooks:
51 | - id: codespell
52 | args: [ --toml, "pyproject.toml"]
53 | additional_dependencies:
54 | - tomli
55 |
56 | default_language_version:
57 | python: python3
58 |
--------------------------------------------------------------------------------
/benchmarks/tpch/tpch-gen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 |
mkdir -p data/answers 2>/dev/null

set -e

# If RUN_IN_CI is set, then do not produce verbose output or use an interactive terminal
if [[ -z "${RUN_IN_CI}" ]]; then
  TERMINAL_FLAG="-it"
  VERBOSE_OUTPUT="-vf"
else
  TERMINAL_FLAG=""
  VERBOSE_OUTPUT="-f"
fi

#pushd ..
#. ./dev/build-set-env.sh
#popd

# Generate data into the ./data directory if it does not already exist.
# "$(pwd)" and "$1" are quoted so paths/arguments containing spaces work;
# $TERMINAL_FLAG and $VERBOSE_OUTPUT are intentionally left unquoted so an
# empty flag disappears instead of becoming an empty argument.
FILE=./data/supplier.tbl
if test -f "$FILE"; then
  echo "$FILE exists."
else
  docker run -v "$(pwd)"/data:/data $TERMINAL_FLAG --rm ghcr.io/scalytics/tpch-docker:main $VERBOSE_OUTPUT -s "$1"

  # workaround for https://github.com/apache/arrow-datafusion/issues/6147
  mv data/customer.tbl data/customer.csv
  mv data/lineitem.tbl data/lineitem.csv
  mv data/nation.tbl data/nation.csv
  mv data/orders.tbl data/orders.csv
  mv data/part.tbl data/part.csv
  mv data/partsupp.tbl data/partsupp.csv
  mv data/region.tbl data/region.csv
  mv data/supplier.tbl data/supplier.csv

  ls -l data
fi

# Copy expected answers (at SF=1) into the ./data/answers directory if it does not already exist
FILE=./data/answers/q1.out
if test -f "$FILE"; then
  echo "$FILE exists."
else
  docker run -v "$(pwd)"/data:/data $TERMINAL_FLAG --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
fi
63 |
--------------------------------------------------------------------------------
/examples/datafusion-ffi-example/python/tests/_test_scalar_udf.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | from __future__ import annotations
19 |
20 | import pyarrow as pa
21 | from datafusion import SessionContext, col, udf
22 | from datafusion_ffi_example import IsNullUDF
23 |
24 |
def setup_context_with_table():
    """Return a session holding table ``test_table`` with one nullable
    int64 column ``a`` = [1, 2, 3, None]."""
    ctx = SessionContext()

    values = pa.array([1, 2, 3, None])
    batch = pa.RecordBatch.from_arrays([values], names=["a"])
    ctx.register_record_batches("test_table", [[batch]])
    return ctx
34 |
35 |
def test_ffi_scalar_register():
    """The FFI-backed UDF is callable through SQL once registered."""
    ctx = setup_context_with_table()
    ctx.register_udf(udf(IsNullUDF()))

    batches = ctx.sql("select my_custom_is_null(a) from test_table").collect()

    assert len(batches) == 1
    assert batches[0].num_columns == 1
    print(batches)

    actual = [b.column(0) for b in batches]
    assert actual == [pa.array([False, False, False, True], type=pa.bool_())]
53 |
54 |
def test_ffi_scalar_call_directly():
    """Invoke the FFI-backed UDF through the DataFrame expression API."""
    ctx = setup_context_with_table()
    my_udf = udf(IsNullUDF())

    batches = ctx.table("test_table").select(my_udf(col("a"))).collect()

    assert len(batches) == 1
    assert batches[0].num_columns == 1
    print(batches)

    actual = [batch.column(0) for batch in batches]
    expected = [pa.array([False, False, False, True], type=pa.bool_())]
    assert actual == expected
71 |
--------------------------------------------------------------------------------
/examples/python-udaf.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | import datafusion
19 | import pyarrow as pa
20 | import pyarrow.compute
21 | from datafusion import Accumulator, col, udaf
22 |
23 |
class MyAccumulator(Accumulator):
    """User-defined aggregate accumulator that sums float64 values.

    State is a single pyarrow scalar. ``update`` folds in raw input
    values; ``merge`` folds in partial sums produced on other partitions.
    """

    def __init__(self) -> None:
        # Running total. pyarrow scalars are immutable, so every
        # update/merge rebuilds the scalar from Python floats.
        self._sum = pa.scalar(0.0)

    def update(self, values: pa.Array) -> None:
        # pyarrow scalars cannot be added directly, so convert via as_py().
        # pa.compute.sum returns a null scalar for empty/all-null input;
        # treat that as adding zero instead of raising a TypeError.
        total = pa.compute.sum(values).as_py()
        if total is not None:
            self._sum = pa.scalar(self._sum.as_py() + total)

    def merge(self, states: pa.Array) -> None:
        # Combine partial sums from other partitions; same null guard
        # as update().
        total = pa.compute.sum(states).as_py()
        if total is not None:
            self._sum = pa.scalar(self._sum.as_py() + total)

    def state(self) -> pa.Array:
        # Serialized intermediate state: a one-element array with the sum.
        return pa.array([self._sum.as_py()])

    def evaluate(self) -> pa.Scalar:
        # Final aggregate value.
        return self._sum
45 |
46 |
# Build a context and a small two-column DataFrame to aggregate over.
ctx = datafusion.SessionContext()

record_batch = pa.RecordBatch.from_arrays(
    [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
    names=["a", "b"],
)
df = ctx.create_dataframe([[record_batch]])

# Wrap the accumulator class as a DataFusion aggregate function:
# input type, return type, state type(s), and volatility.
my_udaf = udaf(
    MyAccumulator,
    pa.float64(),
    pa.float64(),
    [pa.float64()],
    "stable",
)

# Aggregate column "a" over the whole table (no GROUP BY keys).
df = df.aggregate([], [my_udaf(col("a"))])

result = df.collect()[0]

# 1 + 2 + 3 == 6
assert result.column(0) == pa.array([6.0])
70 |
--------------------------------------------------------------------------------
/src/expr/cast.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use datafusion::logical_expr::{Cast, TryCast};
19 | use pyo3::prelude::*;
20 |
21 | use crate::common::data_type::PyDataType;
22 | use crate::expr::PyExpr;
23 |
/// Python wrapper around DataFusion's [`Cast`] expression
/// (casting an expression to a target data type).
#[pyclass(frozen, name = "Cast", module = "datafusion.expr", subclass)]
#[derive(Clone)]
pub struct PyCast {
    cast: Cast,
}
29 |
30 | impl From for Cast {
31 | fn from(cast: PyCast) -> Self {
32 | cast.cast
33 | }
34 | }
35 |
36 | impl From for PyCast {
37 | fn from(cast: Cast) -> PyCast {
38 | PyCast { cast }
39 | }
40 | }
41 |
42 | #[pymethods]
43 | impl PyCast {
44 | fn expr(&self) -> PyResult {
45 | Ok((*self.cast.expr).clone().into())
46 | }
47 |
48 | fn data_type(&self) -> PyResult {
49 | Ok(self.cast.data_type.clone().into())
50 | }
51 | }
52 |
/// Python wrapper around DataFusion's [`TryCast`] expression (the
/// fallible cast variant).
/// NOTE(review): unlike `PyCast`, this class is not declared `frozen` —
/// confirm whether that asymmetry is intentional.
#[pyclass(name = "TryCast", module = "datafusion.expr", subclass)]
#[derive(Clone)]
pub struct PyTryCast {
    try_cast: TryCast,
}
58 |
59 | impl From for TryCast {
60 | fn from(try_cast: PyTryCast) -> Self {
61 | try_cast.try_cast
62 | }
63 | }
64 |
65 | impl From for PyTryCast {
66 | fn from(try_cast: TryCast) -> PyTryCast {
67 | PyTryCast { try_cast }
68 | }
69 | }
70 |
71 | #[pymethods]
72 | impl PyTryCast {
73 | fn expr(&self) -> PyResult {
74 | Ok((*self.try_cast.expr).clone().into())
75 | }
76 |
77 | fn data_type(&self) -> PyResult {
78 | Ok(self.try_cast.data_type.clone().into())
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/src/expr/between.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use std::fmt::{self, Display, Formatter};
19 |
20 | use datafusion::logical_expr::expr::Between;
21 | use pyo3::prelude::*;
22 |
23 | use crate::expr::PyExpr;
24 |
/// Python wrapper around DataFusion's [`Between`] expression
/// (`expr [NOT] BETWEEN low AND high`).
#[pyclass(frozen, name = "Between", module = "datafusion.expr", subclass)]
#[derive(Clone)]
pub struct PyBetween {
    between: Between,
}
30 |
31 | impl From for Between {
32 | fn from(between: PyBetween) -> Self {
33 | between.between
34 | }
35 | }
36 |
37 | impl From for PyBetween {
38 | fn from(between: Between) -> PyBetween {
39 | PyBetween { between }
40 | }
41 | }
42 |
// Multi-line, human-readable rendering used by `__repr__`. The embedded
// newlines inside the string literal are intentional output formatting.
// NOTE(review): the original indentation inside the literal was lost in
// extraction; confirm against the repository before changing it.
impl Display for PyBetween {
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        write!(
            f,
            "Between
Expr: {:?}
Negated: {:?}
Low: {:?}
High: {:?}",
            &self.between.expr, &self.between.negated, &self.between.low, &self.between.high
        )
    }
}
56 |
57 | #[pymethods]
58 | impl PyBetween {
59 | fn expr(&self) -> PyResult {
60 | Ok((*self.between.expr).clone().into())
61 | }
62 |
63 | fn negated(&self) -> PyResult {
64 | Ok(self.between.negated)
65 | }
66 |
67 | fn low(&self) -> PyResult {
68 | Ok((*self.between.low).clone().into())
69 | }
70 |
71 | fn high(&self) -> PyResult {
72 | Ok((*self.between.high).clone().into())
73 | }
74 |
75 | fn __repr__(&self) -> String {
76 | format!("{self}")
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | from __future__ import annotations
19 |
20 | import pyarrow as pa
21 | from datafusion import SessionContext
22 | from datafusion_ffi_example import MyCatalogProvider
23 |
24 |
def test_catalog_provider():
    """Register MyCatalogProvider and verify schema/table discovery and scan."""
    ctx = SessionContext()

    catalog_name = "my_catalog"
    schema_name = "my_schema"
    table_name = "my_table"
    column_names = ["units", "price"]

    ctx.register_catalog_provider(catalog_name, MyCatalogProvider())
    catalog = ctx.catalog(catalog_name)

    assert schema_name in catalog.names()
    schema = catalog.schema(schema_name)
    assert table_name in schema.names()
    assert column_names == schema.table(table_name).schema.names

    batches = ctx.table(f"{catalog_name}.{schema_name}.{table_name}").collect()
    assert len(batches) == 2

    units = [batch.column(0) for batch in batches]
    prices = [batch.column(1) for batch in batches]
    assert units == [
        pa.array([10, 20, 30], type=pa.int32()),
        pa.array([5, 7], type=pa.int32()),
    ]
    assert prices == [
        pa.array([1, 2, 5], type=pa.float64()),
        pa.array([1.5, 2.5], type=pa.float64()),
    ]
61 |
--------------------------------------------------------------------------------
/examples/substrait.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | from datafusion import SessionContext
19 | from datafusion import substrait as ss
20 |
# Create a DataFusion context
ctx = SessionContext()

# Register table with context
ctx.register_csv("aggregate_test_data", "./testing/data/csv/aggregate_test_100.csv")

substrait_plan = ss.Serde.serialize_to_plan("SELECT * FROM aggregate_test_data", ctx)
# type(substrait_plan) -> a substrait Plan object (the class name in the
# original comment was lost to extraction garbling; confirm against
# datafusion.substrait)

# Encode it to bytes
substrait_bytes = substrait_plan.encode()
# type(substrait_bytes) -> bytes; at this point the bytes can be distributed to file, network, etc safely
# where they could subsequently be deserialized on the receiving end.

# Alternative serialization approach: go straight to bytes without the
# intermediate Plan object.
# type(substrait_bytes) -> bytes; at this point the bytes can be distributed to file, network, etc safely
# where they could subsequently be deserialized on the receiving end.
substrait_bytes = ss.Serde.serialize_bytes("SELECT * FROM aggregate_test_data", ctx)

# Imagine here bytes would be read from network, file, etc ... for example brevity this is omitted and variable is simply reused
# type(substrait_plan) -> a substrait Plan object (see note above on garbling)
substrait_plan = ss.Serde.deserialize_bytes(substrait_bytes)

# type(df_logical_plan) -> a DataFusion logical plan (original class name garbled)
df_logical_plan = ss.Consumer.from_substrait_plan(ctx, substrait_plan)

# Back to Substrait Plan just for demonstration purposes
# type(substrait_plan) -> a substrait Plan object
substrait_plan = ss.Producer.to_substrait_plan(df_logical_plan, ctx)
50 |
--------------------------------------------------------------------------------
/python/tests/utils.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | """Testing-only helpers for datafusion-python.
19 |
20 | This module contains utilities used by the test-suite that should not be
21 | exposed as part of the public API. Keep the implementation minimal and
22 | documented so reviewers can easily see it's test-only.
23 | """
24 |
25 | from __future__ import annotations
26 |
27 | from typing import TYPE_CHECKING
28 |
29 | if TYPE_CHECKING:
30 | from datafusion import DataFrame
31 | from datafusion.context import SessionContext
32 |
33 |
def range_table(
    ctx: SessionContext,
    start: int,
    stop: int | None = None,
    step: int = 1,
    partitions: int | None = None,
) -> DataFrame:
    """Produce a DataFrame of sequential numbers via the SQL ``range`` function.

    Mirrors the former ``SessionContext.range`` convenience method, kept in
    this testing-only module so the public API surface stays small.

    Args:
        ctx: Context used to execute the generated SQL.
        start: First value of the sequence, or the exclusive stop when
            ``stop`` is omitted (Python ``range``-style semantics).
        stop: Exclusive upper bound of the sequence.
        step: Distance between consecutive values.
        partitions: Optional partition count for the generated data.

    Returns:
        The DataFrame returned by ``SELECT * FROM range(...)``.
    """
    # Single-argument form: range(n) means range(0, n).
    if stop is None:
        start, stop = 0, start

    args = [int(start), int(stop), int(step)]
    if partitions is not None:
        args.append(int(partitions))
    return ctx.sql(f"SELECT * FROM range({', '.join(map(str, args))})")
63 |
--------------------------------------------------------------------------------
/python/datafusion/common.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | """Common data types used throughout the DataFusion project."""
18 |
19 | from enum import Enum
20 |
21 | from ._internal import common as common_internal
22 |
# TODO: these should all have proper wrapper classes

# Re-export the Rust-backed implementations from the compiled `_internal`
# extension module under their public Python names. None of these are
# defined in this file.
DFSchema = common_internal.DFSchema
DataType = common_internal.DataType
DataTypeMap = common_internal.DataTypeMap
PythonType = common_internal.PythonType
RexType = common_internal.RexType
SqlFunction = common_internal.SqlFunction
SqlSchema = common_internal.SqlSchema
SqlStatistics = common_internal.SqlStatistics
SqlTable = common_internal.SqlTable
SqlType = common_internal.SqlType
SqlView = common_internal.SqlView
TableType = common_internal.TableType
TableSource = common_internal.TableSource
Constraints = common_internal.Constraints

# Public API of this module. Note that "NullTreatment" refers to the
# Python Enum wrapper defined at the bottom of this file, not the raw
# internal class.
__all__ = [
    "Constraints",
    "DFSchema",
    "DataType",
    "DataTypeMap",
    "NullTreatment",
    "PythonType",
    "RexType",
    "SqlFunction",
    "SqlSchema",
    "SqlStatistics",
    "SqlTable",
    "SqlType",
    "SqlView",
    "TableSource",
    "TableType",
]
57 |
58 |
class NullTreatment(Enum):
    """Describe how null values are to be treated by functions.

    This is used primarily by aggregate and window functions. It can be set
    on these functions using the builder approach described in
    :ref:`window_functions` and :ref:`aggregation` in the online
    documentation.
    """

    # Enum values mirror the Rust-side NullTreatment variants one-to-one.
    RESPECT_NULLS = common_internal.NullTreatment.RESPECT_NULLS
    IGNORE_NULLS = common_internal.NullTreatment.IGNORE_NULLS
70 |
--------------------------------------------------------------------------------
/examples/tpch/README.md:
--------------------------------------------------------------------------------
1 |
19 |
20 | # DataFusion Python Examples for TPC-H
21 |
These examples reproduce the problems listed in the Transaction Processing
Performance Council's TPC-H benchmark. The purpose of these examples is to
demonstrate how to use different aspects of DataFusion; they are not
necessarily geared towards creating the most performant queries possible.
Within each example is a description of the problem. For users who are
familiar with SQL style commands, you can compare the approaches in these
examples with those listed in the specification.
28 |
29 | - https://www.tpc.org/tpch/
30 |
31 | The examples provided are based on version 2.18.0 of the TPC-H specification.
32 |
33 | ## Data Setup
34 |
35 | To run these examples, you must first generate a dataset. The `dbgen` tool
36 | provided by TPC can create datasets of arbitrary scale. For testing it is
37 | typically sufficient to create a 1 gigabyte dataset. For convenience, this
38 | repository has a script which uses docker to create this dataset. From the
39 | `benchmarks/tpch` directory execute the following script.
40 |
41 | ```bash
42 | ./tpch-gen.sh 1
43 | ```
44 |
45 | The examples provided use parquet files for the tables generated by `dbgen`.
46 | A python script is provided to convert the text files from `dbgen` into parquet
47 | files expected by the examples. From the `examples/tpch` directory you can
48 | execute the following command to create the necessary parquet files.
49 |
50 | ```bash
51 | python convert_data_to_parquet.py
52 | ```
53 |
54 | ## Description of Examples
55 |
56 | For easier access, a description of the techniques demonstrated in each file
57 | is in the README.md file in the `examples` directory.
58 |
--------------------------------------------------------------------------------
/src/expr/indexed_field.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use crate::expr::PyExpr;
19 | use datafusion::logical_expr::expr::{GetFieldAccess, GetIndexedField};
20 | use pyo3::prelude::*;
21 | use std::fmt::{Display, Formatter};
22 |
23 | use super::literal::PyLiteral;
24 |
/// Python wrapper around DataFusion's [`GetIndexedField`] expression
/// (field/element access on a struct or list expression).
#[pyclass(frozen, name = "GetIndexedField", module = "datafusion.expr", subclass)]
#[derive(Clone)]
pub struct PyGetIndexedField {
    indexed_field: GetIndexedField,
}
30 |
31 | impl From for GetIndexedField {
32 | fn from(indexed_field: PyGetIndexedField) -> Self {
33 | indexed_field.indexed_field
34 | }
35 | }
36 |
37 | impl From for PyGetIndexedField {
38 | fn from(indexed_field: GetIndexedField) -> PyGetIndexedField {
39 | PyGetIndexedField { indexed_field }
40 | }
41 | }
42 |
// Multi-line, human-readable rendering used by `__repr__`. The embedded
// newline inside the string literal is intentional output formatting.
// NOTE(review): indentation inside the literal was lost in extraction;
// confirm against the repository before changing it.
impl Display for PyGetIndexedField {
    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
        write!(
            f,
            "GetIndexedField
Expr: {:?}
Key: {:?}",
            &self.indexed_field.expr, &self.indexed_field.field
        )
    }
}
54 |
55 | #[pymethods]
56 | impl PyGetIndexedField {
57 | fn expr(&self) -> PyResult {
58 | Ok((*self.indexed_field.expr).clone().into())
59 | }
60 |
61 | fn key(&self) -> PyResult {
62 | match &self.indexed_field.field {
63 | GetFieldAccess::NamedStructField { name, .. } => Ok(name.clone().into()),
64 | _ => todo!(),
65 | }
66 | }
67 |
68 | /// Get a String representation of this column
69 | fn __repr__(&self) -> String {
70 | format!("{}", self)
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/docs/mdbook/src/quickstart.md:
--------------------------------------------------------------------------------
1 |
17 | # DataFusion Quickstart
18 |
19 | You can easily query a DataFusion table with the Python API or with pure SQL.
20 |
21 | Let's create a small DataFrame and then run some queries with both APIs.
22 |
23 | Start by creating a DataFrame with four rows of data and two columns: `a` and `b`.
24 |
25 | ```python
26 | from datafusion import SessionContext
27 |
28 | ctx = SessionContext()
29 |
30 | df = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [4, 5, 6, 7]}, name="my_table")
31 | ```
32 |
33 | Let's append a column to this DataFrame that adds columns `a` and `b` with the SQL API.
34 |
35 | ```
36 | ctx.sql("select a, b, a + b as sum_a_b from my_table")
37 |
38 | +---+---+---------+
39 | | a | b | sum_a_b |
40 | +---+---+---------+
41 | | 1 | 4 | 5 |
42 | | 2 | 5 | 7 |
43 | | 3 | 6 | 9 |
44 | | 1 | 7 | 8 |
45 | +---+---+---------+
46 | ```
47 |
48 | DataFusion makes it easy to run SQL queries on DataFrames.
49 |
50 | Now let's run the same query with the DataFusion Python API:
51 |
52 | ```python
53 | from datafusion import col
54 |
55 | df.select(
56 | col("a"),
57 | col("b"),
58 | col("a") + col("b"),
59 | )
60 | ```
61 |
62 | We get the same result as before:
63 |
64 | ```
65 | +---+---+-------------------------+
66 | | a | b | my_table.a + my_table.b |
67 | +---+---+-------------------------+
68 | | 1 | 4 | 5 |
69 | | 2 | 5 | 7 |
70 | | 3 | 6 | 9 |
71 | | 1 | 7 | 8 |
72 | +---+---+-------------------------+
73 | ```
74 |
75 | DataFusion also allows you to query data with a well-designed Python interface.
76 |
77 | Python users have two great ways to query DataFusion tables.
78 |
--------------------------------------------------------------------------------
/docs/source/user-guide/io/table_provider.rst:
--------------------------------------------------------------------------------
1 | .. Licensed to the Apache Software Foundation (ASF) under one
2 | .. or more contributor license agreements. See the NOTICE file
3 | .. distributed with this work for additional information
4 | .. regarding copyright ownership. The ASF licenses this file
5 | .. to you under the Apache License, Version 2.0 (the
6 | .. "License"); you may not use this file except in compliance
7 | .. with the License. You may obtain a copy of the License at
8 |
9 | .. http://www.apache.org/licenses/LICENSE-2.0
10 |
11 | .. Unless required by applicable law or agreed to in writing,
12 | .. software distributed under the License is distributed on an
13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | .. KIND, either express or implied. See the License for the
15 | .. specific language governing permissions and limitations
16 | .. under the License.
17 |
18 | .. _io_custom_table_provider:
19 |
20 | Custom Table Provider
21 | =====================
22 |
If you have a custom data source that you want to integrate with DataFusion, you can do so by
implementing the `TableProvider <https://docs.rs/datafusion/latest/datafusion/catalog/trait.TableProvider.html>`_
interface in Rust and then exposing it in Python. To do so,
you must use DataFusion 43.0.0 or later and expose a `FFI_TableProvider <https://docs.rs/datafusion-ffi/latest/datafusion_ffi/table_provider/struct.FFI_TableProvider.html>`_
via `PyCapsule <https://docs.python.org/3/c-api/capsule.html>`_.

A complete example can be found in the `examples folder <https://github.com/apache/datafusion-python/tree/main/examples/datafusion-ffi-example>`_.
30 |
31 | .. code-block:: rust
32 |
33 | #[pymethods]
34 | impl MyTableProvider {
35 |
36 | fn __datafusion_table_provider__<'py>(
37 | &self,
38 | py: Python<'py>,
    ) -> PyResult<Bound<'py, PyCapsule>> {
40 | let name = cr"datafusion_table_provider".into();
41 |
42 | let provider = Arc::new(self.clone());
43 | let provider = FFI_TableProvider::new(provider, false, None);
44 |
45 | PyCapsule::new_bound(py, provider, Some(name.clone()))
46 | }
47 | }
48 |
49 | Once you have this library available, you can construct a
50 | :py:class:`~datafusion.Table` in Python and register it with the
51 | ``SessionContext``.
52 |
53 | .. code-block:: python
54 |
55 | from datafusion import SessionContext, Table
56 |
57 | ctx = SessionContext()
58 | provider = MyTableProvider()
59 |
60 | ctx.register_table("capsule_table", provider)
61 |
62 | ctx.table("capsule_table").show()
63 |
--------------------------------------------------------------------------------
/dev/release/release-tarball.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | #
20 |
21 | # Adapted from https://github.com/apache/arrow-rs/tree/master/dev/release/release-tarball.sh
22 |
23 | # This script copies a tarball from the "dev" area of the
24 | # dist.apache.arrow repository to the "release" area
25 | #
26 | # This script should only be run after the release has been approved
27 | # by the arrow PMC committee.
28 | #
29 | # See release/README.md for full release instructions
30 | #
31 | # Based in part on post-01-upload.sh from apache/arrow
32 |
33 |
set -e
set -u

# Require exactly two arguments: the approved release version and the
# release-candidate number that was voted on. The "<version> <rc>"
# placeholders were lost to extraction garbling in the original usage
# string; restored from the example on the next line. Exit non-zero so
# callers and CI notice the usage error (the original exited 0).
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <version> <rc>"
echo "ex. $0 4.1.0 2"
exit 1
fi

version=$1
rc=$2

# Scratch working copy for the svn checkouts; recreated on every run.
tmp_dir=tmp-apache-datafusion-python-dist

echo "Recreate temporary directory: ${tmp_dir}"
rm -rf ${tmp_dir}
mkdir -p ${tmp_dir}

echo "Clone dev dist repository"
svn \
  co \
  https://dist.apache.org/repos/dist/dev/datafusion/apache-datafusion-python-${version}-rc${rc} \
  ${tmp_dir}/dev

echo "Clone release dist repository"
svn co https://dist.apache.org/repos/dist/release/datafusion ${tmp_dir}/release

# Promote the approved RC artifacts into the release area under the
# final (non-rc) version directory.
echo "Copy ${version}-rc${rc} to release working copy"
release_version=datafusion-python-${version}
mkdir -p ${tmp_dir}/release/${release_version}
cp -r ${tmp_dir}/dev/* ${tmp_dir}/release/${release_version}/
svn add ${tmp_dir}/release/${release_version}

echo "Commit release"
svn ci -m "Apache DataFusion Python ${version}" ${tmp_dir}/release

echo "Clean up"
rm -rf ${tmp_dir}

echo "Success! The release is available here:"
echo "  https://dist.apache.org/repos/dist/release/datafusion/${release_version}"
75 |
--------------------------------------------------------------------------------
/src/expr/subquery.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | use std::fmt::{self, Display, Formatter};
19 |
20 | use datafusion::logical_expr::Subquery;
21 | use pyo3::prelude::*;
22 | use pyo3::IntoPyObjectExt;
23 |
24 | use super::logical_node::LogicalNode;
25 | use crate::sql::logical::PyLogicalPlan;
26 |
/// Python wrapper around DataFusion's [`Subquery`] expression
/// (a scalar subquery embedded in an outer query).
#[pyclass(frozen, name = "Subquery", module = "datafusion.expr", subclass)]
#[derive(Clone)]
pub struct PySubquery {
    subquery: Subquery,
}
32 |
33 | impl From for Subquery {
34 | fn from(subquery: PySubquery) -> Self {
35 | subquery.subquery
36 | }
37 | }
38 |
39 | impl From for PySubquery {
40 | fn from(subquery: Subquery) -> PySubquery {
41 | PySubquery { subquery }
42 | }
43 | }
44 |
// Multi-line, human-readable rendering used by `__repr__`. The embedded
// newline inside the string literal is intentional output formatting.
// NOTE(review): indentation inside the literal was lost in extraction;
// confirm against the repository before changing it.
impl Display for PySubquery {
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        write!(
            f,
            "Subquery
Subquery: {:?}
outer_ref_columns: {:?}",
            self.subquery.subquery, self.subquery.outer_ref_columns,
        )
    }
}
56 |
57 | #[pymethods]
58 | impl PySubquery {
59 | /// Retrieves the input `LogicalPlan` to this `Projection` node
60 | fn input(&self) -> PyResult> {
61 | Ok(Self::inputs(self))
62 | }
63 |
64 | fn __repr__(&self) -> PyResult {
65 | Ok(format!("Subquery({self})"))
66 | }
67 |
68 | fn __name__(&self) -> PyResult {
69 | Ok("Subquery".to_string())
70 | }
71 | }
72 |
73 | impl LogicalNode for PySubquery {
74 | fn inputs(&self) -> Vec {
75 | vec![]
76 | }
77 |
78 | fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> {
79 | self.clone().into_bound_py_any(py)
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
/src/pyarrow_util.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | //! Conversions between PyArrow and DataFusion types
19 |
20 | use arrow::array::{Array, ArrayData};
21 | use arrow::pyarrow::{FromPyArrow, ToPyArrow};
22 | use datafusion::scalar::ScalarValue;
23 | use pyo3::types::{PyAnyMethods, PyList};
24 | use pyo3::{Bound, FromPyObject, PyAny, PyObject, PyResult, Python};
25 |
26 | use crate::common::data_type::PyScalarValue;
27 | use crate::errors::PyDataFusionError;
28 |
29 | impl FromPyArrow for PyScalarValue {
30 | fn from_pyarrow_bound(value: &Bound<'_, PyAny>) -> PyResult {
31 | let py = value.py();
32 | let typ = value.getattr("type")?;
33 | let val = value.call_method0("as_py")?;
34 |
35 | // construct pyarrow array from the python value and pyarrow type
36 | let factory = py.import("pyarrow")?.getattr("array")?;
37 | let args = PyList::new(py, [val])?;
38 | let array = factory.call1((args, typ))?;
39 |
40 | // convert the pyarrow array to rust array using C data interface
41 | let array = arrow::array::make_array(ArrayData::from_pyarrow_bound(&array)?);
42 | let scalar = ScalarValue::try_from_array(&array, 0).map_err(PyDataFusionError::from)?;
43 |
44 | Ok(PyScalarValue(scalar))
45 | }
46 | }
47 |
48 | impl<'source> FromPyObject<'source> for PyScalarValue {
49 | fn extract_bound(value: &Bound<'source, PyAny>) -> PyResult {
50 | Self::from_pyarrow_bound(value)
51 | }
52 | }
53 |
54 | pub fn scalar_to_pyarrow(scalar: &ScalarValue, py: Python) -> PyResult {
55 | let array = scalar.to_array().map_err(PyDataFusionError::from)?;
56 | // convert to pyarrow array using C data interface
57 | let pyarray = array.to_data().to_pyarrow(py)?;
58 | let pyscalar = pyarray.call_method1(py, "__getitem__", (0,))?;
59 |
60 | Ok(pyscalar)
61 | }
62 |
--------------------------------------------------------------------------------