├── .asf.yaml ├── .devcontainer └── devcontainer.json ├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── actions │ └── setup-builder │ │ └── action.yaml ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── build.yml │ ├── cancel.yml │ ├── dev.yml │ ├── dev_pr.yml │ ├── dev_pr │ └── labeler.yml │ ├── docker.yml │ ├── docs.yaml │ └── rust.yml ├── .github_changelog_generator ├── .gitignore ├── .gitmodules ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── ROADMAP.md ├── ballista-cli ├── Cargo.toml ├── Dockerfile ├── README.md └── src │ ├── command.rs │ ├── exec.rs │ ├── lib.rs │ └── main.rs ├── ballista ├── client │ ├── Cargo.toml │ ├── README.md │ ├── src │ │ ├── extension.rs │ │ ├── lib.rs │ │ └── prelude.rs │ ├── testdata │ │ ├── alltypes_plain.parquet │ │ └── single_nan.parquet │ └── tests │ │ ├── common │ │ └── mod.rs │ │ ├── context_basic.rs │ │ ├── context_checks.rs │ │ ├── context_setup.rs │ │ └── context_unsupported.rs ├── core │ ├── Cargo.toml │ ├── README.md │ ├── build.rs │ ├── proto │ │ ├── ballista.proto │ │ ├── datafusion.proto │ │ └── datafusion_common.proto │ ├── src │ │ ├── client.rs │ │ ├── config.rs │ │ ├── consistent_hash │ │ │ ├── mod.rs │ │ │ └── node.rs │ │ ├── diagram.rs │ │ ├── error.rs │ │ ├── event_loop.rs │ │ ├── execution_plans │ │ │ ├── distributed_query.rs │ │ │ ├── mod.rs │ │ │ ├── shuffle_reader.rs │ │ │ ├── shuffle_writer.rs │ │ │ └── unresolved_shuffle.rs │ │ ├── extension.rs │ │ ├── lib.rs │ │ ├── object_store.rs │ │ ├── planner.rs │ │ ├── registry.rs │ │ ├── serde │ │ │ ├── generated │ │ │ │ ├── ballista.rs │ │ │ │ └── mod.rs │ │ │ ├── mod.rs │ │ │ └── scheduler │ │ │ │ ├── from_proto.rs │ │ │ │ ├── mod.rs │ │ │ │ └── to_proto.rs │ │ └── utils.rs │ └── tests │ │ └── customer.csv ├── executor │ ├── Cargo.toml │ ├── README.md │ ├── build.rs │ ├── examples │ │ └── 
example_executor_config.toml │ ├── executor_config_spec.toml │ └── src │ │ ├── bin │ │ └── main.rs │ │ ├── collect.rs │ │ ├── config.rs │ │ ├── cpu_bound_executor.rs │ │ ├── execution_engine.rs │ │ ├── execution_loop.rs │ │ ├── executor.rs │ │ ├── executor_process.rs │ │ ├── executor_server.rs │ │ ├── flight_service.rs │ │ ├── lib.rs │ │ ├── metrics │ │ └── mod.rs │ │ ├── shutdown.rs │ │ ├── standalone.rs │ │ └── terminate.rs └── scheduler │ ├── Cargo.toml │ ├── README.md │ ├── build.rs │ ├── proto │ └── keda.proto │ ├── scheduler_config_spec.toml │ ├── src │ ├── api │ │ ├── handlers.rs │ │ └── mod.rs │ ├── bin │ │ └── main.rs │ ├── cluster │ │ ├── event │ │ │ └── mod.rs │ │ ├── memory.rs │ │ ├── mod.rs │ │ └── test_util │ │ │ └── mod.rs │ ├── config.rs │ ├── display.rs │ ├── lib.rs │ ├── metrics │ │ ├── mod.rs │ │ └── prometheus.rs │ ├── planner.rs │ ├── scheduler_process.rs │ ├── scheduler_server │ │ ├── event.rs │ │ ├── external_scaler.rs │ │ ├── grpc.rs │ │ ├── mod.rs │ │ └── query_stage_scheduler.rs │ ├── standalone.rs │ ├── state │ │ ├── execution_graph.rs │ │ ├── execution_graph_dot.rs │ │ ├── execution_stage.rs │ │ ├── executor_manager.rs │ │ ├── mod.rs │ │ ├── session_manager.rs │ │ └── task_manager.rs │ └── test_utils.rs │ └── testdata │ ├── customer │ └── customer.tbl │ ├── lineitem │ ├── partition0.tbl │ └── partition1.tbl │ ├── nation │ └── nation.tbl │ ├── orders │ └── orders.tbl │ ├── part │ └── part.tbl │ ├── partsupp │ └── partsupp.tbl │ ├── region │ └── region.tbl │ └── supplier │ └── supplier.tbl ├── benchmarks ├── .dockerignore ├── .gitignore ├── Cargo.toml ├── README.md ├── db-benchmark │ ├── README.md │ ├── db-benchmark.Dockerfile │ ├── groupby-datafusion.py │ ├── join-datafusion.py │ └── run-bench.sh ├── queries │ ├── q1.sql │ ├── q10.sql │ ├── q11.sql │ ├── q12.sql │ ├── q13.sql │ ├── q14.sql │ ├── q15.sql │ ├── q16.sql │ ├── q17.sql │ ├── q18.sql │ ├── q19.sql │ ├── q2.sql │ ├── q20.sql │ ├── q21.sql │ ├── q22.sql │ ├── q3.sql │ ├── q4.sql 
│ ├── q5.sql │ ├── q6.sql │ ├── q7.sql │ ├── q8.sql │ └── q9.sql ├── run.sh ├── spark │ ├── .gitignore │ ├── README.md │ ├── pom.xml │ └── src │ │ └── main │ │ └── scala │ │ └── org │ │ └── apache │ │ └── arrow │ │ └── SparkTpch.scala ├── src │ └── bin │ │ ├── nyctaxi.rs │ │ └── tpch.rs ├── tpch-gen.sh └── tpch.py ├── ci └── scripts │ ├── rust_clippy.sh │ ├── rust_fmt.sh │ └── rust_toml_fmt.sh ├── clippy.toml ├── dev ├── build-ballista-docker.sh ├── build-ballista-executables.sh ├── build-set-env.sh ├── create_license.py ├── docker │ ├── ballista-benchmarks.Dockerfile │ ├── ballista-builder.Dockerfile │ ├── ballista-cli.Dockerfile │ ├── ballista-executor.Dockerfile │ ├── ballista-scheduler.Dockerfile │ ├── ballista-standalone.Dockerfile │ ├── builder-entrypoint.sh │ ├── cli-entrypoint.sh │ ├── executor-entrypoint.sh │ ├── scheduler-entrypoint.sh │ └── standalone-entrypoint.sh ├── integration-tests.sh ├── release │ ├── README.md │ ├── check-rat-report.py │ ├── crate-deps.dot │ ├── crate-deps.svg │ ├── create-tarball.sh │ ├── download-python-wheels.py │ ├── generate-changelog.py │ ├── rat_exclude_files.txt │ ├── release-tarball.sh │ ├── run-rat.sh │ └── verify-release-candidate.sh ├── rust_lint.sh ├── update_arrow_deps.py ├── update_ballista_versions.py └── update_datafusion_versions.py ├── docker-compose.yml ├── docs ├── .gitignore ├── Makefile ├── README.md ├── build.sh ├── developer │ ├── README.md │ ├── architecture.md │ └── images │ │ └── query-execution.png ├── make.bat ├── requirements.txt └── source │ ├── _static │ ├── images │ │ ├── ballista-logo.png │ │ ├── ballista-logo.svg │ │ ├── ballista_black.png │ │ ├── ballista_black.svg │ │ ├── ballista_white.png │ │ ├── ballista_white.svg │ │ ├── tpch_allqueries.png │ │ ├── tpch_queries_compare.png │ │ ├── tpch_queries_speedup_abs.png │ │ └── tpch_queries_speedup_rel.png │ └── theme_overrides.css │ ├── _templates │ ├── docs-sidebar.html │ └── layout.html │ ├── community │ └── communication.md │ ├── conf.py │ ├── 
contributors-guide │ ├── architecture.md │ ├── ballista.drawio.png │ ├── ballista_architecture.excalidraw.svg │ ├── code-organization.md │ └── development.md │ ├── index.rst │ └── user-guide │ ├── cli.md │ ├── configs.md │ ├── deployment │ ├── cargo-install.md │ ├── docker-compose.md │ ├── docker.md │ ├── index.rst │ ├── kubernetes.md │ └── quick-start.md │ ├── extending-components.md │ ├── faq.md │ ├── images │ ├── ballista-web-ui.png │ └── example-query-plan.png │ ├── introduction.md │ ├── metrics.md │ ├── python.md │ ├── rust.md │ ├── scheduler.md │ └── tuning-guide.md ├── examples ├── Cargo.toml ├── README.md ├── examples │ ├── custom-client.rs │ ├── custom-executor.rs │ ├── custom-scheduler.rs │ ├── remote-dataframe.rs │ ├── remote-sql.rs │ └── standalone-sql.rs ├── src │ ├── lib.rs │ └── test_util.rs ├── testdata │ ├── aggregate_test_100.csv │ └── alltypes_plain.parquet └── tests │ ├── common │ └── mod.rs │ └── object_store.rs ├── header ├── pre-commit.sh ├── python ├── .cargo │ └── config.toml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE.txt ├── README.md ├── ballista │ ├── __init__.py │ └── tests │ │ ├── __init__.py │ │ └── test_context.py ├── examples │ ├── client_remote.py │ ├── client_standalone.py │ ├── executor.py │ ├── readme_remote.py │ ├── readme_standalone.py │ └── scheduler.py ├── pyproject.toml ├── requirements.txt ├── src │ ├── cluster.rs │ ├── codec.rs │ ├── lib.rs │ └── utils.rs └── testdata │ ├── test.csv │ └── test.parquet ├── rust-toolchain.toml └── rustfmt.toml /.asf.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. 
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | notifications: 19 | commits: commits@datafusion.apache.org 20 | issues: github@datafusion.apache.org 21 | pullrequests: github@datafusion.apache.org 22 | jira_options: link label worklog 23 | github: 24 | description: "Apache DataFusion Ballista Distributed Query Engine" 25 | homepage: https://datafusion.apache.org/ballista 26 | labels: 27 | - arrow 28 | - big-data 29 | - dataframe 30 | - distributed 31 | - olap 32 | - python 33 | - query-engine 34 | - rust 35 | - sql 36 | enabled_merge_buttons: 37 | squash: true 38 | merge: false 39 | rebase: false 40 | features: 41 | issues: true 42 | # publishes the content of the `asf-site` branch to 43 | # https://datafusion.apache.org/ballista/ 44 | publish: 45 | whoami: asf-site 46 | subdir: ballista -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/rust 3 | { 4 | "name": "datafusion-ballista", 5 | // Or use a Dockerfile or Docker Compose file. 
More info: https://containers.dev/guide/dockerfile 6 | "image": "mcr.microsoft.com/devcontainers/rust:latest", 7 | "features": { 8 | "ghcr.io/devcontainers/features/rust:latest": { 9 | "version": "latest", 10 | "profile": "complete" 11 | }, 12 | "ghcr.io/devcontainers-contrib/features/protoc:1": {}, 13 | "ghcr.io/devcontainers/features/node:1": {}, 14 | "ghcr.io/devcontainers/features/docker-in-docker:2": {}, 15 | } 16 | // Use 'mounts' to make the cargo cache persistent in a Docker Volume. 17 | , 18 | "mounts": [ 19 | { 20 | "source": "devcontainer-cargo-cache-${devcontainerId}", 21 | "target": "/usr/local/cargo", 22 | "type": "volume" 23 | } 24 | ], 25 | // Features to add to the dev container. More info: https://containers.dev/features. 26 | // "features": {}, 27 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 28 | "forwardPorts": [ 29 | 50050, 30 | 3000 31 | ], 32 | // Use 'postCreateCommand' to run commands after the container is created. 33 | "postCreateCommand": "rustc --version", 34 | // Configure tool-specific properties. 35 | // "customizations": {}, 36 | // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 
37 | // "remoteUser": "root" 38 | } -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | ci 3 | conbench 4 | dev/dist 5 | dev/release 6 | python 7 | **/docs 8 | target/ 9 | CHANGELOG.md 10 | **/tests 11 | **/data 12 | !target/release/ballista-scheduler 13 | !target/release/ballista-executor 14 | !target/release/ballista-cli 15 | !target/release/tpch 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Additional context** 20 | Add any other context about the problem here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem or challenge? Please describe what you are trying to do.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | (This section helps Ballista developers understand the context and *why* for this feature, in addition to the *what*) 13 | 14 | **Describe the solution you'd like** 15 | A clear and concise description of what you want to happen. 
16 | 17 | **Describe alternatives you've considered** 18 | A clear and concise description of any alternative solutions or features you've considered. 19 | 20 | **Additional context** 21 | Add any other context or screenshots about the feature request here. 22 | -------------------------------------------------------------------------------- /.github/actions/setup-builder/action.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | name: Prepare Rust Builder 19 | description: 'Prepare Rust Build Environment' 20 | inputs: 21 | rust-version: 22 | description: 'version of rust to install (e.g. 
stable)' 23 | required: true 24 | default: 'stable' 25 | runs: 26 | using: "composite" 27 | steps: 28 | - name: Install Build Dependencies 29 | shell: bash 30 | run: | 31 | apt-get update 32 | apt-get install -y protobuf-compiler 33 | - name: Setup Rust toolchain 34 | shell: bash 35 | run: | 36 | echo "Installing ${{ inputs.rust-version }}" 37 | rustup toolchain install ${{ inputs.rust-version }} 38 | rustup default ${{ inputs.rust-version }} 39 | rustup component add rustfmt 40 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | version: 2 19 | updates: 20 | - package-ecosystem: cargo 21 | directory: "/" 22 | schedule: 23 | interval: daily 24 | open-pull-requests-limit: 10 25 | target-branch: main 26 | labels: [auto-dependencies] 27 | ignore: 28 | # arrow and datafusion are bumped manually 29 | - dependency-name: "arrow*" 30 | update-types: ["version-update:semver-major"] 31 | - dependency-name: "datafusion*" 32 | update-types: ["version-update:semver-major"] 33 | - dependency-name: "sqlparser" 34 | update-types: ["version-update:semver-major"] 35 | - package-ecosystem: "github-actions" 36 | directory: "/" 37 | schedule: 38 | interval: "daily" 39 | open-pull-requests-limit: 10 40 | labels: [auto-dependencies] 41 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Which issue does this PR close? 2 | 3 | 6 | 7 | Closes #. 8 | 9 | # Rationale for this change 10 | 14 | 15 | # What changes are included in this PR? 16 | 19 | 20 | # Are there any user-facing changes? 21 | 24 | 25 | 28 | -------------------------------------------------------------------------------- /.github/workflows/cancel.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | name: Cancel stale runs 19 | 20 | on: 21 | workflow_run: 22 | # The name of another workflow (whichever one) that always runs on PRs 23 | workflows: ['Dev'] 24 | types: ['requested'] 25 | 26 | jobs: 27 | cancel-stale-workflow-runs: 28 | name: "Cancel stale workflow runs" 29 | runs-on: ubuntu-latest 30 | steps: 31 | # Unfortunately, we need to define a separate cancellation step for 32 | # each workflow where we want to cancel stale runs. 33 | - uses: potiuk/cancel-workflow-runs@master 34 | name: "Cancel stale Dev runs" 35 | with: 36 | cancelMode: allDuplicates 37 | token: ${{ secrets.GITHUB_TOKEN }} 38 | workflowFileName: dev.yml 39 | skipEventTypes: '["push", "schedule"]' 40 | - uses: potiuk/cancel-workflow-runs@master 41 | name: "Cancel stale Rust runs" 42 | with: 43 | cancelMode: allDuplicates 44 | token: ${{ secrets.GITHUB_TOKEN }} 45 | workflowFileName: rust.yml 46 | skipEventTypes: '["push", "schedule"]' 47 | -------------------------------------------------------------------------------- /.github/workflows/dev.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | name: Dev 19 | on: [push, pull_request] 20 | 21 | jobs: 22 | lint: 23 | name: Lint C++, Python, R, Rust, Docker 24 | runs-on: ubuntu-latest 25 | steps: 26 | - name: Checkout Arrow 27 | uses: actions/checkout@v4 28 | with: 29 | repository: apache/arrow 30 | submodules: true 31 | fetch-depth: 0 32 | - name: Setup Python 33 | uses: actions/setup-python@v4 34 | with: 35 | python-version: "3.10" 36 | - name: Setup Archery 37 | run: pip install -e dev/archery[lint] 38 | - name: Lint 39 | run: archery lint --rat 40 | 41 | rat: 42 | name: Release Audit Tool (RAT) 43 | runs-on: ubuntu-latest 44 | steps: 45 | - name: Checkout 46 | uses: actions/checkout@v4 47 | - name: Setup Python 48 | uses: actions/setup-python@v4 49 | with: 50 | python-version: "3.10" 51 | - name: Audit licenses 52 | run: ./dev/release/run-rat.sh . 
53 | 54 | prettier: 55 | name: Use prettier to check formatting of documents 56 | runs-on: ubuntu-latest 57 | steps: 58 | - uses: actions/checkout@v4 59 | - uses: actions/setup-node@v4 60 | with: 61 | node-version: "14" 62 | - name: Prettier check 63 | run: | 64 | # if this step fails, run the command below locally (it uses --write to reformat the files) 65 | # and commit the changes 66 | # 67 | # ignore subproject CHANGELOG.md because they are machine generated 68 | npx prettier@2.3.2 --write \ 69 | '{ballista,docs}/**/*.md' \ 70 | '!ballista/CHANGELOG.md' \ 71 | README.md \ 72 | CONTRIBUTING.md 73 | git diff --exit-code 74 | -------------------------------------------------------------------------------- /.github/workflows/dev_pr.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | name: Labeler 19 | 20 | on: 21 | pull_request_target: 22 | types: 23 | - opened 24 | - edited 25 | - synchronize 26 | 27 | jobs: 28 | process: 29 | name: Process 30 | runs-on: ubuntu-latest 31 | steps: 32 | - uses: actions/checkout@v4 33 | 34 | - name: Assign GitHub labels 35 | if: | 36 | github.event_name == 'pull_request_target' && 37 | (github.event.action == 'opened' || 38 | github.event.action == 'synchronize') 39 | uses: actions/labeler@v4.3.0 40 | with: 41 | repo-token: ${{ secrets.GITHUB_TOKEN }} 42 | configuration-path: .github/workflows/dev_pr/labeler.yml 43 | sync-labels: true 44 | 45 | # TODO: Enable this when eps1lon/actions-label-merge-conflict is available. 46 | # - name: Checks if PR needs rebase 47 | # if: | 48 | # github.event_name == 'push' || 49 | # (github.event_name == 'pull_request_target' && 50 | # (github.event.action == 'opened' || 51 | # github.event.action == 'synchronize')) 52 | # uses: eps1lon/actions-label-merge-conflict@releases/2.x 53 | # with: 54 | # dirtyLabel: "needs-rebase" 55 | # repoToken: "${{ secrets.GITHUB_TOKEN }}" 56 | -------------------------------------------------------------------------------- /.github/workflows/dev_pr/labeler.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | python: 19 | - python/**/* 20 | 21 | development-process: 22 | - dev/**.* 23 | - .github/**.* 24 | - ci/**.* 25 | - .asf.yaml 26 | 27 | documentation: 28 | - docs/**.* 29 | - README.md 30 | - ./**/README.md 31 | - DEVELOPERS.md 32 | - ballista/docs/**.* 33 | -------------------------------------------------------------------------------- /.github/workflows/docs.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | on: 19 | push: 20 | branches: 21 | - main 22 | paths: 23 | - .asf.yaml 24 | - .github/workflows/docs.yaml 25 | - docs/** 26 | 27 | name: Deploy DataFusion Ballista site 28 | 29 | jobs: 30 | build-docs: 31 | name: Build docs 32 | runs-on: ubuntu-latest 33 | steps: 34 | - name: Checkout docs sources 35 | uses: actions/checkout@v4 36 | 37 | - name: Checkout asf-site branch 38 | uses: actions/checkout@v4 39 | with: 40 | ref: asf-site 41 | path: asf-site 42 | 43 | - name: Setup Python 44 | uses: actions/setup-python@v5 45 | with: 46 | python-version: "3.10" 47 | 48 | - name: Install dependencies 49 | run: | 50 | set -x 51 | python3 -m venv venv 52 | source venv/bin/activate 53 | pip install -r docs/requirements.txt 54 | 55 | - name: Build docs 56 | run: | 57 | set -x 58 | source venv/bin/activate 59 | cd docs 60 | ./build.sh 61 | 62 | - name: Copy & push the generated HTML 63 | run: | 64 | set -x 65 | cd asf-site/ 66 | rsync \ 67 | -a \ 68 | --delete \ 69 | --exclude '/.git/' \ 70 | ../docs/build/html/ \ 71 | ./ 72 | cp ../.asf.yaml . 73 | touch .nojekyll 74 | git status --porcelain 75 | if [ "$(git status --porcelain)" != "" ]; then 76 | git config user.name "github-actions[bot]" 77 | git config user.email "github-actions[bot]@users.noreply.github.com" 78 | git add --all 79 | git commit -m 'Publish built docs triggered by ${{ github.sha }}' 80 | git push || git push --force 81 | fi -------------------------------------------------------------------------------- /.github_changelog_generator: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. 
The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # 20 | 21 | # some issues are just documentation 22 | add-sections={"documentation":{"prefix":"**Documentation updates:**","labels":["documentation"]},"performance":{"prefix":"**Performance improvements:**","labels":["performance"]}} 23 | # uncomment to hide PRs. TBD whether we show them or not. 24 | #pull-requests=false 25 | # so that the component is shown associated with the issue 26 | issue-line-labels=sql 27 | exclude-labels=development-process,invalid 28 | breaking-labels=api change 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | apache-rat-*.jar 19 | rat.txt 20 | filtered_rat.txt 21 | arrow-src.tar 22 | arrow-src.tar.gz 23 | CHANGELOG.md.bak 24 | Cargo.toml.bak 25 | 26 | # Compiled source 27 | *.a 28 | *.dll 29 | *.o 30 | *.py[ocd] 31 | *.so 32 | *.so.* 33 | *.bundle 34 | *.dylib 35 | .build_cache_dir 36 | dependency-reduced-pom.xml 37 | MANIFEST 38 | compile_commands.json 39 | build.ninja 40 | 41 | # Generated Visual Studio files 42 | *.vcxproj 43 | *.vcxproj.* 44 | *.sln 45 | *.iml 46 | 47 | # Linux perf sample data 48 | perf.data 49 | perf.data.old 50 | 51 | cpp/.idea/ 52 | .clangd/ 53 | cpp/.clangd/ 54 | cpp/apidoc/xml/ 55 | docs/example.gz 56 | docs/example1.dat 57 | docs/example3.dat 58 | python/.eggs/ 59 | python/doc/ 60 | # Egg metadata 61 | *.egg-info 62 | 63 | .vscode 64 | .idea/ 65 | .pytest_cache/ 66 | pkgs 67 | docker_cache 68 | .gdb_history 69 | *.orig 70 | .*.swp 71 | .*.swo 72 | 73 | site/ 74 | 75 | # R files 76 | **/.Rproj.user 77 | **/*.Rcheck/ 78 | **/.Rhistory 79 | .Rproj.user 80 | 81 | # macOS 82 | cpp/Brewfile.lock.json 83 | .DS_Store 84 | 85 | # docker volumes used for caching 86 | .docker 87 | 88 | # Rust 89 | target 90 | # Cargo.lock 91 | !ballista-cli/Cargo.lock 92 | 93 | rusty-tags.vi 94 | .history 95 | .flatbuffers/ 96 | 97 | .vscode 98 | venv/ 99 | 100 | # apache release artifacts 101 | dev/dist 102 | 103 | # logs 104 | logs/ 105 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apache/datafusion-ballista/97c919274d9de496b630e66c12ad29c3fccd110b/.gitmodules -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Code of Conduct 21 | 22 | - [Code of Conduct for The Apache Software Foundation][1] 23 | 24 | [1]: https://www.apache.org/foundation/policies/conduct.html 25 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | [workspace] 19 | exclude = ["python"] 20 | members = ["ballista-cli", "ballista/client", "ballista/core", "ballista/executor", "ballista/scheduler", "benchmarks", "examples"] 21 | resolver = "2" 22 | 23 | [workspace.dependencies] 24 | arrow = { version = "55", features = ["ipc_compression"] } 25 | arrow-flight = { version = "55", features = ["flight-sql-experimental"] } 26 | clap = { version = "4.5", features = ["derive", "cargo"] } 27 | configure_me = { version = "0.4.0" } 28 | configure_me_codegen = { version = "0.4.4" } 29 | datafusion = "47.0.0" 30 | datafusion-cli = "47.0.0" 31 | datafusion-proto = "47.0.0" 32 | datafusion-proto-common = "47.0.0" 33 | object_store = "0.12" 34 | prost = "0.13" 35 | prost-types = "0.13" 36 | tonic = { version = "0.12" } 37 | tonic-build = { version = "0.12", default-features = false, features = [ 38 | "transport", 39 | "prost" 40 | ] } 41 | tracing = "0.1" 42 | tracing-appender = "0.2.2" 43 | tracing-subscriber = { version = "0.3", features = ["env-filter"] } 44 | ctor = { version = "0.4" } 45 | mimalloc = { version = "0.1" } 46 | 47 | tokio = { version = "1" } 48 | uuid = { version = "1.13", features = ["v4", "v7"] } 49 | rand = { version = "0.9" } 50 | env_logger = { version = "0.11" } 51 | futures = { version = "0.3" } 52 | log = { version = "0.4" } 53 | parking_lot = { version = "0.12" } 54 | tempfile = { version = "3.16" } 55 | dashmap = { version = "6.1" } 56 | async-trait = { version = "0.1" } 57 | serde = { version = "1.0" } 58 | tokio-stream = { version = "0.1" } 59 | url = { version = "2.5" } 60 | 61 | # cargo build --profile release-lto 62 | [profile.release-lto] 63 | codegen-units = 1 64 | inherits = "release" 65 | lto = true 66 | 67 | # the release profile takes a long time to build so we can use this profile during development to save time 68 | # cargo build --profile release-nonlto 69 | [profile.release-nonlto] 70 | codegen-units = 16 71 | debug = false 72 | debug-assertions = false 73 | incremental = 
false 74 | inherits = "release" 75 | lto = false 76 | opt-level = 3 77 | overflow-checks = false 78 | panic = 'unwind' 79 | rpath = false 80 | -------------------------------------------------------------------------------- /ballista-cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | [package] 19 | name = "ballista-cli" 20 | description = "Command Line Client for Ballista distributed query engine." 
21 | version = "47.0.0" 22 | authors = ["Apache DataFusion "] 23 | edition = "2021" 24 | keywords = ["ballista", "cli"] 25 | license = "Apache-2.0" 26 | homepage = "https://datafusion.apache.org/ballista/" 27 | repository = "https://github.com/apache/datafusion-ballista" 28 | readme = "README.md" 29 | 30 | [dependencies] 31 | ballista = { path = "../ballista/client", version = "47.0.0", features = ["standalone"] } 32 | clap = { workspace = true, features = ["derive", "cargo"] } 33 | datafusion = { workspace = true } 34 | datafusion-cli = { workspace = true } 35 | dirs = "6.0" 36 | env_logger = { workspace = true } 37 | mimalloc = { workspace = true } 38 | rustyline = "15.0.0" 39 | tokio = { workspace = true, features = ["macros", "rt", "rt-multi-thread", "sync", "parking_lot"] } 40 | 41 | [features] 42 | 43 | -------------------------------------------------------------------------------- /ballista-cli/Dockerfile: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | FROM rust:1.76-bullseye as builder 19 | 20 | COPY ./ballista-cli /usr/src/ballista-cli 21 | 22 | COPY ./ballista /usr/src/ballista 23 | 24 | COPY ./benchmarks /usr/src/benchmarks 25 | 26 | COPY ./examples /usr/src/examples 27 | 28 | COPY ./Cargo.toml /usr/src/Cargo.toml 29 | 30 | WORKDIR /usr/src/ballista-cli 31 | 32 | RUN apt-get update && apt-get install -y protobuf-compiler 33 | 34 | RUN rustup component add rustfmt 35 | 36 | RUN cargo build --release 37 | 38 | FROM debian:bullseye-slim 39 | 40 | COPY --from=builder /usr/src/target/release/ballista-cli /usr/local/bin 41 | 42 | ENTRYPOINT ["ballista-cli"] 43 | 44 | CMD ["--data-path", "/data"] 45 | -------------------------------------------------------------------------------- /ballista-cli/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | #![doc = include_str!("../README.md")] 19 | pub const BALLISTA_CLI_VERSION: &str = env!("CARGO_PKG_VERSION"); 20 | 21 | pub mod command; 22 | pub mod exec; 23 | 24 | pub use datafusion_cli::{functions, helper, print_format, print_options}; 25 | -------------------------------------------------------------------------------- /ballista/client/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | [package] 19 | name = "ballista" 20 | description = "Ballista Distributed Compute" 21 | license = "Apache-2.0" 22 | version = "47.0.0" 23 | homepage = "https://datafusion.apache.org/ballista/" 24 | repository = "https://github.com/apache/datafusion-ballista" 25 | readme = "README.md" 26 | authors = ["Apache DataFusion "] 27 | edition = "2021" 28 | 29 | [dependencies] 30 | async-trait = { workspace = true } 31 | ballista-core = { path = "../core", version = "47.0.0" } 32 | ballista-executor = { path = "../executor", version = "47.0.0", optional = true } 33 | ballista-scheduler = { path = "../scheduler", version = "47.0.0", optional = true } 34 | datafusion = { workspace = true } 35 | log = { workspace = true } 36 | 37 | tokio = { workspace = true } 38 | url = { workspace = true } 39 | 40 | [dev-dependencies] 41 | ballista-executor = { path = "../executor", version = "47.0.0" } 42 | ballista-scheduler = { path = "../scheduler", version = "47.0.0" } 43 | ctor = { workspace = true } 44 | datafusion-proto = { workspace = true } 45 | env_logger = { workspace = true } 46 | rstest = { version = "0.25" } 47 | tempfile = { workspace = true } 48 | tonic = { workspace = true } 49 | 50 | [features] 51 | default = ["standalone"] 52 | standalone = ["ballista-executor", "ballista-scheduler"] 53 | -------------------------------------------------------------------------------- /ballista/client/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | #![doc = include_str!("../README.md")] 19 | 20 | pub mod extension; 21 | pub mod prelude; 22 | pub use datafusion; 23 | -------------------------------------------------------------------------------- /ballista/client/src/prelude.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | //! 
Ballista Prelude (common imports) 19 | 20 | // pub use ballista_core::{ 21 | // config::BallistaConfig, 22 | // error::{BallistaError, Result}, 23 | // }; 24 | 25 | pub use crate::extension::{SessionConfigExt, SessionContextExt}; 26 | //pub use futures::StreamExt; 27 | -------------------------------------------------------------------------------- /ballista/client/testdata/alltypes_plain.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-ballista/97c919274d9de496b630e66c12ad29c3fccd110b/ballista/client/testdata/alltypes_plain.parquet -------------------------------------------------------------------------------- /ballista/client/testdata/single_nan.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-ballista/97c919274d9de496b630e66c12ad29c3fccd110b/ballista/client/testdata/single_nan.parquet -------------------------------------------------------------------------------- /ballista/core/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Ballista Core Library 21 | 22 | This crate contains the Ballista core library which is used as a dependency by the `ballista-client`, 23 | `ballista-scheduler`, and `ballista-executor` crates. 24 | -------------------------------------------------------------------------------- /ballista/core/build.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::path::Path; 19 | 20 | fn main() -> Result<(), String> { 21 | use std::io::Write; 22 | 23 | let out = std::path::PathBuf::from(std::env::var("OUT_DIR").unwrap()); 24 | 25 | // for use in docker build where file changes can be wonky 26 | println!("cargo:rerun-if-env-changed=FORCE_REBUILD"); 27 | 28 | let version = rustc_version::version().unwrap(); 29 | println!("cargo:rustc-env=RUSTC_VERSION={version}"); 30 | 31 | // TODO: undo when resolved: https://github.com/intellij-rust/intellij-rust/issues/9402 32 | #[cfg(feature = "docsrs")] 33 | let path = out.join("ballista.rs"); 34 | #[cfg(not(feature = "docsrs"))] 35 | let path = "src/serde/generated/ballista.rs"; 36 | 37 | // We don't include the proto files in releases so that downstreams 38 | // do not need to have PROTOC included 39 | if Path::new("proto/datafusion.proto").exists() { 40 | println!("cargo:rerun-if-changed=proto/datafusion_common.proto"); 41 | println!("cargo:rerun-if-changed=proto/datafusion.proto"); 42 | println!("cargo:rerun-if-changed=proto/ballista.proto"); 43 | tonic_build::configure() 44 | .extern_path(".datafusion_common", "::datafusion_proto_common") 45 | .extern_path(".datafusion", "::datafusion_proto::protobuf") 46 | .protoc_arg("--experimental_allow_proto3_optional") 47 | .compile_protos(&["proto/ballista.proto"], &["proto"]) 48 | .map_err(|e| format!("protobuf compilation failed: {e}"))?; 49 | let generated_source_path = out.join("ballista.protobuf.rs"); 50 | let code = 
std::fs::read_to_string(generated_source_path).unwrap(); 51 | let mut file = std::fs::OpenOptions::new() 52 | .write(true) 53 | .truncate(true) 54 | .create(true) 55 | .open(path) 56 | .unwrap(); 57 | file.write_all(code.as_str().as_ref()).unwrap(); 58 | } 59 | 60 | Ok(()) 61 | } 62 | -------------------------------------------------------------------------------- /ballista/core/src/consistent_hash/node.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | pub trait Node { 19 | fn name(&self) -> &str; 20 | 21 | fn is_valid(&self) -> bool; 22 | } 23 | -------------------------------------------------------------------------------- /ballista/core/src/execution_plans/mod.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. 
The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | //! This module contains execution plans that are needed to distribute DataFusion's execution plans into 19 | //! several Ballista executors. 20 | 21 | mod distributed_query; 22 | mod shuffle_reader; 23 | mod shuffle_writer; 24 | mod unresolved_shuffle; 25 | 26 | pub use distributed_query::DistributedQueryExec; 27 | pub use shuffle_reader::ShuffleReaderExec; 28 | pub use shuffle_writer::ShuffleWriterExec; 29 | pub use unresolved_shuffle::UnresolvedShuffleExec; 30 | -------------------------------------------------------------------------------- /ballista/core/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | #![doc = include_str!("../README.md")] 19 | 20 | use std::sync::Arc; 21 | 22 | use datafusion::{execution::runtime_env::RuntimeEnv, prelude::SessionConfig}; 23 | pub const BALLISTA_VERSION: &str = env!("CARGO_PKG_VERSION"); 24 | 25 | pub fn print_version() { 26 | println!("Ballista version: {BALLISTA_VERSION}") 27 | } 28 | 29 | pub mod client; 30 | pub mod config; 31 | pub mod consistent_hash; 32 | pub mod diagram; 33 | pub mod error; 34 | pub mod event_loop; 35 | pub mod execution_plans; 36 | pub mod extension; 37 | #[cfg(feature = "build-binary")] 38 | pub mod object_store; 39 | pub mod planner; 40 | pub mod registry; 41 | pub mod serde; 42 | pub mod utils; 43 | 44 | /// 45 | /// [RuntimeProducer] is a factory which creates runtime [RuntimeEnv] 46 | /// from [SessionConfig]. 
As [SessionConfig] will be propagated 47 | /// from client to executors, this provides the possibility to 48 | /// create [RuntimeEnv] components and configure them according to 49 | /// [SessionConfig] or some of its config extensions 50 | /// 51 | /// It is intended to be used with executor configuration 52 | /// 53 | pub type RuntimeProducer = Arc< 54 | dyn Fn(&SessionConfig) -> datafusion::error::Result<Arc<RuntimeEnv>> + Send + Sync, 55 | >; 56 | /// 57 | /// [ConfigProducer] is a factory which can create [SessionConfig], with 58 | /// additional extensions or configuration codecs 59 | /// 60 | /// It is intended to be used with executor configuration 61 | /// 62 | pub type ConfigProducer = Arc<dyn Fn() -> SessionConfig + Send + Sync>; 63 | -------------------------------------------------------------------------------- /ballista/core/src/serde/generated/mod.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | // include the generated protobuf source as a submodule 19 | #[allow(clippy::all)] 20 | #[rustfmt::skip] 21 | #[cfg(not(docsrs))] 22 | pub mod ballista; 23 | 24 | #[cfg(docsrs)] 25 | #[allow(clippy::all)] 26 | pub mod ballista { 27 | include!(concat!(env!("OUT_DIR"), "/ballista.rs")); 28 | } 29 | -------------------------------------------------------------------------------- /ballista/core/tests/customer.csv: -------------------------------------------------------------------------------- 1 | andrew,100 2 | jorge,200 3 | andy,150 4 | paul,300 5 | -------------------------------------------------------------------------------- /ballista/executor/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | [package] 19 | name = "ballista-executor" 20 | description = "Ballista Distributed Compute - Executor" 21 | license = "Apache-2.0" 22 | version = "47.0.0" 23 | homepage = "https://datafusion.apache.org/ballista/" 24 | repository = "https://github.com/apache/datafusion-ballista" 25 | readme = "README.md" 26 | authors = ["Apache DataFusion "] 27 | edition = "2021" 28 | 29 | [package.metadata.configure_me.bin] 30 | executor = "executor_config_spec.toml" 31 | 32 | [[bin]] 33 | name = "ballista-executor" 34 | path = "src/bin/main.rs" 35 | required-features = ["build-binary"] 36 | 37 | [features] 38 | build-binary = ["configure_me", "tracing-subscriber", "tracing-appender", "tracing", "ballista-core/build-binary"] 39 | default = ["build-binary", "mimalloc"] 40 | 41 | [dependencies] 42 | arrow = { workspace = true } 43 | arrow-flight = { workspace = true } 44 | async-trait = { workspace = true } 45 | ballista-core = { path = "../core", version = "47.0.0" } 46 | configure_me = { workspace = true, optional = true } 47 | dashmap = { workspace = true } 48 | datafusion = { workspace = true } 49 | datafusion-proto = { workspace = true } 50 | futures = { workspace = true } 51 | log = { workspace = true } 52 | mimalloc = { workspace = true, optional = true } 53 | parking_lot = { workspace = true } 54 | tempfile = { workspace = true } 55 | tokio = { workspace = true, features = ["full"] } 56 | tokio-stream = { workspace = true, features = ["net"] } 57 | tonic = { workspace = true } 58 | tracing = { workspace = true, optional = true } 59 | tracing-appender = { workspace = true, optional = true } 60 | tracing-subscriber = { workspace = true, optional = true } 61 | uuid = { workspace = true } 62 | 63 | [dev-dependencies] 64 | 65 | [build-dependencies] 66 | configure_me_codegen = { workspace = true } 67 | 68 | # use libc on unix like platforms to set worker priority in DedicatedExecutor 69 | [target."cfg(unix)".dependencies.libc] 70 | version = "0.2" 71 | 
-------------------------------------------------------------------------------- /ballista/executor/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Ballista Executor Process 21 | 22 | This crate contains the Ballista executor process. 23 | -------------------------------------------------------------------------------- /ballista/executor/build.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | fn main() -> Result<(), String> { 19 | #[cfg(feature = "build-binary")] 20 | println!("cargo:rerun-if-changed=executor_config_spec.toml"); 21 | #[cfg(feature = "build-binary")] 22 | configure_me_codegen::build_script_auto() 23 | .map_err(|e| format!("configure_me code generation failed: {e}"))?; 24 | 25 | Ok(()) 26 | } 27 | -------------------------------------------------------------------------------- /ballista/executor/examples/example_executor_config.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | # the default configuration location is "/etc/ballista/scheduler.toml" 19 | # if you include a specific conf file using "--config-file = my_config_file.toml" 20 | # then that file will override environment variables, but not command line arguments 21 | namespace = "my_name_space" 22 | bind_host = "1.2.3.4" -------------------------------------------------------------------------------- /ballista/executor/src/metrics/mod.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use crate::execution_engine::QueryStageExecutor; 19 | use log::info; 20 | use std::sync::Arc; 21 | 22 | /// `ExecutorMetricsCollector` records metrics for `ShuffleWriterExec` 23 | /// after they are executed. 24 | /// 25 | /// After each stage completes, `ExecutorMetricsCollector::record_stage` will be 26 | /// called. 
27 | pub trait ExecutorMetricsCollector: Send + Sync { 28 | /// Record metrics for a stage after it has been executed 29 | fn record_stage( 30 | &self, 31 | job_id: &str, 32 | stage_id: usize, 33 | partition: usize, 34 | plan: Arc<dyn QueryStageExecutor>, 35 | ); 36 | } 37 | 38 | /// Implementation of `ExecutorMetricsCollector` which logs the completed 39 | /// plan at `info` level through the `log` crate. 40 | #[derive(Default)] 41 | pub struct LoggingMetricsCollector {} 42 | 43 | impl ExecutorMetricsCollector for LoggingMetricsCollector { 44 | fn record_stage( 45 | &self, 46 | job_id: &str, 47 | stage_id: usize, 48 | partition: usize, 49 | plan: Arc<dyn QueryStageExecutor>, 50 | ) { 51 | info!( 52 | "=== [{}/{}/{}] Physical plan with metrics ===\n{}\n", 53 | job_id, stage_id, partition, plan 54 | ); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /ballista/executor/src/terminate.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | #[cfg(unix)] 19 | use tokio::signal::unix::SignalKind; 20 | #[cfg(unix)] 21 | use tokio::signal::unix::{self as os_impl}; 22 | #[cfg(windows)] 23 | use tokio::signal::windows::{self as os_impl}; 24 | 25 | use std::io; 26 | 27 | pub async fn sig_term() -> io::Result<()> { 28 | #[cfg(unix)] 29 | os_impl::signal(SignalKind::terminate())?.recv().await; 30 | #[cfg(windows)] 31 | // TODO fix windows terminate after upgrading to latest tokio 32 | os_impl::ctrl_break()?.recv().await; 33 | Ok(()) 34 | } 35 | -------------------------------------------------------------------------------- /ballista/scheduler/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Ballista Scheduler Process 21 | 22 | This crate contains the Ballista scheduler process. 23 | -------------------------------------------------------------------------------- /ballista/scheduler/build.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | fn main() -> Result<(), String> { 19 | #[cfg(feature = "build-binary")] 20 | println!("cargo:rerun-if-changed=scheduler_config_spec.toml"); 21 | 22 | #[cfg(feature = "build-binary")] 23 | configure_me_codegen::build_script_auto() 24 | .map_err(|e| format!("configure_me code generation failed: {e}"))?; 25 | 26 | #[cfg(feature = "keda-scaler")] 27 | println!("cargo:rerun-if-changed=proto/keda.proto"); 28 | 29 | #[cfg(feature = "keda-scaler")] 30 | tonic_build::configure() 31 | .compile_protos(&["proto/keda.proto"], &["proto"]) 32 | .map_err(|e| format!("protobuf compilation failed: {e}"))?; 33 | 34 | Ok(()) 35 | } 36 | -------------------------------------------------------------------------------- /ballista/scheduler/proto/keda.proto: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2020 The KEDA Authors. 3 | 4 | and others that have contributed code to the public domain. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at. 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | // This file comes from https://github.com/kedacore/keda/blob/main/pkg/scalers/externalscaler/externalscaler.proto 19 | syntax = "proto3"; 20 | 21 | package externalscaler; 22 | option go_package = ".;externalscaler"; 23 | 24 | service ExternalScaler { 25 | rpc IsActive(ScaledObjectRef) returns (IsActiveResponse) {} 26 | // Commented out since we aren't supporting the streaming scaler interface at the moment 27 | // rpc StreamIsActive(ScaledObjectRef) returns (stream IsActiveResponse) {} 28 | rpc GetMetricSpec(ScaledObjectRef) returns (GetMetricSpecResponse) {} 29 | rpc GetMetrics(GetMetricsRequest) returns (GetMetricsResponse) {} 30 | } 31 | 32 | message ScaledObjectRef { 33 | string name = 1; 34 | string namespace = 2; 35 | map scalerMetadata = 3; 36 | } 37 | 38 | message IsActiveResponse { 39 | bool result = 1; 40 | } 41 | 42 | message GetMetricSpecResponse { 43 | repeated MetricSpec metricSpecs = 1; 44 | } 45 | 46 | message MetricSpec { 47 | string metricName = 1; 48 | int64 targetSize = 2; 49 | } 50 | 51 | message GetMetricsRequest { 52 | ScaledObjectRef scaledObjectRef = 1; 53 | string metricName = 2; 54 | } 55 | 56 | message GetMetricsResponse { 57 | repeated MetricValue metricValues = 1; 58 | } 59 | 60 | message MetricValue { 61 | string metricName = 1; 62 | int64 metricValue = 2; 63 | } -------------------------------------------------------------------------------- /ballista/scheduler/src/api/mod.rs: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 
3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | 13 | mod handlers; 14 | 15 | use crate::scheduler_server::SchedulerServer; 16 | use axum::routing::patch; 17 | use axum::{routing::get, Router}; 18 | use datafusion_proto::logical_plan::AsLogicalPlan; 19 | use datafusion_proto::physical_plan::AsExecutionPlan; 20 | use std::sync::Arc; 21 | 22 | pub fn get_routes< 23 | T: AsLogicalPlan + Clone + Send + Sync + 'static, 24 | U: AsExecutionPlan + Send + Sync + 'static, 25 | >( 26 | scheduler_server: Arc>, 27 | ) -> Router { 28 | let router = Router::new() 29 | .route("/api/state", get(handlers::get_scheduler_state::)) 30 | .route("/api/executors", get(handlers::get_executors::)) 31 | .route("/api/jobs", get(handlers::get_jobs::)) 32 | .route("/api/job/:job_id", patch(handlers::cancel_job::)) 33 | .route( 34 | "/api/job/:job_id/stages", 35 | get(handlers::get_query_stages::), 36 | ) 37 | .route( 38 | "/api/job/:job_id/dot", 39 | get(handlers::get_job_dot_graph::), 40 | ) 41 | .route( 42 | "/api/job/:job_id/stage/:stage_id/dot", 43 | get(handlers::get_query_stage_dot_graph::), 44 | ) 45 | .route("/api/metrics", get(handlers::get_scheduler_metrics::)); 46 | 47 | #[cfg(feature = "graphviz-support")] 48 | let router = router.route( 49 | "/api/job/:job_id/dot_svg", 50 | get(handlers::get_job_svg_graph::), 51 | ); 52 | 53 | router.with_state(scheduler_server) 54 | } 55 | -------------------------------------------------------------------------------- /ballista/scheduler/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Licensed 
to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | #![doc = include_str ! ("../README.md")] 19 | #[cfg(feature = "rest-api")] 20 | pub mod api; 21 | pub mod cluster; 22 | pub mod config; 23 | pub mod display; 24 | pub mod metrics; 25 | pub mod planner; 26 | pub mod scheduler_process; 27 | pub mod scheduler_server; 28 | pub mod standalone; 29 | pub mod state; 30 | 31 | #[cfg(test)] 32 | pub mod test_utils; 33 | 34 | pub use scheduler_server::SessionBuilder; 35 | -------------------------------------------------------------------------------- /ballista/scheduler/src/state/session_manager.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use crate::scheduler_server::SessionBuilder; 19 | use ballista_core::error::Result; 20 | use datafusion::prelude::{SessionConfig, SessionContext}; 21 | 22 | use crate::cluster::JobState; 23 | use std::sync::Arc; 24 | 25 | #[derive(Clone)] 26 | pub struct SessionManager { 27 | state: Arc, 28 | } 29 | 30 | impl SessionManager { 31 | pub fn new(state: Arc) -> Self { 32 | Self { state } 33 | } 34 | pub async fn remove_session(&self, session_id: &str) -> Result<()> { 35 | self.state.remove_session(session_id).await 36 | } 37 | 38 | pub async fn create_or_update_session( 39 | &self, 40 | session_id: &str, 41 | config: &SessionConfig, 42 | ) -> Result> { 43 | self.state 44 | .create_or_update_session(session_id, config) 45 | .await 46 | } 47 | 48 | pub(crate) fn produce_config(&self) -> SessionConfig { 49 | self.state.produce_config() 50 | } 51 | } 52 | 53 | /// Create a DataFusion session context that is compatible with Ballista Configuration 54 | pub fn create_datafusion_context( 55 | session_config: &SessionConfig, 56 | session_builder: SessionBuilder, 57 | ) -> datafusion::common::Result> { 58 | let session_state = if session_config.round_robin_repartition() { 59 | let session_config = session_config 60 | .clone() 61 | // should we disable catalog on the scheduler side 62 | .with_round_robin_repartition(false); 63 | 64 | log::warn!("session manager will override `datafusion.optimizer.enable_round_robin_repartition` to `false` "); 65 | session_builder(session_config)? 
66 | } else { 67 | session_builder(session_config.clone())? 68 | }; 69 | 70 | Ok(Arc::new(SessionContext::new_with_state(session_state))) 71 | } 72 | -------------------------------------------------------------------------------- /ballista/scheduler/testdata/customer/customer.tbl: -------------------------------------------------------------------------------- 1 | 1|Customer#000000001|IVhzIApeRb ot,c,E|15|25-989-741-2988|711.56|BUILDING|to the even, regular platelets. regular, ironic epitaphs nag e| 2 | 2|Customer#000000002|XSTf4,NCwDVaWNe6tEgvwfmRchLXak|13|23-768-687-3665|121.65|AUTOMOBILE|l accounts. blithely ironic theodolites integrate boldly: caref| 3 | 3|Customer#000000003|MG9kdTD2WBHm|1|11-719-748-3364|7498.12|AUTOMOBILE| deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov| 4 | 4|Customer#000000004|XxVSJsLAGtn|4|14-128-190-5944|2866.83|MACHINERY| requests. final, regular ideas sleep final accou| 5 | 5|Customer#000000005|KvpyuHCplrB84WgAiGV6sYpZq7Tj|3|13-750-942-6364|794.47|HOUSEHOLD|n accounts will have to unwind. foxes cajole accor| 6 | 6|Customer#000000006|sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn|20|30-114-968-4951|7638.57|AUTOMOBILE|tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious| 7 | 7|Customer#000000007|TcGe5gaZNgVePxU5kRrvXBfkasDTea|18|28-190-982-9759|9561.95|AUTOMOBILE|ainst the ironic, express theodolites. express, even pinto beans among the exp| 8 | 8|Customer#000000008|I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5|17|27-147-574-9335|6819.74|BUILDING|among the slyly regular theodolites kindle blithely courts. 
carefully even theodolites haggle slyly along the ide| 9 | 9|Customer#000000009|xKiAFTjUsCuxfeleNqefumTrjS|8|18-338-906-3675|8324.07|FURNITURE|r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl| 10 | 10|Customer#000000010|6LrEaV6KR6PLVcgl2ArL Q3rqzLzcT1 v2|5|15-741-346-9870|2753.54|HOUSEHOLD|es regular deposits haggle. fur| 11 | -------------------------------------------------------------------------------- /ballista/scheduler/testdata/lineitem/partition0.tbl: -------------------------------------------------------------------------------- 1 | 1|155190|7706|1|17|21168.23|0.04|0.02|N|O|1996-03-13|1996-02-12|1996-03-22|DELIVER IN PERSON|TRUCK|egular courts above the| 2 | 1|67310|7311|2|36|45983.16|0.09|0.06|N|O|1996-04-12|1996-02-28|1996-04-20|TAKE BACK RETURN|MAIL|ly final dependencies: slyly bold | 3 | 1|63700|3701|3|8|13309.60|0.10|0.02|N|O|1996-01-29|1996-03-05|1996-01-31|TAKE BACK RETURN|REG AIR|riously. regular, express dep| 4 | 1|2132|4633|4|28|28955.64|0.09|0.06|N|O|1996-04-21|1996-03-30|1996-05-16|NONE|AIR|lites. fluffily even de| 5 | 1|24027|1534|5|24|22824.48|0.10|0.04|N|O|1996-03-30|1996-03-14|1996-04-01|NONE|FOB| pending foxes. slyly re| 6 | 1|15635|638|6|32|49620.16|0.07|0.02|N|O|1996-01-30|1996-02-07|1996-02-03|DELIVER IN PERSON|MAIL|arefully slyly ex| 7 | 2|106170|1191|1|38|44694.46|0.00|0.05|N|O|1997-01-28|1997-01-14|1997-02-02|TAKE BACK RETURN|RAIL|ven requests. deposits breach a| 8 | 3|4297|1798|1|45|54058.05|0.06|0.00|R|F|1994-02-02|1994-01-04|1994-02-23|NONE|AIR|ongside of the furiously brave acco| 9 | 3|19036|6540|2|49|46796.47|0.10|0.00|R|F|1993-11-09|1993-12-20|1993-11-24|TAKE BACK RETURN|RAIL| unusual accounts. eve| 10 | 3|128449|3474|3|27|39890.88|0.06|0.07|A|F|1994-01-16|1993-11-22|1994-01-23|DELIVER IN PERSON|SHIP|nal foxes wake. 
| 11 | -------------------------------------------------------------------------------- /ballista/scheduler/testdata/lineitem/partition1.tbl: -------------------------------------------------------------------------------- 1 | 1|155190|7706|1|17|21168.23|0.04|0.02|N|O|1996-03-13|1996-02-12|1996-03-22|DELIVER IN PERSON|TRUCK|egular courts above the| 2 | 1|67310|7311|2|36|45983.16|0.09|0.06|N|O|1996-04-12|1996-02-28|1996-04-20|TAKE BACK RETURN|MAIL|ly final dependencies: slyly bold | 3 | 1|63700|3701|3|8|13309.60|0.10|0.02|N|O|1996-01-29|1996-03-05|1996-01-31|TAKE BACK RETURN|REG AIR|riously. regular, express dep| 4 | 1|2132|4633|4|28|28955.64|0.09|0.06|N|O|1996-04-21|1996-03-30|1996-05-16|NONE|AIR|lites. fluffily even de| 5 | 1|24027|1534|5|24|22824.48|0.10|0.04|N|O|1996-03-30|1996-03-14|1996-04-01|NONE|FOB| pending foxes. slyly re| 6 | 1|15635|638|6|32|49620.16|0.07|0.02|N|O|1996-01-30|1996-02-07|1996-02-03|DELIVER IN PERSON|MAIL|arefully slyly ex| 7 | 2|106170|1191|1|38|44694.46|0.00|0.05|N|O|1997-01-28|1997-01-14|1997-02-02|TAKE BACK RETURN|RAIL|ven requests. deposits breach a| 8 | 3|4297|1798|1|45|54058.05|0.06|0.00|R|F|1994-02-02|1994-01-04|1994-02-23|NONE|AIR|ongside of the furiously brave acco| 9 | 3|19036|6540|2|49|46796.47|0.10|0.00|R|F|1993-11-09|1993-12-20|1993-11-24|TAKE BACK RETURN|RAIL| unusual accounts. eve| 10 | 3|128449|3474|3|27|39890.88|0.06|0.07|A|F|1994-01-16|1993-11-22|1994-01-23|DELIVER IN PERSON|SHIP|nal foxes wake. | 11 | -------------------------------------------------------------------------------- /ballista/scheduler/testdata/nation/nation.tbl: -------------------------------------------------------------------------------- 1 | 0|ALGERIA|0| haggle. carefully final deposits detect slyly agai| 2 | 1|ARGENTINA|1|al foxes promise slyly according to the regular accounts. bold requests alon| 3 | 2|BRAZIL|1|y alongside of the pending deposits. carefully special packages are about the ironic forges. 
slyly special | 4 | 3|CANADA|1|eas hang ironic, silent packages. slyly regular packages are furiously over the tithes. fluffily bold| 5 | 4|EGYPT|4|y above the carefully unusual theodolites. final dugouts are quickly across the furiously regular d| 6 | 5|ETHIOPIA|0|ven packages wake quickly. regu| 7 | 6|FRANCE|3|refully final requests. regular, ironi| 8 | 7|GERMANY|3|l platelets. regular accounts x-ray: unusual, regular acco| 9 | 8|INDIA|2|ss excuses cajole slyly across the packages. deposits print aroun| 10 | 9|INDONESIA|2| slyly express asymptotes. regular deposits haggle slyly. carefully ironic hockey players sleep blithely. carefull| 11 | -------------------------------------------------------------------------------- /ballista/scheduler/testdata/orders/orders.tbl: -------------------------------------------------------------------------------- 1 | 1|36901|O|173665.47|1996-01-02|5-LOW|Clerk#000000951|0|nstructions sleep furiously among | 2 | 2|78002|O|46929.18|1996-12-01|1-URGENT|Clerk#000000880|0| foxes. pending accounts at the pending, silent asymptot| 3 | 3|123314|F|193846.25|1993-10-14|5-LOW|Clerk#000000955|0|sly final accounts boost. carefully regular ideas cajole carefully. depos| 4 | 4|136777|O|32151.78|1995-10-11|5-LOW|Clerk#000000124|0|sits. slyly regular warthogs cajole. regular, regular theodolites acro| 5 | 5|44485|F|144659.20|1994-07-30|5-LOW|Clerk#000000925|0|quickly. bold deposits sleep slyly. packages use slyly| 6 | 6|55624|F|58749.59|1992-02-21|4-NOT SPECIFIED|Clerk#000000058|0|ggle. special, final requests are against the furiously specia| 7 | 7|39136|O|252004.18|1996-01-10|2-HIGH|Clerk#000000470|0|ly special requests | 8 | 32|130057|O|208660.75|1995-07-16|2-HIGH|Clerk#000000616|0|ise blithely bold, regular requests. quickly unusual dep| 9 | 33|66958|F|163243.98|1993-10-27|3-MEDIUM|Clerk#000000409|0|uriously. furiously final request| 10 | 34|61001|O|58949.67|1998-07-21|3-MEDIUM|Clerk#000000223|0|ly final packages. 
fluffily final deposits wake blithely ideas. spe| 11 | -------------------------------------------------------------------------------- /ballista/scheduler/testdata/part/part.tbl: -------------------------------------------------------------------------------- 1 | 1|goldenrod lavender spring chocolate lace|Manufacturer#1|Brand#13|PROMO BURNISHED COPPER|7|JUMBO PKG|901.00|ly. slyly ironi| 2 | 2|blush thistle blue yellow saddle|Manufacturer#1|Brand#13|LARGE BRUSHED BRASS|1|LG CASE|902.00|lar accounts amo| 3 | 3|spring green yellow purple cornsilk|Manufacturer#4|Brand#42|STANDARD POLISHED BRASS|21|WRAP CASE|903.00|egular deposits hag| 4 | 4|cornflower chocolate smoke green pink|Manufacturer#3|Brand#34|SMALL PLATED BRASS|14|MED DRUM|904.00|p furiously r| 5 | 5|forest brown coral puff cream|Manufacturer#3|Brand#32|STANDARD POLISHED TIN|15|SM PKG|905.00| wake carefully | 6 | 6|bisque cornflower lawn forest magenta|Manufacturer#2|Brand#24|PROMO PLATED STEEL|4|MED BAG|906.00|sual a| 7 | 7|moccasin green thistle khaki floral|Manufacturer#1|Brand#11|SMALL PLATED COPPER|45|SM BAG|907.00|lyly. ex| 8 | 8|misty lace thistle snow royal|Manufacturer#4|Brand#44|PROMO BURNISHED TIN|41|LG DRUM|908.00|eposi| 9 | 9|thistle dim navajo dark gainsboro|Manufacturer#4|Brand#43|SMALL BURNISHED STEEL|12|WRAP CASE|909.00|ironic foxe| 10 | 10|linen pink saddle puff powder|Manufacturer#5|Brand#54|LARGE BURNISHED STEEL|44|LG CAN|910.01|ithely final deposit| 11 | -------------------------------------------------------------------------------- /ballista/scheduler/testdata/partsupp/partsupp.tbl: -------------------------------------------------------------------------------- 1 | 1|2|3325|771.64|, even theodolites. regular, final theodolites eat after the carefully pending foxes. furiously regular deposits sleep slyly. carefully bold realms above the ironic dependencies haggle careful| 2 | 1|2502|8076|993.49|ven ideas. quickly even packages print. 
pending multipliers must have to are fluff| 3 | 1|5002|3956|337.09|after the fluffily ironic deposits? blithely special dependencies integrate furiously even excuses. blithely silent theodolites could have to haggle pending, express requests; fu| 4 | 1|7502|4069|357.84|al, regular dependencies serve carefully after the quickly final pinto beans. furiously even deposits sleep quickly final, silent pinto beans. fluffily reg| 5 | 2|3|8895|378.49|nic accounts. final accounts sleep furiously about the ironic, bold packages. regular, regular accounts| 6 | 2|2503|4969|915.27|ptotes. quickly pending dependencies integrate furiously. fluffily ironic ideas impress blithely above the express accounts. furiously even epitaphs need to wak| 7 | 2|5003|8539|438.37|blithely bold ideas. furiously stealthy packages sleep fluffily. slyly special deposits snooze furiously carefully regular accounts. regular deposits according to the accounts nag carefully slyl| 8 | 2|7503|3025|306.39|olites. deposits wake carefully. even, express requests cajole. carefully regular ex| 9 | 3|4|4651|920.92|ilent foxes affix furiously quickly unusual requests. even packages across the carefully even theodolites nag above the sp| 10 | 3|2504|4093|498.13|ending dependencies haggle fluffily. regular deposits boost quickly carefully regular requests. deposits affix furiously around the pinto beans. ironic, unusual platelets across the p| 11 | -------------------------------------------------------------------------------- /ballista/scheduler/testdata/region/region.tbl: -------------------------------------------------------------------------------- 1 | 0|AFRICA|lar deposits. blithely final packages cajole. regular waters are final requests. regular accounts are according to | 2 | 1|AMERICA|hs use ironic, even requests. s| 3 | 2|ASIA|ges. thinly even pinto beans ca| 4 | 3|EUROPE|ly final courts cajole furiously final excuse| 5 | 4|MIDDLE EAST|uickly special accounts cajole carefully blithely close requests. 
carefully final asymptotes haggle furiousl| 6 | -------------------------------------------------------------------------------- /ballista/scheduler/testdata/supplier/supplier.tbl: -------------------------------------------------------------------------------- 1 | 1|Supplier#000000001| N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ|17|27-918-335-1736|5755.94|each slyly above the careful| 2 | 2|Supplier#000000002|89eJ5ksX3ImxJQBvxObC,|5|15-679-861-2259|4032.68| slyly bold instructions. idle dependen| 3 | 3|Supplier#000000003|q1,G3Pj6OjIuUYfUoH18BFTKP5aU9bEV3|1|11-383-516-1199|4192.40|blithely silent requests after the express dependencies are sl| 4 | 4|Supplier#000000004|Bk7ah4CK8SYQTepEmvMkkgMwg|15|25-843-787-7479|4641.08|riously even requests above the exp| 5 | 5|Supplier#000000005|Gcdm2rJRzl5qlTVzc|11|21-151-690-3663|-283.84|. slyly regular pinto bea| 6 | 6|Supplier#000000006|tQxuVm7s7CnK|14|24-696-997-4969|1365.79|final accounts. regular dolphins use against the furiously ironic decoys. | 7 | 7|Supplier#000000007|s,4TicNGB4uO6PaSqNBUq|23|33-990-965-2201|6820.35|s unwind silently furiously regular courts. final requests are deposits. requests wake quietly blit| 8 | 8|Supplier#000000008|9Sq4bBH2FQEmaFOocY45sRTxo6yuoG|17|27-498-742-3860|7627.85|al pinto beans. asymptotes haggl| 9 | 9|Supplier#000000009|1KhUgZegwM3ua7dsYmekYBsK|10|20-403-398-8662|5302.37|s. unusual, even requests along the furiously regular pac| 10 | 10|Supplier#000000010|Saygah3gYWMp72i PY|24|34-852-489-8585|3891.91|ing waters. regular requests ar| 11 | -------------------------------------------------------------------------------- /benchmarks/.dockerignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. 
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # Turn .dockerignore to .dockerallow by excluding everything and explicitly 19 | # allowing specific files and directories. This enables us to quickly add 20 | # dependency files to the docker content without scanning the whole directory. 21 | # This setup requires all of our docker containers to have arrow's source 22 | # as a mounted directory. 23 | 24 | data 25 | target -------------------------------------------------------------------------------- /benchmarks/.gitignore: -------------------------------------------------------------------------------- 1 | data -------------------------------------------------------------------------------- /benchmarks/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | [package] 19 | name = "ballista-benchmarks" 20 | description = "Ballista Benchmarks" 21 | version = "47.0.0" 22 | edition = "2021" 23 | authors = ["Apache DataFusion "] 24 | homepage = "https://datafusion.apache.org/ballista/" 25 | repository = "https://github.com/apache/datafusion-ballista" 26 | license = "Apache-2.0" 27 | publish = false 28 | 29 | [features] 30 | ci = [] 31 | default = ["mimalloc"] 32 | snmalloc = ["snmalloc-rs"] 33 | 34 | [dependencies] 35 | ballista = { path = "../ballista/client", version = "47.0.0" } 36 | datafusion = { workspace = true } 37 | datafusion-proto = { workspace = true } 38 | env_logger = { workspace = true } 39 | futures = { workspace = true } 40 | mimalloc = { workspace = true, optional = true } 41 | rand = { workspace = true } 42 | serde = { workspace = true } 43 | serde_json = "1.0.78" 44 | snmalloc-rs = { version = "0.3", optional = true } 45 | structopt = { version = "0.3", default-features = false } 46 | tokio = { version = "^1.44", features = [ 47 | "macros", 48 | "rt", 49 | "rt-multi-thread", 50 | "parking_lot", 51 | ] } 52 | 53 | [dev-dependencies] 54 | ballista-core = { path = "../ballista/core", version = "47.0.0" } 55 | -------------------------------------------------------------------------------- /benchmarks/db-benchmark/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Run db-benchmark 21 | 22 | ## Directions 23 | 24 | Run the following from root `arrow-datafusion` directory 25 | 26 | ```bash 27 | $ docker 
buildx build -t db-benchmark -f benchmarks/db-benchmark/db-benchmark.Dockerfile . 28 | $ docker run --privileged db-benchmark 29 | ``` 30 | -------------------------------------------------------------------------------- /benchmarks/db-benchmark/run-bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
18 | set -e 19 | 20 | SRC_DATANAME=G1_1e7_1e2_0_0 python3 datafusion/groupby-datafusion.py 21 | SRC_DATANAME=J1_1e7_NA_0_0 python3 datafusion/join-datafusion.py 22 | -------------------------------------------------------------------------------- /benchmarks/queries/q1.sql: -------------------------------------------------------------------------------- 1 | select 2 | l_returnflag, 3 | l_linestatus, 4 | sum(l_quantity) as sum_qty, 5 | sum(l_extendedprice) as sum_base_price, 6 | sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, 7 | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, 8 | avg(l_quantity) as avg_qty, 9 | avg(l_extendedprice) as avg_price, 10 | avg(l_discount) as avg_disc, 11 | count(*) as count_order 12 | from 13 | lineitem 14 | where 15 | l_shipdate <= date '1998-09-02' 16 | group by 17 | l_returnflag, 18 | l_linestatus 19 | order by 20 | l_returnflag, 21 | l_linestatus; -------------------------------------------------------------------------------- /benchmarks/queries/q10.sql: -------------------------------------------------------------------------------- 1 | select 2 | c_custkey, 3 | c_name, 4 | sum(l_extendedprice * (1 - l_discount)) as revenue, 5 | c_acctbal, 6 | n_name, 7 | c_address, 8 | c_phone, 9 | c_comment 10 | from 11 | customer, 12 | orders, 13 | lineitem, 14 | nation 15 | where 16 | c_custkey = o_custkey 17 | and l_orderkey = o_orderkey 18 | and o_orderdate >= date '1993-10-01' 19 | and o_orderdate < date '1994-01-01' 20 | and l_returnflag = 'R' 21 | and c_nationkey = n_nationkey 22 | group by 23 | c_custkey, 24 | c_name, 25 | c_acctbal, 26 | c_phone, 27 | n_name, 28 | c_address, 29 | c_comment 30 | order by 31 | revenue desc 32 | limit 20; -------------------------------------------------------------------------------- /benchmarks/queries/q11.sql: -------------------------------------------------------------------------------- 1 | select 2 | ps_partkey, 3 | sum(ps_supplycost * ps_availqty) as value 4 | from 5 
| partsupp, 6 | supplier, 7 | nation 8 | where 9 | ps_suppkey = s_suppkey 10 | and s_nationkey = n_nationkey 11 | and n_name = 'GERMANY' 12 | group by 13 | ps_partkey having 14 | sum(ps_supplycost * ps_availqty) > ( 15 | select 16 | sum(ps_supplycost * ps_availqty) * 0.0001 17 | from 18 | partsupp, 19 | supplier, 20 | nation 21 | where 22 | ps_suppkey = s_suppkey 23 | and s_nationkey = n_nationkey 24 | and n_name = 'GERMANY' 25 | ) 26 | order by 27 | value desc; -------------------------------------------------------------------------------- /benchmarks/queries/q12.sql: -------------------------------------------------------------------------------- 1 | select 2 | l_shipmode, 3 | sum(case 4 | when o_orderpriority = '1-URGENT' 5 | or o_orderpriority = '2-HIGH' 6 | then 1 7 | else 0 8 | end) as high_line_count, 9 | sum(case 10 | when o_orderpriority <> '1-URGENT' 11 | and o_orderpriority <> '2-HIGH' 12 | then 1 13 | else 0 14 | end) as low_line_count 15 | from 16 | lineitem 17 | join 18 | orders 19 | on 20 | l_orderkey = o_orderkey 21 | where 22 | l_shipmode in ('MAIL', 'SHIP') 23 | and l_commitdate < l_receiptdate 24 | and l_shipdate < l_commitdate 25 | and l_receiptdate >= date '1994-01-01' 26 | and l_receiptdate < date '1995-01-01' 27 | group by 28 | l_shipmode 29 | order by 30 | l_shipmode; -------------------------------------------------------------------------------- /benchmarks/queries/q13.sql: -------------------------------------------------------------------------------- 1 | select 2 | c_count, 3 | count(*) as custdist 4 | from 5 | ( 6 | select 7 | c_custkey, 8 | count(o_orderkey) 9 | from 10 | customer left outer join orders on 11 | c_custkey = o_custkey 12 | and o_comment not like '%special%requests%' 13 | group by 14 | c_custkey 15 | ) as c_orders (c_custkey, c_count) 16 | group by 17 | c_count 18 | order by 19 | custdist desc, 20 | c_count desc; -------------------------------------------------------------------------------- 
/benchmarks/queries/q14.sql: -------------------------------------------------------------------------------- 1 | select 2 | 100.00 * sum(case 3 | when p_type like 'PROMO%' 4 | then l_extendedprice * (1 - l_discount) 5 | else 0 6 | end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue 7 | from 8 | lineitem, 9 | part 10 | where 11 | l_partkey = p_partkey 12 | and l_shipdate >= date '1995-09-01' 13 | and l_shipdate < date '1995-10-01'; -------------------------------------------------------------------------------- /benchmarks/queries/q15.sql: -------------------------------------------------------------------------------- 1 | create view revenue0 (supplier_no, total_revenue) as 2 | select 3 | l_suppkey, 4 | sum(l_extendedprice * (1 - l_discount)) 5 | from 6 | lineitem 7 | where 8 | l_shipdate >= date '1996-01-01' 9 | and l_shipdate < date '1996-01-01' + interval '3' month 10 | group by 11 | l_suppkey; 12 | 13 | 14 | select 15 | s_suppkey, 16 | s_name, 17 | s_address, 18 | s_phone, 19 | total_revenue 20 | from 21 | supplier, 22 | revenue0 23 | where 24 | s_suppkey = supplier_no 25 | and total_revenue = ( 26 | select 27 | max(total_revenue) 28 | from 29 | revenue0 30 | ) 31 | order by 32 | s_suppkey; 33 | 34 | drop view revenue0; -------------------------------------------------------------------------------- /benchmarks/queries/q16.sql: -------------------------------------------------------------------------------- 1 | select 2 | p_brand, 3 | p_type, 4 | p_size, 5 | count(distinct ps_suppkey) as supplier_cnt 6 | from 7 | partsupp, 8 | part 9 | where 10 | p_partkey = ps_partkey 11 | and p_brand <> 'Brand#45' 12 | and p_type not like 'MEDIUM POLISHED%' 13 | and p_size in (49, 14, 23, 45, 19, 3, 36, 9) 14 | and ps_suppkey not in ( 15 | select 16 | s_suppkey 17 | from 18 | supplier 19 | where 20 | s_comment like '%Customer%Complaints%' 21 | ) 22 | group by 23 | p_brand, 24 | p_type, 25 | p_size 26 | order by 27 | supplier_cnt desc, 28 | p_brand, 29 | p_type, 
30 | p_size; -------------------------------------------------------------------------------- /benchmarks/queries/q17.sql: -------------------------------------------------------------------------------- 1 | select 2 | sum(l_extendedprice) / 7.0 as avg_yearly 3 | from 4 | lineitem, 5 | part 6 | where 7 | p_partkey = l_partkey 8 | and p_brand = 'Brand#23' 9 | and p_container = 'MED BOX' 10 | and l_quantity < ( 11 | select 12 | 0.2 * avg(l_quantity) 13 | from 14 | lineitem 15 | where 16 | l_partkey = p_partkey 17 | ); -------------------------------------------------------------------------------- /benchmarks/queries/q18.sql: -------------------------------------------------------------------------------- 1 | select 2 | c_name, 3 | c_custkey, 4 | o_orderkey, 5 | o_orderdate, 6 | o_totalprice, 7 | sum(l_quantity) 8 | from 9 | customer, 10 | orders, 11 | lineitem 12 | where 13 | o_orderkey in ( 14 | select 15 | l_orderkey 16 | from 17 | lineitem 18 | group by 19 | l_orderkey having 20 | sum(l_quantity) > 300 21 | ) 22 | and c_custkey = o_custkey 23 | and o_orderkey = l_orderkey 24 | group by 25 | c_name, 26 | c_custkey, 27 | o_orderkey, 28 | o_orderdate, 29 | o_totalprice 30 | order by 31 | o_totalprice desc, 32 | o_orderdate 33 | limit 100; -------------------------------------------------------------------------------- /benchmarks/queries/q19.sql: -------------------------------------------------------------------------------- 1 | select 2 | sum(l_extendedprice* (1 - l_discount)) as revenue 3 | from 4 | lineitem, 5 | part 6 | where 7 | ( 8 | p_partkey = l_partkey 9 | and p_brand = 'Brand#12' 10 | and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') 11 | and l_quantity >= 1 and l_quantity <= 1 + 10 12 | and p_size between 1 and 5 13 | and l_shipmode in ('AIR', 'AIR REG') 14 | and l_shipinstruct = 'DELIVER IN PERSON' 15 | ) 16 | or 17 | ( 18 | p_partkey = l_partkey 19 | and p_brand = 'Brand#23' 20 | and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED 
PACK') 21 | and l_quantity >= 10 and l_quantity <= 10 + 10 22 | and p_size between 1 and 10 23 | and l_shipmode in ('AIR', 'AIR REG') 24 | and l_shipinstruct = 'DELIVER IN PERSON' 25 | ) 26 | or 27 | ( 28 | p_partkey = l_partkey 29 | and p_brand = 'Brand#34' 30 | and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') 31 | and l_quantity >= 20 and l_quantity <= 20 + 10 32 | and p_size between 1 and 15 33 | and l_shipmode in ('AIR', 'AIR REG') 34 | and l_shipinstruct = 'DELIVER IN PERSON' 35 | ); -------------------------------------------------------------------------------- /benchmarks/queries/q2.sql: -------------------------------------------------------------------------------- 1 | select 2 | s_acctbal, 3 | s_name, 4 | n_name, 5 | p_partkey, 6 | p_mfgr, 7 | s_address, 8 | s_phone, 9 | s_comment 10 | from 11 | part, 12 | supplier, 13 | partsupp, 14 | nation, 15 | region 16 | where 17 | p_partkey = ps_partkey 18 | and s_suppkey = ps_suppkey 19 | and p_size = 15 20 | and p_type like '%BRASS' 21 | and s_nationkey = n_nationkey 22 | and n_regionkey = r_regionkey 23 | and r_name = 'EUROPE' 24 | and ps_supplycost = ( 25 | select 26 | min(ps_supplycost) 27 | from 28 | partsupp, 29 | supplier, 30 | nation, 31 | region 32 | where 33 | p_partkey = ps_partkey 34 | and s_suppkey = ps_suppkey 35 | and s_nationkey = n_nationkey 36 | and n_regionkey = r_regionkey 37 | and r_name = 'EUROPE' 38 | ) 39 | order by 40 | s_acctbal desc, 41 | n_name, 42 | s_name, 43 | p_partkey 44 | limit 100; -------------------------------------------------------------------------------- /benchmarks/queries/q20.sql: -------------------------------------------------------------------------------- 1 | select 2 | s_name, 3 | s_address 4 | from 5 | supplier, 6 | nation 7 | where 8 | s_suppkey in ( 9 | select 10 | ps_suppkey 11 | from 12 | partsupp 13 | where 14 | ps_partkey in ( 15 | select 16 | p_partkey 17 | from 18 | part 19 | where 20 | p_name like 'forest%' 21 | ) 22 | and ps_availqty > ( 23 
| select 24 | 0.5 * sum(l_quantity) 25 | from 26 | lineitem 27 | where 28 | l_partkey = ps_partkey 29 | and l_suppkey = ps_suppkey 30 | and l_shipdate >= date '1994-01-01' 31 | and l_shipdate < date '1994-01-01' + interval '1' year 32 | ) 33 | ) 34 | and s_nationkey = n_nationkey 35 | and n_name = 'CANADA' 36 | order by 37 | s_name; 38 | -------------------------------------------------------------------------------- /benchmarks/queries/q21.sql: -------------------------------------------------------------------------------- 1 | select 2 | s_name, 3 | count(*) as numwait 4 | from 5 | supplier, 6 | lineitem l1, 7 | orders, 8 | nation 9 | where 10 | s_suppkey = l1.l_suppkey 11 | and o_orderkey = l1.l_orderkey 12 | and o_orderstatus = 'F' 13 | and l1.l_receiptdate > l1.l_commitdate 14 | and exists ( 15 | select 16 | * 17 | from 18 | lineitem l2 19 | where 20 | l2.l_orderkey = l1.l_orderkey 21 | and l2.l_suppkey <> l1.l_suppkey 22 | ) 23 | and not exists ( 24 | select 25 | * 26 | from 27 | lineitem l3 28 | where 29 | l3.l_orderkey = l1.l_orderkey 30 | and l3.l_suppkey <> l1.l_suppkey 31 | and l3.l_receiptdate > l3.l_commitdate 32 | ) 33 | and s_nationkey = n_nationkey 34 | and n_name = 'SAUDI ARABIA' 35 | group by 36 | s_name 37 | order by 38 | numwait desc, 39 | s_name 40 | limit 100; -------------------------------------------------------------------------------- /benchmarks/queries/q22.sql: -------------------------------------------------------------------------------- 1 | select 2 | cntrycode, 3 | count(*) as numcust, 4 | sum(c_acctbal) as totacctbal 5 | from 6 | ( 7 | select 8 | substring(c_phone from 1 for 2) as cntrycode, 9 | c_acctbal 10 | from 11 | customer 12 | where 13 | substring(c_phone from 1 for 2) in 14 | ('13', '31', '23', '29', '30', '18', '17') 15 | and c_acctbal > ( 16 | select 17 | avg(c_acctbal) 18 | from 19 | customer 20 | where 21 | c_acctbal > 0.00 22 | and substring(c_phone from 1 for 2) in 23 | ('13', '31', '23', '29', '30', '18', '17') 24 | 
) 25 | and not exists ( 26 | select 27 | * 28 | from 29 | orders 30 | where 31 | o_custkey = c_custkey 32 | ) 33 | ) as custsale 34 | group by 35 | cntrycode 36 | order by 37 | cntrycode; -------------------------------------------------------------------------------- /benchmarks/queries/q3.sql: -------------------------------------------------------------------------------- 1 | select 2 | l_orderkey, 3 | sum(l_extendedprice * (1 - l_discount)) as revenue, 4 | o_orderdate, 5 | o_shippriority 6 | from 7 | customer, 8 | orders, 9 | lineitem 10 | where 11 | c_mktsegment = 'BUILDING' 12 | and c_custkey = o_custkey 13 | and l_orderkey = o_orderkey 14 | and o_orderdate < date '1995-03-15' 15 | and l_shipdate > date '1995-03-15' 16 | group by 17 | l_orderkey, 18 | o_orderdate, 19 | o_shippriority 20 | order by 21 | revenue desc, 22 | o_orderdate 23 | limit 10; -------------------------------------------------------------------------------- /benchmarks/queries/q4.sql: -------------------------------------------------------------------------------- 1 | select 2 | o_orderpriority, 3 | count(*) as order_count 4 | from 5 | orders 6 | where 7 | o_orderdate >= date '1993-07-01' 8 | and o_orderdate < date '1993-07-01' + interval '3' month 9 | and exists ( 10 | select 11 | * 12 | from 13 | lineitem 14 | where 15 | l_orderkey = o_orderkey 16 | and l_commitdate < l_receiptdate 17 | ) 18 | group by 19 | o_orderpriority 20 | order by 21 | o_orderpriority; -------------------------------------------------------------------------------- /benchmarks/queries/q5.sql: -------------------------------------------------------------------------------- 1 | select 2 | n_name, 3 | sum(l_extendedprice * (1 - l_discount)) as revenue 4 | from 5 | customer, 6 | orders, 7 | lineitem, 8 | supplier, 9 | nation, 10 | region 11 | where 12 | c_custkey = o_custkey 13 | and l_orderkey = o_orderkey 14 | and l_suppkey = s_suppkey 15 | and c_nationkey = s_nationkey 16 | and s_nationkey = n_nationkey 17 | and 
n_regionkey = r_regionkey 18 | and r_name = 'ASIA' 19 | and o_orderdate >= date '1994-01-01' 20 | and o_orderdate < date '1995-01-01' 21 | group by 22 | n_name 23 | order by 24 | revenue desc; -------------------------------------------------------------------------------- /benchmarks/queries/q6.sql: -------------------------------------------------------------------------------- 1 | select 2 | sum(l_extendedprice * l_discount) as revenue 3 | from 4 | lineitem 5 | where 6 | l_shipdate >= date '1994-01-01' 7 | and l_shipdate < date '1995-01-01' 8 | and l_discount between 0.06 - 0.01 and 0.06 + 0.01 9 | and l_quantity < 24; -------------------------------------------------------------------------------- /benchmarks/queries/q7.sql: -------------------------------------------------------------------------------- 1 | select 2 | supp_nation, 3 | cust_nation, 4 | l_year, 5 | sum(volume) as revenue 6 | from 7 | ( 8 | select 9 | n1.n_name as supp_nation, 10 | n2.n_name as cust_nation, 11 | extract(year from l_shipdate) as l_year, 12 | l_extendedprice * (1 - l_discount) as volume 13 | from 14 | supplier, 15 | lineitem, 16 | orders, 17 | customer, 18 | nation n1, 19 | nation n2 20 | where 21 | s_suppkey = l_suppkey 22 | and o_orderkey = l_orderkey 23 | and c_custkey = o_custkey 24 | and s_nationkey = n1.n_nationkey 25 | and c_nationkey = n2.n_nationkey 26 | and ( 27 | (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY') 28 | or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE') 29 | ) 30 | and l_shipdate between date '1995-01-01' and date '1996-12-31' 31 | ) as shipping 32 | group by 33 | supp_nation, 34 | cust_nation, 35 | l_year 36 | order by 37 | supp_nation, 38 | cust_nation, 39 | l_year; 40 | -------------------------------------------------------------------------------- /benchmarks/queries/q8.sql: -------------------------------------------------------------------------------- 1 | select 2 | o_year, 3 | sum(case 4 | when nation = 'BRAZIL' then volume 5 | else 0 6 | end) / 
sum(volume) as mkt_share 7 | from 8 | ( 9 | select 10 | extract(year from o_orderdate) as o_year, 11 | l_extendedprice * (1 - l_discount) as volume, 12 | n2.n_name as nation 13 | from 14 | part, 15 | supplier, 16 | lineitem, 17 | orders, 18 | customer, 19 | nation n1, 20 | nation n2, 21 | region 22 | where 23 | p_partkey = l_partkey 24 | and s_suppkey = l_suppkey 25 | and l_orderkey = o_orderkey 26 | and o_custkey = c_custkey 27 | and c_nationkey = n1.n_nationkey 28 | and n1.n_regionkey = r_regionkey 29 | and r_name = 'AMERICA' 30 | and s_nationkey = n2.n_nationkey 31 | and o_orderdate between date '1995-01-01' and date '1996-12-31' 32 | and p_type = 'ECONOMY ANODIZED STEEL' 33 | ) as all_nations 34 | group by 35 | o_year 36 | order by 37 | o_year; -------------------------------------------------------------------------------- /benchmarks/queries/q9.sql: -------------------------------------------------------------------------------- 1 | select 2 | nation, 3 | o_year, 4 | sum(amount) as sum_profit 5 | from 6 | ( 7 | select 8 | n_name as nation, 9 | extract(year from o_orderdate) as o_year, 10 | l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount 11 | from 12 | part, 13 | supplier, 14 | lineitem, 15 | partsupp, 16 | orders, 17 | nation 18 | where 19 | s_suppkey = l_suppkey 20 | and ps_suppkey = l_suppkey 21 | and ps_partkey = l_partkey 22 | and p_partkey = l_partkey 23 | and o_orderkey = l_orderkey 24 | and s_nationkey = n_nationkey 25 | and p_name like '%green%' 26 | ) as profit 27 | group by 28 | nation, 29 | o_year 30 | order by 31 | nation, 32 | o_year desc; -------------------------------------------------------------------------------- /benchmarks/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. 
See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | set -e 20 | set -x 21 | 22 | # This bash script is meant to be run inside the docker-compose environment. Check the README for instructions 23 | 24 | # regression checks for queries that return the correct results 25 | # TODO add all queries once https://github.com/apache/arrow-datafusion/issues/3478 is implemented and once 26 | # queries return decimal results with the correct precision 27 | for query in 4 12 13 28 | do 29 | /root/tpch benchmark ballista --host ballista-scheduler --port 50050 --query $query --path /data --format tbl --iterations 1 --debug --expected /data 30 | done 31 | 32 | # at least make sure these queries run, even though we do not check that the results are correct yet 33 | 34 | #TODO: add query 16 once we support it 35 | for query in 1 2 3 5 6 7 8 9 10 11 14 15 17 18 19 20 21 22 36 | do 37 | /root/tpch benchmark ballista --host ballista-scheduler --port 50050 --query $query --path /data --format tbl --iterations 1 --debug 38 | done 39 | 40 | -------------------------------------------------------------------------------- /benchmarks/spark/.gitignore: -------------------------------------------------------------------------------- 1 | target 
-------------------------------------------------------------------------------- /benchmarks/tpch-gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | mkdir -p data/answers 2>/dev/null 20 | 21 | set -e 22 | 23 | pushd .. 24 | . ./dev/build-set-env.sh 25 | popd 26 | 27 | # Generate data into the ./data directory if it does not already exist 28 | FILE=./data/supplier.tbl 29 | if test -f "$FILE"; then 30 | echo "$FILE exists." 31 | else 32 | docker run -v "$(pwd)/data:/data" -it --rm ghcr.io/scalytics/tpch-docker:main -vf -s 1 33 | fi 34 | 35 | # Copy expected answers into the ./data/answers directory if they do not already exist 36 | FILE=./data/answers/q1.out 37 | if test -f "$FILE"; then 38 | echo "$FILE exists." 
39 | else 40 | docker run -v "$(pwd)/data:/data" -it --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/" 41 | fi -------------------------------------------------------------------------------- /benchmarks/tpch.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | import sys 19 | import time 20 | import argparse 21 | 22 | parser = argparse.ArgumentParser(description='Run SQL benchmarks.') 23 | parser.add_argument('--query', help='query to run, such as q1') 24 | parser.add_argument('--path', help='path to data files') 25 | parser.add_argument('--ext', default='', help='optional file extension, such as parquet') 26 | 27 | args = parser.parse_args() 28 | 29 | query = args.query 30 | path = args.path 31 | table_ext = args.ext 32 | 33 | from ballista import BallistaBuilder 34 | from datafusion.context import SessionContext 35 | 36 | ctx: SessionContext = BallistaBuilder().remote("df://127.0.0.1:50050") 37 | 38 | tables = ["part", "supplier", "partsupp", "customer", "orders", "lineitem", "nation", "region"] 39 | 40 | for table in tables: 41 | table_path = path + "/" + table 42 | if len(table_ext) > 0: 43 | table_path = table_path + "." + table_ext 44 | print("Registering table", table, "at path", table_path) 45 | ctx.register_parquet(table, table_path) 46 | 47 | with open("queries/" + query + ".sql", 'r') as file: 48 | sql = file.read() 49 | 50 | import time 51 | 52 | start = time.time() 53 | 54 | df = ctx.sql(sql) 55 | df.show() 56 | 57 | end = time.time() 58 | print("Query", query, "took", end - start, "second(s)") 59 | 60 | 61 | -------------------------------------------------------------------------------- /ci/scripts/rust_clippy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | set -ex 21 | cargo clippy --all-targets --workspace -- -D warnings 22 | -------------------------------------------------------------------------------- /ci/scripts/rust_fmt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | set -ex 21 | cargo fmt --all -- --check 22 | -------------------------------------------------------------------------------- /ci/scripts/rust_toml_fmt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. 
See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | set -ex 21 | find . -mindepth 2 -name 'Cargo.toml' -exec cargo tomlfmt -k -p {} \; 22 | -------------------------------------------------------------------------------- /clippy.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # increasing the threshold until we get to datafusion 48 which should 19 | # address this issue. 
20 | # 21 | # https://github.com/apache/datafusion/pull/15861 22 | large-error-threshold = 266 23 | -------------------------------------------------------------------------------- /dev/build-ballista-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | set -e 21 | 22 | RELEASE_FLAG=${RELEASE_FLAG:=release} 23 | 24 | ./dev/build-ballista-executables.sh 25 | 26 | . ./dev/build-set-env.sh 27 | 28 | docker build -t "apache/datafusion-ballista-standalone:latest" -f dev/docker/ballista-standalone.Dockerfile . 29 | docker build -t "apache/datafusion-ballista-scheduler:latest" -f dev/docker/ballista-scheduler.Dockerfile . 30 | docker build -t "apache/datafusion-ballista-executor:latest" -f dev/docker/ballista-executor.Dockerfile . 31 | docker build -t "apache/datafusion-ballista-cli:latest" -f dev/docker/ballista-cli.Dockerfile . 32 | docker build -t "apache/datafusion-ballista-benchmarks:latest" -f dev/docker/ballista-benchmarks.Dockerfile . 
33 | -------------------------------------------------------------------------------- /dev/build-ballista-executables.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | set -e 21 | 22 | RELEASE_FLAG=${RELEASE_FLAG:=release} 23 | 24 | # TODO: it would be very nice if we could make CI work the exact same way so the build logic isn't duplicated 25 | 26 | # build a docker image in which to run the build - this is to make life easier for Windows & Mac users 27 | docker build -t ballista-builder --build-arg EXT_UID="$(id -u)" -f dev/docker/ballista-builder.Dockerfile . 28 | 29 | # run cargo & yarn builds inside the builder container; quote expansions so paths with spaces survive 30 | docker run -v "$(pwd)":/home/builder/workspace --env RELEASE_FLAG="$RELEASE_FLAG" ballista-builder 31 | -------------------------------------------------------------------------------- /dev/build-set-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. 
See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | cd ballista/core/ 21 | export BALLISTA_VERSION=$(cargo pkgid | cut '-d@' -f2) 22 | cd - 23 | -------------------------------------------------------------------------------- /dev/docker/ballista-benchmarks.Dockerfile: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | FROM ubuntu:24.04 19 | 20 | ARG RELEASE_FLAG=release 21 | 22 | ENV RELEASE_FLAG=${RELEASE_FLAG} 23 | ENV RUST_LOG=info 24 | ENV RUST_BACKTRACE=full 25 | 26 | COPY target/$RELEASE_FLAG/ballista-scheduler /root/ballista-scheduler 27 | COPY target/$RELEASE_FLAG/ballista-executor /root/ballista-executor 28 | COPY target/$RELEASE_FLAG/tpch /root/tpch 29 | 30 | COPY benchmarks/run.sh /root/run.sh 31 | COPY benchmarks/queries/ /root/benchmarks/queries 32 | 33 | WORKDIR /root 34 | 35 | CMD ["/root/run.sh"] -------------------------------------------------------------------------------- /dev/docker/ballista-builder.Dockerfile: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | FROM rust:1.85-bullseye 19 | 20 | ARG EXT_UID 21 | 22 | ENV RUST_LOG=info 23 | ENV RUST_BACKTRACE=full 24 | ENV DEBIAN_FRONTEND=noninteractive 25 | 26 | RUN apt-get update && \ 27 | apt-get -y install libssl-dev openssl zlib1g zlib1g-dev libpq-dev cmake protobuf-compiler curl unzip 28 | 29 | RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \ 30 | apt-get update && \ 31 | apt-get install -y nodejs && \ 32 | npm install -g yarn 33 | 34 | # create the build user with the UID supplied via the EXT_UID build argument 35 | RUN adduser -q -u $EXT_UID builder --home /home/builder && \ 36 | mkdir -p /home/builder/workspace 37 | USER builder 38 | 39 | ENV NODE_VER=18.9.0 40 | ENV HOME=/home/builder 41 | ENV PATH=$HOME/.cargo/bin:$PATH 42 | 43 | # prepare rust 44 | RUN rustup update && \ 45 | rustup component add rustfmt && \ 46 | cargo install cargo-chef --version 0.1.62 47 | 48 | WORKDIR /home/builder/workspace 49 | 50 | COPY dev/docker/builder-entrypoint.sh /home/builder 51 | ENTRYPOINT ["/home/builder/builder-entrypoint.sh"] 52 | -------------------------------------------------------------------------------- /dev/docker/ballista-cli.Dockerfile: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. 
See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | FROM ubuntu:24.04 19 | 20 | ARG RELEASE_FLAG=release 21 | 22 | ENV RELEASE_FLAG=${RELEASE_FLAG} 23 | ENV RUST_LOG=info 24 | ENV RUST_BACKTRACE=full 25 | 26 | COPY target/$RELEASE_FLAG/ballista-cli /root/ballista-cli 27 | 28 | COPY dev/docker/cli-entrypoint.sh /root/cli-entrypoint.sh 29 | ENTRYPOINT ["/root/cli-entrypoint.sh"] 30 | -------------------------------------------------------------------------------- /dev/docker/ballista-executor.Dockerfile: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | FROM ubuntu:24.04 19 | 20 | ARG RELEASE_FLAG=release 21 | 22 | ENV RELEASE_FLAG=${RELEASE_FLAG} 23 | ENV RUST_LOG=info 24 | ENV RUST_BACKTRACE=full 25 | 26 | COPY target/$RELEASE_FLAG/ballista-executor /root/ballista-executor 27 | 28 | # Expose Ballista Executor gRPC port 29 | EXPOSE 50051 30 | 31 | COPY dev/docker/executor-entrypoint.sh /root/executor-entrypoint.sh 32 | ENTRYPOINT ["/root/executor-entrypoint.sh"] 33 | -------------------------------------------------------------------------------- /dev/docker/ballista-scheduler.Dockerfile: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | FROM ubuntu:24.04 19 | 20 | ARG RELEASE_FLAG=release 21 | 22 | ENV RELEASE_FLAG=${RELEASE_FLAG} 23 | ENV RUST_LOG=info 24 | ENV RUST_BACKTRACE=full 25 | ENV DEBIAN_FRONTEND=noninteractive 26 | 27 | COPY target/$RELEASE_FLAG/ballista-scheduler /root/ballista-scheduler 28 | 29 | # Expose Ballista Scheduler gRPC port 30 | EXPOSE 50050 31 | 32 | COPY dev/docker/scheduler-entrypoint.sh /root/scheduler-entrypoint.sh 33 | ENTRYPOINT ["/root/scheduler-entrypoint.sh"] 34 | -------------------------------------------------------------------------------- /dev/docker/ballista-standalone.Dockerfile: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | FROM ubuntu:24.04 19 | 20 | LABEL org.opencontainers.image.source="https://github.com/apache/datafusion-ballista" 21 | LABEL org.opencontainers.image.description="Apache Arrow Ballista Distributed SQL Query Engine" 22 | LABEL org.opencontainers.image.licenses="Apache-2.0" 23 | 24 | ARG RELEASE_FLAG=release 25 | 26 | ENV RELEASE_FLAG=${RELEASE_FLAG} 27 | ENV RUST_LOG=info 28 | ENV RUST_BACKTRACE=full 29 | ENV DEBIAN_FRONTEND=noninteractive 30 | # wget fetches the sample data below; netcat-openbsd provides `nc`, which standalone-entrypoint.sh uses to wait for the scheduler port 31 | RUN apt-get -qq update && apt-get install -qq -y wget netcat-openbsd 32 | 33 | COPY target/$RELEASE_FLAG/ballista-scheduler /root/ballista-scheduler 34 | COPY target/$RELEASE_FLAG/ballista-executor /root/ballista-executor 35 | 36 | RUN chmod a+x /root/ballista-scheduler && \ 37 | chmod a+x /root/ballista-executor 38 | 39 | # populate some sample data for ListingSchemaProvider 40 | RUN mkdir -p /data && \ 41 | wget -q https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -P /data/ 42 | ENV DATAFUSION_CATALOG_LOCATION=/data 43 | ENV DATAFUSION_CATALOG_TYPE=csv 44 | 45 | # Expose Ballista Scheduler gRPC port 46 | EXPOSE 50050 47 | 48 | # Expose Ballista Executor gRPC port 49 | EXPOSE 50051 50 | 51 | COPY dev/docker/standalone-entrypoint.sh /root/standalone-entrypoint.sh 52 | ENTRYPOINT ["/root/standalone-entrypoint.sh"] 53 | -------------------------------------------------------------------------------- /dev/docker/builder-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | set -e 21 | set -x 22 | 23 | printenv 24 | RELEASE_FLAG=${RELEASE_FLAG:=release} 25 | cargo build --features rest-api --profile $RELEASE_FLAG "$@" 26 | -------------------------------------------------------------------------------- /dev/docker/cli-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | 20 | set -e 21 | 22 | /root/ballista-cli "$@" 23 | -------------------------------------------------------------------------------- /dev/docker/executor-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | set -e 21 | 22 | /root/ballista-executor "$@" 23 | -------------------------------------------------------------------------------- /dev/docker/scheduler-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | set -e 21 | 22 | /root/ballista-scheduler "$@" 23 | -------------------------------------------------------------------------------- /dev/docker/standalone-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | set -e 21 | 22 | echo "Starting for scheduler..." 23 | /root/ballista-scheduler & 24 | while ! 
nc -z 127.0.0.1 50050; do 25 | sleep 1 26 | done 27 | 28 | echo "Starting executor" 29 | /root/ballista-executor 30 | -------------------------------------------------------------------------------- /dev/integration-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | set -e 20 | 21 | echo "Generating benchmark data ..." 22 | pushd benchmarks 23 | ./tpch-gen.sh 24 | popd 25 | 26 | echo "Building Docker images ..." 27 | ./dev/build-ballista-docker.sh 28 | 29 | # Tear the cluster down whenever the script exits, so a failing step under `set -e` cannot leave containers running. 30 | stop_cluster() { 31 | echo "Stopping docker-compose ..." 32 | docker-compose down 33 | } 34 | trap stop_cluster EXIT 35 | 
36 | echo "Starting docker-compose in background ..." 37 | docker-compose up -d 38 | 39 | # give the scheduler a chance to start up 40 | echo "Sleeping (wait for scheduler to start)..." 41 | sleep 10 42 | 43 | echo "Running benchmarks ..." 44 | docker-compose run ballista-client /root/run.sh 45 | -------------------------------------------------------------------------------- /dev/release/check-rat-report.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ############################################################################## 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | ############################################################################## 20 | import fnmatch 21 | import re 22 | import sys 23 | import xml.etree.ElementTree as ET 24 | 25 | if len(sys.argv) != 3: 26 | sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" % 27 | sys.argv[0]) 28 | sys.exit(1) 29 | 30 | exclude_globs_filename = sys.argv[1] 31 | xml_filename = sys.argv[2] 32 | 33 | globs = [line.strip() for line in open(exclude_globs_filename, "r")] 34 | 35 | tree = ET.parse(xml_filename) 36 | root = tree.getroot() 37 | resources = root.findall('resource') 38 | 39 | all_ok = True 40 | for r in resources: 41 | approvals = r.findall('license-approval') 42 | if not approvals or approvals[0].attrib['name'] == 'true': 43 | continue 44 | clean_name = re.sub('^[^/]+/', '', r.attrib['name']) 45 | excluded = False 46 | for g in globs: 47 | if fnmatch.fnmatch(clean_name, g): 48 | excluded = True 49 | break 50 | if not excluded: 51 | sys.stdout.write("NOT APPROVED: %s (%s): %s\n" % ( 52 | clean_name, r.attrib['name'], approvals[0].attrib['name'])) 53 | all_ok = False 54 | 55 | if not all_ok: 56 | sys.exit(1) 57 | 58 | print('OK') 59 | sys.exit(0) 60 | -------------------------------------------------------------------------------- /dev/release/crate-deps.dot: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | digraph G { 19 | 20 | ballista_core 21 | ballista_scheduler 22 | ballista_executor 23 | ballista 24 | ballista_cli 25 | 26 | ballista_scheduler -> ballista_core 27 | 28 | ballista_executor -> ballista_core 29 | 30 | ballista -> ballista_core 31 | ballista -> ballista_scheduler 32 | ballista -> ballista_executor 33 | 34 | ballista_cli -> ballista 35 | 36 | } 37 | -------------------------------------------------------------------------------- /dev/release/rat_exclude_files.txt: -------------------------------------------------------------------------------- 1 | *.npmrc 2 | *.gitignore 3 | *.dockerignore 4 | .gitmodules 5 | *_generated.js 6 | *_generated.ts 7 | *.csv 8 | *.json 9 | *.snap 10 | .github/ISSUE_TEMPLATE/*.md 11 | .github/pull_request_template.md 12 | ci/etc/rprofile 13 | ci/etc/*.patch 14 | ci/vcpkg/*.patch 15 | CHANGELOG.md 16 | ballista/CHANGELOG.md 17 | python/CHANGELOG.md 18 | dev/requirements*.txt 19 | dev/release/rat_exclude_files.txt 20 | helm/ballista/Chart.lock 21 | pax_global_header 22 | MANIFEST.in 23 | __init__.pxd 24 | __init__.py 25 | requirements.txt 26 | *.html 27 | *.sgml 28 | *.css 29 | *.png 30 | *.ico 31 | *.svg 32 | *.devhelp2 33 | *.scss 34 | .gitattributes 35 | benchmarks/queries/q*.sql 36 | ballista/scheduler/testdata/* 37 | **/yarn.lock 38 | python/requirements*.txt 39 | **/testdata/* 40 | benchmarks/queries/* 41 | benchmarks/data/* 42 | ci/* 43 | **/*.svg 44 | **/*.csv 45 | **/*.json 46 | **/*.sql 47 | venv/* 48 | testing/* 49 | target/* 50 | **/target/* 51 | 
Cargo.lock 52 | **/Cargo.lock 53 | .history 54 | parquet-testing/* 55 | *rat.txt 56 | ballista/core/src/serde/generated/ballista.rs -------------------------------------------------------------------------------- /dev/release/release-tarball.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # 20 | 21 | # Adapted from https://github.com/apache/arrow-rs/tree/master/dev/release/release-tarball.sh 22 | 23 | # This script copies a tarball from the "dev" area of the 24 | # dist.apache.datafusion repository to the "release" area 25 | # 26 | # This script should only be run after the release has been approved 27 | # by the DataFusion PMC committee. 28 | # 29 | # See release/README.md for full release instructions 30 | # 31 | # Based in part on post-01-upload.sh from apache/arrow 32 | 33 | 34 | set -e 35 | set -u 36 | 37 | if [ "$#" -ne 2 ]; then 38 | echo "Usage: $0 <version> <rc>" 39 | echo "ex. 
$0 4.1.0 2" 40 | exit 1 41 | fi 42 | 43 | version=$1 44 | rc=$2 45 | 46 | tmp_dir=tmp-apache-datafusion-ballista-dist 47 | 48 | echo "Recreate temporary directory: ${tmp_dir}" 49 | rm -rf ${tmp_dir} 50 | mkdir -p ${tmp_dir} 51 | 52 | echo "Clone dev dist repository" 53 | svn \ 54 | co \ 55 | https://dist.apache.org/repos/dist/dev/datafusion/apache-datafusion-ballista-${version}-rc${rc} \ 56 | ${tmp_dir}/dev 57 | 58 | echo "Clone release dist repository" 59 | svn co https://dist.apache.org/repos/dist/release/datafusion ${tmp_dir}/release 60 | 61 | echo "Copy ${version}-rc${rc} to release working copy" 62 | release_version=datafusion-ballista-${version} 63 | mkdir -p ${tmp_dir}/release/${release_version} 64 | cp -r ${tmp_dir}/dev/* ${tmp_dir}/release/${release_version}/ 65 | svn add ${tmp_dir}/release/${release_version} 66 | 67 | echo "Commit release" 68 | svn ci -m "Apache DataFusion Ballista ${version}" ${tmp_dir}/release 69 | 70 | echo "Clean up" 71 | rm -rf ${tmp_dir} 72 | 73 | echo "Success! The release is available here:" 74 | echo " https://dist.apache.org/repos/dist/release/datafusion/${release_version}" 75 | -------------------------------------------------------------------------------- /dev/release/run-rat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # 20 | 21 | RAT_VERSION=0.13 22 | 23 | # download apache rat; -f makes curl fail on HTTP errors, and a partial file is removed so a broken jar is never reused by the existence check 24 | if [ ! -f apache-rat-${RAT_VERSION}.jar ]; then 25 | curl -fsS -o apache-rat-${RAT_VERSION}.jar https://repo1.maven.org/maven2/org/apache/rat/apache-rat/${RAT_VERSION}/apache-rat-${RAT_VERSION}.jar || { rm -f apache-rat-${RAT_VERSION}.jar; echo "Failed to download apache-rat-${RAT_VERSION}.jar" >&2; exit 1; } 26 | fi 27 | 28 | RAT="java -jar apache-rat-${RAT_VERSION}.jar -x " 29 | 30 | RELEASE_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) 31 | 32 | # generate the rat report 33 | $RAT "$1" > rat.txt 34 | python "$RELEASE_DIR/check-rat-report.py" "$RELEASE_DIR/rat_exclude_files.txt" rat.txt > filtered_rat.txt 35 | cat filtered_rat.txt 36 | UNAPPROVED=`cat filtered_rat.txt | grep "NOT APPROVED" | wc -l` 37 | 38 | if [ "0" -eq "${UNAPPROVED}" ]; then 39 | echo "No unapproved licenses" 40 | else 41 | echo "${UNAPPROVED} unapproved licences. Check rat report: rat.txt" 42 | exit 1 43 | fi 44 | -------------------------------------------------------------------------------- /dev/rust_lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | set -e 20 | if ! command -v cargo-tomlfmt &> /dev/null; then 21 | echo "Installing cargo-tomlfmt using cargo" 22 | cargo install cargo-tomlfmt 23 | fi 24 | 25 | ci/scripts/rust_fmt.sh 26 | ci/scripts/rust_clippy.sh 27 | ci/scripts/rust_toml_fmt.sh -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | version: '3.3' 18 | services: 19 | ballista-scheduler: 20 | image: ballista-scheduler 21 | build: 22 | dockerfile: dev/docker/ballista-scheduler.Dockerfile 23 | context: . 
24 | command: "--bind-host 0.0.0.0" 25 | ports: 26 | - "50050:50050" 27 | environment: 28 | - RUST_LOG=ballista=info 29 | volumes: 30 | - ./benchmarks/data:/data 31 | healthcheck: 32 | test: ["CMD", "nc", "-z", "ballista-scheduler", "50050"] 33 | interval: 5s 34 | timeout: 5s 35 | retries: 5 36 | ballista-executor: 37 | image: ballista-executor 38 | build: 39 | dockerfile: dev/docker/ballista-executor.Dockerfile 40 | context: . 41 | command: "--bind-host 0.0.0.0 --scheduler-host ballista-scheduler --scheduler-connect-timeout-seconds 15" 42 | deploy: 43 | replicas: 2 44 | restart: always 45 | environment: 46 | - RUST_LOG=ballista=info 47 | volumes: 48 | - ./benchmarks/data:/data 49 | depends_on: 50 | - ballista-scheduler 51 | healthcheck: 52 | test: ["CMD", "nc", "-z", "ballista-executor", "50051"] 53 | interval: 5s 54 | timeout: 5s 55 | retries: 5 56 | ballista-client: 57 | image: ballista-benchmarks 58 | build: 59 | dockerfile: dev/docker/ballista-benchmarks.Dockerfile 60 | context: . 61 | command: ["/bin/bash", "-c", "sleep infinity"] 62 | ports: 63 | - "50051:50051" 64 | environment: 65 | - RUST_LOG=info 66 | volumes: 67 | - ./benchmarks/data:/data 68 | depends_on: 69 | - ballista-scheduler 70 | - ballista-executor 71 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | build 19 | source/python/generated 20 | venv/ 21 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # 19 | # Minimal makefile for Sphinx documentation 20 | # 21 | 22 | # You can set these variables from the command line, and also 23 | # from the environment for the first two. 24 | SPHINXOPTS ?= 25 | SPHINXBUILD ?= sphinx-build 26 | SOURCEDIR = source 27 | BUILDDIR = build 28 | 29 | # Put it first so that "make" without argument is like "make help". 
30 | help: 31 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 32 | 33 | .PHONY: help Makefile 34 | 35 | # Catch-all target: route all unknown targets to Sphinx using the new 36 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 37 | %: Makefile 38 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 39 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Ballista Documentation 21 | 22 | ## User Documentation 23 | 24 | Documentation for the current published release can be found at <https://datafusion.apache.org/ballista/> and the source 25 | content is located [here](source/user-guide/introduction.md). 26 | 27 | ## Developer Documentation 28 | 29 | Developer documentation can be found [here](developer/README.md). 30 | 31 | ## Building the User Guide 32 | 33 | ### Dependencies 34 | 35 | It's recommended to install build dependencies and build the documentation 36 | inside a Python virtualenv. 37 | 38 | - Python 39 | - `pip install -r requirements.txt` 40 | 41 | ## Build 42 | 43 | ```bash 44 | ./build.sh 45 | ``` 46 | 47 | ## Release 48 | 49 | The documentation is published from the `asf-site` branch of this repository. 50 | 51 | Documentation is published automatically when documentation changes are pushed to the main branch. 52 | -------------------------------------------------------------------------------- /docs/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. 
The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | rm -rf build 21 | make html 22 | -------------------------------------------------------------------------------- /docs/developer/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Ballista Developer Documentation 21 | 22 | - Read the [Architecture Overview](architecture.md) to get an understanding of the scheduler and executor 23 | processes and how distributed query execution works. 24 | - Watch the [Ballista: Distributed Compute with Rust and Apache Arrow](https://www.youtube.com/watch?v=ZZHQaOap9pQ) 25 | talk from the New York Open Statistical Programming Meetup (Feb 2021) 26 | -------------------------------------------------------------------------------- /docs/developer/images/query-execution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-ballista/97c919274d9de496b630e66c12ad29c3fccd110b/docs/developer/images/query-execution.png -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @rem Licensed to the Apache Software Foundation (ASF) under one 2 | @rem or more contributor license agreements. 
See the NOTICE file 3 | @rem distributed with this work for additional information 4 | @rem regarding copyright ownership. The ASF licenses this file 5 | @rem to you under the Apache License, Version 2.0 (the 6 | @rem "License"); you may not use this file except in compliance 7 | @rem with the License. You may obtain a copy of the License at 8 | @rem 9 | @rem http://www.apache.org/licenses/LICENSE-2.0 10 | @rem 11 | @rem Unless required by applicable law or agreed to in writing, 12 | @rem software distributed under the License is distributed on an 13 | @rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | @rem KIND, either express or implied. See the License for the 15 | @rem specific language governing permissions and limitations 16 | @rem under the License. 17 | 18 | @ECHO OFF 19 | 20 | pushd %~dp0 21 | 22 | REM Command file for Sphinx documentation 23 | 24 | if "%SPHINXBUILD%" == "" ( 25 | set SPHINXBUILD=sphinx-build 26 | ) 27 | set SOURCEDIR=source 28 | set BUILDDIR=build 29 | 30 | if "%1" == "" goto help 31 | 32 | %SPHINXBUILD% >NUL 2>NUL 33 | if errorlevel 9009 ( 34 | echo. 35 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 36 | echo.installed, then set the SPHINXBUILD environment variable to point 37 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 38 | echo.may add the Sphinx directory to PATH. 39 | echo. 
40 | echo.If you don't have Sphinx installed, grab it from 41 | echo.http://sphinx-doc.org/ 42 | exit /b 1 43 | ) 44 | 45 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 46 | goto end 47 | 48 | :help 49 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 50 | 51 | :end 52 | popd 53 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | sphinx 19 | Jinja2 20 | pydata-sphinx-theme==0.8.0 21 | myst-parser 22 | maturin 23 | -------------------------------------------------------------------------------- /docs/source/_static/images/ballista-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-ballista/97c919274d9de496b630e66c12ad29c3fccd110b/docs/source/_static/images/ballista-logo.png -------------------------------------------------------------------------------- /docs/source/_static/images/ballista_black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-ballista/97c919274d9de496b630e66c12ad29c3fccd110b/docs/source/_static/images/ballista_black.png -------------------------------------------------------------------------------- /docs/source/_static/images/ballista_white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-ballista/97c919274d9de496b630e66c12ad29c3fccd110b/docs/source/_static/images/ballista_white.png -------------------------------------------------------------------------------- /docs/source/_static/images/tpch_allqueries.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-ballista/97c919274d9de496b630e66c12ad29c3fccd110b/docs/source/_static/images/tpch_allqueries.png -------------------------------------------------------------------------------- /docs/source/_static/images/tpch_queries_compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-ballista/97c919274d9de496b630e66c12ad29c3fccd110b/docs/source/_static/images/tpch_queries_compare.png -------------------------------------------------------------------------------- 
/docs/source/_static/images/tpch_queries_speedup_abs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-ballista/97c919274d9de496b630e66c12ad29c3fccd110b/docs/source/_static/images/tpch_queries_speedup_abs.png -------------------------------------------------------------------------------- /docs/source/_static/images/tpch_queries_speedup_rel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-ballista/97c919274d9de496b630e66c12ad29c3fccd110b/docs/source/_static/images/tpch_queries_speedup_rel.png -------------------------------------------------------------------------------- /docs/source/_templates/docs-sidebar.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 10 | 11 | 20 | -------------------------------------------------------------------------------- /docs/source/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "pydata_sphinx_theme/layout.html" %} 2 | 3 | {# Silence the navbar #} 4 | {% block docs_navbar %} 5 | {% endblock %} 6 | 7 | 10 | {% block footer %} 11 | 12 |
13 |
14 | {% for footer_item in theme_footer_items %} 15 | 18 | {% endfor %} 19 | 23 |
24 |
25 | 26 | {% endblock %} 27 | -------------------------------------------------------------------------------- /docs/source/community/communication.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Communication 21 | 22 | We welcome participation from everyone and encourage you to join us, ask 23 | questions, and get involved. 24 | 25 | All participation in the Apache DataFusion Ballista project is governed by the 26 | Apache Software Foundation's [code of 27 | conduct](https://www.apache.org/foundation/policies/conduct.html). 28 | 29 | We use the same communication channels as the main DataFusion project: 30 | 31 | [https://datafusion.apache.org/contributor-guide/communication.html](https://datafusion.apache.org/contributor-guide/communication.html) 32 | 33 | ## Contributing 34 | 35 | Our source code is hosted on 36 | [GitHub](https://github.com/apache/datafusion-ballista). More information on contributing is in 37 | the [Contribution Guide](https://github.com/apache/datafusion-ballista/blob/main/CONTRIBUTING.md) 38 | , and we have curated a [good-first-issue](https://github.com/apache/datafusion-ballista/contribute) 39 | list to help you get started. You can find DataFusion's major designs in docs/source/specification. 40 | 41 | We use GitHub issues for maintaining a queue of development work and as the 42 | public record. We often use Google Docs, GitHub issues and pull requests for 43 | quick and small design discussions. For major design change proposals, we encourage you to write an RFC.
44 | -------------------------------------------------------------------------------- /docs/source/contributors-guide/ballista.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-ballista/97c919274d9de496b630e66c12ad29c3fccd110b/docs/source/contributors-guide/ballista.drawio.png -------------------------------------------------------------------------------- /docs/source/contributors-guide/development.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Ballista Development 21 | 22 | We welcome participation from everyone and encourage you to join us, ask 23 | questions, and get involved. 24 | 25 | All participation in the Apache DataFusion Ballista project is governed by the 26 | Apache Software Foundation's [code of 27 | conduct](https://www.apache.org/foundation/policies/conduct.html). 28 | 29 | ## Development Environment 30 | 31 | The easiest way to get started if you are using VSCode or IntelliJ IDEA is to open the provided [Dev Container](https://containers.dev/overview) 32 | which will install all the required dependencies including Rust, Docker, Node.js and Yarn. A Dev Container is a 33 | development environment that runs in a Docker container. It is configured with all the required dependencies to 34 | build and test the project. It also includes VS Code and the Rust and Node.js extensions. Other supporting tools 35 | that use Dev Containers can be seen [here](https://containers.dev/supporting). 36 | 37 | To use the Dev Container, open the project in VS Code and then click the "Reopen in Container" button in the 38 | bottom right corner of the IDE. 39 | 40 | If you are not using the Dev Container or VS Code, you will need to install these dependencies yourself. 41 | 42 | - [Rust](https://www.rust-lang.org/tools/install) 43 | - [Protobuf Compiler](https://protobuf.dev/downloads/) is required to build the project.
44 | - [Node.js](https://nodejs.org/en/download/) is required to build the project. 45 | - [Yarn](https://classic.yarnpkg.com/en/docs/install) is required to build the UI. 46 | - [Docker](https://docs.docker.com/get-docker/) is required to run the integration tests. 47 | 48 | ## Build the project 49 | 50 | From the root of the project, build release binaries. 51 | 52 | ```shell 53 | cargo build --release 54 | ``` 55 | 56 | ## Testing the project 57 | 58 | ```shell 59 | cargo test 60 | ``` 61 | 62 | ## Running the examples 63 | 64 | ```shell 65 | cd examples 66 | cargo run --example standalone_sql --features=ballista/standalone 67 | ``` 68 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Licensed to the Apache Software Foundation (ASF) under one 2 | .. or more contributor license agreements. See the NOTICE file 3 | .. distributed with this work for additional information 4 | .. regarding copyright ownership. The ASF licenses this file 5 | .. to you under the Apache License, Version 2.0 (the 6 | .. "License"); you may not use this file except in compliance 7 | .. with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | .. software distributed under the License is distributed on an 13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | .. KIND, either express or implied. See the License for the 15 | .. specific language governing permissions and limitations 16 | .. under the License. 17 | 18 | ========================== 19 | Apache DataFusion Ballista 20 | ========================== 21 | 22 | Table of contents 23 | ================= 24 | 25 | 26 | .. _toc.guide: 27 | 28 | .. toctree:: 29 | :maxdepth: 1 30 | :caption: User Guide 31 | 32 | Introduction 33 | 34 | ..
toctree:: 35 | :maxdepth: 1 36 | :caption: Cluster Deployment 37 | 38 | Deployment 39 | Scheduler 40 | 41 | .. toctree:: 42 | :maxdepth: 1 43 | :caption: Clients 44 | 45 | Python 46 | Rust 47 | Flight SQL JDBC 48 | SQL CLI 49 | 50 | .. toctree:: 51 | :maxdepth: 1 52 | :caption: Reference 53 | 54 | user-guide/configs 55 | user-guide/tuning-guide 56 | user-guide/metrics 57 | user-guide/faq 58 | user-guide/extending-components 59 | 60 | .. _toc.contributors: 61 | 62 | .. toctree:: 63 | :maxdepth: 1 64 | :caption: Contributors Guide 65 | 66 | contributors-guide/architecture 67 | contributors-guide/code-organization 68 | contributors-guide/development 69 | Source code 70 | 71 | .. _toc.community: 72 | 73 | .. toctree:: 74 | :maxdepth: 1 75 | :caption: Community 76 | 77 | community/communication 78 | 79 | Issue tracker 80 | Code of conduct 81 | -------------------------------------------------------------------------------- /docs/source/user-guide/deployment/cargo-install.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Deploying a standalone Ballista cluster using cargo install 21 | 22 | A simple way to start a local cluster for testing purposes is to use cargo to install 23 | the scheduler and executor crates. 24 | 25 | ```bash 26 | cargo install --locked ballista-scheduler 27 | cargo install --locked ballista-executor 28 | ``` 29 | 30 | With these crates installed, it is now possible to start a scheduler process. 31 | 32 | ```bash 33 | RUST_LOG=info ballista-scheduler 34 | ``` 35 | 36 | The scheduler will bind to port 50050 by default. 37 | 38 | Next, start an executor process in a new terminal session. 39 | 40 | ```bash 41 | RUST_LOG=info ballista-executor 42 | ``` 43 | 44 | The executor will bind to port 50051 by default. Additional executors can be started by 45 | manually specifying a bind port.
For example: 46 | 47 | ```bash 48 | RUST_LOG=info ballista-executor --bind-port 50052 49 | ``` 50 | -------------------------------------------------------------------------------- /docs/source/user-guide/deployment/index.rst: -------------------------------------------------------------------------------- 1 | .. Licensed to the Apache Software Foundation (ASF) under one 2 | .. or more contributor license agreements. See the NOTICE file 3 | .. distributed with this work for additional information 4 | .. regarding copyright ownership. The ASF licenses this file 5 | .. to you under the Apache License, Version 2.0 (the 6 | .. "License"); you may not use this file except in compliance 7 | .. with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | .. software distributed under the License is distributed on an 13 | .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | .. KIND, either express or implied. See the License for the 15 | .. specific language governing permissions and limitations 16 | .. under the License. 17 | 18 | Start a Ballista Cluster 19 | ======================== 20 | 21 | .. toctree:: 22 | :maxdepth: 2 23 | 24 | Quick Start 25 | Cargo Install 26 | Docker 27 | Docker Compose 28 | Kubernetes 29 | -------------------------------------------------------------------------------- /docs/source/user-guide/faq.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Frequently Asked Questions 21 | 22 | ## What is the relationship between DataFusion and Ballista? 23 | 24 | DataFusion is a library for executing queries in-process using the Apache Arrow memory 25 | model and computational kernels. It is designed to run within a single process, using threads 26 | for parallel query execution. 27 | 28 | Ballista is a distributed compute platform for DataFusion workloads. 
29 | -------------------------------------------------------------------------------- /docs/source/user-guide/images/ballista-web-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-ballista/97c919274d9de496b630e66c12ad29c3fccd110b/docs/source/user-guide/images/ballista-web-ui.png -------------------------------------------------------------------------------- /docs/source/user-guide/images/example-query-plan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-ballista/97c919274d9de496b630e66c12ad29c3fccd110b/docs/source/user-guide/images/example-query-plan.png -------------------------------------------------------------------------------- /docs/source/user-guide/introduction.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Overview 21 | 22 | Ballista is a distributed compute platform primarily implemented in Rust, and powered by Apache DataFusion. 23 | 24 | Ballista has a scheduler and an executor process that are standard Rust executables and can be executed directly, but 25 | Dockerfiles are provided to build images for use in containerized environments, such as Docker, Docker Compose, and 26 | Kubernetes. See the [deployment guide](deployment.md) for more information. 27 | 28 | SQL and DataFrame queries can be submitted from Python and Rust, and SQL queries can be submitted via the Arrow 29 | Flight SQL JDBC driver, supporting your favorite JDBC compliant tools such as [DataGrip](https://www.jetbrains.com/datagrip/) 30 | or [Tableau](https://help.tableau.com/current/pro/desktop/en-us/examples_otherdatabases_jdbc.htm). For setup instructions, please see the [FlightSQL guide](flightsql.md). 31 | 32 | ## How does this compare to Apache Spark? 33 | 34 | Although Ballista is largely inspired by Apache Spark, there are some key differences.
35 | 36 | - The choice of Rust as the main execution language means that memory usage is deterministic and avoids the overhead 37 | of GC pauses. 38 | - Ballista is designed from the ground up to use columnar data, enabling a number of efficiencies such as vectorized 39 | processing (SIMD and GPU) and efficient compression. Although Spark does have some columnar support, it is still 40 | largely row-based today. 41 | - The combination of Rust and Arrow provides excellent memory efficiency and memory usage can be 5x - 10x lower than 42 | Apache Spark in some cases, which means that more processing can fit on a single node, reducing the overhead of 43 | distributed compute. 44 | - The use of Apache Arrow as the memory model and network protocol means that data can be exchanged between executors 45 | in any programming language with minimal serialization overhead. 46 | 47 | [deployment]: ./deployment 48 | [datagrip]: https://www.jetbrains.com/datagrip/ 49 | [tableau]: https://help.tableau.com/current/pro/desktop/en-us/examples_otherdatabases_jdbc.htm 50 | -------------------------------------------------------------------------------- /docs/source/user-guide/metrics.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Ballista Scheduler Metrics 21 | 22 | ## Prometheus 23 | 24 | > This is an optional scheduler feature which should be enabled with the `prometheus-metrics` feature 25 | 26 | Built with default features, the ballista scheduler will automatically collect and expose a standard set of prometheus metrics.
27 | The metrics currently collected automatically include: 28 | 29 | - _job_exec_time_seconds_ - Histogram of successful job execution time in seconds 30 | - _planning_time_ms_ - Histogram of job planning time in milliseconds 31 | - _failed_ - Counter of failed jobs 32 | - _job_failed_total_ - Counter of failed jobs 33 | - _job_cancelled_total_ - Counter of cancelled jobs 34 | - _job_completed_total_ - Counter of completed jobs 35 | - _job_submitted_total_ - Counter of submitted jobs 36 | - _pending_task_queue_size_ - Number of pending tasks 37 | 38 | **NOTE** Currently the histogram buckets for the above metrics are set to reasonable defaults. If the defaults are not 39 | appropriate for a given use case, the only workaround is to implement a custom `SchedulerMetricsCollector`. In the future 40 | the buckets should be made configurable. 41 | 42 | The metrics are then exported through the scheduler REST API at `GET /api/metrics`. It should be sufficient to ingest metrics 43 | into an existing metrics system by pointing your chosen prometheus exporter at that endpoint. 44 | -------------------------------------------------------------------------------- /docs/source/user-guide/scheduler.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Ballista Scheduler 21 | 22 | ## REST API 23 | 24 | The scheduler also provides a REST API that allows jobs to be monitored. 25 | 26 | > This is an optional scheduler feature which should be enabled with the `rest-api` feature 27 | 28 | | API | Method | Description | 29 | | ------------------------------------ | ------ | ----------------------------------------------------------------- | 30 | | /api/jobs | GET | Get a list of jobs that have been submitted to the cluster. | 31 | | /api/job/{job_id} | GET | Get a summary of a submitted job. | 32 | | /api/job/{job_id}/dot | GET | Produce a query plan in DOT (graphviz) format.
| 33 | | /api/job/:job_id/dot_svg | GET | Produce a query plan in SVG format. (`graphviz-support` required) | 34 | | /api/job/{job_id} | PATCH | Cancel a currently running job | 35 | | /api/job/:job_id/stage/:stage_id/dot | GET | Produces stage plan in DOT (graphviz) format | 36 | | /api/metrics | GET | Return current scheduler metric set | 37 | -------------------------------------------------------------------------------- /examples/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | [package] 19 | name = "ballista-examples" 20 | description = "Ballista usage examples" 21 | version = "47.0.0" 22 | homepage = "https://datafusion.apache.org/ballista/" 23 | repository = "https://github.com/apache/datafusion-ballista" 24 | authors = ["Apache DataFusion "] 25 | license = "Apache-2.0" 26 | keywords = ["arrow", "distributed", "query", "sql"] 27 | edition = "2021" 28 | publish = false 29 | 30 | [[example]] 31 | name = "standalone_sql" 32 | path = "examples/standalone-sql.rs" 33 | required-features = ["ballista/standalone"] 34 | 35 | [dependencies] 36 | ballista = { path = "../ballista/client", version = "47.0.0" } 37 | ballista-core = { path = "../ballista/core", version = "47.0.0", feature = ["binary-build"] } 38 | ballista-executor = { path = "../ballista/executor", version = "47.0.0", default-features = false } 39 | ballista-scheduler = { path = "../ballista/scheduler", version = "47.0.0", default-features = false } 40 | datafusion = { workspace = true } 41 | env_logger = { workspace = true } 42 | futures = { workspace = true } 43 | log = { workspace = true } 44 | object_store = { workspace = true, features = ["aws"] } 45 | tokio = { workspace = true, features = [ 46 | "macros", 47 | "rt", 48 | "rt-multi-thread", 49 | "sync", 50 | "parking_lot" 51 | ] } 52 | url = { workspace = true } 53 | 54 | [dev-dependencies] 55 | ctor = { workspace = true } 56 | env_logger = { workspace = true } 57 | testcontainers-modules = { version = "0.11", features = ["minio"] } 58 | tonic = { workspace = true } 59 | 60 | [features] 61 | default = [] 62 | testcontainers = [] 63 | -------------------------------------------------------------------------------- /examples/examples/custom-executor.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. 
See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use ballista_core::object_store::{ 19 | runtime_env_with_s3_support, session_config_with_s3_support, 20 | }; 21 | 22 | use ballista_executor::executor_process::{ 23 | start_executor_process, ExecutorProcessConfig, 24 | }; 25 | use std::sync::Arc; 26 | /// 27 | /// # Custom Ballista Executor 28 | /// 29 | /// This example demonstrates how to create custom ballista executors.
30 | /// 31 | #[tokio::main] 32 | async fn main() -> ballista_core::error::Result<()> { 33 | let _ = env_logger::builder() 34 | .filter_level(log::LevelFilter::Info) 35 | .is_test(true) 36 | .try_init(); 37 | 38 | let config: ExecutorProcessConfig = ExecutorProcessConfig { 39 | // overriding default config producer with custom producer 40 | // which has required S3 configuration options 41 | override_config_producer: Some(Arc::new(session_config_with_s3_support)), 42 | // overriding default runtime producer with custom producer 43 | // which knows how to create S3 connections 44 | override_runtime_producer: Some(Arc::new(runtime_env_with_s3_support)), 45 | ..Default::default() 46 | }; 47 | 48 | start_executor_process(Arc::new(config)).await 49 | } 50 | -------------------------------------------------------------------------------- /examples/examples/custom-scheduler.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | use ballista_core::error::BallistaError; 19 | use ballista_core::object_store::{ 20 | session_config_with_s3_support, session_state_with_s3_support, 21 | }; 22 | 23 | use ballista_scheduler::cluster::BallistaCluster; 24 | use ballista_scheduler::config::SchedulerConfig; 25 | use ballista_scheduler::scheduler_process::start_server; 26 | use std::net::AddrParseError; 27 | use std::sync::Arc; 28 | 29 | /// 30 | /// # Custom Ballista Scheduler 31 | /// 32 | /// This example demonstrates how to create custom ballista schedulers. 33 | /// 34 | #[tokio::main] 35 | async fn main() -> ballista_core::error::Result<()> { 36 | let _ = env_logger::builder() 37 | .filter_level(log::LevelFilter::Info) 38 | .is_test(true) 39 | .try_init(); 40 | 41 | let config: SchedulerConfig = SchedulerConfig { 42 | // overriding default config producer with custom producer 43 | // which has required S3 configuration options 44 | override_config_producer: Some(Arc::new(session_config_with_s3_support)), 45 | // overriding default session builder, which has custom session configuration 46 | // runtime environment and session state. 47 | override_session_builder: Some(Arc::new(session_state_with_s3_support)), 48 | ..Default::default() 49 | }; 50 | 51 | let addr = format!("{}:{}", config.bind_host, config.bind_port); 52 | let addr = addr 53 | .parse() 54 | .map_err(|e: AddrParseError| BallistaError::Configuration(e.to_string()))?; 55 | 56 | let cluster = BallistaCluster::new_from_config(&config).await?; 57 | start_server(cluster, addr, Arc::new(config)).await?; 58 | 59 | Ok(()) 60 | } 61 | -------------------------------------------------------------------------------- /examples/examples/remote-dataframe.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements.
See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use ballista::prelude::*; 19 | use ballista_examples::test_util; 20 | use datafusion::{ 21 | common::Result, 22 | execution::SessionStateBuilder, 23 | prelude::{col, lit, ParquetReadOptions, SessionConfig, SessionContext}, 24 | }; 25 | 26 | /// This example demonstrates executing a simple query against an Arrow data source (Parquet) and 27 | /// fetching results, using the DataFrame trait 28 | #[tokio::main] 29 | async fn main() -> Result<()> { 30 | let config = SessionConfig::new_with_ballista().with_target_partitions(4); 31 | 32 | let state = SessionStateBuilder::new() 33 | .with_config(config) 34 | .with_default_features() 35 | .build(); 36 | 37 | let ctx = SessionContext::remote_with_state("df://localhost:50050", state).await?; 38 | 39 | let test_data = test_util::examples_test_data(); 40 | let filename = format!("{test_data}/alltypes_plain.parquet"); 41 | 42 | let df = ctx 43 | .read_parquet(filename, ParquetReadOptions::default()) 44 | .await? 45 | .select_columns(&["id", "bool_col", "timestamp_col"])? 
46 | .filter(col("id").gt(lit(1)))?; 47 | 48 | df.show().await?; 49 | 50 | Ok(()) 51 | } 52 | -------------------------------------------------------------------------------- /examples/examples/remote-sql.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | use ballista::datafusion::{ 19 | common::Result, 20 | execution::SessionStateBuilder, 21 | prelude::{CsvReadOptions, SessionConfig, SessionContext}, 22 | }; 23 | use ballista::prelude::*; 24 | use ballista_examples::test_util; 25 | 26 | /// This example demonstrates executing a simple query against an Arrow data source (CSV) and 27 | /// fetching results, using SQL 28 | #[tokio::main] 29 | async fn main() -> Result<()> { 30 | let config = SessionConfig::new_with_ballista() 31 | .with_target_partitions(4) 32 | .with_ballista_job_name("Remote SQL Example"); 33 | 34 | let state = SessionStateBuilder::new() 35 | .with_config(config) 36 | .with_default_features() 37 | .build(); 38 | 39 | let ctx = SessionContext::remote_with_state("df://localhost:50050", state).await?; 40 | 41 | let test_data = test_util::examples_test_data(); 42 | 43 | ctx.register_csv( 44 | "test", 45 | &format!("{test_data}/aggregate_test_100.csv"), 46 | CsvReadOptions::new(), 47 | ) 48 | .await?; 49 | 50 | let df = ctx 51 | .sql( 52 | "SELECT c1, MIN(c12), MAX(c12) \ 53 | FROM test \ 54 | WHERE c11 > 0.1 AND c11 < 0.9 \ 55 | GROUP BY c1", 56 | ) 57 | .await?; 58 | 59 | df.show().await?; 60 | 61 | Ok(()) 62 | } 63 | -------------------------------------------------------------------------------- /examples/examples/standalone-sql.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use ballista::datafusion::{ 19 | common::Result, 20 | execution::{options::ParquetReadOptions, SessionStateBuilder}, 21 | prelude::{SessionConfig, SessionContext}, 22 | }; 23 | use ballista::prelude::{SessionConfigExt, SessionContextExt}; 24 | use ballista_examples::test_util; 25 | 26 | #[tokio::main] 27 | async fn main() -> Result<()> { 28 | let config = SessionConfig::new_with_ballista() 29 | .with_target_partitions(1) 30 | .with_ballista_standalone_parallelism(2); 31 | 32 | let state = SessionStateBuilder::new() 33 | .with_config(config) 34 | .with_default_features() 35 | .build(); 36 | 37 | let ctx = SessionContext::standalone_with_state(state).await?; 38 | 39 | let test_data = test_util::examples_test_data(); 40 | 41 | // register parquet file with the execution context 42 | ctx.register_parquet( 43 | "test", 44 | &format!("{test_data}/alltypes_plain.parquet"), 45 | ParquetReadOptions::default(), 46 | ) 47 | .await?; 48 | 49 | let df = ctx.sql("select count(1) from test").await?; 50 | 51 | df.show().await?; 52 | Ok(()) 53 | } 54 | -------------------------------------------------------------------------------- /examples/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. 
The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | pub mod test_util; 19 | -------------------------------------------------------------------------------- /examples/testdata/alltypes_plain.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-ballista/97c919274d9de496b630e66c12ad29c3fccd110b/examples/testdata/alltypes_plain.parquet -------------------------------------------------------------------------------- /header: -------------------------------------------------------------------------------- 1 | Licensed to the Apache Software Foundation (ASF) under one 2 | or more contributor license agreements. See the NOTICE file 3 | distributed with this work for additional information 4 | regarding copyright ownership. The ASF licenses this file 5 | to you under the Apache License, Version 2.0 (the 6 | "License"); you may not use this file except in compliance 7 | with the License. You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# This file is a git pre-commit hook.
#
# Soft link it as a git hook from the top directory of the git repository:
#   $ ln -s ../../pre-commit.sh .git/hooks/pre-commit
#
# It can also be run directly:
#   $ ./pre-commit.sh
#
# Exit status: 0 when the commit may proceed, 1 when `cargo fmt` had to rewrite
# files (the commit is aborted so the changes can be reviewed and re-staged).

# Colour helpers. They emit the raw escape sequences; callers print the result
# with `echo -e`, which is what interprets them. "$*" (not "$@") is used because
# the arguments are embedded in a single string.
function RED() {
	echo "\033[0;31m$*\033[0m"
}

function GREEN() {
	echo "\033[0;32m$*\033[0m"
}

function BYELLOW() {
	echo "\033[1;33m$*\033[0m"
}

# Always work from the top of the working tree. GIT_DIR (which git may export
# when it runs a hook) names the repository directory, e.g. `.git`, not the
# working tree, so it must not be used as the directory to cd into.
REPO_ROOT=$(git rev-parse --show-toplevel) || exit 1
cd "${REPO_ROOT}" || exit 1

# Number of staged paths that end in `.rs` (the dot is escaped so that only
# real Rust sources are counted).
NUM_CHANGES=$(git diff --cached --name-only . | grep -c '\.rs$')

if [ "${NUM_CHANGES}" -eq 0 ]; then
	echo -e "$(GREEN INFO): no staged changes in *.rs, $(GREEN skip cargo fmt/clippy)"
	exit 0
fi

# 1. cargo clippy

echo -e "$(GREEN INFO): cargo clippy ..."

# clippy is advisory here: its outcome never blocks the commit, but a failing
# run is reported as such instead of being announced as "done".
if cargo clippy; then
	echo -e "$(GREEN INFO): cargo clippy done"
else
	echo -e "$(BYELLOW WARN): cargo clippy reported errors, please have a look"
fi

# 2. cargo fmt: format with nightly and stable.

CHANGED_BY_CARGO_FMT=false
echo -e "$(GREEN INFO): cargo fmt with nightly and stable ..."

for version in nightly stable; do
	CMD="cargo +${version} fmt"
	# A toolchain (or its rustfmt component) that is not installed must not be
	# mistaken for "formatting changed some files".
	if ! ${CMD} --version >/dev/null 2>&1; then
		echo -e "$(BYELLOW WARN): ${CMD} is not available, skipped"
		continue
	fi
	if ! ${CMD} --all -q -- --check 2>/dev/null; then
		${CMD} --all
		echo -e "$(BYELLOW WARN): ${CMD} changed some files"
		CHANGED_BY_CARGO_FMT=true
	fi
done

if ${CHANGED_BY_CARGO_FMT}; then
	echo -e "$(RED FAIL): git commit $(RED ABORTED), please have a look and run git add/commit again"
	exit 1
fi

exit 0
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | [package] 19 | name = "pyballista" 20 | version = "43.0.0" 21 | homepage = "https://datafusion.apache.org/ballista/" 22 | repository = "https://github.com/apache/datafusion-ballista" 23 | authors = ["Apache DataFusion "] 24 | description = "Apache Arrow Ballista Python Client" 25 | readme = "README.md" 26 | license = "Apache-2.0" 27 | edition = "2021" 28 | include = ["/src", "/ballista", "/LICENSE.txt", "pyproject.toml", "Cargo.toml", "Cargo.lock"] 29 | publish = false 30 | 31 | [dependencies] 32 | async-trait = "0.1.77" 33 | ballista = { version = "45.0.0" } 34 | ballista-core = { version = "45.0.0" } 35 | ballista-executor = { version = "45.0.0", default-features = false } 36 | ballista-scheduler = { version = "45.0.0", default-features = false } 37 | datafusion = { version = "45", features = ["pyarrow", "avro"] } 38 | datafusion-proto = { version = "45" } 39 | datafusion-python = { version = "45" } 40 | 41 | pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py38"] } 42 | pyo3-log = "0.12" 43 | tokio = { version = "1.42", features = ["macros", "rt", "rt-multi-thread", "sync"] } 44 | 45 | [lib] 46 | crate-type = ["cdylib"] 47 | name = "ballista" 48 | -------------------------------------------------------------------------------- /python/ballista/__init__.py: 
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Package entry point: re-exports the classes implemented in the
# `ballista_internal` submodule and exposes the installed package version.

# NOTE(review): ABCMeta, abstractmethod, List and `pa` are not referenced in
# this module; whether any of these imports is needed for its side effects
# (e.g. loading pyarrow before the native module) cannot be told from here.
from abc import ABCMeta, abstractmethod
from typing import List

# Prefer the standard-library metadata module; fall back to the
# `importlib_metadata` package when the stdlib one cannot be imported.
try:
    import importlib.metadata as importlib_metadata
except ImportError:
    import importlib_metadata

import pyarrow as pa

from .ballista_internal import (
    BallistaBuilder, BallistaScheduler, BallistaExecutor
)

# Version of the installed distribution, looked up by this package's import name.
__version__ = importlib_metadata.version(__name__)

# Public API of the package.
__all__ = [
    "BallistaBuilder", "BallistaScheduler", "BallistaExecutor"
]
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | -------------------------------------------------------------------------------- /python/examples/client_remote.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
# Example: obtain a session context from `BallistaBuilder().remote(...)` for
# the address df://127.0.0.1:50050 and run a trivial query.
# The `# %%` lines split the script into separately runnable cells.

# %%
from ballista import BallistaBuilder
from datafusion.context import SessionContext

# NOTE(review): presumably a scheduler must already be listening on this
# address - confirm.
ctx: SessionContext = BallistaBuilder().remote("df://127.0.0.1:50050")

# Select 1 to verify it's working
ctx.sql("SELECT 1").show()

# %%
# Example: build a standalone Ballista session context with two configuration
# options set, then query it.

# %%

from ballista import BallistaBuilder
from datafusion.context import SessionContext

# Same builder chain as before, written with parentheses instead of
# backslash line continuations.
ctx: SessionContext = (
    BallistaBuilder()
    .config("datafusion.catalog.information_schema", "true")
    .config("ballista.job.name", "example ballista")
    .standalone()
)


# Trivial query to check that the context works.
ctx.sql("SELECT 1").show()

# %%
ctx.sql("SHOW TABLES").show()
# %%
# Read back the job name that was configured on the builder.
job_name_query = "select name, value from information_schema.df_settings where name like 'ballista.job.name'"
ctx.sql(job_name_query).show()


# %%
# Example: drive a `BallistaExecutor` from Python, one step per cell.
# The `# %%` lines split the script into separately runnable cells.

# %%
from ballista import BallistaExecutor
# %%
# Create the executor object with its default settings.
executor = BallistaExecutor()
# %%
executor.start()
# %%
# Bare expression: shows the object's repr when the cell is run interactively;
# it has no effect when the file is run as a plain script.
executor
# %%
# NOTE(review): presumably blocks until the executor terminates - confirm.
executor.wait_for_termination()
# %%
# %%
executor.close()
# %%
# README example (remote): configure a session through `BallistaBuilder`,
# connect to df://127.0.0.1:50050 and read a Parquet file via SQL and via the
# DataFrame API.

# %%

from ballista import BallistaBuilder
from datafusion.context import SessionContext

builder = BallistaBuilder()
builder = builder.config("ballista.job.name", "Readme Example Remote")
builder = builder.config("datafusion.execution.target_partitions", "4")
ctx: SessionContext = builder.remote("df://127.0.0.1:50050")

# Expose the Parquet file as external table `t`.
ctx.sql("create external table t stored as parquet location '../testdata/test.parquet'")

# %%
# SQL route: first five rows of `t`, first record batch converted to pandas.
batches = ctx.sql("select * from t limit 5").collect()
batches[0].to_pandas()
# %%
# DataFrame route: the same data read directly from the file.
batches = ctx.read_parquet('../testdata/test.parquet').limit(5).collect()
batches[0].to_pandas()
# %%
# README example (standalone): configure a session through `BallistaBuilder`,
# create a standalone context and read a Parquet file via SQL and via the
# DataFrame API.

# %%

from ballista import BallistaBuilder
from datafusion.context import SessionContext

builder = BallistaBuilder()
builder = builder.config("ballista.job.name", "Readme Example")
builder = builder.config("datafusion.execution.target_partitions", "4")
ctx: SessionContext = builder.standalone()

# Expose the Parquet file as external table `t`.
ctx.sql("create external table t stored as parquet location '../testdata/test.parquet'")

# %%
# SQL route: first five rows of `t`, first record batch converted to pandas.
batches = ctx.sql("select * from t limit 5").collect()
batches[0].to_pandas()
# %%
# DataFrame route: the same data read directly from the file.
batches = ctx.read_parquet('../testdata/test.parquet').limit(5).collect()
batches[0].to_pandas()
# %%
# Example: drive a `BallistaScheduler` from Python, one step per cell.
# The `# %%` lines split the script into separately runnable cells.

# %%
from ballista import BallistaScheduler
# %%
# Create the scheduler object with its default settings.
scheduler = BallistaScheduler()
# %%
# Bare expression: shows the object's repr when the cell is run interactively;
# it has no effect when the file is run as a plain script.
scheduler
# %%
scheduler.start()
# %%
# NOTE(review): presumably blocks until the scheduler terminates - confirm.
scheduler.wait_for_termination()
# %%
scheduler.close()
[build-system]
requires = ["maturin>=1.5.1,<1.6.0"]
build-backend = "maturin"

[project]
name = "ballista"
description = "Python client for Apache Arrow Ballista Distributed SQL Query Engine"
readme = "README.md"
license = {file = "LICENSE.txt"}
requires-python = ">=3.9"
keywords = ["ballista", "sql", "rust", "distributed"]
# The standard `[project]` key is `classifiers` (plural); the singular spelling
# is not a recognised metadata field.
# Only interpreter versions allowed by `requires-python` are listed.
classifiers = [
    "Development Status :: 2 - Pre-Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: Apache Software License",
    "License :: OSI Approved",
    "Operating System :: MacOS",
    "Operating System :: Microsoft :: Windows",
    "Operating System :: POSIX :: Linux",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python",
    "Programming Language :: Rust",
]
dependencies = [
    "pyarrow>=19.0.0", "cloudpickle"
]

[project.urls]
homepage = "https://datafusion.apache.org/ballista"
documentation = "https://datafusion.apache.org/ballista"
repository = "https://github.com/apache/datafusion-ballista"

[tool.isort]
profile = "black"

[tool.maturin]
module-name = "ballista.ballista_internal"
include = [
    { path = "Cargo.lock", format = "sdist" }
]
exclude = [".github/**", "ci/**", ".asf.yaml"]
# Require Cargo.lock is up to date
locked = true
-------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::future::Future; 19 | use std::sync::OnceLock; 20 | use tokio::task::JoinHandle; 21 | 22 | use ballista_core::error::BallistaError; 23 | use pyo3::exceptions::PyException; 24 | use pyo3::{PyErr, Python}; 25 | use tokio::runtime::Runtime; 26 | 27 | use crate::TokioRuntime; 28 | 29 | pub(crate) fn to_pyerr(err: BallistaError) -> PyErr { 30 | PyException::new_err(err.to_string()) 31 | } 32 | 33 | #[inline] 34 | pub(crate) fn get_tokio_runtime() -> &'static TokioRuntime { 35 | // NOTE: Other pyo3 python libraries have had issues with using tokio 36 | // behind a forking app-server like `gunicorn` 37 | // If we run into that problem, in the future we can look to `delta-rs` 38 | // which adds a check in that disallows calls from a forked process 39 | // https://github.com/delta-io/delta-rs/blob/87010461cfe01563d91a4b9cd6fa468e2ad5f283/python/src/utils.rs#L10-L31 40 | static RUNTIME: OnceLock = OnceLock::new(); 41 | RUNTIME.get_or_init(|| TokioRuntime(tokio::runtime::Runtime::new().unwrap())) 42 | } 43 | 44 | /// Utility to 
collect rust futures with GIL released 45 | pub(crate) fn wait_for_future(py: Python, f: F) -> F::Output 46 | where 47 | F: Future + Send, 48 | F::Output: Send, 49 | { 50 | let runtime: &Runtime = &get_tokio_runtime().0; 51 | py.allow_threads(|| runtime.block_on(f)) 52 | } 53 | 54 | pub(crate) fn spawn_feature(py: Python, f: F) -> JoinHandle 55 | where 56 | F: Future + Send + 'static, 57 | F::Output: Send, 58 | { 59 | let runtime: &Runtime = &get_tokio_runtime().0; 60 | // do we need py.allow_threads ? 61 | py.allow_threads(|| runtime.spawn(f)) 62 | } 63 | -------------------------------------------------------------------------------- /python/testdata/test.csv: -------------------------------------------------------------------------------- 1 | a,b 2 | 1,2 -------------------------------------------------------------------------------- /python/testdata/test.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/datafusion-ballista/97c919274d9de496b630e66c12ad29c3fccd110b/python/testdata/test.parquet -------------------------------------------------------------------------------- /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | [toolchain] 19 | channel = "stable" 20 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | edition = "2021" 19 | max_width = 90 20 | 21 | --------------------------------------------------------------------------------