├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ ├── documentation_improvement.md │ └── feature_request.md ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── binaries.yml │ ├── ci.yml │ ├── docker.yml │ ├── docs.yml │ └── release.yml ├── .gitignore ├── ARCHITECTURE.md ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── aqueducts-cli ├── Cargo.toml ├── README.md └── src │ ├── local_exec.rs │ ├── main.rs │ ├── remote_exec.rs │ └── websocket_client.rs ├── aqueducts-executor ├── Cargo.toml ├── README.md └── src │ ├── api │ ├── auth.rs │ └── mod.rs │ ├── config.rs │ ├── error.rs │ ├── executor │ ├── manager.rs │ ├── mod.rs │ ├── progress_tracker.rs │ └── queue.rs │ └── main.rs ├── aqueducts ├── core │ ├── Cargo.toml │ ├── src │ │ ├── custom_udfs.rs │ │ ├── destinations │ │ │ ├── file.rs │ │ │ └── mod.rs │ │ ├── error.rs │ │ ├── lib.rs │ │ ├── progress_tracker.rs │ │ ├── schema_transform.rs │ │ ├── sources │ │ │ └── mod.rs │ │ ├── stages │ │ │ └── mod.rs │ │ ├── store │ │ │ ├── azure.rs │ │ │ ├── gcs.rs │ │ │ ├── mod.rs │ │ │ └── s3.rs │ │ └── templating.rs │ └── tests │ │ ├── common │ │ └── mod.rs │ │ └── integration.rs ├── delta │ ├── Cargo.toml │ ├── src │ │ ├── error.rs │ │ ├── handlers.rs │ │ └── lib.rs │ └── tests │ │ ├── data │ │ ├── aqueduct_pipeline_delta_append.yml │ │ ├── aqueduct_pipeline_delta_replace.yml │ │ ├── aqueduct_pipeline_delta_upsert.yml │ │ ├── example_1.csv │ │ └── example_2.csv │ │ └── integration.rs ├── meta │ ├── Cargo.toml │ └── src │ │ └── lib.rs ├── odbc │ ├── Cargo.toml │ └── src │ │ ├── error.rs │ │ └── lib.rs └── schemas │ ├── Cargo.toml │ ├── src │ ├── data_types.rs │ ├── destinations.rs │ ├── generate_schema.rs │ ├── lib.rs │ ├── location.rs │ ├── progress.rs │ ├── protocol.rs │ ├── serde_helpers.rs │ ├── sources.rs │ └── stages.rs │ └── tests │ └── integration.rs ├── cliff.toml ├── db └── init.sql ├── dist-workspace.toml ├── docker-compose.yml ├── docker ├── .dockerignore ├── Dockerfile ├── odbc.ini └── odbcinst.ini ├── docs ├── about.md ├── architecture.md ├── assets │ ├── favicon.ico │ └── logo.png ├── cli.md ├── index.md ├── schema_reference.md ├── storage.md ├── stylesheets │ └── extra.css └── usage.md ├── examples ├── aqueduct_pipeline_example.json ├── aqueduct_pipeline_example.toml ├── aqueduct_pipeline_example.yml ├── aqueduct_pipeline_odbc.yml ├── aqueduct_pipeline_simple.yml ├── location_dict.csv ├── temp_readings_feb_2024.csv └── temp_readings_jan_2024.csv ├── json_schema ├── aqueducts.schema.json ├── generate_schema_reference.py ├── schema_reference.md └── schema_reference_template │ ├── base.md │ ├── breadcrumbs.md │ ├── content.md │ ├── section_array.md │ ├── section_conditional_subschema.md │ ├── section_description.md │ ├── section_examples.md │ ├── section_not.md │ ├── section_one_of.md │ ├── section_properties_details.md │ ├── section_undocumented_required_properties.md │ └── tabbed_section.md ├── logo.png ├── mkdocs.yml └── release.toml /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve Aqueducts 4 | title: '[BUG] ' 5 | labels: bug 6 | assignees: '' 7 | --- 8 | 9 | ## Bug Description 10 | A clear and concise description of what the bug is. 11 | 12 | ## Reproduction Steps 13 | Steps to reproduce the behavior: 14 | 1. Install '...' 15 | 2. Configure '...' 16 | 3. Run command '...' 17 | 4. 
See error 18 | 19 | ## Expected Behavior 20 | A clear and concise description of what you expected to happen. 21 | 22 | ## Actual Behavior 23 | What actually happened, including error messages, logs, or screenshots if applicable. 24 | 25 | ## Additional Context 26 | Add any other context about the problem here. For example: 27 | - Does the issue happen consistently or intermittently? 28 | - Did this work in a previous version? 29 | - Are you using any special environment variables? 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: Questions and Help 4 | url: https://github.com/vigimite/aqueducts/?tab=readme-ov-file#community 5 | about: Please ask and answer questions in the Aqueducts discord server 6 | - name: Documentation 7 | url: https://vigimite.github.io/aqueducts 8 | about: Check our documentation for information about usage and configuration 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation_improvement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation improvement 3 | about: Suggest improvements to Aqueducts documentation 4 | title: '[DOCS] ' 5 | labels: documentation 6 | assignees: '' 7 | --- 8 | 9 | ## Documentation Location 10 | Which document needs improvement? Provide links if possible. 11 | 12 | ## Problem Description 13 | What's wrong, confusing, or missing in the current documentation? 14 | 15 | ## Suggested Improvement 16 | Describe your proposed changes or additions. 17 | 18 | ## Additional Context 19 | Add any other context or screenshots about the documentation improvement here. 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for Aqueducts 4 | title: '[FEATURE] ' 5 | labels: enhancement 6 | assignees: '' 7 | --- 8 | 9 | ## Problem Statement 10 | A clear and concise description of what problem this feature would solve. For example: "I'm always frustrated when [...]" 11 | 12 | ## Proposed Solution 13 | A clear and concise description of what you want to happen or how the feature should work. 14 | 15 | ## Alternative Solutions 16 | A clear and concise description of any alternative solutions or features you've considered. 17 | 18 | ## Example Use Case 19 | Describe a concrete example of how this feature would be used. 20 | 21 | ## Additional Context 22 | Add any other context, diagrams, or screenshots about the feature request here. 
23 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Dependabot config 2 | 3 | version: 2 4 | updates: 5 | - package-ecosystem: cargo 6 | directory: "/" 7 | schedule: 8 | interval: weekly 9 | open-pull-requests-limit: 10 10 | target-branch: main 11 | labels: [auto-dependencies] 12 | - package-ecosystem: cargo 13 | directory: "aqueducts-cli/" 14 | schedule: 15 | interval: weekly 16 | open-pull-requests-limit: 10 17 | target-branch: main 18 | labels: [auto-dependencies] 19 | - package-ecosystem: cargo 20 | directory: "aqueducts-executor/" 21 | schedule: 22 | interval: weekly 23 | open-pull-requests-limit: 10 24 | target-branch: main 25 | labels: [auto-dependencies] 26 | - package-ecosystem: "github-actions" 27 | directory: "/" 28 | schedule: 29 | interval: "weekly" 30 | open-pull-requests-limit: 10 31 | labels: [auto-dependencies] 32 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | 4 | ## Type of change 5 | 6 | - [ ] Bug fix (non-breaking change which fixes an issue) 7 | - [ ] New feature (non-breaking change which adds functionality) 8 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 9 | - [ ] Performance improvement 10 | - [ ] Code refactoring (no functional changes) 11 | - [ ] Build/CI improvement 12 | - [ ] Documentation update 13 | - [ ] Other (please describe): 14 | 15 | ## How has this been tested? 16 | 17 | - [ ] Unit tests added/updated 18 | - [ ] Integration tests added/updated 19 | - [ ] Manual testing performed 20 | 21 | ## Issue(s) addressed 22 | 23 | Closes # 24 | 25 | ## Additional context 26 | 27 | -------------------------------------------------------------------------------- /.github/workflows/binaries.yml: -------------------------------------------------------------------------------- 1 | name: Test Binary Builds 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | paths: 7 | - '.github/workflows/release.yml' 8 | - 'aqueducts-cli/**' 9 | 10 | jobs: 11 | test-build: 12 | name: Test Build ${{ matrix.target }} 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | include: 18 | - os: ubuntu-latest 19 | target: x86_64-unknown-linux-gnu 20 | binary-name: aqueducts 21 | - os: macos-latest 22 | target: aarch64-apple-darwin 23 | binary-name: aqueducts 24 | 25 | steps: 26 | - name: Checkout repository 27 | uses: actions/checkout@v4 28 | 29 | - name: Set up Rust 30 | uses: dtolnay/rust-toolchain@stable 31 | with: 32 | targets: ${{ matrix.target }} 33 | 34 | - name: Cache Rust dependencies 35 | uses: Swatinem/rust-cache@v2 36 | 37 | - name: Setup build environment (Linux) 38 | if: matrix.os == 'ubuntu-latest' 39 | run: sudo apt-get update && sudo apt-get install -y unixodbc-dev 40 | 41 | - name: Setup build environment (macOS) 42 | if: matrix.os == 'macos-latest' 43 | run: brew install unixodbc 44 | 45 | - name: Test build 46 | run: cargo build --release --target ${{ matrix.target }} -p aqueducts-cli 47 | 48 | - name: Verify binary 49 | run: | 50 | ls -la target/${{ matrix.target }}/release/ 51 | file target/${{ matrix.target }}/release/${{ matrix.binary-name }} -------------------------------------------------------------------------------- /.github/workflows/ci.yml: 
-------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | RUST_BACKTRACE: 1 12 | 13 | jobs: 14 | # Check code formatting 15 | format: 16 | name: Format 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@v4 21 | 22 | - name: Set up Rust 23 | uses: dtolnay/rust-toolchain@stable 24 | with: 25 | components: rustfmt 26 | 27 | - name: Check formatting 28 | run: cargo fmt --all --check 29 | 30 | # Run lints 31 | clippy: 32 | name: Clippy 33 | runs-on: ubuntu-latest 34 | steps: 35 | - name: Checkout repository 36 | uses: actions/checkout@v4 37 | 38 | - name: Set up Rust 39 | uses: dtolnay/rust-toolchain@stable 40 | with: 41 | components: clippy 42 | 43 | - name: Cache Rust dependencies 44 | uses: Swatinem/rust-cache@v2 45 | 46 | - name: Setup build environment 47 | run: sudo apt-get update && sudo apt-get install -y unixodbc-dev 48 | 49 | - name: Run clippy 50 | run: cargo clippy --workspace --all-targets --all-features -- -D warnings 51 | 52 | # Check that documentation builds 53 | docs: 54 | name: Documentation 55 | runs-on: ubuntu-latest 56 | steps: 57 | - name: Checkout repository 58 | uses: actions/checkout@v4 59 | 60 | - name: Set up Rust 61 | uses: dtolnay/rust-toolchain@stable 62 | 63 | - name: Cache Rust dependencies 64 | uses: Swatinem/rust-cache@v2 65 | 66 | - name: Setup build environment 67 | run: sudo apt-get update && sudo apt-get install -y unixodbc-dev 68 | 69 | - name: Check documentation 70 | run: cargo doc --workspace --all-features --no-deps --document-private-items 71 | env: 72 | RUSTDOCFLAGS: "-D warnings" 73 | 74 | # Security audit 75 | security: 76 | name: Security Audit 77 | runs-on: ubuntu-latest 78 | steps: 79 | - name: Checkout repository 80 | uses: actions/checkout@v4 81 | 82 | - name: Set up Rust 83 | uses: dtolnay/rust-toolchain@stable 84 | 85 | - name: Install cargo-audit 86 | run: cargo install cargo-audit --locked 87 | 88 | - name: Run security audit 89 | run: cargo audit 90 | 91 | # Run tests 92 | test: 93 | name: Test 94 | runs-on: ${{ matrix.os }} 95 | strategy: 96 | fail-fast: false 97 | matrix: 98 | os: [ubuntu-latest] 99 | rust: [stable] 100 | 101 | services: 102 | postgres: 103 | image: postgres:15 104 | env: 105 | POSTGRES_USER: postgres 106 | POSTGRES_PASSWORD: postgres 107 | ports: 108 | - 5432:5432 109 | options: >- 110 | --health-cmd pg_isready 111 | --health-interval 10s 112 | --health-timeout 5s 113 | --health-retries 5 114 | 115 | steps: 116 | - name: Checkout repository 117 | uses: actions/checkout@v4 118 | 119 | - name: Set up Rust 120 | uses: dtolnay/rust-toolchain@master 121 | with: 122 | toolchain: ${{ matrix.rust }} 123 | 124 | - name: Cache Rust dependencies 125 | uses: Swatinem/rust-cache@v2 126 | with: 127 | key: ${{ matrix.os }}-${{ matrix.rust }} 128 | 129 | - name: Setup build environment (Linux) 130 | if: matrix.os == 'ubuntu-latest' 131 | run: | 132 | sudo apt-get update 133 | sudo apt-get install -y postgresql-client unixodbc-dev odbc-postgresql 134 | 135 | - name: Setup test database (Linux) 136 | if: matrix.os == 'ubuntu-latest' 137 | run: | 138 | export CONTAINER_ID=$(docker ps --filter "name=postgres" --format "{{.ID}}") 139 | docker cp ./examples/temp_readings_jan_2024.csv $CONTAINER_ID:/opt/ 140 | docker cp ./examples/temp_readings_feb_2024.csv $CONTAINER_ID:/opt/ 141 | PGPASSWORD=postgres psql -h localhost -U 
postgres -d postgres -f ./db/init.sql 142 | 143 | - name: Run tests 144 | run: cargo test --workspace --all-features 145 | 146 | # Build check for different feature combinations 147 | features: 148 | name: Feature Combinations 149 | runs-on: ubuntu-latest 150 | steps: 151 | - name: Checkout repository 152 | uses: actions/checkout@v4 153 | 154 | - name: Set up Rust 155 | uses: dtolnay/rust-toolchain@stable 156 | 157 | - name: Cache Rust dependencies 158 | uses: Swatinem/rust-cache@v2 159 | 160 | - name: Setup build environment 161 | run: sudo apt-get update && sudo apt-get install -y unixodbc-dev 162 | 163 | - name: Check no default features 164 | run: cargo check --workspace --no-default-features 165 | 166 | - name: Check minimal features 167 | run: cargo check -p aqueducts-cli --no-default-features --features yaml 168 | 169 | - name: Check ODBC features 170 | run: cargo check --workspace --features odbc 171 | 172 | - name: Check all features 173 | run: cargo check --workspace --all-features 174 | -------------------------------------------------------------------------------- /.github/workflows/docker.yml: -------------------------------------------------------------------------------- 1 | name: Docker 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | tags: ['v*.*.*'] 7 | pull_request: 8 | branches: [main] 9 | 10 | env: 11 | REGISTRY: ghcr.io 12 | IMAGE_NAME: ${{ github.repository }}/aqueducts-executor 13 | 14 | jobs: 15 | build-and-push: 16 | runs-on: ubuntu-latest 17 | permissions: 18 | contents: read 19 | packages: write 20 | 21 | steps: 22 | - name: Checkout repository 23 | uses: actions/checkout@v4 24 | 25 | - name: Set up Docker Buildx 26 | uses: docker/setup-buildx-action@v3 27 | 28 | - name: Log in to Container Registry 29 | if: github.event_name != 'pull_request' 30 | uses: docker/login-action@v3 31 | with: 32 | registry: ${{ env.REGISTRY }} 33 | username: ${{ github.actor }} 34 | password: ${{ secrets.GITHUB_TOKEN }} 35 | 36 | - name: Extract metadata 37 | id: meta 38 | uses: docker/metadata-action@v5 39 | with: 40 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 41 | tags: | 42 | # For pushes to main branch 43 | type=ref,event=branch 44 | # For pull requests 45 | type=ref,event=pr 46 | # For tag pushes 47 | type=semver,pattern={{version}} 48 | type=semver,pattern={{major}}.{{minor}} 49 | type=semver,pattern={{major}} 50 | # Latest tag for main branch 51 | type=raw,value=latest,enable={{is_default_branch}} 52 | 53 | - name: Build and push Docker image 54 | uses: docker/build-push-action@v6 55 | with: 56 | context: . 
57 | file: docker/Dockerfile 58 | platforms: linux/amd64,linux/arm64 59 | push: ${{ github.event_name != 'pull_request' }} 60 | tags: ${{ steps.meta.outputs.tags }} 61 | labels: ${{ steps.meta.outputs.labels }} 62 | cache-from: type=gha 63 | cache-to: type=gha,mode=max 64 | 65 | - name: Generate image summary 66 | if: github.event_name != 'pull_request' 67 | run: | 68 | echo "## Docker Image Published 🐳" >> $GITHUB_STEP_SUMMARY 69 | echo "" >> $GITHUB_STEP_SUMMARY 70 | echo "**Registry:** ${{ env.REGISTRY }}" >> $GITHUB_STEP_SUMMARY 71 | echo "**Image:** ${{ env.IMAGE_NAME }}" >> $GITHUB_STEP_SUMMARY 72 | echo "" >> $GITHUB_STEP_SUMMARY 73 | echo "### Available Tags:" >> $GITHUB_STEP_SUMMARY 74 | echo '```' >> $GITHUB_STEP_SUMMARY 75 | echo "${{ steps.meta.outputs.tags }}" >> $GITHUB_STEP_SUMMARY 76 | echo '```' >> $GITHUB_STEP_SUMMARY 77 | echo "" >> $GITHUB_STEP_SUMMARY 78 | echo "### Quick Start:" >> $GITHUB_STEP_SUMMARY 79 | echo '```bash' >> $GITHUB_STEP_SUMMARY 80 | echo "docker run -p 3031:3031 ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest" >> $GITHUB_STEP_SUMMARY 81 | echo '```' >> $GITHUB_STEP_SUMMARY -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Deploy MkDocs 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | permissions: 9 | contents: write 10 | 11 | jobs: 12 | deploy: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: Checkout repository 17 | uses: actions/checkout@v4 18 | 19 | - name: Set credentials 20 | run: | 21 | git config user.name github-actions[bot] 22 | git config user.email 41898282+github-actions[bot]@users.noreply.github.com 23 | 24 | - name: Set up Python 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: 3.x 28 | 29 | - name: Set cache id 30 | run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 31 | 32 | - name: Cache setup 33 | uses: actions/cache@v4 34 | with: 35 | key: mkdocs-material-${{ env.cache_id }} 36 | path: .cache 37 | restore-keys: | 38 | mkdocs-material- 39 | 40 | - name: Install dependencies 41 | run: pip install "mkdocs-material[imaging]" json-schema-for-humans 42 | 43 | - name: Deploy 44 | run: mkdocs gh-deploy --force 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .cargo/** 2 | /target 3 | /aqueducts-cli/target 4 | /examples/output* 5 | /aqueducts/core/tests/output/** 6 | /aqueducts/delta/tests/output/** 7 | .venv 8 | site/ 9 | .cache/ 10 | __pycache__ 11 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "aqueducts/meta", 4 | "aqueducts/core", 5 | "aqueducts/schemas", 6 | "aqueducts/odbc", 7 | "aqueducts/delta", 8 | "aqueducts-cli", 9 | "aqueducts-executor" 10 | ] 11 | resolver = "2" 12 | 13 | [workspace.package] 14 | authors = [""] 15 | edition = "2021" 16 | description = "Framework to build ETL data pipelines declaratively" 17 | homepage = "https://github.com/vigimite/aqueducts" 18 | repository = "https://github.com/vigimite/aqueducts" 19 | readme = "README.md" 20 | version = "0.10.1" 21 | keywords = ["aqueducts", "ETL", "data", "pipeline"] 22 | categories = ["api-bindings"] 23 | license = "Apache-2.0" 24 | 25 | [workspace.metadata.badges] 26 | github = { 
repository = "vigimite/aqueducts", workflow = "build.yml" } 27 | 28 | [workspace.dependencies] 29 | # Internal crates 30 | aqueducts = { path = "aqueducts/meta", version = "0.10.1" } 31 | aqueducts-schemas = { path = "aqueducts/schemas", version = "0.10.1" } 32 | aqueducts-core = { path = "aqueducts/core", version = "0.10.1" } 33 | aqueducts-delta = { path = "aqueducts/delta", version = "0.10.1" } 34 | aqueducts-odbc = { path = "aqueducts/odbc", version = "0.10.1" } 35 | 36 | # Data processing libraries 37 | datafusion = "47" 38 | datafusion-functions-json = "0.47" 39 | deltalake = { version = "0.26.2", features = ["datafusion"] } 40 | arrow-odbc = "16.0.2" 41 | 42 | # Serialization/deserialization 43 | serde = { version = "1", features = ["derive"] } 44 | serde_json = "1" 45 | serde_yml = "0.0.10" 46 | toml = "0.8" 47 | schemars = { version = "0.8", features = ["chrono", "url", "preserve_order"] } 48 | 49 | # Async runtime and utilities 50 | tokio = { version = "1", features = ["rt"] } 51 | tokio-util = "0.7" 52 | futures = "0.3" 53 | futures-util = "0.3" 54 | 55 | # Network and API 56 | axum = { version = "0.8.4", features = ["macros", "ws"] } 57 | tower = { version = "0.5.2", features = ["util"] } 58 | http-body-util = "0.1.3" 59 | tokio-tungstenite = "0.26.2" 60 | openssl = { version = "0.10", features = ["vendored"] } 61 | 62 | # Logging and tracing 63 | tracing = "0.1" 64 | tracing-subscriber = { version = "0.3.19", features = ["env-filter"] } 65 | tracing-test = "0.2" 66 | 67 | # Error handling 68 | thiserror = "2" 69 | anyhow = "1.0.98" 70 | 71 | # CLI utilities 72 | clap = { version = "4.5.38", features = ["derive", "env"] } 73 | 74 | # General utilities 75 | bon = "3.6.3" 76 | regex = "1" 77 | url = { version = "2", features = ["serde"] } 78 | chrono = { version = "0.4", features = ["serde"] } 79 | uuid = { version = "1.17.0", features = ["v4", "serde"] } 80 | rand = "0.8" 81 | 82 | [profile.dist] 83 | inherits = "release" 84 | lto = "thin" 85 | -------------------------------------------------------------------------------- /aqueducts-cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "aqueducts-cli" 3 | authors = [""] 4 | edition = "2021" 5 | description = "CLI application to run pipelines defined for the aqueducts framework" 6 | homepage = "https://github.com/vigimite/aqueducts" 7 | repository = "https://github.com/vigimite/aqueducts" 8 | readme = "README.md" 9 | version = "0.10.1" 10 | keywords = ["aqueducts", "ETL", "data", "pipeline", "cli"] 11 | categories = ["command-line-utilities"] 12 | license = "Apache-2.0" 13 | 14 | [features] 15 | default = ["s3", "gcs", "azure", "yaml", "json", "delta"] 16 | s3 = ["aqueducts/s3"] 17 | gcs = ["aqueducts/gcs"] 18 | azure = ["aqueducts/azure"] 19 | delta = ["aqueducts/delta"] 20 | odbc = ["aqueducts/odbc"] 21 | json = ["aqueducts/json"] 22 | toml = ["aqueducts/toml"] 23 | yaml = ["aqueducts/yaml"] 24 | 25 | [dependencies] 26 | datafusion.workspace = true 27 | 28 | aqueducts = { workspace = true, features = ["protocol", "custom_udfs"] } 29 | 30 | clap.workspace = true 31 | 32 | tracing.workspace = true 33 | tracing-subscriber.workspace = true 34 | 35 | tokio = { workspace = true, features = ["full"] } 36 | futures-util.workspace = true 37 | 38 | anyhow.workspace = true 39 | uuid.workspace = true 40 | url.workspace = true 41 | 42 | tokio-tungstenite = { workspace = true, features = ["native-tls"] } 43 | 44 | serde.workspace = true 45 | serde_json.workspace = true 46 
| 47 | openssl.workspace = true 48 | 49 | [[bin]] 50 | name = "aqueducts" 51 | path = "src/main.rs" 52 | -------------------------------------------------------------------------------- /aqueducts-cli/README.md: -------------------------------------------------------------------------------- 1 | # Aqueducts CLI 2 | 3 | A command-line interface for executing Aqueducts data pipelines, with support for both local and remote execution. 4 | 5 | ## Features 6 | 7 | - Run pipelines defined in YAML, JSON, or TOML formats 8 | - Execute pipelines locally or remotely via the Aqueducts Executor 9 | - Check status of remote executors 10 | - Cancel running pipelines on remote executors 11 | - Real-time progress tracking and event streaming 12 | - Cloud storage support (S3, GCS, Azure) via feature flags 13 | - ODBC database connectivity via feature flags 14 | 15 | ## Installation 16 | 17 | ### Recommended Installation Methods 18 | 19 | **Homebrew (macOS and Linux):** 20 | ```bash 21 | # Add the tap and install 22 | brew tap vigimite/aqueducts 23 | brew install aqueducts-cli 24 | ``` 25 | 26 | **Shell Installer (Cross-platform):** 27 | ```bash 28 | # One-line installer for Linux, macOS, and Windows 29 | curl --proto '=https' --tlsv1.2 -LsSf https://github.com/vigimite/aqueducts/releases/latest/download/aqueducts-installer.sh | sh 30 | ``` 31 | 32 | **Direct Download:** 33 | Download pre-built binaries for your platform from the [latest release](https://github.com/vigimite/aqueducts/releases/latest): 34 | - Linux x86_64 35 | - macOS Apple Silicon (ARM64) 36 | - macOS Intel (x86_64) 37 | 38 | ### Build from Source 39 | 40 | ```bash 41 | # Install with default features (s3, gcs, azure, yaml) 42 | cargo install aqueducts-cli --locked 43 | 44 | # Install with odbc support (requires unixodbc-dev) 45 | cargo install aqueducts-cli --locked --features odbc 46 | 47 | # Install with minimal features 48 | cargo install aqueducts-cli --locked --no-default-features --features yaml 49 | ``` 50 | 51 | ## Usage 52 | 53 | ### Running Pipelines 54 | 55 | Run a pipeline locally: 56 | 57 | ```bash 58 | # Basic usage (YAML) 59 | aqueducts run --file ./pipeline.yml 60 | 61 | # With parameters 62 | aqueducts run --file ./pipeline.yml --params key1=value1 --params key2=value2 63 | 64 | # Using TOML or JSON (with appropriate feature flags) 65 | aqueducts run --file ./pipeline.toml 66 | aqueducts run --file ./pipeline.json 67 | ``` 68 | 69 | Run a pipeline on a remote executor: 70 | 71 | ```bash 72 | # Execute on remote executor 73 | aqueducts run --file ./pipeline.yml --executor executor-host:3031 --api-key your_api_key 74 | ``` 75 | 76 | Cancel a running pipeline on a remote executor: 77 | 78 | ```bash 79 | # Cancel a specific execution by ID 80 | aqueducts cancel --executor executor-host:3031 --api-key your_api_key --execution-id abc-123 81 | ``` 82 | 83 | ## Pipeline Definition Examples 84 | 85 | YAML pipeline example: 86 | 87 | ```yaml 88 | sources: 89 | - type: File 90 | name: temp_readings 91 | file_type: 92 | type: Csv 93 | options: {} 94 | location: ./examples/temp_readings_${month}_${year}.csv 95 | 96 | stages: 97 | - - name: transformed_data 98 | query: "SELECT * FROM source_data WHERE value > 10" 99 | 100 | destination: 101 | type: File 102 | name: results 103 | file_type: 104 | type: Parquet 105 | options: {} 106 | location: ./examples/output_${month}_${year}.parquet 107 | ``` 108 | 109 | ## Troubleshooting 110 | 111 | Common issues: 112 | 113 | 1. **Authentication failures**: Verify API key is correct 114 | 2. 
**Connectivity issues**: Check network connectivity and firewall rules 115 | 3. **Pipeline validation errors**: Ensure your pipeline definition is valid 116 | 4. **Executor busy**: Only one pipeline can run at a time on an executor 117 | 5. **Missing features**: Make sure the CLI was compiled with the needed features 118 | 119 | For more information on architecture and advanced usage, see the [Aqueducts Architecture Documentation](https://github.com/vigimite/aqueducts/blob/main/ARCHITECTURE.md). 120 | -------------------------------------------------------------------------------- /aqueducts-cli/src/local_exec.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::HashMap, path::PathBuf, sync::Arc}; 2 | 3 | use anyhow::Context; 4 | use aqueducts::prelude::*; 5 | use tracing::{debug, info}; 6 | 7 | pub async fn run_local(file: PathBuf, params: HashMap) -> anyhow::Result<()> { 8 | info!("Running pipeline locally from file: {}", file.display()); 9 | 10 | let aqueduct = Aqueduct::from_file(&file, params)?; 11 | 12 | debug!("Creating SessionContext"); 13 | let mut ctx = datafusion::prelude::SessionContext::new(); 14 | 15 | aqueducts::custom_udfs::register_all(&mut ctx)?; 16 | 17 | let progress_tracker = Arc::new(LoggingProgressTracker); 18 | 19 | debug!("Starting pipeline execution"); 20 | run_pipeline(Arc::new(ctx), aqueduct, Some(progress_tracker)) 21 | .await 22 | .context("Failure during execution of aqueducts file")?; 23 | 24 | debug!("Pipeline execution completed successfully"); 25 | Ok(()) 26 | } 27 | -------------------------------------------------------------------------------- /aqueducts-cli/src/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::anyhow; 2 | use clap::{Parser, Subcommand}; 3 | use std::{collections::HashMap, error::Error, path::PathBuf}; 4 | use tracing::info; 5 | use tracing_subscriber::{filter, layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Layer}; 6 | use uuid::Uuid; 7 | 8 | mod local_exec; 9 | mod remote_exec; 10 | mod websocket_client; 11 | 12 | /// Aqueducts CLI for executing data pipelines locally or remotely 13 | #[derive(Debug, Parser)] 14 | #[command(name = "aqueducts", version, about, long_about = None)] 15 | struct Args { 16 | #[command(subcommand)] 17 | command: Commands, 18 | } 19 | 20 | #[derive(Debug, Subcommand)] 21 | enum Commands { 22 | /// Run an Aqueduct pipeline locally or remotely 23 | Run { 24 | /// Path to Aqueduct configuration file 25 | #[arg(short, long)] 26 | file: PathBuf, 27 | 28 | /// k=v list of parameters to pass to the configuration file 29 | /// e.g. 
aqueduct run -f file.yml -p key1=value1 -p key2=value2 30 | #[arg(short, long, value_parser = parse_key_val::)] 31 | params: Option>, 32 | 33 | /// Execute the pipeline on a remote executor instead of locally 34 | /// example: 192.168.1.102:3031 35 | #[arg(long)] 36 | executor: Option, 37 | 38 | /// API key for the remote executor 39 | #[arg(long)] 40 | api_key: Option, 41 | }, 42 | /// Cancel a running pipeline on a remote executor 43 | Cancel { 44 | /// Execution ID to cancel 45 | #[arg(short, long)] 46 | execution_id: String, 47 | 48 | /// Remote executor URL 49 | /// example: 192.168.1.102:3031 50 | #[arg(long)] 51 | executor: String, 52 | 53 | /// API key for the remote executor 54 | #[arg(long)] 55 | api_key: String, 56 | }, 57 | } 58 | 59 | fn parse_key_val(s: &str) -> Result<(T, U), Box> 60 | where 61 | T: std::str::FromStr, 62 | T::Err: Error + Send + Sync + 'static, 63 | U: std::str::FromStr, 64 | U::Err: Error + Send + Sync + 'static, 65 | { 66 | let pos = s 67 | .find('=') 68 | .ok_or_else(|| format!("invalid KEY=value: no `=` found in `{s}`"))?; 69 | Ok((s[..pos].parse()?, s[pos + 1..].parse()?)) 70 | } 71 | 72 | #[tokio::main] 73 | async fn main() -> anyhow::Result<()> { 74 | tracing_subscriber::registry() 75 | .with( 76 | tracing_subscriber::fmt::layer() 77 | .with_ansi(true) 78 | .with_level(false) 79 | .with_target(false) 80 | .without_time() 81 | .with_filter(filter::filter_fn(|meta| !meta.is_span())), 82 | ) 83 | .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) 84 | .init(); 85 | 86 | let args = Args::parse(); 87 | 88 | match args.command { 89 | Commands::Run { 90 | file, 91 | params, 92 | executor: Some(executor_url), 93 | api_key, 94 | } => { 95 | let api_key = 96 | api_key.ok_or_else(|| anyhow!("API key is required for remote execution"))?; 97 | 98 | info!("Executing pipeline on remote executor: {}", executor_url); 99 | let params = HashMap::from_iter(params.unwrap_or_default()); 100 | remote_exec::run_remote(file, params, executor_url, api_key).await?; 101 | } 102 | Commands::Run { 103 | file, 104 | params, 105 | executor: _, 106 | api_key: _, 107 | } => { 108 | let params = HashMap::from_iter(params.unwrap_or_default()); 109 | local_exec::run_local(file, params).await?; 110 | } 111 | Commands::Cancel { 112 | execution_id, 113 | executor, 114 | api_key, 115 | } => { 116 | let execution_id = Uuid::parse_str(&execution_id) 117 | .map_err(|e| anyhow!("Invalid execution ID: {}. 
Must be a valid UUID.", e))?; 118 | 119 | info!( 120 | "Cancelling execution {} on executor: {}", 121 | execution_id, executor 122 | ); 123 | remote_exec::cancel_remote_execution(executor, api_key, execution_id).await?; 124 | } 125 | } 126 | 127 | Ok(()) 128 | } 129 | -------------------------------------------------------------------------------- /aqueducts-cli/src/websocket_client.rs: -------------------------------------------------------------------------------- 1 | use anyhow::anyhow; 2 | use aqueducts::prelude::*; 3 | use futures_util::{SinkExt, StreamExt}; 4 | use std::{str::FromStr, sync::Arc}; 5 | use tokio::sync::{mpsc, Mutex}; 6 | use tokio_tungstenite::{ 7 | connect_async, 8 | tungstenite::{client::IntoClientRequest, http::Uri, protocol::Message, ClientRequestBuilder}, 9 | }; 10 | use tracing::{debug, error, info}; 11 | use url::Url; 12 | use uuid::Uuid; 13 | 14 | /// The custom header for API key authentication 15 | const X_API_KEY_HEADER: &str = "X-API-Key"; 16 | 17 | /// Manages connection to an executor server 18 | pub struct WebSocketClient { 19 | executor_url: Url, 20 | api_key: String, 21 | sender: Arc>>>, 22 | } 23 | 24 | impl WebSocketClient { 25 | /// Create a new client 26 | pub fn try_new(executor_url: String, api_key: String) -> anyhow::Result { 27 | let executor_url = Url::parse(&format!("ws://{executor_url}/ws/connect"))?; 28 | Ok(Self { 29 | executor_url, 30 | api_key, 31 | sender: Arc::new(Mutex::new(None)), 32 | }) 33 | } 34 | 35 | /// Connect to the executor and set up message handling 36 | pub async fn connect(&self) -> anyhow::Result> { 37 | info!("Connecting to executor at: {}", self.executor_url); 38 | 39 | // Set up channels for message passing 40 | let (outgoing_tx, mut outgoing_rx) = mpsc::channel::(16); 41 | let (incoming_tx, incoming_rx) = mpsc::channel::(32); 42 | 43 | debug!("Connecting with API key authentication"); 44 | let request = ClientRequestBuilder::new(Uri::from_str(self.executor_url.as_str())?) 
45 | .with_header(X_API_KEY_HEADER, &self.api_key) 46 | .into_client_request()?; 47 | 48 | let (ws_stream, _) = connect_async(request).await?; 49 | debug!("WebSocket connection established"); 50 | 51 | let (mut ws_sender, mut ws_receiver) = ws_stream.split(); 52 | { 53 | let mut sender = self.sender.lock().await; 54 | *sender = Some(outgoing_tx); 55 | } 56 | 57 | // Handle outgoing messages 58 | tokio::spawn(async move { 59 | while let Some(message) = outgoing_rx.recv().await { 60 | match serde_json::to_string(&message) { 61 | Ok(json) => { 62 | debug!("Sending message: {}", json); 63 | if let Err(e) = ws_sender.send(Message::Text(json.into())).await { 64 | error!("Error sending message: {}", e); 65 | break; 66 | } 67 | } 68 | Err(e) => { 69 | error!("Failed to serialize message: {}", e); 70 | } 71 | } 72 | } 73 | debug!("Outgoing message handler finished"); 74 | }); 75 | 76 | // Handle incoming messages 77 | tokio::spawn(async move { 78 | while let Some(msg) = ws_receiver.next().await { 79 | match msg { 80 | Ok(Message::Text(text)) => { 81 | debug!("Received message: {}", text); 82 | match serde_json::from_str::(&text) { 83 | Ok(message) => { 84 | if let Err(e) = incoming_tx.send(message).await { 85 | error!("Failed to forward incoming message: {}", e); 86 | break; 87 | } 88 | } 89 | Err(e) => { 90 | error!("Failed to parse message: {}", e); 91 | } 92 | } 93 | } 94 | Ok(Message::Close(_)) => { 95 | info!("WebSocket connection closed by server"); 96 | break; 97 | } 98 | Err(e) => { 99 | error!("Error receiving message: {}", e); 100 | break; 101 | } 102 | _ => {} 103 | } 104 | } 105 | debug!("Incoming message handler finished"); 106 | }); 107 | 108 | // Return the receiver channel 109 | Ok(incoming_rx) 110 | } 111 | 112 | /// Submit a pipeline for execution 113 | pub async fn execute_pipeline(&self, pipeline: Aqueduct) -> anyhow::Result<()> { 114 | // Send execution request 115 | self.send_message(ClientMessage::ExecutionRequest { pipeline }) 116 | .await?; 117 | 118 | Ok(()) 119 | } 120 | 121 | /// Cancel an execution 122 | pub async fn cancel_execution(&self, execution_id: Uuid) -> anyhow::Result<()> { 123 | self.send_message(ClientMessage::CancelRequest { execution_id }) 124 | .await 125 | } 126 | 127 | /// Send a message to the executor 128 | async fn send_message(&self, message: ClientMessage) -> anyhow::Result<()> { 129 | let sender = self.sender.lock().await; 130 | match &*sender { 131 | Some(tx) => { 132 | tx.send(message).await?; 133 | Ok(()) 134 | } 135 | None => Err(anyhow!("Connection Closed")), 136 | } 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /aqueducts-executor/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "aqueducts-executor" 3 | authors = [""] 4 | edition = "2021" 5 | description = "Remote executor for the Aqueducts data pipeline framework" 6 | homepage = "https://github.com/vigimite/aqueducts" 7 | repository = "https://github.com/vigimite/aqueducts" 8 | readme = "README.md" 9 | version = "0.10.1" 10 | keywords = ["aqueducts", "ETL", "data", "pipeline"] 11 | categories = ["command-line-utilities"] 12 | license = "Apache-2.0" 13 | 14 | [features] 15 | default = [] 16 | odbc = ["aqueducts/odbc"] 17 | 18 | [dependencies] 19 | aqueducts = { workspace = true, features = ["protocol", "json", "s3", "gcs", "azure", "delta", "custom_udfs"] } 20 | 21 | axum.workspace = true 22 | clap.workspace = true 23 | 24 | datafusion.workspace = true 25 | 26 | 
serde.workspace = true 27 | serde_json.workspace = true 28 | 29 | tokio = { workspace = true, features = ["full"] } 30 | tokio-util.workspace = true 31 | futures.workspace = true 32 | futures-util.workspace = true 33 | 34 | thiserror.workspace = true 35 | 36 | tracing.workspace = true 37 | tracing-subscriber = { workspace = true, features = ["json"] } 38 | 39 | uuid.workspace = true 40 | itertools = "0.14.0" 41 | tower-http = { version = "0.6.4", features = ["trace"] } 42 | 43 | openssl.workspace = true 44 | 45 | [dev-dependencies] 46 | futures.workspace = true 47 | tower.workspace = true 48 | http-body-util.workspace = true 49 | -------------------------------------------------------------------------------- /aqueducts-executor/README.md: -------------------------------------------------------------------------------- 1 | # Aqueducts Executor 2 | 3 | A deployable application used to execute Aqueduct pipeline definitions within your infrastructure. The main use-case is to execute heavy queries within the infrastructure where the data is hosted, minimizing network load and removing the requirement for the client to have direct access to the data store. 4 | 5 | ## Features 6 | 7 | - **Remote Execution**: Run data pipelines securely within your own infrastructure close to the data sources 8 | - **Memory Management**: Configure maximum memory usage to control resource allocation using DataFusion's memory pool 9 | - **Real-time Feedback**: WebSockets provide bidirectional communication with live progress and log updates 10 | - **Cloud Storage Support**: Native integration with S3, GCS, and Azure Blob Storage 11 | - **Database Connectivity**: ODBC support for connecting to various database systems 12 | - **Scalability**: Deploy multiple executors across different regions as needed 13 | - **Exclusive Execution**: Guaranteed single-pipeline execution to optimize resource utilization 14 | 15 | ## Installation 16 | 17 | ### Docker (Recommended) 18 | 19 | The easiest way to run the executor is using Docker. The Docker image includes **ODBC support with PostgreSQL drivers pre-installed**, making it ready for database connectivity out of the box. 
20 | 21 | ```bash 22 | # Pull from GitHub Container Registry 23 | docker pull ghcr.io/vigimite/aqueducts/aqueducts-executor:latest 24 | 25 | # Run with command line arguments 26 | docker run -d \ 27 | --name aqueducts-executor \ 28 | -p 3031:3031 \ 29 | ghcr.io/vigimite/aqueducts/aqueducts-executor:latest \ 30 | --api-key your_secret_key --max-memory 4 31 | 32 | # Or run with environment variables 33 | docker run -d \ 34 | --name aqueducts-executor \ 35 | -p 3031:3031 \ 36 | -e AQUEDUCTS_API_KEY=your_secret_key \ 37 | -e AQUEDUCTS_HOST=0.0.0.0 \ 38 | -e AQUEDUCTS_PORT=3031 \ 39 | -e AQUEDUCTS_MAX_MEMORY=4 \ 40 | -e AQUEDUCTS_LOG_LEVEL=info \ 41 | ghcr.io/vigimite/aqueducts/aqueducts-executor:latest 42 | ``` 43 | 44 | ### Docker Compose 45 | 46 | For local development, use the provided docker-compose setup: 47 | 48 | ```bash 49 | # Start just the database (default) 50 | docker-compose up 51 | 52 | # Start database + executor 53 | docker-compose --profile executor up 54 | 55 | # Build and start from source 56 | docker-compose --profile executor up --build 57 | ``` 58 | 59 | The executor will be available at `http://localhost:3031` with: 60 | - API key: `test_secret_key` (configurable) 61 | - Health check: `http://localhost:3031/api/health` 62 | - WebSocket: `ws://localhost:3031/ws/connect` 63 | 64 | ### Manual Installation 65 | 66 | Install the application using cargo: 67 | 68 | ```bash 69 | # Standard installation with all cloud storage features 70 | cargo install aqueducts-executor 71 | 72 | # Installation with ODBC support 73 | cargo install aqueducts-executor --features odbc 74 | ``` 75 | 76 | ## Configuration Options 77 | 78 | | Option | Description | Default | Environment Variable | 79 | |-----------------|-----------------------------------------------------|----------------|-------------------------| 80 | | `--api-key` | API key for authentication | - | `AQUEDUCTS_API_KEY` | 81 | | `--host` | Host address to bind to | 0.0.0.0 | `AQUEDUCTS_HOST` | 82 | | `--port` | Port to listen on | 8080 | `AQUEDUCTS_PORT` | 83 | | `--max-memory` | Maximum memory usage in GB (0 for unlimited) | 0 | `AQUEDUCTS_MAX_MEMORY` | 84 | | `--server-url` | URL of Aqueducts server for registration (optional) | - | `AQUEDUCTS_SERVER_URL` | 85 | | `--executor-id` | Unique identifier for this executor | auto-generated | `AQUEDUCTS_EXECUTOR_ID` | 86 | | `--log-level` | Logging level (info, debug, trace) | info | `AQUEDUCTS_LOG_LEVEL` | 87 | 88 | ## API Endpoints 89 | 90 | | Endpoint | Method | Auth | Description | 91 | |----------------|--------|------|----------------------------------------------------| 92 | | `/api/health` | GET | No | Basic health check | 93 | | `/ws/connect` | GET | Yes | WebSocket endpoint for bidirectional communication | 94 | 95 | ## ODBC Configuration Requirements 96 | 97 | ODBC support requires the UnixODBC library to be installed on your system, along with any database-specific drivers. 
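The executor discovers drivers and DSNs through the standard unixODBC configuration files (the repository's `docker/` directory ships its own `odbc.ini` and `odbcinst.ini` for the container image). The snippet below is only a minimal sketch of a PostgreSQL setup; the driver path, the DSN name `aqueducts_pg`, and the connection values are placeholder assumptions that vary by distribution and database, so adjust them to your environment.

```ini
# /etc/odbcinst.ini — registers the driver with unixODBC
# (Driver path shown is a typical Debian/Ubuntu location; check your distro)
[PostgreSQL Unicode]
Description = PostgreSQL ODBC driver
Driver      = /usr/lib/x86_64-linux-gnu/odbc/psqlodbcw.so

# /etc/odbc.ini — defines a DSN that pipelines can reference
[aqueducts_pg]
Driver      = PostgreSQL Unicode
Servername  = localhost
Port        = 5432
Database    = postgres
```

You can verify the DSN with `isql -v aqueducts_pg YOUR_USERNAME YOUR_PASSWORD` before pointing a pipeline at it (see the troubleshooting section below).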
98 | 99 | ### Ubuntu/Debian 100 | ```bash 101 | # Install UnixODBC development libraries 102 | sudo apt-get update 103 | sudo apt-get install unixodbc-dev 104 | 105 | # Add database-specific drivers (examples) 106 | # For PostgreSQL 107 | sudo apt-get install odbc-postgresql 108 | 109 | # For MySQL 110 | sudo apt-get install libmyodbc 111 | ``` 112 | 113 | ### Fedora/RHEL/CentOS 114 | ```bash 115 | # Install UnixODBC development libraries 116 | sudo dnf install unixODBC-devel 117 | 118 | # Add database-specific drivers (examples) 119 | # For PostgreSQL 120 | sudo dnf install postgresql-odbc 121 | 122 | # For MySQL 123 | sudo dnf install mysql-connector-odbc 124 | ``` 125 | 126 | ### macOS 127 | ```bash 128 | # Install UnixODBC via Homebrew 129 | brew install unixodbc 130 | 131 | # For database drivers, use Homebrew if available or download from the database vendor 132 | # PostgreSQL example 133 | brew install psqlodbc 134 | 135 | # MySQL example 136 | brew install mysql-connector-c++ 137 | ``` 138 | 139 | ## Example Usage 140 | 141 | ### Using the CLI 142 | 143 | ```bash 144 | # Connect to the executor 145 | aqueducts run --executor executor-host:3031 --api-key your_api_key --file pipeline.yml 146 | ``` 147 | 148 | ## Troubleshooting 149 | 150 | Common issues and solutions: 151 | 152 | 1. **Connection timeouts**: Check network connectivity and firewall rules 153 | 2. **Authentication failures**: Verify API key configuration and correct header usage (X-API-Key) 154 | 4. **Memory errors**: 155 | - Increase max memory allocation with the `--max-memory` parameter 156 | - Optimize your pipeline by adding filtering earlier in the process 157 | - Break large queries into smaller stages with intermediate results 158 | 5. **ODBC issues**: 159 | - Verify your DSN configuration in `odbc.ini` and `odbcinst.ini` 160 | - Run `isql -v YOUR_DSN YOUR_USERNAME YOUR_PASSWORD` to test connections 161 | - Check that database-specific drivers are installed correctly 162 | 163 | For more information on architecture and advanced usage, see the [Aqueducts Architecture Documentation](https://github.com/vigimite/aqueducts/blob/main/ARCHITECTURE.md). 
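As a quick smoke test after deployment, the two endpoints listed in the API table above can be probed directly. This is a sketch with placeholder host and API key values; exact response bodies may differ between versions.

```bash
# Health check (no authentication) — expect HTTP 200 with {"status":"OK"}
curl -i http://localhost:3031/api/health

# The WebSocket endpoint is protected: without the X-API-Key header the
# executor should respond with HTTP 401 and an {"error":"Authentication failed"} body
curl -i http://localhost:3031/ws/connect
```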
164 | -------------------------------------------------------------------------------- /aqueducts-executor/src/api/auth.rs: -------------------------------------------------------------------------------- 1 | use axum::{ 2 | extract::{Request, State}, 3 | middleware::Next, 4 | response::Response, 5 | }; 6 | use tracing::{debug, warn}; 7 | 8 | use crate::{error::ExecutorError, ApiContextRef}; 9 | 10 | /// The custom header for API key authentication 11 | const X_API_KEY_HEADER: &str = "X-API-Key"; 12 | 13 | /// Middleware function for API key authentication 14 | pub async fn require_api_key( 15 | State(context): State, 16 | req: Request, 17 | next: Next, 18 | ) -> Result { 19 | let api_key = req 20 | .headers() 21 | .get(X_API_KEY_HEADER) 22 | .and_then(|value| value.to_str().ok()); 23 | 24 | if let Some(provided) = api_key { 25 | if provided == context.config.api_key { 26 | debug!("API key authentication successful via X-API-Key header"); 27 | return Ok(next.run(req).await); 28 | } 29 | } 30 | 31 | warn!("Authentication failed: No valid API key provided"); 32 | Err(ExecutorError::AuthenticationFailed) 33 | } 34 | -------------------------------------------------------------------------------- /aqueducts-executor/src/api/mod.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use aqueducts::prelude::*; 4 | use axum::{ 5 | extract::{ 6 | ws::{Message, WebSocket}, 7 | State, WebSocketUpgrade, 8 | }, 9 | response::IntoResponse, 10 | routing::{any, get}, 11 | Json, Router, 12 | }; 13 | use futures::{SinkExt, StreamExt}; 14 | use serde::Serialize; 15 | use tokio::sync::Mutex; 16 | use tower_http::trace::{DefaultOnFailure, TraceLayer}; 17 | use tracing::{debug, error, info, instrument, Instrument, Level}; 18 | 19 | use crate::{ 20 | executor::{execute_pipeline, ExecutionManager}, 21 | ApiContextRef, 22 | }; 23 | 24 | mod auth; 25 | 26 | pub fn router(context: ApiContextRef) -> Router { 27 | let public_routes = Router::new().route("/api/health", get(health_check)); 28 | 29 | let protected_routes = Router::new().route("/ws/connect", any(ws_handler)).layer( 30 | axum::middleware::from_fn_with_state(context, auth::require_api_key), 31 | ); 32 | 33 | Router::new() 34 | .merge(public_routes) 35 | .merge(protected_routes) 36 | .layer(TraceLayer::new_for_http().on_failure(DefaultOnFailure::new().level(Level::ERROR))) 37 | } 38 | 39 | #[derive(Serialize)] 40 | struct HealthCheckResponse { 41 | status: String, 42 | } 43 | 44 | async fn health_check() -> Json { 45 | let response = HealthCheckResponse { 46 | status: "OK".to_string(), 47 | }; 48 | 49 | Json(response) 50 | } 51 | 52 | #[instrument(skip(ws, context), fields(executor_id = %context.config.executor_id))] 53 | async fn ws_handler( 54 | ws: WebSocketUpgrade, 55 | State(context): State, 56 | ) -> impl IntoResponse { 57 | info!("Opening WebSocket connection"); 58 | ws.on_upgrade(move |socket| { 59 | handle_socket( 60 | socket, 61 | context.manager.clone(), 62 | context.config.max_memory_gb, 63 | ) 64 | }) 65 | } 66 | 67 | #[instrument(skip(socket, manager), fields(max_memory_gb = ?max_memory_gb))] 68 | async fn handle_socket( 69 | socket: WebSocket, 70 | manager: Arc, 71 | max_memory_gb: Option, 72 | ) { 73 | let (sender, mut receiver) = socket.split(); 74 | let sender = Arc::new(Mutex::new(sender)); 75 | 76 | debug!("WebSocket connection established"); 77 | 78 | while let Some(Ok(msg)) = receiver.next().await { 79 | if let Message::Text(text) = msg { 80 | debug!(msg_len = text.len(), 
"Received message"); 81 | 82 | match serde_json::from_str::(&text) { 83 | Ok(ClientMessage::ExecutionRequest { pipeline }) => { 84 | info!( 85 | source_count = pipeline.sources.len(), 86 | stage_count = pipeline.stages.len(), 87 | "Received execution request" 88 | ); 89 | 90 | // Queue execution 91 | let (execution_id, mut queue_rx, mut progress_rx) = manager 92 | .submit(move |execution_id, client_tx| { 93 | Box::pin(async move { 94 | execute_pipeline(execution_id, client_tx, pipeline, max_memory_gb) 95 | .await 96 | }) 97 | }) 98 | .await; 99 | 100 | info!( 101 | execution_id = %execution_id, 102 | "Execution submitted to queue" 103 | ); 104 | 105 | // forward queue updates 106 | let send_q = sender.clone(); 107 | tokio::spawn( 108 | async move { 109 | debug!("Starting queue update forwarder"); 110 | while let Ok(update) = queue_rx.recv().await { 111 | if update.execution_id == execution_id { 112 | debug!(position = update.position, "Queue position update"); 113 | let msg = 114 | serde_json::to_string(&ExecutorMessage::QueuePosition { 115 | execution_id: update.execution_id, 116 | position: update.position, 117 | }) 118 | .unwrap(); 119 | if let Err(e) = 120 | send_q.lock().await.send(Message::text(msg)).await 121 | { 122 | error!("Failed to send queue update: {}", e); 123 | break; 124 | } 125 | } 126 | } 127 | debug!("Queue update forwarder finished"); 128 | } 129 | .instrument( 130 | tracing::info_span!("queue_forwarder", execution_id = %execution_id), 131 | ), 132 | ); 133 | 134 | // forward progress updates 135 | let send_p = sender.clone(); 136 | tokio::spawn( 137 | async move { 138 | debug!("Starting progress update forwarder"); 139 | while let Some(progress) = progress_rx.recv().await { 140 | match serde_json::to_string(&progress) { 141 | Ok(msg) => { 142 | if let Err(e) = 143 | send_p.lock().await.send(Message::text(msg)).await 144 | { 145 | error!("Failed to send progress update: {}", e); 146 | break; 147 | } 148 | } 149 | Err(e) => { 150 | error!("Failed to serialize progress update: {}", e); 151 | } 152 | } 153 | } 154 | debug!("Progress update forwarder finished"); 155 | } 156 | .instrument( 157 | tracing::info_span!("progress_forwarder", execution_id = %execution_id), 158 | ), 159 | ); 160 | } 161 | Ok(ClientMessage::CancelRequest { execution_id }) => { 162 | info!( 163 | execution_id = %execution_id, 164 | "Received cancellation request" 165 | ); 166 | manager.cancel(execution_id).await; 167 | } 168 | Err(e) => { 169 | error!( 170 | error = %e, 171 | "Failed to parse incoming message" 172 | ); 173 | } 174 | } 175 | } 176 | } 177 | 178 | info!("WebSocket connection closed"); 179 | } 180 | -------------------------------------------------------------------------------- /aqueducts-executor/src/config.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | use uuid::Uuid; 3 | 4 | /// Errors that can occur during configuration validation 5 | #[derive(Debug, Error)] 6 | pub enum ConfigError { 7 | #[error("API key cannot be empty")] 8 | EmptyApiKey, 9 | 10 | #[error("Max memory must be at least 1 GB")] 11 | InvalidMemoryLimit, 12 | } 13 | 14 | /// Configuration for the executor 15 | #[derive(Debug, Clone)] 16 | pub struct Config { 17 | pub api_key: String, 18 | pub executor_id: Uuid, 19 | pub max_memory_gb: Option, 20 | } 21 | 22 | impl Config { 23 | /// Create a new config with validation 24 | pub fn try_new( 25 | api_key: String, 26 | executor_id: Uuid, 27 | max_memory_gb: Option, 28 | ) -> Result { 29 | if 
api_key.trim().is_empty() { 30 | return Err(ConfigError::EmptyApiKey); 31 | } 32 | 33 | if let Some(mem) = max_memory_gb { 34 | if mem == 0 { 35 | return Err(ConfigError::InvalidMemoryLimit); 36 | } 37 | } 38 | 39 | Ok(Self { 40 | api_key, 41 | executor_id, 42 | max_memory_gb, 43 | }) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /aqueducts-executor/src/error.rs: -------------------------------------------------------------------------------- 1 | use axum::{ 2 | http::{header, StatusCode}, 3 | response::{IntoResponse, Response}, 4 | }; 5 | use serde::Serialize; 6 | use thiserror::Error; 7 | 8 | #[derive(Debug, Error)] 9 | pub enum ExecutorError { 10 | #[error("Authentication failed")] 11 | AuthenticationFailed, 12 | } 13 | 14 | #[derive(Serialize)] 15 | struct ErrorResponse { 16 | error: String, 17 | } 18 | 19 | impl IntoResponse for ExecutorError { 20 | fn into_response(self) -> Response { 21 | let (status, error_response) = match &self { 22 | ExecutorError::AuthenticationFailed => { 23 | let response = ErrorResponse { 24 | error: self.to_string(), 25 | }; 26 | (StatusCode::UNAUTHORIZED, response) 27 | } 28 | }; 29 | 30 | let body = serde_json::to_string(&error_response) 31 | .unwrap_or_else(|_| format!("{{\"error\": \"{}\"}}", self)); 32 | 33 | let mut response = Response::new(body.into()); 34 | *response.status_mut() = status; 35 | 36 | response.headers_mut().insert( 37 | header::CONTENT_TYPE, 38 | header::HeaderValue::from_static("application/json"), 39 | ); 40 | 41 | response 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /aqueducts-executor/src/executor/mod.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use aqueducts::prelude::*; 4 | use datafusion::{execution::runtime_env::RuntimeEnvBuilder, prelude::SessionContext}; 5 | use futures::future::BoxFuture; 6 | use tokio::sync::mpsc; 7 | use tracing::{error, info, instrument}; 8 | use uuid::Uuid; 9 | 10 | pub use manager::ExecutionManager; 11 | pub use progress_tracker::ExecutorProgressTracker; 12 | 13 | mod manager; 14 | mod progress_tracker; 15 | mod queue; 16 | 17 | /// Broadcast when queue positions change 18 | #[derive(Debug, Clone)] 19 | pub struct QueueUpdate { 20 | pub execution_id: Uuid, 21 | pub position: usize, 22 | } 23 | 24 | /// An aqueduct pipeline execution 25 | pub struct Execution { 26 | pub id: Uuid, 27 | pub handler: BoxFuture<'static, ()>, 28 | } 29 | 30 | /// Execute an aqueduct pipeline communicating progress back to clients via websocket 31 | #[instrument(skip(client_tx, pipeline), fields(source_count = pipeline.sources.len(), stage_count = pipeline.stages.len()))] 32 | pub async fn execute_pipeline( 33 | execution_id: Uuid, 34 | client_tx: mpsc::Sender, 35 | pipeline: Aqueduct, 36 | max_memory_gb: Option, 37 | ) { 38 | info!(execution_id = %execution_id, "Starting pipeline execution setup"); 39 | 40 | let mut ctx = if let Some(memory_gb) = max_memory_gb { 41 | // Convert max_memory_gb directly to bytes (GB * 1024^3) 42 | let max_memory_bytes = memory_gb * 1024 * 1024 * 1024; 43 | 44 | info!( 45 | execution_id = %execution_id, 46 | memory_gb = memory_gb, 47 | memory_bytes = max_memory_bytes, 48 | "Creating runtime environment with memory limit" 49 | ); 50 | 51 | // Use 0.95 as the memory use percentage (allowing 95% of the limit to be used) 52 | let runtime_env = match RuntimeEnvBuilder::new() 53 | .with_memory_limit(max_memory_bytes, 0.95) 
54 | .build_arc() 55 | { 56 | Ok(env) => env, 57 | Err(e) => { 58 | error!(execution_id = %execution_id, error = %e, "Failed to build runtime environment"); 59 | let _ = client_tx 60 | .send(ExecutorMessage::ExecutionError { 61 | execution_id, 62 | message: format!("Failed to build runtime environment: {}", e), 63 | }) 64 | .await; 65 | return; 66 | } 67 | }; 68 | 69 | let config = datafusion::execution::config::SessionConfig::new(); 70 | SessionContext::new_with_config_rt(config, runtime_env) 71 | } else { 72 | info!(execution_id = %execution_id, "Using session with unlimited memory allocation"); 73 | SessionContext::new() 74 | }; 75 | 76 | aqueducts::custom_udfs::register_all(&mut ctx).expect("failed to register custom_udfs"); 77 | 78 | let num_sources = pipeline.sources.len(); 79 | let num_stages = pipeline 80 | .stages 81 | .iter() 82 | .map(|s| s.len()) 83 | .reduce(|acc, e| acc + e) 84 | .unwrap_or(0); 85 | let num_destinations = pipeline.destination.is_some() as usize; 86 | 87 | let total_steps = num_sources 88 | + num_stages * 2 // 2 progress events per stage (started, completed) 89 | + num_destinations; 90 | 91 | info!( 92 | execution_id = %execution_id, 93 | total_steps = total_steps, 94 | "Creating progress tracker" 95 | ); 96 | 97 | let progress_tracker = Arc::new(ExecutorProgressTracker::new( 98 | client_tx.clone(), 99 | execution_id, 100 | total_steps, 101 | )); 102 | 103 | info!(execution_id = %execution_id, "Starting pipeline execution"); 104 | let result = run_pipeline(Arc::new(ctx), pipeline, Some(progress_tracker)).await; 105 | 106 | match result { 107 | Ok(_) => { 108 | info!(execution_id = %execution_id, "Pipeline executed successfully"); 109 | if let Err(e) = client_tx 110 | .send(ExecutorMessage::ExecutionSucceeded { execution_id }) 111 | .await 112 | { 113 | error!( 114 | execution_id = %execution_id, 115 | error = %e, 116 | "Failed to send error message to client" 117 | ); 118 | } 119 | } 120 | Err(error) => { 121 | error!(execution_id = %execution_id, error = %error, "Pipeline execution failed"); 122 | if let Err(e) = client_tx 123 | .send(ExecutorMessage::ExecutionError { 124 | execution_id, 125 | message: error.to_string(), 126 | }) 127 | .await 128 | { 129 | error!( 130 | execution_id = %execution_id, 131 | error = %e, 132 | "Failed to send error message to client" 133 | ); 134 | } 135 | } 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /aqueducts-executor/src/executor/progress_tracker.rs: -------------------------------------------------------------------------------- 1 | use aqueducts::prelude::*; 2 | use itertools::Itertools; 3 | use std::sync::atomic::AtomicUsize; 4 | use tokio::runtime::Handle; 5 | use tokio::sync::mpsc; 6 | use tracing::{debug, error, info, instrument}; 7 | use uuid::Uuid; 8 | 9 | const MAX_MESSAGE_CHARS: usize = 32_000; 10 | 11 | /// Implementation of ProgressTracker for the executor 12 | pub struct ExecutorProgressTracker { 13 | client_tx: mpsc::Sender, 14 | execution_id: Uuid, 15 | total_steps: usize, 16 | completed_steps: AtomicUsize, 17 | } 18 | 19 | impl ExecutorProgressTracker { 20 | pub fn new( 21 | client_tx: mpsc::Sender, 22 | execution_id: Uuid, 23 | total_steps: usize, 24 | ) -> Self { 25 | info!( 26 | execution_id = %execution_id, 27 | total_steps = total_steps, 28 | "Creating executor progress tracker" 29 | ); 30 | Self { 31 | client_tx, 32 | execution_id, 33 | total_steps, 34 | completed_steps: AtomicUsize::new(0), 35 | } 36 | } 37 | 38 | /// Calculate progress 
percentage based on completed steps 39 | fn calculate_progress(&self, current: usize) -> u8 { 40 | let progress = ((current as f32) / (self.total_steps as f32) * 100.0) as u8; 41 | debug!( 42 | execution_id = %self.execution_id, 43 | current_step = current, 44 | total_steps = self.total_steps, 45 | progress = progress, 46 | "Calculated execution progress" 47 | ); 48 | progress 49 | } 50 | 51 | /// Helper to send a message asynchronously 52 | fn send_message(&self, message: ExecutorMessage) { 53 | let tx = self.client_tx.clone(); 54 | let execution_id = self.execution_id; 55 | 56 | Handle::current().spawn(async move { 57 | debug!(execution_id = %execution_id, "Sending progress message"); 58 | match tx.send(message).await { 59 | Ok(_) => debug!(execution_id = %execution_id, "Progress message sent successfully"), 60 | Err(e) => error!(execution_id = %execution_id, error = %e, "Failed to send progress message"), 61 | } 62 | }); 63 | } 64 | } 65 | 66 | impl ProgressTracker for ExecutorProgressTracker { 67 | #[instrument(skip(self, event), fields(execution_id = %self.execution_id))] 68 | fn on_progress(&self, event: ProgressEvent) { 69 | debug!("Processing progress event"); 70 | 71 | let current = self 72 | .completed_steps 73 | .fetch_add(1, std::sync::atomic::Ordering::SeqCst) 74 | + 1; 75 | let progress = self.calculate_progress(current); 76 | 77 | let message = ExecutorMessage::ProgressUpdate { 78 | execution_id: self.execution_id, 79 | progress, 80 | event, 81 | }; 82 | 83 | // Send the progress update via the channel 84 | self.send_message(message); 85 | } 86 | 87 | #[instrument(skip(self, schema, batches), fields(execution_id = %self.execution_id, stage = %stage_name, output_type = ?output_type))] 88 | fn on_output( 89 | &self, 90 | stage_name: &str, 91 | output_type: OutputType, 92 | schema: &datafusion::common::DFSchema, 93 | batches: &[datafusion::arrow::array::RecordBatch], 94 | ) { 95 | debug!("Processing stage output"); 96 | 97 | // Generate output header based on type 98 | let output_header = match output_type { 99 | OutputType::Show => { 100 | format!("\n📋 Table Data: {stage_name}\n───────────────────────────────────────\n") 101 | } 102 | OutputType::ShowLimit => format!( 103 | "\n📋 Table Data (Preview): {stage_name}\n───────────────────────────────────────\n" 104 | ), 105 | OutputType::Explain => { 106 | format!("\n🔍 Query Plan: {stage_name}\n───────────────────────────────────────\n") 107 | } 108 | OutputType::ExplainAnalyze => format!( 109 | "\n📊 Query Metrics: {stage_name}\n───────────────────────────────────────\n" 110 | ), 111 | OutputType::PrintSchema => format!( 112 | "\n🔢 Schema: {stage_name}\n───────────────────────────────────────\n{schema:#?}\n" 113 | ), 114 | }; 115 | 116 | self.send_message(ExecutorMessage::StageOutput { 117 | execution_id: self.execution_id, 118 | stage_name: stage_name.to_string(), 119 | payload: StageOutputMessage::OutputStart { output_header }, 120 | }); 121 | 122 | let output = match datafusion::arrow::util::pretty::pretty_format_batches(batches) { 123 | Ok(output) => output, 124 | Err(e) => { 125 | error!(error = %e, "Failed to format stage output"); 126 | return; 127 | } 128 | }; 129 | 130 | let output_str = output.to_string(); 131 | let chunks = chunk_by_chars(&output_str, MAX_MESSAGE_CHARS); 132 | 133 | info!( 134 | chunk_count = chunks.len(), 135 | total_size = output_str.len(), 136 | "Chunking stage output" 137 | ); 138 | 139 | for (sequence, chunk) in chunks.into_iter().enumerate() { 140 | debug!( 141 | sequence = sequence, 142 | 
chunk_size = chunk.len(), 143 | "Sending output chunk" 144 | ); 145 | 146 | self.send_message(ExecutorMessage::StageOutput { 147 | execution_id: self.execution_id, 148 | stage_name: stage_name.to_string(), 149 | payload: StageOutputMessage::OutputChunk { 150 | sequence, 151 | body: chunk, 152 | }, 153 | }); 154 | } 155 | 156 | self.send_message(ExecutorMessage::StageOutput { 157 | execution_id: self.execution_id, 158 | stage_name: stage_name.to_string(), 159 | payload: StageOutputMessage::OutputEnd { 160 | output_footer: String::from(""), 161 | }, 162 | }); 163 | 164 | debug!("Stage output processing complete"); 165 | } 166 | } 167 | 168 | fn chunk_by_chars(s: &str, max_chars: usize) -> Vec { 169 | s.chars() 170 | .chunks(max_chars) 171 | .into_iter() 172 | .map(|chunk| chunk.collect()) 173 | .collect() 174 | } 175 | -------------------------------------------------------------------------------- /aqueducts-executor/src/executor/queue.rs: -------------------------------------------------------------------------------- 1 | use std::collections::VecDeque; 2 | 3 | use tokio::sync::broadcast; 4 | 5 | use super::{Execution, QueueUpdate}; 6 | 7 | /// Queue of pending jobs + broadcaster for queue updates 8 | pub struct ExecutionQueue { 9 | queue: VecDeque, 10 | broadcaster: broadcast::Sender, 11 | } 12 | 13 | impl ExecutionQueue { 14 | pub fn new(capacity: usize) -> Self { 15 | let (tx, _rx) = broadcast::channel(capacity); 16 | Self { 17 | queue: VecDeque::new(), 18 | broadcaster: tx, 19 | } 20 | } 21 | 22 | pub fn enqueue(&mut self, job: Execution) -> broadcast::Receiver { 23 | let rx = self.broadcaster.subscribe(); 24 | self.queue.push_back(job); 25 | self.broadcast_positions(); 26 | rx 27 | } 28 | 29 | pub fn dequeue(&mut self) -> Option { 30 | let job = self.queue.pop_front(); 31 | if job.is_some() { 32 | self.broadcast_positions(); 33 | } 34 | job 35 | } 36 | 37 | fn broadcast_positions(&self) { 38 | for (idx, execution) in self.queue.iter().enumerate() { 39 | let _ = self.broadcaster.send(QueueUpdate { 40 | execution_id: execution.id, 41 | position: idx, 42 | }); 43 | } 44 | } 45 | } 46 | 47 | // Unit tests for ExecutionQueue 48 | #[cfg(test)] 49 | mod tests { 50 | use super::*; 51 | use uuid::Uuid; 52 | 53 | fn dummy_execution() -> (Uuid, Execution) { 54 | let id = Uuid::new_v4(); 55 | let handler = Box::pin(async move {}); 56 | let execution = Execution { id, handler }; 57 | 58 | (id, execution) 59 | } 60 | 61 | #[tokio::test] 62 | async fn enqueue_broadcasts_position() { 63 | let mut queue = ExecutionQueue::new(10); 64 | let (execution_id, job) = dummy_execution(); 65 | 66 | let mut rx = queue.enqueue(job); 67 | let update_event = rx.recv().await.unwrap(); 68 | 69 | assert_eq!(update_event.execution_id, execution_id); 70 | assert_eq!(update_event.position, 0); 71 | } 72 | 73 | #[tokio::test] 74 | async fn enqueue_two_broadcasts_both_positions() { 75 | let mut queue = ExecutionQueue::new(10); 76 | let (execution_id_1, execution_1) = dummy_execution(); 77 | let mut rx_1 = queue.enqueue(execution_1); 78 | 79 | // consume initial update for execution_1 80 | let _ = rx_1.recv().await.unwrap(); 81 | 82 | let (execution_id_2, execution_2) = dummy_execution(); 83 | let mut rx_2 = queue.enqueue(execution_2); 84 | 85 | // rx_1 should receive both execution_1 and execution_2 positions 86 | let update_event_1 = rx_1.recv().await.unwrap(); 87 | let update_event_2 = rx_1.recv().await.unwrap(); 88 | assert_eq!(update_event_1.execution_id, execution_id_1); 89 | assert_eq!(update_event_1.position, 0); 90 
| assert_eq!(update_event_2.execution_id, execution_id_2); 91 | assert_eq!(update_event_2.position, 1); 92 | 93 | // rx_2 should also get both 94 | let update_event_1 = rx_2.recv().await.unwrap(); 95 | let update_event_2 = rx_2.recv().await.unwrap(); 96 | assert_eq!(update_event_1.execution_id, execution_id_1); 97 | assert_eq!(update_event_2.execution_id, execution_id_2); 98 | } 99 | 100 | #[tokio::test] 101 | async fn dequeue_broadcasts_updated_positions() { 102 | let mut queue = ExecutionQueue::new(10); 103 | let (execution_id_1, execution_1) = dummy_execution(); 104 | let mut rx = queue.enqueue(execution_1); 105 | 106 | // consume initial update for execution_1 107 | let _ = rx.recv().await.unwrap(); 108 | 109 | let (execution_id_2, execution_2) = dummy_execution(); 110 | let _ = queue.enqueue(execution_2); 111 | 112 | // consume updates from enqueue execution_2 113 | let _ = rx.recv().await.unwrap(); 114 | let _ = rx.recv().await.unwrap(); 115 | 116 | let removed = queue.dequeue().unwrap(); 117 | assert_eq!(removed.id, execution_id_1); 118 | 119 | // consume update from dequeue 120 | let upd = rx.recv().await.unwrap(); 121 | assert_eq!(upd.execution_id, execution_id_2); 122 | assert_eq!(upd.position, 0); 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /aqueducts-executor/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::{net::SocketAddr, str::FromStr, sync::Arc, time::Duration}; 2 | 3 | use axum::Router; 4 | use clap::Parser; 5 | use config::Config; 6 | use executor::ExecutionManager; 7 | use tokio::signal; 8 | use tokio_util::sync::CancellationToken; 9 | use tracing::{error, info, Level}; 10 | use tracing_subscriber::{fmt, prelude::*, EnvFilter}; 11 | use uuid::Uuid; 12 | 13 | mod api; 14 | mod config; 15 | mod error; 16 | mod executor; 17 | 18 | /// Remote executor for Aqueducts data pipeline framework 19 | #[derive(Debug, Parser)] 20 | #[command(version, about, long_about = None)] 21 | struct Cli { 22 | /// API key for authentication 23 | #[arg(long, env = "AQUEDUCTS_API_KEY")] 24 | api_key: String, 25 | 26 | /// Host address to bind to 27 | #[arg(long, env = "AQUEDUCTS_HOST", default_value = "0.0.0.0")] 28 | host: String, 29 | 30 | /// Port to listen on 31 | #[arg(long, env = "AQUEDUCTS_PORT", default_value = "3031")] 32 | port: u16, 33 | 34 | /// Maximum memory usage in GB (optional) 35 | #[arg(long, env = "AQUEDUCTS_MAX_MEMORY")] 36 | max_memory: Option, 37 | 38 | /// URL of Aqueducts server for registration (optional) 39 | #[arg(long, env = "AQUEDUCTS_SERVER_URL")] 40 | server_url: Option, 41 | 42 | /// Unique identifier for this executor (optional) 43 | #[arg(long, env = "AQUEDUCTS_EXECUTOR_ID")] 44 | executor_id: Option, 45 | 46 | /// Logging level (info, debug, trace) 47 | #[arg(long, env = "AQUEDUCTS_LOG_LEVEL", default_value = "info")] 48 | log_level: String, 49 | } 50 | 51 | type ApiContextRef = Arc; 52 | 53 | pub struct ApiContext { 54 | pub config: Config, 55 | pub manager: Arc, 56 | } 57 | 58 | impl ApiContext { 59 | pub fn new(config: Config) -> Self { 60 | Self { 61 | config, 62 | manager: Arc::new(ExecutionManager::new(100)), 63 | } 64 | } 65 | } 66 | 67 | #[tokio::main] 68 | async fn main() { 69 | let cli = Cli::parse(); 70 | 71 | let log_level = Level::from_str(cli.log_level.to_lowercase().as_str()).unwrap_or(Level::INFO); 72 | tracing_subscriber::registry() 73 | .with( 74 | fmt::layer() 75 | .json() 76 | .with_current_span(true) 77 | 
.with_span_list(true) 78 | .with_target(true), 79 | ) 80 | .with(EnvFilter::from_default_env().add_directive(log_level.into())) 81 | .init(); 82 | 83 | let executor_id = cli.executor_id.unwrap_or_else(Uuid::new_v4); 84 | info!( 85 | executor_id = %executor_id, 86 | version = %env!("CARGO_PKG_VERSION"), 87 | "Starting Aqueducts Executor" 88 | ); 89 | 90 | let config = match Config::try_new(cli.api_key, executor_id, cli.max_memory) { 91 | Ok(config) => config, 92 | Err(e) => { 93 | error!("Configuration error: {}", e); 94 | std::process::exit(1); 95 | } 96 | }; 97 | 98 | info!( 99 | executor_id = %config.executor_id, 100 | max_memory_gb = ?config.max_memory_gb, 101 | "Configuration validated successfully" 102 | ); 103 | 104 | let context = Arc::new(ApiContext::new(config)); 105 | 106 | // Create shutdown signal handler 107 | let shutdown_token = CancellationToken::new(); 108 | let shutdown_token_ = shutdown_token.clone(); 109 | 110 | // Spawn a task to handle shutdown signals 111 | tokio::spawn(async move { 112 | handle_shutdown_signals(shutdown_token_).await; 113 | }); 114 | 115 | // Start the execution manager 116 | let manager_handle = { 117 | let manager = context.manager.clone(); 118 | tokio::spawn(async move { 119 | manager.start().await; 120 | }) 121 | }; 122 | 123 | let app = Router::new() 124 | .merge(api::router(Arc::clone(&context))) 125 | .with_state(context); 126 | 127 | let addr: SocketAddr = match format!("{}:{}", cli.host, cli.port).parse() { 128 | Ok(addr) => addr, 129 | Err(e) => { 130 | error!("Failed to parse socket address: {}", e); 131 | std::process::exit(1); 132 | } 133 | }; 134 | 135 | info!(addr = %addr, "Listening for connections"); 136 | let listener = match tokio::net::TcpListener::bind(addr).await { 137 | Ok(listener) => listener, 138 | Err(e) => { 139 | error!("Failed to bind to address {}: {}", addr, e); 140 | std::process::exit(1); 141 | } 142 | }; 143 | 144 | info!("Server started, press Ctrl+C to stop"); 145 | let server_handle = axum::serve(listener, app) 146 | .with_graceful_shutdown(shutdown_signal_handler(shutdown_token)) 147 | .await; 148 | 149 | match server_handle { 150 | Ok(_) => info!("Server shut down gracefully"), 151 | Err(e) => error!(error = %e, "Server error during shutdown"), 152 | } 153 | 154 | info!("Forcing shutdown of the execution manager"); 155 | drop(manager_handle); 156 | 157 | info!("Aqueducts executor shutdown complete"); 158 | } 159 | 160 | /// Handler function for shutdown signals 161 | async fn handle_shutdown_signals(shutdown_token: CancellationToken) { 162 | let ctrl_c = async { 163 | signal::ctrl_c() 164 | .await 165 | .expect("Failed to install Ctrl+C handler"); 166 | }; 167 | 168 | #[cfg(unix)] 169 | let terminate = async { 170 | signal::unix::signal(signal::unix::SignalKind::terminate()) 171 | .expect("Failed to install signal handler") 172 | .recv() 173 | .await; 174 | }; 175 | 176 | #[cfg(not(unix))] 177 | let terminate = std::future::pending::<()>(); 178 | 179 | tokio::select! 
{ 180 | _ = ctrl_c => { 181 | info!("Received Ctrl+C, starting graceful shutdown"); 182 | }, 183 | _ = terminate => { 184 | info!("Received SIGTERM, starting graceful shutdown"); 185 | }, 186 | } 187 | 188 | // Signal the server to shut down 189 | shutdown_token.cancel(); 190 | } 191 | 192 | /// Returns a future that resolves when the shutdown signal is received 193 | async fn shutdown_signal_handler(token: CancellationToken) { 194 | token.cancelled().await; 195 | info!("Shutdown signal received, starting graceful shutdown"); 196 | 197 | // Give in-flight requests some time to complete 198 | tokio::time::sleep(Duration::from_secs(1)).await; 199 | } 200 | -------------------------------------------------------------------------------- /aqueducts/core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "aqueducts-core" 3 | authors.workspace = true 4 | edition.workspace = true 5 | description.workspace = true 6 | repository.workspace = true 7 | readme.workspace = true 8 | version.workspace = true 9 | homepage.workspace = true 10 | keywords.workspace = true 11 | categories.workspace = true 12 | license.workspace = true 13 | 14 | [features] 15 | default = ["yaml"] 16 | s3 = ["object_store/aws", "aqueducts-delta?/s3"] 17 | gcs = ["object_store/gcp", "aqueducts-delta?/gcs"] 18 | azure = ["object_store/azure", "aqueducts-delta?/azure"] 19 | odbc = ["dep:aqueducts-odbc"] 20 | delta = ["dep:aqueducts-delta"] 21 | json = ["dep:serde_json"] 22 | yaml = ["dep:serde_yml"] 23 | toml = ["dep:toml"] 24 | custom_udfs = ["dep:datafusion-functions-json", "dep:serde_json"] 25 | 26 | [dependencies] 27 | aqueducts-schemas.workspace = true 28 | 29 | # Optional provider dependencies 30 | aqueducts-delta = { workspace = true, optional = true } 31 | aqueducts-odbc = { workspace = true, optional = true } 32 | 33 | datafusion.workspace = true 34 | datafusion-functions-json = { workspace = true, optional = true } 35 | object_store = { version = "0.12", default-features = false } 36 | 37 | serde.workspace = true 38 | serde_json = { workspace = true, optional = true } 39 | serde_yml = { workspace = true, optional = true } 40 | toml = { workspace = true, optional = true } 41 | 42 | tokio.workspace = true 43 | 44 | thiserror.workspace = true 45 | 46 | tracing.workspace = true 47 | 48 | url.workspace = true 49 | regex.workspace = true 50 | 51 | [dev-dependencies] 52 | tokio = { workspace = true, features = ["full"] } 53 | rand.workspace = true 54 | tracing-test.workspace = true 55 | tempfile = "3" 56 | -------------------------------------------------------------------------------- /aqueducts/core/src/custom_udfs.rs: -------------------------------------------------------------------------------- 1 | use datafusion::arrow::array::{as_string_array, Array, ListBuilder, StringBuilder}; 2 | use datafusion::error::DataFusionError; 3 | use datafusion::execution::FunctionRegistry; 4 | use datafusion::logical_expr::Volatility; 5 | use datafusion::{ 6 | arrow::array::ArrayRef, 7 | arrow::datatypes::DataType, 8 | arrow::datatypes::Field, 9 | logical_expr::{create_udf, ScalarUDF}, 10 | physical_plan::ColumnarValue, 11 | }; 12 | use std::sync::Arc; 13 | 14 | fn unnest_json_array_udf() -> datafusion::logical_expr::ScalarUDF { 15 | let fun = Arc::new( 16 | |args: &[ColumnarValue]| -> datafusion::error::Result { 17 | assert_eq!(args.len(), 1); 18 | 19 | let arrays = ColumnarValue::values_to_arrays(args)?; 20 | let sarr = as_string_array(&arrays[0]); 21 | 22 | let mut 
builder = ListBuilder::new(StringBuilder::new()); 23 | 24 | for i in 0..sarr.len() { 25 | if sarr.is_null(i) { 26 | builder.append(false); 27 | } else { 28 | let txt = sarr.value(i); 29 | let v: serde_json::Value = serde_json::from_str(txt) 30 | .map_err(|e| DataFusionError::Execution(e.to_string()))?; 31 | 32 | if let serde_json::Value::Array(elems) = v { 33 | for elem in elems { 34 | let s = elem.to_string(); 35 | builder.values().append_value(&s); 36 | } 37 | builder.append(true); 38 | } else { 39 | return Err(DataFusionError::Execution(format!( 40 | "unnest_json_array: expected JSON array, got {}", 41 | v 42 | ))); 43 | } 44 | } 45 | } 46 | 47 | let array = builder.finish(); 48 | Ok(ColumnarValue::Array(Arc::new(array) as ArrayRef)) 49 | }, 50 | ); 51 | 52 | create_udf( 53 | "unnest_json_array", 54 | vec![DataType::Utf8], 55 | DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), 56 | Volatility::Immutable, 57 | fun, 58 | ) 59 | } 60 | 61 | pub fn register_all(registry: &mut dyn FunctionRegistry) -> datafusion::error::Result<()> { 62 | let functions: Vec> = vec![Arc::new(unnest_json_array_udf())]; 63 | 64 | for function in functions { 65 | registry.register_udf(function)?; 66 | } 67 | 68 | datafusion_functions_json::register_all(registry)?; 69 | 70 | Ok(()) 71 | } 72 | 73 | #[cfg(test)] 74 | mod tests { 75 | use super::*; 76 | use datafusion::{ 77 | arrow::array::RecordBatch, assert_batches_sorted_eq, common::DFSchema, prelude::*, 78 | }; 79 | 80 | async fn prepare_df(json: &str) -> (DFSchema, Vec) { 81 | let ctx = SessionContext::new(); 82 | ctx.register_udf(unnest_json_array_udf()); 83 | let df = ctx 84 | .sql(&format!( 85 | "SELECT unnest_json_array(c) AS arr \ 86 | FROM (VALUES ('{}')) AS t(c)", 87 | json 88 | )) 89 | .await 90 | .unwrap(); 91 | let schema = df.schema().clone(); 92 | let batches = df.collect().await.unwrap(); 93 | (schema, batches) 94 | } 95 | 96 | #[tokio::test] 97 | async fn test_unnest_json_array_numbers() { 98 | let (schema, batches) = prepare_df("[1, 2, 3]").await; 99 | 100 | let field = schema.field(0); 101 | let expected_type = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); 102 | assert_eq!( 103 | field.data_type(), 104 | &expected_type, 105 | "Expected return type List, got {:?}", 106 | field.data_type() 107 | ); 108 | 109 | let expected = [ 110 | "+-----------+", 111 | "| arr |", 112 | "+-----------+", 113 | "| [1, 2, 3] |", 114 | "+-----------+", 115 | ]; 116 | assert_batches_sorted_eq!(&expected, &batches); 117 | } 118 | 119 | #[tokio::test] 120 | async fn test_unnest_json_array_strings() { 121 | let (schema, batches) = prepare_df(r#"["foo", "bar"]"#).await; 122 | 123 | let field = schema.field(0); 124 | let expected_type = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); 125 | assert_eq!( 126 | field.data_type(), 127 | &expected_type, 128 | "Expected return type List, got {:?}", 129 | field.data_type() 130 | ); 131 | 132 | let expected = [ 133 | "+----------------+", 134 | "| arr |", 135 | "+----------------+", 136 | "| [\"foo\", \"bar\"] |", 137 | "+----------------+", 138 | ]; 139 | assert_batches_sorted_eq!(&expected, &batches); 140 | } 141 | 142 | #[tokio::test] 143 | async fn test_unnest_json_array_objects() { 144 | let (schema, batches) = prepare_df(r#"[{"x":1}, {"y":"foo"}]"#).await; 145 | 146 | let field = schema.field(0); 147 | let expected_type = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); 148 | assert_eq!( 149 | field.data_type(), 150 | &expected_type, 151 | 
"Expected return type List, got {:?}", 152 | field.data_type() 153 | ); 154 | 155 | let expected = [ 156 | "+------------------------+", 157 | "| arr |", 158 | "+------------------------+", 159 | "| [{\"x\":1}, {\"y\":\"foo\"}] |", 160 | "+------------------------+", 161 | ]; 162 | assert_batches_sorted_eq!(&expected, &batches); 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /aqueducts/core/src/destinations/mod.rs: -------------------------------------------------------------------------------- 1 | use aqueducts_schemas::Destination; 2 | use datafusion::{dataframe::DataFrame, datasource::MemTable, execution::context::SessionContext}; 3 | use std::sync::Arc; 4 | use tracing::{debug, instrument}; 5 | 6 | use crate::error::{AqueductsError, Result}; 7 | use crate::store::register_object_store; 8 | 9 | pub mod file; 10 | 11 | /// Creates a `Destination` 12 | #[instrument(skip(ctx, destination), err)] 13 | pub async fn register_destination( 14 | ctx: Arc, 15 | destination: &Destination, 16 | ) -> Result<()> { 17 | match destination { 18 | Destination::InMemory(_) => Ok(()), 19 | Destination::File(file_def) => { 20 | register_object_store(ctx, &file_def.location, &file_def.storage_config)?; 21 | Ok(()) 22 | } 23 | #[cfg(feature = "odbc")] 24 | Destination::Odbc(odbc_dest) => { 25 | debug!("Preparing ODBC destination '{}'", odbc_dest.name); 26 | aqueducts_odbc::register_odbc_destination( 27 | &odbc_dest.connection_string, 28 | &odbc_dest.name, 29 | ) 30 | .await 31 | .map_err(|e| { 32 | AqueductsError::destination( 33 | &odbc_dest.name, 34 | format!("ODBC destination error: {}", e), 35 | ) 36 | })?; 37 | Ok(()) 38 | } 39 | #[cfg(feature = "delta")] 40 | Destination::Delta(delta_dest) => { 41 | debug!("Preparing Delta destination '{}'", delta_dest.name); 42 | 43 | let arrow_fields: Result> = delta_dest 44 | .schema 45 | .iter() 46 | .map(|field| { 47 | crate::schema_transform::field_to_arrow(field).map_err(|e| { 48 | AqueductsError::schema_validation(format!("Schema conversion error: {}", e)) 49 | }) 50 | }) 51 | .collect(); 52 | let arrow_fields = arrow_fields?; 53 | 54 | aqueducts_delta::prepare_delta_destination( 55 | &delta_dest.name, 56 | delta_dest.location.as_str(), 57 | &delta_dest.storage_config, 58 | &delta_dest.partition_columns, 59 | &delta_dest.table_properties, 60 | &arrow_fields, 61 | ) 62 | .await 63 | .map_err(|e| { 64 | AqueductsError::destination( 65 | &delta_dest.name, 66 | format!("Delta destination error: {}", e), 67 | ) 68 | })?; 69 | Ok(()) 70 | } 71 | #[cfg(not(feature = "odbc"))] 72 | Destination::Odbc(dest) => Err(AqueductsError::unsupported( 73 | &dest.name, 74 | "ODBC support not enabled. Enable 'odbc' feature", 75 | )), 76 | #[cfg(not(feature = "delta"))] 77 | Destination::Delta(dest) => Err(AqueductsError::unsupported( 78 | &dest.name, 79 | "Delta support not enabled. 
Enable 'delta' feature", 80 | )), 81 | } 82 | } 83 | 84 | /// Write a `DataFrame` to an Aqueduct `Destination` 85 | #[instrument(skip(ctx, destination, data), err)] 86 | pub async fn write_to_destination( 87 | ctx: Arc, 88 | destination: &Destination, 89 | data: DataFrame, 90 | ) -> Result<()> { 91 | match destination { 92 | Destination::InMemory(mem_def) => { 93 | debug!("Writing data to in-memory table '{}'", mem_def.name); 94 | 95 | let schema = data.schema().clone(); 96 | let partitioned = data.collect_partitioned().await?; 97 | let table = MemTable::try_new(Arc::new(schema.as_arrow().clone()), partitioned)?; 98 | 99 | ctx.register_table(mem_def.name.as_str(), Arc::new(table))?; 100 | 101 | Ok(()) 102 | } 103 | Destination::File(file_def) => { 104 | debug!("Writing data to file at location '{}'", file_def.location); 105 | file::write(file_def, data).await?; 106 | 107 | Ok(()) 108 | } 109 | #[cfg(feature = "odbc")] 110 | Destination::Odbc(odbc_dest) => { 111 | debug!("Writing data to ODBC destination '{}'", odbc_dest.name); 112 | 113 | let schema = data.schema().as_arrow().clone(); 114 | let batches = data.collect().await?; 115 | 116 | aqueducts_odbc::write_arrow_batches( 117 | &odbc_dest.connection_string, 118 | &odbc_dest.name, // Using name as table name 119 | odbc_dest.write_mode.clone(), 120 | batches, 121 | Arc::new(schema), 122 | odbc_dest.batch_size, 123 | ) 124 | .await 125 | .map_err(|e| { 126 | AqueductsError::destination( 127 | &odbc_dest.name, 128 | format!("ODBC destination error: {}", e), 129 | ) 130 | })?; 131 | Ok(()) 132 | } 133 | #[cfg(feature = "delta")] 134 | Destination::Delta(delta_dest) => { 135 | debug!("Writing data to Delta destination '{}'", delta_dest.name); 136 | aqueducts_delta::write_to_delta_destination( 137 | &delta_dest.name, 138 | delta_dest.location.as_str(), 139 | &delta_dest.storage_config, 140 | &delta_dest.write_mode, 141 | data, 142 | ) 143 | .await 144 | .map_err(|e| { 145 | AqueductsError::destination( 146 | &delta_dest.name, 147 | format!("Delta destination error: {}", e), 148 | ) 149 | })?; 150 | Ok(()) 151 | } 152 | #[cfg(not(feature = "odbc"))] 153 | Destination::Odbc(dest) => Err(AqueductsError::unsupported( 154 | &dest.name, 155 | "ODBC support not enabled. Enable 'odbc' feature", 156 | )), 157 | #[cfg(not(feature = "delta"))] 158 | Destination::Delta(dest) => Err(AqueductsError::unsupported( 159 | &dest.name, 160 | "Delta support not enabled. 
Enable 'delta' feature", 161 | )), 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /aqueducts/core/src/error.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::HashSet, path::PathBuf}; 2 | 3 | use crate::templating::TemplateFormat; 4 | 5 | pub type Result = core::result::Result; 6 | 7 | #[derive(Debug, thiserror::Error)] 8 | pub enum AqueductsError { 9 | // === Configuration & Setup Errors === 10 | #[error("Unsupported operation: {operation} for {context}")] 11 | Unsupported { operation: String, context: String }, 12 | 13 | // === Data Processing Errors === 14 | #[error("Schema validation failed: {message}")] 15 | SchemaValidation { message: String }, 16 | 17 | #[error("Data processing failed: {message}")] 18 | DataProcessing { message: String }, 19 | 20 | // === I/O & Storage Errors === 21 | #[error("Storage operation failed: {operation} at {location}")] 22 | Storage { operation: String, location: String }, 23 | 24 | #[error("File operation failed: {message}")] 25 | FileOperation { message: String }, 26 | 27 | // === Pipeline Execution Errors === 28 | #[error("Source '{name}' failed: {message}")] 29 | Source { name: String, message: String }, 30 | 31 | #[error("Stage '{name}' failed: {message}")] 32 | Stage { name: String, message: String }, 33 | 34 | #[error("Destination '{name}' failed: {message}")] 35 | Destination { name: String, message: String }, 36 | 37 | // === Template & Parsing Errors === 38 | #[error("Template error: {message}")] 39 | Template { message: String }, 40 | 41 | #[error("Parse error: {message}")] 42 | Parse { message: String }, 43 | 44 | // === Resource Management === 45 | #[error("Resource not found: {resource} at {location}")] 46 | NotFound { resource: String, location: String }, 47 | } 48 | 49 | impl AqueductsError { 50 | // === Templating === 51 | pub fn unsupported(operation: impl Into, context: impl Into) -> Self { 52 | Self::Unsupported { 53 | operation: operation.into(), 54 | context: context.into(), 55 | } 56 | } 57 | 58 | // === Data Processing === 59 | pub fn schema_validation(message: impl Into) -> Self { 60 | Self::SchemaValidation { 61 | message: message.into(), 62 | } 63 | } 64 | 65 | pub fn data_processing(message: impl Into) -> Self { 66 | Self::DataProcessing { 67 | message: message.into(), 68 | } 69 | } 70 | 71 | // === I/O & Storage === 72 | pub fn storage(operation: impl Into, location: impl Into) -> Self { 73 | Self::Storage { 74 | operation: operation.into(), 75 | location: location.into(), 76 | } 77 | } 78 | 79 | pub fn file_operation(message: impl Into) -> Self { 80 | Self::FileOperation { 81 | message: message.into(), 82 | } 83 | } 84 | 85 | // === Pipeline Execution === 86 | pub fn source(name: impl Into, message: impl Into) -> Self { 87 | Self::Source { 88 | name: name.into(), 89 | message: message.into(), 90 | } 91 | } 92 | 93 | pub fn stage(name: impl Into, message: impl Into) -> Self { 94 | Self::Stage { 95 | name: name.into(), 96 | message: message.into(), 97 | } 98 | } 99 | 100 | pub fn destination(name: impl Into, message: impl Into) -> Self { 101 | Self::Destination { 102 | name: name.into(), 103 | message: message.into(), 104 | } 105 | } 106 | 107 | // === Template & Parsing === 108 | pub fn template(message: impl Into) -> Self { 109 | Self::Template { 110 | message: message.into(), 111 | } 112 | } 113 | 114 | pub fn parse(message: impl Into) -> Self { 115 | Self::Parse { 116 | message: message.into(), 117 | } 118 | 
} 119 | 120 | // === Resource Management === 121 | pub fn not_found(resource: impl Into, location: impl Into) -> Self { 122 | Self::NotFound { 123 | resource: resource.into(), 124 | location: location.into(), 125 | } 126 | } 127 | } 128 | 129 | // === External Error Conversions === 130 | 131 | impl From for AqueductsError { 132 | fn from(err: std::io::Error) -> Self { 133 | Self::FileOperation { 134 | message: err.to_string(), 135 | } 136 | } 137 | } 138 | 139 | impl From for AqueductsError { 140 | fn from(err: datafusion::error::DataFusionError) -> Self { 141 | use datafusion::error::DataFusionError as DF; 142 | match err { 143 | DF::SchemaError(_, _) => Self::SchemaValidation { 144 | message: err.to_string(), 145 | }, 146 | DF::ArrowError(_, _) => Self::DataProcessing { 147 | message: err.to_string(), 148 | }, 149 | DF::IoError(_) => Self::FileOperation { 150 | message: err.to_string(), 151 | }, 152 | _ => Self::DataProcessing { 153 | message: err.to_string(), 154 | }, 155 | } 156 | } 157 | } 158 | 159 | impl From for AqueductsError { 160 | fn from(err: datafusion::arrow::error::ArrowError) -> Self { 161 | use datafusion::arrow::error::ArrowError as AE; 162 | match err { 163 | AE::SchemaError(_) => Self::SchemaValidation { 164 | message: err.to_string(), 165 | }, 166 | AE::ComputeError(_) => Self::DataProcessing { 167 | message: err.to_string(), 168 | }, 169 | AE::IoError(_, _) => Self::FileOperation { 170 | message: err.to_string(), 171 | }, 172 | AE::ParseError(_) => Self::Parse { 173 | message: err.to_string(), 174 | }, 175 | _ => Self::DataProcessing { 176 | message: err.to_string(), 177 | }, 178 | } 179 | } 180 | } 181 | 182 | impl From for AqueductsError { 183 | fn from(err: object_store::Error) -> Self { 184 | Self::Storage { 185 | operation: "object_store".to_string(), 186 | location: err.to_string(), 187 | } 188 | } 189 | } 190 | 191 | impl From for AqueductsError { 192 | fn from(err: regex::Error) -> Self { 193 | Self::Parse { 194 | message: format!("Regex error: {}", err), 195 | } 196 | } 197 | } 198 | 199 | #[cfg(feature = "json")] 200 | impl From for AqueductsError { 201 | fn from(err: serde_json::Error) -> Self { 202 | Self::Parse { 203 | message: format!("JSON error: {}", err), 204 | } 205 | } 206 | } 207 | 208 | #[cfg(feature = "toml")] 209 | impl From for AqueductsError { 210 | fn from(err: toml::de::Error) -> Self { 211 | Self::Parse { 212 | message: format!("TOML deserialization error: {}", err), 213 | } 214 | } 215 | } 216 | 217 | #[cfg(feature = "toml")] 218 | impl From for AqueductsError { 219 | fn from(err: toml::ser::Error) -> Self { 220 | Self::Parse { 221 | message: format!("TOML serialization error: {}", err), 222 | } 223 | } 224 | } 225 | 226 | #[cfg(feature = "yaml")] 227 | impl From for AqueductsError { 228 | fn from(err: serde_yml::Error) -> Self { 229 | Self::Parse { 230 | message: format!("YAML error: {}", err), 231 | } 232 | } 233 | } 234 | 235 | // === Legacy Support for Template-Specific Errors === 236 | 237 | impl From> for AqueductsError { 238 | fn from(missing_params: HashSet) -> Self { 239 | Self::Template { 240 | message: format!("Missing template parameters: {:?}", missing_params), 241 | } 242 | } 243 | } 244 | 245 | impl From<(PathBuf, &'static str)> for AqueductsError { 246 | fn from((path, context): (PathBuf, &'static str)) -> Self { 247 | Self::Template { 248 | message: format!("{}: {:?}", context, path), 249 | } 250 | } 251 | } 252 | 253 | impl From for AqueductsError { 254 | fn from(format: TemplateFormat) -> Self { 255 | Self::Unsupported 
{ 256 | operation: "template format".to_string(), 257 | context: format!( 258 | "{:?} support is not enabled in this build. Enable the corresponding feature flag", 259 | format 260 | ), 261 | } 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /aqueducts/core/src/progress_tracker.rs: -------------------------------------------------------------------------------- 1 | use aqueducts_schemas::{OutputType, ProgressEvent}; 2 | use datafusion::arrow::array::RecordBatch; 3 | use datafusion::common::DFSchema; 4 | use tracing::{error, info, instrument}; 5 | 6 | /// A trait for handling progress events and stage output during pipeline execution. 7 | /// 8 | /// Implement this trait to create custom progress tracking and monitoring for 9 | /// Aqueducts pipeline execution. This allows you to: 10 | /// 11 | /// - Monitor pipeline progress in real-time 12 | /// - Capture and display stage outputs 13 | /// - Send progress updates to external systems 14 | /// - Build custom UIs for pipeline monitoring 15 | /// 16 | /// # Examples 17 | /// 18 | /// ## Basic Custom Progress Tracker 19 | /// 20 | /// ```rust 21 | /// use aqueducts_core::progress_tracker::ProgressTracker; 22 | /// use aqueducts_schemas::{ProgressEvent, OutputType}; 23 | /// use datafusion::arrow::array::RecordBatch; 24 | /// use datafusion::common::DFSchema; 25 | /// 26 | /// struct MyCustomTracker { 27 | /// start_time: std::time::Instant, 28 | /// } 29 | /// 30 | /// impl MyCustomTracker { 31 | /// fn new() -> Self { 32 | /// Self { 33 | /// start_time: std::time::Instant::now(), 34 | /// } 35 | /// } 36 | /// } 37 | /// 38 | /// impl ProgressTracker for MyCustomTracker { 39 | /// fn on_progress(&self, event: ProgressEvent) { 40 | /// match event { 41 | /// ProgressEvent::Started => { 42 | /// println!("Pipeline started at {:?}", self.start_time); 43 | /// } 44 | /// ProgressEvent::SourceRegistered { name } => { 45 | /// println!("Source '{}' registered", name); 46 | /// } 47 | /// ProgressEvent::StageCompleted { name, duration_ms, .. } => { 48 | /// println!("Stage '{}' completed in {}ms", name, duration_ms); 49 | /// } 50 | /// ProgressEvent::Completed { duration_ms } => { 51 | /// println!("Pipeline completed in {}ms", duration_ms); 52 | /// } 53 | /// _ => {} 54 | /// } 55 | /// } 56 | /// 57 | /// fn on_output( 58 | /// &self, 59 | /// stage_name: &str, 60 | /// output_type: OutputType, 61 | /// _schema: &DFSchema, 62 | /// batches: &[RecordBatch], 63 | /// ) { 64 | /// let row_count: usize = batches.iter().map(|b| b.num_rows()).sum(); 65 | /// println!("Stage '{}' produced {} rows ({:?})", stage_name, row_count, output_type); 66 | /// } 67 | /// } 68 | /// ``` 69 | pub trait ProgressTracker: Send + Sync { 70 | /// Called when a progress event occurs during pipeline execution. 71 | /// 72 | /// This method receives various types of progress events: 73 | /// - `Started` - Pipeline execution has begun 74 | /// - `SourceRegistered` - A data source has been registered 75 | /// - `StageStarted` - A processing stage has started 76 | /// - `StageCompleted` - A processing stage has finished 77 | /// - `DestinationCompleted` - Data has been written to destination 78 | /// - `Completed` - Entire pipeline has finished 79 | /// 80 | /// # Arguments 81 | /// 82 | /// * `event` - The progress event that occurred 83 | fn on_progress(&self, event: ProgressEvent); 84 | 85 | /// Called when a stage produces output that should be displayed or captured. 
86 | /// 87 | /// This method is called for stages that use output directives like `show`, 88 | /// `explain`, or `print_schema`. It allows you to capture and process the 89 | /// results of these operations. 90 | /// 91 | /// # Arguments 92 | /// 93 | /// * `stage_name` - Name of the stage producing output 94 | /// * `output_type` - Type of output (Show, Explain, etc.) 95 | /// * `schema` - Schema of the data being output 96 | /// * `batches` - The actual data batches to display 97 | fn on_output( 98 | &self, 99 | stage_name: &str, 100 | output_type: OutputType, 101 | schema: &DFSchema, 102 | batches: &[RecordBatch], 103 | ); 104 | } 105 | 106 | /// A simple progress tracker that logs progress events and stage output using the `tracing` crate. 107 | /// 108 | /// This is the default progress tracker provided by Aqueducts. It logs all progress events 109 | /// and stage outputs using structured logging with emoji icons for better readability. 110 | /// 111 | /// # Examples 112 | /// 113 | /// ```rust,no_run 114 | /// use aqueducts_core::{run_pipeline, progress_tracker::LoggingProgressTracker, templating::TemplateLoader}; 115 | /// use aqueducts_schemas::Aqueduct; 116 | /// use datafusion::prelude::SessionContext; 117 | /// use std::sync::Arc; 118 | /// 119 | /// async fn example() -> Result<(), Box> { 120 | /// let pipeline = Aqueduct::from_file("pipeline.yml", Default::default())?; 121 | /// let ctx = Arc::new(SessionContext::new()); 122 | /// let tracker = Arc::new(LoggingProgressTracker); 123 | /// 124 | /// // This will log progress events as the pipeline executes 125 | /// let _result = run_pipeline(ctx, pipeline, Some(tracker)).await?; 126 | /// 127 | /// Ok(()) 128 | /// } 129 | /// ``` 130 | #[derive(Debug)] 131 | pub struct LoggingProgressTracker; 132 | 133 | impl ProgressTracker for LoggingProgressTracker { 134 | #[instrument(skip_all)] 135 | fn on_progress(&self, event: ProgressEvent) { 136 | match event { 137 | ProgressEvent::Started => { 138 | info!("🚀 Pipeline execution started"); 139 | } 140 | ProgressEvent::SourceRegistered { name } => { 141 | info!("📚 Registered source: {}", name); 142 | } 143 | ProgressEvent::StageStarted { 144 | name, 145 | position, 146 | sub_position, 147 | } => { 148 | info!( 149 | "⚙️ Processing stage: {} (position: {}, sub-position: {})", 150 | name, position, sub_position 151 | ); 152 | } 153 | ProgressEvent::StageCompleted { 154 | name, 155 | position: _, 156 | sub_position: _, 157 | duration_ms, 158 | } => { 159 | info!( 160 | "✅ Completed stage: {} (took: {:.2}s)", 161 | name, 162 | duration_ms as f64 / 1000.0 163 | ); 164 | } 165 | ProgressEvent::DestinationCompleted => { 166 | info!("📦 Data successfully written to destination"); 167 | } 168 | ProgressEvent::Completed { duration_ms } => { 169 | info!( 170 | "🎉 Pipeline execution completed (total time: {:.2}s)", 171 | duration_ms as f64 / 1000.0 172 | ); 173 | } 174 | } 175 | } 176 | 177 | #[instrument(skip_all)] 178 | fn on_output( 179 | &self, 180 | stage_name: &str, 181 | output_type: OutputType, 182 | schema: &DFSchema, 183 | batches: &[RecordBatch], 184 | ) { 185 | let output = datafusion::arrow::util::pretty::pretty_format_batches(batches); 186 | match (output_type, output){ 187 | (OutputType::Show, Ok(output_str)) => info!( 188 | "\n📋 Table Data: {stage_name}\n───────────────────────────────────────\n{output_str}\n" 189 | ), 190 | (OutputType::ShowLimit, Ok(output_str)) => info!( 191 | "\n📋 Table Data (Preview): {stage_name}\n───────────────────────────────────────\n{output_str}\n" 192 | 
), 193 | (OutputType::Explain, Ok(output_str)) => info!( 194 | "\n🔍 Query Plan: {stage_name}\n───────────────────────────────────────\n{output_str}\n" 195 | ), 196 | (OutputType::ExplainAnalyze, Ok(output_str)) => info!( 197 | "\n📊 Query Metrics: {stage_name}\n───────────────────────────────────────\n{output_str}\n" 198 | ), 199 | (OutputType::PrintSchema, Ok(_)) => info!( 200 | "\n🔢 Schema: {stage_name}\n───────────────────────────────────────\n{schema:#?}\n" 201 | ), 202 | _ => error!("❗\n Failed to produce stage output\n") 203 | } 204 | } 205 | } 206 | -------------------------------------------------------------------------------- /aqueducts/core/src/stages/mod.rs: -------------------------------------------------------------------------------- 1 | use aqueducts_schemas::{OutputType, Stage}; 2 | use datafusion::{ 3 | datasource::MemTable, 4 | execution::context::{SQLOptions, SessionContext}, 5 | }; 6 | use std::sync::Arc; 7 | use tracing::instrument; 8 | 9 | use crate::error::{AqueductsError, Result}; 10 | 11 | /// Process a stage in the Aqueduct pipeline 12 | /// The result of the operation will be registered within the `SessionContext` as an 13 | /// in-memory table using the stages name as the table name 14 | /// Does not allow for ddl/dml queries or SQL statements (e.g. SET VARIABLE, CREATE TABLE, etc.) 15 | #[instrument(skip_all, err)] 16 | pub async fn process_stage( 17 | ctx: Arc, 18 | stage: Stage, 19 | progress_tracker: Option>, 20 | ) -> Result<()> { 21 | let options = SQLOptions::new() 22 | .with_allow_ddl(false) 23 | .with_allow_dml(false) 24 | .with_allow_statements(false); 25 | 26 | let result = ctx 27 | .sql_with_options(stage.query.as_str(), options) 28 | .await? 29 | .cache() 30 | .await 31 | .map_err(|e| { 32 | AqueductsError::stage( 33 | &stage.name, 34 | format!("Error occured during stage execution: {e}"), 35 | ) 36 | })?; 37 | let schema = result.schema().clone(); 38 | 39 | if stage.explain || stage.explain_analyze { 40 | let output_type = if stage.explain_analyze { 41 | OutputType::ExplainAnalyze 42 | } else { 43 | OutputType::Explain 44 | }; 45 | 46 | let explain = result.clone().explain(false, stage.explain_analyze)?; 47 | let batches = explain.collect().await?; 48 | 49 | if let Some(tracker) = &progress_tracker { 50 | tracker.on_output(&stage.name, output_type, &schema, &batches); 51 | } 52 | } 53 | 54 | match stage.show { 55 | Some(0) => { 56 | let batches = result.clone().collect().await?; 57 | if let Some(tracker) = &progress_tracker { 58 | tracker.on_output(&stage.name, OutputType::Show, &schema, &batches); 59 | } 60 | } 61 | Some(limit) => { 62 | let batches = result.clone().limit(0, Some(limit))?.collect().await?; 63 | if let Some(tracker) = &progress_tracker { 64 | tracker.on_output(&stage.name, OutputType::ShowLimit, &schema, &batches); 65 | } 66 | } 67 | _ => (), 68 | }; 69 | 70 | if stage.print_schema { 71 | if let Some(tracker) = &progress_tracker { 72 | let schema = result.schema(); 73 | tracker.on_output(&stage.name, OutputType::PrintSchema, schema, &[]); 74 | } 75 | } 76 | 77 | let partitioned = result.collect_partitioned().await?; 78 | let table = MemTable::try_new(Arc::new(schema.as_arrow().clone()), partitioned)?; 79 | 80 | ctx.register_table(stage.name.as_str(), Arc::new(table))?; 81 | 82 | Ok(()) 83 | } 84 | -------------------------------------------------------------------------------- /aqueducts/core/src/store/azure.rs: -------------------------------------------------------------------------------- 1 | //! 
# Azure Blob Storage Object Store Provider 2 | //! 3 | //! This module provides an Azure Blob Storage implementation of the `ObjectStoreProvider` trait 4 | //! using the `object_store` crate's Microsoft Azure backend. 5 | 6 | use object_store::azure::MicrosoftAzureBuilder; 7 | use std::{collections::HashMap, sync::Arc}; 8 | use tracing::warn; 9 | use url::Url; 10 | 11 | use super::ObjectStoreProvider; 12 | use crate::error::{AqueductsError, Result}; 13 | 14 | /// Provider for Azure Blob Storage. 15 | /// 16 | /// This provider supports: 17 | /// - `az://` URLs for Azure Blob Storage 18 | /// - `azure://` URLs (alternative Azure scheme) 19 | /// - `abfs://` URLs (Azure Data Lake Storage Gen2) 20 | /// - `abfss://` URLs (Azure Data Lake Storage Gen2 with SSL) 21 | /// 22 | /// ## Automatic Environment Variable Configuration 23 | /// 24 | /// The provider automatically reads Azure credentials and configuration from environment variables: 25 | /// - `AZURE_STORAGE_ACCOUNT_NAME` - Storage account name 26 | /// - `AZURE_STORAGE_ACCOUNT_KEY` - Storage account access key 27 | /// - `AZURE_CLIENT_ID` - Azure AD application client ID 28 | /// - `AZURE_CLIENT_SECRET` - Azure AD application client secret 29 | /// - `AZURE_TENANT_ID` - Azure AD tenant ID 30 | /// 31 | /// ## Supported Configuration Options 32 | /// 33 | /// | Option | Description | Environment Variable | 34 | /// |-------------------------------|----------------------------|------------------------------| 35 | /// | `azure_storage_account_name` | Storage account name | `AZURE_STORAGE_ACCOUNT_NAME` | 36 | /// | `azure_storage_account_key` | Storage account access key | `AZURE_STORAGE_ACCOUNT_KEY` | 37 | /// | `azure_storage_client_id` | Azure AD client ID | `AZURE_CLIENT_ID` | 38 | /// | `azure_storage_client_secret` | Azure AD client secret | `AZURE_CLIENT_SECRET` | 39 | /// | `azure_storage_tenant_id` | Azure AD tenant ID | `AZURE_TENANT_ID` | 40 | /// | `azure_storage_use_emulator` | Use storage emulator | - | 41 | /// | `azure_storage_use_azure_cli` | Use Azure CLI credentials | - | 42 | /// | `azure_federated_token_file` | Federated token file path | - | 43 | /// | `azure_use_fabric_endpoint` | Use Fabric endpoint | - | 44 | /// | `azure_msi_endpoint` | MSI endpoint URL | - | 45 | /// | `azure_disable_tagging` | Disable object tagging | - | 46 | pub struct AzureProvider; 47 | 48 | impl ObjectStoreProvider for AzureProvider { 49 | fn supports_scheme(&self, scheme: &str) -> bool { 50 | matches!(scheme, "az" | "azure" | "abfs" | "abfss") 51 | } 52 | 53 | fn create_store( 54 | &self, 55 | location: &Url, 56 | options: &HashMap, 57 | ) -> Result> { 58 | let mut builder = MicrosoftAzureBuilder::from_env(); 59 | 60 | if let Some(account) = location.host_str() { 61 | let container = location 62 | .path() 63 | .trim_start_matches('/') 64 | .split('/') 65 | .next() 66 | .unwrap_or(""); 67 | if !container.is_empty() { 68 | builder = builder.with_container_name(container); 69 | } 70 | 71 | if account.ends_with(".blob.core.windows.net") { 72 | let account_name = account.replace(".blob.core.windows.net", ""); 73 | builder = builder.with_account(account_name); 74 | } else { 75 | builder = builder.with_account(account); 76 | } 77 | } 78 | 79 | for (key, value) in options { 80 | builder = match key.as_str() { 81 | "azure_storage_account_name" | "account_name" => builder.with_account(value), 82 | "azure_storage_account_key" | "account_key" => builder.with_access_key(value), 83 | "azure_storage_client_id" | "client_id" => 
builder.with_client_id(value), 84 | "azure_storage_client_secret" | "client_secret" => { 85 | builder.with_client_secret(value) 86 | } 87 | "azure_storage_tenant_id" | "tenant_id" => builder.with_tenant_id(value), 88 | "azure_storage_use_emulator" => { 89 | builder.with_use_emulator(value.parse::().unwrap_or(false)) 90 | } 91 | "azure_storage_use_azure_cli" => { 92 | builder.with_use_azure_cli(value.parse::().unwrap_or(false)) 93 | } 94 | "azure_federated_token_file" => builder.with_federated_token_file(value), 95 | "azure_use_fabric_endpoint" => { 96 | builder.with_use_fabric_endpoint(value.parse::().unwrap_or(false)) 97 | } 98 | "azure_msi_endpoint" => builder.with_msi_endpoint(value), 99 | "azure_disable_tagging" => { 100 | builder.with_disable_tagging(value.parse::().unwrap_or(false)) 101 | } 102 | unknown => { 103 | warn!("Unknown object_store configuration key: {unknown}"); 104 | builder 105 | } 106 | }; 107 | } 108 | 109 | builder 110 | .build() 111 | .map(|store| Arc::new(store) as Arc) 112 | .map_err(|e| AqueductsError::storage("object_store", e.to_string())) 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /aqueducts/core/src/store/gcs.rs: -------------------------------------------------------------------------------- 1 | //! # Google Cloud Storage Object Store Provider 2 | //! 3 | //! This module provides a GCS implementation of the `ObjectStoreProvider` trait 4 | //! using the `object_store` crate's Google Cloud Storage backend. 5 | 6 | use object_store::gcp::GoogleCloudStorageBuilder; 7 | use std::{collections::HashMap, sync::Arc}; 8 | use tracing::warn; 9 | use url::Url; 10 | 11 | use super::ObjectStoreProvider; 12 | use crate::error::{AqueductsError, Result}; 13 | 14 | /// Provider for Google Cloud Storage. 
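/// 
/// For illustration, a minimal sketch of how this provider is typically invoked with a `gs://` URL (the bucket name and credentials path are placeholders, and the option key is taken from the configuration table further below): 
/// 
/// ```rust,ignore 
/// use std::collections::HashMap; 
/// use url::Url; 
/// 
/// // Assumes the `ObjectStoreProvider` trait from the parent `store` module is in scope. 
/// let location = Url::parse("gs://my-bucket/raw/").unwrap(); 
/// let options = HashMap::from([( 
///     "google_application_credentials".to_string(), 
///     "/secrets/gcp-service-account.json".to_string(), 
/// )]); 
/// let _store = GcsProvider.create_store(&location, &options); 
/// ``` 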
15 | /// 16 | /// This provider supports: 17 | /// - `gs://` URLs for standard GCS access 18 | /// - `gcs://` URLs (alternative GCS scheme) 19 | /// 20 | /// ## Automatic Environment Variable Configuration 21 | /// 22 | /// The provider automatically reads GCP credentials and configuration from environment variables: 23 | /// - `GOOGLE_APPLICATION_CREDENTIALS` - Path to service account JSON file 24 | /// - `GOOGLE_SERVICE_ACCOUNT` - Service account email 25 | /// - `GOOGLE_SERVICE_ACCOUNT_KEY` - Service account private key 26 | /// 27 | /// ## Supported Configuration Options 28 | /// 29 | /// | Option | Description | Environment Variable | 30 | /// |----------------------------------|------------------------------|----------------------------------| 31 | /// | `google_service_account` | Path to service account JSON | `GOOGLE_APPLICATION_CREDENTIALS` | 32 | /// | `google_service_account_key` | Service account private key | - | 33 | /// | `google_application_credentials` | Application credentials | `GOOGLE_APPLICATION_CREDENTIALS` | 34 | pub struct GcsProvider; 35 | 36 | impl ObjectStoreProvider for GcsProvider { 37 | fn supports_scheme(&self, scheme: &str) -> bool { 38 | matches!(scheme, "gs" | "gcs") 39 | } 40 | 41 | fn create_store( 42 | &self, 43 | location: &Url, 44 | options: &HashMap, 45 | ) -> Result> { 46 | let mut builder = GoogleCloudStorageBuilder::from_env(); 47 | 48 | if let Some(bucket) = location.host_str() { 49 | builder = builder.with_bucket_name(bucket); 50 | } 51 | 52 | for (key, value) in options { 53 | builder = match key.as_str() { 54 | "google_service_account" => builder.with_service_account_path(value), 55 | "google_service_account_key" => builder.with_service_account_key(value), 56 | "google_application_credentials" => builder.with_application_credentials(value), 57 | unknown => { 58 | warn!("Unknown object_store configuration key: {unknown}"); 59 | builder 60 | } 61 | }; 62 | } 63 | 64 | builder 65 | .build() 66 | .map(|store| Arc::new(store) as Arc) 67 | .map_err(|e| AqueductsError::storage("object_store", e.to_string())) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /aqueducts/core/src/store/s3.rs: -------------------------------------------------------------------------------- 1 | //! # AWS S3 Object Store Provider 2 | //! 3 | //! This module provides an S3 implementation of the `ObjectStoreProvider` trait 4 | //! using the `object_store` crate's AWS S3 backend. 5 | 6 | use super::ObjectStoreProvider; 7 | use crate::error::{AqueductsError, Result}; 8 | use object_store::aws::AmazonS3Builder; 9 | use std::{collections::HashMap, sync::Arc}; 10 | use tracing::warn; 11 | use url::Url; 12 | 13 | /// Provider for Amazon S3 and S3-compatible storage. 
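/// 
/// For illustration, a minimal sketch of pointing this provider at an S3-compatible endpoint (all values are placeholders; the option keys come from the override table further below): 
/// 
/// ```rust,ignore 
/// use std::collections::HashMap; 
/// use url::Url; 
/// 
/// // Assumes the `ObjectStoreProvider` trait from the parent `store` module is in scope. 
/// let location = Url::parse("s3://my-bucket/landing/").unwrap(); 
/// let options = HashMap::from([ 
///     ("aws_endpoint".to_string(), "http://localhost:9000".to_string()), 
///     ("aws_allow_http".to_string(), "true".to_string()), 
///     ("aws_access_key_id".to_string(), "minio-access-key".to_string()), 
///     ("aws_secret_access_key".to_string(), "minio-secret-key".to_string()), 
/// ]); 
/// let _store = S3Provider.create_store(&location, &options); 
/// ``` 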
14 | /// 15 | /// This provider supports: 16 | /// - `s3://` URLs for standard S3 access 17 | /// - `s3a://` URLs (Hadoop-style S3 access) 18 | /// 19 | /// ## Automatic Environment Variable Configuration 20 | /// 21 | /// The provider automatically reads AWS credentials and configuration from environment variables: 22 | /// - `AWS_ACCESS_KEY_ID` - AWS access key 23 | /// - `AWS_SECRET_ACCESS_KEY` - AWS secret key 24 | /// - `AWS_REGION` - AWS region (e.g., "us-west-2") 25 | /// - `AWS_ENDPOINT` - Custom S3 endpoint (for S3-compatible services) 26 | /// - `AWS_SESSION_TOKEN` - Session token for temporary credentials 27 | /// - `AWS_PROFILE` - AWS profile name 28 | /// - `AWS_ALLOW_HTTP` - Allow HTTP connections (set to "true") 29 | /// 30 | /// ## Supported Configuration Override Options 31 | /// 32 | /// All options can be provided with or without the `aws_` prefix: 33 | /// 34 | /// | Option | Description | Environment Variable | 35 | /// |------------------------------------|-----------------------------------|-------------------------| 36 | /// | `aws_access_key_id` | AWS access key ID | `AWS_ACCESS_KEY_ID` | 37 | /// | `aws_secret_access_key` | AWS secret access key | `AWS_SECRET_ACCESS_KEY` | 38 | /// | `aws_region` | AWS region | `AWS_REGION` | 39 | /// | `aws_endpoint` | Custom S3 endpoint | `AWS_ENDPOINT` | 40 | /// | `aws_session_token` | AWS session token | `AWS_SESSION_TOKEN` | 41 | /// | `aws_allow_http` | Allow HTTP connections | `AWS_ALLOW_HTTP` | 42 | /// | `aws_virtual_hosted_style_request` | Use virtual hosted-style requests | - | 43 | /// | `aws_checksum_algorithm` | Checksum algorithm for uploads | - | 44 | /// | `aws_s3_express` | Enable S3 Express One Zone | - | 45 | /// | `aws_unsigned_payload` | Use unsigned payload | - | 46 | /// | `aws_skip_signature` | Skip request signing | - | 47 | /// | `aws_imdsv1_fallback` | Enable IMDSv1 fallback | - | 48 | pub struct S3Provider; 49 | 50 | impl ObjectStoreProvider for S3Provider { 51 | fn supports_scheme(&self, scheme: &str) -> bool { 52 | matches!(scheme, "s3" | "s3a") 53 | } 54 | 55 | fn create_store( 56 | &self, 57 | location: &Url, 58 | options: &HashMap, 59 | ) -> Result> { 60 | let mut builder = AmazonS3Builder::from_env(); 61 | 62 | if let Some(bucket) = location.host_str() { 63 | builder = builder.with_bucket_name(bucket); 64 | } 65 | 66 | for (key, value) in options { 67 | builder = match key.as_str() { 68 | "aws_access_key_id" | "access_key_id" => builder.with_access_key_id(value), 69 | "aws_secret_access_key" | "secret_access_key" => { 70 | builder.with_secret_access_key(value) 71 | } 72 | "aws_region" | "region" => builder.with_region(value), 73 | "aws_endpoint" | "endpoint" => builder.with_endpoint(value), 74 | "aws_session_token" | "session_token" => builder.with_token(value), 75 | "aws_allow_http" => builder.with_allow_http(value.parse::().unwrap_or(false)), 76 | "aws_virtual_hosted_style_request" => builder 77 | .with_virtual_hosted_style_request(value.parse::().unwrap_or(false)), 78 | "aws_checksum_algorithm" => { 79 | if let Ok(checksum) = value.parse() { 80 | builder.with_checksum_algorithm(checksum) 81 | } else { 82 | builder 83 | } 84 | } 85 | "aws_s3_express" | "s3_express" => { 86 | builder.with_s3_express(value.parse::().unwrap_or(false)) 87 | } 88 | "aws_unsigned_payload" => { 89 | builder.with_unsigned_payload(value.parse::().unwrap_or(false)) 90 | } 91 | "aws_skip_signature" => { 92 | builder.with_skip_signature(value.parse::().unwrap_or(false)) 93 | } 94 | "aws_imdsv1_fallback" => { 95 | if 
value.parse::().unwrap_or(false) { 96 | builder.with_imdsv1_fallback() 97 | } else { 98 | builder 99 | } 100 | } 101 | unknown => { 102 | warn!("Unknown object_store configuration key: {unknown}"); 103 | builder 104 | } 105 | }; 106 | } 107 | 108 | builder 109 | .build() 110 | .map(|store| Arc::new(store) as Arc) 111 | .map_err(|e| AqueductsError::storage("object_store", e.to_string())) 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /aqueducts/core/tests/integration.rs: -------------------------------------------------------------------------------- 1 | //! Integration tests for aqueducts core pipeline functionality. 2 | //! 3 | //! These tests focus on end-to-end pipeline execution and test data helpers to verify core functionality. 4 | 5 | mod common; 6 | 7 | use aqueducts_core::run_pipeline; 8 | use aqueducts_schemas::*; 9 | use common::*; 10 | use datafusion::prelude::*; 11 | use std::sync::Arc; 12 | 13 | #[tokio::test] 14 | async fn test_csv_source_to_memory_destination() { 15 | let dataset = TestDataSet::new().unwrap(); 16 | 17 | let pipeline = Aqueduct::builder() 18 | .sources(vec![Source::File( 19 | FileSource::builder() 20 | .name("test_data".to_string()) 21 | .format(sources::FileType::Csv(CsvSourceOptions::default())) 22 | .location(dataset.csv_url.clone().into()) 23 | .build(), 24 | )]) 25 | .stages(vec![vec![Stage::builder() 26 | .name("transform".to_string()) 27 | .query("SELECT id, name, value * 2 as doubled_value, active FROM test_data".to_string()) 28 | .build()]]) 29 | .destination(Destination::InMemory( 30 | InMemoryDestination::builder() 31 | .name("result".to_string()) 32 | .build(), 33 | )) 34 | .build(); 35 | 36 | let ctx = Arc::new(SessionContext::new()); 37 | let result_ctx = run_pipeline(ctx, pipeline, None).await.unwrap(); 38 | 39 | let table = result_ctx.table("result").await.unwrap(); 40 | let batches = table.collect().await.unwrap(); 41 | 42 | assert_eq!(batches.len(), 1); 43 | assert_eq!(batches[0].num_rows(), dataset.expected_rows()); 44 | 45 | // Verify data transformation worked 46 | let doubled_values = batches[0] 47 | .column_by_name("doubled_value") 48 | .expect("doubled_value column should exist"); 49 | 50 | // Should have doubled the original values 51 | assert!(!doubled_values.is_empty()); 52 | } 53 | 54 | #[tokio::test] 55 | async fn test_parquet_source_to_csv_destination() { 56 | let dataset = TestDataSet::new().unwrap(); 57 | let output_url = dataset.get_output_url("csv_out", "result.csv"); 58 | 59 | let pipeline = Aqueduct::builder() 60 | .sources(vec![Source::File( 61 | FileSource::builder() 62 | .name("parquet_data".to_string()) 63 | .format(sources::FileType::Parquet(ParquetSourceOptions::default())) 64 | .location(dataset.parquet_url.clone().into()) 65 | .build(), 66 | )]) 67 | .stages(vec![vec![Stage::builder() 68 | .name("filter_active".to_string()) 69 | .query("SELECT * FROM parquet_data WHERE active = true".to_string()) 70 | .build()]]) 71 | .destination(Destination::File( 72 | FileDestination::builder() 73 | .name("csv_output".to_string()) 74 | .format(destinations::FileType::Csv(CsvDestinationOptions::default())) 75 | .location(output_url.clone().into()) 76 | .build(), 77 | )) 78 | .build(); 79 | 80 | let ctx = Arc::new(SessionContext::new()); 81 | run_pipeline(ctx, pipeline, None).await.unwrap(); 82 | 83 | let output_path = output_url.to_file_path().unwrap(); 84 | assert!(output_path.exists()); 85 | 86 | let content = std::fs::read_to_string(&output_path).unwrap(); 87 | 
assert!(content.contains("id,name,value,active")); 88 | assert!(content.contains("true")); // Should only have active=true records 89 | assert!(!content.contains("false")); // Should not have active=false records 90 | } 91 | 92 | #[tokio::test] 93 | async fn test_pipeline_without_destination() { 94 | let dataset = TestDataSet::new().unwrap(); 95 | 96 | let pipeline = Aqueduct::builder() 97 | .sources(vec![Source::File( 98 | FileSource::builder() 99 | .name("test_source".to_string()) 100 | .format(sources::FileType::Csv(CsvSourceOptions::default())) 101 | .location(dataset.csv_url.clone().into()) 102 | .build(), 103 | )]) 104 | .stages(vec![vec![Stage::builder() 105 | .name("final_stage".to_string()) 106 | .query("SELECT * FROM test_source ORDER BY id".to_string()) 107 | .build()]]) 108 | .build(); 109 | 110 | let ctx = Arc::new(SessionContext::new()); 111 | let result_ctx = run_pipeline(ctx, pipeline, None).await.unwrap(); 112 | 113 | // The final stage should be available as a table 114 | let table = result_ctx.table("final_stage").await.unwrap(); 115 | let batches = table.collect().await.unwrap(); 116 | 117 | assert_eq!(batches[0].num_rows(), dataset.expected_rows()); 118 | } 119 | -------------------------------------------------------------------------------- /aqueducts/delta/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "aqueducts-delta" 3 | authors.workspace = true 4 | edition.workspace = true 5 | description.workspace = true 6 | homepage.workspace = true 7 | repository.workspace = true 8 | readme.workspace = true 9 | version.workspace = true 10 | keywords.workspace = true 11 | categories.workspace = true 12 | license.workspace = true 13 | 14 | [dependencies] 15 | deltalake.workspace = true 16 | aqueducts-schemas.workspace = true 17 | datafusion.workspace = true 18 | thiserror.workspace = true 19 | tracing.workspace = true 20 | 21 | [features] 22 | default = [] 23 | s3 = ["deltalake/s3"] 24 | gcs = ["deltalake/gcs"] 25 | azure = ["deltalake/azure"] 26 | 27 | [dev-dependencies] 28 | tokio = { workspace = true, features = ["full"] } 29 | rand.workspace = true 30 | tracing-test.workspace = true 31 | serde_json.workspace = true 32 | url.workspace = true 33 | tempfile = "3" 34 | -------------------------------------------------------------------------------- /aqueducts/delta/src/error.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | /// Error types for Delta Lake operations. 4 | #[derive(Error, Debug)] 5 | pub enum DeltaError { 6 | /// Delta table operation failed. 7 | #[error("Delta table operation failed: {0}")] 8 | DeltaTable(#[from] deltalake::DeltaTableError), 9 | 10 | /// DataFusion error occurred. 11 | #[error("DataFusion error: {0}")] 12 | DataFusion(#[from] datafusion::error::DataFusionError), 13 | } 14 | -------------------------------------------------------------------------------- /aqueducts/delta/src/handlers.rs: -------------------------------------------------------------------------------- 1 | //! Delta Lake object store handler registration. 2 | //! 3 | //! This module provides functionality to register Delta Lake object store factories 4 | //! for cloud providers. These handlers are required for Delta Lake to work with 5 | //! cloud storage services like S3, GCS, and Azure Blob Storage. 
6 | 7 | use std::sync::Once; 8 | 9 | static INIT: Once = Once::new(); 10 | 11 | /// Register Delta Lake object store handlers for enabled cloud providers. 12 | /// 13 | /// This function must be called before using Delta Lake with cloud storage to ensure 14 | /// the proper object store factories are registered. It will register handlers for 15 | /// all cloud providers that are enabled via feature flags. 16 | /// 17 | /// The registration is performed exactly once, even if this function is called multiple times. 18 | pub fn register_handlers() { 19 | INIT.call_once(|| { 20 | tracing::debug!("Registering Delta Lake object store handlers"); 21 | 22 | #[cfg(feature = "s3")] 23 | { 24 | tracing::debug!("Registering Delta Lake S3 handlers"); 25 | deltalake::aws::register_handlers(None); 26 | } 27 | 28 | #[cfg(feature = "gcs")] 29 | { 30 | tracing::debug!("Registering Delta Lake GCS handlers"); 31 | deltalake::gcp::register_handlers(None); 32 | } 33 | 34 | #[cfg(feature = "azure")] 35 | { 36 | tracing::debug!("Registering Delta Lake Azure handlers"); 37 | deltalake::azure::register_handlers(None); 38 | } 39 | 40 | tracing::debug!("Delta Lake handlers registration complete"); 41 | }); 42 | } 43 | 44 | #[cfg(test)] 45 | mod tests { 46 | use super::*; 47 | 48 | #[test] 49 | fn test_register_handlers() { 50 | // This test just ensures the function can be called without panicking 51 | register_handlers(); 52 | 53 | // Call it again to test idempotency 54 | register_handlers(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /aqueducts/delta/tests/data/aqueduct_pipeline_delta_append.yml: -------------------------------------------------------------------------------- 1 | version: "v2" 2 | 3 | sources: 4 | - type: file 5 | name: some_table 6 | format: 7 | type: Csv 8 | options: 9 | has_header: true 10 | delimiter: "," 11 | location: ${local_path}/tests/data/example_1.csv 12 | 13 | - type: file 14 | name: another_table 15 | format: 16 | type: Csv 17 | options: 18 | has_header: true 19 | delimiter: "," 20 | location: ${local_path}/tests/data/example_2.csv 21 | 22 | stages: 23 | - - name: aggregate 24 | query: > 25 | SELECT date, country, SUM(a) as sum_1, SUM(b) as sum_2 26 | FROM some_table 27 | GROUP BY 1, 2 28 | 29 | - name: average 30 | query: > 31 | SELECT date, country, AVG(x) as avg_1, AVG(y) as avg_2 32 | FROM another_table 33 | GROUP BY 1, 2 34 | 35 | - - name: join 36 | query: > 37 | SELECT 38 | COALESCE(agg.date, avg.date) as date, 39 | COALESCE(agg.country, avg.country) as country, 40 | sum_1, 41 | sum_2, 42 | avg_1, 43 | avg_2 44 | FROM aggregate agg 45 | JOIN average avg ON agg.date = avg.date AND agg.country = avg.country 46 | WHERE COALESCE(agg.date, avg.date) = '${date}' 47 | 48 | destination: 49 | type: delta 50 | name: example_output 51 | location: ${local_path}/tests/output/test_delta_append/${run_id} 52 | storage_config: {} 53 | table_properties: {} 54 | 55 | write_mode: 56 | operation: append 57 | 58 | partition_columns: 59 | - date 60 | 61 | schema: 62 | - name: date 63 | data_type: date32 64 | nullable: true 65 | metadata: {} 66 | - name: country 67 | data_type: utf8 68 | nullable: true 69 | metadata: {} 70 | - name: sum_1 71 | data_type: int64 72 | nullable: true 73 | metadata: {} 74 | - name: sum_2 75 | data_type: float64 76 | nullable: true 77 | metadata: {} 78 | - name: avg_1 79 | data_type: float64 80 | nullable: true 81 | metadata: {} 82 | - name: avg_2 83 | data_type: float64 84 | nullable: true 85 | metadata: {} 
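Putting the pieces above together: before a pipeline like `aqueduct_pipeline_delta_append.yml` can write to a Delta table on cloud storage, `register_handlers()` has to run once. The sketch below drives that pipeline programmatically; the `${...}` substitution is done by hand purely for illustration (the real templating lives in `aqueducts-core`), and `aqueducts_delta::register_handlers` is assumed to be re-exported at the crate root as the module layout suggests.

```rust
// Rough driver sketch: parameter substitution is inlined here for illustration
// only; the crate performs `${...}` templating itself.
use std::sync::Arc;

use aqueducts_core::run_pipeline;
use aqueducts_schemas::Aqueduct;
use datafusion::prelude::SessionContext;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Required before Delta destinations on S3/GCS/Azure can be resolved
    // (assumed re-export path, based on the handlers module above).
    aqueducts_delta::register_handlers();

    let yaml = std::fs::read_to_string("tests/data/aqueduct_pipeline_delta_append.yml")?
        .replace("${local_path}", env!("CARGO_MANIFEST_DIR"))
        .replace("${run_id}", "local-run-1")
        .replace("${date}", "2024-01-01");

    let pipeline: Aqueduct = serde_yml::from_str(&yaml)?;

    let ctx = Arc::new(SessionContext::new());
    run_pipeline(ctx, pipeline, None).await?;

    Ok(())
}
```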
-------------------------------------------------------------------------------- /aqueducts/delta/tests/data/aqueduct_pipeline_delta_replace.yml: -------------------------------------------------------------------------------- 1 | version: "v2" 2 | 3 | sources: 4 | - type: file 5 | name: some_table 6 | format: 7 | type: Csv 8 | options: 9 | has_header: true 10 | delimiter: "," 11 | location: ${local_path}/tests/data/example_1.csv 12 | 13 | - type: file 14 | name: another_table 15 | format: 16 | type: Csv 17 | options: 18 | has_header: true 19 | delimiter: "," 20 | location: ${local_path}/tests/data/example_2.csv 21 | 22 | stages: 23 | - - name: aggregate 24 | query: > 25 | SELECT date, country, SUM(a) as sum_1, SUM(b) as sum_2 26 | FROM some_table 27 | GROUP BY 1, 2 28 | 29 | - name: average 30 | query: > 31 | SELECT date, country, AVG(x) as avg_1, AVG(y) as avg_2 32 | FROM another_table 33 | GROUP BY 1, 2 34 | 35 | - - name: join 36 | query: > 37 | SELECT 38 | COALESCE(agg.date, avg.date) as date, 39 | COALESCE(agg.country, avg.country) as country, 40 | sum_1, 41 | sum_2, 42 | avg_1, 43 | avg_2 44 | FROM aggregate agg 45 | JOIN average avg ON agg.date = avg.date AND agg.country = avg.country 46 | WHERE COALESCE(agg.date, avg.date) = '${date}' 47 | 48 | destination: 49 | type: delta 50 | name: example_output 51 | location: ${local_path}/tests/output/test_delta_replace/${run_id} 52 | storage_config: {} 53 | table_properties: {} 54 | 55 | write_mode: 56 | operation: replace 57 | params: 58 | - column: date 59 | value: ${date} 60 | 61 | partition_columns: 62 | - date 63 | 64 | schema: 65 | - name: date 66 | data_type: date32 67 | nullable: true 68 | metadata: {} 69 | - name: country 70 | data_type: utf8 71 | nullable: true 72 | metadata: {} 73 | - name: sum_1 74 | data_type: int64 75 | nullable: true 76 | metadata: {} 77 | - name: sum_2 78 | data_type: float64 79 | nullable: true 80 | metadata: {} 81 | - name: avg_1 82 | data_type: float64 83 | nullable: true 84 | metadata: {} 85 | - name: avg_2 86 | data_type: float64 87 | nullable: true 88 | metadata: {} -------------------------------------------------------------------------------- /aqueducts/delta/tests/data/aqueduct_pipeline_delta_upsert.yml: -------------------------------------------------------------------------------- 1 | version: "v2" 2 | 3 | sources: 4 | - type: file 5 | name: some_table 6 | format: 7 | type: Csv 8 | options: 9 | has_header: true 10 | delimiter: "," 11 | location: ${local_path}/tests/data/example_1.csv 12 | 13 | - type: file 14 | name: another_table 15 | format: 16 | type: Csv 17 | options: 18 | has_header: true 19 | delimiter: "," 20 | location: ${local_path}/tests/data/example_2.csv 21 | 22 | stages: 23 | - - name: aggregate 24 | query: > 25 | SELECT date, country, SUM(a) as sum_1, SUM(b) as sum_2 26 | FROM some_table 27 | GROUP BY 1, 2 28 | 29 | - name: average 30 | query: > 31 | SELECT date, country, AVG(x) as avg_1, AVG(y) as avg_2 32 | FROM another_table 33 | GROUP BY 1, 2 34 | 35 | - - name: join 36 | query: > 37 | SELECT 38 | COALESCE(agg.date, avg.date) as date, 39 | COALESCE(agg.country, avg.country) as country, 40 | sum_1, 41 | sum_2, 42 | avg_1, 43 | avg_2 44 | FROM aggregate agg 45 | JOIN average avg ON agg.date = avg.date AND agg.country = avg.country 46 | WHERE COALESCE(agg.date, avg.date) = '${date}' 47 | 48 | destination: 49 | type: delta 50 | name: example_output 51 | location: ${local_path}/tests/output/test_delta_upsert/${run_id} 52 | storage_config: {} 53 | table_properties: {} 54 | 55 | 
write_mode: 56 | operation: upsert 57 | params: 58 | - date 59 | - country 60 | 61 | partition_columns: 62 | - date 63 | 64 | schema: 65 | - name: date 66 | data_type: date32 67 | nullable: true 68 | metadata: {} 69 | - name: country 70 | data_type: utf8 71 | nullable: true 72 | metadata: {} 73 | - name: sum_1 74 | data_type: int64 75 | nullable: true 76 | metadata: {} 77 | - name: sum_2 78 | data_type: float64 79 | nullable: true 80 | metadata: {} 81 | - name: avg_1 82 | data_type: float64 83 | nullable: true 84 | metadata: {} 85 | - name: avg_2 86 | data_type: float64 87 | nullable: true 88 | metadata: {} -------------------------------------------------------------------------------- /aqueducts/meta/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "aqueducts" 3 | authors.workspace = true 4 | edition.workspace = true 5 | description = "Unified meta-crate for the Aqueducts data pipeline framework" 6 | repository.workspace = true 7 | readme.workspace = true 8 | version.workspace = true 9 | homepage.workspace = true 10 | keywords.workspace = true 11 | categories.workspace = true 12 | license.workspace = true 13 | 14 | [features] 15 | default = ["yaml", "s3", "gcs", "azure"] 16 | 17 | # Format support features 18 | json = ["aqueducts-core/json"] 19 | toml = ["aqueducts-core/toml"] 20 | yaml = ["aqueducts-core/yaml"] 21 | 22 | # Cloud storage provider features 23 | s3 = ["aqueducts-core/s3"] 24 | gcs = ["aqueducts-core/gcs"] 25 | azure = ["aqueducts-core/azure"] 26 | 27 | # Database connectivity features 28 | odbc = ["aqueducts-core/odbc", "aqueducts-odbc"] 29 | delta = ["aqueducts-core/delta", "aqueducts-delta"] 30 | 31 | # Schema generation features 32 | schema_gen = ["aqueducts-schemas/schema_gen"] 33 | 34 | # Protocol features for executor/CLI integration 35 | protocol = ["aqueducts-schemas/protocol"] 36 | 37 | # Custom udfs to extend the SQL syntax 38 | custom_udfs = ["aqueducts-core/custom_udfs"] 39 | 40 | [dependencies] 41 | # Core aqueducts functionality 42 | aqueducts-core.workspace = true 43 | aqueducts-schemas.workspace = true 44 | 45 | # Optional database-specific crates 46 | aqueducts-odbc = { workspace = true, optional = true } 47 | aqueducts-delta = { workspace = true, optional = true } 48 | 49 | # Re-export common dependencies that users might need 50 | datafusion.workspace = true 51 | tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } 52 | tracing.workspace = true 53 | 54 | [dev-dependencies] 55 | tokio = { workspace = true, features = ["full"] } 56 | tracing-test.workspace = true 57 | -------------------------------------------------------------------------------- /aqueducts/odbc/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "aqueducts-odbc" 3 | authors.workspace = true 4 | edition.workspace = true 5 | description.workspace = true 6 | repository.workspace = true 7 | readme.workspace = true 8 | version.workspace = true 9 | homepage.workspace = true 10 | keywords.workspace = true 11 | categories.workspace = true 12 | license.workspace = true 13 | 14 | [features] 15 | default = [] 16 | odbc_tests = [] 17 | 18 | [dependencies] 19 | aqueducts-schemas.workspace = true 20 | 21 | datafusion.workspace = true 22 | arrow-odbc.workspace = true 23 | 24 | serde.workspace = true 25 | thiserror.workspace = true 26 | tracing.workspace = true 27 | 28 | [dev-dependencies] 29 | tokio = { workspace = true, features = ["full"] } 30 | 
tracing-test.workspace = true 31 | serde_json.workspace = true 32 | -------------------------------------------------------------------------------- /aqueducts/odbc/src/error.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | /// Error types for ODBC operations with security-conscious error messages. 4 | /// 5 | /// IMPORTANT: This type never includes connection strings or other sensitive 6 | /// information in error messages to prevent password leakage. 7 | #[derive(Error, Debug)] 8 | pub enum OdbcError { 9 | /// ODBC connection failed (no sensitive details exposed). 10 | #[error("ODBC connection failed to data source")] 11 | ConnectionFailed, 12 | 13 | /// ODBC query execution failed. 14 | #[error("ODBC query execution failed: {message}")] 15 | QueryFailed { message: String }, 16 | 17 | /// ODBC write operation failed. 18 | #[error("ODBC write operation failed: {message}")] 19 | WriteFailed { message: String }, 20 | 21 | /// ODBC driver or environment setup error. 22 | #[error("ODBC driver error: {message}")] 23 | DriverError { message: String }, 24 | 25 | /// Arrow error occurred. 26 | #[error("Arrow error: {0}")] 27 | Arrow(#[from] datafusion::arrow::error::ArrowError), 28 | 29 | /// DataFusion error occurred. 30 | #[error("DataFusion error: {0}")] 31 | DataFusion(#[from] datafusion::error::DataFusionError), 32 | } 33 | 34 | impl OdbcError { 35 | /// Create a connection failed error. 36 | pub fn connection_failed() -> Self { 37 | Self::ConnectionFailed 38 | } 39 | 40 | /// Create a query failed error. 41 | pub fn query_failed(message: impl Into) -> Self { 42 | Self::QueryFailed { 43 | message: message.into(), 44 | } 45 | } 46 | 47 | /// Create a write failed error. 48 | pub fn write_failed(message: impl Into) -> Self { 49 | Self::WriteFailed { 50 | message: message.into(), 51 | } 52 | } 53 | 54 | /// Create a driver error. 55 | pub fn driver_error(message: impl Into) -> Self { 56 | Self::DriverError { 57 | message: message.into(), 58 | } 59 | } 60 | } 61 | 62 | // External error mappings with security considerations 63 | impl From for OdbcError { 64 | fn from(err: arrow_odbc::Error) -> Self { 65 | // Don't expose details that might contain sensitive information 66 | let err_str = err.to_string().to_lowercase(); 67 | if err_str.contains("connection") 68 | || err_str.contains("login") 69 | || err_str.contains("authentication") 70 | { 71 | Self::ConnectionFailed 72 | } else { 73 | Self::DriverError { 74 | message: "ODBC operation failed".to_string(), 75 | } 76 | } 77 | } 78 | } 79 | 80 | impl From for OdbcError { 81 | fn from(err: arrow_odbc::odbc_api::Error) -> Self { 82 | // Check if this is a connection-related error without exposing details 83 | let err_str = err.to_string().to_lowercase(); 84 | if err_str.contains("connection") 85 | || err_str.contains("login") 86 | || err_str.contains("authentication") 87 | { 88 | Self::ConnectionFailed 89 | } else { 90 | Self::DriverError { 91 | message: "ODBC API error".to_string(), 92 | } 93 | } 94 | } 95 | } 96 | 97 | impl From for OdbcError { 98 | fn from(_err: arrow_odbc::WriterError) -> Self { 99 | Self::WriteFailed { 100 | message: "ODBC write operation failed".to_string(), 101 | } 102 | } 103 | } 104 | 105 | /// Convenience result type for ODBC operations. 
106 | pub type Result = std::result::Result; 107 | -------------------------------------------------------------------------------- /aqueducts/schemas/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "aqueducts-schemas" 3 | authors.workspace = true 4 | edition.workspace = true 5 | description.workspace = true 6 | repository.workspace = true 7 | readme.workspace = true 8 | version.workspace = true 9 | homepage.workspace = true 10 | keywords.workspace = true 11 | categories.workspace = true 12 | license.workspace = true 13 | 14 | [features] 15 | default = [] 16 | schema_gen = ["schemars"] 17 | protocol = ["uuid"] 18 | 19 | [dependencies] 20 | serde.workspace = true 21 | bon.workspace = true 22 | url.workspace = true 23 | chrono.workspace = true 24 | 25 | # Optional dependencies for features 26 | schemars = { workspace = true, optional = true } 27 | uuid = { workspace = true, optional = true } 28 | serde_json = { workspace = true, optional = true } 29 | 30 | [dev-dependencies] 31 | serde_json.workspace = true 32 | serde_yml.workspace = true 33 | -------------------------------------------------------------------------------- /aqueducts/schemas/src/generate_schema.rs: -------------------------------------------------------------------------------- 1 | //! Binary to generate JSON schema for the Aqueduct types 2 | //! 3 | //! This binary can be run with: cargo run --bin generate_schema --features schema_gen 4 | 5 | use aqueducts_schemas::Aqueduct; 6 | use schemars::schema_for; 7 | use std::fs::File; 8 | use std::io::Write; 9 | use std::path::PathBuf; 10 | 11 | fn main() -> Result<(), Box> { 12 | // Generate the JSON schema 13 | let schema = schema_for!(Aqueduct); 14 | 15 | // Serialize to pretty JSON 16 | let schema_json = serde_json::to_string_pretty(&schema)?; 17 | 18 | // Write to the json_schema directory in the project root 19 | let output_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) 20 | .parent() 21 | .unwrap() 22 | .parent() 23 | .unwrap() 24 | .join("json_schema") 25 | .join("aqueducts.schema.json"); 26 | 27 | // Ensure the output directory exists 28 | if let Some(parent) = output_path.parent() { 29 | std::fs::create_dir_all(parent)?; 30 | } 31 | 32 | // Write the schema file 33 | let mut file = File::create(&output_path)?; 34 | file.write_all(schema_json.as_bytes())?; 35 | 36 | println!("Generated JSON schema at: {}", output_path.display()); 37 | 38 | Ok(()) 39 | } 40 | -------------------------------------------------------------------------------- /aqueducts/schemas/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! # Aqueducts Schemas 2 | //! 3 | //! This crate contains all the schema definitions and configuration types used 4 | //! throughout the aqueducts ecosystem. By centralizing these types here, we avoid 5 | //! circular dependencies between core, delta, ODBC, and other provider crates. 
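Circling back briefly to the ODBC error type shown above: its constructors and the `Result` alias (whose generic parameters were lost in this dump, but which is presumably `Result<T, OdbcError>`) are meant to be matched on without ever exposing connection strings. A small sketch, assuming the `error` module is public as the crate layout suggests:

```rust
// Sketch: surfacing ODBC failures without leaking DSNs, hosts or credentials,
// using only the constructors defined on OdbcError above.
// The module path `aqueducts_odbc::error` is an assumption.
use aqueducts_odbc::error::{OdbcError, Result};

fn explain(err: &OdbcError) -> String {
    match err {
        // Deliberately generic: no connection-string details are available here.
        OdbcError::ConnectionFailed => "could not connect to the data source".to_string(),
        OdbcError::QueryFailed { message } => format!("query failed: {message}"),
        other => other.to_string(),
    }
}

fn load_rows() -> Result<usize> {
    // Placeholder for a real arrow-odbc read; fails the same way the crate would.
    Err(OdbcError::query_failed("relation \"missing_table\" does not exist"))
}
```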
6 | 7 | use bon::Builder; 8 | use serde::{Deserialize, Serialize}; 9 | 10 | pub mod data_types; 11 | pub mod destinations; 12 | pub mod location; 13 | pub mod progress; 14 | pub mod sources; 15 | pub mod stages; 16 | 17 | mod serde_helpers; 18 | 19 | #[cfg(feature = "protocol")] 20 | pub mod protocol; 21 | 22 | // Re-export the main types for convenience 23 | pub use data_types::{DataType, Field, IntervalUnit, TimeUnit, UnionMode}; 24 | pub use destinations::{ 25 | CsvDestinationOptions, CsvDestinationOptionsBuilder, DeltaWriteMode, 26 | FileType as DestinationFileType, ReplaceCondition, 27 | }; 28 | pub use destinations::{ 29 | DeltaDestination, Destination, FileDestination, InMemoryDestination, OdbcDestination, 30 | }; 31 | pub use destinations::{ 32 | DeltaDestinationBuilder, FileDestinationBuilder, InMemoryDestinationBuilder, 33 | OdbcDestinationBuilder, 34 | }; 35 | pub use location::Location; 36 | pub use sources::{ 37 | CsvSourceOptions, FileType as SourceFileType, JsonSourceOptions, ParquetSourceOptions, 38 | }; 39 | pub use sources::{CsvSourceOptionsBuilder, JsonSourceOptionsBuilder, ParquetSourceOptionsBuilder}; 40 | pub use sources::{DeltaSource, DirSource, FileSource, InMemorySource, OdbcSource, Source}; 41 | pub use sources::{ 42 | DeltaSourceBuilder, DirSourceBuilder, FileSourceBuilder, InMemorySourceBuilder, 43 | OdbcSourceBuilder, 44 | }; 45 | 46 | pub use progress::{OutputType, ProgressEvent}; 47 | pub use stages::{Stage, StageBuilder}; 48 | 49 | #[cfg(feature = "protocol")] 50 | pub use protocol::*; 51 | 52 | fn current_version() -> String { 53 | "v2".to_string() 54 | } 55 | 56 | /// Definition for an `Aqueduct` data pipeline. 57 | /// 58 | /// An aqueduct defines a complete data processing pipeline with sources, transformation stages, 59 | /// and an optional destination. Most configuration uses sensible defaults to minimize verbosity. 
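Because the struct derives `Deserialize` with snake_case fields and a defaulted `version`, the same pipeline can equally be declared in YAML or JSON and parsed directly; here is a minimal sketch before the builder example below (`serde_yml` is only a dev-dependency here, so this is illustrative):

```rust
// Sketch: parsing a minimal pipeline from YAML. `version` is omitted on purpose
// to show the "v2" default; `destination` is optional and stays None.
use aqueducts_schemas::Aqueduct;

fn main() {
    let yaml = r#"
sources:
  - type: file
    name: sales
    format:
      type: Csv
      options:
        has_header: true
    location: ./sales.csv
stages:
  - - name: totals
      query: "SELECT region, SUM(amount) AS total FROM sales GROUP BY region"
"#;

    let pipeline: Aqueduct = serde_yml::from_str(yaml).expect("valid pipeline definition");
    assert_eq!(pipeline.version, "v2"); // filled in by the `current_version` default
    assert!(pipeline.destination.is_none());
}
```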
60 | /// 61 | /// # Examples 62 | /// 63 | /// ``` 64 | /// use aqueducts_schemas::{Aqueduct, Source, FileSource, SourceFileType, CsvSourceOptions, Stage}; 65 | /// 66 | /// // Complete pipeline with defaults - version defaults to "v2" 67 | /// let pipeline = Aqueduct::builder() 68 | /// .sources(vec![ 69 | /// Source::File( 70 | /// FileSource::builder() 71 | /// .name("sales".to_string()) 72 | /// .format(SourceFileType::Csv(CsvSourceOptions::default())) 73 | /// .location("./sales.csv".try_into().unwrap()) 74 | /// .build() 75 | /// ) 76 | /// ]) 77 | /// .stages(vec![vec![ 78 | /// Stage::builder() 79 | /// .name("totals".to_string()) 80 | /// .query("SELECT region, SUM(amount) as total FROM sales GROUP BY region".to_string()) 81 | /// .build() 82 | /// ]]) 83 | /// .build(); 84 | /// ``` 85 | #[derive(Debug, Clone, Serialize, Deserialize, Builder)] 86 | #[cfg_attr(feature = "schema_gen", derive(schemars::JsonSchema))] 87 | #[serde(rename_all = "snake_case")] 88 | pub struct Aqueduct { 89 | /// Schema version for migration compatibility 90 | #[serde(default = "current_version")] 91 | #[builder(default = current_version())] 92 | pub version: String, 93 | 94 | /// Definition of the data sources for this pipeline 95 | pub sources: Vec, 96 | 97 | /// A sequential list of transformations to execute within the context of this pipeline 98 | /// Nested stages are executed in parallel 99 | pub stages: Vec>, 100 | 101 | /// Destination for the final step of the `Aqueduct` 102 | /// takes the last stage as input for the write operation 103 | pub destination: Option, 104 | } 105 | -------------------------------------------------------------------------------- /aqueducts/schemas/src/location.rs: -------------------------------------------------------------------------------- 1 | //! Location type that handles both file paths and URLs 2 | 3 | use serde::{Deserialize, Deserializer, Serialize}; 4 | use std::path::Path; 5 | use url::Url; 6 | 7 | /// A location that can be either a file path or a URL 8 | /// 9 | /// This type automatically converts file paths to file:// URLs during deserialization 10 | /// 11 | /// # Examples 12 | /// 13 | /// ``` 14 | /// use aqueducts_schemas::Location; 15 | /// 16 | /// // From URL string 17 | /// let url_location: Location = "https://example.com/data.csv".try_into().unwrap(); 18 | /// 19 | /// // From absolute file path 20 | /// let file_location: Location = "/tmp/data.csv".try_into().unwrap(); 21 | /// 22 | /// // From relative file path 23 | /// let rel_location: Location = "./data.csv".try_into().unwrap(); 24 | /// ``` 25 | #[derive(Debug, Clone, PartialEq, Serialize)] 26 | #[cfg_attr(feature = "schema_gen", derive(schemars::JsonSchema))] 27 | #[cfg_attr( 28 | feature = "schema_gen", 29 | schemars( 30 | with = "String", 31 | description = "A file path or URL. File paths will be converted to file:// URLs. 
Examples: '/tmp/data.csv', './data.csv', 'https://example.com/data.csv', 's3://bucket/data.csv'" 32 | ) 33 | )] 34 | pub struct Location(pub Url); 35 | 36 | impl TryFrom<&str> for Location { 37 | type Error = String; 38 | 39 | fn try_from(s: &str) -> Result { 40 | // Try as URL first - if it has a scheme, it should parse as URL 41 | if let Ok(url) = Url::parse(s) { 42 | return Ok(Location(url)); 43 | } 44 | 45 | // Try as file path 46 | let path = Path::new(s); 47 | let url = if path.is_absolute() { 48 | Url::from_file_path(path) 49 | } else { 50 | // For relative paths, resolve against current directory 51 | let current_dir = std::env::current_dir() 52 | .map_err(|e| format!("Cannot get current directory: {}", e))?; 53 | Url::from_file_path(current_dir.join(path)) 54 | } 55 | .map_err(|_| format!("Invalid path: {}", s))?; 56 | 57 | Ok(Location(url)) 58 | } 59 | } 60 | 61 | impl TryFrom for Location { 62 | type Error = String; 63 | 64 | fn try_from(s: String) -> Result { 65 | Location::try_from(s.as_str()) 66 | } 67 | } 68 | 69 | impl From for Location { 70 | fn from(url: Url) -> Self { 71 | Location(url) 72 | } 73 | } 74 | 75 | impl<'de> Deserialize<'de> for Location { 76 | fn deserialize(deserializer: D) -> Result 77 | where 78 | D: Deserializer<'de>, 79 | { 80 | let s = String::deserialize(deserializer)?; 81 | Location::try_from(s.as_str()).map_err(serde::de::Error::custom) 82 | } 83 | } 84 | 85 | impl std::fmt::Display for Location { 86 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 87 | write!(f, "{}", self.0) 88 | } 89 | } 90 | 91 | impl AsRef for Location { 92 | fn as_ref(&self) -> &Url { 93 | &self.0 94 | } 95 | } 96 | 97 | impl std::ops::Deref for Location { 98 | type Target = Url; 99 | 100 | fn deref(&self) -> &Self::Target { 101 | &self.0 102 | } 103 | } 104 | 105 | #[cfg(test)] 106 | mod tests { 107 | use super::*; 108 | 109 | #[test] 110 | fn test_url_parsing() { 111 | let location: Location = "https://example.com/data.csv".try_into().unwrap(); 112 | assert_eq!(location.scheme(), "https"); 113 | assert_eq!(location.host_str(), Some("example.com")); 114 | } 115 | 116 | #[test] 117 | fn test_absolute_file_path() { 118 | let location: Location = "/tmp/data.csv".try_into().unwrap(); 119 | assert_eq!(location.scheme(), "file"); 120 | assert!(location.path().ends_with("/tmp/data.csv")); 121 | } 122 | 123 | #[test] 124 | fn test_relative_file_path() { 125 | let location: Location = "./data.csv".try_into().unwrap(); 126 | assert_eq!(location.scheme(), "file"); 127 | assert!(location.path().ends_with("/data.csv")); 128 | } 129 | 130 | #[test] 131 | fn test_windows_path() { 132 | if cfg!(windows) { 133 | let location: Location = r"C:\temp\data.csv".try_into().unwrap(); 134 | assert_eq!(location.scheme(), "file"); 135 | } 136 | } 137 | 138 | #[test] 139 | fn test_s3_url() { 140 | let location: Location = "s3://my-bucket/data.csv".try_into().unwrap(); 141 | assert_eq!(location.scheme(), "s3"); 142 | assert_eq!(location.host_str(), Some("my-bucket")); 143 | assert_eq!(location.path(), "/data.csv"); 144 | } 145 | 146 | #[test] 147 | fn test_serialization() { 148 | let location: Location = "https://example.com/data.csv".try_into().unwrap(); 149 | let json = serde_json::to_string(&location).unwrap(); 150 | assert_eq!(json, r#""https://example.com/data.csv""#); 151 | } 152 | 153 | #[test] 154 | fn test_deserialization() { 155 | let json = r#""./data.csv""#; 156 | let location: Location = serde_json::from_str(json).unwrap(); 157 | assert_eq!(location.scheme(), "file"); 
158 | assert!(location.path().ends_with("/data.csv")); 159 | } 160 | 161 | #[test] 162 | fn test_location_in_config() { 163 | use serde_json; 164 | 165 | #[derive(serde::Serialize, serde::Deserialize, Debug, PartialEq)] 166 | struct Config { 167 | name: String, 168 | location: Location, 169 | } 170 | 171 | let config = Config { 172 | name: "test".to_string(), 173 | location: "s3://my-bucket/data".try_into().unwrap(), 174 | }; 175 | 176 | let json = serde_json::to_string(&config).unwrap(); 177 | let parsed: Config = serde_json::from_str(&json).unwrap(); 178 | 179 | assert_eq!(config.name, parsed.name); 180 | assert_eq!(config.location.as_ref(), parsed.location.as_ref()); 181 | } 182 | 183 | #[test] 184 | fn test_mixed_location_types() { 185 | let locations = vec![ 186 | ("./local.csv", "file"), 187 | ("/absolute/path.json", "file"), 188 | ("https://example.com/data.csv", "https"), 189 | ("s3://bucket/key.parquet", "s3"), 190 | ("gs://bucket/object", "gs"), 191 | ("azure://container/blob", "azure"), 192 | ]; 193 | 194 | for (input, expected_scheme) in locations { 195 | let location = Location::try_from(input).unwrap(); 196 | assert_eq!( 197 | location.scheme(), 198 | expected_scheme, 199 | "Failed for input: {}", 200 | input 201 | ); 202 | } 203 | } 204 | 205 | #[test] 206 | fn test_yaml_deserialization() { 207 | let yaml = r#" 208 | location: "./data/input.csv" 209 | "#; 210 | 211 | #[derive(serde::Deserialize)] 212 | struct Config { 213 | location: Location, 214 | } 215 | 216 | let config: Config = serde_yml::from_str(yaml).unwrap(); 217 | assert_eq!(config.location.scheme(), "file"); 218 | assert!(config.location.path().ends_with("/data/input.csv")); 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /aqueducts/schemas/src/progress.rs: -------------------------------------------------------------------------------- 1 | //! 
Progress event types for tracking pipeline execution 2 | 3 | use serde::{Deserialize, Serialize}; 4 | 5 | /// Progress events emitted during pipeline execution 6 | #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] 7 | #[serde(tag = "type", rename_all = "snake_case")] 8 | pub enum ProgressEvent { 9 | /// Pipeline execution started 10 | Started, 11 | /// A source has been registered 12 | SourceRegistered { 13 | /// Name of the source 14 | name: String, 15 | }, 16 | /// A stage has started processing 17 | StageStarted { 18 | /// Name of the stage 19 | name: String, 20 | /// Position in the stages array (outer) 21 | position: usize, 22 | /// Position in the parallel stages array (inner) 23 | sub_position: usize, 24 | }, 25 | /// A stage has completed processing 26 | StageCompleted { 27 | /// Name of the stage 28 | name: String, 29 | /// Position in the stages array (outer) 30 | position: usize, 31 | /// Position in the parallel stages array (inner) 32 | sub_position: usize, 33 | /// Duration of the stage execution 34 | duration_ms: u64, 35 | }, 36 | /// Data has been written to the destination 37 | DestinationCompleted, 38 | /// Pipeline execution completed 39 | Completed { 40 | /// Total duration of the pipeline execution 41 | duration_ms: u64, 42 | }, 43 | } 44 | 45 | /// Stage output types for websocket communication 46 | #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] 47 | #[serde(tag = "type", rename_all = "snake_case")] 48 | pub enum OutputType { 49 | /// Stage outputs the full dataframe 50 | Show, 51 | /// Stage outputs up to `usize` records 52 | ShowLimit, 53 | /// Stage outputs query plan 54 | Explain, 55 | /// Stage outputs query plan with execution metrics 56 | ExplainAnalyze, 57 | /// Stage outputs the dataframe schema 58 | PrintSchema, 59 | } 60 | -------------------------------------------------------------------------------- /aqueducts/schemas/src/protocol.rs: -------------------------------------------------------------------------------- 1 | //! Protocol types for websocket communication between client and executor. 
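These events are what the executor relays to clients through the websocket protocol defined next (see `ExecutorMessage::ProgressUpdate` below). A minimal consumer might translate them into log lines like this; the sketch relies only on the enum variants above, not on the progress-tracking machinery in `aqueducts-core`, which may look different:

```rust
// Sketch: turning ProgressEvent values into human-readable log lines.
use aqueducts_schemas::ProgressEvent;

fn describe(event: &ProgressEvent) -> String {
    match event {
        ProgressEvent::Started => "pipeline started".to_string(),
        ProgressEvent::SourceRegistered { name } => format!("registered source '{name}'"),
        ProgressEvent::StageStarted { name, position, sub_position } => {
            format!("stage '{name}' started ({position}.{sub_position})")
        }
        ProgressEvent::StageCompleted { name, duration_ms, .. } => {
            format!("stage '{name}' finished in {duration_ms} ms")
        }
        ProgressEvent::DestinationCompleted => "destination written".to_string(),
        ProgressEvent::Completed { duration_ms } => {
            format!("pipeline completed in {duration_ms} ms")
        }
    }
}
```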
2 | 3 | use serde::{Deserialize, Serialize}; 4 | use uuid::Uuid; 5 | 6 | use crate::{Aqueduct, ProgressEvent}; 7 | 8 | /// Stage output sent down to clients 9 | #[derive(Debug, Clone, Serialize, Deserialize)] 10 | #[serde(tag = "type", rename_all = "snake_case")] 11 | pub enum StageOutputMessage { 12 | /// Stage output is being streamed to client 13 | OutputStart { 14 | output_header: String, 15 | }, 16 | /// Stage output content 17 | OutputChunk { 18 | /// Indicates the sequence of this chunk output 19 | sequence: usize, 20 | /// Output chunk body 21 | body: String, 22 | }, 23 | OutputEnd { 24 | output_footer: String, 25 | }, 26 | } 27 | 28 | /// Client websocket message 29 | #[derive(Debug, Clone, Serialize, Deserialize)] 30 | #[serde(tag = "type", rename_all = "snake_case")] 31 | #[allow(clippy::large_enum_variant)] 32 | pub enum ClientMessage { 33 | /// Execution requested by client 34 | ExecutionRequest { 35 | /// The aqueducts pipeline to be executed 36 | pipeline: Aqueduct, 37 | }, 38 | /// Execution cancellation requested by client 39 | CancelRequest { 40 | /// Execution id of the pipeline execution to cancel 41 | execution_id: Uuid, 42 | }, 43 | } 44 | 45 | /// Executor websocket message 46 | #[derive(Debug, Clone, Serialize, Deserialize)] 47 | #[serde(tag = "type", rename_all = "snake_case")] 48 | pub enum ExecutorMessage { 49 | /// Execution successfully queued 50 | ExecutionResponse { 51 | /// Execution id that identifies the queued execution 52 | execution_id: Uuid, 53 | }, 54 | /// Execution cancellation was successful 55 | CancelResponse { 56 | /// Execution id of the cancelled pipeline 57 | execution_id: Uuid, 58 | }, 59 | /// The queue position for the requested execution 60 | QueuePosition { 61 | /// Execution id of the queued pipeline 62 | execution_id: Uuid, 63 | /// Position of the requested execution in the queue 64 | position: usize, 65 | }, 66 | /// Progress update event emited by a running aqueducts pipeline 67 | ProgressUpdate { 68 | /// Execution id of the running pipeline 69 | execution_id: Uuid, 70 | /// Progress percentage (0-100) 71 | progress: u8, 72 | /// Progress event payload 73 | event: ProgressEvent, 74 | }, 75 | /// Stage output of a running pipeline 76 | StageOutput { 77 | /// Execution id of the running pipeline 78 | execution_id: Uuid, 79 | /// Stage name that is outputting 80 | stage_name: String, 81 | /// Stage output payload 82 | payload: StageOutputMessage, 83 | }, 84 | /// Pipeline execution completet successfully 85 | ExecutionSucceeded { 86 | /// Execution id of the pipeline 87 | execution_id: Uuid, 88 | }, 89 | ExecutionError { 90 | /// Execution id that produced error 91 | execution_id: Uuid, 92 | /// Error message 93 | message: String, 94 | }, 95 | } 96 | -------------------------------------------------------------------------------- /aqueducts/schemas/src/serde_helpers.rs: -------------------------------------------------------------------------------- 1 | //! Shared serde helper functions for deserialization and default values. 2 | //! 3 | //! 
This module consolidates common serde helpers used across the schema types 4 | 5 | use crate::data_types::DataType; 6 | use serde::{Deserialize, Deserializer}; 7 | use std::str::FromStr; 8 | 9 | // ============================================================================= 10 | // Default value functions 11 | // ============================================================================= 12 | 13 | /// Default value for boolean fields that should be true 14 | pub fn default_true() -> bool { 15 | true 16 | } 17 | 18 | /// Default comma delimiter for CSV files 19 | pub fn default_comma() -> char { 20 | ',' 21 | } 22 | 23 | /// Default batch size for ODBC operations 24 | pub fn default_batch_size() -> usize { 25 | 1000 26 | } 27 | 28 | // ============================================================================= 29 | // Custom deserializers 30 | // ============================================================================= 31 | 32 | /// Custom deserializer that handles string representations of DataType 33 | pub fn deserialize_data_type<'de, D>(deserializer: D) -> Result 34 | where 35 | D: Deserializer<'de>, 36 | { 37 | use serde::de::Error; 38 | 39 | // Deserialize as a string 40 | let s = String::deserialize(deserializer)?; 41 | DataType::from_str(&s).map_err(|e| D::Error::custom(format!("Invalid data type: {}", e))) 42 | } 43 | 44 | /// Custom deserializer for partition columns that handles both tuple and object formats 45 | pub fn deserialize_partition_columns<'de, D>( 46 | deserializer: D, 47 | ) -> Result, D::Error> 48 | where 49 | D: Deserializer<'de>, 50 | { 51 | use serde::de::Error; 52 | 53 | #[derive(Deserialize)] 54 | #[serde(untagged)] 55 | enum PartitionColumn { 56 | Tuple(String, String), // (name, type_string) 57 | Object { name: String, data_type: String }, // {name: "col", data_type: "int32"} 58 | } 59 | 60 | let columns: Vec = Vec::deserialize(deserializer)?; 61 | 62 | columns 63 | .into_iter() 64 | .map(|col| match col { 65 | PartitionColumn::Tuple(name, type_str) => { 66 | let data_type = DataType::from_str(&type_str).map_err(|e| { 67 | D::Error::custom(format!("Invalid data type in partition column: {}", e)) 68 | })?; 69 | Ok((name, data_type)) 70 | } 71 | PartitionColumn::Object { 72 | name, 73 | data_type: type_str, 74 | } => { 75 | let data_type = DataType::from_str(&type_str).map_err(|e| { 76 | D::Error::custom(format!("Invalid data type in partition column: {}", e)) 77 | })?; 78 | Ok((name, data_type)) 79 | } 80 | }) 81 | .collect() 82 | } 83 | -------------------------------------------------------------------------------- /aqueducts/schemas/src/stages.rs: -------------------------------------------------------------------------------- 1 | //! Stage configuration types and schemas. 2 | //! 3 | //! Stages define SQL transformations that are executed as part of an aqueducts pipeline. 4 | //! Each stage produces a named table that can be referenced by subsequent stages. 5 | 6 | use bon::Builder; 7 | use serde::{Deserialize, Serialize}; 8 | 9 | /// A processing stage in an aqueducts pipeline. 10 | /// 11 | /// Stages execute SQL queries against the available data sources and previous stage results. 12 | /// Each stage creates a named table that can be referenced by subsequent stages. 
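One note looping back to `deserialize_partition_columns` above: it accepts either a `[name, type]` tuple or a `{name, data_type}` object. The helper itself is private to this crate, so the self-contained sketch below reproduces the same untagged-enum trick to show both input shapes; `PartitionSpec` is an illustrative name, not part of the crate's API.

```rust
// Self-contained reproduction of the dual-format trick used by
// `deserialize_partition_columns` above.
use serde::Deserialize;

#[derive(Debug, Deserialize, PartialEq)]
#[serde(untagged)]
enum PartitionSpec {
    Tuple(String, String),
    Object { name: String, data_type: String },
}

fn main() {
    // Tuple form: `- [date, date32]`
    let from_tuple: Vec<PartitionSpec> = serde_yml::from_str("- [date, date32]").unwrap();
    // Object form: `- { name: date, data_type: date32 }`
    let from_object: Vec<PartitionSpec> =
        serde_yml::from_str("- { name: date, data_type: date32 }").unwrap();

    // Both shapes carry the same information.
    assert_eq!(
        from_tuple,
        vec![PartitionSpec::Tuple("date".into(), "date32".into())]
    );
    assert_eq!(
        from_object,
        vec![PartitionSpec::Object { name: "date".into(), data_type: "date32".into() }]
    );
}
```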
13 | /// 14 | /// # Examples 15 | /// 16 | /// ``` 17 | /// use aqueducts_schemas::Stage; 18 | /// 19 | /// // Basic stage - debug fields default to false, show defaults to None 20 | /// let stage = Stage::builder() 21 | /// .name("aggregated_sales".to_string()) 22 | /// .query("SELECT region, SUM(amount) as total FROM sales GROUP BY region".to_string()) 23 | /// .build(); 24 | /// 25 | /// // Stage with output shown 26 | /// let debug_stage = Stage::builder() 27 | /// .name("debug_query".to_string()) 28 | /// .query("SELECT * FROM source LIMIT 5".to_string()) 29 | /// .show(10) 30 | /// .build(); 31 | /// ``` 32 | #[derive(Debug, Clone, Serialize, Deserialize, Builder)] 33 | #[cfg_attr(feature = "schema_gen", derive(schemars::JsonSchema))] 34 | #[serde(rename_all = "snake_case")] 35 | pub struct Stage { 36 | /// Name of the stage, used as the table name for the result of this stage 37 | pub name: String, 38 | 39 | /// SQL query that is executed against a datafusion context. Check the datafusion SQL reference for more information 40 | pub query: String, 41 | 42 | /// When set to a value of up to `usize`, will print the result of this stage to the stdout limited by the number 43 | /// Set value to 0 to not limit the outputs 44 | #[serde(default)] 45 | pub show: Option, 46 | 47 | /// When set to 'true' the stage will output the query execution plan 48 | #[serde(default)] 49 | #[builder(default)] 50 | pub explain: bool, 51 | 52 | /// When set to 'true' the stage will output the query execution plan with added execution metrics 53 | #[serde(default)] 54 | #[builder(default)] 55 | pub explain_analyze: bool, 56 | 57 | /// When set to 'true' the stage will pretty print the output schema of the executed query 58 | #[serde(default)] 59 | #[builder(default)] 60 | pub print_schema: bool, 61 | } 62 | -------------------------------------------------------------------------------- /aqueducts/schemas/tests/integration.rs: -------------------------------------------------------------------------------- 1 | //! Integration tests for aqueducts schemas 2 | //! 3 | //! Tests backwards compatibility, serialization, and basic functionality. 
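The debug-oriented fields above (`show`, `explain`, `explain_analyze`, `print_schema`) all default to off, so a stage only emits extra output when asked. A small sketch of the YAML form, parsed with `serde_yml` (a dev-dependency of this crate) purely for illustration:

```rust
// Sketch: a stage configured for debugging. The keys are the snake_case serde
// names of the Stage fields defined above.
use aqueducts_schemas::Stage;

fn main() {
    let yaml = r#"
name: debug_totals
query: "SELECT region, SUM(amount) AS total FROM sales GROUP BY region"
show: 10
print_schema: true
"#;

    let stage: Stage = serde_yml::from_str(yaml).unwrap();
    assert_eq!(stage.show, Some(10));
    assert!(stage.print_schema);
    assert!(!stage.explain); // defaults stay off
}
```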
4 | 5 | use aqueducts_schemas::{Aqueduct, Field}; 6 | use std::fs; 7 | use std::path::Path; 8 | 9 | #[test] 10 | fn test_backwards_compatibility() { 11 | // Test that old field names are still supported via aliases 12 | let old_format_json = r#"{ 13 | "sources": [ 14 | { 15 | "type": "File", 16 | "name": "test_data", 17 | "file_type": { 18 | "type": "Csv", 19 | "options": { 20 | "has_header": true, 21 | "delimiter": "," 22 | } 23 | }, 24 | "location": "./data.csv", 25 | "storage_options": {} 26 | } 27 | ], 28 | "stages": [], 29 | "destination": { 30 | "type": "Delta", 31 | "name": "output", 32 | "location": "./output", 33 | "write_mode": { 34 | "operation": "Append" 35 | }, 36 | "storage_options": {}, 37 | "partition_cols": [], 38 | "table_properties": {}, 39 | "custom_metadata": {}, 40 | "schema": [] 41 | } 42 | }"#; 43 | 44 | let parsed: Aqueduct = serde_json::from_str(old_format_json).unwrap(); 45 | assert_eq!(parsed.sources.len(), 1); 46 | assert!(parsed.destination.is_some()); 47 | } 48 | 49 | #[test] 50 | fn test_field_defaults() { 51 | // Test field without nullable should get default (true) 52 | let field_json = r#"{ 53 | "name": "test_field", 54 | "type": "string" 55 | }"#; 56 | 57 | let field: Field = serde_json::from_str(field_json).unwrap(); 58 | assert!(field.nullable); // Should get default 59 | } 60 | 61 | #[test] 62 | fn test_version_default() { 63 | // Test that version gets default value when missing 64 | let config_json = r#"{ 65 | "sources": [], 66 | "stages": [] 67 | }"#; 68 | 69 | let parsed: Aqueduct = serde_json::from_str(config_json).unwrap(); 70 | assert_eq!(parsed.version, "v2"); 71 | } 72 | 73 | #[test] 74 | fn test_pipeline_serialization_roundtrip() { 75 | // Test complete pipeline roundtrip serialization 76 | let pipeline = Aqueduct { 77 | version: "v2".to_string(), 78 | sources: vec![], 79 | stages: vec![], 80 | destination: None, 81 | }; 82 | 83 | let json = serde_json::to_string(&pipeline).unwrap(); 84 | let parsed: Aqueduct = serde_json::from_str(&json).unwrap(); 85 | 86 | assert_eq!(pipeline.version, parsed.version); 87 | assert_eq!(pipeline.sources.len(), parsed.sources.len()); 88 | } 89 | 90 | #[test] 91 | fn test_example_pipeline_files() { 92 | // Test that all example pipeline files can be deserialized 93 | let examples_dir = Path::new(env!("CARGO_MANIFEST_DIR")) 94 | .parent() 95 | .unwrap() 96 | .parent() 97 | .unwrap() 98 | .join("examples"); 99 | 100 | if !examples_dir.exists() { 101 | return; // Skip if examples directory doesn't exist 102 | } 103 | 104 | for entry in fs::read_dir(examples_dir).unwrap() { 105 | let entry = entry.unwrap(); 106 | let path = entry.path(); 107 | let file_name = path.file_name().unwrap().to_str().unwrap(); 108 | 109 | // Skip non-pipeline files 110 | if !file_name.starts_with("aqueduct_pipeline") { 111 | continue; 112 | } 113 | 114 | let content = 115 | fs::read_to_string(&path).unwrap_or_else(|_| panic!("Failed to read file: {:?}", path)); 116 | 117 | // Test deserialization based on file extension 118 | let _pipeline: Aqueduct = 119 | if path.extension().unwrap() == "yml" || path.extension().unwrap() == "yaml" { 120 | serde_yml::from_str(&content) 121 | .unwrap_or_else(|e| panic!("Failed to parse YAML file {}: {}", file_name, e)) 122 | } else if path.extension().unwrap() == "json" { 123 | serde_json::from_str(&content) 124 | .unwrap_or_else(|e| panic!("Failed to parse JSON file {}: {}", file_name, e)) 125 | } else { 126 | continue; // Skip non-YAML/JSON files 127 | }; 128 | 129 | println!("Successfully parsed: {}", 
file_name); 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /cliff.toml: -------------------------------------------------------------------------------- 1 | # git-cliff ~ configuration file 2 | # https://git-cliff.org/docs/configuration 3 | 4 | [changelog] 5 | # template for the changelog footer 6 | header = """ 7 | # Changelog\n 8 | All notable changes to this project will be documented in this file. 9 | 10 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 11 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).\n 12 | """ 13 | # template for the changelog body 14 | # https://keats.github.io/tera/docs/#introduction 15 | body = """ 16 | {%- macro remote_url() -%} 17 | https://github.com/{{ remote.github.owner }}/{{ remote.github.repo }} 18 | {%- endmacro -%} 19 | 20 | {% if version -%} 21 | ## [{{ version | trim_start_matches(pat="v") }}] - {{ timestamp | date(format="%Y-%m-%d") }} 22 | {% else -%} 23 | ## [Unreleased] 24 | {% endif -%} 25 | 26 | ### Details\ 27 | 28 | {% for group, commits in commits | group_by(attribute="group") %} 29 | #### {{ group | upper_first }} 30 | {%- for commit in commits %} 31 | - {{ commit.message | upper_first | trim }}\ 32 | {% if commit.remote.username %} by @{{ commit.remote.username }}{%- endif -%} 33 | {% if commit.remote.pr_number %} in \ 34 | [#{{ commit.remote.pr_number }}]({{ self::remote_url() }}/pull/{{ commit.remote.pr_number }}) \ 35 | {%- endif -%} 36 | {% endfor %} 37 | {% endfor %} 38 | 39 | {%- if github.contributors | filter(attribute="is_first_time", value=true) | length != 0 %} 40 | ## New Contributors 41 | {%- endif -%} 42 | 43 | {% for contributor in github.contributors | filter(attribute="is_first_time", value=true) %} 44 | * @{{ contributor.username }} made their first contribution 45 | {%- if contributor.pr_number %} in \ 46 | [#{{ contributor.pr_number }}]({{ self::remote_url() }}/pull/{{ contributor.pr_number }}) \ 47 | {%- endif %} 48 | {%- endfor %}\n 49 | """ 50 | # template for the changelog footer 51 | footer = """ 52 | {%- macro remote_url() -%} 53 | https://github.com/{{ remote.github.owner }}/{{ remote.github.repo }} 54 | {%- endmacro -%} 55 | 56 | {% for release in releases -%} 57 | {% if release.version -%} 58 | {% if release.previous.version -%} 59 | [{{ release.version | trim_start_matches(pat="v") }}]: \ 60 | {{ self::remote_url() }}/compare/{{ release.previous.version }}..{{ release.version }} 61 | {% endif -%} 62 | {% else -%} 63 | [unreleased]: {{ self::remote_url() }}/compare/{{ release.previous.version }}..HEAD 64 | {% endif -%} 65 | {% endfor %} 66 | 67 | """ 68 | # remove the leading and trailing whitespace from the templates 69 | trim = true 70 | 71 | [git] 72 | # parse the commits based on https://www.conventionalcommits.org 73 | conventional_commits = true 74 | # filter out the commits that are not conventional 75 | filter_unconventional = false 76 | # process each line of a commit as an individual commit 77 | split_commits = false 78 | # regex for preprocessing the commit messages 79 | commit_preprocessors = [ 80 | # remove issue numbers from commits 81 | { pattern = '\((\w+\s)?#([0-9]+)\)', replace = "" }, 82 | ] 83 | # regex for parsing and grouping commits 84 | commit_parsers = [ 85 | { message = "^.*: add", group = "Added" }, 86 | { message = "^.*: support", group = "Added" }, 87 | { message = "^.*: remove", group = "Removed" }, 88 | { message = "^.*: delete", group = "Removed" }, 
89 | { message = "^test", group = "Fixed" }, 90 | { message = "^fix", group = "Fixed" }, 91 | { message = "^.*: fix", group = "Fixed" }, 92 | { message = "^.*", group = "Changed" }, 93 | ] 94 | # protect breaking changes from being skipped due to matching a skipping commit_parser 95 | protect_breaking_commits = false 96 | # filter out the commits that are not matched by commit parsers 97 | filter_commits = true 98 | # regex for matching git tags 99 | tag_pattern = "v[0-9].*" 100 | # regex for skipping tags 101 | skip_tags = "v0.1.0-beta.1" 102 | # regex for ignoring tags 103 | ignore_tags = "" 104 | # sort the tags topologically 105 | topo_order = false 106 | # sort the commits inside sections by oldest/newest order 107 | sort_commits = "oldest" 108 | -------------------------------------------------------------------------------- /db/init.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE temp_readings ( 2 | location_id INTEGER, 3 | timestamp TIMESTAMP, 4 | temperature_c FLOAT, 5 | humidity FLOAT, 6 | weather_condition VARCHAR(50) 7 | ); 8 | 9 | CREATE TABLE temp_readings_empty ( 10 | location_id INTEGER, 11 | timestamp TIMESTAMP, 12 | temperature_c FLOAT, 13 | humidity FLOAT, 14 | weather_condition VARCHAR(50) 15 | ); 16 | 17 | CREATE TABLE temp_readings_aggregated ( 18 | date DATE, 19 | location_id INTEGER, 20 | min_temp_c FLOAT, 21 | min_humidity FLOAT, 22 | max_temp_c FLOAT, 23 | max_humidity FLOAT, 24 | avg_temp_c FLOAT, 25 | avg_humidity FLOAT 26 | ); 27 | 28 | COPY temp_readings FROM '/opt/temp_readings_jan_2024.csv' DELIMITER ',' CSV HEADER; 29 | COPY temp_readings FROM '/opt/temp_readings_feb_2024.csv' DELIMITER ',' CSV HEADER; 30 | 31 | CREATE TABLE test_custom_delete_insert_ok ( 32 | id INTEGER, 33 | value VARCHAR(50) 34 | ); 35 | 36 | CREATE TABLE test_custom_delete_insert_failed ( 37 | id INTEGER, 38 | value VARCHAR(50) 39 | ); 40 | -------------------------------------------------------------------------------- /dist-workspace.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["cargo:."] 3 | 4 | # Config for 'dist' 5 | [dist] 6 | # The preferred dist version to use in CI (Cargo.toml SemVer syntax) 7 | cargo-dist-version = "0.28.3" 8 | # CI backends to support 9 | ci = "github" 10 | # The installers to generate for each app 11 | installers = ["shell", "homebrew"] 12 | # A GitHub repo to push Homebrew formulas to 13 | tap = "vigimite/homebrew-aqueducts" 14 | # Target platforms to build apps for (Rust target-triple syntax) 15 | targets = ["aarch64-apple-darwin", "aarch64-unknown-linux-gnu", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu", "x86_64-unknown-linux-musl"] 16 | # Path that installers should place binaries in 17 | install-path = "CARGO_HOME" 18 | # Publish jobs to run in CI 19 | publish-jobs = ["homebrew"] 20 | # Whether to install an updater program 21 | install-updater = false 22 | # Only build these specific binaries 23 | bins = ["aqueducts"] 24 | 25 | # Homebrew-specific configuration 26 | [dist.homebrew] 27 | # Override the formula name to match the crate name 28 | formula-name = "aqueducts-cli" 29 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | db: 3 | image: postgres:15 4 | restart: "no" 5 | environment: 6 | POSTGRES_USER: postgres 7 | POSTGRES_PASSWORD: postgres 8 | ports: 9 | - 5432:5432 
10 | volumes: 11 | - ./db/init.sql:/docker-entrypoint-initdb.d/init.sql 12 | - ./examples/temp_readings_jan_2024.csv:/opt/temp_readings_jan_2024.csv 13 | - ./examples/temp_readings_feb_2024.csv:/opt/temp_readings_feb_2024.csv 14 | 15 | aqueducts-executor: 16 | profiles: ["executor"] 17 | build: 18 | context: . 19 | dockerfile: docker/Dockerfile 20 | ports: 21 | - "3031:3031" 22 | environment: 23 | - RUST_LOG=info 24 | command: ["aqueducts-executor", "--host", "0.0.0.0", "--port", "3031", "--api-key", "test_secret_key"] 25 | restart: unless-stopped 26 | depends_on: 27 | - db 28 | healthcheck: 29 | test: ["CMD", "curl", "-f", "http://localhost:3031/api/health"] 30 | interval: 30s 31 | timeout: 10s 32 | retries: 3 33 | start_period: 40s 34 | -------------------------------------------------------------------------------- /docker/.dockerignore: -------------------------------------------------------------------------------- 1 | # Target directories 2 | target/ 3 | **/target/ 4 | 5 | # Git 6 | .git/ 7 | .gitignore 8 | 9 | # CI/CD 10 | .github/ 11 | 12 | # Documentation 13 | docs/ 14 | *.md 15 | 16 | # Examples and test data 17 | examples/ 18 | **/tests/output/ 19 | 20 | # IDE 21 | .vscode/ 22 | .idea/ 23 | 24 | # OS 25 | .DS_Store 26 | Thumbs.db 27 | 28 | # Logs 29 | *.log 30 | 31 | # Temporary files 32 | *.tmp 33 | *.temp -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rust:1.86-slim AS builder 2 | 3 | RUN apt-get update && apt-get install -y \ 4 | pkg-config \ 5 | libssl-dev \ 6 | unixodbc-dev \ 7 | perl \ 8 | make \ 9 | gcc \ 10 | && rm -rf /var/lib/apt/lists/* 11 | 12 | WORKDIR /app 13 | 14 | COPY Cargo.toml Cargo.lock ./ 15 | COPY aqueducts/ ./aqueducts/ 16 | COPY aqueducts-cli/ ./aqueducts-cli/ 17 | COPY aqueducts-executor/ ./aqueducts-executor/ 18 | 19 | RUN cargo build --release --features odbc -p aqueducts-executor 20 | 21 | FROM debian:bookworm-slim 22 | 23 | RUN apt-get update && apt-get install -y \ 24 | ca-certificates \ 25 | unixodbc \ 26 | odbc-postgresql \ 27 | curl \ 28 | && rm -rf /var/lib/apt/lists/* 29 | 30 | RUN useradd --create-home --shell /bin/bash aqueducts 31 | 32 | COPY --from=builder /app/target/release/aqueducts-executor /usr/local/bin/aqueducts-executor 33 | 34 | # Copy ODBC configuration files 35 | COPY docker/odbcinst.ini /etc/odbcinst.ini 36 | COPY docker/odbc.ini /etc/odbc.ini 37 | 38 | RUN chmod +x /usr/local/bin/aqueducts-executor 39 | 40 | USER aqueducts 41 | WORKDIR /home/aqueducts 42 | 43 | EXPOSE 3031 44 | 45 | HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \ 46 | CMD curl -f http://localhost:3031/api/health || exit 1 47 | 48 | CMD ["aqueducts-executor", "--host", "0.0.0.0", "--port", "3031"] 49 | -------------------------------------------------------------------------------- /docker/odbc.ini: -------------------------------------------------------------------------------- 1 | [postgres] 2 | Description=PostgreSQL connection 3 | Driver=PostgreSQL Unicode 4 | Server=postgres 5 | Port=5432 6 | Database=postgres 7 | Username=postgres 8 | Password= 9 | SSLMode=prefer 10 | 11 | [aqueducts_test] 12 | Description=Aqueducts test database 13 | Driver=PostgreSQL Unicode 14 | Server=db 15 | Port=5432 16 | Database=postgres 17 | Username=postgres 18 | Password=postgres 19 | SSLMode=prefer 20 | -------------------------------------------------------------------------------- /docker/odbcinst.ini: 
-------------------------------------------------------------------------------- 1 | [PostgreSQL ANSI] 2 | Description=PostgreSQL ODBC driver (ANSI version) 3 | Driver=psqlodbca.so 4 | Setup=libodbcpsqlS.so 5 | Debug=0 6 | CommLog=1 7 | UsageCount=1 8 | 9 | [PostgreSQL Unicode] 10 | Description=PostgreSQL ODBC driver (Unicode version) 11 | Driver=psqlodbcw.so 12 | Setup=libodbcpsqlS.so 13 | Debug=0 14 | CommLog=1 15 | UsageCount=1 16 | -------------------------------------------------------------------------------- /docs/about.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | TODO 4 | -------------------------------------------------------------------------------- /docs/architecture.md: -------------------------------------------------------------------------------- 1 | # Architecture 2 | 3 | An aqueduct is a pipeline definition and consists of 3 main parts 4 | 5 | - Source -> the source data for this pipeline 6 | - Stage -> transformations applied within this pipeline 7 | - Destination -> output of the pipeline result 8 | 9 | ## Source 10 | 11 | An Aqueduct source can be: 12 | 13 | - CSV or Parquet file(s) 14 | - single file 15 | - directory 16 | - Delta table 17 | - ODBC query (EXPERIMENTAL) 18 | 19 | For file based sources a schema can be provided optionally. 20 | 21 | The source is registered within the `SessionContext` as a table that can be referenced using the sources configured name. A prerequisite here is that the necessary features for the underlying object stores are enabled. 22 | This can be provided by an external `SessionContext` passed into the `run_pipeline` function or by registering the correct handlers for deltalake. 23 | 24 | **EXPERIMENTAL ODBC support** 25 | 26 | As an experimental feature it is possible to query various databases using ODBC. This is enabled through [arrow-odbc](https://crates.io/crates/arrow-odbc). 27 | Besides enabling the `odbc` feature flag in your `Cargo.toml` there are some other prerequisites for the executing system: 28 | 29 | - `unixodbc` on unix based systems 30 | - ODBC driver for the database you want to access like [ODBC Driver for SQL server](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server) or [psqlodbc](https://github.com/postgresql-interfaces/psqlodbc) 31 | - registering the driver in the ODBC manager configuration (usually located in `/etc/odbcinst.ini`) 32 | 33 | If you have issues setting this up there are many resources online explaining how to set this up, it is a bit of a hassle. 34 | 35 | ## Stage 36 | 37 | An Aqueduct stage defines a transformation using SQL. Each stage has access to all defined sources and to every previously executed stage within the SQL context using the respectively configured names. 38 | Once executed the stage will then persist its result into the SQL context making it accessible to downstream consumers. 39 | 40 | The stage can be set to print the result and/or the result schema to the `stdout`. This is useful for development/debugging purposes. 41 | 42 | Nested stages are executed in parallel 43 | 44 | ## Destination 45 | 46 | An Aqueduct destination can be: 47 | 48 | - CSV or Parquet file(s) 49 | - single file 50 | - directory 51 | - Delta table 52 | - ODBC query (NOT IMPLEMENTED YET) 53 | 54 | An Aqueduct destination is the target for the execution of the pipeline, the result of the final stage that was executed is used as the input for the destination to write the data to the underlying table/file. 
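A minimal programmatic sketch of this flow, mirroring the core integration tests: an in-memory destination keeps the example self-contained, and the table name `result` is whatever the destination's configured `name` is. The file and Delta specifics follow below.

```rust
// Sketch: the destination consumes the last stage's result. With an in-memory
// destination the written table can be read back from the returned context.
use std::sync::Arc;

use aqueducts_core::run_pipeline;
use aqueducts_schemas::Aqueduct;
use datafusion::prelude::SessionContext;

async fn run(pipeline: Aqueduct) -> datafusion::error::Result<()> {
    let ctx = Arc::new(SessionContext::new());

    // run_pipeline registers the sources, executes the stages, and finally hands
    // the last stage's result to the destination.
    let result_ctx = run_pipeline(ctx, pipeline, None)
        .await
        .expect("pipeline execution failed");

    // For an in-memory destination named "result", the output is queryable:
    let batches = result_ctx.table("result").await?.collect().await?;
    println!("wrote {} record batch(es)", batches.len());
    Ok(())
}
```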
55 | 56 | **File based destinations** 57 | 58 | File-based destinations support HDFS-style partitioning (`output/location=1/...`) and can be configured to output either a single file or multiple files. 59 | 60 | **Delta Table destination** 61 | 62 | For a Delta table, some additional logic is used to maintain table integrity. 63 | 64 | The destination will first cast and validate the schema of the input data and then use one of three configurable modes to write the data: 65 | 66 | - Append -> appends the data to the destination 67 | - Upsert -> merges the data into the destination, using the merge columns configured for this mode to determine which data should be updated 68 | - provided merge columns are used to check equality e.g. `vec!["date", "country"]` -> update data where `old.date = new.date AND old.country = new.country` 69 | - Replace -> replaces the data using a configurable predicate to determine which data should be replaced by the operation 70 | - provided replacement conditions are used to check equality e.g. `ReplacementCondition { column: "date", value: "1970-01-01" }` -> replace data where `old.date = '1970-01-01'` 71 | -------------------------------------------------------------------------------- /docs/assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vigimite/aqueducts/6eaf764852ac9348eb45c7073ecf126d61eb5505/docs/assets/favicon.ico -------------------------------------------------------------------------------- /docs/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vigimite/aqueducts/6eaf764852ac9348eb45c7073ecf126d61eb5505/docs/assets/logo.png -------------------------------------------------------------------------------- /docs/cli.md: -------------------------------------------------------------------------------- 1 | # Aqueducts CLI 2 | 3 | Example CLI application utilizing the Aqueducts framework to run ETL pipelines declared in YAML. 4 | 5 | ## Install 6 | 7 | ```bash 8 | # install with default features (s3, gcs, azure) 9 | cargo install aqueducts-cli 10 | 11 | # install with odbc support 12 | cargo install aqueducts-cli --features odbc 13 | 14 | # install with s3 support only 15 | cargo install aqueducts-cli --no-default-features --features s3 16 | ``` 17 | 18 | ## Run 19 | 20 | ```bash 21 | aqueducts --file ./example.yml --param key1=value1 --param key2=value2 22 | ``` 23 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Aqueducts 2 | 3 | This is the documentation for [Aqueducts](https://github.com/vigimite/aqueducts) 4 | 5 | [![Build status](https://github.com/vigimite/aqueducts/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/vigimite/aqueducts/actions/workflows/ci.yml) [![Crates.io](https://img.shields.io/crates/v/aqueducts)](https://crates.io/crates/aqueducts) [![Documentation](https://docs.rs/aqueducts/badge.svg)](https://docs.rs/aqueducts) 6 | 7 | 8 | 9 | Aqueducts is a framework to write and execute ETL data pipelines declaratively.
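A pipeline is declared as a set of sources, SQL stages, and a destination. A minimal sketch, based on the bundled examples (file locations, source/stage names, and the query are illustrative placeholders), looks like this:

```yaml
# Minimal pipeline sketch modeled on the examples shipped in this repository.
# Paths, names, and the query are illustrative, not a canonical configuration.
version: "v2"
sources:
  - type: file
    name: readings
    format:
      type: csv
      options: {}
    location: ./examples/temp_readings_jan_2024.csv

stages:
  - - name: aggregated
      query: >
        SELECT location_id, round(avg(temperature_c), 2) avg_temp_c
        FROM readings
        GROUP BY 1
      show: 10

destination:
  type: file
  name: results
  format:
    type: parquet
    options: {}
  location: ./examples/output.parquet
```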
10 | 11 | **Features:** 12 | 13 | - Define ETL pipelines in YAML 14 | - Extract data from CSV files, Parquet files or Delta tables 15 | - Process data using SQL 16 | - Load data into object stores as CSV/Parquet or Delta tables 17 | - Support for file and Delta table partitioning 18 | - Support for Upsert/Replace/Append operations on Delta tables 19 | - Support for Local, S3, GCS and Azure Blob storage 20 | - *EXPERIMENTAL* Support for ODBC Sources and Destinations 21 | 22 | This framework builds on the fantastic work done by projects such as: 23 | 24 | - [arrow-rs](https://github.com/apache/arrow-rs) 25 | - [datafusion](https://github.com/apache/datafusion) 26 | - [delta-rs](https://github.com/delta-io/delta-rs) 27 | 28 | Please show these projects some support :heart:! 29 | -------------------------------------------------------------------------------- /docs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | [data-md-color-scheme="default"] { 2 | --md-primary-fg-color: #111F2C; 3 | --md-primary-fg-color--light: #111F2C; 4 | --md-primary-fg-color--dark: #111F2C; 5 | --md-footer-bg-color--light: #111F2C; 6 | --md-footer-bg-color--dark: #111F2C; 7 | } 8 | 9 | [data-md-color-scheme="slate"] { 10 | --md-primary-fg-color: #3B444B; 11 | --md-primary-fg-color--light: #3B444B; 12 | --md-primary-fg-color--dark: #3B444B; 13 | --md-footer-bg-color--light: #3B444B; 14 | --md-footer-bg-color--dark: #3B444B; 15 | } -------------------------------------------------------------------------------- /examples/aqueduct_pipeline_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "sources": [ 3 | { 4 | "type": "File", 5 | "name": "some_table", 6 | "file_type": { 7 | "type": "Csv", 8 | "options": { 9 | "has_header": true, 10 | "delimiter": "," 11 | } 12 | }, 13 | "location": "${local_path}/examples/test_data/example_1.csv" 14 | }, 15 | { 16 | "type": "File", 17 | "name": "another_table", 18 | "file_type": { 19 | "type": "Csv", 20 | "options": { 21 | "has_header": true, 22 | "delimiter": "," 23 | } 24 | }, 25 | "location": "${local_path}/examples/test_data/example_2.csv" 26 | } 27 | ], 28 | "stages": [ 29 | [ 30 | { 31 | "name": "aggregate", 32 | "query": "SELECT date, country, SUM(a) as sum_1, SUM(b) as sum_2 FROM some_table GROUP BY 1, 2", 33 | "show": 20 34 | }, 35 | { 36 | "name": "average", 37 | "query": "SELECT date, country, AVG(x) as avg_1, AVG(y) as avg_2 FROM another_table GROUP BY 1, 2", 38 | "show": 0 39 | } 40 | ], 41 | [ 42 | { 43 | "name": "join", 44 | "query": "SELECT COALESCE(agg.date, avg.date) as date, COALESCE(agg.country, avg.country) as country, sum_1, sum_2, avg_1, avg_2 FROM aggregate agg JOIN average avg ON agg.date = avg.date AND agg.country = avg.country WHERE COALESCE(agg.date, avg.date) = '1970-01-01'" 45 | } 46 | ] 47 | ], 48 | "destination": { 49 | "type": "Delta", 50 | "name": "example_output", 51 | "location": "${local_path}/examples/output_delta_example/${run_id}", 52 | "storage_options": {}, 53 | "table_properties": {}, 54 | "write_mode": { 55 | "operation": "Replace", 56 | "params": [ 57 | { 58 | "column": "date", 59 | "value": "1970-01-01" 60 | } 61 | ] 62 | }, 63 | "partition_cols": [ 64 | "date" 65 | ], 66 | "schema": [ 67 | { 68 | "name": "date", 69 | "type": "date", 70 | "nullable": true, 71 | "metadata": {} 72 | }, 73 | { 74 | "name": "country", 75 | "type": "string", 76 | "nullable": true, 77 | "metadata": {} 78 | }, 79 | { 80 | "name": "sum_1", 81 | "type": 
"integer", 82 | "nullable": true, 83 | "metadata": {} 84 | }, 85 | { 86 | "name": "sum_2", 87 | "type": "double", 88 | "nullable": true, 89 | "metadata": {} 90 | }, 91 | { 92 | "name": "avg_1", 93 | "type": "double", 94 | "nullable": true, 95 | "metadata": {} 96 | }, 97 | { 98 | "name": "avg_2", 99 | "type": "double", 100 | "nullable": true, 101 | "metadata": {} 102 | } 103 | ] 104 | } 105 | } -------------------------------------------------------------------------------- /examples/aqueduct_pipeline_example.toml: -------------------------------------------------------------------------------- 1 | version = "v2" 2 | 3 | stages = [ 4 | [ 5 | { name = "jan_aggregated", query = """ 6 | SELECT 7 | cast(timestamp as date) date, 8 | location_id, 9 | round(min(temperature_c),2) min_temp_c, 10 | round(min(humidity),2) min_humidity, 11 | round(max(temperature_c),2) max_temp_c, 12 | round(max(humidity),2) max_humidity, 13 | round(avg(temperature_c),2) avg_temp_c, 14 | round(avg(humidity),2) avg_humidity 15 | FROM jan_data GROUP by 1,2 ORDER by 1 asc 16 | """, show = 20 }, 17 | { name = "feb_aggregated", query = """ 18 | SELECT 19 | cast(timestamp as date) date, 20 | location_id, 21 | round(min(temperature_c),2) min_temp_c, 22 | round(min(humidity),2) min_humidity, 23 | round(max(temperature_c),2) max_temp_c, 24 | round(max(humidity),2) max_humidity, 25 | round(avg(temperature_c),2) avg_temp_c, 26 | round(avg(humidity),2) avg_humidity 27 | FROM feb_data GROUP by 1,2 ORDER by 1 asc 28 | """, show = 0 }, 29 | ], 30 | [ 31 | { name = "union", query = "SELECT * FROM jan_aggregated UNION (SELECT * FROM feb_aggregated)", print_schema = true }, 32 | ], 33 | ] 34 | 35 | [[sources]] 36 | type = "file" 37 | name = "jan_data" 38 | location = "${local_path}/examples/temp_readings_jan_2024.csv" 39 | [sources.format] 40 | type = "csv" 41 | [sources.format.options] 42 | has_header = true 43 | delimiter = "," 44 | 45 | [[sources]] 46 | type = "file" 47 | name = "feb_data" 48 | location = "${local_path}/examples/temp_readings_feb_2024.csv" 49 | [sources.format] 50 | type = "csv" 51 | [sources.format.options] 52 | has_header = true 53 | delimiter = "," 54 | 55 | [destination] 56 | type = "delta" 57 | name = "example_output" 58 | location = "${local_path}/examples/output_delta_example/${run_id}" 59 | storage_config = {} 60 | table_properties = {} 61 | partition_columns = ["date"] 62 | 63 | [destination.write_mode] 64 | operation = "upsert" 65 | params = ["date"] 66 | 67 | [[destination.schema]] 68 | name = "date" 69 | data_type = "date32" 70 | nullable = true 71 | metadata = {} 72 | 73 | [[destination.schema]] 74 | name = "location_id" 75 | data_type = "int32" 76 | nullable = true 77 | metadata = {} 78 | 79 | [[destination.schema]] 80 | name = "min_temp_c" 81 | data_type = "float64" 82 | nullable = true 83 | metadata = {} 84 | 85 | [[destination.schema]] 86 | name = "min_humidity" 87 | data_type = "float64" 88 | nullable = true 89 | metadata = {} 90 | 91 | [[destination.schema]] 92 | name = "max_temp_c" 93 | data_type = "float64" 94 | nullable = true 95 | metadata = {} 96 | 97 | [[destination.schema]] 98 | name = "max_humidity" 99 | data_type = "float64" 100 | nullable = true 101 | metadata = {} 102 | 103 | [[destination.schema]] 104 | name = "avg_temp_c" 105 | data_type = "float64" 106 | nullable = true 107 | metadata = {} 108 | 109 | [[destination.schema]] 110 | name = "avg_humidity" 111 | data_type = "float64" 112 | nullable = true 113 | metadata = {} 
-------------------------------------------------------------------------------- /examples/aqueduct_pipeline_example.yml: -------------------------------------------------------------------------------- 1 | version: "v2" 2 | sources: 3 | - type: file 4 | name: jan_data 5 | format: 6 | type: csv 7 | options: 8 | has_header: true 9 | delimiter: "," 10 | location: ${local_path}/examples/temp_readings_jan_2024.csv 11 | 12 | - type: file 13 | name: feb_data 14 | format: 15 | type: csv 16 | options: 17 | has_header: true 18 | delimiter: "," 19 | location: ${local_path}/examples/temp_readings_feb_2024.csv 20 | 21 | stages: 22 | - - name: jan_aggregated 23 | query: > 24 | SELECT 25 | cast(timestamp as date) date, 26 | location_id, 27 | round(min(temperature_c),2) min_temp_c, 28 | round(min(humidity),2) min_humidity, 29 | round(max(temperature_c),2) max_temp_c, 30 | round(max(humidity),2) max_humidity, 31 | round(avg(temperature_c),2) avg_temp_c, 32 | round(avg(humidity),2) avg_humidity 33 | FROM jan_data 34 | GROUP by 1,2 35 | ORDER by 1 asc 36 | # print 20 rows of the result for this query to stdout 37 | show: 20 38 | 39 | - name: feb_aggregated 40 | query: > 41 | SELECT 42 | cast(timestamp as date) date, 43 | location_id, 44 | round(min(temperature_c),2) min_temp_c, 45 | round(min(humidity),2) min_humidity, 46 | round(max(temperature_c),2) max_temp_c, 47 | round(max(humidity),2) max_humidity, 48 | round(avg(temperature_c),2) avg_temp_c, 49 | round(avg(humidity),2) avg_humidity 50 | FROM feb_data 51 | GROUP by 1,2 52 | ORDER by 1 asc 53 | # print the entire result for this query to stdout 54 | show: 0 55 | 56 | - - name: union 57 | query: > 58 | SELECT * FROM jan_aggregated UNION (SELECT * FROM feb_aggregated) 59 | print_schema: true # print the resulting schema of this query to stdout 60 | 61 | destination: 62 | type: delta 63 | name: example_output 64 | location: ${local_path}/examples/output_delta_example/${run_id} 65 | storage_config: {} 66 | table_properties: {} 67 | 68 | # how to write this table 69 | # valid options are Append, Replace and Upsert 70 | write_mode: 71 | # upserts using the date as the "primary" key 72 | operation: upsert 73 | params: 74 | - date 75 | 76 | # columns by which to partition the table 77 | partition_columns: 78 | - date 79 | 80 | # table schema using de-serialization provided by `deltalake::kernel::StructField` 81 | schema: 82 | - name: date 83 | data_type: date32 84 | nullable: true 85 | metadata: {} 86 | - name: location_id 87 | data_type: int32 88 | nullable: true 89 | metadata: {} 90 | - name: min_temp_c 91 | data_type: float64 92 | nullable: true 93 | metadata: {} 94 | - name: min_humidity 95 | data_type: float64 96 | nullable: true 97 | metadata: {} 98 | - name: max_temp_c 99 | data_type: float64 100 | nullable: true 101 | metadata: {} 102 | - name: max_humidity 103 | data_type: float64 104 | nullable: true 105 | metadata: {} 106 | - name: avg_temp_c 107 | data_type: float64 108 | nullable: true 109 | metadata: {} 110 | - name: avg_humidity 111 | data_type: float64 112 | nullable: true 113 | metadata: {} -------------------------------------------------------------------------------- /examples/aqueduct_pipeline_odbc.yml: -------------------------------------------------------------------------------- 1 | version: "v2" 2 | sources: 3 | - type: odbc 4 | name: jan_data 5 | # connection_string: Driver={PostgreSQL Unicode};Server=db;UID=${user};PWD=${pass}; # <- For executor 6 | connection_string: Driver={PostgreSQL 
Unicode};Server=localhost;UID=${user};PWD=${pass}; 7 | query: SELECT * FROM temp_readings WHERE timestamp BETWEEN '2024-01-01' AND '2024-01-31' 8 | 9 | - type: odbc 10 | name: feb_data 11 | # connection_string: Driver={PostgreSQL Unicode};Server=db;UID=${user};PWD=${pass}; # <- For executor 12 | connection_string: Driver={PostgreSQL Unicode};Server=localhost;UID=${user};PWD=${pass}; 13 | query: SELECT * FROM temp_readings WHERE timestamp BETWEEN '2024-02-01' AND '2024-02-29' 14 | 15 | stages: 16 | - - name: jan_aggregated 17 | query: > 18 | SELECT 19 | cast(timestamp as date) date, 20 | location_id, 21 | round(min(temperature_c),2) min_temp_c, 22 | round(min(humidity),2) min_humidity, 23 | round(max(temperature_c),2) max_temp_c, 24 | round(max(humidity),2) max_humidity, 25 | round(avg(temperature_c),2) avg_temp_c, 26 | round(avg(humidity),2) avg_humidity 27 | FROM jan_data 28 | GROUP by 1,2 29 | ORDER by 1 asc 30 | # print 20 rows of the result for this query to stdout 31 | show: 20 32 | 33 | - name: feb_aggregated 34 | query: > 35 | SELECT 36 | cast(timestamp as date) date, 37 | location_id, 38 | round(min(temperature_c),2) min_temp_c, 39 | round(min(humidity),2) min_humidity, 40 | round(max(temperature_c),2) max_temp_c, 41 | round(max(humidity),2) max_humidity, 42 | round(avg(temperature_c),2) avg_temp_c, 43 | round(avg(humidity),2) avg_humidity 44 | FROM feb_data 45 | GROUP by 1,2 46 | ORDER by 1 asc 47 | # print the entire result for this query to stdout 48 | show: 0 49 | 50 | - - name: union 51 | query: > 52 | SELECT * FROM jan_aggregated UNION ALL SELECT * FROM feb_aggregated 53 | 54 | destination: 55 | type: odbc 56 | name: temp_readings_aggregated 57 | # connection_string: Driver={PostgreSQL Unicode};Server=db;UID=${user};PWD=${pass}; # <- For executor 58 | connection_string: Driver={PostgreSQL Unicode};Server=localhost;UID=${user};PWD=${pass}; 59 | write_mode: 60 | operation: append 61 | batch_size: 100 62 | -------------------------------------------------------------------------------- /examples/aqueduct_pipeline_simple.yml: -------------------------------------------------------------------------------- 1 | version: "v2" 2 | sources: 3 | # Register a local file source containing temperature readings for various cities 4 | - type: file 5 | name: temp_readings 6 | format: 7 | type: csv 8 | options: {} 9 | location: ./examples/temp_readings_${month}_${year}.csv # use templating functionality to parameterize the month and year 10 | 11 | #Register a local file source containing a mapping between location_ids and location names 12 | - type: file 13 | name: locations 14 | format: 15 | type: csv 16 | options: {} 17 | location: ./examples/location_dict.csv 18 | 19 | stages: 20 | # Query to aggregate temperature data by date and location 21 | - - name: aggregated 22 | query: > 23 | SELECT 24 | cast(timestamp as date) date, 25 | location_id, 26 | round(min(temperature_c),2) min_temp_c, 27 | round(min(humidity),2) min_humidity, 28 | round(max(temperature_c),2) max_temp_c, 29 | round(max(humidity),2) max_humidity, 30 | round(avg(temperature_c),2) avg_temp_c, 31 | round(avg(humidity),2) avg_humidity 32 | FROM temp_readings 33 | GROUP by 1,2 34 | ORDER by 1 asc 35 | explain: true # print the query plan to stdout for debugging purposes 36 | 37 | # Enrich aggregation with the location name 38 | - - name: enriched 39 | query: > 40 | SELECT 41 | date, 42 | location_name, 43 | min_temp_c, 44 | max_temp_c, 45 | avg_temp_c, 46 | min_humidity, 47 | max_humidity, 48 | avg_humidity 49 | FROM aggregated 
50 | JOIN locations 51 | ON aggregated.location_id = locations.location_id 52 | ORDER BY date, location_name 53 | show: 10 # print 10 rows to stdout for debugging purposes 54 | 55 | # Write the pipeline result to a parquet file 56 | destination: 57 | type: file 58 | name: results 59 | format: 60 | type: parquet 61 | options: {} 62 | location: ./examples/output_${month}_${year}.parquet -------------------------------------------------------------------------------- /examples/location_dict.csv: -------------------------------------------------------------------------------- 1 | location_id,location_name 2 | 1,"New York" 3 | 2,"Los Angeles" 4 | 3,"Chicago" 5 | 4,"Vienna" 6 | 5,"Prague" 7 | 6,"Berlin" 8 | 7,"Paris" 9 | 8,"London" 10 | -------------------------------------------------------------------------------- /json_schema/generate_schema_reference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import shutil 4 | from json_schema_for_humans.generate import generate_from_filename 5 | from json_schema_for_humans.generation_configuration import GenerationConfiguration 6 | 7 | 8 | OUTPUT_DIR = "json_schema" 9 | 10 | 11 | def find_latest_generated_json(target_dir='target', pattern='aqueducts.schema.json'): 12 | # Search for the JSON file in the target directory 13 | search_pattern = os.path.join(target_dir, 'debug', 'build', '**', pattern) 14 | files = glob.glob(search_pattern, recursive=True) 15 | 16 | if not files: 17 | raise FileNotFoundError(f"No files found matching pattern: {search_pattern}") 18 | 19 | # Find the most recently modified file 20 | latest_file = max(files, key=os.path.getmtime) 21 | return latest_file 22 | 23 | 24 | def on_startup(command, dirty): 25 | try: 26 | json_path = find_latest_generated_json() 27 | print(f"Found latest JSON schema at: {json_path}") 28 | 29 | # output file to destination directory 30 | output_path = shutil.copy(json_path, OUTPUT_DIR) 31 | 32 | gen_cfg = GenerationConfiguration( 33 | custom_template_path="json_schema/schema_reference_template/base.md", 34 | footer_show_time=False, 35 | description_is_markdown=True, 36 | link_to_reused_ref=False, 37 | show_breadcrumbs=False, 38 | show_toc=False, 39 | template_md_options={ 40 | "badge_as_image": True, 41 | "show_heading_numbers": False, 42 | "show_array_restrictions": False, 43 | "properties_table_columns": [ 44 | "Property", 45 | "Pattern", 46 | "Type", 47 | "Title/Description" 48 | ] 49 | } 50 | ) 51 | 52 | generate_from_filename(output_path, "docs/schema_reference.md", config=gen_cfg) 53 | 54 | except Exception as e: 55 | print(f"An error occurred: {e}") 56 | -------------------------------------------------------------------------------- /json_schema/schema_reference_template/base.md: -------------------------------------------------------------------------------- 1 | {% set depth = 0 %} 2 | {{ schema.keywords.get("title").literal | default("Schema Docs") | md_heading(depth) }} 3 | {% set contentBase %} 4 | {% with schema=schema, skip_headers=False, depth=depth %} 5 | {% include "content.md" %} 6 | {% endwith %} 7 | {% endset %} 8 | 9 | {{ md_get_toc() }} 10 | 11 | This is a generated JSONSchema reference for the Aqueducts configuration. 
12 | 13 | {{ contentBase }} 14 | 15 | ---------------------------------------------------------------------------------------------------------------------------- 16 | {% if config.with_footer -%} 17 | Generated using [json-schema-for-humans](https://github.com/coveooss/json-schema-for-humans){% if config.footer_show_time %} on {{ get_local_time() }}{% endif %} 18 | 19 | {% endif -%} 20 | -------------------------------------------------------------------------------- /json_schema/schema_reference_template/breadcrumbs.md: -------------------------------------------------------------------------------- 1 | {%- filter md_escape_for_table -%} 2 | {%- if config.show_breadcrumbs -%} 3 | {%- for node in schema.nodes_from_root -%} 4 | {{ node.name_for_breadcrumbs }}{%- if not loop.last %} > {% endif -%} 5 | {%- endfor -%} 6 | {%- else -%} 7 | Field: {{ schema.name_for_breadcrumbs }} 8 | {%- endif -%} 9 | {%- endfilter -%} 10 | -------------------------------------------------------------------------------- /json_schema/schema_reference_template/content.md: -------------------------------------------------------------------------------- 1 | {# 2 | content is a template and not a macro in md 3 | because macro parameters are not through context 4 | when rendering a template from the macro and it caused 5 | serious problems when using recursive calls 6 | mandatory context parameters: 7 | schema 8 | #} 9 | {# context parameters default values #} 10 | {% set skip_headers = skip_headers or False %} 11 | {% set depth = depth or 0 %} 12 | {# end context parameters #} 13 | 14 | {% set keys = schema.keywords %} 15 | {%- if not skip_headers %} 16 | 17 | {% if schema.title and schema.title | length > 0 %} 18 | **Title:** {{ schema.title }} 19 | {% endif %} 20 | 21 | {{ schema | md_type_info_table | md_generate_table }} 22 | 23 | {% set description = (schema | get_description) %} 24 | {% include "section_description.md" %} 25 | {% endif %} 26 | 27 | {# Display examples #} 28 | {% set examples = schema.examples %} 29 | {% if examples %} 30 | {% include "section_examples.md" %} 31 | {% endif %} 32 | 33 | {% if schema.should_be_a_link(config) %} 34 | {% elif schema.refers_to -%} 35 | {%- with schema=schema.refers_to_merged, skip_headers=True, depth=depth -%} 36 | {% include "content.md" %} 37 | {% endwith %} 38 | {% else %} 39 | {# Properties, pattern properties, additional properties #} 40 | {% if schema.is_object %} 41 | {{- schema | md_properties_table | md_generate_table -}} 42 | {% endif %} 43 | 44 | {# Combining: allOf, anyOf, oneOf, not #} 45 | {% if schema.kw_all_of %} 46 | {% with operator="allOf", title="All of(Requirement)", current_node=schema.kw_all_of, skip_required=True %} 47 | {% include "tabbed_section.md" %} 48 | {% endwith %} 49 | {% endif %} 50 | {% if schema.kw_any_of %} 51 | {% with operator="anyOf", title="Any of(Option)", current_node=schema.kw_any_of, skip_required=True %} 52 | {% include "tabbed_section.md" %} 53 | {% endwith %} 54 | {% endif %} 55 | {% if schema.kw_one_of %} 56 | {% with operator="oneOf", title="One of(Option)",current_node=schema.kw_one_of, skip_required=True %} 57 | {% include "tabbed_section.md" %} 58 | {% endwith %} 59 | {% endif %} 60 | {% if schema.kw_not %} 61 | {% include "section_not.md" %} 62 | {% endif %} 63 | 64 | {# Enum and const #} 65 | {% if schema.kw_enum -%} 66 | {% include "section_one_of.md" %} 67 | {%- endif %} 68 | {%- if schema.kw_const -%} 69 | Specific value: `{{ schema.kw_const.raw | python_to_json }}` 70 | {%- endif -%} 71 | 72 | {# 
Conditional subschema, or if-then-else section #} 73 | {% if schema.has_conditional %} 74 | {% with skip_headers=False, depth=depth+1 %} 75 | {% include "section_conditional_subschema.md" %} 76 | {% endwith %} 77 | {% endif %} 78 | 79 | {# Required properties that are not defined under "properties". They will only be listed #} 80 | {% include "section_undocumented_required_properties.md" %} 81 | 82 | {# Show the requested type(s) #} 83 | {{- schema | md_restrictions_table | md_generate_table -}} 84 | 85 | {# Show array restrictions #} 86 | {% if schema.type_name.startswith("array") %} 87 | {% include "section_array.md" %} 88 | {% endif %} 89 | 90 | {# details of Properties, pattern properties, additional properties #} 91 | {% if schema.is_object %} 92 | {% include "section_properties_details.md" %} 93 | {% endif %} 94 | {% endif %} 95 | -------------------------------------------------------------------------------- /json_schema/schema_reference_template/section_array.md: -------------------------------------------------------------------------------- 1 | {{ schema | md_array_restrictions | md_generate_table }} 2 | 3 | {% if schema.array_items_def or schema.tuple_validation_items %} 4 | {{ schema | md_array_items_restrictions | md_generate_table }} 5 | {% endif %} 6 | 7 | {% if schema.array_items_def %} 8 | {% filter md_heading(depth+1) %} 9 | {% with schema=schema.array_items_def %}{%- include "breadcrumbs.md" %}{% endwith %} 10 | {% endfilter %} 11 | {% with schema=schema.array_items_def, skip_headers=False, depth=depth+1, skip_required=True %} 12 | {% include "content.md" %} 13 | {% endwith %} 14 | {% endif %} 15 | 16 | {% if schema.tuple_validation_items %} 17 | {% for item in schema.tuple_validation_items %} 18 | {% filter md_heading(depth+1) %} 19 | {% with schema=item %}{%- include "breadcrumbs.md" %}{% endwith %} 20 | {% endfilter %} 21 | {% with schema=item, skip_headers=False, depth=depth+1, skip_required=True %} 22 | {% include "content.md" %} 23 | {% endwith %} 24 | {% endfor %} 25 | {% endif %} 26 | 27 | {% if schema.kw_contains and schema.kw_contains.literal != {} %} 28 | {{ "At least one of the items must be" | md_heading(depth+1) }} 29 | {% with schema=schema.kw_contains, skip_headers=False, depth=depth+1, skip_required=True %} 30 | {% include "content.md" %} 31 | {% endwith %} 32 | {% endif %} 33 | 34 | {% if schema.array_additional_items_def %} 35 | {{ "Additional items must be" | md_heading(depth+1) }} 36 | {% with schema=schema.array_additional_items_def, skip_headers=False, depth=depth+1, skip_required=True %} 37 | {% include "content.md" %} 38 | {% endwith %} 39 | {% endif %} 40 | -------------------------------------------------------------------------------- /json_schema/schema_reference_template/section_conditional_subschema.md: -------------------------------------------------------------------------------- 1 | {% if schema.kw_if %} 2 | {% set first_property = schema.kw_if | get_first_property %} 3 | 4 | {% if schema.kw_then %} 5 | {%- filter md_heading(depth) -%}If ( 6 | {{- first_property.property_name | md_escape_for_table -}} 7 | {{- " = " -}} 8 | {{- first_property.kw_const.literal | python_to_json -}} 9 | ){%- endfilter -%} 10 | {% with schema=schema.kw_then, skip_headers=False, depth=depth %} 11 | {% include "content.md" %} 12 | {% endwith %} 13 | {% endif %} 14 | {% if schema.kw_else %} 15 | {%- filter md_heading(depth) -%}Else (i.e. 
{{ " " }} 16 | {{- first_property.property_name | md_escape_for_table -}} 17 | {{- " != " -}} 18 | {{- first_property.kw_const.literal | python_to_json -}} 19 | ){%- endfilter -%} 20 | {% with schema=schema.kw_else, skip_headers=False, depth=depth %} 21 | {% include "content.md" %} 22 | {% endwith %} 23 | {% endif %} 24 | {% endif %} -------------------------------------------------------------------------------- /json_schema/schema_reference_template/section_description.md: -------------------------------------------------------------------------------- 1 | {# Display description #} 2 | {% if description %} 3 | **Description:**{{ " " }}{{ description }} 4 | {% else %} 5 | **Description:**{{ " " }} *No description...* 6 | {% endif %} 7 | -------------------------------------------------------------------------------- /json_schema/schema_reference_template/section_examples.md: -------------------------------------------------------------------------------- 1 | **Example{% if examples|length > 1 %}s{% endif %}:**{{ " " }} 2 | 3 | {% for example in examples %} 4 | {%- if loop.first %}{{ "\n" }}{% endif -%} 5 | {% set example_id = schema.html_id ~ "_ex" ~ loop.index %} 6 | {%- if not examples_as_yaml -%} 7 | {{- "" }}```json 8 | {{- "\n" }}{{ example }} 9 | {{- "\n" }}``` 10 | {%- else -%} 11 | {{- "" }}```yaml 12 | {{- "\n" }}{{ example | yaml_example }} 13 | {{- "\n" }}``` 14 | {%- endif -%} 15 | {{ "\n" }} 16 | {% endfor %} 17 | -------------------------------------------------------------------------------- /json_schema/schema_reference_template/section_not.md: -------------------------------------------------------------------------------- 1 | {{ "Must **not** be" | md_heading(depth+1) }} 2 | {% with schema=schema.kw_not, skip_headers=False, depth=depth+1, skip_required=True %} 3 | {% include "content.md" %} 4 | {% endwith %} -------------------------------------------------------------------------------- /json_schema/schema_reference_template/section_one_of.md: -------------------------------------------------------------------------------- 1 | Must be one of: 2 | {% for enum_choice in schema.kw_enum.array_items %} 3 | * {{ enum_choice.literal | python_to_json }} 4 | {% endfor %} -------------------------------------------------------------------------------- /json_schema/schema_reference_template/section_properties_details.md: -------------------------------------------------------------------------------- 1 | {% for sub_property in schema.iterate_properties %} 2 | 3 | ---------------------------------------------------- 4 | 5 | {%- if sub_property.is_additional_properties and not sub_property.is_additional_properties_schema -%} 6 | {% continue %} 7 | {% endif %} 8 | 9 | {% set html_id = sub_property.html_id %} 10 | 11 | {% set description = sub_property | get_description %} 12 | 13 | {% filter md_heading(depth + 1, html_id) -%} 14 | {%- filter replace('\n', '') -%} 15 | {%- if not skip_required and sub_property.property_name -%} 16 | {{ md_badge("Required", "blue", fallback=False) if sub_property.is_required_property else md_badge("Optional", "yellow", fallback=False) -}} 17 | {%- endif -%} 18 | {%- if sub_property is deprecated -%}~~{%- endif -%} 19 | {%- if sub_property.is_pattern_property %} Pattern{% endif %} {% with schema=sub_property %}{%- include "breadcrumbs.md" %}{% endwith %} 20 | {%- if sub_property is deprecated -%}~~{%- endif -%} 21 | {%- endfilter %} 22 | {%- endfilter %} 23 | 24 | {% if sub_property.is_pattern_property %} 25 | > All properties whose name matches the 
regular expression 26 | ```{{ sub_property.property_name }}``` ([Test](https://regex101.com/?regex={{ sub_property.property_name | urlencode }})) 27 | must respect the following conditions 28 | {% endif %} 29 | 30 | 31 | {% with schema=sub_property, skip_headers=False, depth=depth+1 %} 32 | {% include "content.md" %} 33 | {% endwith %} 34 | 35 | {% endfor %} 36 | -------------------------------------------------------------------------------- /json_schema/schema_reference_template/section_undocumented_required_properties.md: -------------------------------------------------------------------------------- 1 | {% set undocumented_required_properties = schema | get_undocumented_required_properties %} 2 | {% if undocumented_required_properties%} 3 | {{ "The following properties are required" | md_heading(depth+1) }} 4 | {% for required_property in undocumented_required_properties %} 5 | * {{ required_property }} 6 | {% endfor %} 7 | {% endif %} -------------------------------------------------------------------------------- /json_schema/schema_reference_template/tabbed_section.md: -------------------------------------------------------------------------------- 1 | 2 | {{ current_node | md_array_items(title) | md_generate_table }} 3 | 4 | {% for node in current_node.array_items %} 5 | {% filter md_heading(depth+1, node.html_id) -%} 6 | {% if node.is_pattern_property %}Pattern{% endif %} **{% with schema=node %}{%- include "breadcrumbs.md" %}{% endwith %}** 7 | {%- endfilter %} 8 | {% with schema=node, skip_headers=False, depth=depth+1 %} 9 | {% include "content.md" %} 10 | {% endwith %} 11 | {% endfor %} 12 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vigimite/aqueducts/6eaf764852ac9348eb45c7073ecf126d61eb5505/logo.png -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Aqueducts Documentation 2 | site_url: https://vigimite.github.io/aqueducts 3 | repo_url: https://github.com/vigimite/aqueducts 4 | repo_name: vigimite/aqueducts 5 | copyright: Copyright © 2024 Michele Vigilante 6 | docs_dir: "docs/" 7 | 8 | hooks: 9 | - json_schema/generate_schema_reference.py 10 | 11 | nav: 12 | - Home: index.md 13 | - Usage: usage.md 14 | - Storage Configuration: storage.md 15 | - Schema Reference: schema_reference.md 16 | - Architecture: architecture.md 17 | - Aqueducts CLI: cli.md 18 | - About: about.md 19 | 20 | theme: 21 | language: en 22 | name: material 23 | logo: assets/logo.png 24 | icon: 25 | repo: fontawesome/brands/github 26 | favicon: assets/favicon.ico 27 | palette: 28 | # Palette toggle for automatic mode 29 | - media: "(prefers-color-scheme)" 30 | toggle: 31 | icon: material/brightness-auto 32 | name: Switch to light mode 33 | 34 | # Palette toggle for light mode 35 | - media: "(prefers-color-scheme: light)" 36 | scheme: default 37 | primary: custom 38 | toggle: 39 | icon: material/brightness-7 40 | name: Switch to dark mode 41 | 42 | # Palette toggle for dark mode 43 | - media: "(prefers-color-scheme: dark)" 44 | scheme: slate 45 | primary: blue grey 46 | toggle: 47 | icon: material/brightness-4 48 | name: Switch to system preference 49 | 50 | features: 51 | - navigation.tabs 52 | - navigation.instant 53 | - navigation.instant.prefetch 54 | - navigation.tracking 55 | - navigation.path 56 | - 
navigation.top 57 | 58 | - toc.integrate 59 | - toc.follow 60 | 61 | - search.suggest 62 | 63 | - content.code.copy 64 | - content.code.annotate 65 | 66 | extra_css: 67 | - stylesheets/extra.css 68 | 69 | extra: 70 | generator: true 71 | social: 72 | - icon: fontawesome/brands/mastodon 73 | link: https://fosstodon.org/@kato 74 | - icon: fontawesome/brands/github 75 | link: https://github.com/vigimite 76 | 77 | plugins: 78 | - search 79 | - social 80 | 81 | markdown_extensions: 82 | - toc: 83 | toc_depth: 5 84 | - tables 85 | - admonition 86 | - attr_list 87 | - pymdownx.emoji: 88 | emoji_index: !!python/name:material.extensions.emoji.twemoji 89 | emoji_generator: !!python/name:material.extensions.emoji.to_svg 90 | - pymdownx.highlight: 91 | anchor_linenums: true 92 | line_spans: __span 93 | pygments_lang_class: true 94 | - pymdownx.inlinehilite 95 | - pymdownx.snippets 96 | - pymdownx.superfences 97 | - pymdownx.details 98 | - pymdownx.tabbed: 99 | alternate_style: true 100 | -------------------------------------------------------------------------------- /release.toml: -------------------------------------------------------------------------------- 1 | # Publish all crates in dependency order 2 | allow-branch = ["main"] 3 | # Use shared versioning across workspace 4 | shared-version = true 5 | --------------------------------------------------------------------------------