├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   ├── config.yml
    │   ├── documentation_improvement.md
    │   └── feature_request.md
    ├── dependabot.yml
    ├── pull_request_template.md
    └── workflows
    │   ├── binaries.yml
    │   ├── ci.yml
    │   ├── docker.yml
    │   ├── docs.yml
    │   └── release.yml
├── .gitignore
├── ARCHITECTURE.md
├── CHANGELOG.md
├── CONTRIBUTING.md
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── README.md
├── aqueducts-cli
    ├── Cargo.toml
    ├── README.md
    └── src
    │   ├── local_exec.rs
    │   ├── main.rs
    │   ├── remote_exec.rs
    │   └── websocket_client.rs
├── aqueducts-executor
    ├── Cargo.toml
    ├── README.md
    └── src
    │   ├── api
    │       ├── auth.rs
    │       └── mod.rs
    │   ├── config.rs
    │   ├── error.rs
    │   ├── executor
    │       ├── manager.rs
    │       ├── mod.rs
    │       ├── progress_tracker.rs
    │       └── queue.rs
    │   └── main.rs
├── aqueducts
    ├── core
    │   ├── Cargo.toml
    │   ├── src
    │   │   ├── custom_udfs.rs
    │   │   ├── destinations
    │   │   │   ├── file.rs
    │   │   │   └── mod.rs
    │   │   ├── error.rs
    │   │   ├── lib.rs
    │   │   ├── progress_tracker.rs
    │   │   ├── schema_transform.rs
    │   │   ├── sources
    │   │   │   └── mod.rs
    │   │   ├── stages
    │   │   │   └── mod.rs
    │   │   ├── store
    │   │   │   ├── azure.rs
    │   │   │   ├── gcs.rs
    │   │   │   ├── mod.rs
    │   │   │   └── s3.rs
    │   │   └── templating.rs
    │   └── tests
    │   │   ├── common
    │   │       └── mod.rs
    │   │   └── integration.rs
    ├── delta
    │   ├── Cargo.toml
    │   ├── src
    │   │   ├── error.rs
    │   │   ├── handlers.rs
    │   │   └── lib.rs
    │   └── tests
    │   │   ├── data
    │   │       ├── aqueduct_pipeline_delta_append.yml
    │   │       ├── aqueduct_pipeline_delta_replace.yml
    │   │       ├── aqueduct_pipeline_delta_upsert.yml
    │   │       ├── example_1.csv
    │   │       └── example_2.csv
    │   │   └── integration.rs
    ├── meta
    │   ├── Cargo.toml
    │   └── src
    │   │   └── lib.rs
    ├── odbc
    │   ├── Cargo.toml
    │   └── src
    │   │   ├── error.rs
    │   │   └── lib.rs
    └── schemas
    │   ├── Cargo.toml
    │   ├── src
    │       ├── data_types.rs
    │       ├── destinations.rs
    │       ├── generate_schema.rs
    │       ├── lib.rs
    │       ├── location.rs
    │       ├── progress.rs
    │       ├── protocol.rs
    │       ├── serde_helpers.rs
    │       ├── sources.rs
    │       └── stages.rs
    │   └── tests
    │       └── integration.rs
├── cliff.toml
├── db
    └── init.sql
├── dist-workspace.toml
├── docker-compose.yml
├── docker
    ├── .dockerignore
    ├── Dockerfile
    ├── odbc.ini
    └── odbcinst.ini
├── docs
    ├── about.md
    ├── architecture.md
    ├── assets
    │   ├── favicon.ico
    │   └── logo.png
    ├── cli.md
    ├── index.md
    ├── schema_reference.md
    ├── storage.md
    ├── stylesheets
    │   └── extra.css
    └── usage.md
├── examples
    ├── aqueduct_pipeline_example.json
    ├── aqueduct_pipeline_example.toml
    ├── aqueduct_pipeline_example.yml
    ├── aqueduct_pipeline_odbc.yml
    ├── aqueduct_pipeline_simple.yml
    ├── location_dict.csv
    ├── temp_readings_feb_2024.csv
    └── temp_readings_jan_2024.csv
├── json_schema
    ├── aqueducts.schema.json
    ├── generate_schema_reference.py
    ├── schema_reference.md
    └── schema_reference_template
    │   ├── base.md
    │   ├── breadcrumbs.md
    │   ├── content.md
    │   ├── section_array.md
    │   ├── section_conditional_subschema.md
    │   ├── section_description.md
    │   ├── section_examples.md
    │   ├── section_not.md
    │   ├── section_one_of.md
    │   ├── section_properties_details.md
    │   ├── section_undocumented_required_properties.md
    │   └── tabbed_section.md
├── logo.png
├── mkdocs.yml
└── release.toml


/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve Aqueducts
 4 | title: '[BUG] '
 5 | labels: bug
 6 | assignees: ''
 7 | ---
 8 | 
 9 | ## Bug Description
10 | A clear and concise description of what the bug is.
11 | 
12 | ## Reproduction Steps
13 | Steps to reproduce the behavior:
14 | 1. Install '...'
15 | 2. Configure '...'
16 | 3. Run command '...'
17 | 4. See error
18 | 
19 | ## Expected Behavior
20 | A clear and concise description of what you expected to happen.
21 | 
22 | ## Actual Behavior
23 | What actually happened, including error messages, logs, or screenshots if applicable.
24 | 
25 | ## Additional Context
26 | Add any other context about the problem here. For example:
27 | - Does the issue happen consistently or intermittently?
28 | - Did this work in a previous version?
29 | - Are you using any special environment variables?
30 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: true
2 | contact_links:
3 |   - name: Questions and Help
4 |     url: https://github.com/vigimite/aqueducts/?tab=readme-ov-file#community
5 |     about: Please ask and answer questions in the Aqueducts discord server
6 |   - name: Documentation
7 |     url: https://vigimite.github.io/aqueducts
8 |     about: Check our documentation for information about usage and configuration
9 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation_improvement.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Documentation improvement
 3 | about: Suggest improvements to Aqueducts documentation
 4 | title: '[DOCS] '
 5 | labels: documentation
 6 | assignees: ''
 7 | ---
 8 | 
 9 | ## Documentation Location
10 | Which document needs improvement? Provide links if possible.
11 | 
12 | ## Problem Description
13 | What's wrong, confusing, or missing in the current documentation?
14 | 
15 | ## Suggested Improvement
16 | Describe your proposed changes or additions.
17 | 
18 | ## Additional Context
19 | Add any other context or screenshots about the documentation improvement here.
20 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for Aqueducts
 4 | title: '[FEATURE] '
 5 | labels: enhancement
 6 | assignees: ''
 7 | ---
 8 | 
 9 | ## Problem Statement
10 | A clear and concise description of what problem this feature would solve. For example: "I'm always frustrated when [...]"
11 | 
12 | ## Proposed Solution
13 | A clear and concise description of what you want to happen or how the feature should work.
14 | 
15 | ## Alternative Solutions
16 | A clear and concise description of any alternative solutions or features you've considered.
17 | 
18 | ## Example Use Case
19 | Describe a concrete example of how this feature would be used.
20 | 
21 | ## Additional Context
22 | Add any other context, diagrams, or screenshots about the feature request here.
23 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # Dependabot config
 2 | 
 3 | version: 2
 4 | updates:
 5 |   - package-ecosystem: cargo
 6 |     directory: "/"
 7 |     schedule:
 8 |       interval: weekly
 9 |     open-pull-requests-limit: 10
10 |     target-branch: main
11 |     labels: [auto-dependencies]
12 |   - package-ecosystem: cargo
13 |     directory: "aqueducts-cli/"
14 |     schedule:
15 |       interval: weekly
16 |     open-pull-requests-limit: 10
17 |     target-branch: main
18 |     labels: [auto-dependencies]
19 |   - package-ecosystem: cargo
20 |     directory: "aqueducts-executor/"
21 |     schedule:
22 |       interval: weekly
23 |     open-pull-requests-limit: 10
24 |     target-branch: main
25 |     labels: [auto-dependencies]
26 |   - package-ecosystem: "github-actions"
27 |     directory: "/"
28 |     schedule:
29 |       interval: "weekly"
30 |     open-pull-requests-limit: 10
31 |     labels: [auto-dependencies]
32 | 


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
 1 | ## Description
 2 | <!-- Provide a clear description of what this PR does and why it's needed -->
 3 | 
 4 | ## Type of change
 5 | <!-- Check the relevant options by putting an "x" in the brackets -->
 6 | - [ ] Bug fix (non-breaking change which fixes an issue)
 7 | - [ ] New feature (non-breaking change which adds functionality)
 8 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
 9 | - [ ] Performance improvement
10 | - [ ] Code refactoring (no functional changes)
11 | - [ ] Build/CI improvement
12 | - [ ] Documentation update
13 | - [ ] Other (please describe):
14 | 
15 | ## How has this been tested?
16 | <!-- Describe the tests you ran to verify your changes -->
17 | - [ ] Unit tests added/updated
18 | - [ ] Integration tests added/updated
19 | - [ ] Manual testing performed
20 | 
21 | ## Issue(s) addressed
22 | <!-- List the issues this PR closes or references. Use GitHub keywords like "closes" or "fixes" to automatically close issues when this PR is merged -->
23 | Closes #
24 | 
25 | ## Additional context
26 | <!-- Add any other context or screenshots about the pull request here -->
27 | 


--------------------------------------------------------------------------------
/.github/workflows/binaries.yml:
--------------------------------------------------------------------------------
 1 | name: Test Binary Builds
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 |   pull_request:
 6 |     paths:
 7 |       - '.github/workflows/release.yml'
 8 |       - 'aqueducts-cli/**'
 9 | 
10 | jobs:
11 |   test-build:
12 |     name: Test Build ${{ matrix.target }}
13 |     runs-on: ${{ matrix.os }}
14 |     strategy:
15 |       fail-fast: false
16 |       matrix:
17 |         include:
18 |           - os: ubuntu-latest
19 |             target: x86_64-unknown-linux-gnu
20 |             binary-name: aqueducts
21 |           - os: macos-latest
22 |             target: aarch64-apple-darwin
23 |             binary-name: aqueducts
24 | 
25 |     steps:
26 |       - name: Checkout repository
27 |         uses: actions/checkout@v4
28 | 
29 |       - name: Set up Rust
30 |         uses: dtolnay/rust-toolchain@stable
31 |         with:
32 |           targets: ${{ matrix.target }}
33 | 
34 |       - name: Cache Rust dependencies
35 |         uses: Swatinem/rust-cache@v2
36 | 
37 |       - name: Setup build environment (Linux)
38 |         if: matrix.os == 'ubuntu-latest'
39 |         run: sudo apt-get update && sudo apt-get install -y unixodbc-dev
40 | 
41 |       - name: Setup build environment (macOS)
42 |         if: matrix.os == 'macos-latest'
43 |         run: brew install unixodbc
44 | 
45 |       - name: Test build
46 |         run: cargo build --release --target ${{ matrix.target }} -p aqueducts-cli
47 | 
48 |       - name: Verify binary
49 |         run: |
50 |           ls -la target/${{ matrix.target }}/release/
51 |           file target/${{ matrix.target }}/release/${{ matrix.binary-name }}


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
  1 | name: CI
  2 | 
  3 | on:
  4 |   push:
  5 |     branches: [main]
  6 |   pull_request:
  7 |     branches: [main]
  8 | 
  9 | env:
 10 |   CARGO_TERM_COLOR: always
 11 |   RUST_BACKTRACE: 1
 12 | 
 13 | jobs:
 14 |   # Check code formatting
 15 |   format:
 16 |     name: Format
 17 |     runs-on: ubuntu-latest
 18 |     steps:
 19 |       - name: Checkout repository
 20 |         uses: actions/checkout@v4
 21 | 
 22 |       - name: Set up Rust
 23 |         uses: dtolnay/rust-toolchain@stable
 24 |         with:
 25 |           components: rustfmt
 26 | 
 27 |       - name: Check formatting
 28 |         run: cargo fmt --all --check
 29 | 
 30 |   # Run lints
 31 |   clippy:
 32 |     name: Clippy
 33 |     runs-on: ubuntu-latest
 34 |     steps:
 35 |       - name: Checkout repository
 36 |         uses: actions/checkout@v4
 37 | 
 38 |       - name: Set up Rust
 39 |         uses: dtolnay/rust-toolchain@stable
 40 |         with:
 41 |           components: clippy
 42 | 
 43 |       - name: Cache Rust dependencies
 44 |         uses: Swatinem/rust-cache@v2
 45 | 
 46 |       - name: Setup build environment
 47 |         run: sudo apt-get update && sudo apt-get install -y unixodbc-dev
 48 | 
 49 |       - name: Run clippy
 50 |         run: cargo clippy --workspace --all-targets --all-features -- -D warnings
 51 | 
 52 |   # Check that documentation builds
 53 |   docs:
 54 |     name: Documentation
 55 |     runs-on: ubuntu-latest
 56 |     steps:
 57 |       - name: Checkout repository
 58 |         uses: actions/checkout@v4
 59 | 
 60 |       - name: Set up Rust
 61 |         uses: dtolnay/rust-toolchain@stable
 62 | 
 63 |       - name: Cache Rust dependencies
 64 |         uses: Swatinem/rust-cache@v2
 65 | 
 66 |       - name: Setup build environment
 67 |         run: sudo apt-get update && sudo apt-get install -y unixodbc-dev
 68 | 
 69 |       - name: Check documentation
 70 |         run: cargo doc --workspace --all-features --no-deps --document-private-items
 71 |         env:
 72 |           RUSTDOCFLAGS: "-D warnings"
 73 | 
 74 |   # Security audit
 75 |   security:
 76 |     name: Security Audit
 77 |     runs-on: ubuntu-latest
 78 |     steps:
 79 |       - name: Checkout repository
 80 |         uses: actions/checkout@v4
 81 | 
 82 |       - name: Set up Rust
 83 |         uses: dtolnay/rust-toolchain@stable
 84 | 
 85 |       - name: Install cargo-audit
 86 |         run: cargo install cargo-audit --locked
 87 | 
 88 |       - name: Run security audit
 89 |         run: cargo audit
 90 | 
 91 |   # Run tests
 92 |   test:
 93 |     name: Test
 94 |     runs-on: ${{ matrix.os }}
 95 |     strategy:
 96 |       fail-fast: false
 97 |       matrix:
 98 |         os: [ubuntu-latest]
 99 |         rust: [stable]
100 | 
101 |     services:
102 |       postgres:
103 |         image: postgres:15
104 |         env:
105 |           POSTGRES_USER: postgres
106 |           POSTGRES_PASSWORD: postgres
107 |         ports:
108 |           - 5432:5432
109 |         options: >-
110 |           --health-cmd pg_isready
111 |           --health-interval 10s
112 |           --health-timeout 5s
113 |           --health-retries 5
114 | 
115 |     steps:
116 |       - name: Checkout repository
117 |         uses: actions/checkout@v4
118 | 
119 |       - name: Set up Rust
120 |         uses: dtolnay/rust-toolchain@master
121 |         with:
122 |           toolchain: ${{ matrix.rust }}
123 | 
124 |       - name: Cache Rust dependencies
125 |         uses: Swatinem/rust-cache@v2
126 |         with:
127 |           key: ${{ matrix.os }}-${{ matrix.rust }}
128 | 
129 |       - name: Setup build environment (Linux)
130 |         if: matrix.os == 'ubuntu-latest'
131 |         run: |
132 |           sudo apt-get update
133 |           sudo apt-get install -y postgresql-client unixodbc-dev odbc-postgresql
134 | 
135 |       - name: Setup test database (Linux)
136 |         if: matrix.os == 'ubuntu-latest'
137 |         run: |
138 |           export CONTAINER_ID=$(docker ps --filter "name=postgres" --format "{{.ID}}")
139 |           docker cp ./examples/temp_readings_jan_2024.csv $CONTAINER_ID:/opt/
140 |           docker cp ./examples/temp_readings_feb_2024.csv $CONTAINER_ID:/opt/
141 |           PGPASSWORD=postgres psql -h localhost -U postgres -d postgres -f ./db/init.sql
142 | 
143 |       - name: Run tests
144 |         run: cargo test --workspace --all-features
145 | 
146 |   # Build check for different feature combinations
147 |   features:
148 |     name: Feature Combinations
149 |     runs-on: ubuntu-latest
150 |     steps:
151 |       - name: Checkout repository
152 |         uses: actions/checkout@v4
153 | 
154 |       - name: Set up Rust
155 |         uses: dtolnay/rust-toolchain@stable
156 | 
157 |       - name: Cache Rust dependencies
158 |         uses: Swatinem/rust-cache@v2
159 | 
160 |       - name: Setup build environment
161 |         run: sudo apt-get update && sudo apt-get install -y unixodbc-dev
162 | 
163 |       - name: Check no default features
164 |         run: cargo check --workspace --no-default-features
165 | 
166 |       - name: Check minimal features
167 |         run: cargo check -p aqueducts-cli --no-default-features --features yaml
168 | 
169 |       - name: Check ODBC features
170 |         run: cargo check --workspace --features odbc
171 | 
172 |       - name: Check all features
173 |         run: cargo check --workspace --all-features
174 | 


--------------------------------------------------------------------------------
/.github/workflows/docker.yml:
--------------------------------------------------------------------------------
 1 | name: Docker
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [main]
 6 |     tags: ['v*.*.*']
 7 |   pull_request:
 8 |     branches: [main]
 9 | 
10 | env:
11 |   REGISTRY: ghcr.io
12 |   IMAGE_NAME: ${{ github.repository }}/aqueducts-executor
13 | 
14 | jobs:
15 |   build-and-push:
16 |     runs-on: ubuntu-latest
17 |     permissions:
18 |       contents: read
19 |       packages: write
20 | 
21 |     steps:
22 |       - name: Checkout repository
23 |         uses: actions/checkout@v4
24 | 
25 |       - name: Set up Docker Buildx
26 |         uses: docker/setup-buildx-action@v3
27 | 
28 |       - name: Log in to Container Registry
29 |         if: github.event_name != 'pull_request'
30 |         uses: docker/login-action@v3
31 |         with:
32 |           registry: ${{ env.REGISTRY }}
33 |           username: ${{ github.actor }}
34 |           password: ${{ secrets.GITHUB_TOKEN }}
35 | 
36 |       - name: Extract metadata
37 |         id: meta
38 |         uses: docker/metadata-action@v5
39 |         with:
40 |           images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
41 |           tags: |
42 |             # For pushes to main branch
43 |             type=ref,event=branch
44 |             # For pull requests
45 |             type=ref,event=pr
46 |             # For tag pushes
47 |             type=semver,pattern={{version}}
48 |             type=semver,pattern={{major}}.{{minor}}
49 |             type=semver,pattern={{major}}
50 |             # Latest tag for main branch
51 |             type=raw,value=latest,enable={{is_default_branch}}
52 | 
53 |       - name: Build and push Docker image
54 |         uses: docker/build-push-action@v6
55 |         with:
56 |           context: .
57 |           file: docker/Dockerfile
58 |           platforms: linux/amd64,linux/arm64
59 |           push: ${{ github.event_name != 'pull_request' }}
60 |           tags: ${{ steps.meta.outputs.tags }}
61 |           labels: ${{ steps.meta.outputs.labels }}
62 |           cache-from: type=gha
63 |           cache-to: type=gha,mode=max
64 | 
65 |       - name: Generate image summary
66 |         if: github.event_name != 'pull_request'
67 |         run: |
68 |           echo "## Docker Image Published 🐳" >> $GITHUB_STEP_SUMMARY
69 |           echo "" >> $GITHUB_STEP_SUMMARY
70 |           echo "**Registry:** ${{ env.REGISTRY }}" >> $GITHUB_STEP_SUMMARY
71 |           echo "**Image:** ${{ env.IMAGE_NAME }}" >> $GITHUB_STEP_SUMMARY
72 |           echo "" >> $GITHUB_STEP_SUMMARY
73 |           echo "### Available Tags:" >> $GITHUB_STEP_SUMMARY
74 |           echo '```' >> $GITHUB_STEP_SUMMARY
75 |           echo "${{ steps.meta.outputs.tags }}" >> $GITHUB_STEP_SUMMARY
76 |           echo '```' >> $GITHUB_STEP_SUMMARY
77 |           echo "" >> $GITHUB_STEP_SUMMARY
78 |           echo "### Quick Start:" >> $GITHUB_STEP_SUMMARY
79 |           echo '```bash' >> $GITHUB_STEP_SUMMARY
80 |           echo "docker run -p 3031:3031 ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest" >> $GITHUB_STEP_SUMMARY
81 |           echo '```' >> $GITHUB_STEP_SUMMARY


--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
 1 | name: Deploy MkDocs
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 | 
 8 | permissions:
 9 |   contents: write
10 | 
11 | jobs:
12 |   deploy:
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |       - name: Checkout repository
17 |         uses: actions/checkout@v4
18 | 
19 |       - name: Set credentials
20 |         run: |
21 |           git config user.name github-actions[bot]
22 |           git config user.email 41898282+github-actions[bot]@users.noreply.github.com
23 | 
24 |       - name: Set up Python
25 |         uses: actions/setup-python@v5
26 |         with:
27 |           python-version: 3.x
28 | 
29 |       - name: Set cache id
30 |         run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
31 | 
32 |       - name: Cache setup
33 |         uses: actions/cache@v4
34 |         with:
35 |           key: mkdocs-material-${{ env.cache_id }}
36 |           path: .cache
37 |           restore-keys: |
38 |             mkdocs-material-
39 | 
40 |       - name: Install dependencies
41 |         run: pip install "mkdocs-material[imaging]" json-schema-for-humans
42 | 
43 |       - name: Deploy
44 |         run: mkdocs gh-deploy --force
45 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .cargo/**
 2 | /target
 3 | /aqueducts-cli/target
 4 | /examples/output*
 5 | /aqueducts/core/tests/output/**
 6 | /aqueducts/delta/tests/output/**
 7 | .venv
 8 | site/
 9 | .cache/
10 | __pycache__
11 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [workspace]
 2 | members = [
 3 |   "aqueducts/meta",
 4 |   "aqueducts/core",
 5 |   "aqueducts/schemas",
 6 |   "aqueducts/odbc",
 7 |   "aqueducts/delta",
 8 |   "aqueducts-cli",
 9 |   "aqueducts-executor"
10 | ]
11 | resolver = "2"
12 | 
13 | [workspace.package]
14 | authors = ["<vigimite@protonmail.com>"]
15 | edition = "2021"
16 | description = "Framework to build ETL data pipelines declaratively"
17 | homepage = "https://github.com/vigimite/aqueducts"
18 | repository = "https://github.com/vigimite/aqueducts"
19 | readme = "README.md"
20 | version = "0.10.1"
21 | keywords = ["aqueducts", "ETL", "data", "pipeline"]
22 | categories = ["api-bindings"]
23 | license = "Apache-2.0"
24 | 
25 | [workspace.metadata.badges]
26 | github = { repository = "vigimite/aqueducts", workflow = "build.yml" }
27 | 
28 | [workspace.dependencies]
29 | # Internal crates
30 | aqueducts = { path = "aqueducts/meta", version = "0.10.1" }
31 | aqueducts-schemas = { path = "aqueducts/schemas", version = "0.10.1" }
32 | aqueducts-core = { path = "aqueducts/core", version = "0.10.1" }
33 | aqueducts-delta = { path = "aqueducts/delta", version = "0.10.1" }
34 | aqueducts-odbc = { path = "aqueducts/odbc", version = "0.10.1" }
35 | 
36 | # Data processing libraries
37 | datafusion = "47"
38 | datafusion-functions-json = "0.47"
39 | deltalake = { version = "0.26.2", features = ["datafusion"] }
40 | arrow-odbc = "16.0.2"
41 | 
42 | # Serialization/deserialization
43 | serde = { version = "1", features = ["derive"] }
44 | serde_json = "1"
45 | serde_yml = "0.0.10"
46 | toml = "0.8"
47 | schemars = { version = "0.8", features = ["chrono", "url", "preserve_order"] }
48 | 
49 | # Async runtime and utilities
50 | tokio = { version = "1", features = ["rt"] }
51 | tokio-util = "0.7"
52 | futures = "0.3"
53 | futures-util = "0.3"
54 | 
55 | # Network and API
56 | axum = { version = "0.8.4", features = ["macros", "ws"] }
57 | tower = { version = "0.5.2", features = ["util"] }
58 | http-body-util = "0.1.3"
59 | tokio-tungstenite = "0.26.2"
60 | openssl = { version = "0.10", features = ["vendored"] }
61 | 
62 | # Logging and tracing
63 | tracing = "0.1"
64 | tracing-subscriber = { version = "0.3.19", features = ["env-filter"] }
65 | tracing-test = "0.2"
66 | 
67 | # Error handling
68 | thiserror = "2"
69 | anyhow = "1.0.98"
70 | 
71 | # CLI utilities
72 | clap = { version = "4.5.38", features = ["derive", "env"] }
73 | 
74 | # General utilities
75 | bon = "3.6.3"
76 | regex = "1"
77 | url = { version = "2", features = ["serde"] }
78 | chrono = { version = "0.4", features = ["serde"] }
79 | uuid = { version = "1.17.0", features = ["v4", "serde"] }
80 | rand = "0.8"
81 | 
82 | [profile.dist]
83 | inherits = "release"
84 | lto = "thin"
85 | 


--------------------------------------------------------------------------------
/aqueducts-cli/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "aqueducts-cli"
 3 | authors = ["<vigimite@protonmail.com>"]
 4 | edition = "2021"
 5 | description = "CLI application to run pipelines defined for the aqueducts framework"
 6 | homepage = "https://github.com/vigimite/aqueducts"
 7 | repository = "https://github.com/vigimite/aqueducts"
 8 | readme = "README.md"
 9 | version = "0.10.1"
10 | keywords = ["aqueducts", "ETL", "data", "pipeline", "cli"]
11 | categories = ["command-line-utilities"]
12 | license = "Apache-2.0"
13 | 
14 | [features]
15 | default = ["s3", "gcs", "azure", "yaml", "json", "delta"]
16 | s3 = ["aqueducts/s3"]
17 | gcs = ["aqueducts/gcs"]
18 | azure = ["aqueducts/azure"]
19 | delta = ["aqueducts/delta"]
20 | odbc = ["aqueducts/odbc"]
21 | json = ["aqueducts/json"]
22 | toml = ["aqueducts/toml"]
23 | yaml = ["aqueducts/yaml"]
24 | 
25 | [dependencies]
26 | datafusion.workspace = true
27 | 
28 | aqueducts = { workspace = true, features = ["protocol", "custom_udfs"] }
29 | 
30 | clap.workspace = true
31 | 
32 | tracing.workspace = true
33 | tracing-subscriber.workspace = true
34 | 
35 | tokio = { workspace = true, features = ["full"] }
36 | futures-util.workspace = true
37 | 
38 | anyhow.workspace = true
39 | uuid.workspace = true
40 | url.workspace = true
41 | 
42 | tokio-tungstenite = { workspace = true, features = ["native-tls"] }
43 | 
44 | serde.workspace = true
45 | serde_json.workspace = true
46 | 
47 | openssl.workspace = true
48 | 
49 | [[bin]]
50 | name = "aqueducts"
51 | path = "src/main.rs"
52 | 


--------------------------------------------------------------------------------
/aqueducts-cli/README.md:
--------------------------------------------------------------------------------
  1 | # Aqueducts CLI
  2 | 
  3 | A command-line interface for executing Aqueducts data pipelines, with support for both local and remote execution.
  4 | 
  5 | ## Features
  6 | 
  7 | - Run pipelines defined in YAML, JSON, or TOML formats
  8 | - Execute pipelines locally or remotely via the Aqueducts Executor
  9 | - Check status of remote executors
 10 | - Cancel running pipelines on remote executors
 11 | - Real-time progress tracking and event streaming
 12 | - Cloud storage support (S3, GCS, Azure) via feature flags
 13 | - ODBC database connectivity via feature flags
 14 | 
 15 | ## Installation
 16 | 
 17 | ### Recommended Installation Methods
 18 | 
 19 | **Homebrew (macOS and Linux):**
 20 | ```bash
 21 | # Add the tap and install
 22 | brew tap vigimite/aqueducts
 23 | brew install aqueducts-cli
 24 | ```
 25 | 
 26 | **Shell Installer (Cross-platform):**
 27 | ```bash
 28 | # One-line installer for Linux, macOS, and Windows
 29 | curl --proto '=https' --tlsv1.2 -LsSf https://github.com/vigimite/aqueducts/releases/latest/download/aqueducts-installer.sh | sh
 30 | ```
 31 | 
 32 | **Direct Download:**
 33 | Download pre-built binaries for your platform from the [latest release](https://github.com/vigimite/aqueducts/releases/latest):
 34 | - Linux x86_64
 35 | - macOS Apple Silicon (ARM64)  
 36 | - macOS Intel (x86_64)
 37 | 
 38 | ### Build from Source
 39 | 
 40 | ```bash
 41 | # Install with default features (s3, gcs, azure, yaml)
 42 | cargo install aqueducts-cli --locked
 43 | 
 44 | # Install with odbc support (requires unixodbc-dev)
 45 | cargo install aqueducts-cli --locked --features odbc
 46 | 
 47 | # Install with minimal features
 48 | cargo install aqueducts-cli --locked --no-default-features --features yaml
 49 | ```
 50 | 
 51 | ## Usage
 52 | 
 53 | ### Running Pipelines
 54 | 
 55 | Run a pipeline locally:
 56 | 
 57 | ```bash
 58 | # Basic usage (YAML)
 59 | aqueducts run --file ./pipeline.yml
 60 | 
 61 | # With parameters
 62 | aqueducts run --file ./pipeline.yml --params key1=value1 --params key2=value2
 63 | 
 64 | # Using TOML or JSON (with appropriate feature flags)
 65 | aqueducts run --file ./pipeline.toml
 66 | aqueducts run --file ./pipeline.json
 67 | ```
 68 | 
 69 | Run a pipeline on a remote executor:
 70 | 
 71 | ```bash
 72 | # Execute on remote executor
 73 | aqueducts run --file ./pipeline.yml --executor executor-host:3031 --api-key your_api_key
 74 | ```
 75 | 
 76 | Cancel a running pipeline on a remote executor:
 77 | 
 78 | ```bash
 79 | # Cancel a specific execution by ID
 80 | aqueducts cancel --executor executor-host:3031 --api-key your_api_key --execution-id abc-123
 81 | ```
 82 | 
 83 | ## Pipeline Definition Examples
 84 | 
 85 | YAML pipeline example:
 86 | 
 87 | ```yaml
 88 | sources:
 89 |   - type: File
 90 |     name: temp_readings
 91 |     file_type:
 92 |       type: Csv
 93 |       options: {}
 94 |     location: ./examples/temp_readings_${month}_${year}.csv
 95 | 
 96 | stages:
 97 |   - - name: transformed_data
 98 |       query: "SELECT * FROM source_data WHERE value > 10"
 99 | 
100 | destination:
101 |   type: File
102 |   name: results
103 |   file_type:
104 |     type: Parquet
105 |     options: {}
106 |   location: ./examples/output_${month}_${year}.parquet
107 | ```
108 | 
109 | ## Troubleshooting
110 | 
111 | Common issues:
112 | 
113 | 1. **Authentication failures**: Verify API key is correct
114 | 2. **Connectivity issues**: Check network connectivity and firewall rules
115 | 3. **Pipeline validation errors**: Ensure your pipeline definition is valid
116 | 4. **Executor busy**: Only one pipeline can run at a time on an executor
117 | 5. **Missing features**: Make sure the CLI was compiled with the needed features
118 | 
119 | For more information on architecture and advanced usage, see the [Aqueducts Architecture Documentation](https://github.com/vigimite/aqueducts/blob/main/ARCHITECTURE.md).
120 | 


--------------------------------------------------------------------------------
/aqueducts-cli/src/local_exec.rs:
--------------------------------------------------------------------------------
 1 | use std::{collections::HashMap, path::PathBuf, sync::Arc};
 2 | 
 3 | use anyhow::Context;
 4 | use aqueducts::prelude::*;
 5 | use tracing::{debug, info};
 6 | 
 7 | pub async fn run_local(file: PathBuf, params: HashMap<String, String>) -> anyhow::Result<()> {
 8 |     info!("Running pipeline locally from file: {}", file.display());
 9 | 
10 |     let aqueduct = Aqueduct::from_file(&file, params)?;
11 | 
12 |     debug!("Creating SessionContext");
13 |     let mut ctx = datafusion::prelude::SessionContext::new();
14 | 
15 |     aqueducts::custom_udfs::register_all(&mut ctx)?;
16 | 
17 |     let progress_tracker = Arc::new(LoggingProgressTracker);
18 | 
19 |     debug!("Starting pipeline execution");
20 |     run_pipeline(Arc::new(ctx), aqueduct, Some(progress_tracker))
21 |         .await
22 |         .context("Failure during execution of aqueducts file")?;
23 | 
24 |     debug!("Pipeline execution completed successfully");
25 |     Ok(())
26 | }
27 | 


--------------------------------------------------------------------------------
/aqueducts-cli/src/main.rs:
--------------------------------------------------------------------------------
  1 | use anyhow::anyhow;
  2 | use clap::{Parser, Subcommand};
  3 | use std::{collections::HashMap, error::Error, path::PathBuf};
  4 | use tracing::info;
  5 | use tracing_subscriber::{filter, layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Layer};
  6 | use uuid::Uuid;
  7 | 
  8 | mod local_exec;
  9 | mod remote_exec;
 10 | mod websocket_client;
 11 | 
 12 | /// Aqueducts CLI for executing data pipelines locally or remotely
 13 | #[derive(Debug, Parser)]
 14 | #[command(name = "aqueducts", version, about, long_about = None)]
 15 | struct Args {
 16 |     #[command(subcommand)]
 17 |     command: Commands,
 18 | }
 19 | 
 20 | #[derive(Debug, Subcommand)]
 21 | enum Commands {
 22 |     /// Run an Aqueduct pipeline locally or remotely
 23 |     Run {
 24 |         /// Path to Aqueduct configuration file
 25 |         #[arg(short, long)]
 26 |         file: PathBuf,
 27 | 
 28 |         /// k=v list of parameters to pass to the configuration file
 29 |         /// e.g. aqueduct run -f file.yml -p key1=value1 -p key2=value2
 30 |         #[arg(short, long, value_parser = parse_key_val::<String, String>)]
 31 |         params: Option<Vec<(String, String)>>,
 32 | 
 33 |         /// Execute the pipeline on a remote executor instead of locally
 34 |         /// example: 192.168.1.102:3031
 35 |         #[arg(long)]
 36 |         executor: Option<String>,
 37 | 
 38 |         /// API key for the remote executor
 39 |         #[arg(long)]
 40 |         api_key: Option<String>,
 41 |     },
 42 |     /// Cancel a running pipeline on a remote executor
 43 |     Cancel {
 44 |         /// Execution ID to cancel
 45 |         #[arg(short, long)]
 46 |         execution_id: String,
 47 | 
 48 |         /// Remote executor URL
 49 |         /// example: 192.168.1.102:3031
 50 |         #[arg(long)]
 51 |         executor: String,
 52 | 
 53 |         /// API key for the remote executor
 54 |         #[arg(long)]
 55 |         api_key: String,
 56 |     },
 57 | }
 58 | 
 59 | fn parse_key_val<T, U>(s: &str) -> Result<(T, U), Box<dyn Error + Send + Sync + 'static>>
 60 | where
 61 |     T: std::str::FromStr,
 62 |     T::Err: Error + Send + Sync + 'static,
 63 |     U: std::str::FromStr,
 64 |     U::Err: Error + Send + Sync + 'static,
 65 | {
 66 |     let pos = s
 67 |         .find('=')
 68 |         .ok_or_else(|| format!("invalid KEY=value: no `=` found in `{s}`"))?;
 69 |     Ok((s[..pos].parse()?, s[pos + 1..].parse()?))
 70 | }
 71 | 
 72 | #[tokio::main]
 73 | async fn main() -> anyhow::Result<()> {
 74 |     tracing_subscriber::registry()
 75 |         .with(
 76 |             tracing_subscriber::fmt::layer()
 77 |                 .with_ansi(true)
 78 |                 .with_level(false)
 79 |                 .with_target(false)
 80 |                 .without_time()
 81 |                 .with_filter(filter::filter_fn(|meta| !meta.is_span())),
 82 |         )
 83 |         .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")))
 84 |         .init();
 85 | 
 86 |     let args = Args::parse();
 87 | 
 88 |     match args.command {
 89 |         Commands::Run {
 90 |             file,
 91 |             params,
 92 |             executor: Some(executor_url),
 93 |             api_key,
 94 |         } => {
 95 |             let api_key =
 96 |                 api_key.ok_or_else(|| anyhow!("API key is required for remote execution"))?;
 97 | 
 98 |             info!("Executing pipeline on remote executor: {}", executor_url);
 99 |             let params = HashMap::from_iter(params.unwrap_or_default());
100 |             remote_exec::run_remote(file, params, executor_url, api_key).await?;
101 |         }
102 |         Commands::Run {
103 |             file,
104 |             params,
105 |             executor: _,
106 |             api_key: _,
107 |         } => {
108 |             let params = HashMap::from_iter(params.unwrap_or_default());
109 |             local_exec::run_local(file, params).await?;
110 |         }
111 |         Commands::Cancel {
112 |             execution_id,
113 |             executor,
114 |             api_key,
115 |         } => {
116 |             let execution_id = Uuid::parse_str(&execution_id)
117 |                 .map_err(|e| anyhow!("Invalid execution ID: {}. Must be a valid UUID.", e))?;
118 | 
119 |             info!(
120 |                 "Cancelling execution {} on executor: {}",
121 |                 execution_id, executor
122 |             );
123 |             remote_exec::cancel_remote_execution(executor, api_key, execution_id).await?;
124 |         }
125 |     }
126 | 
127 |     Ok(())
128 | }
129 | 


--------------------------------------------------------------------------------
/aqueducts-cli/src/websocket_client.rs:
--------------------------------------------------------------------------------
  1 | use anyhow::anyhow;
  2 | use aqueducts::prelude::*;
  3 | use futures_util::{SinkExt, StreamExt};
  4 | use std::{str::FromStr, sync::Arc};
  5 | use tokio::sync::{mpsc, Mutex};
  6 | use tokio_tungstenite::{
  7 |     connect_async,
  8 |     tungstenite::{client::IntoClientRequest, http::Uri, protocol::Message, ClientRequestBuilder},
  9 | };
 10 | use tracing::{debug, error, info};
 11 | use url::Url;
 12 | use uuid::Uuid;
 13 | 
 14 | /// The custom header for API key authentication
 15 | const X_API_KEY_HEADER: &str = "X-API-Key";
 16 | 
 17 | /// Manages connection to an executor server
 18 | pub struct WebSocketClient {
 19 |     executor_url: Url,
 20 |     api_key: String,
 21 |     sender: Arc<Mutex<Option<mpsc::Sender<ClientMessage>>>>,
 22 | }
 23 | 
 24 | impl WebSocketClient {
 25 |     /// Create a new client
 26 |     pub fn try_new(executor_url: String, api_key: String) -> anyhow::Result<Self> {
 27 |         let executor_url = Url::parse(&format!("ws://{executor_url}/ws/connect"))?;
 28 |         Ok(Self {
 29 |             executor_url,
 30 |             api_key,
 31 |             sender: Arc::new(Mutex::new(None)),
 32 |         })
 33 |     }
 34 | 
 35 |     /// Connect to the executor and set up message handling
 36 |     pub async fn connect(&self) -> anyhow::Result<mpsc::Receiver<ExecutorMessage>> {
 37 |         info!("Connecting to executor at: {}", self.executor_url);
 38 | 
 39 |         // Set up channels for message passing
 40 |         let (outgoing_tx, mut outgoing_rx) = mpsc::channel::<ClientMessage>(16);
 41 |         let (incoming_tx, incoming_rx) = mpsc::channel::<ExecutorMessage>(32);
 42 | 
 43 |         debug!("Connecting with API key authentication");
 44 |         let request = ClientRequestBuilder::new(Uri::from_str(self.executor_url.as_str())?)
 45 |             .with_header(X_API_KEY_HEADER, &self.api_key)
 46 |             .into_client_request()?;
 47 | 
 48 |         let (ws_stream, _) = connect_async(request).await?;
 49 |         debug!("WebSocket connection established");
 50 | 
 51 |         let (mut ws_sender, mut ws_receiver) = ws_stream.split();
 52 |         {
 53 |             let mut sender = self.sender.lock().await;
 54 |             *sender = Some(outgoing_tx);
 55 |         }
 56 | 
 57 |         // Handle outgoing messages
 58 |         tokio::spawn(async move {
 59 |             while let Some(message) = outgoing_rx.recv().await {
 60 |                 match serde_json::to_string(&message) {
 61 |                     Ok(json) => {
 62 |                         debug!("Sending message: {}", json);
 63 |                         if let Err(e) = ws_sender.send(Message::Text(json.into())).await {
 64 |                             error!("Error sending message: {}", e);
 65 |                             break;
 66 |                         }
 67 |                     }
 68 |                     Err(e) => {
 69 |                         error!("Failed to serialize message: {}", e);
 70 |                     }
 71 |                 }
 72 |             }
 73 |             debug!("Outgoing message handler finished");
 74 |         });
 75 | 
 76 |         // Handle incoming messages
 77 |         tokio::spawn(async move {
 78 |             while let Some(msg) = ws_receiver.next().await {
 79 |                 match msg {
 80 |                     Ok(Message::Text(text)) => {
 81 |                         debug!("Received message: {}", text);
 82 |                         match serde_json::from_str::<ExecutorMessage>(&text) {
 83 |                             Ok(message) => {
 84 |                                 if let Err(e) = incoming_tx.send(message).await {
 85 |                                     error!("Failed to forward incoming message: {}", e);
 86 |                                     break;
 87 |                                 }
 88 |                             }
 89 |                             Err(e) => {
 90 |                                 error!("Failed to parse message: {}", e);
 91 |                             }
 92 |                         }
 93 |                     }
 94 |                     Ok(Message::Close(_)) => {
 95 |                         info!("WebSocket connection closed by server");
 96 |                         break;
 97 |                     }
 98 |                     Err(e) => {
 99 |                         error!("Error receiving message: {}", e);
100 |                         break;
101 |                     }
102 |                     _ => {}
103 |                 }
104 |             }
105 |             debug!("Incoming message handler finished");
106 |         });
107 | 
108 |         // Return the receiver channel
109 |         Ok(incoming_rx)
110 |     }
111 | 
112 |     /// Submit a pipeline for execution
113 |     pub async fn execute_pipeline(&self, pipeline: Aqueduct) -> anyhow::Result<()> {
114 |         // Send execution request
115 |         self.send_message(ClientMessage::ExecutionRequest { pipeline })
116 |             .await?;
117 | 
118 |         Ok(())
119 |     }
120 | 
121 |     /// Cancel an execution
122 |     pub async fn cancel_execution(&self, execution_id: Uuid) -> anyhow::Result<()> {
123 |         self.send_message(ClientMessage::CancelRequest { execution_id })
124 |             .await
125 |     }
126 | 
127 |     /// Send a message to the executor
128 |     async fn send_message(&self, message: ClientMessage) -> anyhow::Result<()> {
129 |         let sender = self.sender.lock().await;
130 |         match &*sender {
131 |             Some(tx) => {
132 |                 tx.send(message).await?;
133 |                 Ok(())
134 |             }
135 |             None => Err(anyhow!("Connection Closed")),
136 |         }
137 |     }
138 | }
139 | 


--------------------------------------------------------------------------------
/aqueducts-executor/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "aqueducts-executor"
 3 | authors = ["<vigimite@protonmail.com>"]
 4 | edition = "2021"
 5 | description = "Remote executor for the Aqueducts data pipeline framework"
 6 | homepage = "https://github.com/vigimite/aqueducts"
 7 | repository = "https://github.com/vigimite/aqueducts"
 8 | readme = "README.md"
 9 | version = "0.10.1"
10 | keywords = ["aqueducts", "ETL", "data", "pipeline"]
11 | categories = ["command-line-utilities"]
12 | license = "Apache-2.0"
13 | 
14 | [features]
15 | default = []
16 | odbc = ["aqueducts/odbc"]
17 | 
18 | [dependencies]
19 | aqueducts = { workspace = true, features = ["protocol", "json", "s3", "gcs", "azure", "delta", "custom_udfs"] }
20 | 
21 | axum.workspace = true
22 | clap.workspace = true
23 | 
24 | datafusion.workspace = true
25 | 
26 | serde.workspace = true
27 | serde_json.workspace = true
28 | 
29 | tokio = { workspace = true, features = ["full"] }
30 | tokio-util.workspace = true
31 | futures.workspace = true
32 | futures-util.workspace = true
33 | 
34 | thiserror.workspace = true
35 | 
36 | tracing.workspace = true
37 | tracing-subscriber = { workspace = true, features = ["json"] }
38 | 
39 | uuid.workspace = true
40 | itertools = "0.14.0"
41 | tower-http = { version = "0.6.4", features = ["trace"] }
42 | 
43 | openssl.workspace = true
44 | 
45 | [dev-dependencies]
46 | futures.workspace = true
47 | tower.workspace = true
48 | http-body-util.workspace = true
49 | 


--------------------------------------------------------------------------------
/aqueducts-executor/README.md:
--------------------------------------------------------------------------------
  1 | # Aqueducts Executor
  2 | 
  3 | A deployable application used to execute Aqueduct pipeline definitions within your infrastructure. The main use-case is to execute heavy queries within the infrastructure where the data is hosted, minimizing network load and removing the requirement for the client to have direct access to the data store.
  4 | 
  5 | ## Features
  6 | 
  7 | - **Remote Execution**: Run data pipelines securely within your own infrastructure close to the data sources
  8 | - **Memory Management**: Configure maximum memory usage to control resource allocation using DataFusion's memory pool
  9 | - **Real-time Feedback**: WebSockets provide bidirectional communication with live progress and log updates
 10 | - **Cloud Storage Support**: Native integration with S3, GCS, and Azure Blob Storage
 11 | - **Database Connectivity**: ODBC support for connecting to various database systems
 12 | - **Scalability**: Deploy multiple executors across different regions as needed
 13 | - **Exclusive Execution**: Guaranteed single-pipeline execution to optimize resource utilization
 14 | 
 15 | ## Installation
 16 | 
 17 | ### Docker (Recommended)
 18 | 
 19 | The easiest way to run the executor is using Docker. The Docker image includes **ODBC support with PostgreSQL drivers pre-installed**, making it ready for database connectivity out of the box.
 20 | 
 21 | ```bash
 22 | # Pull from GitHub Container Registry
 23 | docker pull ghcr.io/vigimite/aqueducts/aqueducts-executor:latest
 24 | 
 25 | # Run with command line arguments
 26 | docker run -d \
 27 |   --name aqueducts-executor \
 28 |   -p 3031:3031 \
 29 |   ghcr.io/vigimite/aqueducts/aqueducts-executor:latest \
 30 |   --api-key your_secret_key --max-memory 4
 31 | 
 32 | # Or run with environment variables
 33 | docker run -d \
 34 |   --name aqueducts-executor \
 35 |   -p 3031:3031 \
 36 |   -e AQUEDUCTS_API_KEY=your_secret_key \
 37 |   -e AQUEDUCTS_HOST=0.0.0.0 \
 38 |   -e AQUEDUCTS_PORT=3031 \
 39 |   -e AQUEDUCTS_MAX_MEMORY=4 \
 40 |   -e AQUEDUCTS_LOG_LEVEL=info \
 41 |   ghcr.io/vigimite/aqueducts/aqueducts-executor:latest
 42 | ```
 43 | 
 44 | ### Docker Compose
 45 | 
 46 | For local development, use the provided docker-compose setup:
 47 | 
 48 | ```bash
 49 | # Start just the database (default)
 50 | docker-compose up
 51 | 
 52 | # Start database + executor
 53 | docker-compose --profile executor up
 54 | 
 55 | # Build and start from source
 56 | docker-compose --profile executor up --build
 57 | ```
 58 | 
 59 | The executor will be available at `http://localhost:3031` with:
 60 | - API key: `test_secret_key` (configurable)
 61 | - Health check: `http://localhost:3031/api/health`
 62 | - WebSocket: `ws://localhost:3031/ws/connect`
 63 | 
 64 | ### Manual Installation
 65 | 
 66 | Install the application using cargo:
 67 | 
 68 | ```bash
 69 | # Standard installation with all cloud storage features
 70 | cargo install aqueducts-executor
 71 | 
 72 | # Installation with ODBC support
 73 | cargo install aqueducts-executor --features odbc
 74 | ```
 75 | 
 76 | ## Configuration Options
 77 | 
 78 | | Option          | Description                                         | Default        | Environment Variable    |
 79 | |-----------------|-----------------------------------------------------|----------------|-------------------------|
 80 | | `--api-key`     | API key for authentication                          | -              | `AQUEDUCTS_API_KEY`     |
 81 | | `--host`        | Host address to bind to                             | 0.0.0.0        | `AQUEDUCTS_HOST`        |
 82 | | `--port`        | Port to listen on                                   | 8080           | `AQUEDUCTS_PORT`        |
 83 | | `--max-memory`  | Maximum memory usage in GB (0 for unlimited)        | 0              | `AQUEDUCTS_MAX_MEMORY`  |
 84 | | `--server-url`  | URL of Aqueducts server for registration (optional) | -              | `AQUEDUCTS_SERVER_URL`  |
 85 | | `--executor-id` | Unique identifier for this executor                 | auto-generated | `AQUEDUCTS_EXECUTOR_ID` |
 86 | | `--log-level`   | Logging level (info, debug, trace)                  | info           | `AQUEDUCTS_LOG_LEVEL`   |
 87 | 
 88 | ## API Endpoints
 89 | 
 90 | | Endpoint       | Method | Auth | Description                                        |
 91 | |----------------|--------|------|----------------------------------------------------|
 92 | | `/api/health`  | GET    | No   | Basic health check                                 |
 93 | | `/ws/connect`  | GET    | Yes  | WebSocket endpoint for bidirectional communication |
 94 | 
 95 | ## ODBC Configuration Requirements
 96 | 
 97 | ODBC support requires the UnixODBC library to be installed on your system, along with any database-specific drivers.
 98 | 
 99 | ### Ubuntu/Debian
100 | ```bash
101 | # Install UnixODBC development libraries
102 | sudo apt-get update
103 | sudo apt-get install unixodbc-dev
104 | 
105 | # Add database-specific drivers (examples)
106 | # For PostgreSQL
107 | sudo apt-get install odbc-postgresql
108 | 
109 | # For MySQL
110 | sudo apt-get install libmyodbc
111 | ```
112 | 
113 | ### Fedora/RHEL/CentOS
114 | ```bash
115 | # Install UnixODBC development libraries
116 | sudo dnf install unixODBC-devel
117 | 
118 | # Add database-specific drivers (examples)
119 | # For PostgreSQL
120 | sudo dnf install postgresql-odbc
121 | 
122 | # For MySQL
123 | sudo dnf install mysql-connector-odbc
124 | ```
125 | 
126 | ### macOS
127 | ```bash
128 | # Install UnixODBC via Homebrew
129 | brew install unixodbc
130 | 
131 | # For database drivers, use Homebrew if available or download from the database vendor
132 | # PostgreSQL example
133 | brew install psqlodbc
134 | 
135 | # MySQL example
136 | brew install mysql-connector-c++
137 | ```
138 | 
139 | ## Example Usage
140 | 
141 | ### Using the CLI
142 | 
143 | ```bash
144 | # Connect to the executor
145 | aqueducts run --executor executor-host:3031 --api-key your_api_key --file pipeline.yml
146 | ```
147 | 
148 | ## Troubleshooting
149 | 
150 | Common issues and solutions:
151 | 
152 | 1. **Connection timeouts**: Check network connectivity and firewall rules
153 | 2. **Authentication failures**: Verify API key configuration and correct header usage (X-API-Key)
154 | 4. **Memory errors**: 
155 |    - Increase max memory allocation with the `--max-memory` parameter
156 |    - Optimize your pipeline by adding filtering earlier in the process
157 |    - Break large queries into smaller stages with intermediate results
158 | 5. **ODBC issues**:
159 |    - Verify your DSN configuration in `odbc.ini` and `odbcinst.ini`
160 |    - Run `isql -v YOUR_DSN YOUR_USERNAME YOUR_PASSWORD` to test connections
161 |    - Check that database-specific drivers are installed correctly
162 | 
163 | For more information on architecture and advanced usage, see the [Aqueducts Architecture Documentation](https://github.com/vigimite/aqueducts/blob/main/ARCHITECTURE.md).
164 | 


--------------------------------------------------------------------------------
/aqueducts-executor/src/api/auth.rs:
--------------------------------------------------------------------------------
 1 | use axum::{
 2 |     extract::{Request, State},
 3 |     middleware::Next,
 4 |     response::Response,
 5 | };
 6 | use tracing::{debug, warn};
 7 | 
 8 | use crate::{error::ExecutorError, ApiContextRef};
 9 | 
10 | /// The custom header for API key authentication
11 | const X_API_KEY_HEADER: &str = "X-API-Key";
12 | 
13 | /// Middleware function for API key authentication
14 | pub async fn require_api_key(
15 |     State(context): State<ApiContextRef>,
16 |     req: Request,
17 |     next: Next,
18 | ) -> Result<Response, ExecutorError> {
19 |     let api_key = req
20 |         .headers()
21 |         .get(X_API_KEY_HEADER)
22 |         .and_then(|value| value.to_str().ok());
23 | 
24 |     if let Some(provided) = api_key {
25 |         if provided == context.config.api_key {
26 |             debug!("API key authentication successful via X-API-Key header");
27 |             return Ok(next.run(req).await);
28 |         }
29 |     }
30 | 
31 |     warn!("Authentication failed: No valid API key provided");
32 |     Err(ExecutorError::AuthenticationFailed)
33 | }
34 | 


--------------------------------------------------------------------------------
/aqueducts-executor/src/api/mod.rs:
--------------------------------------------------------------------------------
  1 | use std::sync::Arc;
  2 | 
  3 | use aqueducts::prelude::*;
  4 | use axum::{
  5 |     extract::{
  6 |         ws::{Message, WebSocket},
  7 |         State, WebSocketUpgrade,
  8 |     },
  9 |     response::IntoResponse,
 10 |     routing::{any, get},
 11 |     Json, Router,
 12 | };
 13 | use futures::{SinkExt, StreamExt};
 14 | use serde::Serialize;
 15 | use tokio::sync::Mutex;
 16 | use tower_http::trace::{DefaultOnFailure, TraceLayer};
 17 | use tracing::{debug, error, info, instrument, Instrument, Level};
 18 | 
 19 | use crate::{
 20 |     executor::{execute_pipeline, ExecutionManager},
 21 |     ApiContextRef,
 22 | };
 23 | 
 24 | mod auth;
 25 | 
 26 | pub fn router(context: ApiContextRef) -> Router<ApiContextRef> {
 27 |     let public_routes = Router::new().route("/api/health", get(health_check));
 28 | 
 29 |     let protected_routes = Router::new().route("/ws/connect", any(ws_handler)).layer(
 30 |         axum::middleware::from_fn_with_state(context, auth::require_api_key),
 31 |     );
 32 | 
 33 |     Router::new()
 34 |         .merge(public_routes)
 35 |         .merge(protected_routes)
 36 |         .layer(TraceLayer::new_for_http().on_failure(DefaultOnFailure::new().level(Level::ERROR)))
 37 | }
 38 | 
 39 | #[derive(Serialize)]
 40 | struct HealthCheckResponse {
 41 |     status: String,
 42 | }
 43 | 
 44 | async fn health_check() -> Json<HealthCheckResponse> {
 45 |     let response = HealthCheckResponse {
 46 |         status: "OK".to_string(),
 47 |     };
 48 | 
 49 |     Json(response)
 50 | }
 51 | 
 52 | #[instrument(skip(ws, context), fields(executor_id = %context.config.executor_id))]
 53 | async fn ws_handler(
 54 |     ws: WebSocketUpgrade,
 55 |     State(context): State<ApiContextRef>,
 56 | ) -> impl IntoResponse {
 57 |     info!("Opening WebSocket connection");
 58 |     ws.on_upgrade(move |socket| {
 59 |         handle_socket(
 60 |             socket,
 61 |             context.manager.clone(),
 62 |             context.config.max_memory_gb,
 63 |         )
 64 |     })
 65 | }
 66 | 
 67 | #[instrument(skip(socket, manager), fields(max_memory_gb = ?max_memory_gb))]
 68 | async fn handle_socket(
 69 |     socket: WebSocket,
 70 |     manager: Arc<ExecutionManager>,
 71 |     max_memory_gb: Option<usize>,
 72 | ) {
 73 |     let (sender, mut receiver) = socket.split();
 74 |     let sender = Arc::new(Mutex::new(sender));
 75 | 
 76 |     debug!("WebSocket connection established");
 77 | 
 78 |     while let Some(Ok(msg)) = receiver.next().await {
 79 |         if let Message::Text(text) = msg {
 80 |             debug!(msg_len = text.len(), "Received message");
 81 | 
 82 |             match serde_json::from_str::<ClientMessage>(&text) {
 83 |                 Ok(ClientMessage::ExecutionRequest { pipeline }) => {
 84 |                     info!(
 85 |                         source_count = pipeline.sources.len(),
 86 |                         stage_count = pipeline.stages.len(),
 87 |                         "Received execution request"
 88 |                     );
 89 | 
 90 |                     // Queue execution
 91 |                     let (execution_id, mut queue_rx, mut progress_rx) = manager
 92 |                         .submit(move |execution_id, client_tx| {
 93 |                             Box::pin(async move {
 94 |                                 execute_pipeline(execution_id, client_tx, pipeline, max_memory_gb)
 95 |                                     .await
 96 |                             })
 97 |                         })
 98 |                         .await;
 99 | 
100 |                     info!(
101 |                         execution_id = %execution_id,
102 |                         "Execution submitted to queue"
103 |                     );
104 | 
105 |                     // forward queue updates
106 |                     let send_q = sender.clone();
107 |                     tokio::spawn(
108 |                         async move {
109 |                             debug!("Starting queue update forwarder");
110 |                             while let Ok(update) = queue_rx.recv().await {
111 |                                 if update.execution_id == execution_id {
112 |                                     debug!(position = update.position, "Queue position update");
113 |                                     let msg =
114 |                                         serde_json::to_string(&ExecutorMessage::QueuePosition {
115 |                                             execution_id: update.execution_id,
116 |                                             position: update.position,
117 |                                         })
118 |                                         .unwrap();
119 |                                     if let Err(e) =
120 |                                         send_q.lock().await.send(Message::text(msg)).await
121 |                                     {
122 |                                         error!("Failed to send queue update: {}", e);
123 |                                         break;
124 |                                     }
125 |                                 }
126 |                             }
127 |                             debug!("Queue update forwarder finished");
128 |                         }
129 |                         .instrument(
130 |                             tracing::info_span!("queue_forwarder", execution_id = %execution_id),
131 |                         ),
132 |                     );
133 | 
134 |                     // forward progress updates
135 |                     let send_p = sender.clone();
136 |                     tokio::spawn(
137 |                         async move {
138 |                             debug!("Starting progress update forwarder");
139 |                             while let Some(progress) = progress_rx.recv().await {
140 |                                 match serde_json::to_string(&progress) {
141 |                                     Ok(msg) => {
142 |                                         if let Err(e) =
143 |                                             send_p.lock().await.send(Message::text(msg)).await
144 |                                         {
145 |                                             error!("Failed to send progress update: {}", e);
146 |                                             break;
147 |                                         }
148 |                                     }
149 |                                     Err(e) => {
150 |                                         error!("Failed to serialize progress update: {}", e);
151 |                                     }
152 |                                 }
153 |                             }
154 |                             debug!("Progress update forwarder finished");
155 |                         }
156 |                         .instrument(
157 |                             tracing::info_span!("progress_forwarder", execution_id = %execution_id),
158 |                         ),
159 |                     );
160 |                 }
161 |                 Ok(ClientMessage::CancelRequest { execution_id }) => {
162 |                     info!(
163 |                         execution_id = %execution_id,
164 |                         "Received cancellation request"
165 |                     );
166 |                     manager.cancel(execution_id).await;
167 |                 }
168 |                 Err(e) => {
169 |                     error!(
170 |                         error = %e,
171 |                         "Failed to parse incoming message"
172 |                     );
173 |                 }
174 |             }
175 |         }
176 |     }
177 | 
178 |     info!("WebSocket connection closed");
179 | }
180 | 


--------------------------------------------------------------------------------
/aqueducts-executor/src/config.rs:
--------------------------------------------------------------------------------
 1 | use thiserror::Error;
 2 | use uuid::Uuid;
 3 | 
 4 | /// Errors that can occur during configuration validation
 5 | #[derive(Debug, Error)]
 6 | pub enum ConfigError {
 7 |     #[error("API key cannot be empty")]
 8 |     EmptyApiKey,
 9 | 
10 |     #[error("Max memory must be at least 1 GB")]
11 |     InvalidMemoryLimit,
12 | }
13 | 
14 | /// Configuration for the executor
15 | #[derive(Debug, Clone)]
16 | pub struct Config {
17 |     pub api_key: String,
18 |     pub executor_id: Uuid,
19 |     pub max_memory_gb: Option<usize>,
20 | }
21 | 
22 | impl Config {
23 |     /// Create a new config with validation
24 |     pub fn try_new(
25 |         api_key: String,
26 |         executor_id: Uuid,
27 |         max_memory_gb: Option<usize>,
28 |     ) -> Result<Self, ConfigError> {
29 |         if api_key.trim().is_empty() {
30 |             return Err(ConfigError::EmptyApiKey);
31 |         }
32 | 
33 |         if let Some(mem) = max_memory_gb {
34 |             if mem == 0 {
35 |                 return Err(ConfigError::InvalidMemoryLimit);
36 |             }
37 |         }
38 | 
39 |         Ok(Self {
40 |             api_key,
41 |             executor_id,
42 |             max_memory_gb,
43 |         })
44 |     }
45 | }
46 | 


--------------------------------------------------------------------------------
/aqueducts-executor/src/error.rs:
--------------------------------------------------------------------------------
 1 | use axum::{
 2 |     http::{header, StatusCode},
 3 |     response::{IntoResponse, Response},
 4 | };
 5 | use serde::Serialize;
 6 | use thiserror::Error;
 7 | 
 8 | #[derive(Debug, Error)]
 9 | pub enum ExecutorError {
10 |     #[error("Authentication failed")]
11 |     AuthenticationFailed,
12 | }
13 | 
14 | #[derive(Serialize)]
15 | struct ErrorResponse {
16 |     error: String,
17 | }
18 | 
19 | impl IntoResponse for ExecutorError {
20 |     fn into_response(self) -> Response {
21 |         let (status, error_response) = match &self {
22 |             ExecutorError::AuthenticationFailed => {
23 |                 let response = ErrorResponse {
24 |                     error: self.to_string(),
25 |                 };
26 |                 (StatusCode::UNAUTHORIZED, response)
27 |             }
28 |         };
29 | 
30 |         let body = serde_json::to_string(&error_response)
31 |             .unwrap_or_else(|_| format!("{{\"error\": \"{}\"}}", self));
32 | 
33 |         let mut response = Response::new(body.into());
34 |         *response.status_mut() = status;
35 | 
36 |         response.headers_mut().insert(
37 |             header::CONTENT_TYPE,
38 |             header::HeaderValue::from_static("application/json"),
39 |         );
40 | 
41 |         response
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/aqueducts-executor/src/executor/mod.rs:
--------------------------------------------------------------------------------
  1 | use std::sync::Arc;
  2 | 
  3 | use aqueducts::prelude::*;
  4 | use datafusion::{execution::runtime_env::RuntimeEnvBuilder, prelude::SessionContext};
  5 | use futures::future::BoxFuture;
  6 | use tokio::sync::mpsc;
  7 | use tracing::{error, info, instrument};
  8 | use uuid::Uuid;
  9 | 
 10 | pub use manager::ExecutionManager;
 11 | pub use progress_tracker::ExecutorProgressTracker;
 12 | 
 13 | mod manager;
 14 | mod progress_tracker;
 15 | mod queue;
 16 | 
 17 | /// Broadcast when queue positions change
 18 | #[derive(Debug, Clone)]
 19 | pub struct QueueUpdate {
 20 |     pub execution_id: Uuid,
 21 |     pub position: usize,
 22 | }
 23 | 
 24 | /// An aqueduct pipeline execution
 25 | pub struct Execution {
 26 |     pub id: Uuid,
 27 |     pub handler: BoxFuture<'static, ()>,
 28 | }
 29 | 
 30 | /// Execute an aqueduct pipeline communicating progress back to clients via websocket
 31 | #[instrument(skip(client_tx, pipeline), fields(source_count = pipeline.sources.len(), stage_count = pipeline.stages.len()))]
 32 | pub async fn execute_pipeline(
 33 |     execution_id: Uuid,
 34 |     client_tx: mpsc::Sender<ExecutorMessage>,
 35 |     pipeline: Aqueduct,
 36 |     max_memory_gb: Option<usize>,
 37 | ) {
 38 |     info!(execution_id = %execution_id, "Starting pipeline execution setup");
 39 | 
 40 |     let mut ctx = if let Some(memory_gb) = max_memory_gb {
 41 |         // Convert max_memory_gb directly to bytes (GB * 1024^3)
 42 |         let max_memory_bytes = memory_gb * 1024 * 1024 * 1024;
 43 | 
 44 |         info!(
 45 |             execution_id = %execution_id,
 46 |             memory_gb = memory_gb,
 47 |             memory_bytes = max_memory_bytes,
 48 |             "Creating runtime environment with memory limit"
 49 |         );
 50 | 
 51 |         // Use 0.95 as the memory use percentage (allowing 95% of the limit to be used)
 52 |         let runtime_env = match RuntimeEnvBuilder::new()
 53 |             .with_memory_limit(max_memory_bytes, 0.95)
 54 |             .build_arc()
 55 |         {
 56 |             Ok(env) => env,
 57 |             Err(e) => {
 58 |                 error!(execution_id = %execution_id, error = %e, "Failed to build runtime environment");
 59 |                 let _ = client_tx
 60 |                     .send(ExecutorMessage::ExecutionError {
 61 |                         execution_id,
 62 |                         message: format!("Failed to build runtime environment: {}", e),
 63 |                     })
 64 |                     .await;
 65 |                 return;
 66 |             }
 67 |         };
 68 | 
 69 |         let config = datafusion::execution::config::SessionConfig::new();
 70 |         SessionContext::new_with_config_rt(config, runtime_env)
 71 |     } else {
 72 |         info!(execution_id = %execution_id, "Using session with unlimited memory allocation");
 73 |         SessionContext::new()
 74 |     };
 75 | 
 76 |     aqueducts::custom_udfs::register_all(&mut ctx).expect("failed to register custom_udfs");
 77 | 
 78 |     let num_sources = pipeline.sources.len();
 79 |     let num_stages = pipeline
 80 |         .stages
 81 |         .iter()
 82 |         .map(|s| s.len())
 83 |         .reduce(|acc, e| acc + e)
 84 |         .unwrap_or(0);
 85 |     let num_destinations = pipeline.destination.is_some() as usize;
 86 | 
 87 |     let total_steps = num_sources
 88 |         + num_stages * 2 // 2 progress events per stage (started, completed)
 89 |         + num_destinations;
 90 | 
 91 |     info!(
 92 |         execution_id = %execution_id,
 93 |         total_steps = total_steps,
 94 |         "Creating progress tracker"
 95 |     );
 96 | 
 97 |     let progress_tracker = Arc::new(ExecutorProgressTracker::new(
 98 |         client_tx.clone(),
 99 |         execution_id,
100 |         total_steps,
101 |     ));
102 | 
103 |     info!(execution_id = %execution_id, "Starting pipeline execution");
104 |     let result = run_pipeline(Arc::new(ctx), pipeline, Some(progress_tracker)).await;
105 | 
106 |     match result {
107 |         Ok(_) => {
108 |             info!(execution_id = %execution_id, "Pipeline executed successfully");
109 |             if let Err(e) = client_tx
110 |                 .send(ExecutorMessage::ExecutionSucceeded { execution_id })
111 |                 .await
112 |             {
113 |                 error!(
114 |                     execution_id = %execution_id,
115 |                     error = %e,
116 |                     "Failed to send error message to client"
117 |                 );
118 |             }
119 |         }
120 |         Err(error) => {
121 |             error!(execution_id = %execution_id, error = %error, "Pipeline execution failed");
122 |             if let Err(e) = client_tx
123 |                 .send(ExecutorMessage::ExecutionError {
124 |                     execution_id,
125 |                     message: error.to_string(),
126 |                 })
127 |                 .await
128 |             {
129 |                 error!(
130 |                     execution_id = %execution_id,
131 |                     error = %e,
132 |                     "Failed to send error message to client"
133 |                 );
134 |             }
135 |         }
136 |     }
137 | }
138 | 


--------------------------------------------------------------------------------
/aqueducts-executor/src/executor/progress_tracker.rs:
--------------------------------------------------------------------------------
  1 | use aqueducts::prelude::*;
  2 | use itertools::Itertools;
  3 | use std::sync::atomic::AtomicUsize;
  4 | use tokio::runtime::Handle;
  5 | use tokio::sync::mpsc;
  6 | use tracing::{debug, error, info, instrument};
  7 | use uuid::Uuid;
  8 | 
  9 | const MAX_MESSAGE_CHARS: usize = 32_000;
 10 | 
 11 | /// Implementation of ProgressTracker for the executor
 12 | pub struct ExecutorProgressTracker {
 13 |     client_tx: mpsc::Sender<ExecutorMessage>,
 14 |     execution_id: Uuid,
 15 |     total_steps: usize,
 16 |     completed_steps: AtomicUsize,
 17 | }
 18 | 
 19 | impl ExecutorProgressTracker {
 20 |     pub fn new(
 21 |         client_tx: mpsc::Sender<ExecutorMessage>,
 22 |         execution_id: Uuid,
 23 |         total_steps: usize,
 24 |     ) -> Self {
 25 |         info!(
 26 |             execution_id = %execution_id,
 27 |             total_steps = total_steps,
 28 |             "Creating executor progress tracker"
 29 |         );
 30 |         Self {
 31 |             client_tx,
 32 |             execution_id,
 33 |             total_steps,
 34 |             completed_steps: AtomicUsize::new(0),
 35 |         }
 36 |     }
 37 | 
 38 |     /// Calculate progress percentage based on completed steps
 39 |     fn calculate_progress(&self, current: usize) -> u8 {
 40 |         let progress = ((current as f32) / (self.total_steps as f32) * 100.0) as u8;
 41 |         debug!(
 42 |             execution_id = %self.execution_id,
 43 |             current_step = current,
 44 |             total_steps = self.total_steps,
 45 |             progress = progress,
 46 |             "Calculated execution progress"
 47 |         );
 48 |         progress
 49 |     }
 50 | 
 51 |     /// Helper to send a message asynchronously
 52 |     fn send_message(&self, message: ExecutorMessage) {
 53 |         let tx = self.client_tx.clone();
 54 |         let execution_id = self.execution_id;
 55 | 
 56 |         Handle::current().spawn(async move {
 57 |             debug!(execution_id = %execution_id, "Sending progress message");
 58 |             match tx.send(message).await {
 59 |                 Ok(_) => debug!(execution_id = %execution_id, "Progress message sent successfully"),
 60 |                 Err(e) => error!(execution_id = %execution_id, error = %e, "Failed to send progress message"),
 61 |             }
 62 |         });
 63 |     }
 64 | }
 65 | 
 66 | impl ProgressTracker for ExecutorProgressTracker {
 67 |     #[instrument(skip(self, event), fields(execution_id = %self.execution_id))]
 68 |     fn on_progress(&self, event: ProgressEvent) {
 69 |         debug!("Processing progress event");
 70 | 
 71 |         let current = self
 72 |             .completed_steps
 73 |             .fetch_add(1, std::sync::atomic::Ordering::SeqCst)
 74 |             + 1;
 75 |         let progress = self.calculate_progress(current);
 76 | 
 77 |         let message = ExecutorMessage::ProgressUpdate {
 78 |             execution_id: self.execution_id,
 79 |             progress,
 80 |             event,
 81 |         };
 82 | 
 83 |         // Send the progress update via the channel
 84 |         self.send_message(message);
 85 |     }
 86 | 
 87 |     #[instrument(skip(self, schema, batches), fields(execution_id = %self.execution_id, stage = %stage_name, output_type = ?output_type))]
 88 |     fn on_output(
 89 |         &self,
 90 |         stage_name: &str,
 91 |         output_type: OutputType,
 92 |         schema: &datafusion::common::DFSchema,
 93 |         batches: &[datafusion::arrow::array::RecordBatch],
 94 |     ) {
 95 |         debug!("Processing stage output");
 96 | 
 97 |         // Generate output header based on type
 98 |         let output_header = match output_type {
 99 |             OutputType::Show => {
100 |                 format!("\n📋 Table Data: {stage_name}\n───────────────────────────────────────\n")
101 |             }
102 |             OutputType::ShowLimit => format!(
103 |                 "\n📋 Table Data (Preview): {stage_name}\n───────────────────────────────────────\n"
104 |             ),
105 |             OutputType::Explain => {
106 |                 format!("\n🔍 Query Plan: {stage_name}\n───────────────────────────────────────\n")
107 |             }
108 |             OutputType::ExplainAnalyze => format!(
109 |                 "\n📊 Query Metrics: {stage_name}\n───────────────────────────────────────\n"
110 |             ),
111 |             OutputType::PrintSchema => format!(
112 |                 "\n🔢 Schema: {stage_name}\n───────────────────────────────────────\n{schema:#?}\n"
113 |             ),
114 |         };
115 | 
116 |         self.send_message(ExecutorMessage::StageOutput {
117 |             execution_id: self.execution_id,
118 |             stage_name: stage_name.to_string(),
119 |             payload: StageOutputMessage::OutputStart { output_header },
120 |         });
121 | 
122 |         let output = match datafusion::arrow::util::pretty::pretty_format_batches(batches) {
123 |             Ok(output) => output,
124 |             Err(e) => {
125 |                 error!(error = %e, "Failed to format stage output");
126 |                 return;
127 |             }
128 |         };
129 | 
130 |         let output_str = output.to_string();
131 |         let chunks = chunk_by_chars(&output_str, MAX_MESSAGE_CHARS);
132 | 
133 |         info!(
134 |             chunk_count = chunks.len(),
135 |             total_size = output_str.len(),
136 |             "Chunking stage output"
137 |         );
138 | 
139 |         for (sequence, chunk) in chunks.into_iter().enumerate() {
140 |             debug!(
141 |                 sequence = sequence,
142 |                 chunk_size = chunk.len(),
143 |                 "Sending output chunk"
144 |             );
145 | 
146 |             self.send_message(ExecutorMessage::StageOutput {
147 |                 execution_id: self.execution_id,
148 |                 stage_name: stage_name.to_string(),
149 |                 payload: StageOutputMessage::OutputChunk {
150 |                     sequence,
151 |                     body: chunk,
152 |                 },
153 |             });
154 |         }
155 | 
156 |         self.send_message(ExecutorMessage::StageOutput {
157 |             execution_id: self.execution_id,
158 |             stage_name: stage_name.to_string(),
159 |             payload: StageOutputMessage::OutputEnd {
160 |                 output_footer: String::from(""),
161 |             },
162 |         });
163 | 
164 |         debug!("Stage output processing complete");
165 |     }
166 | }
167 | 
168 | fn chunk_by_chars(s: &str, max_chars: usize) -> Vec<String> {
169 |     s.chars()
170 |         .chunks(max_chars)
171 |         .into_iter()
172 |         .map(|chunk| chunk.collect())
173 |         .collect()
174 | }
175 | 


--------------------------------------------------------------------------------
/aqueducts-executor/src/executor/queue.rs:
--------------------------------------------------------------------------------
  1 | use std::collections::VecDeque;
  2 | 
  3 | use tokio::sync::broadcast;
  4 | 
  5 | use super::{Execution, QueueUpdate};
  6 | 
  7 | /// Queue of pending jobs + broadcaster for queue updates
  8 | pub struct ExecutionQueue {
  9 |     queue: VecDeque<Execution>,
 10 |     broadcaster: broadcast::Sender<QueueUpdate>,
 11 | }
 12 | 
 13 | impl ExecutionQueue {
 14 |     pub fn new(capacity: usize) -> Self {
 15 |         let (tx, _rx) = broadcast::channel(capacity);
 16 |         Self {
 17 |             queue: VecDeque::new(),
 18 |             broadcaster: tx,
 19 |         }
 20 |     }
 21 | 
 22 |     pub fn enqueue(&mut self, job: Execution) -> broadcast::Receiver<QueueUpdate> {
 23 |         let rx = self.broadcaster.subscribe();
 24 |         self.queue.push_back(job);
 25 |         self.broadcast_positions();
 26 |         rx
 27 |     }
 28 | 
 29 |     pub fn dequeue(&mut self) -> Option<Execution> {
 30 |         let job = self.queue.pop_front();
 31 |         if job.is_some() {
 32 |             self.broadcast_positions();
 33 |         }
 34 |         job
 35 |     }
 36 | 
 37 |     fn broadcast_positions(&self) {
 38 |         for (idx, execution) in self.queue.iter().enumerate() {
 39 |             let _ = self.broadcaster.send(QueueUpdate {
 40 |                 execution_id: execution.id,
 41 |                 position: idx,
 42 |             });
 43 |         }
 44 |     }
 45 | }
 46 | 
 47 | // Unit tests for ExecutionQueue
 48 | #[cfg(test)]
 49 | mod tests {
 50 |     use super::*;
 51 |     use uuid::Uuid;
 52 | 
 53 |     fn dummy_execution() -> (Uuid, Execution) {
 54 |         let id = Uuid::new_v4();
 55 |         let handler = Box::pin(async move {});
 56 |         let execution = Execution { id, handler };
 57 | 
 58 |         (id, execution)
 59 |     }
 60 | 
 61 |     #[tokio::test]
 62 |     async fn enqueue_broadcasts_position() {
 63 |         let mut queue = ExecutionQueue::new(10);
 64 |         let (execution_id, job) = dummy_execution();
 65 | 
 66 |         let mut rx = queue.enqueue(job);
 67 |         let update_event = rx.recv().await.unwrap();
 68 | 
 69 |         assert_eq!(update_event.execution_id, execution_id);
 70 |         assert_eq!(update_event.position, 0);
 71 |     }
 72 | 
 73 |     #[tokio::test]
 74 |     async fn enqueue_two_broadcasts_both_positions() {
 75 |         let mut queue = ExecutionQueue::new(10);
 76 |         let (execution_id_1, execution_1) = dummy_execution();
 77 |         let mut rx_1 = queue.enqueue(execution_1);
 78 | 
 79 |         // consume initial update for execution_1
 80 |         let _ = rx_1.recv().await.unwrap();
 81 | 
 82 |         let (execution_id_2, execution_2) = dummy_execution();
 83 |         let mut rx_2 = queue.enqueue(execution_2);
 84 | 
 85 |         // rx_1 should receive both execution_1 and execution_2 positions
 86 |         let update_event_1 = rx_1.recv().await.unwrap();
 87 |         let update_event_2 = rx_1.recv().await.unwrap();
 88 |         assert_eq!(update_event_1.execution_id, execution_id_1);
 89 |         assert_eq!(update_event_1.position, 0);
 90 |         assert_eq!(update_event_2.execution_id, execution_id_2);
 91 |         assert_eq!(update_event_2.position, 1);
 92 | 
 93 |         // rx_2 should also get both
 94 |         let update_event_1 = rx_2.recv().await.unwrap();
 95 |         let update_event_2 = rx_2.recv().await.unwrap();
 96 |         assert_eq!(update_event_1.execution_id, execution_id_1);
 97 |         assert_eq!(update_event_2.execution_id, execution_id_2);
 98 |     }
 99 | 
100 |     #[tokio::test]
101 |     async fn dequeue_broadcasts_updated_positions() {
102 |         let mut queue = ExecutionQueue::new(10);
103 |         let (execution_id_1, execution_1) = dummy_execution();
104 |         let mut rx = queue.enqueue(execution_1);
105 | 
106 |         // consume initial update for execution_1
107 |         let _ = rx.recv().await.unwrap();
108 | 
109 |         let (execution_id_2, execution_2) = dummy_execution();
110 |         let _ = queue.enqueue(execution_2);
111 | 
112 |         // consume updates from enqueue execution_2
113 |         let _ = rx.recv().await.unwrap();
114 |         let _ = rx.recv().await.unwrap();
115 | 
116 |         let removed = queue.dequeue().unwrap();
117 |         assert_eq!(removed.id, execution_id_1);
118 | 
119 |         // consume update from dequeue
120 |         let upd = rx.recv().await.unwrap();
121 |         assert_eq!(upd.execution_id, execution_id_2);
122 |         assert_eq!(upd.position, 0);
123 |     }
124 | }
125 | 


--------------------------------------------------------------------------------
/aqueducts-executor/src/main.rs:
--------------------------------------------------------------------------------
  1 | use std::{net::SocketAddr, str::FromStr, sync::Arc, time::Duration};
  2 | 
  3 | use axum::Router;
  4 | use clap::Parser;
  5 | use config::Config;
  6 | use executor::ExecutionManager;
  7 | use tokio::signal;
  8 | use tokio_util::sync::CancellationToken;
  9 | use tracing::{error, info, Level};
 10 | use tracing_subscriber::{fmt, prelude::*, EnvFilter};
 11 | use uuid::Uuid;
 12 | 
 13 | mod api;
 14 | mod config;
 15 | mod error;
 16 | mod executor;
 17 | 
 18 | /// Remote executor for Aqueducts data pipeline framework
 19 | #[derive(Debug, Parser)]
 20 | #[command(version, about, long_about = None)]
 21 | struct Cli {
 22 |     /// API key for authentication
 23 |     #[arg(long, env = "AQUEDUCTS_API_KEY")]
 24 |     api_key: String,
 25 | 
 26 |     /// Host address to bind to
 27 |     #[arg(long, env = "AQUEDUCTS_HOST", default_value = "0.0.0.0")]
 28 |     host: String,
 29 | 
 30 |     /// Port to listen on
 31 |     #[arg(long, env = "AQUEDUCTS_PORT", default_value = "3031")]
 32 |     port: u16,
 33 | 
 34 |     /// Maximum memory usage in GB (optional)
 35 |     #[arg(long, env = "AQUEDUCTS_MAX_MEMORY")]
 36 |     max_memory: Option<usize>,
 37 | 
 38 |     /// URL of Aqueducts server for registration (optional)
 39 |     #[arg(long, env = "AQUEDUCTS_SERVER_URL")]
 40 |     server_url: Option<String>,
 41 | 
 42 |     /// Unique identifier for this executor (optional)
 43 |     #[arg(long, env = "AQUEDUCTS_EXECUTOR_ID")]
 44 |     executor_id: Option<Uuid>,
 45 | 
 46 |     /// Logging level (info, debug, trace)
 47 |     #[arg(long, env = "AQUEDUCTS_LOG_LEVEL", default_value = "info")]
 48 |     log_level: String,
 49 | }
 50 | 
 51 | type ApiContextRef = Arc<ApiContext>;
 52 | 
 53 | pub struct ApiContext {
 54 |     pub config: Config,
 55 |     pub manager: Arc<ExecutionManager>,
 56 | }
 57 | 
 58 | impl ApiContext {
 59 |     pub fn new(config: Config) -> Self {
 60 |         Self {
 61 |             config,
 62 |             manager: Arc::new(ExecutionManager::new(100)),
 63 |         }
 64 |     }
 65 | }
 66 | 
 67 | #[tokio::main]
 68 | async fn main() {
 69 |     let cli = Cli::parse();
 70 | 
 71 |     let log_level = Level::from_str(cli.log_level.to_lowercase().as_str()).unwrap_or(Level::INFO);
 72 |     tracing_subscriber::registry()
 73 |         .with(
 74 |             fmt::layer()
 75 |                 .json()
 76 |                 .with_current_span(true)
 77 |                 .with_span_list(true)
 78 |                 .with_target(true),
 79 |         )
 80 |         .with(EnvFilter::from_default_env().add_directive(log_level.into()))
 81 |         .init();
 82 | 
 83 |     let executor_id = cli.executor_id.unwrap_or_else(Uuid::new_v4);
 84 |     info!(
 85 |         executor_id = %executor_id,
 86 |         version = %env!("CARGO_PKG_VERSION"),
 87 |         "Starting Aqueducts Executor"
 88 |     );
 89 | 
 90 |     let config = match Config::try_new(cli.api_key, executor_id, cli.max_memory) {
 91 |         Ok(config) => config,
 92 |         Err(e) => {
 93 |             error!("Configuration error: {}", e);
 94 |             std::process::exit(1);
 95 |         }
 96 |     };
 97 | 
 98 |     info!(
 99 |         executor_id = %config.executor_id,
100 |         max_memory_gb = ?config.max_memory_gb,
101 |         "Configuration validated successfully"
102 |     );
103 | 
104 |     let context = Arc::new(ApiContext::new(config));
105 | 
106 |     // Create shutdown signal handler
107 |     let shutdown_token = CancellationToken::new();
108 |     let shutdown_token_ = shutdown_token.clone();
109 | 
110 |     // Spawn a task to handle shutdown signals
111 |     tokio::spawn(async move {
112 |         handle_shutdown_signals(shutdown_token_).await;
113 |     });
114 | 
115 |     // Start the execution manager
116 |     let manager_handle = {
117 |         let manager = context.manager.clone();
118 |         tokio::spawn(async move {
119 |             manager.start().await;
120 |         })
121 |     };
122 | 
123 |     let app = Router::new()
124 |         .merge(api::router(Arc::clone(&context)))
125 |         .with_state(context);
126 | 
127 |     let addr: SocketAddr = match format!("{}:{}", cli.host, cli.port).parse() {
128 |         Ok(addr) => addr,
129 |         Err(e) => {
130 |             error!("Failed to parse socket address: {}", e);
131 |             std::process::exit(1);
132 |         }
133 |     };
134 | 
135 |     info!(addr = %addr, "Listening for connections");
136 |     let listener = match tokio::net::TcpListener::bind(addr).await {
137 |         Ok(listener) => listener,
138 |         Err(e) => {
139 |             error!("Failed to bind to address {}: {}", addr, e);
140 |             std::process::exit(1);
141 |         }
142 |     };
143 | 
144 |     info!("Server started, press Ctrl+C to stop");
145 |     let server_handle = axum::serve(listener, app)
146 |         .with_graceful_shutdown(shutdown_signal_handler(shutdown_token))
147 |         .await;
148 | 
149 |     match server_handle {
150 |         Ok(_) => info!("Server shut down gracefully"),
151 |         Err(e) => error!(error = %e, "Server error during shutdown"),
152 |     }
153 | 
154 |     info!("Forcing shutdown of the execution manager");
155 |     drop(manager_handle);
156 | 
157 |     info!("Aqueducts executor shutdown complete");
158 | }
159 | 
160 | /// Handler function for shutdown signals
161 | async fn handle_shutdown_signals(shutdown_token: CancellationToken) {
162 |     let ctrl_c = async {
163 |         signal::ctrl_c()
164 |             .await
165 |             .expect("Failed to install Ctrl+C handler");
166 |     };
167 | 
168 |     #[cfg(unix)]
169 |     let terminate = async {
170 |         signal::unix::signal(signal::unix::SignalKind::terminate())
171 |             .expect("Failed to install signal handler")
172 |             .recv()
173 |             .await;
174 |     };
175 | 
176 |     #[cfg(not(unix))]
177 |     let terminate = std::future::pending::<()>();
178 | 
179 |     tokio::select! {
180 |         _ = ctrl_c => {
181 |             info!("Received Ctrl+C, starting graceful shutdown");
182 |         },
183 |         _ = terminate => {
184 |             info!("Received SIGTERM, starting graceful shutdown");
185 |         },
186 |     }
187 | 
188 |     // Signal the server to shut down
189 |     shutdown_token.cancel();
190 | }
191 | 
192 | /// Returns a future that resolves when the shutdown signal is received
193 | async fn shutdown_signal_handler(token: CancellationToken) {
194 |     token.cancelled().await;
195 |     info!("Shutdown signal received, starting graceful shutdown");
196 | 
197 |     // Give in-flight requests some time to complete
198 |     tokio::time::sleep(Duration::from_secs(1)).await;
199 | }
200 | 


--------------------------------------------------------------------------------
/aqueducts/core/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "aqueducts-core"
 3 | authors.workspace = true
 4 | edition.workspace = true
 5 | description.workspace = true
 6 | repository.workspace = true
 7 | readme.workspace = true
 8 | version.workspace = true
 9 | homepage.workspace = true
10 | keywords.workspace = true
11 | categories.workspace = true
12 | license.workspace = true
13 | 
14 | [features]
15 | default = ["yaml"]
16 | s3 = ["object_store/aws", "aqueducts-delta?/s3"]
17 | gcs = ["object_store/gcp", "aqueducts-delta?/gcs"]  
18 | azure = ["object_store/azure", "aqueducts-delta?/azure"]
19 | odbc = ["dep:aqueducts-odbc"]
20 | delta = ["dep:aqueducts-delta"]
21 | json = ["dep:serde_json"]
22 | yaml = ["dep:serde_yml"]
23 | toml = ["dep:toml"]
24 | custom_udfs = ["dep:datafusion-functions-json", "dep:serde_json"]
25 | 
26 | [dependencies]
27 | aqueducts-schemas.workspace = true
28 | 
29 | # Optional provider dependencies
30 | aqueducts-delta = { workspace = true, optional = true }
31 | aqueducts-odbc = { workspace = true, optional = true }
32 | 
33 | datafusion.workspace = true
34 | datafusion-functions-json = { workspace = true, optional = true }
35 | object_store = { version = "0.12", default-features = false }
36 | 
37 | serde.workspace = true
38 | serde_json = { workspace = true, optional = true }
39 | serde_yml = { workspace = true, optional = true }
40 | toml = { workspace = true, optional = true }
41 | 
42 | tokio.workspace = true
43 | 
44 | thiserror.workspace = true
45 | 
46 | tracing.workspace = true
47 | 
48 | url.workspace = true
49 | regex.workspace = true
50 | 
51 | [dev-dependencies]
52 | tokio = { workspace = true, features = ["full"] }
53 | rand.workspace = true
54 | tracing-test.workspace = true
55 | tempfile = "3"
56 | 


--------------------------------------------------------------------------------
/aqueducts/core/src/custom_udfs.rs:
--------------------------------------------------------------------------------
  1 | use datafusion::arrow::array::{as_string_array, Array, ListBuilder, StringBuilder};
  2 | use datafusion::error::DataFusionError;
  3 | use datafusion::execution::FunctionRegistry;
  4 | use datafusion::logical_expr::Volatility;
  5 | use datafusion::{
  6 |     arrow::array::ArrayRef,
  7 |     arrow::datatypes::DataType,
  8 |     arrow::datatypes::Field,
  9 |     logical_expr::{create_udf, ScalarUDF},
 10 |     physical_plan::ColumnarValue,
 11 | };
 12 | use std::sync::Arc;
 13 | 
 14 | fn unnest_json_array_udf() -> datafusion::logical_expr::ScalarUDF {
 15 |     let fun = Arc::new(
 16 |         |args: &[ColumnarValue]| -> datafusion::error::Result<ColumnarValue> {
 17 |             assert_eq!(args.len(), 1);
 18 | 
 19 |             let arrays = ColumnarValue::values_to_arrays(args)?;
 20 |             let sarr = as_string_array(&arrays[0]);
 21 | 
 22 |             let mut builder = ListBuilder::new(StringBuilder::new());
 23 | 
 24 |             for i in 0..sarr.len() {
 25 |                 if sarr.is_null(i) {
 26 |                     builder.append(false);
 27 |                 } else {
 28 |                     let txt = sarr.value(i);
 29 |                     let v: serde_json::Value = serde_json::from_str(txt)
 30 |                         .map_err(|e| DataFusionError::Execution(e.to_string()))?;
 31 | 
 32 |                     if let serde_json::Value::Array(elems) = v {
 33 |                         for elem in elems {
 34 |                             let s = elem.to_string();
 35 |                             builder.values().append_value(&s);
 36 |                         }
 37 |                         builder.append(true);
 38 |                     } else {
 39 |                         return Err(DataFusionError::Execution(format!(
 40 |                             "unnest_json_array: expected JSON array, got {}",
 41 |                             v
 42 |                         )));
 43 |                     }
 44 |                 }
 45 |             }
 46 | 
 47 |             let array = builder.finish();
 48 |             Ok(ColumnarValue::Array(Arc::new(array) as ArrayRef))
 49 |         },
 50 |     );
 51 | 
 52 |     create_udf(
 53 |         "unnest_json_array",
 54 |         vec![DataType::Utf8],
 55 |         DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))),
 56 |         Volatility::Immutable,
 57 |         fun,
 58 |     )
 59 | }
 60 | 
 61 | pub fn register_all(registry: &mut dyn FunctionRegistry) -> datafusion::error::Result<()> {
 62 |     let functions: Vec<Arc<ScalarUDF>> = vec![Arc::new(unnest_json_array_udf())];
 63 | 
 64 |     for function in functions {
 65 |         registry.register_udf(function)?;
 66 |     }
 67 | 
 68 |     datafusion_functions_json::register_all(registry)?;
 69 | 
 70 |     Ok(())
 71 | }
 72 | 
 73 | #[cfg(test)]
 74 | mod tests {
 75 |     use super::*;
 76 |     use datafusion::{
 77 |         arrow::array::RecordBatch, assert_batches_sorted_eq, common::DFSchema, prelude::*,
 78 |     };
 79 | 
 80 |     async fn prepare_df(json: &str) -> (DFSchema, Vec<RecordBatch>) {
 81 |         let ctx = SessionContext::new();
 82 |         ctx.register_udf(unnest_json_array_udf());
 83 |         let df = ctx
 84 |             .sql(&format!(
 85 |                 "SELECT unnest_json_array(c) AS arr \
 86 |                  FROM (VALUES ('{}')) AS t(c)",
 87 |                 json
 88 |             ))
 89 |             .await
 90 |             .unwrap();
 91 |         let schema = df.schema().clone();
 92 |         let batches = df.collect().await.unwrap();
 93 |         (schema, batches)
 94 |     }
 95 | 
 96 |     #[tokio::test]
 97 |     async fn test_unnest_json_array_numbers() {
 98 |         let (schema, batches) = prepare_df("[1, 2, 3]").await;
 99 | 
100 |         let field = schema.field(0);
101 |         let expected_type = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true)));
102 |         assert_eq!(
103 |             field.data_type(),
104 |             &expected_type,
105 |             "Expected return type List<item:Utf8>, got {:?}",
106 |             field.data_type()
107 |         );
108 | 
109 |         let expected = [
110 |             "+-----------+",
111 |             "| arr       |",
112 |             "+-----------+",
113 |             "| [1, 2, 3] |",
114 |             "+-----------+",
115 |         ];
116 |         assert_batches_sorted_eq!(&expected, &batches);
117 |     }
118 | 
119 |     #[tokio::test]
120 |     async fn test_unnest_json_array_strings() {
121 |         let (schema, batches) = prepare_df(r#"["foo", "bar"]"#).await;
122 | 
123 |         let field = schema.field(0);
124 |         let expected_type = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true)));
125 |         assert_eq!(
126 |             field.data_type(),
127 |             &expected_type,
128 |             "Expected return type List<item:Utf8>, got {:?}",
129 |             field.data_type()
130 |         );
131 | 
132 |         let expected = [
133 |             "+----------------+",
134 |             "| arr            |",
135 |             "+----------------+",
136 |             "| [\"foo\", \"bar\"] |",
137 |             "+----------------+",
138 |         ];
139 |         assert_batches_sorted_eq!(&expected, &batches);
140 |     }
141 | 
142 |     #[tokio::test]
143 |     async fn test_unnest_json_array_objects() {
144 |         let (schema, batches) = prepare_df(r#"[{"x":1}, {"y":"foo"}]"#).await;
145 | 
146 |         let field = schema.field(0);
147 |         let expected_type = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true)));
148 |         assert_eq!(
149 |             field.data_type(),
150 |             &expected_type,
151 |             "Expected return type List<item:Utf8>, got {:?}",
152 |             field.data_type()
153 |         );
154 | 
155 |         let expected = [
156 |             "+------------------------+",
157 |             "| arr                    |",
158 |             "+------------------------+",
159 |             "| [{\"x\":1}, {\"y\":\"foo\"}] |",
160 |             "+------------------------+",
161 |         ];
162 |         assert_batches_sorted_eq!(&expected, &batches);
163 |     }
164 | }
165 | 


--------------------------------------------------------------------------------
/aqueducts/core/src/destinations/mod.rs:
--------------------------------------------------------------------------------
  1 | use aqueducts_schemas::Destination;
  2 | use datafusion::{dataframe::DataFrame, datasource::MemTable, execution::context::SessionContext};
  3 | use std::sync::Arc;
  4 | use tracing::{debug, instrument};
  5 | 
  6 | use crate::error::{AqueductsError, Result};
  7 | use crate::store::register_object_store;
  8 | 
  9 | pub mod file;
 10 | 
 11 | /// Creates a `Destination`
 12 | #[instrument(skip(ctx, destination), err)]
 13 | pub async fn register_destination(
 14 |     ctx: Arc<SessionContext>,
 15 |     destination: &Destination,
 16 | ) -> Result<()> {
 17 |     match destination {
 18 |         Destination::InMemory(_) => Ok(()),
 19 |         Destination::File(file_def) => {
 20 |             register_object_store(ctx, &file_def.location, &file_def.storage_config)?;
 21 |             Ok(())
 22 |         }
 23 |         #[cfg(feature = "odbc")]
 24 |         Destination::Odbc(odbc_dest) => {
 25 |             debug!("Preparing ODBC destination '{}'", odbc_dest.name);
 26 |             aqueducts_odbc::register_odbc_destination(
 27 |                 &odbc_dest.connection_string,
 28 |                 &odbc_dest.name,
 29 |             )
 30 |             .await
 31 |             .map_err(|e| {
 32 |                 AqueductsError::destination(
 33 |                     &odbc_dest.name,
 34 |                     format!("ODBC destination error: {}", e),
 35 |                 )
 36 |             })?;
 37 |             Ok(())
 38 |         }
 39 |         #[cfg(feature = "delta")]
 40 |         Destination::Delta(delta_dest) => {
 41 |             debug!("Preparing Delta destination '{}'", delta_dest.name);
 42 | 
 43 |             let arrow_fields: Result<Vec<_>> = delta_dest
 44 |                 .schema
 45 |                 .iter()
 46 |                 .map(|field| {
 47 |                     crate::schema_transform::field_to_arrow(field).map_err(|e| {
 48 |                         AqueductsError::schema_validation(format!("Schema conversion error: {}", e))
 49 |                     })
 50 |                 })
 51 |                 .collect();
 52 |             let arrow_fields = arrow_fields?;
 53 | 
 54 |             aqueducts_delta::prepare_delta_destination(
 55 |                 &delta_dest.name,
 56 |                 delta_dest.location.as_str(),
 57 |                 &delta_dest.storage_config,
 58 |                 &delta_dest.partition_columns,
 59 |                 &delta_dest.table_properties,
 60 |                 &arrow_fields,
 61 |             )
 62 |             .await
 63 |             .map_err(|e| {
 64 |                 AqueductsError::destination(
 65 |                     &delta_dest.name,
 66 |                     format!("Delta destination error: {}", e),
 67 |                 )
 68 |             })?;
 69 |             Ok(())
 70 |         }
 71 |         #[cfg(not(feature = "odbc"))]
 72 |         Destination::Odbc(dest) => Err(AqueductsError::unsupported(
 73 |             &dest.name,
 74 |             "ODBC support not enabled. Enable 'odbc' feature",
 75 |         )),
 76 |         #[cfg(not(feature = "delta"))]
 77 |         Destination::Delta(dest) => Err(AqueductsError::unsupported(
 78 |             &dest.name,
 79 |             "Delta support not enabled. Enable 'delta' feature",
 80 |         )),
 81 |     }
 82 | }
 83 | 
 84 | /// Write a `DataFrame` to an Aqueduct `Destination`
 85 | #[instrument(skip(ctx, destination, data), err)]
 86 | pub async fn write_to_destination(
 87 |     ctx: Arc<SessionContext>,
 88 |     destination: &Destination,
 89 |     data: DataFrame,
 90 | ) -> Result<()> {
 91 |     match destination {
 92 |         Destination::InMemory(mem_def) => {
 93 |             debug!("Writing data to in-memory table '{}'", mem_def.name);
 94 | 
 95 |             let schema = data.schema().clone();
 96 |             let partitioned = data.collect_partitioned().await?;
 97 |             let table = MemTable::try_new(Arc::new(schema.as_arrow().clone()), partitioned)?;
 98 | 
 99 |             ctx.register_table(mem_def.name.as_str(), Arc::new(table))?;
100 | 
101 |             Ok(())
102 |         }
103 |         Destination::File(file_def) => {
104 |             debug!("Writing data to file at location '{}'", file_def.location);
105 |             file::write(file_def, data).await?;
106 | 
107 |             Ok(())
108 |         }
109 |         #[cfg(feature = "odbc")]
110 |         Destination::Odbc(odbc_dest) => {
111 |             debug!("Writing data to ODBC destination '{}'", odbc_dest.name);
112 | 
113 |             let schema = data.schema().as_arrow().clone();
114 |             let batches = data.collect().await?;
115 | 
116 |             aqueducts_odbc::write_arrow_batches(
117 |                 &odbc_dest.connection_string,
118 |                 &odbc_dest.name, // Using name as table name
119 |                 odbc_dest.write_mode.clone(),
120 |                 batches,
121 |                 Arc::new(schema),
122 |                 odbc_dest.batch_size,
123 |             )
124 |             .await
125 |             .map_err(|e| {
126 |                 AqueductsError::destination(
127 |                     &odbc_dest.name,
128 |                     format!("ODBC destination error: {}", e),
129 |                 )
130 |             })?;
131 |             Ok(())
132 |         }
133 |         #[cfg(feature = "delta")]
134 |         Destination::Delta(delta_dest) => {
135 |             debug!("Writing data to Delta destination '{}'", delta_dest.name);
136 |             aqueducts_delta::write_to_delta_destination(
137 |                 &delta_dest.name,
138 |                 delta_dest.location.as_str(),
139 |                 &delta_dest.storage_config,
140 |                 &delta_dest.write_mode,
141 |                 data,
142 |             )
143 |             .await
144 |             .map_err(|e| {
145 |                 AqueductsError::destination(
146 |                     &delta_dest.name,
147 |                     format!("Delta destination error: {}", e),
148 |                 )
149 |             })?;
150 |             Ok(())
151 |         }
152 |         #[cfg(not(feature = "odbc"))]
153 |         Destination::Odbc(dest) => Err(AqueductsError::unsupported(
154 |             &dest.name,
155 |             "ODBC support not enabled. Enable 'odbc' feature",
156 |         )),
157 |         #[cfg(not(feature = "delta"))]
158 |         Destination::Delta(dest) => Err(AqueductsError::unsupported(
159 |             &dest.name,
160 |             "Delta support not enabled. Enable 'delta' feature",
161 |         )),
162 |     }
163 | }
164 | 


--------------------------------------------------------------------------------
/aqueducts/core/src/error.rs:
--------------------------------------------------------------------------------
  1 | use std::{collections::HashSet, path::PathBuf};
  2 | 
  3 | use crate::templating::TemplateFormat;
  4 | 
  5 | pub type Result<T> = core::result::Result<T, AqueductsError>;
  6 | 
  7 | #[derive(Debug, thiserror::Error)]
  8 | pub enum AqueductsError {
  9 |     // === Configuration & Setup Errors ===
 10 |     #[error("Unsupported operation: {operation} for {context}")]
 11 |     Unsupported { operation: String, context: String },
 12 | 
 13 |     // === Data Processing Errors ===
 14 |     #[error("Schema validation failed: {message}")]
 15 |     SchemaValidation { message: String },
 16 | 
 17 |     #[error("Data processing failed: {message}")]
 18 |     DataProcessing { message: String },
 19 | 
 20 |     // === I/O & Storage Errors ===
 21 |     #[error("Storage operation failed: {operation} at {location}")]
 22 |     Storage { operation: String, location: String },
 23 | 
 24 |     #[error("File operation failed: {message}")]
 25 |     FileOperation { message: String },
 26 | 
 27 |     // === Pipeline Execution Errors ===
 28 |     #[error("Source '{name}' failed: {message}")]
 29 |     Source { name: String, message: String },
 30 | 
 31 |     #[error("Stage '{name}' failed: {message}")]
 32 |     Stage { name: String, message: String },
 33 | 
 34 |     #[error("Destination '{name}' failed: {message}")]
 35 |     Destination { name: String, message: String },
 36 | 
 37 |     // === Template & Parsing Errors ===
 38 |     #[error("Template error: {message}")]
 39 |     Template { message: String },
 40 | 
 41 |     #[error("Parse error: {message}")]
 42 |     Parse { message: String },
 43 | 
 44 |     // === Resource Management ===
 45 |     #[error("Resource not found: {resource} at {location}")]
 46 |     NotFound { resource: String, location: String },
 47 | }
 48 | 
 49 | impl AqueductsError {
 50 |     // === Templating ===
 51 |     pub fn unsupported(operation: impl Into<String>, context: impl Into<String>) -> Self {
 52 |         Self::Unsupported {
 53 |             operation: operation.into(),
 54 |             context: context.into(),
 55 |         }
 56 |     }
 57 | 
 58 |     // === Data Processing ===
 59 |     pub fn schema_validation(message: impl Into<String>) -> Self {
 60 |         Self::SchemaValidation {
 61 |             message: message.into(),
 62 |         }
 63 |     }
 64 | 
 65 |     pub fn data_processing(message: impl Into<String>) -> Self {
 66 |         Self::DataProcessing {
 67 |             message: message.into(),
 68 |         }
 69 |     }
 70 | 
 71 |     // === I/O & Storage ===
 72 |     pub fn storage(operation: impl Into<String>, location: impl Into<String>) -> Self {
 73 |         Self::Storage {
 74 |             operation: operation.into(),
 75 |             location: location.into(),
 76 |         }
 77 |     }
 78 | 
 79 |     pub fn file_operation(message: impl Into<String>) -> Self {
 80 |         Self::FileOperation {
 81 |             message: message.into(),
 82 |         }
 83 |     }
 84 | 
 85 |     // === Pipeline Execution ===
 86 |     pub fn source(name: impl Into<String>, message: impl Into<String>) -> Self {
 87 |         Self::Source {
 88 |             name: name.into(),
 89 |             message: message.into(),
 90 |         }
 91 |     }
 92 | 
 93 |     pub fn stage(name: impl Into<String>, message: impl Into<String>) -> Self {
 94 |         Self::Stage {
 95 |             name: name.into(),
 96 |             message: message.into(),
 97 |         }
 98 |     }
 99 | 
100 |     pub fn destination(name: impl Into<String>, message: impl Into<String>) -> Self {
101 |         Self::Destination {
102 |             name: name.into(),
103 |             message: message.into(),
104 |         }
105 |     }
106 | 
107 |     // === Template & Parsing ===
108 |     pub fn template(message: impl Into<String>) -> Self {
109 |         Self::Template {
110 |             message: message.into(),
111 |         }
112 |     }
113 | 
114 |     pub fn parse(message: impl Into<String>) -> Self {
115 |         Self::Parse {
116 |             message: message.into(),
117 |         }
118 |     }
119 | 
120 |     // === Resource Management ===
121 |     pub fn not_found(resource: impl Into<String>, location: impl Into<String>) -> Self {
122 |         Self::NotFound {
123 |             resource: resource.into(),
124 |             location: location.into(),
125 |         }
126 |     }
127 | }
128 | 
129 | // === External Error Conversions ===
130 | 
131 | impl From<std::io::Error> for AqueductsError {
132 |     fn from(err: std::io::Error) -> Self {
133 |         Self::FileOperation {
134 |             message: err.to_string(),
135 |         }
136 |     }
137 | }
138 | 
139 | impl From<datafusion::error::DataFusionError> for AqueductsError {
140 |     fn from(err: datafusion::error::DataFusionError) -> Self {
141 |         use datafusion::error::DataFusionError as DF;
142 |         match err {
143 |             DF::SchemaError(_, _) => Self::SchemaValidation {
144 |                 message: err.to_string(),
145 |             },
146 |             DF::ArrowError(_, _) => Self::DataProcessing {
147 |                 message: err.to_string(),
148 |             },
149 |             DF::IoError(_) => Self::FileOperation {
150 |                 message: err.to_string(),
151 |             },
152 |             _ => Self::DataProcessing {
153 |                 message: err.to_string(),
154 |             },
155 |         }
156 |     }
157 | }
158 | 
159 | impl From<datafusion::arrow::error::ArrowError> for AqueductsError {
160 |     fn from(err: datafusion::arrow::error::ArrowError) -> Self {
161 |         use datafusion::arrow::error::ArrowError as AE;
162 |         match err {
163 |             AE::SchemaError(_) => Self::SchemaValidation {
164 |                 message: err.to_string(),
165 |             },
166 |             AE::ComputeError(_) => Self::DataProcessing {
167 |                 message: err.to_string(),
168 |             },
169 |             AE::IoError(_, _) => Self::FileOperation {
170 |                 message: err.to_string(),
171 |             },
172 |             AE::ParseError(_) => Self::Parse {
173 |                 message: err.to_string(),
174 |             },
175 |             _ => Self::DataProcessing {
176 |                 message: err.to_string(),
177 |             },
178 |         }
179 |     }
180 | }
181 | 
182 | impl From<object_store::Error> for AqueductsError {
183 |     fn from(err: object_store::Error) -> Self {
184 |         Self::Storage {
185 |             operation: "object_store".to_string(),
186 |             location: err.to_string(),
187 |         }
188 |     }
189 | }
190 | 
191 | impl From<regex::Error> for AqueductsError {
192 |     fn from(err: regex::Error) -> Self {
193 |         Self::Parse {
194 |             message: format!("Regex error: {}", err),
195 |         }
196 |     }
197 | }
198 | 
199 | #[cfg(feature = "json")]
200 | impl From<serde_json::Error> for AqueductsError {
201 |     fn from(err: serde_json::Error) -> Self {
202 |         Self::Parse {
203 |             message: format!("JSON error: {}", err),
204 |         }
205 |     }
206 | }
207 | 
208 | #[cfg(feature = "toml")]
209 | impl From<toml::de::Error> for AqueductsError {
210 |     fn from(err: toml::de::Error) -> Self {
211 |         Self::Parse {
212 |             message: format!("TOML deserialization error: {}", err),
213 |         }
214 |     }
215 | }
216 | 
217 | #[cfg(feature = "toml")]
218 | impl From<toml::ser::Error> for AqueductsError {
219 |     fn from(err: toml::ser::Error) -> Self {
220 |         Self::Parse {
221 |             message: format!("TOML serialization error: {}", err),
222 |         }
223 |     }
224 | }
225 | 
226 | #[cfg(feature = "yaml")]
227 | impl From<serde_yml::Error> for AqueductsError {
228 |     fn from(err: serde_yml::Error) -> Self {
229 |         Self::Parse {
230 |             message: format!("YAML error: {}", err),
231 |         }
232 |     }
233 | }
234 | 
235 | // === Legacy Support for Template-Specific Errors ===
236 | 
237 | impl From<HashSet<String>> for AqueductsError {
238 |     fn from(missing_params: HashSet<String>) -> Self {
239 |         Self::Template {
240 |             message: format!("Missing template parameters: {:?}", missing_params),
241 |         }
242 |     }
243 | }
244 | 
245 | impl From<(PathBuf, &'static str)> for AqueductsError {
246 |     fn from((path, context): (PathBuf, &'static str)) -> Self {
247 |         Self::Template {
248 |             message: format!("{}: {:?}", context, path),
249 |         }
250 |     }
251 | }
252 | 
253 | impl From<TemplateFormat> for AqueductsError {
254 |     fn from(format: TemplateFormat) -> Self {
255 |         Self::Unsupported {
256 |             operation: "template format".to_string(),
257 |             context: format!(
258 |                 "{:?} support is not enabled in this build. Enable the corresponding feature flag",
259 |                 format
260 |             ),
261 |         }
262 |     }
263 | }
264 | 


--------------------------------------------------------------------------------
/aqueducts/core/src/progress_tracker.rs:
--------------------------------------------------------------------------------
  1 | use aqueducts_schemas::{OutputType, ProgressEvent};
  2 | use datafusion::arrow::array::RecordBatch;
  3 | use datafusion::common::DFSchema;
  4 | use tracing::{error, info, instrument};
  5 | 
  6 | /// A trait for handling progress events and stage output during pipeline execution.
  7 | ///
  8 | /// Implement this trait to create custom progress tracking and monitoring for
  9 | /// Aqueducts pipeline execution. This allows you to:
 10 | ///
 11 | /// - Monitor pipeline progress in real-time
 12 | /// - Capture and display stage outputs
 13 | /// - Send progress updates to external systems
 14 | /// - Build custom UIs for pipeline monitoring
 15 | ///
 16 | /// # Examples
 17 | ///
 18 | /// ## Basic Custom Progress Tracker
 19 | ///
 20 | /// ```rust
 21 | /// use aqueducts_core::progress_tracker::ProgressTracker;
 22 | /// use aqueducts_schemas::{ProgressEvent, OutputType};
 23 | /// use datafusion::arrow::array::RecordBatch;
 24 | /// use datafusion::common::DFSchema;
 25 | ///
 26 | /// struct MyCustomTracker {
 27 | ///     start_time: std::time::Instant,
 28 | /// }
 29 | ///
 30 | /// impl MyCustomTracker {
 31 | ///     fn new() -> Self {
 32 | ///         Self {
 33 | ///             start_time: std::time::Instant::now(),
 34 | ///         }
 35 | ///     }
 36 | /// }
 37 | ///
 38 | /// impl ProgressTracker for MyCustomTracker {
 39 | ///     fn on_progress(&self, event: ProgressEvent) {
 40 | ///         match event {
 41 | ///             ProgressEvent::Started => {
 42 | ///                 println!("Pipeline started at {:?}", self.start_time);
 43 | ///             }
 44 | ///             ProgressEvent::SourceRegistered { name } => {
 45 | ///                 println!("Source '{}' registered", name);
 46 | ///             }
 47 | ///             ProgressEvent::StageCompleted { name, duration_ms, .. } => {
 48 | ///                 println!("Stage '{}' completed in {}ms", name, duration_ms);
 49 | ///             }
 50 | ///             ProgressEvent::Completed { duration_ms } => {
 51 | ///                 println!("Pipeline completed in {}ms", duration_ms);
 52 | ///             }
 53 | ///             _ => {}
 54 | ///         }
 55 | ///     }
 56 | ///
 57 | ///     fn on_output(
 58 | ///         &self,
 59 | ///         stage_name: &str,
 60 | ///         output_type: OutputType,
 61 | ///         _schema: &DFSchema,
 62 | ///         batches: &[RecordBatch],
 63 | ///     ) {
 64 | ///         let row_count: usize = batches.iter().map(|b| b.num_rows()).sum();
 65 | ///         println!("Stage '{}' produced {} rows ({:?})", stage_name, row_count, output_type);
 66 | ///     }
 67 | /// }
 68 | /// ```
 69 | pub trait ProgressTracker: Send + Sync {
 70 |     /// Called when a progress event occurs during pipeline execution.
 71 |     ///
 72 |     /// This method receives various types of progress events:
 73 |     /// - `Started` - Pipeline execution has begun
 74 |     /// - `SourceRegistered` - A data source has been registered
 75 |     /// - `StageStarted` - A processing stage has started
 76 |     /// - `StageCompleted` - A processing stage has finished
 77 |     /// - `DestinationCompleted` - Data has been written to destination
 78 |     /// - `Completed` - Entire pipeline has finished
 79 |     ///
 80 |     /// # Arguments
 81 |     ///
 82 |     /// * `event` - The progress event that occurred
 83 |     fn on_progress(&self, event: ProgressEvent);
 84 | 
 85 |     /// Called when a stage produces output that should be displayed or captured.
 86 |     ///
 87 |     /// This method is called for stages that use output directives like `show`,
 88 |     /// `explain`, or `print_schema`. It allows you to capture and process the
 89 |     /// results of these operations.
 90 |     ///
 91 |     /// # Arguments
 92 |     ///
 93 |     /// * `stage_name` - Name of the stage producing output
 94 |     /// * `output_type` - Type of output (Show, Explain, etc.)
 95 |     /// * `schema` - Schema of the data being output
 96 |     /// * `batches` - The actual data batches to display
 97 |     fn on_output(
 98 |         &self,
 99 |         stage_name: &str,
100 |         output_type: OutputType,
101 |         schema: &DFSchema,
102 |         batches: &[RecordBatch],
103 |     );
104 | }
105 | 
106 | /// A simple progress tracker that logs progress events and stage output using the `tracing` crate.
107 | ///
108 | /// This is the default progress tracker provided by Aqueducts. It logs all progress events
109 | /// and stage outputs using structured logging with emoji icons for better readability.
110 | ///
111 | /// # Examples
112 | ///
113 | /// ```rust,no_run
114 | /// use aqueducts_core::{run_pipeline, progress_tracker::LoggingProgressTracker, templating::TemplateLoader};
115 | /// use aqueducts_schemas::Aqueduct;
116 | /// use datafusion::prelude::SessionContext;
117 | /// use std::sync::Arc;
118 | ///
119 | /// async fn example() -> Result<(), Box<dyn std::error::Error>> {
120 | ///     let pipeline = Aqueduct::from_file("pipeline.yml", Default::default())?;
121 | ///     let ctx = Arc::new(SessionContext::new());
122 | ///     let tracker = Arc::new(LoggingProgressTracker);
123 | ///     
124 | ///     // This will log progress events as the pipeline executes
125 | ///     let _result = run_pipeline(ctx, pipeline, Some(tracker)).await?;
126 | ///     
127 | ///     Ok(())
128 | /// }
129 | /// ```
130 | #[derive(Debug)]
131 | pub struct LoggingProgressTracker;
132 | 
133 | impl ProgressTracker for LoggingProgressTracker {
134 |     #[instrument(skip_all)]
135 |     fn on_progress(&self, event: ProgressEvent) {
136 |         match event {
137 |             ProgressEvent::Started => {
138 |                 info!("🚀 Pipeline execution started");
139 |             }
140 |             ProgressEvent::SourceRegistered { name } => {
141 |                 info!("📚 Registered source: {}", name);
142 |             }
143 |             ProgressEvent::StageStarted {
144 |                 name,
145 |                 position,
146 |                 sub_position,
147 |             } => {
148 |                 info!(
149 |                     "⚙️  Processing stage: {} (position: {}, sub-position: {})",
150 |                     name, position, sub_position
151 |                 );
152 |             }
153 |             ProgressEvent::StageCompleted {
154 |                 name,
155 |                 position: _,
156 |                 sub_position: _,
157 |                 duration_ms,
158 |             } => {
159 |                 info!(
160 |                     "✅ Completed stage: {} (took: {:.2}s)",
161 |                     name,
162 |                     duration_ms as f64 / 1000.0
163 |                 );
164 |             }
165 |             ProgressEvent::DestinationCompleted => {
166 |                 info!("📦 Data successfully written to destination");
167 |             }
168 |             ProgressEvent::Completed { duration_ms } => {
169 |                 info!(
170 |                     "🎉 Pipeline execution completed (total time: {:.2}s)",
171 |                     duration_ms as f64 / 1000.0
172 |                 );
173 |             }
174 |         }
175 |     }
176 | 
177 |     #[instrument(skip_all)]
178 |     fn on_output(
179 |         &self,
180 |         stage_name: &str,
181 |         output_type: OutputType,
182 |         schema: &DFSchema,
183 |         batches: &[RecordBatch],
184 |     ) {
185 |         let output = datafusion::arrow::util::pretty::pretty_format_batches(batches);
186 |         match (output_type, output){
187 |             (OutputType::Show, Ok(output_str)) => info!(
188 |                 "\n📋 Table Data: {stage_name}\n───────────────────────────────────────\n{output_str}\n"
189 |             ),
190 |             (OutputType::ShowLimit, Ok(output_str)) => info!(
191 |                 "\n📋 Table Data (Preview): {stage_name}\n───────────────────────────────────────\n{output_str}\n"
192 |             ),
193 |             (OutputType::Explain, Ok(output_str)) => info!(
194 |                 "\n🔍 Query Plan: {stage_name}\n───────────────────────────────────────\n{output_str}\n"
195 |             ),
196 |             (OutputType::ExplainAnalyze, Ok(output_str)) => info!(
197 |                 "\n📊 Query Metrics: {stage_name}\n───────────────────────────────────────\n{output_str}\n"
198 |             ),
199 |             (OutputType::PrintSchema, Ok(_)) => info!(
200 |                 "\n🔢 Schema: {stage_name}\n───────────────────────────────────────\n{schema:#?}\n"
201 |             ),
202 |             _ => error!("❗\n Failed to produce stage output\n")
203 |         }
204 |     }
205 | }
206 | 


--------------------------------------------------------------------------------
/aqueducts/core/src/stages/mod.rs:
--------------------------------------------------------------------------------
 1 | use aqueducts_schemas::{OutputType, Stage};
 2 | use datafusion::{
 3 |     datasource::MemTable,
 4 |     execution::context::{SQLOptions, SessionContext},
 5 | };
 6 | use std::sync::Arc;
 7 | use tracing::instrument;
 8 | 
 9 | use crate::error::{AqueductsError, Result};
10 | 
11 | /// Process a stage in the Aqueduct pipeline
12 | /// The result of the operation will be registered within the `SessionContext` as an
13 | /// in-memory table using the stages name as the table name
14 | /// Does not allow for ddl/dml queries or SQL statements (e.g. SET VARIABLE, CREATE TABLE, etc.)
15 | #[instrument(skip_all, err)]
16 | pub async fn process_stage(
17 |     ctx: Arc<SessionContext>,
18 |     stage: Stage,
19 |     progress_tracker: Option<Arc<dyn crate::ProgressTracker>>,
20 | ) -> Result<()> {
21 |     let options = SQLOptions::new()
22 |         .with_allow_ddl(false)
23 |         .with_allow_dml(false)
24 |         .with_allow_statements(false);
25 | 
26 |     let result = ctx
27 |         .sql_with_options(stage.query.as_str(), options)
28 |         .await?
29 |         .cache()
30 |         .await
31 |         .map_err(|e| {
32 |             AqueductsError::stage(
33 |                 &stage.name,
34 |                 format!("Error occured during stage execution: {e}"),
35 |             )
36 |         })?;
37 |     let schema = result.schema().clone();
38 | 
39 |     if stage.explain || stage.explain_analyze {
40 |         let output_type = if stage.explain_analyze {
41 |             OutputType::ExplainAnalyze
42 |         } else {
43 |             OutputType::Explain
44 |         };
45 | 
46 |         let explain = result.clone().explain(false, stage.explain_analyze)?;
47 |         let batches = explain.collect().await?;
48 | 
49 |         if let Some(tracker) = &progress_tracker {
50 |             tracker.on_output(&stage.name, output_type, &schema, &batches);
51 |         }
52 |     }
53 | 
54 |     match stage.show {
55 |         Some(0) => {
56 |             let batches = result.clone().collect().await?;
57 |             if let Some(tracker) = &progress_tracker {
58 |                 tracker.on_output(&stage.name, OutputType::Show, &schema, &batches);
59 |             }
60 |         }
61 |         Some(limit) => {
62 |             let batches = result.clone().limit(0, Some(limit))?.collect().await?;
63 |             if let Some(tracker) = &progress_tracker {
64 |                 tracker.on_output(&stage.name, OutputType::ShowLimit, &schema, &batches);
65 |             }
66 |         }
67 |         _ => (),
68 |     };
69 | 
70 |     if stage.print_schema {
71 |         if let Some(tracker) = &progress_tracker {
72 |             let schema = result.schema();
73 |             tracker.on_output(&stage.name, OutputType::PrintSchema, schema, &[]);
74 |         }
75 |     }
76 | 
77 |     let partitioned = result.collect_partitioned().await?;
78 |     let table = MemTable::try_new(Arc::new(schema.as_arrow().clone()), partitioned)?;
79 | 
80 |     ctx.register_table(stage.name.as_str(), Arc::new(table))?;
81 | 
82 |     Ok(())
83 | }
84 | 


--------------------------------------------------------------------------------
/aqueducts/core/src/store/azure.rs:
--------------------------------------------------------------------------------
  1 | //! # Azure Blob Storage Object Store Provider
  2 | //!
  3 | //! This module provides an Azure Blob Storage implementation of the `ObjectStoreProvider` trait
  4 | //! using the `object_store` crate's Microsoft Azure backend.
  5 | 
  6 | use object_store::azure::MicrosoftAzureBuilder;
  7 | use std::{collections::HashMap, sync::Arc};
  8 | use tracing::warn;
  9 | use url::Url;
 10 | 
 11 | use super::ObjectStoreProvider;
 12 | use crate::error::{AqueductsError, Result};
 13 | 
 14 | /// Provider for Azure Blob Storage.
 15 | ///
 16 | /// This provider supports:
 17 | /// - `az://` URLs for Azure Blob Storage
 18 | /// - `azure://` URLs (alternative Azure scheme)
 19 | /// - `abfs://` URLs (Azure Data Lake Storage Gen2)
 20 | /// - `abfss://` URLs (Azure Data Lake Storage Gen2 with SSL)
 21 | ///
 22 | /// ## Automatic Environment Variable Configuration
 23 | ///
 24 | /// The provider automatically reads Azure credentials and configuration from environment variables:
 25 | /// - `AZURE_STORAGE_ACCOUNT_NAME` - Storage account name
 26 | /// - `AZURE_STORAGE_ACCOUNT_KEY` - Storage account access key
 27 | /// - `AZURE_CLIENT_ID` - Azure AD application client ID
 28 | /// - `AZURE_CLIENT_SECRET` - Azure AD application client secret
 29 | /// - `AZURE_TENANT_ID` - Azure AD tenant ID
 30 | ///
 31 | /// ## Supported Configuration Options
 32 | ///
 33 | /// | Option                        | Description                | Environment Variable         |
 34 | /// |-------------------------------|----------------------------|------------------------------|
 35 | /// | `azure_storage_account_name`  | Storage account name       | `AZURE_STORAGE_ACCOUNT_NAME` |
 36 | /// | `azure_storage_account_key`   | Storage account access key | `AZURE_STORAGE_ACCOUNT_KEY`  |
 37 | /// | `azure_storage_client_id`     | Azure AD client ID         | `AZURE_CLIENT_ID`            |
 38 | /// | `azure_storage_client_secret` | Azure AD client secret     | `AZURE_CLIENT_SECRET`        |
 39 | /// | `azure_storage_tenant_id`     | Azure AD tenant ID         | `AZURE_TENANT_ID`            |
 40 | /// | `azure_storage_use_emulator`  | Use storage emulator       | -                            |
 41 | /// | `azure_storage_use_azure_cli` | Use Azure CLI credentials  | -                            |
 42 | /// | `azure_federated_token_file`  | Federated token file path  | -                            |
 43 | /// | `azure_use_fabric_endpoint`   | Use Fabric endpoint        | -                            |
 44 | /// | `azure_msi_endpoint`          | MSI endpoint URL           | -                            |
 45 | /// | `azure_disable_tagging`       | Disable object tagging     | -                            |
 46 | pub struct AzureProvider;
 47 | 
 48 | impl ObjectStoreProvider for AzureProvider {
 49 |     fn supports_scheme(&self, scheme: &str) -> bool {
 50 |         matches!(scheme, "az" | "azure" | "abfs" | "abfss")
 51 |     }
 52 | 
 53 |     fn create_store(
 54 |         &self,
 55 |         location: &Url,
 56 |         options: &HashMap<String, String>,
 57 |     ) -> Result<Arc<dyn object_store::ObjectStore>> {
 58 |         let mut builder = MicrosoftAzureBuilder::from_env();
 59 | 
 60 |         if let Some(account) = location.host_str() {
 61 |             let container = location
 62 |                 .path()
 63 |                 .trim_start_matches('/')
 64 |                 .split('/')
 65 |                 .next()
 66 |                 .unwrap_or("");
 67 |             if !container.is_empty() {
 68 |                 builder = builder.with_container_name(container);
 69 |             }
 70 | 
 71 |             if account.ends_with(".blob.core.windows.net") {
 72 |                 let account_name = account.replace(".blob.core.windows.net", "");
 73 |                 builder = builder.with_account(account_name);
 74 |             } else {
 75 |                 builder = builder.with_account(account);
 76 |             }
 77 |         }
 78 | 
 79 |         for (key, value) in options {
 80 |             builder = match key.as_str() {
 81 |                 "azure_storage_account_name" | "account_name" => builder.with_account(value),
 82 |                 "azure_storage_account_key" | "account_key" => builder.with_access_key(value),
 83 |                 "azure_storage_client_id" | "client_id" => builder.with_client_id(value),
 84 |                 "azure_storage_client_secret" | "client_secret" => {
 85 |                     builder.with_client_secret(value)
 86 |                 }
 87 |                 "azure_storage_tenant_id" | "tenant_id" => builder.with_tenant_id(value),
 88 |                 "azure_storage_use_emulator" => {
 89 |                     builder.with_use_emulator(value.parse::<bool>().unwrap_or(false))
 90 |                 }
 91 |                 "azure_storage_use_azure_cli" => {
 92 |                     builder.with_use_azure_cli(value.parse::<bool>().unwrap_or(false))
 93 |                 }
 94 |                 "azure_federated_token_file" => builder.with_federated_token_file(value),
 95 |                 "azure_use_fabric_endpoint" => {
 96 |                     builder.with_use_fabric_endpoint(value.parse::<bool>().unwrap_or(false))
 97 |                 }
 98 |                 "azure_msi_endpoint" => builder.with_msi_endpoint(value),
 99 |                 "azure_disable_tagging" => {
100 |                     builder.with_disable_tagging(value.parse::<bool>().unwrap_or(false))
101 |                 }
102 |                 unknown => {
103 |                     warn!("Unknown object_store configuration key: {unknown}");
104 |                     builder
105 |                 }
106 |             };
107 |         }
108 | 
109 |         builder
110 |             .build()
111 |             .map(|store| Arc::new(store) as Arc<dyn object_store::ObjectStore>)
112 |             .map_err(|e| AqueductsError::storage("object_store", e.to_string()))
113 |     }
114 | }
115 | 


--------------------------------------------------------------------------------
/aqueducts/core/src/store/gcs.rs:
--------------------------------------------------------------------------------
 1 | //! # Google Cloud Storage Object Store Provider
 2 | //!
 3 | //! This module provides a GCS implementation of the `ObjectStoreProvider` trait
 4 | //! using the `object_store` crate's Google Cloud Storage backend.
 5 | 
 6 | use object_store::gcp::GoogleCloudStorageBuilder;
 7 | use std::{collections::HashMap, sync::Arc};
 8 | use tracing::warn;
 9 | use url::Url;
10 | 
11 | use super::ObjectStoreProvider;
12 | use crate::error::{AqueductsError, Result};
13 | 
14 | /// Provider for Google Cloud Storage.
15 | ///
16 | /// This provider supports:
17 | /// - `gs://` URLs for standard GCS access
18 | /// - `gcs://` URLs (alternative GCS scheme)
19 | ///
20 | /// ## Automatic Environment Variable Configuration
21 | ///
22 | /// The provider automatically reads GCP credentials and configuration from environment variables:
23 | /// - `GOOGLE_APPLICATION_CREDENTIALS` - Path to service account JSON file
24 | /// - `GOOGLE_SERVICE_ACCOUNT` - Service account email
25 | /// - `GOOGLE_SERVICE_ACCOUNT_KEY` - Service account private key
26 | ///
27 | /// ## Supported Configuration Options
28 | ///
29 | /// | Option                           | Description                  | Environment Variable             |
30 | /// |----------------------------------|------------------------------|----------------------------------|
31 | /// | `google_service_account`         | Path to service account JSON | `GOOGLE_APPLICATION_CREDENTIALS` |
32 | /// | `google_service_account_key`     | Service account private key  | -                                |
33 | /// | `google_application_credentials` | Application credentials      | `GOOGLE_APPLICATION_CREDENTIALS` |
34 | pub struct GcsProvider;
35 | 
36 | impl ObjectStoreProvider for GcsProvider {
37 |     fn supports_scheme(&self, scheme: &str) -> bool {
38 |         matches!(scheme, "gs" | "gcs")
39 |     }
40 | 
41 |     fn create_store(
42 |         &self,
43 |         location: &Url,
44 |         options: &HashMap<String, String>,
45 |     ) -> Result<Arc<dyn object_store::ObjectStore>> {
46 |         let mut builder = GoogleCloudStorageBuilder::from_env();
47 | 
48 |         if let Some(bucket) = location.host_str() {
49 |             builder = builder.with_bucket_name(bucket);
50 |         }
51 | 
52 |         for (key, value) in options {
53 |             builder = match key.as_str() {
54 |                 "google_service_account" => builder.with_service_account_path(value),
55 |                 "google_service_account_key" => builder.with_service_account_key(value),
56 |                 "google_application_credentials" => builder.with_application_credentials(value),
57 |                 unknown => {
58 |                     warn!("Unknown object_store configuration key: {unknown}");
59 |                     builder
60 |                 }
61 |             };
62 |         }
63 | 
64 |         builder
65 |             .build()
66 |             .map(|store| Arc::new(store) as Arc<dyn object_store::ObjectStore>)
67 |             .map_err(|e| AqueductsError::storage("object_store", e.to_string()))
68 |     }
69 | }
70 | 


--------------------------------------------------------------------------------
/aqueducts/core/src/store/s3.rs:
--------------------------------------------------------------------------------
  1 | //! # AWS S3 Object Store Provider
  2 | //!
  3 | //! This module provides an S3 implementation of the `ObjectStoreProvider` trait
  4 | //! using the `object_store` crate's AWS S3 backend.
  5 | 
  6 | use super::ObjectStoreProvider;
  7 | use crate::error::{AqueductsError, Result};
  8 | use object_store::aws::AmazonS3Builder;
  9 | use std::{collections::HashMap, sync::Arc};
 10 | use tracing::warn;
 11 | use url::Url;
 12 | 
 13 | /// Provider for Amazon S3 and S3-compatible storage.
 14 | ///
 15 | /// This provider supports:
 16 | /// - `s3://` URLs for standard S3 access
 17 | /// - `s3a://` URLs (Hadoop-style S3 access)
 18 | ///
 19 | /// ## Automatic Environment Variable Configuration
 20 | ///
 21 | /// The provider automatically reads AWS credentials and configuration from environment variables:
 22 | /// - `AWS_ACCESS_KEY_ID` - AWS access key
 23 | /// - `AWS_SECRET_ACCESS_KEY` - AWS secret key
 24 | /// - `AWS_REGION` - AWS region (e.g., "us-west-2")
 25 | /// - `AWS_ENDPOINT` - Custom S3 endpoint (for S3-compatible services)
 26 | /// - `AWS_SESSION_TOKEN` - Session token for temporary credentials
 27 | /// - `AWS_PROFILE` - AWS profile name
 28 | /// - `AWS_ALLOW_HTTP` - Allow HTTP connections (set to "true")
 29 | ///
 30 | /// ## Supported Configuration Override Options
 31 | ///
 32 | /// All options can be provided with or without the `aws_` prefix:
 33 | ///
 34 | /// | Option                             | Description                       | Environment Variable    |
 35 | /// |------------------------------------|-----------------------------------|-------------------------|
 36 | /// | `aws_access_key_id`                | AWS access key ID                 | `AWS_ACCESS_KEY_ID`     |
 37 | /// | `aws_secret_access_key`            | AWS secret access key             | `AWS_SECRET_ACCESS_KEY` |
 38 | /// | `aws_region`                       | AWS region                        | `AWS_REGION`            |
 39 | /// | `aws_endpoint`                     | Custom S3 endpoint                | `AWS_ENDPOINT`          |
 40 | /// | `aws_session_token`                | AWS session token                 | `AWS_SESSION_TOKEN`     |
 41 | /// | `aws_allow_http`                   | Allow HTTP connections            | `AWS_ALLOW_HTTP`        |
 42 | /// | `aws_virtual_hosted_style_request` | Use virtual hosted-style requests | -                       |
 43 | /// | `aws_checksum_algorithm`           | Checksum algorithm for uploads    | -                       |
 44 | /// | `aws_s3_express`                   | Enable S3 Express One Zone        | -                       |
 45 | /// | `aws_unsigned_payload`             | Use unsigned payload              | -                       |
 46 | /// | `aws_skip_signature`               | Skip request signing              | -                       |
 47 | /// | `aws_imdsv1_fallback`              | Enable IMDSv1 fallback            | -                       |
 48 | pub struct S3Provider;
 49 | 
 50 | impl ObjectStoreProvider for S3Provider {
 51 |     fn supports_scheme(&self, scheme: &str) -> bool {
 52 |         matches!(scheme, "s3" | "s3a")
 53 |     }
 54 | 
 55 |     fn create_store(
 56 |         &self,
 57 |         location: &Url,
 58 |         options: &HashMap<String, String>,
 59 |     ) -> Result<Arc<dyn object_store::ObjectStore>> {
 60 |         let mut builder = AmazonS3Builder::from_env();
 61 | 
 62 |         if let Some(bucket) = location.host_str() {
 63 |             builder = builder.with_bucket_name(bucket);
 64 |         }
 65 | 
 66 |         for (key, value) in options {
 67 |             builder = match key.as_str() {
 68 |                 "aws_access_key_id" | "access_key_id" => builder.with_access_key_id(value),
 69 |                 "aws_secret_access_key" | "secret_access_key" => {
 70 |                     builder.with_secret_access_key(value)
 71 |                 }
 72 |                 "aws_region" | "region" => builder.with_region(value),
 73 |                 "aws_endpoint" | "endpoint" => builder.with_endpoint(value),
 74 |                 "aws_session_token" | "session_token" => builder.with_token(value),
 75 |                 "aws_allow_http" => builder.with_allow_http(value.parse::<bool>().unwrap_or(false)),
 76 |                 "aws_virtual_hosted_style_request" => builder
 77 |                     .with_virtual_hosted_style_request(value.parse::<bool>().unwrap_or(false)),
 78 |                 "aws_checksum_algorithm" => {
 79 |                     if let Ok(checksum) = value.parse() {
 80 |                         builder.with_checksum_algorithm(checksum)
 81 |                     } else {
 82 |                         builder
 83 |                     }
 84 |                 }
 85 |                 "aws_s3_express" | "s3_express" => {
 86 |                     builder.with_s3_express(value.parse::<bool>().unwrap_or(false))
 87 |                 }
 88 |                 "aws_unsigned_payload" => {
 89 |                     builder.with_unsigned_payload(value.parse::<bool>().unwrap_or(false))
 90 |                 }
 91 |                 "aws_skip_signature" => {
 92 |                     builder.with_skip_signature(value.parse::<bool>().unwrap_or(false))
 93 |                 }
 94 |                 "aws_imdsv1_fallback" => {
 95 |                     if value.parse::<bool>().unwrap_or(false) {
 96 |                         builder.with_imdsv1_fallback()
 97 |                     } else {
 98 |                         builder
 99 |                     }
100 |                 }
101 |                 unknown => {
102 |                     warn!("Unknown object_store configuration key: {unknown}");
103 |                     builder
104 |                 }
105 |             };
106 |         }
107 | 
108 |         builder
109 |             .build()
110 |             .map(|store| Arc::new(store) as Arc<dyn object_store::ObjectStore>)
111 |             .map_err(|e| AqueductsError::storage("object_store", e.to_string()))
112 |     }
113 | }
114 | 


--------------------------------------------------------------------------------
/aqueducts/core/tests/integration.rs:
--------------------------------------------------------------------------------
  1 | //! Integration tests for aqueducts core pipeline functionality.
  2 | //!
  3 | //! These tests focus on end-to-end pipeline execution and test data helpers to verify core functionality.
  4 | 
  5 | mod common;
  6 | 
  7 | use aqueducts_core::run_pipeline;
  8 | use aqueducts_schemas::*;
  9 | use common::*;
 10 | use datafusion::prelude::*;
 11 | use std::sync::Arc;
 12 | 
 13 | #[tokio::test]
 14 | async fn test_csv_source_to_memory_destination() {
 15 |     let dataset = TestDataSet::new().unwrap();
 16 | 
 17 |     let pipeline = Aqueduct::builder()
 18 |         .sources(vec![Source::File(
 19 |             FileSource::builder()
 20 |                 .name("test_data".to_string())
 21 |                 .format(sources::FileType::Csv(CsvSourceOptions::default()))
 22 |                 .location(dataset.csv_url.clone().into())
 23 |                 .build(),
 24 |         )])
 25 |         .stages(vec![vec![Stage::builder()
 26 |             .name("transform".to_string())
 27 |             .query("SELECT id, name, value * 2 as doubled_value, active FROM test_data".to_string())
 28 |             .build()]])
 29 |         .destination(Destination::InMemory(
 30 |             InMemoryDestination::builder()
 31 |                 .name("result".to_string())
 32 |                 .build(),
 33 |         ))
 34 |         .build();
 35 | 
 36 |     let ctx = Arc::new(SessionContext::new());
 37 |     let result_ctx = run_pipeline(ctx, pipeline, None).await.unwrap();
 38 | 
 39 |     let table = result_ctx.table("result").await.unwrap();
 40 |     let batches = table.collect().await.unwrap();
 41 | 
 42 |     assert_eq!(batches.len(), 1);
 43 |     assert_eq!(batches[0].num_rows(), dataset.expected_rows());
 44 | 
 45 |     // Verify data transformation worked
 46 |     let doubled_values = batches[0]
 47 |         .column_by_name("doubled_value")
 48 |         .expect("doubled_value column should exist");
 49 | 
 50 |     // Should have doubled the original values
 51 |     assert!(!doubled_values.is_empty());
 52 | }
 53 | 
 54 | #[tokio::test]
 55 | async fn test_parquet_source_to_csv_destination() {
 56 |     let dataset = TestDataSet::new().unwrap();
 57 |     let output_url = dataset.get_output_url("csv_out", "result.csv");
 58 | 
 59 |     let pipeline = Aqueduct::builder()
 60 |         .sources(vec![Source::File(
 61 |             FileSource::builder()
 62 |                 .name("parquet_data".to_string())
 63 |                 .format(sources::FileType::Parquet(ParquetSourceOptions::default()))
 64 |                 .location(dataset.parquet_url.clone().into())
 65 |                 .build(),
 66 |         )])
 67 |         .stages(vec![vec![Stage::builder()
 68 |             .name("filter_active".to_string())
 69 |             .query("SELECT * FROM parquet_data WHERE active = true".to_string())
 70 |             .build()]])
 71 |         .destination(Destination::File(
 72 |             FileDestination::builder()
 73 |                 .name("csv_output".to_string())
 74 |                 .format(destinations::FileType::Csv(CsvDestinationOptions::default()))
 75 |                 .location(output_url.clone().into())
 76 |                 .build(),
 77 |         ))
 78 |         .build();
 79 | 
 80 |     let ctx = Arc::new(SessionContext::new());
 81 |     run_pipeline(ctx, pipeline, None).await.unwrap();
 82 | 
 83 |     let output_path = output_url.to_file_path().unwrap();
 84 |     assert!(output_path.exists());
 85 | 
 86 |     let content = std::fs::read_to_string(&output_path).unwrap();
 87 |     assert!(content.contains("id,name,value,active"));
 88 |     assert!(content.contains("true")); // Should only have active=true records
 89 |     assert!(!content.contains("false")); // Should not have active=false records
 90 | }
 91 | 
 92 | #[tokio::test]
 93 | async fn test_pipeline_without_destination() {
 94 |     let dataset = TestDataSet::new().unwrap();
 95 | 
 96 |     let pipeline = Aqueduct::builder()
 97 |         .sources(vec![Source::File(
 98 |             FileSource::builder()
 99 |                 .name("test_source".to_string())
100 |                 .format(sources::FileType::Csv(CsvSourceOptions::default()))
101 |                 .location(dataset.csv_url.clone().into())
102 |                 .build(),
103 |         )])
104 |         .stages(vec![vec![Stage::builder()
105 |             .name("final_stage".to_string())
106 |             .query("SELECT * FROM test_source ORDER BY id".to_string())
107 |             .build()]])
108 |         .build();
109 | 
110 |     let ctx = Arc::new(SessionContext::new());
111 |     let result_ctx = run_pipeline(ctx, pipeline, None).await.unwrap();
112 | 
113 |     // The final stage should be available as a table
114 |     let table = result_ctx.table("final_stage").await.unwrap();
115 |     let batches = table.collect().await.unwrap();
116 | 
117 |     assert_eq!(batches[0].num_rows(), dataset.expected_rows());
118 | }
119 | 


--------------------------------------------------------------------------------
/aqueducts/delta/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "aqueducts-delta"
 3 | authors.workspace = true
 4 | edition.workspace = true
 5 | description.workspace = true
 6 | homepage.workspace = true
 7 | repository.workspace = true
 8 | readme.workspace = true
 9 | version.workspace = true
10 | keywords.workspace = true
11 | categories.workspace = true
12 | license.workspace = true
13 | 
14 | [dependencies]
15 | deltalake.workspace = true
16 | aqueducts-schemas.workspace = true
17 | datafusion.workspace = true
18 | thiserror.workspace = true
19 | tracing.workspace = true
20 | 
21 | [features]
22 | default = []
23 | s3 = ["deltalake/s3"]
24 | gcs = ["deltalake/gcs"]
25 | azure = ["deltalake/azure"]
26 | 
27 | [dev-dependencies]
28 | tokio = { workspace = true, features = ["full"] }
29 | rand.workspace = true
30 | tracing-test.workspace = true
31 | serde_json.workspace = true
32 | url.workspace = true
33 | tempfile = "3"
34 | 


--------------------------------------------------------------------------------
/aqueducts/delta/src/error.rs:
--------------------------------------------------------------------------------
 1 | use thiserror::Error;
 2 | 
 3 | /// Error types for Delta Lake operations.
 4 | #[derive(Error, Debug)]
 5 | pub enum DeltaError {
 6 |     /// Delta table operation failed.
 7 |     #[error("Delta table operation failed: {0}")]
 8 |     DeltaTable(#[from] deltalake::DeltaTableError),
 9 | 
10 |     /// DataFusion error occurred.
11 |     #[error("DataFusion error: {0}")]
12 |     DataFusion(#[from] datafusion::error::DataFusionError),
13 | }
14 | 


--------------------------------------------------------------------------------
/aqueducts/delta/src/handlers.rs:
--------------------------------------------------------------------------------
 1 | //! Delta Lake object store handler registration.
 2 | //!
 3 | //! This module provides functionality to register Delta Lake object store factories
 4 | //! for cloud providers. These handlers are required for Delta Lake to work with
 5 | //! cloud storage services like S3, GCS, and Azure Blob Storage.
 6 | 
 7 | use std::sync::Once;
 8 | 
 9 | static INIT: Once = Once::new();
10 | 
11 | /// Register Delta Lake object store handlers for enabled cloud providers.
12 | ///
13 | /// This function must be called before using Delta Lake with cloud storage to ensure
14 | /// the proper object store factories are registered. It will register handlers for
15 | /// all cloud providers that are enabled via feature flags.
16 | ///
17 | /// The registration is performed exactly once, even if this function is called multiple times.
18 | pub fn register_handlers() {
19 |     INIT.call_once(|| {
20 |         tracing::debug!("Registering Delta Lake object store handlers");
21 | 
22 |         #[cfg(feature = "s3")]
23 |         {
24 |             tracing::debug!("Registering Delta Lake S3 handlers");
25 |             deltalake::aws::register_handlers(None);
26 |         }
27 | 
28 |         #[cfg(feature = "gcs")]
29 |         {
30 |             tracing::debug!("Registering Delta Lake GCS handlers");
31 |             deltalake::gcp::register_handlers(None);
32 |         }
33 | 
34 |         #[cfg(feature = "azure")]
35 |         {
36 |             tracing::debug!("Registering Delta Lake Azure handlers");
37 |             deltalake::azure::register_handlers(None);
38 |         }
39 | 
40 |         tracing::debug!("Delta Lake handlers registration complete");
41 |     });
42 | }
43 | 
44 | #[cfg(test)]
45 | mod tests {
46 |     use super::*;
47 | 
48 |     #[test]
49 |     fn test_register_handlers() {
50 |         // This test just ensures the function can be called without panicking
51 |         register_handlers();
52 | 
53 |         // Call it again to test idempotency
54 |         register_handlers();
55 |     }
56 | }
57 | 


--------------------------------------------------------------------------------
/aqueducts/delta/tests/data/aqueduct_pipeline_delta_append.yml:
--------------------------------------------------------------------------------
 1 | version: "v2"
 2 | 
 3 | sources:
 4 |   - type: file
 5 |     name: some_table
 6 |     format:
 7 |       type: Csv
 8 |       options:
 9 |         has_header: true
10 |         delimiter: ","
11 |     location: ${local_path}/tests/data/example_1.csv
12 | 
13 |   - type: file
14 |     name: another_table
15 |     format:
16 |       type: Csv
17 |       options:
18 |         has_header: true
19 |         delimiter: ","
20 |     location: ${local_path}/tests/data/example_2.csv
21 | 
22 | stages:
23 |   - - name: aggregate
24 |       query: >
25 |           SELECT date, country, SUM(a) as sum_1, SUM(b) as sum_2
26 |           FROM some_table 
27 |           GROUP BY 1, 2
28 | 
29 |     - name: average
30 |       query: >
31 |           SELECT date, country, AVG(x) as avg_1, AVG(y) as avg_2
32 |           FROM another_table 
33 |           GROUP BY 1, 2
34 | 
35 |   - - name: join
36 |       query: >
37 |         SELECT
38 |           COALESCE(agg.date, avg.date) as date,
39 |           COALESCE(agg.country, avg.country) as country,
40 |           sum_1,
41 |           sum_2,
42 |           avg_1,
43 |           avg_2
44 |         FROM aggregate agg
45 |         JOIN average avg ON agg.date = avg.date AND agg.country = avg.country
46 |         WHERE COALESCE(agg.date, avg.date) = '${date}'
47 | 
48 | destination:
49 |   type: delta
50 |   name: example_output
51 |   location: ${local_path}/tests/output/test_delta_append/${run_id}
52 |   storage_config: {}
53 |   table_properties: {}
54 | 
55 |   write_mode:
56 |     operation: append
57 | 
58 |   partition_columns:
59 |     - date
60 | 
61 |   schema:
62 |     - name: date
63 |       data_type: date32
64 |       nullable: true
65 |       metadata: {}
66 |     - name: country
67 |       data_type: utf8
68 |       nullable: true
69 |       metadata: {}
70 |     - name: sum_1
71 |       data_type: int64
72 |       nullable: true
73 |       metadata: {}
74 |     - name: sum_2
75 |       data_type: float64
76 |       nullable: true
77 |       metadata: {}
78 |     - name: avg_1
79 |       data_type: float64
80 |       nullable: true
81 |       metadata: {}
82 |     - name: avg_2
83 |       data_type: float64
84 |       nullable: true
85 |       metadata: {}


--------------------------------------------------------------------------------
/aqueducts/delta/tests/data/aqueduct_pipeline_delta_replace.yml:
--------------------------------------------------------------------------------
 1 | version: "v2"
 2 | 
 3 | sources:
 4 |   - type: file
 5 |     name: some_table
 6 |     format:
 7 |       type: Csv
 8 |       options:
 9 |         has_header: true
10 |         delimiter: ","
11 |     location: ${local_path}/tests/data/example_1.csv
12 | 
13 |   - type: file
14 |     name: another_table
15 |     format:
16 |       type: Csv
17 |       options:
18 |         has_header: true
19 |         delimiter: ","
20 |     location: ${local_path}/tests/data/example_2.csv
21 | 
22 | stages:
23 |   - - name: aggregate
24 |       query: >
25 |           SELECT date, country, SUM(a) as sum_1, SUM(b) as sum_2
26 |           FROM some_table 
27 |           GROUP BY 1, 2
28 | 
29 |     - name: average
30 |       query: >
31 |           SELECT date, country, AVG(x) as avg_1, AVG(y) as avg_2
32 |           FROM another_table 
33 |           GROUP BY 1, 2
34 | 
35 |   - - name: join
36 |       query: >
37 |         SELECT
38 |           COALESCE(agg.date, avg.date) as date,
39 |           COALESCE(agg.country, avg.country) as country,
40 |           sum_1,
41 |           sum_2,
42 |           avg_1,
43 |           avg_2
44 |         FROM aggregate agg
45 |         JOIN average avg ON agg.date = avg.date AND agg.country = avg.country
46 |         WHERE COALESCE(agg.date, avg.date) = '${date}'
47 | 
48 | destination:
49 |   type: delta
50 |   name: example_output
51 |   location: ${local_path}/tests/output/test_delta_replace/${run_id}
52 |   storage_config: {}
53 |   table_properties: {}
54 | 
55 |   write_mode:
56 |     operation: replace
57 |     params: 
58 |       - column: date
59 |         value: ${date}
60 | 
61 |   partition_columns:
62 |     - date
63 | 
64 |   schema:
65 |     - name: date
66 |       data_type: date32
67 |       nullable: true
68 |       metadata: {}
69 |     - name: country
70 |       data_type: utf8
71 |       nullable: true
72 |       metadata: {}
73 |     - name: sum_1
74 |       data_type: int64
75 |       nullable: true
76 |       metadata: {}
77 |     - name: sum_2
78 |       data_type: float64
79 |       nullable: true
80 |       metadata: {}
81 |     - name: avg_1
82 |       data_type: float64
83 |       nullable: true
84 |       metadata: {}
85 |     - name: avg_2
86 |       data_type: float64
87 |       nullable: true
88 |       metadata: {}


--------------------------------------------------------------------------------
/aqueducts/delta/tests/data/aqueduct_pipeline_delta_upsert.yml:
--------------------------------------------------------------------------------
 1 | version: "v2"
 2 | 
 3 | sources:
 4 |   - type: file
 5 |     name: some_table
 6 |     format:
 7 |       type: Csv
 8 |       options:
 9 |         has_header: true
10 |         delimiter: ","
11 |     location: ${local_path}/tests/data/example_1.csv
12 | 
13 |   - type: file
14 |     name: another_table
15 |     format:
16 |       type: Csv
17 |       options:
18 |         has_header: true
19 |         delimiter: ","
20 |     location: ${local_path}/tests/data/example_2.csv
21 | 
22 | stages:
23 |   - - name: aggregate
24 |       query: >
25 |           SELECT date, country, SUM(a) as sum_1, SUM(b) as sum_2
26 |           FROM some_table 
27 |           GROUP BY 1, 2
28 | 
29 |     - name: average
30 |       query: >
31 |           SELECT date, country, AVG(x) as avg_1, AVG(y) as avg_2
32 |           FROM another_table 
33 |           GROUP BY 1, 2
34 | 
35 |   - - name: join
36 |       query: >
37 |         SELECT
38 |           COALESCE(agg.date, avg.date) as date,
39 |           COALESCE(agg.country, avg.country) as country,
40 |           sum_1,
41 |           sum_2,
42 |           avg_1,
43 |           avg_2
44 |         FROM aggregate agg
45 |         JOIN average avg ON agg.date = avg.date AND agg.country = avg.country
46 |         WHERE COALESCE(agg.date, avg.date) = '${date}'
47 | 
48 | destination:
49 |   type: delta
50 |   name: example_output
51 |   location: ${local_path}/tests/output/test_delta_upsert/${run_id}
52 |   storage_config: {}
53 |   table_properties: {}
54 | 
55 |   write_mode:
56 |     operation: upsert
57 |     params: 
58 |       - date
59 |       - country
60 | 
61 |   partition_columns:
62 |     - date
63 | 
64 |   schema:
65 |     - name: date
66 |       data_type: date32
67 |       nullable: true
68 |       metadata: {}
69 |     - name: country
70 |       data_type: utf8
71 |       nullable: true
72 |       metadata: {}
73 |     - name: sum_1
74 |       data_type: int64
75 |       nullable: true
76 |       metadata: {}
77 |     - name: sum_2
78 |       data_type: float64
79 |       nullable: true
80 |       metadata: {}
81 |     - name: avg_1
82 |       data_type: float64
83 |       nullable: true
84 |       metadata: {}
85 |     - name: avg_2
86 |       data_type: float64
87 |       nullable: true
88 |       metadata: {}


--------------------------------------------------------------------------------
/aqueducts/meta/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "aqueducts"
 3 | authors.workspace = true
 4 | edition.workspace = true
 5 | description = "Unified meta-crate for the Aqueducts data pipeline framework"
 6 | repository.workspace = true
 7 | readme.workspace = true
 8 | version.workspace = true
 9 | homepage.workspace = true
10 | keywords.workspace = true
11 | categories.workspace = true
12 | license.workspace = true
13 | 
14 | [features]
15 | default = ["yaml", "s3", "gcs", "azure"]
16 | 
17 | # Format support features
18 | json = ["aqueducts-core/json"]
19 | toml = ["aqueducts-core/toml"]
20 | yaml = ["aqueducts-core/yaml"]
21 | 
22 | # Cloud storage provider features
23 | s3 = ["aqueducts-core/s3"]
24 | gcs = ["aqueducts-core/gcs"]
25 | azure = ["aqueducts-core/azure"]
26 | 
27 | # Database connectivity features
28 | odbc = ["aqueducts-core/odbc", "aqueducts-odbc"]
29 | delta = ["aqueducts-core/delta", "aqueducts-delta"]
30 | 
31 | # Schema generation features
32 | schema_gen = ["aqueducts-schemas/schema_gen"]
33 | 
34 | # Protocol features for executor/CLI integration
35 | protocol = ["aqueducts-schemas/protocol"]
36 | 
37 | # Custom udfs to extend the SQL syntax
38 | custom_udfs = ["aqueducts-core/custom_udfs"]
39 | 
40 | [dependencies]
41 | # Core aqueducts functionality
42 | aqueducts-core.workspace = true
43 | aqueducts-schemas.workspace = true
44 | 
45 | # Optional database-specific crates
46 | aqueducts-odbc = { workspace = true, optional = true }
47 | aqueducts-delta = { workspace = true, optional = true }
48 | 
49 | # Re-export common dependencies that users might need
50 | datafusion.workspace = true
51 | tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
52 | tracing.workspace = true
53 | 
54 | [dev-dependencies]
55 | tokio = { workspace = true, features = ["full"] }
56 | tracing-test.workspace = true
57 | 


--------------------------------------------------------------------------------
/aqueducts/odbc/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "aqueducts-odbc"
 3 | authors.workspace = true
 4 | edition.workspace = true
 5 | description.workspace = true
 6 | repository.workspace = true
 7 | readme.workspace = true
 8 | version.workspace = true
 9 | homepage.workspace = true
10 | keywords.workspace = true
11 | categories.workspace = true
12 | license.workspace = true
13 | 
14 | [features]
15 | default = []
16 | odbc_tests = []
17 | 
18 | [dependencies]
19 | aqueducts-schemas.workspace = true
20 | 
21 | datafusion.workspace = true
22 | arrow-odbc.workspace = true
23 | 
24 | serde.workspace = true
25 | thiserror.workspace = true
26 | tracing.workspace = true
27 | 
28 | [dev-dependencies]
29 | tokio = { workspace = true, features = ["full"] }
30 | tracing-test.workspace = true
31 | serde_json.workspace = true
32 | 


--------------------------------------------------------------------------------
/aqueducts/odbc/src/error.rs:
--------------------------------------------------------------------------------
  1 | use thiserror::Error;
  2 | 
  3 | /// Error types for ODBC operations with security-conscious error messages.
  4 | ///
  5 | /// IMPORTANT: This type never includes connection strings or other sensitive
  6 | /// information in error messages to prevent password leakage.
  7 | #[derive(Error, Debug)]
  8 | pub enum OdbcError {
  9 |     /// ODBC connection failed (no sensitive details exposed).
 10 |     #[error("ODBC connection failed to data source")]
 11 |     ConnectionFailed,
 12 | 
 13 |     /// ODBC query execution failed.
 14 |     #[error("ODBC query execution failed: {message}")]
 15 |     QueryFailed { message: String },
 16 | 
 17 |     /// ODBC write operation failed.
 18 |     #[error("ODBC write operation failed: {message}")]
 19 |     WriteFailed { message: String },
 20 | 
 21 |     /// ODBC driver or environment setup error.
 22 |     #[error("ODBC driver error: {message}")]
 23 |     DriverError { message: String },
 24 | 
 25 |     /// Arrow error occurred.
 26 |     #[error("Arrow error: {0}")]
 27 |     Arrow(#[from] datafusion::arrow::error::ArrowError),
 28 | 
 29 |     /// DataFusion error occurred.
 30 |     #[error("DataFusion error: {0}")]
 31 |     DataFusion(#[from] datafusion::error::DataFusionError),
 32 | }
 33 | 
 34 | impl OdbcError {
 35 |     /// Create a connection failed error.
 36 |     pub fn connection_failed() -> Self {
 37 |         Self::ConnectionFailed
 38 |     }
 39 | 
 40 |     /// Create a query failed error.
 41 |     pub fn query_failed(message: impl Into<String>) -> Self {
 42 |         Self::QueryFailed {
 43 |             message: message.into(),
 44 |         }
 45 |     }
 46 | 
 47 |     /// Create a write failed error.
 48 |     pub fn write_failed(message: impl Into<String>) -> Self {
 49 |         Self::WriteFailed {
 50 |             message: message.into(),
 51 |         }
 52 |     }
 53 | 
 54 |     /// Create a driver error.
 55 |     pub fn driver_error(message: impl Into<String>) -> Self {
 56 |         Self::DriverError {
 57 |             message: message.into(),
 58 |         }
 59 |     }
 60 | }
 61 | 
 62 | // External error mappings with security considerations
 63 | impl From<arrow_odbc::Error> for OdbcError {
 64 |     fn from(err: arrow_odbc::Error) -> Self {
 65 |         // Don't expose details that might contain sensitive information
 66 |         let err_str = err.to_string().to_lowercase();
 67 |         if err_str.contains("connection")
 68 |             || err_str.contains("login")
 69 |             || err_str.contains("authentication")
 70 |         {
 71 |             Self::ConnectionFailed
 72 |         } else {
 73 |             Self::DriverError {
 74 |                 message: "ODBC operation failed".to_string(),
 75 |             }
 76 |         }
 77 |     }
 78 | }
 79 | 
 80 | impl From<arrow_odbc::odbc_api::Error> for OdbcError {
 81 |     fn from(err: arrow_odbc::odbc_api::Error) -> Self {
 82 |         // Check if this is a connection-related error without exposing details
 83 |         let err_str = err.to_string().to_lowercase();
 84 |         if err_str.contains("connection")
 85 |             || err_str.contains("login")
 86 |             || err_str.contains("authentication")
 87 |         {
 88 |             Self::ConnectionFailed
 89 |         } else {
 90 |             Self::DriverError {
 91 |                 message: "ODBC API error".to_string(),
 92 |             }
 93 |         }
 94 |     }
 95 | }
 96 | 
 97 | impl From<arrow_odbc::WriterError> for OdbcError {
 98 |     fn from(_err: arrow_odbc::WriterError) -> Self {
 99 |         Self::WriteFailed {
100 |             message: "ODBC write operation failed".to_string(),
101 |         }
102 |     }
103 | }
104 | 
105 | /// Convenience result type for ODBC operations.
106 | pub type Result<T> = std::result::Result<T, OdbcError>;
107 | 


--------------------------------------------------------------------------------
/aqueducts/schemas/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "aqueducts-schemas"
 3 | authors.workspace = true
 4 | edition.workspace = true
 5 | description.workspace = true
 6 | repository.workspace = true
 7 | readme.workspace = true
 8 | version.workspace = true
 9 | homepage.workspace = true
10 | keywords.workspace = true
11 | categories.workspace = true
12 | license.workspace = true
13 | 
14 | [features]
15 | default = []
16 | schema_gen = ["schemars"]
17 | protocol = ["uuid"]
18 | 
19 | [dependencies]
20 | serde.workspace = true
21 | bon.workspace = true
22 | url.workspace = true
23 | chrono.workspace = true
24 | 
25 | # Optional dependencies for features
26 | schemars = { workspace = true, optional = true }
27 | uuid = { workspace = true, optional = true }
28 | serde_json = { workspace = true, optional = true }
29 | 
30 | [dev-dependencies]
31 | serde_json.workspace = true
32 | serde_yml.workspace = true
33 | 


--------------------------------------------------------------------------------
/aqueducts/schemas/src/generate_schema.rs:
--------------------------------------------------------------------------------
 1 | //! Binary to generate JSON schema for the Aqueduct types
 2 | //!
 3 | //! This binary can be run with: cargo run --bin generate_schema --features schema_gen
 4 | 
 5 | use aqueducts_schemas::Aqueduct;
 6 | use schemars::schema_for;
 7 | use std::fs::File;
 8 | use std::io::Write;
 9 | use std::path::PathBuf;
10 | 
11 | fn main() -> Result<(), Box<dyn std::error::Error>> {
12 |     // Generate the JSON schema
13 |     let schema = schema_for!(Aqueduct);
14 | 
15 |     // Serialize to pretty JSON
16 |     let schema_json = serde_json::to_string_pretty(&schema)?;
17 | 
18 |     // Write to the json_schema directory in the project root
19 |     let output_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
20 |         .parent()
21 |         .unwrap()
22 |         .parent()
23 |         .unwrap()
24 |         .join("json_schema")
25 |         .join("aqueducts.schema.json");
26 | 
27 |     // Ensure the output directory exists
28 |     if let Some(parent) = output_path.parent() {
29 |         std::fs::create_dir_all(parent)?;
30 |     }
31 | 
32 |     // Write the schema file
33 |     let mut file = File::create(&output_path)?;
34 |     file.write_all(schema_json.as_bytes())?;
35 | 
36 |     println!("Generated JSON schema at: {}", output_path.display());
37 | 
38 |     Ok(())
39 | }
40 | 


--------------------------------------------------------------------------------
/aqueducts/schemas/src/lib.rs:
--------------------------------------------------------------------------------
  1 | //! # Aqueducts Schemas
  2 | //!
  3 | //! This crate contains all the schema definitions and configuration types used
  4 | //! throughout the aqueducts ecosystem. By centralizing these types here, we avoid
  5 | //! circular dependencies between core, delta, ODBC, and other provider crates.
  6 | 
  7 | use bon::Builder;
  8 | use serde::{Deserialize, Serialize};
  9 | 
 10 | pub mod data_types;
 11 | pub mod destinations;
 12 | pub mod location;
 13 | pub mod progress;
 14 | pub mod sources;
 15 | pub mod stages;
 16 | 
 17 | mod serde_helpers;
 18 | 
 19 | #[cfg(feature = "protocol")]
 20 | pub mod protocol;
 21 | 
 22 | // Re-export the main types for convenience
 23 | pub use data_types::{DataType, Field, IntervalUnit, TimeUnit, UnionMode};
 24 | pub use destinations::{
 25 |     CsvDestinationOptions, CsvDestinationOptionsBuilder, DeltaWriteMode,
 26 |     FileType as DestinationFileType, ReplaceCondition,
 27 | };
 28 | pub use destinations::{
 29 |     DeltaDestination, Destination, FileDestination, InMemoryDestination, OdbcDestination,
 30 | };
 31 | pub use destinations::{
 32 |     DeltaDestinationBuilder, FileDestinationBuilder, InMemoryDestinationBuilder,
 33 |     OdbcDestinationBuilder,
 34 | };
 35 | pub use location::Location;
 36 | pub use sources::{
 37 |     CsvSourceOptions, FileType as SourceFileType, JsonSourceOptions, ParquetSourceOptions,
 38 | };
 39 | pub use sources::{CsvSourceOptionsBuilder, JsonSourceOptionsBuilder, ParquetSourceOptionsBuilder};
 40 | pub use sources::{DeltaSource, DirSource, FileSource, InMemorySource, OdbcSource, Source};
 41 | pub use sources::{
 42 |     DeltaSourceBuilder, DirSourceBuilder, FileSourceBuilder, InMemorySourceBuilder,
 43 |     OdbcSourceBuilder,
 44 | };
 45 | 
 46 | pub use progress::{OutputType, ProgressEvent};
 47 | pub use stages::{Stage, StageBuilder};
 48 | 
 49 | #[cfg(feature = "protocol")]
 50 | pub use protocol::*;
 51 | 
 52 | fn current_version() -> String {
 53 |     "v2".to_string()
 54 | }
 55 | 
 56 | /// Definition for an `Aqueduct` data pipeline.
 57 | ///
 58 | /// An aqueduct defines a complete data processing pipeline with sources, transformation stages,
 59 | /// and an optional destination. Most configuration uses sensible defaults to minimize verbosity.
 60 | ///
 61 | /// # Examples
 62 | ///
 63 | /// ```
 64 | /// use aqueducts_schemas::{Aqueduct, Source, FileSource, SourceFileType, CsvSourceOptions, Stage};
 65 | ///
 66 | /// // Complete pipeline with defaults - version defaults to "v2"
 67 | /// let pipeline = Aqueduct::builder()
 68 | ///     .sources(vec![
 69 | ///         Source::File(
 70 | ///             FileSource::builder()
 71 | ///                 .name("sales".to_string())
 72 | ///                 .format(SourceFileType::Csv(CsvSourceOptions::default()))
 73 | ///                 .location("./sales.csv".try_into().unwrap())
 74 | ///                 .build()
 75 | ///         )
 76 | ///     ])
 77 | ///     .stages(vec![vec![
 78 | ///         Stage::builder()
 79 | ///             .name("totals".to_string())
 80 | ///             .query("SELECT region, SUM(amount) as total FROM sales GROUP BY region".to_string())
 81 | ///             .build()
 82 | ///     ]])
 83 | ///     .build();
 84 | /// ```
 85 | #[derive(Debug, Clone, Serialize, Deserialize, Builder)]
 86 | #[cfg_attr(feature = "schema_gen", derive(schemars::JsonSchema))]
 87 | #[serde(rename_all = "snake_case")]
 88 | pub struct Aqueduct {
 89 |     /// Schema version for migration compatibility
 90 |     #[serde(default = "current_version")]
 91 |     #[builder(default = current_version())]
 92 |     pub version: String,
 93 | 
 94 |     /// Definition of the data sources for this pipeline
 95 |     pub sources: Vec<Source>,
 96 | 
 97 |     /// A sequential list of transformations to execute within the context of this pipeline
 98 |     /// Nested stages are executed in parallel
 99 |     pub stages: Vec<Vec<Stage>>,
100 | 
101 |     /// Destination for the final step of the `Aqueduct`
102 |     /// takes the last stage as input for the write operation
103 |     pub destination: Option<Destination>,
104 | }
105 | 


--------------------------------------------------------------------------------
/aqueducts/schemas/src/location.rs:
--------------------------------------------------------------------------------
  1 | //! Location type that handles both file paths and URLs
  2 | 
  3 | use serde::{Deserialize, Deserializer, Serialize};
  4 | use std::path::Path;
  5 | use url::Url;
  6 | 
  7 | /// A location that can be either a file path or a URL
  8 | ///
  9 | /// This type automatically converts file paths to file:// URLs during deserialization
 10 | ///
 11 | /// # Examples
 12 | ///
 13 | /// ```
 14 | /// use aqueducts_schemas::Location;
 15 | ///
 16 | /// // From URL string
 17 | /// let url_location: Location = "https://example.com/data.csv".try_into().unwrap();
 18 | ///
 19 | /// // From absolute file path
 20 | /// let file_location: Location = "/tmp/data.csv".try_into().unwrap();
 21 | ///
 22 | /// // From relative file path
 23 | /// let rel_location: Location = "./data.csv".try_into().unwrap();
 24 | /// ```
 25 | #[derive(Debug, Clone, PartialEq, Serialize)]
 26 | #[cfg_attr(feature = "schema_gen", derive(schemars::JsonSchema))]
 27 | #[cfg_attr(
 28 |     feature = "schema_gen",
 29 |     schemars(
 30 |         with = "String",
 31 |         description = "A file path or URL. File paths will be converted to file:// URLs. Examples: '/tmp/data.csv', './data.csv', 'https://example.com/data.csv', 's3://bucket/data.csv'"
 32 |     )
 33 | )]
 34 | pub struct Location(pub Url);
 35 | 
 36 | impl TryFrom<&str> for Location {
 37 |     type Error = String;
 38 | 
 39 |     fn try_from(s: &str) -> Result<Self, Self::Error> {
 40 |         // Try as URL first - if it has a scheme, it should parse as URL
 41 |         if let Ok(url) = Url::parse(s) {
 42 |             return Ok(Location(url));
 43 |         }
 44 | 
 45 |         // Try as file path
 46 |         let path = Path::new(s);
 47 |         let url = if path.is_absolute() {
 48 |             Url::from_file_path(path)
 49 |         } else {
 50 |             // For relative paths, resolve against current directory
 51 |             let current_dir = std::env::current_dir()
 52 |                 .map_err(|e| format!("Cannot get current directory: {}", e))?;
 53 |             Url::from_file_path(current_dir.join(path))
 54 |         }
 55 |         .map_err(|_| format!("Invalid path: {}", s))?;
 56 | 
 57 |         Ok(Location(url))
 58 |     }
 59 | }
 60 | 
 61 | impl TryFrom<String> for Location {
 62 |     type Error = String;
 63 | 
 64 |     fn try_from(s: String) -> Result<Self, Self::Error> {
 65 |         Location::try_from(s.as_str())
 66 |     }
 67 | }
 68 | 
 69 | impl From<Url> for Location {
 70 |     fn from(url: Url) -> Self {
 71 |         Location(url)
 72 |     }
 73 | }
 74 | 
 75 | impl<'de> Deserialize<'de> for Location {
 76 |     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
 77 |     where
 78 |         D: Deserializer<'de>,
 79 |     {
 80 |         let s = String::deserialize(deserializer)?;
 81 |         Location::try_from(s.as_str()).map_err(serde::de::Error::custom)
 82 |     }
 83 | }
 84 | 
 85 | impl std::fmt::Display for Location {
 86 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 87 |         write!(f, "{}", self.0)
 88 |     }
 89 | }
 90 | 
 91 | impl AsRef<Url> for Location {
 92 |     fn as_ref(&self) -> &Url {
 93 |         &self.0
 94 |     }
 95 | }
 96 | 
 97 | impl std::ops::Deref for Location {
 98 |     type Target = Url;
 99 | 
100 |     fn deref(&self) -> &Self::Target {
101 |         &self.0
102 |     }
103 | }
104 | 
105 | #[cfg(test)]
106 | mod tests {
107 |     use super::*;
108 | 
109 |     #[test]
110 |     fn test_url_parsing() {
111 |         let location: Location = "https://example.com/data.csv".try_into().unwrap();
112 |         assert_eq!(location.scheme(), "https");
113 |         assert_eq!(location.host_str(), Some("example.com"));
114 |     }
115 | 
116 |     #[test]
117 |     fn test_absolute_file_path() {
118 |         let location: Location = "/tmp/data.csv".try_into().unwrap();
119 |         assert_eq!(location.scheme(), "file");
120 |         assert!(location.path().ends_with("/tmp/data.csv"));
121 |     }
122 | 
123 |     #[test]
124 |     fn test_relative_file_path() {
125 |         let location: Location = "./data.csv".try_into().unwrap();
126 |         assert_eq!(location.scheme(), "file");
127 |         assert!(location.path().ends_with("/data.csv"));
128 |     }
129 | 
130 |     #[test]
131 |     fn test_windows_path() {
132 |         if cfg!(windows) {
133 |             let location: Location = r"C:\temp\data.csv".try_into().unwrap();
134 |             assert_eq!(location.scheme(), "file");
135 |         }
136 |     }
137 | 
138 |     #[test]
139 |     fn test_s3_url() {
140 |         let location: Location = "s3://my-bucket/data.csv".try_into().unwrap();
141 |         assert_eq!(location.scheme(), "s3");
142 |         assert_eq!(location.host_str(), Some("my-bucket"));
143 |         assert_eq!(location.path(), "/data.csv");
144 |     }
145 | 
146 |     #[test]
147 |     fn test_serialization() {
148 |         let location: Location = "https://example.com/data.csv".try_into().unwrap();
149 |         let json = serde_json::to_string(&location).unwrap();
150 |         assert_eq!(json, r#""https://example.com/data.csv""#);
151 |     }
152 | 
153 |     #[test]
154 |     fn test_deserialization() {
155 |         let json = r#""./data.csv""#;
156 |         let location: Location = serde_json::from_str(json).unwrap();
157 |         assert_eq!(location.scheme(), "file");
158 |         assert!(location.path().ends_with("/data.csv"));
159 |     }
160 | 
161 |     #[test]
162 |     fn test_location_in_config() {
163 |         use serde_json;
164 | 
165 |         #[derive(serde::Serialize, serde::Deserialize, Debug, PartialEq)]
166 |         struct Config {
167 |             name: String,
168 |             location: Location,
169 |         }
170 | 
171 |         let config = Config {
172 |             name: "test".to_string(),
173 |             location: "s3://my-bucket/data".try_into().unwrap(),
174 |         };
175 | 
176 |         let json = serde_json::to_string(&config).unwrap();
177 |         let parsed: Config = serde_json::from_str(&json).unwrap();
178 | 
179 |         assert_eq!(config.name, parsed.name);
180 |         assert_eq!(config.location.as_ref(), parsed.location.as_ref());
181 |     }
182 | 
183 |     #[test]
184 |     fn test_mixed_location_types() {
185 |         let locations = vec![
186 |             ("./local.csv", "file"),
187 |             ("/absolute/path.json", "file"),
188 |             ("https://example.com/data.csv", "https"),
189 |             ("s3://bucket/key.parquet", "s3"),
190 |             ("gs://bucket/object", "gs"),
191 |             ("azure://container/blob", "azure"),
192 |         ];
193 | 
194 |         for (input, expected_scheme) in locations {
195 |             let location = Location::try_from(input).unwrap();
196 |             assert_eq!(
197 |                 location.scheme(),
198 |                 expected_scheme,
199 |                 "Failed for input: {}",
200 |                 input
201 |             );
202 |         }
203 |     }
204 | 
205 |     #[test]
206 |     fn test_yaml_deserialization() {
207 |         let yaml = r#"
208 | location: "./data/input.csv"
209 | "#;
210 | 
211 |         #[derive(serde::Deserialize)]
212 |         struct Config {
213 |             location: Location,
214 |         }
215 | 
216 |         let config: Config = serde_yml::from_str(yaml).unwrap();
217 |         assert_eq!(config.location.scheme(), "file");
218 |         assert!(config.location.path().ends_with("/data/input.csv"));
219 |     }
220 | }
221 | 


--------------------------------------------------------------------------------
/aqueducts/schemas/src/progress.rs:
--------------------------------------------------------------------------------
 1 | //! Progress event types for tracking pipeline execution
 2 | 
 3 | use serde::{Deserialize, Serialize};
 4 | 
 5 | /// Progress events emitted during pipeline execution
 6 | #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 7 | #[serde(tag = "type", rename_all = "snake_case")]
 8 | pub enum ProgressEvent {
 9 |     /// Pipeline execution started
10 |     Started,
11 |     /// A source has been registered
12 |     SourceRegistered {
13 |         /// Name of the source
14 |         name: String,
15 |     },
16 |     /// A stage has started processing
17 |     StageStarted {
18 |         /// Name of the stage
19 |         name: String,
20 |         /// Position in the stages array (outer)
21 |         position: usize,
22 |         /// Position in the parallel stages array (inner)
23 |         sub_position: usize,
24 |     },
25 |     /// A stage has completed processing
26 |     StageCompleted {
27 |         /// Name of the stage
28 |         name: String,
29 |         /// Position in the stages array (outer)
30 |         position: usize,
31 |         /// Position in the parallel stages array (inner)
32 |         sub_position: usize,
33 |         /// Duration of the stage execution
34 |         duration_ms: u64,
35 |     },
36 |     /// Data has been written to the destination
37 |     DestinationCompleted,
38 |     /// Pipeline execution completed
39 |     Completed {
40 |         /// Total duration of the pipeline execution
41 |         duration_ms: u64,
42 |     },
43 | }
44 | 
45 | /// Stage output types for websocket communication
46 | #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
47 | #[serde(tag = "type", rename_all = "snake_case")]
48 | pub enum OutputType {
49 |     /// Stage outputs the full dataframe
50 |     Show,
51 |     /// Stage outputs up to `usize` records
52 |     ShowLimit,
53 |     /// Stage outputs query plan
54 |     Explain,
55 |     /// Stage outputs query plan with execution metrics
56 |     ExplainAnalyze,
57 |     /// Stage outputs the dataframe schema
58 |     PrintSchema,
59 | }
60 | 


--------------------------------------------------------------------------------
/aqueducts/schemas/src/protocol.rs:
--------------------------------------------------------------------------------
 1 | //! Protocol types for websocket communication between client and executor.
 2 | 
 3 | use serde::{Deserialize, Serialize};
 4 | use uuid::Uuid;
 5 | 
 6 | use crate::{Aqueduct, ProgressEvent};
 7 | 
 8 | /// Stage output sent down to clients
 9 | #[derive(Debug, Clone, Serialize, Deserialize)]
10 | #[serde(tag = "type", rename_all = "snake_case")]
11 | pub enum StageOutputMessage {
12 |     /// Stage output is being streamed to client
13 |     OutputStart {
14 |         output_header: String,
15 |     },
16 |     /// Stage output content
17 |     OutputChunk {
18 |         /// Indicates the sequence of this chunk output
19 |         sequence: usize,
20 |         /// Output chunk body
21 |         body: String,
22 |     },
23 |     OutputEnd {
24 |         output_footer: String,
25 |     },
26 | }
27 | 
28 | /// Client websocket message
29 | #[derive(Debug, Clone, Serialize, Deserialize)]
30 | #[serde(tag = "type", rename_all = "snake_case")]
31 | #[allow(clippy::large_enum_variant)]
32 | pub enum ClientMessage {
33 |     /// Execution requested by client
34 |     ExecutionRequest {
35 |         /// The aqueducts pipeline to be executed
36 |         pipeline: Aqueduct,
37 |     },
38 |     /// Execution cancellation requested by client
39 |     CancelRequest {
40 |         /// Execution id of the pipeline execution to cancel
41 |         execution_id: Uuid,
42 |     },
43 | }
44 | 
45 | /// Executor websocket message
46 | #[derive(Debug, Clone, Serialize, Deserialize)]
47 | #[serde(tag = "type", rename_all = "snake_case")]
48 | pub enum ExecutorMessage {
49 |     /// Execution successfully queued
50 |     ExecutionResponse {
51 |         /// Execution id that identifies the queued execution
52 |         execution_id: Uuid,
53 |     },
54 |     /// Execution cancellation was successful
55 |     CancelResponse {
56 |         /// Execution id of the cancelled pipeline
57 |         execution_id: Uuid,
58 |     },
59 |     /// The queue position for the requested execution
60 |     QueuePosition {
61 |         /// Execution id of the queued pipeline
62 |         execution_id: Uuid,
63 |         /// Position of the requested execution in the queue
64 |         position: usize,
65 |     },
66 |     /// Progress update event emited by a running aqueducts pipeline
67 |     ProgressUpdate {
68 |         /// Execution id of the running pipeline
69 |         execution_id: Uuid,
70 |         /// Progress percentage (0-100)
71 |         progress: u8,
72 |         /// Progress event payload
73 |         event: ProgressEvent,
74 |     },
75 |     /// Stage output of a running pipeline
76 |     StageOutput {
77 |         /// Execution id of the running pipeline
78 |         execution_id: Uuid,
79 |         /// Stage name that is outputting
80 |         stage_name: String,
81 |         /// Stage output payload
82 |         payload: StageOutputMessage,
83 |     },
84 |     /// Pipeline execution completet successfully
85 |     ExecutionSucceeded {
86 |         /// Execution id of the pipeline
87 |         execution_id: Uuid,
88 |     },
89 |     ExecutionError {
90 |         /// Execution id that produced error
91 |         execution_id: Uuid,
92 |         /// Error message
93 |         message: String,
94 |     },
95 | }
96 | 


--------------------------------------------------------------------------------
/aqueducts/schemas/src/serde_helpers.rs:
--------------------------------------------------------------------------------
 1 | //! Shared serde helper functions for deserialization and default values.
 2 | //!
 3 | //! This module consolidates common serde helpers used across the schema types
 4 | 
 5 | use crate::data_types::DataType;
 6 | use serde::{Deserialize, Deserializer};
 7 | use std::str::FromStr;
 8 | 
 9 | // =============================================================================
10 | // Default value functions
11 | // =============================================================================
12 | 
13 | /// Default value for boolean fields that should be true
14 | pub fn default_true() -> bool {
15 |     true
16 | }
17 | 
18 | /// Default comma delimiter for CSV files
19 | pub fn default_comma() -> char {
20 |     ','
21 | }
22 | 
23 | /// Default batch size for ODBC operations
24 | pub fn default_batch_size() -> usize {
25 |     1000
26 | }
27 | 
28 | // =============================================================================
29 | // Custom deserializers
30 | // =============================================================================
31 | 
32 | /// Custom deserializer that handles string representations of DataType
33 | pub fn deserialize_data_type<'de, D>(deserializer: D) -> Result<DataType, D::Error>
34 | where
35 |     D: Deserializer<'de>,
36 | {
37 |     use serde::de::Error;
38 | 
39 |     // Deserialize as a string
40 |     let s = String::deserialize(deserializer)?;
41 |     DataType::from_str(&s).map_err(|e| D::Error::custom(format!("Invalid data type: {}", e)))
42 | }
43 | 
44 | /// Custom deserializer for partition columns that handles both tuple and object formats
45 | pub fn deserialize_partition_columns<'de, D>(
46 |     deserializer: D,
47 | ) -> Result<Vec<(String, DataType)>, D::Error>
48 | where
49 |     D: Deserializer<'de>,
50 | {
51 |     use serde::de::Error;
52 | 
53 |     #[derive(Deserialize)]
54 |     #[serde(untagged)]
55 |     enum PartitionColumn {
56 |         Tuple(String, String),                      // (name, type_string)
57 |         Object { name: String, data_type: String }, // {name: "col", data_type: "int32"}
58 |     }
59 | 
60 |     let columns: Vec<PartitionColumn> = Vec::deserialize(deserializer)?;
61 | 
62 |     columns
63 |         .into_iter()
64 |         .map(|col| match col {
65 |             PartitionColumn::Tuple(name, type_str) => {
66 |                 let data_type = DataType::from_str(&type_str).map_err(|e| {
67 |                     D::Error::custom(format!("Invalid data type in partition column: {}", e))
68 |                 })?;
69 |                 Ok((name, data_type))
70 |             }
71 |             PartitionColumn::Object {
72 |                 name,
73 |                 data_type: type_str,
74 |             } => {
75 |                 let data_type = DataType::from_str(&type_str).map_err(|e| {
76 |                     D::Error::custom(format!("Invalid data type in partition column: {}", e))
77 |                 })?;
78 |                 Ok((name, data_type))
79 |             }
80 |         })
81 |         .collect()
82 | }
83 | 


--------------------------------------------------------------------------------
/aqueducts/schemas/src/stages.rs:
--------------------------------------------------------------------------------
 1 | //! Stage configuration types and schemas.
 2 | //!
 3 | //! Stages define SQL transformations that are executed as part of an aqueducts pipeline.
 4 | //! Each stage produces a named table that can be referenced by subsequent stages.
 5 | 
 6 | use bon::Builder;
 7 | use serde::{Deserialize, Serialize};
 8 | 
 9 | /// A processing stage in an aqueducts pipeline.
10 | ///
11 | /// Stages execute SQL queries against the available data sources and previous stage results.
12 | /// Each stage creates a named table that can be referenced by subsequent stages.
13 | ///
14 | /// # Examples
15 | ///
16 | /// ```
17 | /// use aqueducts_schemas::Stage;
18 | ///
19 | /// // Basic stage - debug fields default to false, show defaults to None
20 | /// let stage = Stage::builder()
21 | ///     .name("aggregated_sales".to_string())
22 | ///     .query("SELECT region, SUM(amount) as total FROM sales GROUP BY region".to_string())
23 | ///     .build();
24 | ///
25 | /// // Stage with output shown
26 | /// let debug_stage = Stage::builder()
27 | ///     .name("debug_query".to_string())
28 | ///     .query("SELECT * FROM source LIMIT 5".to_string())
29 | ///     .show(10)
30 | ///     .build();
31 | /// ```
32 | #[derive(Debug, Clone, Serialize, Deserialize, Builder)]
33 | #[cfg_attr(feature = "schema_gen", derive(schemars::JsonSchema))]
34 | #[serde(rename_all = "snake_case")]
35 | pub struct Stage {
36 |     /// Name of the stage, used as the table name for the result of this stage
37 |     pub name: String,
38 | 
39 |     /// SQL query that is executed against a datafusion context. Check the datafusion SQL reference for more information <https://datafusion.apache.org/user-guide/sql/index.html>
40 |     pub query: String,
41 | 
42 |     /// When set to a value of up to `usize`, will print the result of this stage to the stdout limited by the number
43 |     /// Set value to 0 to not limit the outputs
44 |     #[serde(default)]
45 |     pub show: Option<usize>,
46 | 
47 |     /// When set to 'true' the stage will output the query execution plan
48 |     #[serde(default)]
49 |     #[builder(default)]
50 |     pub explain: bool,
51 | 
52 |     /// When set to 'true' the stage will output the query execution plan with added execution metrics
53 |     #[serde(default)]
54 |     #[builder(default)]
55 |     pub explain_analyze: bool,
56 | 
57 |     /// When set to 'true' the stage will pretty print the output schema of the executed query
58 |     #[serde(default)]
59 |     #[builder(default)]
60 |     pub print_schema: bool,
61 | }
62 | 


--------------------------------------------------------------------------------
/aqueducts/schemas/tests/integration.rs:
--------------------------------------------------------------------------------
  1 | //! Integration tests for aqueducts schemas
  2 | //!
  3 | //! Tests backwards compatibility, serialization, and basic functionality.
  4 | 
  5 | use aqueducts_schemas::{Aqueduct, Field};
  6 | use std::fs;
  7 | use std::path::Path;
  8 | 
  9 | #[test]
 10 | fn test_backwards_compatibility() {
 11 |     // Test that old field names are still supported via aliases
 12 |     let old_format_json = r#"{
 13 |         "sources": [
 14 |             {
 15 |                 "type": "File",
 16 |                 "name": "test_data",
 17 |                 "file_type": {
 18 |                     "type": "Csv",
 19 |                     "options": {
 20 |                         "has_header": true,
 21 |                         "delimiter": ","
 22 |                     }
 23 |                 },
 24 |                 "location": "./data.csv",
 25 |                 "storage_options": {}
 26 |             }
 27 |         ],
 28 |         "stages": [],
 29 |         "destination": {
 30 |             "type": "Delta",
 31 |             "name": "output",
 32 |             "location": "./output",
 33 |             "write_mode": {
 34 |                 "operation": "Append"
 35 |             },
 36 |             "storage_options": {},
 37 |             "partition_cols": [],
 38 |             "table_properties": {},
 39 |             "custom_metadata": {},
 40 |             "schema": []
 41 |         }
 42 |     }"#;
 43 | 
 44 |     let parsed: Aqueduct = serde_json::from_str(old_format_json).unwrap();
 45 |     assert_eq!(parsed.sources.len(), 1);
 46 |     assert!(parsed.destination.is_some());
 47 | }
 48 | 
 49 | #[test]
 50 | fn test_field_defaults() {
 51 |     // Test field without nullable should get default (true)
 52 |     let field_json = r#"{
 53 |         "name": "test_field",
 54 |         "type": "string"
 55 |     }"#;
 56 | 
 57 |     let field: Field = serde_json::from_str(field_json).unwrap();
 58 |     assert!(field.nullable); // Should get default
 59 | }
 60 | 
 61 | #[test]
 62 | fn test_version_default() {
 63 |     // Test that version gets default value when missing
 64 |     let config_json = r#"{
 65 |         "sources": [],
 66 |         "stages": []
 67 |     }"#;
 68 | 
 69 |     let parsed: Aqueduct = serde_json::from_str(config_json).unwrap();
 70 |     assert_eq!(parsed.version, "v2");
 71 | }
 72 | 
 73 | #[test]
 74 | fn test_pipeline_serialization_roundtrip() {
 75 |     // Test complete pipeline roundtrip serialization
 76 |     let pipeline = Aqueduct {
 77 |         version: "v2".to_string(),
 78 |         sources: vec![],
 79 |         stages: vec![],
 80 |         destination: None,
 81 |     };
 82 | 
 83 |     let json = serde_json::to_string(&pipeline).unwrap();
 84 |     let parsed: Aqueduct = serde_json::from_str(&json).unwrap();
 85 | 
 86 |     assert_eq!(pipeline.version, parsed.version);
 87 |     assert_eq!(pipeline.sources.len(), parsed.sources.len());
 88 | }
 89 | 
 90 | #[test]
 91 | fn test_example_pipeline_files() {
 92 |     // Test that all example pipeline files can be deserialized
 93 |     let examples_dir = Path::new(env!("CARGO_MANIFEST_DIR"))
 94 |         .parent()
 95 |         .unwrap()
 96 |         .parent()
 97 |         .unwrap()
 98 |         .join("examples");
 99 | 
100 |     if !examples_dir.exists() {
101 |         return; // Skip if examples directory doesn't exist
102 |     }
103 | 
104 |     for entry in fs::read_dir(examples_dir).unwrap() {
105 |         let entry = entry.unwrap();
106 |         let path = entry.path();
107 |         let file_name = path.file_name().unwrap().to_str().unwrap();
108 | 
109 |         // Skip non-pipeline files
110 |         if !file_name.starts_with("aqueduct_pipeline") {
111 |             continue;
112 |         }
113 | 
114 |         let content =
115 |             fs::read_to_string(&path).unwrap_or_else(|_| panic!("Failed to read file: {:?}", path));
116 | 
117 |         // Test deserialization based on file extension
118 |         let _pipeline: Aqueduct =
119 |             if path.extension().unwrap() == "yml" || path.extension().unwrap() == "yaml" {
120 |                 serde_yml::from_str(&content)
121 |                     .unwrap_or_else(|e| panic!("Failed to parse YAML file {}: {}", file_name, e))
122 |             } else if path.extension().unwrap() == "json" {
123 |                 serde_json::from_str(&content)
124 |                     .unwrap_or_else(|e| panic!("Failed to parse JSON file {}: {}", file_name, e))
125 |             } else {
126 |                 continue; // Skip non-YAML/JSON files
127 |             };
128 | 
129 |         println!("Successfully parsed: {}", file_name);
130 |     }
131 | }
132 | 


--------------------------------------------------------------------------------
/cliff.toml:
--------------------------------------------------------------------------------
  1 | # git-cliff ~ configuration file
  2 | # https://git-cliff.org/docs/configuration
  3 | 
  4 | [changelog]
  5 | # template for the changelog footer
  6 | header = """
  7 | # Changelog\n
  8 | All notable changes to this project will be documented in this file.
  9 | 
 10 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 11 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).\n
 12 | """
 13 | # template for the changelog body
 14 | # https://keats.github.io/tera/docs/#introduction
 15 | body = """
 16 | {%- macro remote_url() -%}
 17 |   https://github.com/{{ remote.github.owner }}/{{ remote.github.repo }}
 18 | {%- endmacro -%}
 19 | 
 20 | {% if version -%}
 21 |     ## [{{ version | trim_start_matches(pat="v") }}] - {{ timestamp | date(format="%Y-%m-%d") }}
 22 | {% else -%}
 23 |     ## [Unreleased]
 24 | {% endif -%}
 25 | 
 26 | ### Details\
 27 | 
 28 | {% for group, commits in commits | group_by(attribute="group") %}
 29 |     #### {{ group | upper_first }}
 30 |     {%- for commit in commits %}
 31 |         - {{ commit.message | upper_first | trim }}\
 32 |             {% if commit.remote.username %} by @{{ commit.remote.username }}{%- endif -%}
 33 |             {% if commit.remote.pr_number %} in \
 34 |               [#{{ commit.remote.pr_number }}]({{ self::remote_url() }}/pull/{{ commit.remote.pr_number }}) \
 35 |             {%- endif -%}
 36 |     {% endfor %}
 37 | {% endfor %}
 38 | 
 39 | {%- if github.contributors | filter(attribute="is_first_time", value=true) | length != 0 %}
 40 |   ## New Contributors
 41 | {%- endif -%}
 42 | 
 43 | {% for contributor in github.contributors | filter(attribute="is_first_time", value=true) %}
 44 |   * @{{ contributor.username }} made their first contribution
 45 |     {%- if contributor.pr_number %} in \
 46 |       [#{{ contributor.pr_number }}]({{ self::remote_url() }}/pull/{{ contributor.pr_number }}) \
 47 |     {%- endif %}
 48 | {%- endfor %}\n
 49 | """
 50 | # template for the changelog footer
 51 | footer = """
 52 | {%- macro remote_url() -%}
 53 |   https://github.com/{{ remote.github.owner }}/{{ remote.github.repo }}
 54 | {%- endmacro -%}
 55 | 
 56 | {% for release in releases -%}
 57 |     {% if release.version -%}
 58 |         {% if release.previous.version -%}
 59 |             [{{ release.version | trim_start_matches(pat="v") }}]: \
 60 |                 {{ self::remote_url() }}/compare/{{ release.previous.version }}..{{ release.version }}
 61 |         {% endif -%}
 62 |     {% else -%}
 63 |         [unreleased]: {{ self::remote_url() }}/compare/{{ release.previous.version }}..HEAD
 64 |     {% endif -%}
 65 | {% endfor %}
 66 | <!-- generated by git-cliff -->
 67 | """
 68 | # remove the leading and trailing whitespace from the templates
 69 | trim = true
 70 | 
 71 | [git]
 72 | # parse the commits based on https://www.conventionalcommits.org
 73 | conventional_commits = true
 74 | # filter out the commits that are not conventional
 75 | filter_unconventional = false
 76 | # process each line of a commit as an individual commit
 77 | split_commits = false
 78 | # regex for preprocessing the commit messages
 79 | commit_preprocessors = [
 80 |     # remove issue numbers from commits
 81 |     { pattern = '\((\w+\s)?#([0-9]+)\)', replace = "" },
 82 | ]
 83 | # regex for parsing and grouping commits
 84 | commit_parsers = [
 85 |     { message = "^.*: add", group = "Added" },
 86 |     { message = "^.*: support", group = "Added" },
 87 |     { message = "^.*: remove", group = "Removed" },
 88 |     { message = "^.*: delete", group = "Removed" },
 89 |     { message = "^test", group = "Fixed" },
 90 |     { message = "^fix", group = "Fixed" },
 91 |     { message = "^.*: fix", group = "Fixed" },
 92 |     { message = "^.*", group = "Changed" },
 93 | ]
 94 | # protect breaking changes from being skipped due to matching a skipping commit_parser
 95 | protect_breaking_commits = false
 96 | # filter out the commits that are not matched by commit parsers
 97 | filter_commits = true
 98 | # regex for matching git tags
 99 | tag_pattern = "v[0-9].*"
100 | # regex for skipping tags
101 | skip_tags = "v0.1.0-beta.1"
102 | # regex for ignoring tags
103 | ignore_tags = ""
104 | # sort the tags topologically
105 | topo_order = false
106 | # sort the commits inside sections by oldest/newest order
107 | sort_commits = "oldest"
108 | 


--------------------------------------------------------------------------------
/db/init.sql:
--------------------------------------------------------------------------------
 1 | CREATE TABLE temp_readings (
 2 |     location_id       INTEGER,
 3 |     timestamp         TIMESTAMP,
 4 |     temperature_c     FLOAT,
 5 |     humidity          FLOAT,
 6 |     weather_condition VARCHAR(50)
 7 | );
 8 | 
 9 | CREATE TABLE temp_readings_empty (
10 |     location_id       INTEGER,
11 |     timestamp         TIMESTAMP,
12 |     temperature_c     FLOAT,
13 |     humidity          FLOAT,
14 |     weather_condition VARCHAR(50)
15 | );
16 | 
17 | CREATE TABLE temp_readings_aggregated (
18 |     date            DATE,
19 |     location_id     INTEGER,
20 |     min_temp_c      FLOAT,
21 |     min_humidity    FLOAT,
22 |     max_temp_c      FLOAT,
23 |     max_humidity    FLOAT,
24 |     avg_temp_c      FLOAT,
25 |     avg_humidity    FLOAT
26 | );
27 | 
28 | COPY temp_readings FROM '/opt/temp_readings_jan_2024.csv' DELIMITER ',' CSV HEADER;
29 | COPY temp_readings FROM '/opt/temp_readings_feb_2024.csv' DELIMITER ',' CSV HEADER;
30 | 
31 | CREATE TABLE test_custom_delete_insert_ok (
32 |     id          INTEGER,
33 |     value       VARCHAR(50)
34 | );
35 | 
36 | CREATE TABLE test_custom_delete_insert_failed (
37 |     id          INTEGER,
38 |     value       VARCHAR(50)
39 | );
40 | 


--------------------------------------------------------------------------------
/dist-workspace.toml:
--------------------------------------------------------------------------------
 1 | [workspace]
 2 | members = ["cargo:."]
 3 | 
 4 | # Config for 'dist'
 5 | [dist]
 6 | # The preferred dist version to use in CI (Cargo.toml SemVer syntax)
 7 | cargo-dist-version = "0.28.3"
 8 | # CI backends to support
 9 | ci = "github"
10 | # The installers to generate for each app
11 | installers = ["shell", "homebrew"]
12 | # A GitHub repo to push Homebrew formulas to
13 | tap = "vigimite/homebrew-aqueducts"
14 | # Target platforms to build apps for (Rust target-triple syntax)
15 | targets = ["aarch64-apple-darwin", "aarch64-unknown-linux-gnu", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu", "x86_64-unknown-linux-musl"]
16 | # Path that installers should place binaries in
17 | install-path = "CARGO_HOME"
18 | # Publish jobs to run in CI
19 | publish-jobs = ["homebrew"]
20 | # Whether to install an updater program
21 | install-updater = false
22 | # Only build these specific binaries
23 | bins = ["aqueducts"]
24 | 
25 | # Homebrew-specific configuration
26 | [dist.homebrew]
27 | # Override the formula name to match the crate name
28 | formula-name = "aqueducts-cli"
29 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   db:
 3 |     image: postgres:15
 4 |     restart: "no"
 5 |     environment:
 6 |       POSTGRES_USER: postgres
 7 |       POSTGRES_PASSWORD: postgres
 8 |     ports:
 9 |       - 5432:5432
10 |     volumes:
11 |       - ./db/init.sql:/docker-entrypoint-initdb.d/init.sql
12 |       - ./examples/temp_readings_jan_2024.csv:/opt/temp_readings_jan_2024.csv
13 |       - ./examples/temp_readings_feb_2024.csv:/opt/temp_readings_feb_2024.csv
14 | 
15 |   aqueducts-executor:
16 |     profiles: ["executor"]
17 |     build: 
18 |       context: .
19 |       dockerfile: docker/Dockerfile
20 |     ports:
21 |       - "3031:3031"
22 |     environment:
23 |       - RUST_LOG=info
24 |     command: ["aqueducts-executor", "--host", "0.0.0.0", "--port", "3031", "--api-key", "test_secret_key"]
25 |     restart: unless-stopped
26 |     depends_on:
27 |       - db
28 |     healthcheck:
29 |       test: ["CMD", "curl", "-f", "http://localhost:3031/api/health"]
30 |       interval: 30s
31 |       timeout: 10s
32 |       retries: 3
33 |       start_period: 40s
34 | 


--------------------------------------------------------------------------------
/docker/.dockerignore:
--------------------------------------------------------------------------------
 1 | # Target directories
 2 | target/
 3 | **/target/
 4 | 
 5 | # Git
 6 | .git/
 7 | .gitignore
 8 | 
 9 | # CI/CD
10 | .github/
11 | 
12 | # Documentation
13 | docs/
14 | *.md
15 | 
16 | # Examples and test data
17 | examples/
18 | **/tests/output/
19 | 
20 | # IDE
21 | .vscode/
22 | .idea/
23 | 
24 | # OS
25 | .DS_Store
26 | Thumbs.db
27 | 
28 | # Logs
29 | *.log
30 | 
31 | # Temporary files
32 | *.tmp
33 | *.temp


--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM rust:1.86-slim AS builder
 2 | 
 3 | RUN apt-get update && apt-get install -y \
 4 |     pkg-config \
 5 |     libssl-dev \
 6 |     unixodbc-dev \
 7 |     perl \
 8 |     make \
 9 |     gcc \
10 |     && rm -rf /var/lib/apt/lists/*
11 | 
12 | WORKDIR /app
13 | 
14 | COPY Cargo.toml Cargo.lock ./
15 | COPY aqueducts/ ./aqueducts/
16 | COPY aqueducts-cli/ ./aqueducts-cli/
17 | COPY aqueducts-executor/ ./aqueducts-executor/
18 | 
19 | RUN cargo build --release --features odbc -p aqueducts-executor
20 | 
21 | FROM debian:bookworm-slim
22 | 
23 | RUN apt-get update && apt-get install -y \
24 |     ca-certificates \
25 |     unixodbc \
26 |     odbc-postgresql \
27 |     curl \
28 |     && rm -rf /var/lib/apt/lists/*
29 | 
30 | RUN useradd --create-home --shell /bin/bash aqueducts
31 | 
32 | COPY --from=builder /app/target/release/aqueducts-executor /usr/local/bin/aqueducts-executor
33 | 
34 | # Copy ODBC configuration files
35 | COPY docker/odbcinst.ini /etc/odbcinst.ini
36 | COPY docker/odbc.ini /etc/odbc.ini
37 | 
38 | RUN chmod +x /usr/local/bin/aqueducts-executor
39 | 
40 | USER aqueducts
41 | WORKDIR /home/aqueducts
42 | 
43 | EXPOSE 3031
44 | 
45 | HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \
46 |   CMD curl -f http://localhost:3031/api/health || exit 1
47 | 
48 | CMD ["aqueducts-executor", "--host", "0.0.0.0", "--port", "3031"]
49 | 


--------------------------------------------------------------------------------
/docker/odbc.ini:
--------------------------------------------------------------------------------
 1 | [postgres]
 2 | Description=PostgreSQL connection
 3 | Driver=PostgreSQL Unicode
 4 | Server=postgres
 5 | Port=5432
 6 | Database=postgres
 7 | Username=postgres
 8 | Password=
 9 | SSLMode=prefer
10 | 
11 | [aqueducts_test]
12 | Description=Aqueducts test database
13 | Driver=PostgreSQL Unicode
14 | Server=db
15 | Port=5432
16 | Database=postgres
17 | Username=postgres
18 | Password=postgres
19 | SSLMode=prefer
20 | 


--------------------------------------------------------------------------------
/docker/odbcinst.ini:
--------------------------------------------------------------------------------
 1 | [PostgreSQL ANSI]
 2 | Description=PostgreSQL ODBC driver (ANSI version)
 3 | Driver=psqlodbca.so
 4 | Setup=libodbcpsqlS.so
 5 | Debug=0
 6 | CommLog=1
 7 | UsageCount=1
 8 | 
 9 | [PostgreSQL Unicode]
10 | Description=PostgreSQL ODBC driver (Unicode version)
11 | Driver=psqlodbcw.so
12 | Setup=libodbcpsqlS.so
13 | Debug=0
14 | CommLog=1
15 | UsageCount=1
16 | 


--------------------------------------------------------------------------------
/docs/about.md:
--------------------------------------------------------------------------------
1 | # About
2 | 
3 | TODO
4 | 


--------------------------------------------------------------------------------
/docs/architecture.md:
--------------------------------------------------------------------------------
 1 | # Architecture
 2 | 
 3 | An aqueduct is a pipeline definition and consists of 3 main parts
 4 | 
 5 | - Source -> the source data for this pipeline
 6 | - Stage -> transformations applied within this pipeline
 7 | - Destination -> output of the pipeline result
 8 | 
 9 | ## Source
10 | 
11 | An Aqueduct source can be:
12 | 
13 | - CSV or Parquet file(s)
14 |   - single file
15 |   - directory
16 | - Delta table
17 | - ODBC query (EXPERIMENTAL)
18 | 
19 | For file based sources a schema can be provided optionally.
20 | 
21 | The source is registered within the `SessionContext` as a table that can be referenced using the sources configured name. A prerequisite here is that the necessary features for the underlying object stores are enabled.
22 | This can be provided by an external `SessionContext` passed into the `run_pipeline` function or by registering the correct handlers for deltalake.
23 | 
24 | **EXPERIMENTAL ODBC support**
25 | 
26 | As an experimental feature it is possible to query various databases using ODBC. This is enabled through [arrow-odbc](https://crates.io/crates/arrow-odbc).
27 | Besides enabling the `odbc` feature flag in your `Cargo.toml` there are some other prerequisites for the executing system:
28 | 
29 | - `unixodbc` on unix based systems
30 | - ODBC driver for the database you want to access like [ODBC Driver for SQL server](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server) or [psqlodbc](https://github.com/postgresql-interfaces/psqlodbc)
31 | - registering the driver in the ODBC manager configuration (usually located in `/etc/odbcinst.ini`)
32 | 
33 | If you have issues setting this up there are many resources online explaining how to set this up, it is a bit of a hassle.
34 | 
35 | ## Stage
36 | 
37 | An Aqueduct stage defines a transformation using SQL. Each stage has access to all defined sources and to every previously executed stage within the SQL context using the respectively configured names.
38 | Once executed the stage will then persist its result into the SQL context making it accessible to downstream consumers.
39 | 
40 | The stage can be set to print the result and/or the result schema to the `stdout`. This is useful for development/debugging purposes.
41 | 
42 | Nested stages are executed in parallel
43 | 
44 | ## Destination
45 | 
46 | An Aqueduct destination can be:
47 | 
48 | - CSV or Parquet file(s)
49 |   - single file
50 |   - directory
51 | - Delta table
52 | - ODBC query (NOT IMPLEMENTED YET)
53 | 
54 | An Aqueduct destination is the target for the execution of the pipeline, the result of the final stage that was executed is used as the input for the destination to write the data to the underlying table/file.
55 | 
56 | **File based destinations**
57 | 
58 | File based destinations have support for HDFS style partitioning (`output/location=1/...`) and can be set to output only a single file or multiple files based on the configuration.
59 | 
60 | **Delta Table destination**
61 | 
62 | For a DeltaTable there is some additional logic that is utilized to maintain the table integrity.
63 | 
64 | The destination will first cast and validate the schema of the input data and then use one of 3 configurable modes to write the data:
65 | 
66 | - Append -> appends the data to the destination
67 | - Upsert -> merges the data to the destination, using the provided configuration for this mode to identify cohort columns that are used to determine which data should be updated
68 |   - provided merge columns are used to check equality e.g. `vec!["date", "country"]` -> update data where `old.date = new.date AND old.country = new.country`
69 | - Replace -> replaces the data using a configurable predicate to determine which data should be replaced by the operation
70 |   - provided replacement conditions are used to check equality e.g. `ReplacementCondition { column: "date", value: "1970-01-01" }` -> replace data where `old.date = '1970-01-01'`
71 | 


--------------------------------------------------------------------------------
/docs/assets/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vigimite/aqueducts/6eaf764852ac9348eb45c7073ecf126d61eb5505/docs/assets/favicon.ico


--------------------------------------------------------------------------------
/docs/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vigimite/aqueducts/6eaf764852ac9348eb45c7073ecf126d61eb5505/docs/assets/logo.png


--------------------------------------------------------------------------------
/docs/cli.md:
--------------------------------------------------------------------------------
 1 | # Aqueducts CLI
 2 | 
 3 | Example CLI application utilizing the Aqueducts framework to run ETL pipelines declared in YAML.
 4 | 
 5 | ## Install
 6 | 
 7 | ```bash
 8 | # install with default features (s3, gcs, azure)
 9 | cargo install aqueducts-cli
10 | 
11 | # install with odbc support
12 | cargo install aqueducts-cli --features odbc
13 | 
14 | # install with s3 support only
15 | cargo install aqueducts-cli --no-default-features --features s3
16 | ```
17 | 
18 | ## Run
19 | 
20 | ```bash
21 | aqueducts --file ./example.yml --param key1=value1 --param key2=value2  
22 | ```
23 | 


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | # Aqueducts
 2 | 
 3 | This is the documentation for [Aqueducts](https://github.com/vigimite/aqueducts)
 4 | 
 5 | [![Build status](https://github.com/vigimite/aqueducts/actions/workflows/build.yml/badge.svg?branch=main)](https://github.com/vigimite/aqueducts/actions/workflows/CI.yml) [![Crates.io](https://img.shields.io/crates/v/aqueducts)](https://crates.io/crates/aqueducts) [![Documentation](https://docs.rs/aqueducts/badge.svg)](https://docs.rs/aqueducts)
 6 | 
 7 | <img src="assets/logo.png" width="100">
 8 | 
 9 | Aqueducts is a framework to write and execute ETL data pipelines declaratively.
10 | 
11 | **Features:**
12 | 
13 | - Define ETL pipelines in YAML
14 | - Extract data from csv files, parquet files or delta tables
15 | - Process data using SQL
16 | - Load data into object stores as csv/parquet or delta tables
17 | - Support for file and delta table partitioning
18 | - Support for Upsert/Replace/Append operation on delta tables
19 | - Support for Local, S3, GCS and Azure Blob storage
20 | - *EXPERIMENTAL* Support for ODBC Sources and Destinations
21 | 
22 | This framework builds on the fantastic work done by projects such as:
23 | 
24 | - [arrow-rs](https://github.com/apache/arrow-rs)
25 | - [datafusion](https://github.com/apache/datafusion)
26 | - [delta-rs](https://github.com/delta-io/delta-rs)
27 | 
28 | Please show these projects some support :heart:!
29 | 


--------------------------------------------------------------------------------
/docs/stylesheets/extra.css:
--------------------------------------------------------------------------------
 1 | [data-md-color-scheme="default"] {
 2 |   --md-primary-fg-color: #111F2C;
 3 |   --md-primary-fg-color--light: #111F2C;
 4 |   --md-primary-fg-color--dark: #111F2C;
 5 |   --md-footer-bg-color--light: #111F2C;
 6 |   --md-footer-bg-color--dark: #111F2C;
 7 | }
 8 | 
 9 | [data-md-color-scheme="slate"] {
10 |   --md-primary-fg-color: #3B444B;
11 |   --md-primary-fg-color--light: #3B444B;
12 |   --md-primary-fg-color--dark: #3B444B;
13 |   --md-footer-bg-color--light: #3B444B;
14 |   --md-footer-bg-color--dark: #3B444B;
15 | }


--------------------------------------------------------------------------------
/examples/aqueduct_pipeline_example.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "sources": [
  3 |     {
  4 |       "type": "File",
  5 |       "name": "some_table",
  6 |       "file_type": {
  7 |         "type": "Csv",
  8 |         "options": {
  9 |           "has_header": true,
 10 |           "delimiter": ","
 11 |         }
 12 |       },
 13 |       "location": "${local_path}/examples/test_data/example_1.csv"
 14 |     },
 15 |     {
 16 |       "type": "File",
 17 |       "name": "another_table",
 18 |       "file_type": {
 19 |         "type": "Csv",
 20 |         "options": {
 21 |           "has_header": true,
 22 |           "delimiter": ","
 23 |         }
 24 |       },
 25 |       "location": "${local_path}/examples/test_data/example_2.csv"
 26 |     }
 27 |   ],
 28 |   "stages": [
 29 |     [
 30 |       {
 31 |         "name": "aggregate",
 32 |         "query": "SELECT date, country, SUM(a) as sum_1, SUM(b) as sum_2 FROM some_table  GROUP BY 1, 2",
 33 |         "show": 20
 34 |       },
 35 |       {
 36 |         "name": "average",
 37 |         "query": "SELECT date, country, AVG(x) as avg_1, AVG(y) as avg_2 FROM another_table  GROUP BY 1, 2",
 38 |         "show": 0
 39 |       }
 40 |     ],
 41 |     [
 42 |       {
 43 |         "name": "join",
 44 |         "query": "SELECT  COALESCE(agg.date, avg.date) as date,  COALESCE(agg.country, avg.country) as country,  sum_1,  sum_2,  avg_1,  avg_2FROM aggregate agg JOIN average avg ON agg.date = avg.date AND agg.country = avg.country WHERE COALESCE(agg.date, avg.date) = '1970-01-01'"
 45 |       }
 46 |     ]
 47 |   ],
 48 |   "destination": {
 49 |     "type": "Delta",
 50 |     "name": "example_output",
 51 |     "location": "${local_path}/examples/output_delta_example/${run_id}",
 52 |     "storage_options": {},
 53 |     "table_properties": {},
 54 |     "write_mode": {
 55 |       "operation": "Replace",
 56 |       "params": [
 57 |         {
 58 |           "column": "date",
 59 |           "value": "1970-01-01"
 60 |         }
 61 |       ]
 62 |     },
 63 |     "partition_cols": [
 64 |       "date"
 65 |     ],
 66 |     "schema": [
 67 |       {
 68 |         "name": "date",
 69 |         "type": "date",
 70 |         "nullable": true,
 71 |         "metadata": {}
 72 |       },
 73 |       {
 74 |         "name": "country",
 75 |         "type": "string",
 76 |         "nullable": true,
 77 |         "metadata": {}
 78 |       },
 79 |       {
 80 |         "name": "sum_1",
 81 |         "type": "integer",
 82 |         "nullable": true,
 83 |         "metadata": {}
 84 |       },
 85 |       {
 86 |         "name": "sum_2",
 87 |         "type": "double",
 88 |         "nullable": true,
 89 |         "metadata": {}
 90 |       },
 91 |       {
 92 |         "name": "avg_1",
 93 |         "type": "double",
 94 |         "nullable": true,
 95 |         "metadata": {}
 96 |       },
 97 |       {
 98 |         "name": "avg_2",
 99 |         "type": "double",
100 |         "nullable": true,
101 |         "metadata": {}
102 |       }
103 |     ]
104 |   }
105 | }


--------------------------------------------------------------------------------
/examples/aqueduct_pipeline_example.toml:
--------------------------------------------------------------------------------
  1 | version = "v2"
  2 | 
  3 | stages = [
  4 |   [
  5 |     { name = "jan_aggregated", query = """
  6 |       SELECT
  7 |         cast(timestamp as date) date,
  8 |         location_id,
  9 |         round(min(temperature_c),2) min_temp_c,
 10 |         round(min(humidity),2) min_humidity,
 11 |         round(max(temperature_c),2) max_temp_c,
 12 |         round(max(humidity),2) max_humidity,
 13 |         round(avg(temperature_c),2) avg_temp_c,
 14 |         round(avg(humidity),2) avg_humidity
 15 |       FROM jan_data GROUP by 1,2 ORDER by 1 asc
 16 |     """, show = 20 },
 17 |     { name = "feb_aggregated", query = """
 18 |       SELECT
 19 |         cast(timestamp as date) date,
 20 |         location_id,
 21 |         round(min(temperature_c),2) min_temp_c,
 22 |         round(min(humidity),2) min_humidity,
 23 |         round(max(temperature_c),2) max_temp_c,
 24 |         round(max(humidity),2) max_humidity,
 25 |         round(avg(temperature_c),2) avg_temp_c,
 26 |         round(avg(humidity),2) avg_humidity
 27 |       FROM feb_data GROUP by 1,2 ORDER by 1 asc
 28 |     """, show = 0 },
 29 |   ],
 30 |   [
 31 |     { name = "union", query = "SELECT * FROM jan_aggregated UNION (SELECT * FROM feb_aggregated)", print_schema = true },
 32 |   ],
 33 | ]
 34 | 
 35 | [[sources]]
 36 | type = "file"
 37 | name = "jan_data"
 38 | location = "${local_path}/examples/temp_readings_jan_2024.csv"
 39 | [sources.format]
 40 | type = "csv"
 41 | [sources.format.options]
 42 | has_header = true
 43 | delimiter = ","
 44 | 
 45 | [[sources]]
 46 | type = "file"
 47 | name = "feb_data"
 48 | location = "${local_path}/examples/temp_readings_feb_2024.csv"
 49 | [sources.format]
 50 | type = "csv"
 51 | [sources.format.options]
 52 | has_header = true
 53 | delimiter = ","
 54 | 
 55 | [destination]
 56 | type = "delta"
 57 | name = "example_output"
 58 | location = "${local_path}/examples/output_delta_example/${run_id}"
 59 | storage_config = {}
 60 | table_properties = {}
 61 | partition_columns = ["date"]
 62 | 
 63 | [destination.write_mode]
 64 | operation = "upsert"
 65 | params = ["date"]
 66 | 
 67 | [[destination.schema]]
 68 | name = "date"
 69 | data_type = "date32"
 70 | nullable = true
 71 | metadata = {}
 72 | 
 73 | [[destination.schema]]
 74 | name = "location_id"
 75 | data_type = "int32"
 76 | nullable = true
 77 | metadata = {}
 78 | 
 79 | [[destination.schema]]
 80 | name = "min_temp_c"
 81 | data_type = "float64"
 82 | nullable = true
 83 | metadata = {}
 84 | 
 85 | [[destination.schema]]
 86 | name = "min_humidity"
 87 | data_type = "float64"
 88 | nullable = true
 89 | metadata = {}
 90 | 
 91 | [[destination.schema]]
 92 | name = "max_temp_c"
 93 | data_type = "float64"
 94 | nullable = true
 95 | metadata = {}
 96 | 
 97 | [[destination.schema]]
 98 | name = "max_humidity"
 99 | data_type = "float64"
100 | nullable = true
101 | metadata = {}
102 | 
103 | [[destination.schema]]
104 | name = "avg_temp_c"
105 | data_type = "float64"
106 | nullable = true
107 | metadata = {}
108 | 
109 | [[destination.schema]]
110 | name = "avg_humidity"
111 | data_type = "float64"
112 | nullable = true
113 | metadata = {}


--------------------------------------------------------------------------------
/examples/aqueduct_pipeline_example.yml:
--------------------------------------------------------------------------------
  1 | version: "v2"
  2 | sources:
  3 |   - type: file
  4 |     name: jan_data
  5 |     format:
  6 |       type: csv
  7 |       options:
  8 |         has_header: true
  9 |         delimiter: ","
 10 |     location: ${local_path}/examples/temp_readings_jan_2024.csv
 11 | 
 12 |   - type: file
 13 |     name: feb_data
 14 |     format:
 15 |       type: csv
 16 |       options:
 17 |         has_header: true
 18 |         delimiter: ","
 19 |     location: ${local_path}/examples/temp_readings_feb_2024.csv
 20 | 
 21 | stages:
 22 |   - - name: jan_aggregated
 23 |       query: >
 24 |           SELECT
 25 |             cast(timestamp as date) date,
 26 |             location_id,
 27 |             round(min(temperature_c),2) min_temp_c,
 28 |             round(min(humidity),2) min_humidity,
 29 |             round(max(temperature_c),2) max_temp_c,
 30 |             round(max(humidity),2) max_humidity,
 31 |             round(avg(temperature_c),2) avg_temp_c,
 32 |             round(avg(humidity),2) avg_humidity
 33 |           FROM jan_data
 34 |           GROUP by 1,2
 35 |           ORDER by 1 asc
 36 |       # print 20 rows of the result for this query to stdout
 37 |       show: 20 
 38 | 
 39 |     - name: feb_aggregated
 40 |       query: >
 41 |           SELECT
 42 |             cast(timestamp as date) date,
 43 |             location_id,
 44 |             round(min(temperature_c),2) min_temp_c,
 45 |             round(min(humidity),2) min_humidity,
 46 |             round(max(temperature_c),2) max_temp_c,
 47 |             round(max(humidity),2) max_humidity,
 48 |             round(avg(temperature_c),2) avg_temp_c,
 49 |             round(avg(humidity),2) avg_humidity
 50 |           FROM feb_data
 51 |           GROUP by 1,2
 52 |           ORDER by 1 asc
 53 |       # print the entire result for this query to stdout
 54 |       show: 0
 55 | 
 56 |   - - name: union
 57 |       query: >
 58 |         SELECT * FROM jan_aggregated UNION (SELECT * FROM feb_aggregated)
 59 |       print_schema: true # print the resulting schema of this query to stdout
 60 | 
 61 | destination:
 62 |   type: delta
 63 |   name: example_output
 64 |   location: ${local_path}/examples/output_delta_example/${run_id}
 65 |   storage_config: {}
 66 |   table_properties: {}
 67 | 
 68 |   # how to write this table
 69 |   # valid options are Append, Replace and Upsert
 70 |   write_mode:
 71 |     # upserts using the date as the "primary" key
 72 |     operation: upsert
 73 |     params: 
 74 |       - date
 75 | 
 76 |   # columns by which to partition the table
 77 |   partition_columns:
 78 |     - date
 79 | 
 80 |   # table schema using de-serialization provided by `deltalake::kernel::StructField`
 81 |   schema:
 82 |     - name: date
 83 |       data_type: date32
 84 |       nullable: true
 85 |       metadata: {}
 86 |     - name: location_id
 87 |       data_type: int32
 88 |       nullable: true
 89 |       metadata: {}
 90 |     - name: min_temp_c
 91 |       data_type: float64
 92 |       nullable: true
 93 |       metadata: {}
 94 |     - name: min_humidity
 95 |       data_type: float64
 96 |       nullable: true
 97 |       metadata: {}
 98 |     - name: max_temp_c
 99 |       data_type: float64
100 |       nullable: true
101 |       metadata: {}
102 |     - name: max_humidity
103 |       data_type: float64
104 |       nullable: true
105 |       metadata: {}
106 |     - name: avg_temp_c
107 |       data_type: float64
108 |       nullable: true
109 |       metadata: {}
110 |     - name: avg_humidity
111 |       data_type: float64
112 |       nullable: true
113 |       metadata: {}


--------------------------------------------------------------------------------
/examples/aqueduct_pipeline_odbc.yml:
--------------------------------------------------------------------------------
 1 | version: "v2"
 2 | sources:
 3 |   - type: odbc
 4 |     name: jan_data
 5 |     # connection_string: Driver={PostgreSQL Unicode};Server=db;UID=${user};PWD=${pass}; # <- For executor
 6 |     connection_string: Driver={PostgreSQL Unicode};Server=localhost;UID=${user};PWD=${pass};
 7 |     query: SELECT * FROM temp_readings WHERE timestamp BETWEEN '2024-01-01' AND '2024-01-31'
 8 | 
 9 |   - type: odbc
10 |     name: feb_data
11 |     # connection_string: Driver={PostgreSQL Unicode};Server=db;UID=${user};PWD=${pass}; # <- For executor
12 |     connection_string: Driver={PostgreSQL Unicode};Server=localhost;UID=${user};PWD=${pass};
13 |     query: SELECT * FROM temp_readings WHERE timestamp BETWEEN '2024-02-01' AND '2024-02-29'
14 | 
15 | stages:
16 |   - - name: jan_aggregated
17 |       query: >
18 |           SELECT
19 |             cast(timestamp as date) date,
20 |             location_id,
21 |             round(min(temperature_c),2) min_temp_c,
22 |             round(min(humidity),2) min_humidity,
23 |             round(max(temperature_c),2) max_temp_c,
24 |             round(max(humidity),2) max_humidity,
25 |             round(avg(temperature_c),2) avg_temp_c,
26 |             round(avg(humidity),2) avg_humidity
27 |           FROM jan_data
28 |           GROUP by 1,2
29 |           ORDER by 1 asc
30 |       # print 20 rows of the result for this query to stdout
31 |       show: 20 
32 | 
33 |     - name: feb_aggregated
34 |       query: >
35 |           SELECT
36 |             cast(timestamp as date) date,
37 |             location_id,
38 |             round(min(temperature_c),2) min_temp_c,
39 |             round(min(humidity),2) min_humidity,
40 |             round(max(temperature_c),2) max_temp_c,
41 |             round(max(humidity),2) max_humidity,
42 |             round(avg(temperature_c),2) avg_temp_c,
43 |             round(avg(humidity),2) avg_humidity
44 |           FROM feb_data
45 |           GROUP by 1,2
46 |           ORDER by 1 asc
47 |       # print the entire result for this query to stdout
48 |       show: 0
49 | 
50 |   - - name: union
51 |       query: >
52 |         SELECT * FROM jan_aggregated UNION ALL SELECT * FROM feb_aggregated
53 | 
54 | destination:
55 |   type: odbc
56 |   name: temp_readings_aggregated
57 |     # connection_string: Driver={PostgreSQL Unicode};Server=db;UID=${user};PWD=${pass}; # <- For executor
58 |   connection_string: Driver={PostgreSQL Unicode};Server=localhost;UID=${user};PWD=${pass};
59 |   write_mode:
60 |     operation: append
61 |   batch_size: 100
62 | 


--------------------------------------------------------------------------------
/examples/aqueduct_pipeline_simple.yml:
--------------------------------------------------------------------------------
 1 | version: "v2"
 2 | sources:
 3 |   # Register a local file source containing temperature readings for various cities
 4 |   - type: file
 5 |     name: temp_readings
 6 |     format:
 7 |       type: csv
 8 |       options: {}
 9 |     location: ./examples/temp_readings_${month}_${year}.csv # use templating functionality to parameterize the month and year
10 | 
11 |   #Register a local file source containing a mapping between location_ids and location names
12 |   - type: file
13 |     name: locations
14 |     format:
15 |       type: csv
16 |       options: {}
17 |     location: ./examples/location_dict.csv
18 | 
19 | stages:
20 |   # Query to aggregate temperature data by date and location
21 |   - - name: aggregated
22 |       query: >
23 |           SELECT
24 |             cast(timestamp as date) date,
25 |             location_id,
26 |             round(min(temperature_c),2) min_temp_c,
27 |             round(min(humidity),2) min_humidity,
28 |             round(max(temperature_c),2) max_temp_c,
29 |             round(max(humidity),2) max_humidity,
30 |             round(avg(temperature_c),2) avg_temp_c,
31 |             round(avg(humidity),2) avg_humidity
32 |           FROM temp_readings
33 |           GROUP by 1,2
34 |           ORDER by 1 asc
35 |       explain: true # print the query plan to stdout for debugging purposes
36 | 
37 |   # Enrich aggregation with the location name
38 |   - - name: enriched
39 |       query: >
40 |         SELECT
41 |           date,
42 |           location_name,
43 |           min_temp_c,
44 |           max_temp_c,
45 |           avg_temp_c,
46 |           min_humidity,
47 |           max_humidity,
48 |           avg_humidity
49 |         FROM aggregated
50 |         JOIN locations 
51 |           ON aggregated.location_id = locations.location_id
52 |         ORDER BY date, location_name
53 |       show: 10 # print 10 rows to stdout for debugging purposes
54 | 
55 | # Write the pipeline result to a parquet file
56 | destination:
57 |   type: file
58 |   name: results
59 |   format:
60 |     type: parquet
61 |     options: {}
62 |   location: ./examples/output_${month}_${year}.parquet


--------------------------------------------------------------------------------
/examples/location_dict.csv:
--------------------------------------------------------------------------------
 1 | location_id,location_name
 2 | 1,"New York"
 3 | 2,"Los Angeles"
 4 | 3,"Chicago"
 5 | 4,"Vienna"
 6 | 5,"Prague"
 7 | 6,"Berlin"
 8 | 7,"Paris"
 9 | 8,"London"
10 | 


--------------------------------------------------------------------------------
/json_schema/generate_schema_reference.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import glob
 3 | import shutil
 4 | from json_schema_for_humans.generate import generate_from_filename
 5 | from json_schema_for_humans.generation_configuration import GenerationConfiguration
 6 | 
 7 | 
 8 | OUTPUT_DIR = "json_schema"
 9 | 
10 | 
11 | def find_latest_generated_json(target_dir='target', pattern='aqueducts.schema.json'):
12 |     # Search for the JSON file in the target directory
13 |     search_pattern = os.path.join(target_dir, 'debug', 'build', '**', pattern)
14 |     files = glob.glob(search_pattern, recursive=True)
15 | 
16 |     if not files:
17 |         raise FileNotFoundError(f"No files found matching pattern: {search_pattern}")
18 | 
19 |     # Find the most recently modified file
20 |     latest_file = max(files, key=os.path.getmtime)
21 |     return latest_file
22 | 
23 | 
24 | def on_startup(command, dirty):
25 |     try:
26 |         json_path = find_latest_generated_json()
27 |         print(f"Found latest JSON schema at: {json_path}")
28 | 
29 |         # output file to destination directory
30 |         output_path = shutil.copy(json_path, OUTPUT_DIR)
31 | 
32 |         gen_cfg = GenerationConfiguration(
33 |                     custom_template_path="json_schema/schema_reference_template/base.md",
34 |                     footer_show_time=False,
35 |                     description_is_markdown=True,
36 |                     link_to_reused_ref=False,
37 |                     show_breadcrumbs=False,
38 |                     show_toc=False,
39 |                     template_md_options={
40 |                         "badge_as_image": True,
41 |                         "show_heading_numbers": False,
42 |                         "show_array_restrictions": False,
43 |                         "properties_table_columns": [
44 |                             "Property",
45 |                             "Pattern",
46 |                             "Type",
47 |                             "Title/Description"
48 |                         ]
49 |                     }
50 |                  )
51 | 
52 |         generate_from_filename(output_path, "docs/schema_reference.md", config=gen_cfg)
53 | 
54 |     except Exception as e:
55 |         print(f"An error occurred: {e}")
56 | 


--------------------------------------------------------------------------------
/json_schema/schema_reference_template/base.md:
--------------------------------------------------------------------------------
 1 | {% set depth = 0 %}
 2 | {{ schema.keywords.get("title").literal | default("Schema Docs") | md_heading(depth) }}
 3 | {% set contentBase %}
 4 | {% with schema=schema, skip_headers=False, depth=depth %}
 5 |     {% include "content.md" %}
 6 | {% endwith %}
 7 | {% endset %}
 8 | 
 9 | {{ md_get_toc() }}
10 | 
11 | This is a generated JSONSchema reference for the Aqueducts configuration.
12 | 
13 | {{ contentBase }}
14 | 
15 | ----------------------------------------------------------------------------------------------------------------------------
16 | {% if config.with_footer -%}
17 | Generated using [json-schema-for-humans](https://github.com/coveooss/json-schema-for-humans){% if config.footer_show_time %} on {{ get_local_time() }}{% endif %}
18 | 
19 | {% endif -%}
20 | 


--------------------------------------------------------------------------------
/json_schema/schema_reference_template/breadcrumbs.md:
--------------------------------------------------------------------------------
 1 | {%- filter md_escape_for_table -%}
 2 | {%- if config.show_breadcrumbs -%}
 3 |   {%- for node in schema.nodes_from_root -%}
 4 |     {{ node.name_for_breadcrumbs }}{%- if not loop.last %} > {% endif -%}
 5 |   {%- endfor -%}
 6 | {%- else -%}
 7 |   Field: {{ schema.name_for_breadcrumbs }}
 8 | {%- endif -%}
 9 | {%- endfilter -%}
10 | 


--------------------------------------------------------------------------------
/json_schema/schema_reference_template/content.md:
--------------------------------------------------------------------------------
 1 | {#
 2 |     content is a template and not a macro in md
 3 |         because macro parameters are not through context
 4 |         when rendering a template from the macro  and it caused
 5 |         serious problems when using recursive calls
 6 |     mandatory context parameters:
 7 |     schema
 8 | #}
 9 | {# context parameters default values #}
10 | {% set skip_headers = skip_headers or False %}
11 | {% set depth = depth or 0 %}
12 | {# end context parameters #}
13 | 
14 | {% set keys = schema.keywords %}
15 | {%- if not skip_headers %}
16 | 
17 | {% if schema.title and schema.title | length > 0 %}
18 | **Title:** {{ schema.title }}
19 | {% endif %}
20 | 
21 | {{ schema | md_type_info_table | md_generate_table }}
22 | 
23 | {% set description = (schema | get_description) %}
24 | {% include "section_description.md" %}
25 | {% endif %}
26 | 
27 | {# Display examples #}
28 | {% set examples = schema.examples %}
29 | {% if examples %}
30 |     {% include "section_examples.md" %}
31 | {% endif %}
32 | 
33 | {% if schema.should_be_a_link(config) %}
34 | {% elif schema.refers_to -%}
35 |     {%- with schema=schema.refers_to_merged, skip_headers=True, depth=depth -%}
36 |         {% include "content.md" %}
37 |     {% endwith %}
38 | {% else %}
39 |     {# Properties, pattern properties, additional properties #}
40 |     {% if schema.is_object %}
41 |     {{- schema | md_properties_table | md_generate_table -}}
42 |     {% endif %}
43 | 
44 |     {# Combining: allOf, anyOf, oneOf, not #}
45 |     {% if schema.kw_all_of %}
46 |         {% with operator="allOf", title="All of(Requirement)", current_node=schema.kw_all_of, skip_required=True %}
47 |             {% include "tabbed_section.md" %}
48 |         {% endwith %}
49 |     {% endif %}
50 |     {% if schema.kw_any_of %}
51 |         {% with operator="anyOf", title="Any of(Option)", current_node=schema.kw_any_of, skip_required=True %}
52 |             {% include "tabbed_section.md" %}
53 |         {% endwith %}
54 |     {% endif %}
55 |     {% if schema.kw_one_of %}
56 |         {% with operator="oneOf", title="One of(Option)",current_node=schema.kw_one_of, skip_required=True %}
57 |             {% include "tabbed_section.md" %}
58 |         {% endwith %}
59 |     {% endif %}
60 |     {% if schema.kw_not %}
61 |         {% include "section_not.md" %}
62 |     {% endif %}
63 | 
64 |     {# Enum and const #}
65 |     {% if schema.kw_enum -%}
66 |         {% include "section_one_of.md" %}
67 |     {%- endif %}
68 |     {%- if schema.kw_const -%}
69 |         Specific value: `{{ schema.kw_const.raw | python_to_json }}`
70 |     {%- endif -%}
71 | 
72 |     {# Conditional subschema, or if-then-else section #}
73 |     {% if schema.has_conditional %}
74 |         {% with skip_headers=False, depth=depth+1 %}
75 |             {% include "section_conditional_subschema.md" %}
76 |         {% endwith %}
77 |     {% endif %}
78 | 
79 |     {# Required properties that are not defined under "properties". They will only be listed #}
80 |     {% include "section_undocumented_required_properties.md" %}
81 | 
82 |     {# Show the requested type(s) #}
83 |     {{- schema | md_restrictions_table | md_generate_table -}}
84 | 
85 |     {# Show array restrictions #}
86 |     {% if schema.type_name.startswith("array") %}
87 |         {% include "section_array.md" %}
88 |     {% endif %}
89 | 
90 |     {# details of Properties, pattern properties, additional properties #}
91 |     {% if schema.is_object %}
92 |     {% include "section_properties_details.md" %}
93 |     {% endif %}
94 | {% endif %}
95 | 


--------------------------------------------------------------------------------
/json_schema/schema_reference_template/section_array.md:
--------------------------------------------------------------------------------
 1 | {{ schema | md_array_restrictions | md_generate_table }}
 2 | 
 3 | {% if schema.array_items_def or schema.tuple_validation_items %}
 4 | {{ schema | md_array_items_restrictions | md_generate_table }}
 5 | {% endif %}
 6 | 
 7 | {% if schema.array_items_def %}
 8 | {% filter md_heading(depth+1) %}
 9 | {% with schema=schema.array_items_def %}{%- include "breadcrumbs.md" %}{% endwith %}
10 | {% endfilter %}
11 | {% with schema=schema.array_items_def, skip_headers=False, depth=depth+1, skip_required=True %}
12 |     {% include "content.md" %}
13 | {% endwith %}
14 | {% endif %}
15 | 
16 | {% if schema.tuple_validation_items %}
17 | {% for item in schema.tuple_validation_items %}
18 |     {% filter md_heading(depth+1) %}
19 |     {% with schema=item %}{%- include "breadcrumbs.md" %}{% endwith %}
20 |     {% endfilter %}
21 |     {% with schema=item, skip_headers=False, depth=depth+1, skip_required=True %}
22 |         {% include "content.md" %}
23 |     {% endwith %}
24 | {% endfor %}
25 | {% endif %}
26 | 
27 | {% if schema.kw_contains and schema.kw_contains.literal != {} %}
28 | {{ "At least one of the items must be" | md_heading(depth+1) }}
29 | {% with schema=schema.kw_contains, skip_headers=False, depth=depth+1, skip_required=True %}
30 |     {% include "content.md" %}
31 | {% endwith %}
32 | {% endif %}
33 | 
34 | {% if schema.array_additional_items_def %}
35 | {{ "Additional items must be" | md_heading(depth+1) }}
36 | {% with schema=schema.array_additional_items_def, skip_headers=False, depth=depth+1, skip_required=True %}
37 |     {% include "content.md" %}
38 | {% endwith %}
39 | {% endif %}
40 | 


--------------------------------------------------------------------------------
/json_schema/schema_reference_template/section_conditional_subschema.md:
--------------------------------------------------------------------------------
 1 | {% if schema.kw_if %}
 2 |     {% set first_property =  schema.kw_if | get_first_property %}
 3 | 
 4 |     {% if schema.kw_then %}
 5 |         {%- filter md_heading(depth) -%}If (
 6 |             {{- first_property.property_name | md_escape_for_table -}}
 7 |             {{- " = " -}}
 8 |             {{- first_property.kw_const.literal | python_to_json -}}
 9 |         ){%- endfilter -%}
10 |         {% with schema=schema.kw_then, skip_headers=False, depth=depth %}
11 |             {% include "content.md" %}
12 |         {% endwith %}
13 |     {% endif %}
14 |     {% if schema.kw_else %}
15 |         {%- filter md_heading(depth) -%}Else (i.e. {{ " " }}
16 |             {{- first_property.property_name | md_escape_for_table -}}
17 |             {{- " != " -}}
18 |             {{- first_property.kw_const.literal | python_to_json -}}
19 |         ){%- endfilter -%}
20 |         {% with schema=schema.kw_else, skip_headers=False, depth=depth %}
21 |             {% include "content.md" %}
22 |         {% endwith %}
23 |     {% endif %}
24 | {% endif %}


--------------------------------------------------------------------------------
/json_schema/schema_reference_template/section_description.md:
--------------------------------------------------------------------------------
1 | {# Display description #}
2 | {% if description %}
3 | **Description:**{{ " " }}{{ description }}
4 | {% else %}
5 | **Description:**{{ " " }} *No description...*
6 | {% endif %}
7 | 


--------------------------------------------------------------------------------
/json_schema/schema_reference_template/section_examples.md:
--------------------------------------------------------------------------------
 1 | **Example{% if examples|length > 1 %}s{% endif %}:**{{ " " }}
 2 | 
 3 | {% for example in examples %}
 4 |     {%- if loop.first %}{{ "\n" }}{% endif -%}
 5 |     {% set example_id = schema.html_id ~ "_ex" ~ loop.index %}
 6 |     {%- if not examples_as_yaml -%}
 7 |         {{- "" }}```json
 8 |         {{- "\n" }}{{ example }}
 9 |         {{- "\n" }}```
10 |     {%- else -%}
11 |         {{- "" }}```yaml
12 |         {{- "\n" }}{{ example | yaml_example }}
13 |         {{- "\n" }}```
14 |     {%- endif -%}
15 |     {{ "\n" }}
16 | {% endfor %}
17 | 


--------------------------------------------------------------------------------
/json_schema/schema_reference_template/section_not.md:
--------------------------------------------------------------------------------
1 | {{ "Must **not** be" | md_heading(depth+1) }}
2 | {% with schema=schema.kw_not, skip_headers=False, depth=depth+1, skip_required=True %}
3 |     {% include "content.md" %}
4 | {% endwith %}


--------------------------------------------------------------------------------
/json_schema/schema_reference_template/section_one_of.md:
--------------------------------------------------------------------------------
1 | Must be one of:
2 | {% for enum_choice in schema.kw_enum.array_items %}
3 | * {{ enum_choice.literal | python_to_json }}
4 | {% endfor %}


--------------------------------------------------------------------------------
/json_schema/schema_reference_template/section_properties_details.md:
--------------------------------------------------------------------------------
 1 | {% for sub_property in schema.iterate_properties %}
 2 | 
 3 | ----------------------------------------------------
 4 | 
 5 |   {%- if sub_property.is_additional_properties and not sub_property.is_additional_properties_schema -%}
 6 |     {% continue %}
 7 |   {% endif %}
 8 | 
 9 |   {% set html_id = sub_property.html_id %}
10 | 
11 |   {% set description = sub_property | get_description %}
12 | 
13 |   {% filter md_heading(depth + 1, html_id) -%}
14 |     {%- filter replace('\n', '') -%}
15 |     {%- if not skip_required and sub_property.property_name -%}
16 |         {{ md_badge("Required", "blue", fallback=False) if sub_property.is_required_property else md_badge("Optional", "yellow", fallback=False) -}}
17 |     {%- endif -%}
18 |     {%- if sub_property is deprecated  -%}~~{%- endif -%}
19 |     {%- if sub_property.is_pattern_property %} Pattern{% endif %} {% with schema=sub_property %}{%- include "breadcrumbs.md" %}{% endwith %}
20 |     {%- if sub_property is deprecated -%}~~{%- endif -%}
21 |     {%- endfilter %}
22 |   {%- endfilter %}
23 | 
24 |   {% if sub_property.is_pattern_property %}
25 | > All properties whose name matches the regular expression
26 | ```{{ sub_property.property_name }}``` ([Test](https://regex101.com/?regex={{ sub_property.property_name | urlencode }}))
27 | must respect the following conditions
28 |   {% endif %}
29 | 
30 | 
31 |   {% with schema=sub_property, skip_headers=False, depth=depth+1 %}
32 |     {% include "content.md" %}
33 |   {% endwith %}
34 | 
35 | {% endfor %}
36 | 


--------------------------------------------------------------------------------
/json_schema/schema_reference_template/section_undocumented_required_properties.md:
--------------------------------------------------------------------------------
1 | {% set undocumented_required_properties = schema | get_undocumented_required_properties %}
2 | {% if undocumented_required_properties%}
3 | {{ "The following properties are required" | md_heading(depth+1) }}
4 | {% for required_property in undocumented_required_properties %}
5 | * {{ required_property }}
6 | {% endfor %}
7 | {% endif %}


--------------------------------------------------------------------------------
/json_schema/schema_reference_template/tabbed_section.md:
--------------------------------------------------------------------------------
 1 | 
 2 | {{ current_node | md_array_items(title) | md_generate_table }}
 3 | 
 4 | {% for node in current_node.array_items %}
 5 |     {% filter md_heading(depth+1, node.html_id) -%}
 6 |         {% if node.is_pattern_property %}Pattern{% endif %} **{% with schema=node %}{%- include "breadcrumbs.md" %}{% endwith %}**
 7 |     {%- endfilter %}
 8 |     {% with schema=node, skip_headers=False, depth=depth+1 %}
 9 |         {% include "content.md" %}
10 |     {% endwith %}
11 | {% endfor %}
12 | 


--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vigimite/aqueducts/6eaf764852ac9348eb45c7073ecf126d61eb5505/logo.png


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
  1 | site_name: Aqueducts Documentation
  2 | site_url: https://vigimite.github.io/aqueducts
  3 | repo_url: https://github.com/vigimite/aqueducts
  4 | repo_name: vigimite/aqueducts
  5 | copyright: Copyright &copy; 2024 Michele Vigilante
  6 | docs_dir: "docs/"
  7 | 
  8 | hooks:
  9 |   - json_schema/generate_schema_reference.py
 10 | 
 11 | nav:
 12 |   - Home: index.md
 13 |   - Usage: usage.md
 14 |   - Storage Configuration: storage.md
 15 |   - Schema Reference: schema_reference.md
 16 |   - Architecture: architecture.md
 17 |   - Aqueducts CLI: cli.md
 18 |   - About: about.md
 19 | 
 20 | theme:
 21 |   language: en
 22 |   name: material
 23 |   logo: assets/logo.png
 24 |   icon: 
 25 |     repo: fontawesome/brands/github
 26 |   favicon: assets/favicon.ico
 27 |   palette:
 28 |     # Palette toggle for automatic mode
 29 |     - media: "(prefers-color-scheme)"
 30 |       toggle:
 31 |         icon: material/brightness-auto
 32 |         name: Switch to light mode
 33 | 
 34 |     # Palette toggle for light mode
 35 |     - media: "(prefers-color-scheme: light)"
 36 |       scheme: default 
 37 |       primary: custom
 38 |       toggle:
 39 |         icon: material/brightness-7
 40 |         name: Switch to dark mode
 41 | 
 42 |     # Palette toggle for dark mode
 43 |     - media: "(prefers-color-scheme: dark)"
 44 |       scheme: slate
 45 |       primary: blue grey
 46 |       toggle:
 47 |         icon: material/brightness-4
 48 |         name: Switch to system preference
 49 | 
 50 |   features:
 51 |     - navigation.tabs
 52 |     - navigation.instant
 53 |     - navigation.instant.prefetch
 54 |     - navigation.tracking
 55 |     - navigation.path
 56 |     - navigation.top
 57 | 
 58 |     - toc.integrate
 59 |     - toc.follow
 60 | 
 61 |     - search.suggest
 62 | 
 63 |     - content.code.copy
 64 |     - content.code.annotate
 65 | 
 66 | extra_css:
 67 |   - stylesheets/extra.css
 68 | 
 69 | extra:
 70 |   generator: true
 71 |   social:
 72 |     - icon: fontawesome/brands/mastodon 
 73 |       link: https://fosstodon.org/@kato
 74 |     - icon: fontawesome/brands/github
 75 |       link: https://github.com/vigimite
 76 | 
 77 | plugins: 
 78 |   - search
 79 |   - social
 80 | 
 81 | markdown_extensions:
 82 |   - toc:
 83 |       toc_depth: 5
 84 |   - tables
 85 |   - admonition
 86 |   - attr_list
 87 |   - pymdownx.emoji:
 88 |       emoji_index: !!python/name:material.extensions.emoji.twemoji
 89 |       emoji_generator: !!python/name:material.extensions.emoji.to_svg
 90 |   - pymdownx.highlight:
 91 |       anchor_linenums: true
 92 |       line_spans: __span
 93 |       pygments_lang_class: true
 94 |   - pymdownx.inlinehilite
 95 |   - pymdownx.snippets
 96 |   - pymdownx.superfences
 97 |   - pymdownx.details
 98 |   - pymdownx.tabbed:
 99 |       alternate_style: true
100 | 


--------------------------------------------------------------------------------
/release.toml:
--------------------------------------------------------------------------------
1 | # Publish all crates in dependency order
2 | allow-branch = ["main"]
3 | # Use shared versioning across workspace
4 | shared-version = true
5 | 


--------------------------------------------------------------------------------