├── .github ├── actions │ └── install-env │ │ └── action.yml ├── dependabot.yml └── workflows │ ├── code-quality.yml │ └── unit-tests.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── examples ├── diff │ ├── README.md │ └── views │ │ ├── dev │ │ ├── analytics │ │ │ └── kpis.sql │ │ ├── core │ │ │ └── orders.sql.jinja │ │ └── staging │ │ │ ├── customers.py │ │ │ ├── orders.py │ │ │ └── payments.py │ │ └── prod │ │ ├── core │ │ ├── customers.sql │ │ └── orders.sql.jinja │ │ └── staging │ │ ├── customers.py │ │ ├── orders.py │ │ └── payments.py ├── incremental │ ├── README.md │ ├── scripts_today │ │ └── core │ │ │ └── events.sql │ └── scripts_tomorrow │ │ └── core │ │ └── events.sql ├── jaffle_shop │ ├── README.md │ ├── docs │ │ ├── README.md │ │ ├── analytics │ │ │ └── README.md │ │ ├── core │ │ │ └── README.md │ │ └── staging │ │ │ └── README.md │ └── scripts │ │ ├── analytics │ │ ├── finance │ │ │ └── kpis.sql │ │ └── kpis.sql │ │ ├── core │ │ ├── customers.sql │ │ └── orders.sql.jinja │ │ ├── staging │ │ ├── customers.sql │ │ ├── orders.sql │ │ └── payments.sql │ │ └── tests │ │ └── orders_are_dated.sql ├── motherduck │ └── README.md └── school │ ├── README.md │ ├── scripts │ ├── analytics │ │ ├── finance │ │ │ └── expenses.sql │ │ ├── major.sql │ │ └── scholarship_award.sql │ ├── core │ │ └── yearly_results.sql │ ├── staging │ │ ├── grades.sql │ │ └── students.sql │ └── tests │ │ └── budget.sql │ └── seeds │ ├── raw_grades.csv │ └── raw_students.csv ├── lea ├── __init__.py ├── assertions │ ├── NO_NULLS.sql.jinja │ ├── SET.sql.jinja │ ├── UNIQUE.sql.jinja │ └── UNIQUE_BY.sql.jinja ├── cli.py ├── comment.py ├── conductor.py ├── dag.py ├── databases.py ├── dialects.py ├── field.py ├── job.py ├── scripts.py ├── session.py ├── table_ref.py ├── test_big_query.py ├── test_duckdb.py └── test_table_ref.py ├── poetry.lock └── pyproject.toml /.github/actions/install-env/action.yml: -------------------------------------------------------------------------------- 1 | name: Install env 2 | runs: 3 | using: "composite" 4 | steps: 5 | - name: Check out repository 6 | uses: actions/checkout@v4 7 | with: 8 | submodules: true 9 | 10 | - name: Set up python 11 | id: set-up-python 12 | uses: actions/setup-python@v4 13 | with: 14 | python-version: 3.11 15 | 16 | - name: Load cached Poetry installation 17 | uses: actions/cache@v3 18 | with: 19 | path: ~/.local 20 | key: poetry-0 21 | 22 | - name: Install poetry 23 | uses: snok/install-poetry@v1 24 | with: 25 | virtualenvs-create: true 26 | virtualenvs-in-project: true 27 | installer-parallel: true 28 | 29 | - name: Load cached virtual env 30 | uses: actions/cache@v3 31 | with: 32 | path: .venv 33 | key: venv-${{ runner.os }}-${{ steps.set-up-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 34 | 35 | - name: Install dependencies 36 | shell: bash 37 | if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' 38 | run: poetry install --no-interaction --no-ansi 39 | 40 | - name: Activate environment 41 | shell: bash 42 | run: source $VENV 43 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | day: "tuesday" 8 | reviewers: 9 | - MaxHalford 10 | groups: 11 | python-packages: 12 | patterns: 13 | - "*" 14 | 
-------------------------------------------------------------------------------- /.github/workflows/code-quality.yml: -------------------------------------------------------------------------------- 1 | name: Code quality 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - "*" 7 | push: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | run: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - uses: ./.github/actions/install-env 17 | - name: Run pre-commit on all files 18 | run: poetry run pre-commit run --all-files 19 | -------------------------------------------------------------------------------- /.github/workflows/unit-tests.yml: -------------------------------------------------------------------------------- 1 | name: Unit tests 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - "*" 7 | push: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | run: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - uses: ./.github/actions/install-env 17 | - name: Run pytest 18 | run: poetry run pytest 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | *.pyc 3 | *.db 4 | .env 5 | dist/ 6 | /*.ipynb 7 | .DS_Store 8 | *.wal 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "examples/jaffle_shop/jaffle_shop"] 2 | path = examples/jaffle_shop/jaffle_shop 3 | url = https://github.com/dbt-labs/jaffle_shop/ 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | files: . 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v4.4.0 5 | hooks: 6 | - id: check-json 7 | - id: check-yaml 8 | - id: trailing-whitespace 9 | - id: mixed-line-ending 10 | 11 | - repo: https://github.com/astral-sh/ruff-pre-commit 12 | rev: v0.1.7 13 | hooks: 14 | - id: ruff 15 | - id: ruff-format 16 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ## Setup 4 | 5 | Start by cloning the repository: 6 | 7 | ```sh 8 | git clone https://github.com/carbonfact/lea 9 | ``` 10 | 11 | There are submodules in this repository, so you'll need to fetch/update them: 12 | 13 | ```sh 14 | git submodule init 15 | git submodule update 16 | ``` 17 | 18 | Next, you'll need a Python environment: 19 | 20 | ```sh 21 | pyenv install -v 3.11 22 | ``` 23 | 24 | You'll also need [Poetry](https://python-poetry.org/): 25 | 26 | ```sh 27 | curl -sSL https://install.python-poetry.org | python3 - 28 | poetry install 29 | poetry shell 30 | ``` 31 | 32 | ## Testing 33 | 34 | You can run tests once the environment is set up: 35 | 36 | ```sh 37 | pytest 38 | ``` 39 | 40 | ## Code quality 41 | 42 | Install the code quality routine so that it runs each time you try to push your commits. 43 | 44 | ```sh 45 | pre-commit install --hook-type pre-push 46 | ``` 47 | 48 | You can also run the code quality routine ad-hoc. 
49 | 50 | ```sh 51 | pre-commit run --all-files 52 | ``` 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2023 Carbonfact 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | fomo: 2 | git fetch && git rebase origin/main 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

lea

2 | 3 | 4 | 5 |

6 | 7 | 8 | tests 9 | 10 | 11 | 12 | 13 | code_quality 14 | 15 | 16 | 17 | 18 | pypi 19 | 20 | 21 | 22 | 23 | license 24 | 25 |

26 |
27 | lea is a minimalist alternative to SQL orchestrators like [dbt](https://www.getdbt.com/) and [SQLMesh](https://sqlmesh.com/).
28 |
29 | lea aims to be simple and provides sane defaults. We happily use it every day at [Carbonfact](https://www.carbonfact.com/) to manage our BigQuery data warehouse. We will actively maintain it and add features, while welcoming contributions.
30 |
31 | - [Examples](#examples)
32 | - [Installation](#installation)
33 | - [Configuration](#configuration)
34 | - [DuckDB](#duckdb)
35 | - [BigQuery](#bigquery)
36 | - [Usage](#usage)
37 | - [`lea run`](#lea-run)
38 | - [File structure](#file-structure)
39 | - [Jinja templating](#jinja-templating)
40 | - [Development vs. production](#development-vs-production)
41 | - [Selecting scripts](#selecting-scripts)
42 | - [Write-Audit-Publish (WAP)](#write-audit-publish-wap)
43 | - [Testing while running](#testing-while-running)
44 | - [Skipping unmodified scripts during development](#skipping-unmodified-scripts-during-development)
45 | - [Warehouse specific features](#warehouse-specific-features)
46 | - [BigQuery](#bigquery-1)
47 | - [Default clustering](#default-clustering)
48 | - [Big Blue Pick API](#big-blue-pick-api)
49 | - [Contributing](#contributing)
50 | - [License](#license)
51 |
52 | ## Examples
53 |
54 | - [Jaffle shop 🥪](examples/jaffle_shop/)
55 | - [Incremental 🕐](examples/incremental)
56 | - [School 🏫](examples/school/)
57 | - [Compare development to production 👯‍♀️](examples/diff/)
58 | - [Using MotherDuck 🦆](examples/motherduck/)
59 |
60 | ## Installation
61 |
62 | Install lea with pip, regardless of which warehouse you wish to use:
63 |
64 | ```sh
65 | pip install lea-cli
66 | ```
67 |
68 | This installs the `lea` command. It also makes the `lea` Python library available.
69 |
70 | ## Configuration
71 |
72 | lea is configured via environment variables.
73 |
74 | ### DuckDB
75 |
76 | ```sh
77 | LEA_WAREHOUSE=duckdb
78 | LEA_DUCKDB_PATH=duckdb.db
79 | ```
80 |
81 | ### BigQuery
82 |
83 | ```sh
84 | # Required
85 | LEA_WAREHOUSE=bigquery
86 | # Required
87 | LEA_BQ_LOCATION=EU
88 | # Required
89 | LEA_BQ_DATASET_NAME=kaya
90 | # Required, the project where the dataset is located
91 | LEA_BQ_PROJECT_ID=carbonfact-dwh
92 | # Optional, allows using a different project for compute
93 | LEA_BQ_COMPUTE_PROJECT_ID=carbonfact-dwh-compute
94 | # Not necessary if you're logged in with the gcloud CLI
95 | LEA_BQ_SERVICE_ACCOUNT= # not a path ⚠️
96 | # Defaults to https://www.googleapis.com/auth/bigquery
97 | LEA_BQ_SCOPES=https://www.googleapis.com/auth/bigquery,https://www.googleapis.com/auth/drive
98 | # LOGICAL or PHYSICAL, defaults to PHYSICAL
99 | LEA_BQ_STORAGE_BILLING_MODEL=PHYSICAL
100 | ```
101 |
102 | ## Usage
103 |
104 | These environment variables can be provided in an `.env` file, or directly in the shell. Each command also has an `--env` flag to provide a path to an `.env` file.
105 |
106 | ### `lea run`
107 |
108 | This is the main command. It runs SQL queries stored in the `scripts` directory:
109 |
110 | ```sh
111 | lea run
112 | ```
113 |
114 | You can indicate the directory where the scripts are stored:
115 |
116 | ```sh
117 | lea run --scripts /path/to/scripts
118 | ```
119 |
120 | The scripts are run concurrently. They are organized in a DAG, which is traversed in topological order. The DAG's structure is determined [automatically](https://maxhalford.github.io/blog/dbt-ref-rant/) by analyzing the dependencies between queries.
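To give a feel for what this dependency analysis amounts to, here is a minimal, illustrative sketch (not lea's actual implementation): it extracts `schema.table` references from SQL scripts with a regular expression and derives a topological execution order. The file layout and the regex are assumptions made for the sake of the example.

```python
import pathlib
import re
from graphlib import TopologicalSorter  # standard library, Python 3.9+

# Naive pattern: catches "FROM schema.table" and "JOIN schema.table" references
TABLE_REF = re.compile(r"\b(?:from|join)\s+([a-z_]+\.[a-z_]+)", re.IGNORECASE)


def build_dag(scripts_dir: str) -> dict[str, set[str]]:
    """Map each script to the set of tables it reads from.

    A script's own table name is derived from its location,
    e.g. scripts/core/orders.sql -> core.orders.
    """
    dag: dict[str, set[str]] = {}
    for path in pathlib.Path(scripts_dir).rglob("*.sql"):
        name = ".".join(path.relative_to(scripts_dir).with_suffix("").parts)
        deps = {ref.lower() for ref in TABLE_REF.findall(path.read_text())}
        dag[name] = deps - {name}  # ignore self-references
    return dag


if __name__ == "__main__":
    dag = build_dag("scripts")
    # Every table appears after the tables it depends on
    print(list(TopologicalSorter(dag).static_order()))
```

The real analysis is of course more involved than a regex, but the principle is the same: scripts reference tables directly instead of declaring their dependencies by hand.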
121 |
122 | ### File structure
123 |
124 | Each query is expected to be placed under a schema, represented by a directory. Schemas can have sub-schemas. Here's an example:
125 |
126 | ```
127 | scripts/
128 |     schema_1/
129 |         table_1.sql
130 |         table_2.sql
131 |     schema_2/
132 |         table_3.sql
133 |         table_4.sql
134 |         sub_schema_2_1/
135 |             table_5.sql
136 |             table_6.sql
137 | ```
138 |
139 | Each script is materialized into a table. The table is named according to the script's name, following the warehouse convention.
140 |
141 | #### Jinja templating
142 |
143 | SQL queries can be templated with [Jinja](https://jinja.palletsprojects.com/en/3.1.x/). A `.sql.jinja` extension is necessary for lea to recognise them.
144 |
145 | You have access to an `env` variable within the template context, which is simply an access point to `os.environ`.
146 |
147 | ### Development vs. production
148 |
149 | By default, lea creates an isolation layer with production. The way this is done depends on your warehouse:
150 |
151 | - BigQuery: by appending a `_<username>` suffix to schema names
152 | - DuckDB: by adding a `_<username>` suffix to the database file name
153 |
154 | In other words, a development environment is used by default. Use the `--production` flag when executing `lea run` to disable this behaviour, and instead target the production environment.
155 |
156 | ```sh
157 | lea run --production
158 | ```
159 |
160 | The `<username>` is determined automatically from the [login name](https://docs.python.org/3/library/getpass.html#getpass.getuser). It can be overridden by setting the `LEA_USERNAME` environment variable.
161 |
162 | ### Selecting scripts
163 |
164 | A single script can be run:
165 |
166 | ```sh
167 | lea run --select core.users
168 | ```
169 |
170 | Several scripts can be run:
171 |
172 | ```sh
173 | lea run --select core.users --select core.orders
174 | ```
175 |
176 | Similar to dbt, lea also supports graph operators:
177 |
178 | ```sh
179 | lea run --select core.users+   # users and everything that depends on it
180 | lea run --select +core.users   # users and everything it depends on
181 | lea run --select +core.users+  # users with its ancestors and descendants
182 | ```
183 |
184 | You can select all scripts in a schema:
185 |
186 | ```sh
187 | lea run --select core/  # the trailing slash matters
188 | ```
189 |
190 | This also works with sub-schemas:
191 |
192 | ```sh
193 | lea run --select analytics.finance/
194 | ```
195 |
196 | There are thus 8 possible operators:
197 |
198 | ```
199 | schema.table    (table by itself)
200 | schema.table+   (table with its descendants)
201 | +schema.table   (table with its ancestors)
202 | +schema.table+  (table with its ancestors and descendants)
203 | schema/         (all tables in schema)
204 | schema/+        (all tables in schema with their descendants)
205 | +schema/        (all tables in schema with their ancestors)
206 | +schema/+       (all tables in schema with their ancestors and descendants)
207 | ```
208 |
209 | Combinations are possible:
210 |
211 | ```sh
212 | lea run --select core.users+ --select +core.orders
213 | ```
214 |
215 | There's an Easter egg that allows choosing scripts that have been committed or modified in the current Git branch:
216 |
217 | ```sh
218 | lea run --select git
219 | lea run --select git+  # includes all descendants
220 | ```
221 |
222 | This becomes very handy when using lea in continuous integration.
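To make the semantics of the `+` graph operators concrete, here is a small illustrative sketch (not lea's code) of how a table selector could be expanded against a DAG, represented as a mapping from each table to the set of tables it depends on. Schema selectors and the `git` selector are left out for brevity.

```python
def ancestors(dag: dict[str, set[str]], node: str) -> set[str]:
    """Everything the node depends on, directly or transitively."""
    found: set[str] = set()
    stack = [node]
    while stack:
        for dep in dag.get(stack.pop(), set()):
            if dep not in found:
                found.add(dep)
                stack.append(dep)
    return found


def descendants(dag: dict[str, set[str]], node: str) -> set[str]:
    """Everything that depends on the node, directly or transitively."""
    reversed_dag: dict[str, set[str]] = {}
    for table, deps in dag.items():
        for dep in deps:
            reversed_dag.setdefault(dep, set()).add(table)
    return ancestors(reversed_dag, node)


def expand(dag: dict[str, set[str]], selector: str) -> set[str]:
    """Expand a selector such as '+core.users+' into a set of tables."""
    node = selector.strip("+")
    selected = {node}
    if selector.startswith("+"):
        selected |= ancestors(dag, node)
    if selector.endswith("+"):
        selected |= descendants(dag, node)
    return selected
```

Combined with the `git` selector, this kind of expansion is what makes it cheap to rebuild only the part of the warehouse that a branch actually touches.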
223 |
224 | ### Write-Audit-Publish (WAP)
225 |
226 | [WAP](https://lakefs.io/blog/data-engineering-patterns-write-audit-publish/) is a data engineering pattern that ensures data consistency and reliability. It's the data engineering equivalent of [blue-green deployment](https://en.wikipedia.org/wiki/Blue%E2%80%93green_deployment) in the software engineering world.
227 |
228 | lea follows the WAP pattern by default. When you execute `lea run`, it actually creates temporary tables that have an `___audit` suffix. These audit tables are promoted to replace the existing tables once they have all been materialized without errors.
229 |
230 | This is a good default behavior. Let's say you refresh table `foo`. Then you refresh table `bar` that depends on `foo`. If the refresh of `bar` fails, you're left with a corrupt state. This is what the WAP pattern solves. In WAP mode, when you run `foo`'s script, it creates a `foo___audit` table. If `bar`'s script fails, then the run stops and `foo` is not modified.
231 |
232 | ### Testing while running
233 |
234 | There is no `lea test` command. Tests are run together with the regular scripts when `lea run` is executed. The run stops whenever a test fails.
235 |
236 | There are two types of tests:
237 |
238 | - Singular tests — these are queries which return failing rows. They are stored in a `tests` directory.
239 | - Assertion tests — these are comment annotations in the queries themselves:
240 |   - `#NO_NULLS` — checks that all values in a column are not null.
241 |   - `#UNIQUE` — checks that a column's values are unique.
242 |   - `#UNIQUE_BY(<column>)` — checks that a column's values are unique within a group.
243 |   - `#SET{<values>}` — checks that a column's values are in a set of values.
244 |
245 | Here's an example of a query annotated with assertion tests:
246 |
247 | ```sql
248 | SELECT
249 |     -- #UNIQUE
250 |     -- #NO_NULLS
251 |     user_id,
252 |     -- #NO_NULLS
253 |     address,
254 |     -- #UNIQUE_BY(address)
255 |     full_name,
256 |     -- #SET{'A', 'B', 'AB', 'O'}
257 |     blood_type
258 | FROM core.users
259 | ```
260 |
261 | You can run a single test via the `--select` flag:
262 |
263 | ```sh
264 | lea run --select tests.check_n_users
265 | ```
266 |
267 | Or even run all the tests, like so:
268 |
269 | ```sh
270 | lea run --select tests/  # the trailing slash matters
271 | ```
272 |
273 | ☝️ When you run a script that is not a test, all the applicable tests are run as well. For instance, the following command will run the `core.users` script and all the tests that are applicable to it:
274 |
275 | ```sh
276 | lea run --select core.users
277 | ```
278 |
279 | You may decide to run all scripts without executing tests, which is obviously not advisable:
280 |
281 | ```sh
282 | lea run --unselect tests/
283 | lea run --select core.users --unselect tests/
284 | ```
285 |
286 | ### Skipping unmodified scripts during development
287 |
288 | When you call `lea run`, it generates audit tables, which are then promoted to replace the original tables. This is done to ensure that the data is consistent and reliable. lea doesn't run a script when its audit table already exists and the script hasn't been modified since the last time the audit table was created. This avoids unnecessary re-runs of scripts that haven't changed.
289 |
290 | For instance:
291 |
292 | 1. You execute `lea run` to sync all tables from sources; there are no errors, and all tables are materialized.
293 | 2. You modify a script named `core/expenses.sql`, which depends on `staging/customers.sql` and `staging/orders.sql`.
294 | 3. You execute `lea run --select core.expenses+` to run all impacted tables again.
295 | 4. `core__expenses___audit` is materialized in your data warehouse, but the `-- #NO_NULLS` assertion test on a column fails.
296 | 5. After reviewing the data in `core__expenses___audit`, you edit and fix `core/expenses.sql` to filter out rows where NULLs appear.
297 | 6. You execute `lea run`.
298 | 7. The `staging/customers.sql` and `staging/orders.sql` scripts are skipped, because they were modified before `staging__customers` and `staging__orders` were last materialized.
299 | 8. The `core/expenses.sql` script is run, because it was modified after `core__expenses` was last materialized.
300 | 9. All audit tables are wiped from the database, as the whole DAG has run successfully! 🎉
301 |
302 | You can disable this behavior altogether:
303 |
304 | ```sh
305 | lea run --restart
306 | ```
307 |
308 | ## Warehouse specific features
309 |
310 | ### BigQuery
311 |
312 | #### Default clustering
313 |
314 | At Carbonfact, we cluster most of our tables by customer. This is done to optimize query performance and reduce costs. lea allows you to automatically cluster tables that contain a given field:
315 |
316 | ```sh
317 | LEA_BQ_DEFAULT_CLUSTERING_FIELDS=account_slug
318 | ```
319 |
320 | You can also specify multiple fields, meaning that tables which contain both fields will be clustered by both:
321 |
322 | ```sh
323 | LEA_BQ_DEFAULT_CLUSTERING_FIELDS=account_slug,brand_slug
324 | ```
325 |
326 | For each table, lea will use the clustering fields it can and ignore the others. With the previous configuration, if your table defines `account_slug` and not `brand_slug`, it will cluster by `account_slug`.
327 |
328 | #### Big Blue Pick API
329 |
330 | [Big Blue](https://biq.blue/) is a SaaS product to monitor and optimize BigQuery costs. As part of their offering, they provide a [Pick API](https://biq.blue/blog/compute/how-to-implement-bigquery-autoscaling-reservation-in-10-minutes). The idea is that some queries should be run on-demand, while others should be run on a reservation. Big Blue's Pick API suggests which billing model to use for each query.
331 |
332 | We use this at Carbonfact, and so this API is available out of the box in lea. You can enable it by setting the following environment variables:
333 |
334 | ```sh
335 | LEA_BQ_BIG_BLUE_PICK_API_KEY=
336 | LEA_BQ_BIG_BLUE_PICK_API_URL=https://pick.biq.blue
337 | LEA_BQ_BIG_BLUE_PICK_API_ON_DEMAND_PROJECT_ID=on-demand-compute-project-id
338 | LEA_BQ_BIG_BLUE_PICK_API_REVERVATION_PROJECT_ID=reservation-compute-project-id
339 | ```
340 |
341 | ## Contributing
342 |
343 | Feel free to reach out to [max@carbonfact.com](mailto:max@carbonfact.com) if you want to know more and/or contribute 😊
344 |
345 | We have suggested [some issues](https://github.com/carbonfact/lea/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc+label%3A%22good+first+issue%22) as good places to get started.
346 |
347 | ## License
348 |
349 | lea is free and open-source software licensed under the Apache License, Version 2.0.
350 |
-------------------------------------------------------------------------------- /examples/diff/README.md: --------------------------------------------------------------------------------
1 | # Compare development to production
2 |
3 | The first thing to do is create an `.env` file, like so:
4 |
5 | ```sh
6 | echo "
7 | LEA_USERNAME=max
8 | LEA_WAREHOUSE=duckdb
9 | LEA_DUCKDB_PATH=jaffle_shop.db
10 | " > .env
11 | ```
12 |
13 | This example is about comparing data in development to what's in production. For the purpose of this example, there's a `views/prod` directory and a `views/dev` directory.
14 |
15 | Let's start by running the views in production. First, the schemas need to be created:
16 |
17 | ```sh
18 | lea prepare views/prod --production
19 | ```
20 |
21 | ```
22 | Created schema staging
23 | Created schema core
24 | ```
25 |
26 | The views can now be run in production:
27 |
28 | ```sh
29 | lea run views/prod --production
30 | ```
31 |
32 | Now let's say we're working in development. We would start by creating the schemas:
33 |
34 | ```sh
35 | lea prepare views/dev
36 | ```
37 |
38 | ```
39 | Created schema staging
40 | Created schema core
41 | Created schema analytics
42 | ```
43 |
44 | We make some changes by editing the `views/dev` directory. Then we can run the views in development:
45 |
46 | ```sh
47 | lea run views/dev
48 | ```
49 |
50 | Now we can compare the data in development to the data in production:
51 |
52 | ```sh
53 | lea diff
54 | ```
55 |
56 | ```diff
57 | + analytics.kpis
58 | + 1 rows
59 | + metric
60 | + value
61 |
62 | - core.customers
63 | - 100 rows
64 | - customer_id
65 | - customer_lifetime_value
66 | - first_name
67 | - first_order
68 | - last_name
69 | - most_recent_order
70 | - number_of_orders
71 |
72 | core.orders
73 | - 29 rows
74 | ```
75 |
76 | The diff shows several things:
77 |
78 | - The `customers` view got dropped.
79 | - The `orders` view didn't get dropped, but it lost some rows. This is because we added a `WHERE` clause to the underlying SQL.
80 | - The `kpis` view got added, and it contains a single row.
81 |
82 | The nice thing is that `lea diff` prints out a neat summary. This output can be highlighted on GitHub, which is what we've done above, by using a `diff` code block.
83 |
84 | In a pull request, an automated message can be posted with the diff.
Here is an example of a GitHub action that does this: 85 | 86 | ````yaml 87 | name: Branch tests 88 | 89 | on: 90 | pull_request: 91 | branches: 92 | - "*" 93 | 94 | jobs: 95 | run: 96 | runs-on: ubuntu-latest 97 | env: 98 | LEA_WAREHOUSE: bigquery 99 | LEA_BQ_SERVICE_ACCOUNT: ${{ secrets.LEA_BQ_SERVICE_ACCOUNT }} 100 | LEA_BQ_LOCATION: EU 101 | LEA_BQ_PROJECT_ID: carbonlytics 102 | LEA_SCHEMA: kaya 103 | steps: 104 | - uses: actions/checkout@v4 105 | - uses: ./.github/actions/install-env 106 | 107 | - name: Check code quality 108 | run: poetry run pre-commit run --all-files 109 | 110 | - name: Set environment variables 111 | run: | 112 | export PR_NUMBER=$(cut -d'/' -f3 <<< "$GITHUB_REF") 113 | export LEA_USERNAME="pr$PR_NUMBER" 114 | echo "LEA_USERNAME=$LEA_USERNAME" >> $GITHUB_ENV 115 | 116 | - name: Create BigQuery dataset for this pull request 117 | run: poetry run lea prepare 118 | 119 | - name: Refresh views 120 | run: poetry run lea run --raise-exceptions 121 | 122 | - name: Calculate diff 123 | run: | 124 | export DIFF=$(poetry run lea diff kaya_$LEA_USERNAME kaya) 125 | DIFF=$(echo "$DIFF" | sed '1d') 126 | EOF=$(dd if=/dev/urandom bs=15 count=1 status=none | base64) 127 | echo "DIFF<<$EOF" >> "$GITHUB_ENV" 128 | echo "$DIFF" >> "$GITHUB_ENV" 129 | echo "$EOF" >> "$GITHUB_ENV" 130 | 131 | - name: Comment PR with execution number 132 | uses: thollander/actions-comment-pull-request@v2 133 | with: 134 | message: | 135 | ```diff 136 | ${{ env.DIFF }} 137 | ``` 138 | comment_tag: execution 139 | 140 | - name: Run tests 141 | run: poetry run lea test --raise-exceptions 142 | ```` 143 | -------------------------------------------------------------------------------- /examples/diff/views/dev/analytics/kpis.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | 'n_orders' AS metric, 3 | COUNT(*) AS value 4 | FROM 5 | core.orders 6 | -------------------------------------------------------------------------------- /examples/diff/views/dev/core/orders.sql.jinja: -------------------------------------------------------------------------------- 1 | {% set payment_methods = ['credit_card', 'coupon', 'bank_transfer', 'gift_card'] %} 2 | 3 | with order_payments as ( 4 | 5 | select 6 | order_id, 7 | 8 | {% for payment_method in payment_methods -%} 9 | sum(case when payment_method = '{{ payment_method }}' then amount else 0 end) as {{ payment_method }}_amount, 10 | {% endfor -%} 11 | 12 | sum(amount) as total_amount 13 | 14 | from staging.payments 15 | 16 | group by order_id 17 | 18 | ) 19 | 20 | select 21 | orders.order_id, 22 | orders.customer_id, 23 | orders.order_date, 24 | orders.status, 25 | 26 | {% for payment_method in payment_methods -%} 27 | 28 | order_payments.{{ payment_method }}_amount, 29 | 30 | {% endfor -%} 31 | 32 | order_payments.total_amount as amount 33 | 34 | from staging.orders 35 | left join order_payments 36 | on orders.order_id = order_payments.order_id 37 | 38 | where date_part('month', cast(order_date AS date)) > 1 39 | -------------------------------------------------------------------------------- /examples/diff/views/dev/staging/customers.py: -------------------------------------------------------------------------------- 1 | """Docstring for the customers view.""" 2 | 3 | from __future__ import annotations 4 | 5 | import pathlib 6 | 7 | import pandas as pd 8 | 9 | here = pathlib.Path(__file__).parent 10 | customers = pd.read_csv( 11 | here.parents[3] / "jaffle_shop" / "jaffle_shop" / "seeds" / "raw_customers.csv" 12 | ) 13 
| customers = customers.rename(columns={"id": "customer_id"}) 14 | -------------------------------------------------------------------------------- /examples/diff/views/dev/staging/orders.py: -------------------------------------------------------------------------------- 1 | """Docstring for the orders view.""" 2 | from __future__ import annotations 3 | 4 | import pathlib 5 | 6 | import pandas as pd 7 | 8 | here = pathlib.Path(__file__).parent 9 | orders = pd.read_csv(here.parents[3] / "jaffle_shop" / "jaffle_shop" / "seeds" / "raw_orders.csv") 10 | orders = orders.rename(columns={"id": "order_id", "user_id": "customer_id"}) 11 | -------------------------------------------------------------------------------- /examples/diff/views/dev/staging/payments.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pathlib 4 | 5 | import pandas as pd 6 | 7 | here = pathlib.Path(__file__).parent 8 | payments = pd.read_csv( 9 | here.parents[3] / "jaffle_shop" / "jaffle_shop" / "seeds" / "raw_payments.csv" 10 | ) 11 | payments = payments.rename(columns={"id": "payment_id"}) 12 | payments["amount"] = payments["amount"] 13 | -------------------------------------------------------------------------------- /examples/diff/views/prod/core/customers.sql: -------------------------------------------------------------------------------- 1 | with customer_orders as ( 2 | 3 | select 4 | customer_id, 5 | 6 | min(order_date) as first_order, 7 | max(order_date) as most_recent_order, 8 | count(order_id) as number_of_orders 9 | from staging.orders 10 | 11 | group by customer_id 12 | 13 | ), 14 | 15 | customer_payments as ( 16 | 17 | select 18 | orders.customer_id, 19 | sum(amount) as total_amount 20 | 21 | from staging.payments 22 | 23 | left join staging.orders on 24 | payments.order_id = orders.order_id 25 | 26 | group by orders.customer_id 27 | 28 | ) 29 | 30 | select 31 | -- #UNIQUE 32 | customers.customer_id, 33 | customers.first_name, 34 | customers.last_name, 35 | customer_orders.first_order, 36 | customer_orders.most_recent_order, 37 | customer_orders.number_of_orders, 38 | customer_payments.total_amount as customer_lifetime_value 39 | 40 | from staging.customers 41 | 42 | left join customer_orders 43 | on customers.customer_id = customer_orders.customer_id 44 | 45 | left join customer_payments 46 | on customers.customer_id = customer_payments.customer_id 47 | -------------------------------------------------------------------------------- /examples/diff/views/prod/core/orders.sql.jinja: -------------------------------------------------------------------------------- 1 | {% set payment_methods = ['credit_card', 'coupon', 'bank_transfer', 'gift_card'] %} 2 | 3 | with order_payments as ( 4 | 5 | select 6 | order_id, 7 | 8 | {% for payment_method in payment_methods -%} 9 | sum(case when payment_method = '{{ payment_method }}' then amount else 0 end) as {{ payment_method }}_amount, 10 | {% endfor -%} 11 | 12 | sum(amount) as total_amount 13 | 14 | from staging.payments 15 | 16 | group by order_id 17 | 18 | ) 19 | 20 | select 21 | orders.order_id, 22 | orders.customer_id, 23 | orders.order_date, 24 | orders.status, 25 | 26 | {% for payment_method in payment_methods -%} 27 | 28 | order_payments.{{ payment_method }}_amount, 29 | 30 | {% endfor -%} 31 | 32 | order_payments.total_amount as amount 33 | 34 | from staging.orders orders 35 | 36 | 37 | left join order_payments 38 | on orders.order_id = order_payments.order_id 39 | 
-------------------------------------------------------------------------------- /examples/diff/views/prod/staging/customers.py: -------------------------------------------------------------------------------- 1 | """Docstring for the customers view.""" 2 | 3 | from __future__ import annotations 4 | 5 | import pathlib 6 | 7 | import pandas as pd 8 | 9 | here = pathlib.Path(__file__).parent 10 | customers = pd.read_csv( 11 | here.parents[3] / "jaffle_shop" / "jaffle_shop" / "seeds" / "raw_customers.csv" 12 | ) 13 | customers = customers.rename(columns={"id": "customer_id"}) 14 | -------------------------------------------------------------------------------- /examples/diff/views/prod/staging/orders.py: -------------------------------------------------------------------------------- 1 | """Docstring for the orders view.""" 2 | from __future__ import annotations 3 | 4 | import pathlib 5 | 6 | import pandas as pd 7 | 8 | here = pathlib.Path(__file__).parent 9 | orders = pd.read_csv(here.parents[3] / "jaffle_shop" / "jaffle_shop" / "seeds" / "raw_orders.csv") 10 | orders = orders.rename(columns={"id": "order_id", "user_id": "customer_id"}) 11 | -------------------------------------------------------------------------------- /examples/diff/views/prod/staging/payments.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pathlib 4 | 5 | import pandas as pd 6 | 7 | here = pathlib.Path(__file__).parent 8 | payments = pd.read_csv( 9 | here.parents[3] / "jaffle_shop" / "jaffle_shop" / "seeds" / "raw_payments.csv" 10 | ) 11 | payments = payments.rename(columns={"id": "payment_id"}) 12 | payments["amount"] = payments["amount"] / 100 # convert cents to dollars 13 | -------------------------------------------------------------------------------- /examples/incremental/README.md: -------------------------------------------------------------------------------- 1 | # Incremental scripts 2 | 3 | Let's start with creating the database in the usual way: 4 | 5 | ```sh 6 | echo " 7 | LEA_USERNAME=max 8 | LEA_WAREHOUSE=duckdb 9 | LEA_DUCKDB_PATH=incremental.db 10 | " > .env 11 | ``` 12 | 13 | There are two `scripts` folders to simulate two days with different amounts of data. Let's say we're the 4th of January, and we run our views: 14 | 15 | ```sh 16 | lea run --scripts scripts_today 17 | ``` 18 | 19 | ```sh 20 | python -c "import duckdb; print(duckdb.connect('incremental_max.db').execute('SELECT created_at, day_of_year FROM core.events').df())" 21 | ``` 22 | 23 | ``` 24 | created_at day_of_year 25 | 0 2023-01-02 2 26 | 1 2023-01-03 3 27 | 2 2023-01-04 4 28 | ``` 29 | 30 | The next day, there's new data. When we refresh, we don't want to start from scratch. We want to keep the data from the previous day and only add the new data. This will happen automatically because the view is tagged with a `#INCREMENTAL` comment. 31 | 32 | ```sh 33 | lea run --script scripts_tomorrow --select core.events --incremental day_of_year 5 34 | ``` 35 | 36 | ```sh 37 | python -c "import duckdb; print(duckdb.connect('incremental_max.db').execute('SELECT created_at, day_of_year FROM core.events').df())" 38 | ``` 39 | 40 | ``` 41 | created_at day_of_year 42 | 0 2023-01-02 2 43 | 1 2023-01-03 3 44 | 2 2023-01-04 4 45 | 3 2023-01-05 5 46 | ``` 47 | 48 | We can see the new event from the 5th of January. However, in this case there is an event from the 1st of January that is missing. This is because the event has arrived with a delay. 
In such cases, we can force a full refresh by omitting the `--incremental` flag:
49 |
50 | ```sh
51 | lea run --script scripts_tomorrow
52 | ```
53 |
54 | ```sh
55 | python -c "import duckdb; print(duckdb.connect('incremental_max.db').execute('SELECT * FROM core.events').df())"
56 | ```
57 |
58 | ```
59 |   created_at  day_of_year
60 | 0 2023-01-01            1
61 | 1 2023-01-02            2
62 | 2 2023-01-03            3
63 | 3 2023-01-04            4
64 | 4 2023-01-05            5
65 | ```
66 |
-------------------------------------------------------------------------------- /examples/incremental/scripts_today/core/events.sql: --------------------------------------------------------------------------------
1 | SELECT
2 |     DATE '2023-01-01' + INTERVAL (i) DAY AS created_at,
3 |     i + 1 AS day_of_year
4 | FROM GENERATE_SERIES(1, 3) AS t(i)
5 |
-------------------------------------------------------------------------------- /examples/incremental/scripts_tomorrow/core/events.sql: --------------------------------------------------------------------------------
1 | SELECT
2 |     DATE '2023-01-01' + INTERVAL (i) DAY AS created_at,
3 |     -- #INCREMENTAL
4 |     i + 1 AS day_of_year
5 | FROM GENERATE_SERIES(0, 4) AS t(i)
6 |
-------------------------------------------------------------------------------- /examples/jaffle_shop/README.md: --------------------------------------------------------------------------------
1 | # Jaffle shop example
2 |
3 | This example is taken from the [`jaffle_shop` example](https://github.com/dbt-labs/jaffle_shop/) from dbt. Here is the scripts file structure:
4 |
5 | ```
6 | scripts
7 | ├── analytics
8 | │   ├── finance
9 | │   │   └── kpis.sql
10 | │   └── kpis.sql
11 | ├── core
12 | │   ├── customers.sql
13 | │   └── orders.sql.jinja
14 | ├── staging
15 | │   ├── customers.sql
16 | │   ├── orders.sql
17 | │   └── payments.sql
18 | └── tests
19 |     └── orders_are_dated.sql
20 | ```
21 |
22 | The first thing to do is create an `.env` file, like so:
23 |
24 | ```sh
25 | echo "
26 | LEA_USERNAME=max
27 | LEA_WAREHOUSE=duckdb
28 | LEA_DUCKDB_PATH=jaffle_shop.db
29 | " > .env
30 | ```
31 |
32 | This example uses DuckDB as the data warehouse. With lea, the convention when using DuckDB is to use a separate `.db` file per environment. For instance, in production, the file would be called `jaffle_shop.db`. In development, the file would be called `jaffle_shop_max.db`. The `max` suffix is the username from the `.env` file.
33 |
34 | You can run the scripts:
35 |
36 | ```sh
37 | lea run
38 | ```
39 |
40 | lea will create audit tables, run tests against them, and promote them to regular tables if everything is successful.
41 |
42 | There are a couple of cool things:
43 |
44 | 1. The staging schema is populated using SQL scripts and native DuckDB parsing of CSV files.
45 | 2. The `core.orders` table is created using a Jinja SQL script. lea will automatically run the script through Jinja, and then execute the resulting SQL.
46 | 3. The skip feature helps speed up the development cycle when using the WAP pattern. If a table doesn't pass its audit, already materialized tables won't be run again as long as their associated SQL script hasn't changed.
47 | If a script has changed, its audit table will be generated again, along with all of its children in the DAG.
48 |
49 | Let's take the example given in [README.md](README.md).
50 |
51 | - Tables are already materialized, since you ran `lea run` earlier.
52 |
53 | ## Write
54 |
55 | - Add a new script, `core/expenses.sql`:
56 |
57 | ```sh
58 | echo '''
59 | with customer_orders as (
60 |
61 |     select
62 |         customer_id,
63 |
64 |         min(order_date) as first_order,
65 |         max(order_date) as most_recent_order,
66 |         count(order_id) as number_of_orders
67 |     from staging.orders
68 |
69 |     group by customer_id
70 |
71 | ),
72 |
73 | customer_payments as (
74 |
75 |     select
76 |         orders.customer_id,
77 |         sum(payments.amount) as total_amount
78 |
79 |     from staging.payments as payments
80 |
81 |     left join staging.orders as orders
82 |         on payments.order_id = orders.order_id
83 |
84 |     group by orders.customer_id
85 |
86 | ),
87 |
88 | expenses as (
89 |     select
90 |         -- #UNIQUE
91 |         customers.customer_id,
92 |         customers.first_name,
93 |         customers.last_name,
94 |         customer_orders.first_order,
95 |         customer_orders.most_recent_order,
96 |         customer_orders.number_of_orders,
97 |         -- #NO_NULLS
98 |         customer_payments.total_amount as customer_lifetime_value
99 |     from staging.customers as customers --comment here
100 |     left join customer_orders --comment here
101 |         on customers.customer_id = customer_orders.customer_id --comment here
102 |     -- FROM customer_orders --uncomment here
103 |     -- left join staging.customers as customers --uncomment here
104 |     -- on customer_orders.customer_id = customers.customer_id --uncomment here
105 |     left join customer_payments
106 |         on customers.customer_id = customer_payments.customer_id
107 | )
108 |
109 | select * from expenses
110 | ''' > scripts/core/expenses.sql
111 | ```
112 |
113 | ## Audit
114 |
115 | - Run the scripts with `lea run`: the `lea_duckdb_max.tests.core__expenses__customer_lifetime_value___no_nulls___audit` test is failing ❌
116 | - Comment and uncomment the marked lines to reverse the JOIN order, which excludes customers that are absent from the orders table:
117 |
118 | ```sh
119 | sed -i '' '/--comment here/s/^/--/' scripts/core/expenses.sql
120 | sed -i '' '/--uncomment here/s/-- //' scripts/core/expenses.sql
121 | ```
122 |
123 | - Run the scripts again; you should see that the staging audit tables are not executed again.
124 | - `core.expenses` is executed, as lea detected a modification to the script.
125 | - All tests are now passing 🎉
126 | - Audit tables are wiped from the development warehouse.
127 |
128 | ## Publish
129 |
130 | - As all tests passed, tables are materialized in the development warehouse.
131 | - If you want now to run it against production and not development warehouse, you would add a `--production` flag to each command: 132 | 133 | ```sh 134 | lea run --production 135 | ``` 136 | -------------------------------------------------------------------------------- /examples/jaffle_shop/docs/README.md: -------------------------------------------------------------------------------- 1 | # Views 2 | 3 | ## Schemas 4 | 5 | - [`analytics`](./analytics) 6 | - [`core`](./core) 7 | - [`staging`](./staging) 8 | 9 | ## Schema flowchart 10 | 11 | ```mermaid 12 | %%{init: {"flowchart": {"defaultRenderer": "elk"}} }%% 13 | flowchart TB 14 | analytics(analytics) 15 | core(core) 16 | staging(staging) 17 | core --> analytics 18 | staging --> core 19 | ``` 20 | 21 | ## Flowchart 22 | 23 | ```mermaid 24 | %%{init: {"flowchart": {"defaultRenderer": "elk"}} }%% 25 | flowchart TB 26 | 27 | subgraph analytics 28 | 29 | subgraph finance 30 | analytics.finance.kpis(kpis) 31 | end 32 | 33 | analytics.kpis(kpis) 34 | end 35 | 36 | 37 | subgraph core 38 | core.customers(customers) 39 | core.orders(orders) 40 | end 41 | 42 | 43 | subgraph staging 44 | staging.customers(customers) 45 | staging.orders(orders) 46 | staging.payments(payments) 47 | end 48 | 49 | core.orders --> analytics.finance.kpis 50 | core.customers --> analytics.kpis 51 | core.orders --> analytics.kpis 52 | staging.customers --> core.customers 53 | staging.orders --> core.customers 54 | staging.payments --> core.customers 55 | staging.orders --> core.orders 56 | staging.payments --> core.orders 57 | ``` 58 | 59 | -------------------------------------------------------------------------------- /examples/jaffle_shop/docs/analytics/README.md: -------------------------------------------------------------------------------- 1 | # analytics 2 | 3 | ## Table of contents 4 | 5 | - [analytics.finance.kpis](#analyticsfinancekpis) 6 | - [analytics.kpis](#analyticskpis) 7 | 8 | ## Views 9 | 10 | ### analytics.finance.kpis 11 | 12 | ```sql 13 | SELECT * 14 | FROM analytics.finance__kpis 15 | ``` 16 | 17 | | Column | Description | Unique | 18 | |:--------------------|:--------------|:---------| 19 | | total_order_value | | | 20 | | average_order_value | | | 21 | 22 | ### analytics.kpis 23 | 24 | ```sql 25 | SELECT * 26 | FROM analytics.kpis 27 | ``` 28 | 29 | | Column | Description | Unique | 30 | |:---------|:--------------|:---------| 31 | | metric | | | 32 | | value | | | 33 | 34 | -------------------------------------------------------------------------------- /examples/jaffle_shop/docs/core/README.md: -------------------------------------------------------------------------------- 1 | # core 2 | 3 | ## Table of contents 4 | 5 | - [core.customers](#corecustomers) 6 | - [core.orders](#coreorders) 7 | 8 | ## Views 9 | 10 | ### core.customers 11 | 12 | ```sql 13 | SELECT * 14 | FROM core.customers 15 | ``` 16 | 17 | | Column | Description | Unique | 18 | |:------------------------|:--------------|:---------| 19 | | customer_id | | ✅ | 20 | | first_name | | | 21 | | last_name | | | 22 | | first_order | | | 23 | | most_recent_order | | | 24 | | number_of_orders | | | 25 | | customer_lifetime_value | | | 26 | 27 | ### core.orders 28 | 29 | ```sql 30 | SELECT * 31 | FROM core.orders 32 | ``` 33 | 34 | | Column | Description | Unique | 35 | |:---------------------|:--------------|:---------| 36 | | order_id | | | 37 | | customer_id | | | 38 | | order_date | | | 39 | | status | | | 40 | | credit_card_amount | | | 41 | | coupon_amount | | | 42 | | 
bank_transfer_amount | | | 43 | | gift_card_amount | | | 44 | | amount | | | 45 | 46 | -------------------------------------------------------------------------------- /examples/jaffle_shop/docs/staging/README.md: -------------------------------------------------------------------------------- 1 | # staging 2 | 3 | ## Table of contents 4 | 5 | - [staging.customers](#stagingcustomers) 6 | - [staging.orders](#stagingorders) 7 | - [staging.payments](#stagingpayments) 8 | 9 | ## Views 10 | 11 | ### staging.customers 12 | 13 | Docstring for the customers view. 14 | 15 | ```sql 16 | SELECT * 17 | FROM staging.customers 18 | ``` 19 | 20 | | Column | Description | Unique | 21 | |----------|---------------|----------| 22 | 23 | ### staging.orders 24 | 25 | Docstring for the orders view. 26 | 27 | ```sql 28 | SELECT * 29 | FROM staging.orders 30 | ``` 31 | 32 | | Column | Description | Unique | 33 | |----------|---------------|----------| 34 | 35 | ### staging.payments 36 | 37 | ```sql 38 | SELECT * 39 | FROM staging.payments 40 | ``` 41 | 42 | | Column | Description | Unique | 43 | |----------|---------------|----------| 44 | 45 | -------------------------------------------------------------------------------- /examples/jaffle_shop/scripts/analytics/finance/kpis.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | SUM(amount) AS total_order_value, 3 | AVG(amount) AS average_order_value 4 | FROM core.orders 5 | -------------------------------------------------------------------------------- /examples/jaffle_shop/scripts/analytics/kpis.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | 'n_customers' AS metric, 3 | COUNT(*) AS value 4 | FROM 5 | core.customers 6 | 7 | UNION ALL 8 | 9 | SELECT 10 | 'n_orders' AS metric, 11 | COUNT(*) AS value 12 | FROM 13 | core.orders 14 | -------------------------------------------------------------------------------- /examples/jaffle_shop/scripts/core/customers.sql: -------------------------------------------------------------------------------- 1 | with customer_orders as ( 2 | 3 | select 4 | customer_id, 5 | 6 | min(order_date) as first_order, 7 | max(order_date) as most_recent_order, 8 | count(order_id) as number_of_orders 9 | from staging.orders 10 | 11 | group by customer_id 12 | 13 | ), 14 | 15 | customer_payments as ( 16 | 17 | select 18 | orders.customer_id, 19 | sum(amount) as total_amount 20 | 21 | from staging.payments 22 | 23 | left join staging.orders orders using (order_id) 24 | 25 | group by orders.customer_id 26 | 27 | ) 28 | 29 | select 30 | -- #UNIQUE 31 | customers.customer_id, 32 | customers.first_name, 33 | customers.last_name, 34 | customer_orders.first_order, 35 | customer_orders.most_recent_order, 36 | customer_orders.number_of_orders, 37 | customer_payments.total_amount as customer_lifetime_value 38 | 39 | from staging.customers customers 40 | 41 | left join customer_orders 42 | on customers.customer_id = customer_orders.customer_id 43 | 44 | left join customer_payments 45 | on customers.customer_id = customer_payments.customer_id 46 | -------------------------------------------------------------------------------- /examples/jaffle_shop/scripts/core/orders.sql.jinja: -------------------------------------------------------------------------------- 1 | {% set payment_methods = ['credit_card', 'coupon', 'bank_transfer', 'gift_card'] %} 2 | 3 | with order_payments as ( 4 | 5 | select 6 | order_id, 7 | 8 | {% for payment_method in payment_methods -%} 
9 | sum(case when payment_method = '{{ payment_method }}' then amount else 0 end) as {{ payment_method }}_amount, 10 | {% endfor -%} 11 | 12 | sum(amount) as total_amount 13 | 14 | from staging.payments 15 | 16 | group by order_id 17 | 18 | ) 19 | 20 | select 21 | orders.order_id, 22 | orders.customer_id, 23 | orders.order_date, 24 | orders.status, 25 | 26 | {% for payment_method in payment_methods -%} 27 | 28 | order_payments.{{ payment_method }}_amount, 29 | 30 | {% endfor -%} 31 | 32 | order_payments.total_amount as amount 33 | 34 | from staging.orders orders 35 | 36 | 37 | left join order_payments 38 | on orders.order_id = order_payments.order_id 39 | -------------------------------------------------------------------------------- /examples/jaffle_shop/scripts/staging/customers.sql: -------------------------------------------------------------------------------- 1 | WITH raw_customers AS ( 2 | SELECT * FROM 'jaffle_shop/seeds/raw_customers.csv' 3 | ) 4 | 5 | SELECT 6 | id AS customer_id, 7 | first_name, 8 | last_name 9 | FROM raw_customers; 10 | -------------------------------------------------------------------------------- /examples/jaffle_shop/scripts/staging/orders.sql: -------------------------------------------------------------------------------- 1 | WITH raw_orders AS ( 2 | SELECT 3 | id, 4 | user_id, 5 | order_date, 6 | status 7 | FROM 'jaffle_shop/seeds/raw_orders.csv' 8 | ) 9 | 10 | SELECT 11 | id AS order_id, 12 | user_id AS customer_id, 13 | order_date, 14 | status 15 | FROM raw_orders; 16 | -------------------------------------------------------------------------------- /examples/jaffle_shop/scripts/staging/payments.sql: -------------------------------------------------------------------------------- 1 | WITH raw_payments AS (SELECT * FROM 'jaffle_shop/seeds/raw_payments.csv') 2 | 3 | SELECT 4 | id AS payments_id, 5 | order_id, 6 | payment_method, 7 | amount / 100 AS amount 8 | FROM raw_payments; 9 | -------------------------------------------------------------------------------- /examples/jaffle_shop/scripts/tests/orders_are_dated.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM core.orders 3 | WHERE order_date IS NULL 4 | -------------------------------------------------------------------------------- /examples/motherduck/README.md: -------------------------------------------------------------------------------- 1 | # Using MotherDuck 2 | 3 | lea works with DuckDB, and thus can be used with [MotherDuck](https://motherduck.com/) too. 4 | 5 | Here is an example `.env` file: 6 | 7 | ```sh 8 | echo " 9 | LEA_USERNAME=max 10 | LEA_WAREHOUSE=duckdb 11 | LEA_DUCKDB_PATH=md:jaffle_shop 12 | MOTHERDUCK_TOKEN= 13 | " > .env 14 | ``` 15 | 16 | The token can be obtained by logging into MotherDuck from the terminal, as documented [here](https://motherduck.com/docs/getting-started/connect-query-from-python/installation-authentication#authenticating-to-motherduck). 17 | 18 | Then, you can run the usual commands. 
For the sake of example, let's re-use the jaffle shop views: 19 | 20 | ```sh 21 | lea prepare ../jaffle_shop/views 22 | ``` 23 | 24 | ``` 25 | Created schema analytics 26 | Created schema staging 27 | Created schema core 28 | ``` 29 | 30 | ```sh 31 | lea run ../jaffle_shop/views 32 | ``` 33 | 34 | You should see the views in your MotherDuck UI. 35 | -------------------------------------------------------------------------------- /examples/school/README.md: -------------------------------------------------------------------------------- 1 | # School example 2 | 3 | Let's go back to school with an example that demonstrates the use of tests. 4 | 5 | ## Bootstrapping 6 | 7 | First, the usual bootstrapping of the database for `lea`, which starts with creating a `.env` file: 8 | 9 | ```sh 10 | echo " 11 | LEA_USERNAME=max 12 | LEA_WAREHOUSE=duckdb 13 | LEA_DUCKDB_PATH=school.db 14 | " > .env 15 | ``` 16 | 17 | This example uses DuckDB as the data warehouse. 18 | 19 | You can run the scripts: 20 | 21 | ```sh 22 | lea run 23 | ``` 24 | 25 | lea will create the schemas in DuckDB and build audit tables based on the script definitions. 26 | Once the audit tables are generated, lea runs the tests against them. 27 | 28 | Let's review some tests together. 29 | 30 | ## Exploration 31 | 32 | Visualize the students in this school: 33 | 34 | ```sh 35 | python -c "import duckdb; print(duckdb.connect('school_max.db').execute('SELECT student_id, first_name, last_name, university FROM staging.students').df())" 36 | ``` 37 | 38 | ``` 39 | student_id first_name last_name university 40 | 0 1 Lauren Levine Stanford University 41 | 1 2 Daniel Lopez Massachusetts Institute of Technology 42 | 2 3 Melanie Foster University of California Berkeley 43 | 3 4 Gabriel Cooke Harvard University 44 | 4 5 Anne Porter Harvard University 45 | 5 6 Amy Lee Princeton University 46 | 6 7 Rebecca Chavez Princeton University 47 | ``` 48 | 49 | Now that you can see the students, let's review their grades! 50 | 51 | ```sh 52 | python -c "import duckdb; print(duckdb.connect('school_max.db').execute('SELECT student_id, student_name, class_name, semester, average_grade FROM core.yearly_results USING SAMPLE 5').df())" 53 | ``` 54 | 55 | ``` 56 | student_id student_name class_name semester average_grade 57 | 0 6 Amy Lee Mathematics Semester 1 59.0 58 | 1 5 Anne Porter Literature Semester 2 100.0 59 | 2 5 Anne Porter Physics Semester 2 46.0 60 | 3 1 Lauren Levine Biology Semester 1 28.5 61 | 4 1 Lauren Levine Literature Semester 2 52.5 62 | ``` 63 | 64 | ## Tests 65 | 66 | Awesome! Pretty good students. Now let's review some of the tests that were defined. 67 | 68 | ```sql 69 | WITH raw_students AS ( 70 | SELECT * FROM './seeds/raw_students.csv' 71 | ) 72 | 73 | SELECT 74 | -- #UNIQUE 75 | -- #NO_NULLS 76 | id AS student_id, 77 | first_name, 78 | -- #UNIQUE_BY(first_name) 79 | last_name, 80 | -- #SET{'Stanford University', 'University of California Berkeley', 'Princeton University', 'Harvard University', 'Massachusetts Institute of Technology'} 81 | university, 82 | FROM raw_students; 83 | ``` 84 | 85 | During the Write-Audit-Publish pattern, these checks enforce data quality through assertion tests.
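To make these tags more concrete: lea renders each tag with one of the templates in `lea/assertions/` (here `SET.sql.jinja`), so the check generated for the `university` column looks roughly like the query below. The `staging.students___audit` table name is only an illustration of the audit table the check targets; the exact name is derived by lea.

```sql
-- Sketch of the rendered SET assertion; any returned row is a violation.
-- The audited table name below is illustrative.
SELECT DISTINCT(university) AS university
FROM staging.students___audit
WHERE university NOT IN ('Stanford University', 'University of California Berkeley', 'Princeton University', 'Harvard University', 'Massachusetts Institute of Technology')
```

A row coming back from such a query means the assertion failed, which is exactly what happens later in this example when a Cornell University student sneaks into the seed file.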
86 | 87 | Here, for instance, the staging model's audit step will ensure that: 88 | 89 | - `student_id` values are not null and unique 90 | - `last_name` values are unique per `first_name` 91 | - `university` values belong to the allowed set 92 | 93 | ## WAP pattern in action - breaking a test during auditing 94 | 95 | Let's break a test on purpose, for demonstration: 96 | 97 | In `seeds/raw_students.csv`, let's add a new student: 98 | 99 | ```sh 100 | echo "8,Andy,Bernard,Cornell University,23" >> seeds/raw_students.csv 101 | ``` 102 | 103 | Let's run the scripts again: 104 | 105 | ```sh 106 | lea run 107 | ``` 108 | 109 | Cornell University is not in the allowed set: 110 | 111 | ``` 112 | ✋ Early ending because an error occurred 113 | 😴 Ending session 114 | STOPPED school_max.core.yearly_results___audit 115 | SUCCESS school_max.core.yearly_results___audit, contains 112 rows 116 | ERRORED school_max.tests.staging__students__university___set___audit 117 | university 118 | 0 Cornell University 119 | ❌ Finished, took less than a second 🚀 120 | ``` 121 | 122 | Remove the line we just added to restore the source file: 123 | 124 | ```sh 125 | sed -i '' '$d' seeds/raw_students.csv 126 | ``` 127 | 128 | Because the audit step prevented the corrupted data from being published, your tables 129 | are still healthy. 130 | 131 | ## Restart feature demo - get a fresh environment 132 | 133 | However, our audit tables are now messy and out of sync with the source, so let's rebuild them: 134 | 135 | ```sh 136 | lea run --restart 137 | ``` 138 | 139 | This flushes the audit tables, as if starting from scratch. 140 | 141 | ## Skipping feature demonstration 142 | 143 | You might now be wondering: "hey, how does lea know which audit tables should be run again 144 | and which shouldn't?" That's an excellent question! 145 | 146 | Have you noticed that `lea` automatically skips tables that don't need to be processed again during audits? 147 | 148 | Let's take a closer look with an example! 149 | 150 | First, let's look at the scholarship award winners. 151 | 152 | The top performing student in each class gets a $1,000 grant and the second best gets a $500 grant. 153 | 154 | You can see the winners of the year in `Economics`: 155 | 156 | ```sh 157 | python -c "import duckdb; print(duckdb.connect('school_max.db').execute('SELECT student_name, domain, scholarship_amount FROM analytics.scholarship_award WHERE domain = \'Economics\'').df())" 158 | ``` 159 | 160 | ``` 161 | student_name domain scholarship_amount 162 | 0 Daniel Lopez Economics 1000 163 | 1 Gabriel Cooke Economics 500 164 | ``` 165 | 166 | You can review the total amount of money spent: 167 | 168 | ```sh 169 | lea run --select analytics.finance.expenses 170 | python -c "import duckdb; print(duckdb.connect('school_max.db').execute('SELECT total_expenses FROM analytics.finance__expenses').df())" 171 | ``` 172 | 173 | ``` 174 | total_expenses 175 | 0 12000.0 176 | ``` 177 | 178 | Let's modify a script to demonstrate that lea only re-runs the scripts that have changed. 179 | 180 | Good news: the academy got twice the budget this year! You can now deliver a scholarship award 181 | to the top performing students **each semester**. 182 | 183 | To apply this change, uncomment the lines marked `--uncomment here` in `analytics.scholarship_award` and comment out the ones marked `--comment here`: 184 | 185 | ```sh 186 | sed -i '' '/--comment here/s/^/--/' scripts/analytics/scholarship_award.sql 187 | sed -i '' '/--uncomment here/s/-- //' scripts/analytics/scholarship_award.sql 188 | ``` 189 |
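After running these two `sed` commands, the window-function part of `scripts/analytics/scholarship_award.sql` should end up looking roughly like the excerpt below, which you can use to double-check that the edit took effect (only the relevant lines of the CTE are shown):

```sql
        semester, --uncomment here
        row_number() OVER (PARTITION BY class_name, semester --uncomment here
--        row_number() OVER (PARTITION BY class_name --comment here
            ORDER BY average_grade DESC
        ) AS ranking
```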
190 | Then run the finance script again: 191 | 192 | ```sh 193 | lea run --select analytics.finance.expenses 194 | ``` 195 | 196 | Oh no, the budget test is failing! Update the threshold in `scripts/tests/budget.sql`: 197 | 198 | ```sh 199 | sed -i '' '/--comment here/s/^/--/' scripts/tests/budget.sql 200 | sed -i '' '/--uncomment here/s/-- //' scripts/tests/budget.sql 201 | ``` 202 | 203 | Now let's run the scripts again: 204 | 205 | ```sh 206 | lea run 207 | ``` 208 | 209 | Everything passes 🎉 210 | 211 | Look closely: the audit table `school_max.core.yearly_results___audit` has **not** been materialized again, 212 | because it already existed and the script's modification date was **earlier** than the table's materialization date! 213 | 214 | It would have been executed again if the script had been modified **after** the last table materialization. 215 | 216 | You can check the table materialization date with: 217 | 218 | ```sh 219 | python -c "import duckdb; print(duckdb.connect('school_max.db').execute('SELECT MAX(_materialized_timestamp) AS last_materialized FROM analytics.scholarship_award').df())" 220 | ``` 221 | 222 | ``` 223 | last_materialized 224 | 0 2025-03-14 00:31:28.114 225 | ``` 226 | 227 | Now that the school has extra budget, you can view the new scholarship award winners! 228 | 229 | There are twice as many winners now, two per semester: 230 | 231 | ```sh 232 | python -c "import duckdb; print(duckdb.connect('school_max.db').execute('SELECT student_name, domain, semester, scholarship_amount FROM analytics.scholarship_award WHERE domain = \'Economics\'').df())" 233 | ``` 234 | 235 | ``` 236 | student_name domain semester scholarship_amount 237 | 0 Lauren Levine Economics Semester 2 1000 238 | 1 Gabriel Cooke Economics Semester 2 500 239 | 2 Daniel Lopez Economics Semester 1 1000 240 | 3 Gabriel Cooke Economics Semester 1 500 241 | ``` 242 | 243 | As you can see, the expenses have doubled: 244 | 245 | ```sh 246 | lea run --select analytics.finance.expenses 247 | python -c "import duckdb; print(duckdb.connect('school_max.db').execute('SELECT total_expenses FROM analytics.finance__expenses').df())" 248 | ``` 249 | 250 | ``` 251 | total_expenses 252 | 0 24000.0 253 | ``` 254 | -------------------------------------------------------------------------------- /examples/school/scripts/analytics/finance/expenses.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | SUM(scholarship_amount) AS total_expenses 3 | FROM 4 | analytics.scholarship_award 5 | -------------------------------------------------------------------------------- /examples/school/scripts/analytics/major.sql: -------------------------------------------------------------------------------- 1 | WITH ordered_yearly_results AS ( 2 | SELECT 3 | student_name, 4 | AVG(average_grade) AS total_grade 5 | FROM 6 | core.yearly_results 7 | GROUP BY student_name 8 | ) 9 | SELECT 10 | student_name, 11 | total_grade 12 | FROM ordered_yearly_results 13 | ORDER BY total_grade DESC 14 | LIMIT 1; 15 | -------------------------------------------------------------------------------- /examples/school/scripts/analytics/scholarship_award.sql: -------------------------------------------------------------------------------- 1 | WITH ordered_results AS ( 2 | SELECT 3 | student_name, 4 | class_name, 5 | -- semester, --uncomment here 6 | -- row_number() OVER (PARTITION BY class_name, semester --uncomment here 7 | row_number() OVER (PARTITION BY class_name --comment here 8 | ORDER BY average_grade DESC 9 | ) AS ranking 10 | FROM 11 | core.yearly_results 12 | ) 13 | SELECT 14 |
student_name, 15 | -- semester, --uncomment here 16 | class_name AS domain, 17 | CASE 18 | WHEN ranking = 1 THEN 1000 19 | WHEN ranking = 2 THEN 500 20 | ELSE 0 21 | END AS scholarship_amount 22 | FROM ordered_results 23 | WHERE ranking <= 2; 24 | -------------------------------------------------------------------------------- /examples/school/scripts/core/yearly_results.sql: -------------------------------------------------------------------------------- 1 | WITH grades_per_class_per_semester AS ( 2 | SELECT 3 | student_id, 4 | class_name, 5 | grade, 6 | CASE 7 | WHEN datepart('month', exam_date) BETWEEN 1 AND 6 THEN 'Semester 1' 8 | ELSE 'Semester 2' 9 | END AS semester 10 | FROM staging.grades 11 | ), 12 | avg_grades_per_class AS ( 13 | SELECT 14 | student_id, 15 | class_name, 16 | semester, 17 | AVG(grade) AS average_grade 18 | FROM grades_per_class_per_semester 19 | GROUP BY class_name, semester, student_id 20 | ) 21 | SELECT 22 | students.student_id, 23 | CONCAT(students.first_name, ' ', students.last_name) AS student_name, 24 | grades_per_class.class_name, 25 | grades_per_class.semester, 26 | grades_per_class.average_grade, 27 | students.university 28 | FROM avg_grades_per_class AS grades_per_class 29 | LEFT JOIN staging.students AS students 30 | ON grades_per_class.student_id = students.student_id 31 | ORDER BY student_name, class_name; 32 | -------------------------------------------------------------------------------- /examples/school/scripts/staging/grades.sql: -------------------------------------------------------------------------------- 1 | WITH raw_grades AS ( 2 | SELECT * FROM './seeds/raw_grades.csv' 3 | ) 4 | 5 | SELECT 6 | -- #NO_NULLS 7 | student_id, 8 | -- #NO_NULLS 9 | class_name, 10 | -- #NO_NULLS 11 | grade, 12 | -- #NO_NULLS 13 | strptime(exam_date, '%m-%Y') AS exam_date, 14 | FROM raw_grades; 15 | -------------------------------------------------------------------------------- /examples/school/scripts/staging/students.sql: -------------------------------------------------------------------------------- 1 | WITH raw_students AS ( 2 | SELECT * FROM './seeds/raw_students.csv' 3 | ) 4 | 5 | SELECT 6 | -- #UNIQUE 7 | -- #NO_NULLS 8 | id AS student_id, 9 | first_name, 10 | -- #UNIQUE_BY(first_name) 11 | last_name, 12 | -- #SET{'Stanford University', 'University of California Berkeley', 'Princeton University', 'Harvard University', 'Massachusetts Institute of Technology'} 13 | university, 14 | FROM raw_students; 15 | -------------------------------------------------------------------------------- /examples/school/scripts/tests/budget.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | * 3 | FROM analytics.finance__expenses 4 | -- WHERE total_expenses > 24000; --uncomment here 5 | WHERE total_expenses > 12000; --comment here 6 | -------------------------------------------------------------------------------- /examples/school/seeds/raw_grades.csv: -------------------------------------------------------------------------------- 1 | student_id,class_name,grade,exam_date 2 | 1,Mathematics,88,5-2025 3 | 1,Mathematics,57,10-2025 4 | 1,Mathematics,9,6-2025 5 | 1,Mathematics,80,9-2025 6 | 1,Physics,52,3-2025 7 | 1,Physics,81,10-2025 8 | 1,Physics,100,2-2025 9 | 1,Physics,99,9-2025 10 | 1,Chemistry,49,6-2025 11 | 1,Chemistry,1,11-2025 12 | 1,Chemistry,45,5-2025 13 | 1,Chemistry,0,11-2025 14 | 1,Biology,37,3-2025 15 | 1,Biology,88,9-2025 16 | 1,Biology,20,5-2025 17 | 1,Biology,32,10-2025 18 | 1,Computer Science,1,2-2025 19 | 
1,Computer Science,72,9-2025 20 | 1,Computer Science,48,3-2025 21 | 1,Computer Science,70,7-2025 22 | 1,History,14,6-2025 23 | 1,History,100,7-2025 24 | 1,History,57,6-2025 25 | 1,History,58,12-2025 26 | 1,Literature,23,6-2025 27 | 1,Literature,67,10-2025 28 | 1,Literature,48,5-2025 29 | 1,Literature,38,7-2025 30 | 1,Economics,18,5-2025 31 | 1,Economics,94,11-2025 32 | 1,Economics,97,4-2025 33 | 1,Economics,20,7-2025 34 | 2,Mathematics,16,4-2025 35 | 2,Mathematics,38,10-2025 36 | 2,Mathematics,93,5-2025 37 | 2,Mathematics,73,10-2025 38 | 2,Physics,23,3-2025 39 | 2,Physics,83,10-2025 40 | 2,Physics,15,5-2025 41 | 2,Physics,35,11-2025 42 | 2,Chemistry,88,2-2025 43 | 2,Chemistry,96,7-2025 44 | 2,Chemistry,98,3-2025 45 | 2,Chemistry,63,8-2025 46 | 2,Biology,17,1-2025 47 | 2,Biology,32,7-2025 48 | 2,Biology,66,1-2025 49 | 2,Biology,52,8-2025 50 | 2,Computer Science,53,1-2025 51 | 2,Computer Science,53,7-2025 52 | 2,Computer Science,28,3-2025 53 | 2,Computer Science,97,11-2025 54 | 2,History,42,4-2025 55 | 2,History,23,9-2025 56 | 2,History,24,4-2025 57 | 2,History,100,8-2025 58 | 2,Literature,0,2-2025 59 | 2,Literature,56,7-2025 60 | 2,Literature,78,1-2025 61 | 2,Literature,37,10-2025 62 | 2,Economics,52,1-2025 63 | 2,Economics,55,7-2025 64 | 2,Economics,74,4-2025 65 | 2,Economics,20,10-2025 66 | 3,Mathematics,9,6-2025 67 | 3,Mathematics,3,10-2025 68 | 3,Mathematics,67,6-2025 69 | 3,Mathematics,100,11-2025 70 | 3,Physics,78,5-2025 71 | 3,Physics,100,12-2025 72 | 3,Physics,69,4-2025 73 | 3,Physics,97,8-2025 74 | 3,Chemistry,64,1-2025 75 | 3,Chemistry,58,9-2025 76 | 3,Chemistry,94,2-2025 77 | 3,Chemistry,100,8-2025 78 | 3,Biology,96,4-2025 79 | 3,Biology,23,10-2025 80 | 3,Biology,13,5-2025 81 | 3,Biology,89,8-2025 82 | 3,Computer Science,92,1-2025 83 | 3,Computer Science,100,10-2025 84 | 3,Computer Science,100,1-2025 85 | 3,Computer Science,27,9-2025 86 | 3,History,2,6-2025 87 | 3,History,100,7-2025 88 | 3,History,62,6-2025 89 | 3,History,100,11-2025 90 | 3,Literature,78,6-2025 91 | 3,Literature,100,12-2025 92 | 3,Literature,69,6-2025 93 | 3,Literature,16,8-2025 94 | 3,Economics,37,5-2025 95 | 3,Economics,85,8-2025 96 | 3,Economics,42,6-2025 97 | 3,Economics,2,7-2025 98 | 4,Mathematics,99,2-2025 99 | 4,Mathematics,56,11-2025 100 | 4,Mathematics,68,6-2025 101 | 4,Mathematics,43,8-2025 102 | 4,Physics,97,4-2025 103 | 4,Physics,52,9-2025 104 | 4,Physics,16,4-2025 105 | 4,Physics,58,10-2025 106 | 4,Chemistry,71,6-2025 107 | 4,Chemistry,64,12-2025 108 | 4,Chemistry,52,6-2025 109 | 4,Chemistry,88,9-2025 110 | 4,Biology,24,3-2025 111 | 4,Biology,75,7-2025 112 | 4,Biology,77,5-2025 113 | 4,Biology,11,7-2025 114 | 4,Computer Science,46,2-2025 115 | 4,Computer Science,48,8-2025 116 | 4,Computer Science,1,3-2025 117 | 4,Computer Science,93,8-2025 118 | 4,History,60,5-2025 119 | 4,History,69,10-2025 120 | 4,History,35,6-2025 121 | 4,History,63,10-2025 122 | 4,Literature,41,2-2025 123 | 4,Literature,100,8-2025 124 | 4,Literature,42,4-2025 125 | 4,Literature,11,9-2025 126 | 4,Economics,100,5-2025 127 | 4,Economics,100,8-2025 128 | 4,Economics,24,1-2025 129 | 4,Economics,13,10-2025 130 | 5,Mathematics,92,3-2025 131 | 5,Mathematics,73,9-2025 132 | 5,Mathematics,92,5-2025 133 | 5,Mathematics,100,9-2025 134 | 5,Physics,99,5-2025 135 | 5,Physics,17,7-2025 136 | 5,Physics,29,1-2025 137 | 5,Physics,75,9-2025 138 | 5,Chemistry,35,6-2025 139 | 5,Chemistry,60,10-2025 140 | 5,Chemistry,60,3-2025 141 | 5,Chemistry,29,9-2025 142 | 5,Biology,49,4-2025 143 | 5,Biology,65,10-2025 144 | 5,Biology,55,2-2025 145 | 
5,Biology,35,11-2025 146 | 5,Computer Science,40,6-2025 147 | 5,Computer Science,77,11-2025 148 | 5,Computer Science,47,1-2025 149 | 5,Computer Science,75,10-2025 150 | 5,History,22,5-2025 151 | 5,History,33,9-2025 152 | 5,History,62,4-2025 153 | 5,History,75,9-2025 154 | 5,Literature,18,5-2025 155 | 5,Literature,100,9-2025 156 | 5,Literature,84,2-2025 157 | 5,Literature,100,10-2025 158 | 5,Economics,63,3-2025 159 | 5,Economics,40,10-2025 160 | 5,Economics,18,6-2025 161 | 5,Economics,37,8-2025 162 | 6,Mathematics,100,4-2025 163 | 6,Mathematics,39,11-2025 164 | 6,Mathematics,18,4-2025 165 | 6,Mathematics,43,12-2025 166 | 6,Physics,96,3-2025 167 | 6,Physics,67,10-2025 168 | 6,Physics,3,3-2025 169 | 6,Physics,37,8-2025 170 | 6,Chemistry,38,2-2025 171 | 6,Chemistry,29,11-2025 172 | 6,Chemistry,62,2-2025 173 | 6,Chemistry,4,11-2025 174 | 6,Biology,89,4-2025 175 | 6,Biology,100,10-2025 176 | 6,Biology,26,1-2025 177 | 6,Biology,100,11-2025 178 | 6,Computer Science,66,1-2025 179 | 6,Computer Science,62,11-2025 180 | 6,Computer Science,12,1-2025 181 | 6,Computer Science,51,12-2025 182 | 6,History,56,3-2025 183 | 6,History,21,12-2025 184 | 6,History,97,3-2025 185 | 6,History,18,9-2025 186 | 6,Literature,12,1-2025 187 | 6,Literature,31,7-2025 188 | 6,Literature,56,6-2025 189 | 6,Literature,100,10-2025 190 | 6,Economics,60,3-2025 191 | 6,Economics,100,7-2025 192 | 6,Economics,62,3-2025 193 | 6,Economics,1,11-2025 194 | 7,Mathematics,10,3-2025 195 | 7,Mathematics,65,8-2025 196 | 7,Mathematics,21,1-2025 197 | 7,Mathematics,19,9-2025 198 | 7,Physics,30,6-2025 199 | 7,Physics,0,9-2025 200 | 7,Physics,2,1-2025 201 | 7,Physics,91,11-2025 202 | 7,Chemistry,28,3-2025 203 | 7,Chemistry,6,12-2025 204 | 7,Chemistry,7,1-2025 205 | 7,Chemistry,5,7-2025 206 | 7,Biology,96,4-2025 207 | 7,Biology,38,9-2025 208 | 7,Biology,17,5-2025 209 | 7,Biology,87,7-2025 210 | 7,Computer Science,80,6-2025 211 | 7,Computer Science,41,12-2025 212 | 7,Computer Science,79,3-2025 213 | 7,Computer Science,94,9-2025 214 | 7,History,15,3-2025 215 | 7,History,57,9-2025 216 | 7,History,15,2-2025 217 | 7,History,0,7-2025 218 | 7,Literature,51,3-2025 219 | 7,Literature,64,8-2025 220 | 7,Literature,1,1-2025 221 | 7,Literature,57,10-2025 222 | 7,Economics,1,5-2025 223 | 7,Economics,22,8-2025 224 | 7,Economics,42,4-2025 225 | 7,Economics,78,10-2025 226 | -------------------------------------------------------------------------------- /examples/school/seeds/raw_students.csv: -------------------------------------------------------------------------------- 1 | id,first_name,last_name,university,age 2 | 1,Lauren,Levine,Stanford University,22 3 | 2,Daniel,Lopez,Massachusetts Institute of Technology,24 4 | 3,Melanie,Foster,University of California Berkeley,20 5 | 4,Gabriel,Cooke,Harvard University,19 6 | 5,Anne,Porter,Harvard University,23 7 | 6,Amy,Lee,Princeton University,24 8 | 7,Rebecca,Chavez,Princeton University,25 9 | -------------------------------------------------------------------------------- /lea/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | 5 | import click 6 | from rich.logging import RichHandler 7 | 8 | from lea import cli, databases 9 | from lea.conductor import Conductor 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format="%(message)s", 14 | datefmt="[%X]", 15 | handlers=[ 16 | RichHandler( 17 | rich_tracebacks=True, 18 | show_level=False, 19 | show_path=False, 20 | markup=True, 21 | 
tracebacks_suppress=[click], 22 | ) 23 | ], 24 | ) 25 | 26 | log = logging.getLogger("rich") 27 | 28 | 29 | __all__ = ["cli", "log", "Conductor", "databases"] 30 | -------------------------------------------------------------------------------- /lea/assertions/NO_NULLS.sql.jinja: -------------------------------------------------------------------------------- 1 | SELECT ROW_NUMBER() OVER () AS row_number 2 | FROM {{ table }} 3 | WHERE {{ column }} IS NULL 4 | -------------------------------------------------------------------------------- /lea/assertions/SET.sql.jinja: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT({{ column }}) AS {{ column }} 2 | FROM {{ table }} 3 | WHERE {{ column }} NOT IN ({{ ', '.join(elements) }}) 4 | -------------------------------------------------------------------------------- /lea/assertions/UNIQUE.sql.jinja: -------------------------------------------------------------------------------- 1 | SELECT 2 | {{ column }}, 3 | COUNT(*) AS n 4 | FROM {{ table }} 5 | GROUP BY {{ column }} 6 | HAVING n > 1 7 | -------------------------------------------------------------------------------- /lea/assertions/UNIQUE_BY.sql.jinja: -------------------------------------------------------------------------------- 1 | SELECT 2 | {{ by }}, 3 | COUNT(*) AS n, 4 | COUNT(DISTINCT {{ column }}) AS n_distinct 5 | FROM {{ table }} 6 | GROUP BY {{ by }} 7 | HAVING n != n_distinct 8 | -------------------------------------------------------------------------------- /lea/cli.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import collections 4 | import pathlib 5 | 6 | import click 7 | 8 | import lea 9 | 10 | 11 | @click.group() 12 | def app(): 13 | ... 14 | 15 | 16 | @app.command() 17 | @click.option("--select", "-m", multiple=True, default=["*"], help="Scripts to materialize.") 18 | @click.option("--unselect", "-m", multiple=True, default=[], help="Scripts to unselect.") 19 | @click.option("--dataset", default=None, help="Name of the base dataset.") 20 | @click.option("--scripts", default="scripts", help="Directory where the scripts are located.") 21 | @click.option( 22 | "--incremental", nargs=2, type=str, multiple=True, help="Incremental field name and value." 23 | ) 24 | @click.option("--dry", is_flag=True, default=False, help="Whether to run in dry mode.") 25 | @click.option("--print", is_flag=True, default=False, help="Whether to print the SQL code.") 26 | @click.option( 27 | "--production", is_flag=True, default=False, help="Whether to run the scripts in production." 
28 | ) 29 | @click.option("--restart", is_flag=True, default=False, help="Whether to restart from scratch.") 30 | def run(select, unselect, dataset, scripts, incremental, dry, print, production, restart): 31 | if select in {"", "Ø"}: 32 | select = [] 33 | 34 | if not pathlib.Path(scripts).is_dir(): 35 | raise click.ClickException(f"Directory {scripts} does not exist") 36 | 37 | # Handle incremental option 38 | incremental_field_values = collections.defaultdict(set) 39 | for field, value in incremental: 40 | incremental_field_values[field].add(value) 41 | if len(incremental_field_values) > 1: 42 | raise click.ClickException("Specifying multiple incremental fields is not supported") 43 | incremental_field_name = next(iter(incremental_field_values), None) 44 | incremental_field_values = incremental_field_values[incremental_field_name] 45 | 46 | conductor = lea.Conductor(scripts_dir=scripts, dataset_name=dataset) 47 | conductor.run( 48 | select=select, 49 | unselect=unselect, 50 | production=production, 51 | dry_run=dry, 52 | restart=restart, 53 | incremental_field_name=incremental_field_name, 54 | incremental_field_values=incremental_field_values, 55 | print_mode=print, 56 | ) 57 | -------------------------------------------------------------------------------- /lea/comment.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import collections 4 | import dataclasses 5 | 6 | import sqlglot 7 | 8 | from .dialects import SQLDialect 9 | 10 | 11 | @dataclasses.dataclass 12 | class Comment: 13 | line: int 14 | text: str 15 | 16 | 17 | class CommentBlock(collections.UserList): 18 | def __init__(self, comments: list[Comment]): 19 | super().__init__(sorted(comments, key=lambda c: c.line)) 20 | 21 | @property 22 | def first_line(self): 23 | return self[0].line 24 | 25 | @property 26 | def last_line(self): 27 | return self[-1].line 28 | 29 | 30 | def extract_comments( 31 | code: str, expected_field_names: list[str], sql_dialect: SQLDialect 32 | ) -> dict[str, CommentBlock]: 33 | dialect = sqlglot.Dialect.get_or_raise(sql_dialect.sqlglot_dialect.value) 34 | tokens = dialect.tokenizer_class().tokenize(code) 35 | 36 | # Extract comments, which are lines that start with -- 37 | comments = [ 38 | Comment(line=line, text=comment.replace("--", "").strip()) 39 | for line, comment in enumerate(code.splitlines(), start=1) 40 | if comment.strip().startswith("--") 41 | ] 42 | 43 | # Pack comments into CommentBlock objects 44 | comment_blocks = merge_adjacent_comments(comments) 45 | 46 | # We assume the tokens are stored. Therefore, by looping over them and building a dictionary, 47 | # each key will be unique and the last value will be the last variable in the line. 
48 | var_tokens = [ 49 | token 50 | for token in tokens 51 | if token.token_type.value == "VAR" and token.text in expected_field_names 52 | ] 53 | 54 | def is_var_line(line): 55 | line_tokens = [t for t in tokens if t.line == line and t.token_type.value != "COMMA"] 56 | return line_tokens[-1].token_type.value == "VAR" 57 | 58 | last_var_per_line = {token.line: token.text for token in var_tokens if is_var_line(token.line)} 59 | 60 | # Now assign each comment block to a variable 61 | var_comments = {} 62 | for comment_block in comment_blocks: 63 | adjacent_var = next( 64 | (var for line, var in last_var_per_line.items() if comment_block.last_line == line - 1), 65 | None, 66 | ) 67 | if adjacent_var: 68 | var_comments[adjacent_var] = comment_block 69 | 70 | return var_comments 71 | 72 | 73 | def merge_adjacent_comments(comments: list[Comment]) -> list[CommentBlock]: 74 | if not comments: 75 | return [] 76 | 77 | # Sort comments by their line number 78 | comments.sort(key=lambda c: c.line) 79 | 80 | merged_blocks = [] 81 | current_block = [comments[0]] 82 | 83 | # Iterate through comments and group adjacent ones 84 | for i in range(1, len(comments)): 85 | if comments[i].line == comments[i - 1].line + 1: # Check if adjacent 86 | current_block.append(comments[i]) 87 | else: 88 | # Create a CommentBlock for the current group 89 | merged_blocks.append(CommentBlock(current_block)) 90 | # Start a new block 91 | current_block = [comments[i]] 92 | 93 | # Add the last block 94 | merged_blocks.append(CommentBlock(current_block)) 95 | 96 | return merged_blocks 97 | -------------------------------------------------------------------------------- /lea/conductor.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import concurrent.futures 4 | import datetime as dt 5 | import getpass 6 | import json 7 | import os 8 | import pathlib 9 | import sys 10 | 11 | import dotenv 12 | 13 | import lea 14 | from lea import databases 15 | from lea.dag import DAGOfScripts 16 | from lea.databases import DatabaseClient, TableStats 17 | from lea.dialects import BigQueryDialect, DuckDBDialect 18 | from lea.session import Session 19 | from lea.table_ref import AUDIT_TABLE_SUFFIX, TableRef 20 | 21 | 22 | class Conductor: 23 | def __init__( 24 | self, scripts_dir: str, dataset_name: str | None = None, project_name: str | None = None 25 | ): 26 | # Load environment variables from .env file 27 | # TODO: is is Pythonic to do this here? 
28 | dotenv.load_dotenv(".env", verbose=True) 29 | 30 | self.warehouse = os.environ["LEA_WAREHOUSE"].lower() 31 | 32 | self.scripts_dir = pathlib.Path(scripts_dir) 33 | if not self.scripts_dir.is_dir(): 34 | raise ValueError(f"Directory {self.scripts_dir} not found") 35 | 36 | if dataset_name is None: 37 | if self.warehouse == "bigquery": 38 | dataset_name = os.environ.get("LEA_BQ_DATASET_NAME") 39 | 40 | if self.warehouse == "duckdb": 41 | duckdb_path = pathlib.Path(os.environ.get("LEA_DUCKDB_PATH", "")) 42 | dataset_name = duckdb_path.stem 43 | if dataset_name is None: 44 | raise ValueError("Dataset name could not be inferred") 45 | self.dataset_name = dataset_name 46 | 47 | if project_name is None: 48 | if self.warehouse == "bigquery": 49 | project_name = os.environ.get("LEA_BQ_PROJECT_ID") 50 | if self.warehouse == "duckdb": 51 | project_name = dataset_name 52 | if project_name is None: 53 | raise ValueError("Project name could not be inferred") 54 | self.project_name = project_name 55 | 56 | lea.log.info("📝 Reading scripts") 57 | 58 | if self.warehouse == "bigquery": 59 | self.dag = DAGOfScripts.from_directory( 60 | scripts_dir=self.scripts_dir, 61 | sql_dialect=BigQueryDialect(), 62 | dataset_name=self.dataset_name, 63 | project_name=self.project_name if self.warehouse == "bigquery" else None, 64 | ) 65 | if self.warehouse == "duckdb": 66 | self.dag = DAGOfScripts.from_directory( 67 | scripts_dir=self.scripts_dir, 68 | sql_dialect=DuckDBDialect(), 69 | dataset_name=self.dataset_name, 70 | project_name=None, 71 | ) 72 | lea.log.info(f"{sum(1 for s in self.dag.scripts if not s.is_test):,d} table scripts") 73 | lea.log.info(f"{sum(1 for s in self.dag.scripts if s.is_test):,d} test scripts") 74 | 75 | def run( 76 | self, 77 | select: list[str], 78 | unselect: list[str], 79 | production: bool = False, 80 | dry_run: bool = False, 81 | restart: bool = False, 82 | incremental_field_name: str | None = None, 83 | incremental_field_values: list[str] | None = None, 84 | print_mode: bool = False, 85 | ): 86 | session = self.prepare_session( 87 | select=select, 88 | unselect=unselect, 89 | production=production, 90 | dry_run=dry_run, 91 | incremental_field_name=incremental_field_name, 92 | incremental_field_values=incremental_field_values, 93 | print_mode=print_mode, 94 | ) 95 | 96 | try: 97 | self.run_session(session, restart=restart, dry_run=dry_run) 98 | if session.any_error_has_occurred: 99 | return sys.exit(1) 100 | except KeyboardInterrupt: 101 | lea.log.error("🛑 Keyboard interrupt") 102 | session.end() 103 | return sys.exit(1) 104 | 105 | def prepare_session( 106 | self, 107 | select: list[str], 108 | unselect: list[str], 109 | production: bool = False, 110 | dry_run: bool = False, 111 | incremental_field_name: str | None = None, 112 | incremental_field_values: list[str] | None = None, 113 | print_mode: bool = False, 114 | ) -> Session: 115 | # We need a database client to run scripts 116 | database_client = self.make_client(dry_run=dry_run, print_mode=print_mode) 117 | 118 | # We need to select the scripts we want to run. We do this by querying the DAG. 119 | selected_table_refs = self.dag.select(*select) 120 | unselected_table_refs = self.dag.select(*unselect) 121 | if not selected_table_refs - unselected_table_refs: 122 | msg = "Nothing found for select " + ", ".join(select) 123 | if unselect: 124 | msg += " and unselect: " + ", ".join(unselect) 125 | lea.log.error(msg) 126 | return sys.exit(1) 127 | 128 | # We need a dataset to materialize the scripts. 
If we're in production mode, we use the 129 | # base dataset. If we're in user mode, we use a dataset named after the user. 130 | write_dataset = self.dataset_name if production else self.name_user_dataset() 131 | database_client.create_dataset(write_dataset) 132 | 133 | # When using DuckDB, we need to create schema for the tables 134 | if self.warehouse == "duckdb": 135 | lea.log.info("🔩 Creating schemas") 136 | for table_ref in selected_table_refs - unselected_table_refs: 137 | database_client.create_schema(table_ref) 138 | 139 | # When the scripts run, they are materialized into side-tables which we call "audit" 140 | # tables. When a run stops because of an error, the audit tables are left behind. If we 141 | # want to start fresh, we have to delete the audit tables. If not, the materialized tables 142 | # can be skipped. 143 | existing_tables = self.list_existing_tables( 144 | database_client=database_client, dataset=write_dataset 145 | ) 146 | lea.log.info(f"{len(existing_tables):,d} tables already exist") 147 | existing_audit_tables = self.list_existing_audit_tables( 148 | database_client=database_client, dataset=write_dataset 149 | ) 150 | 151 | lea.log.info(f"{len(existing_audit_tables):,d} audit tables already exist") 152 | 153 | session = Session( 154 | database_client=database_client, 155 | base_dataset=self.dataset_name, 156 | write_dataset=write_dataset, 157 | scripts=self.dag.scripts, 158 | selected_table_refs=selected_table_refs, 159 | unselected_table_refs=unselected_table_refs, 160 | existing_tables=existing_tables, 161 | existing_audit_tables=existing_audit_tables, 162 | incremental_field_name=incremental_field_name, 163 | incremental_field_values=incremental_field_values, 164 | ) 165 | 166 | return session 167 | 168 | def run_session(self, session: Session, restart: bool, dry_run: bool): 169 | if restart: 170 | delete_audit_tables(session) 171 | 172 | # Loop over table references in topological order 173 | materialize_scripts(dag=self.dag, session=session) 174 | 175 | # At this point, the scripts have been materialized into side-tables which we call "audit" 176 | # tables. We can now take care of promoting the audit tables to production. 177 | if not session.any_error_has_occurred and not dry_run: 178 | promote_audit_tables(session) 179 | 180 | # If all the scripts succeeded, we can delete the audit tables. 181 | if not session.any_error_has_occurred and not dry_run: 182 | delete_audit_tables(session) 183 | 184 | # Let's also delete orphan tables, which are tables that exist but who's scripts have 185 | # been deleted. 186 | delete_orphan_tables(session) 187 | 188 | # Regardless of whether all the jobs succeeded or not, we want to summarize the session. 
189 | session.end() 190 | duration_str = str(session.ended_at - session.started_at).split(".")[0] # type: ignore[operator] 191 | emoji = "✅" if not session.any_error_has_occurred else "❌" 192 | msg = f"{emoji} Finished" 193 | if session.ended_at - session.started_at > dt.timedelta(seconds=1): 194 | msg += f", took {duration_str}" 195 | else: 196 | msg += ", took less than a second 🚀" 197 | if session.total_billed_dollars > 0: 198 | msg += f", cost ${session.total_billed_dollars:.2f}" 199 | lea.log.info(msg) 200 | 201 | def make_client(self, dry_run: bool = False, print_mode: bool = False) -> DatabaseClient: 202 | if self.warehouse.lower() == "bigquery": 203 | # Do imports here to avoid loading them all the time 204 | from google.oauth2 import service_account 205 | 206 | scopes_str = os.environ.get("LEA_BQ_SCOPES", "https://www.googleapis.com/auth/bigquery") 207 | scopes = scopes_str.split(",") 208 | scopes = [scope.strip() for scope in scopes] 209 | 210 | credentials = ( 211 | service_account.Credentials.from_service_account_info( 212 | json.loads(bq_service_account_info_str, strict=False), scopes=scopes 213 | ) 214 | if (bq_service_account_info_str := os.environ.get("LEA_BQ_SERVICE_ACCOUNT")) 215 | is not None 216 | else None 217 | ) 218 | client = databases.BigQueryClient( 219 | credentials=credentials, 220 | location=os.environ["LEA_BQ_LOCATION"], 221 | write_project_id=os.environ["LEA_BQ_PROJECT_ID"], 222 | compute_project_id=os.environ.get( 223 | "LEA_BQ_COMPUTE_PROJECT_ID", 224 | credentials.project_id if credentials is not None else None, 225 | ), 226 | storage_billing_model=os.environ.get("LEA_BQ_STORAGE_BILLING_MODEL"), 227 | dry_run=dry_run, 228 | print_mode=print_mode, 229 | default_clustering_fields=[ 230 | clustering_field.strip() 231 | for clustering_field in os.environ.get( 232 | "LEA_BQ_DEFAULT_CLUSTERING_FIELDS", "" 233 | ).split(",") 234 | if clustering_field.strip() 235 | ], 236 | big_blue_pick_api_url=os.environ.get("LEA_BQ_BIG_BLUE_PICK_API_URL"), 237 | big_blue_pick_api_key=os.environ.get("LEA_BQ_BIG_BLUE_PICK_API_KEY"), 238 | big_blue_pick_api_on_demand_project_id=os.environ.get( 239 | "LEA_BQ_BIG_BLUE_PICK_API_ON_DEMAND_PROJECT_ID" 240 | ), 241 | big_blue_pick_api_reservation_project_id=os.environ.get( 242 | "LEA_BQ_BIG_BLUE_PICK_API_REVERVATION_PROJECT_ID" 243 | ), 244 | ) 245 | if client.big_blue_pick_api is not None: 246 | lea.log.info("🧔‍♂️ Using Big Blue Pick API") 247 | return client 248 | 249 | if self.warehouse.lower() == "duckdb": 250 | return databases.DuckDBClient( 251 | database_path=pathlib.Path(os.environ.get("LEA_DUCKDB_PATH", "")), 252 | dry_run=dry_run, 253 | print_mode=print_mode, 254 | ) 255 | 256 | raise ValueError(f"Unsupported warehouse {self.warehouse!r}") 257 | 258 | def name_user_dataset(self) -> str: 259 | username = os.environ.get("LEA_USERNAME", getpass.getuser()) 260 | return f"{self.dataset_name}_{username}" 261 | 262 | def list_existing_tables( 263 | self, database_client: DatabaseClient, dataset: str 264 | ) -> dict[TableRef, TableStats]: 265 | existing_tables = database_client.list_table_stats(dataset) 266 | existing_tables = { 267 | table_ref: stats 268 | for table_ref, stats in existing_tables.items() 269 | if not table_ref.name.endswith(AUDIT_TABLE_SUFFIX) 270 | } 271 | return existing_tables 272 | 273 | def list_existing_audit_tables( 274 | self, database_client: DatabaseClient, dataset: str 275 | ) -> dict[TableRef, TableStats]: 276 | existing_audit_tables = database_client.list_table_stats(dataset) 277 | existing_audit_tables = 
{ 278 | table_ref: stats 279 | for table_ref, stats in existing_audit_tables.items() 280 | if table_ref.name.endswith(AUDIT_TABLE_SUFFIX) 281 | } 282 | return existing_audit_tables 283 | 284 | 285 | def materialize_scripts(dag: DAGOfScripts, session: Session): 286 | table_refs_to_run = determine_table_refs_to_run( 287 | selected_table_refs=session.selected_table_refs, 288 | unselected_table_refs=session.unselected_table_refs, 289 | existing_audit_tables=session.existing_audit_tables, 290 | dag=dag, 291 | base_dataset=session.base_dataset, 292 | ) 293 | if not table_refs_to_run: 294 | lea.log.info("✅ Nothing needs materializing") 295 | return 296 | lea.log.info(f"🔵 Running {len(table_refs_to_run):,d} scripts") 297 | dag.prepare() 298 | while dag.is_active(): 299 | # If we're in early end mode, we need to check if any script errored, in which case we 300 | # have to stop everything. 301 | if session.any_error_has_occurred: 302 | lea.log.error("✋ Early ending because an error occurred") 303 | break 304 | 305 | # Start available jobs 306 | for script_to_run in dag.iter_scripts(table_refs_to_run): 307 | # Before executing a script, we need to contextualize it. We have to edit its 308 | # dependencies, add incremental logic, and set the write context. 309 | script_to_run = session.add_context_to_script(script_to_run) 310 | # 🔨 if you're developping on lea, you can call session.run_script(script_to_run) here 311 | # to get a better stack trace. This is because the executor will run the script in a 312 | # different thread, and the exception will be raised in that thread, not in the main 313 | # thread. 314 | future = session.executor.submit(session.run_script, script_to_run) 315 | session.run_script_futures[future] = script_to_run 316 | 317 | # Check for scripts that have finished 318 | done, _ = concurrent.futures.wait( 319 | session.run_script_futures, return_when=concurrent.futures.FIRST_COMPLETED 320 | ) 321 | for future in done: 322 | script_done = session.run_script_futures[future] 323 | if exception := future.exception(): 324 | lea.log.error(f"Failed running {script_done.table_ref}\n{exception}") 325 | table_ref = session.remove_write_context_from_table_ref(script_done.table_ref) 326 | session.run_script_futures_complete[future] = session.run_script_futures.pop(future) 327 | dag.done(table_ref) 328 | 329 | 330 | def promote_audit_tables(session: Session): 331 | lea.log.info("🟢 Promoting audit tables") 332 | # Ideally, we would like to do this automatically, but BigQuery does not support DDL 333 | # statements in a transaction. So we do it concurrently. This isn't ideal, but it's the 334 | # best we can do for now. There's a very small chance that at least one promotion job will 335 | # fail. 336 | # https://hiflylabs.com/blog/2022/11/22/dbt-deployment-best-practices 337 | # https://calogica.com/sql/bigquery/dbt/2020/05/24/dbt-bigquery-blue-green-wap.html 338 | # https://calogica.com/assets/wap_dbt_bigquery.pdf 339 | # Note: it's important for the following loop to be a list comprehension. If we used a 340 | # generator expression, the loop would be infinite because jobs are being added to 341 | # session.jobs when session.promote is called. 
342 | for selected_table_ref in session.selected_table_refs: 343 | if selected_table_ref.is_test: 344 | continue 345 | selected_table_ref = session.add_write_context_to_table_ref(selected_table_ref) 346 | future = session.executor.submit(session.promote_audit_table, selected_table_ref) 347 | session.promote_audit_tables_futures[future] = selected_table_ref 348 | 349 | # Wait for all promotion jobs to finish 350 | for future in concurrent.futures.as_completed(session.promote_audit_tables_futures): 351 | if (exception := future.exception()) is not None: 352 | lea.log.error(f"Promotion failed\n{exception}") 353 | 354 | 355 | def delete_audit_tables(session: Session): 356 | # Depending on when delete_audit_tables is called, there might be new audit tables that have 357 | # been created. We need to delete them too. We do this by adding the write context to the 358 | # table references. This will add the audit suffix to the table reference, which will make 359 | # it match the audit tables that have been created. 360 | table_refs_to_delete = set(session.existing_audit_tables) | { 361 | session.add_write_context_to_table_ref(table_ref) 362 | for table_ref in session.selected_table_refs 363 | } 364 | if table_refs_to_delete: 365 | lea.log.info("🧹 Deleting audit tables") 366 | delete_table_refs( 367 | table_refs=table_refs_to_delete, 368 | database_client=session.database_client, 369 | executor=concurrent.futures.ThreadPoolExecutor(max_workers=None), 370 | verbose=False, 371 | ) 372 | session.existing_audit_tables = {} 373 | 374 | 375 | def delete_orphan_tables(session: Session): 376 | table_refs_to_delete = set(session.existing_tables) - { 377 | session.add_write_context_to_table_ref(table_ref).remove_audit_suffix() 378 | for table_ref in session.scripts 379 | } 380 | if table_refs_to_delete: 381 | lea.log.info("🧹 Deleting orphan tables") 382 | delete_table_refs( 383 | table_refs=table_refs_to_delete, 384 | database_client=session.database_client, 385 | executor=concurrent.futures.ThreadPoolExecutor(max_workers=None), 386 | verbose=True, 387 | ) 388 | session.existing_audit_tables = {} 389 | 390 | 391 | def delete_table_refs( 392 | table_refs: set[TableRef], 393 | database_client: DatabaseClient, 394 | executor: concurrent.futures.ThreadPoolExecutor, 395 | verbose: bool, 396 | ): 397 | futures: dict[concurrent.futures.Future, TableRef] = {} 398 | for table_ref in table_refs: 399 | future = executor.submit(database_client.delete_table, table_ref) 400 | futures[future] = table_ref 401 | 402 | for future in concurrent.futures.as_completed(futures): 403 | if (exception := future.exception()) is not None: 404 | lea.log.error(exception) 405 | continue 406 | if verbose: 407 | lea.log.info(f"Deleted {futures[future]}") 408 | 409 | 410 | def determine_table_refs_to_run( 411 | selected_table_refs: set[TableRef], 412 | unselected_table_refs: set[TableRef], 413 | existing_audit_tables: dict[TableRef, TableStats], 414 | dag: DAGOfScripts, 415 | base_dataset: str, 416 | ) -> set[TableRef]: 417 | """Determine which table references need to be run. 418 | 419 | We want to: 420 | 421 | 1. Run tables that have been selected. This is obtained from the DAGOfScripts.select method. 422 | 2. Skip tables that already exist. This is obtained from the database client. 423 | 3. Don't skip tables that have been edited since last being run. This is obtained from the 424 | scripts themselves. 425 | 426 | This last requirement is why we need an extra method to determine which table references need 427 | to be run. 
We compare the updated_at of the script with the updated_at of the corresponding 428 | table (if it exists): a script that has been modified since the last time it was run needs to 429 | be run again. All the descendants of this script also need to be run. 430 | 431 | On top of this, we also include each test script that is associated with the selected table 432 | references. We do this because it's a good default behavior. 433 | 434 | """ 435 | table_refs_to_run = selected_table_refs.copy() 436 | 437 | # By default, we do not run scripts that have an audit table materialized. We will determine 438 | # afterwards, based on each script's modified_at, if we need to run them again. 439 | existing_audit_table_refs = { 440 | table_ref.remove_audit_suffix().replace_dataset(base_dataset): stats 441 | for table_ref, stats in existing_audit_tables.items() 442 | } 443 | table_refs_to_run -= set(existing_audit_table_refs) 444 | 445 | # Now we check if any of the audit tables have had their script modified since the last time 446 | # they were materialized. If so, we need to run them again, as well as their descendants. 447 | for table_ref in selected_table_refs & set(existing_audit_table_refs): 448 | script = dag.scripts[table_ref] 449 | if script.updated_at > existing_audit_table_refs[table_ref].updated_at: 450 | lea.log.info(f"📝 {table_ref} was modified, re-materializing it") 451 | table_refs_to_run.add(table_ref) 452 | table_refs_to_run |= set(dag.iter_descendants(table_ref)) & selected_table_refs 453 | 454 | # Include applicable tests. That is, test scripts whose dependencies are all in the set of 455 | # selected table references. 456 | applicable_test_scripts_table_refs = { 457 | script.table_ref 458 | for script in dag.scripts.values() 459 | if script.is_test 460 | and all(dependency in table_refs_to_run for dependency in script.dependencies) 461 | } 462 | table_refs_to_run |= applicable_test_scripts_table_refs 463 | 464 | # Now we remove the unselected table references from the set of table references to run. We do 465 | # this at the very end, because of the above logic which adds table references to the set of 466 | # table references to run. For instance, if we run 467 | # 468 | # lea --select core.accounts --unselect tests 469 | # 470 | # we don't want the tests which are applicable to core.accounts to be run. 
471 | table_refs_to_run -= unselected_table_refs 472 | 473 | return table_refs_to_run 474 | -------------------------------------------------------------------------------- /lea/dag.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import graphlib 4 | import pathlib 5 | import re 6 | from collections.abc import Iterator 7 | 8 | import git 9 | 10 | from .dialects import SQLDialect 11 | from .scripts import Script, read_scripts 12 | from .table_ref import TableRef 13 | 14 | 15 | class DAGOfScripts(graphlib.TopologicalSorter): 16 | def __init__( 17 | self, 18 | dependency_graph: dict[TableRef, set[TableRef]], 19 | scripts: list[Script], 20 | scripts_dir: pathlib.Path, 21 | dataset_name: str, 22 | project_name: str | None, 23 | ): 24 | graphlib.TopologicalSorter.__init__(self, dependency_graph) 25 | self.dependency_graph = dependency_graph 26 | self.scripts = {script.table_ref: script for script in scripts} 27 | self.scripts_dir = scripts_dir 28 | self.dataset_name = dataset_name 29 | self.project_name = project_name 30 | 31 | @classmethod 32 | def from_directory( 33 | cls, 34 | scripts_dir: pathlib.Path, 35 | sql_dialect: SQLDialect, 36 | dataset_name: str, 37 | project_name: str | None, 38 | ) -> DAGOfScripts: 39 | scripts = read_scripts( 40 | scripts_dir=scripts_dir, 41 | sql_dialect=sql_dialect, 42 | dataset_name=dataset_name, 43 | project_name=project_name, 44 | ) 45 | 46 | # Fields in the script's code may contain tags. These tags induce assertion tests, which 47 | # are also scripts. We need to include these assertion tests in the dependency graph. 48 | for script in scripts: 49 | scripts.extend(script.assertion_tests) 50 | 51 | # TODO: the following is quite slow. This is because parsing dependencies from each script 52 | # is slow. There are several optimizations that could be done. 53 | dependency_graph = { 54 | script.table_ref: { 55 | dependency.replace_dataset(dataset_name) for dependency in script.dependencies 56 | } 57 | for script in scripts 58 | } 59 | 60 | return cls( 61 | dependency_graph=dependency_graph, 62 | scripts=scripts, 63 | scripts_dir=scripts_dir, 64 | dataset_name=dataset_name, 65 | project_name=project_name, 66 | ) 67 | 68 | def select(self, *queries: str) -> set[TableRef]: 69 | """Select a subset of the views in the DAG.""" 70 | 71 | def _select( 72 | query: str, 73 | include_ancestors: bool = False, 74 | include_descendants: bool = False, 75 | ): 76 | if query == "*": 77 | yield from self.scripts.keys() 78 | return 79 | 80 | # It's possible to query views via git. For example: 81 | # * `git` will select all the views that have been modified compared to the main branch. 82 | # * `git+` will select all the modified views, and their descendants. 83 | # * `+git` will select all the modified views, and their ancestors. 84 | # * `+git+` will select all the modified views, with their ancestors and descendants. 
85 | if m := re.match(r"(?P<ancestors>\+?)git(?P<descendants>\+?)", query): 86 | include_ancestors = include_ancestors or m.group("ancestors") == "+" 87 | include_descendants = include_descendants or m.group("descendants") == "+" 88 | for table_ref in list_table_refs_that_changed( 89 | scripts_dir=self.scripts_dir, project_name=self.project_name 90 | ): 91 | yield from _select( 92 | ".".join([*table_ref.schema, table_ref.name]), 93 | include_ancestors=include_ancestors, 94 | include_descendants=include_descendants, 95 | ) 96 | return 97 | 98 | if query.endswith("+"): 99 | yield from _select( 100 | query=query[:-1], 101 | include_ancestors=include_ancestors, 102 | include_descendants=True, 103 | ) 104 | return 105 | 106 | if query.startswith("+"): 107 | yield from _select( 108 | query=query[1:], 109 | include_ancestors=True, 110 | include_descendants=include_descendants, 111 | ) 112 | return 113 | 114 | if "/" in query: 115 | schema = tuple(query.strip("/").split("/")) 116 | for table_ref in self.dependency_graph: 117 | if table_ref.schema == schema: 118 | yield from _select( 119 | ".".join([*table_ref.schema, table_ref.name]), 120 | include_ancestors=include_ancestors, 121 | include_descendants=include_descendants, 122 | ) 123 | return 124 | 125 | *schema, name = query.split(".") 126 | table_ref = TableRef( 127 | dataset=self.dataset_name, 128 | schema=tuple(schema), 129 | name=name, 130 | project=self.project_name, 131 | ) 132 | yield table_ref 133 | if include_ancestors: 134 | yield from self.iter_ancestors(node=table_ref) 135 | if include_descendants: 136 | yield from self.iter_descendants(node=table_ref) 137 | 138 | all_selected_table_refs = set() 139 | for query in queries: 140 | selected_table_refs = set(_select(query)) 141 | all_selected_table_refs.update(selected_table_refs) 142 | 143 | return { 144 | table_ref 145 | for table_ref in all_selected_table_refs 146 | # Some nodes in the graph are not part of the views, such as external dependencies 147 | if table_ref in self.scripts 148 | } 149 | 150 | def iter_scripts(self, table_refs: set[TableRef]) -> Iterator[Script]: 151 | """Loop over scripts in topological order. 152 | 153 | This method does not have the responsibility of calling .prepare() and .done() when a 154 | script terminates. This is the responsibility of the caller. 155 | 156 | """ 157 | 158 | for table_ref in self.get_ready(): 159 | if ( 160 | # The DAG contains all the scripts as well as all the dependencies of each script. 161 | # Not all of these dependencies are scripts. We need to filter out the non-script 162 | # dependencies. 163 | table_ref not in self.scripts 164 | # We also need to filter out the scripts that are not part of the selected table 165 | # refs.
166 | or table_ref not in table_refs 167 | ): 168 | self.done(table_ref) 169 | continue 170 | 171 | yield self.scripts[table_ref] 172 | 173 | def iter_ancestors(self, node: TableRef): 174 | for child in self.dependency_graph.get(node, []): 175 | yield child 176 | yield from self.iter_ancestors(node=child) 177 | 178 | def iter_descendants(self, node: TableRef): 179 | for potential_child in self.dependency_graph: 180 | if node in self.dependency_graph[potential_child]: 181 | yield potential_child 182 | yield from self.iter_descendants(node=potential_child) 183 | 184 | 185 | def list_table_refs_that_changed(scripts_dir: pathlib.Path, project_name: str) -> set[TableRef]: 186 | repo = git.Repo(search_parent_directories=True) 187 | repo_root = pathlib.Path(repo.working_tree_dir) 188 | 189 | absolute_scripts_dir = scripts_dir.resolve() 190 | 191 | # Changes that have been committed 192 | staged_diffs = repo.index.diff( 193 | repo.refs.main.commit 194 | # repo.remotes.origin.refs.main.commit 195 | ) 196 | # Changes that have not been committed 197 | unstage_diffs = repo.head.commit.diff(None) 198 | 199 | table_refs = set() 200 | for diff in staged_diffs + unstage_diffs: 201 | # One thing to note is that we don't filter out deleted views. This is because 202 | # these views will get filtered out by dag.select anyway. 203 | diff_path = pathlib.Path(repo_root / diff.a_path).resolve() 204 | if diff_path.is_relative_to(absolute_scripts_dir) and tuple(diff_path.suffixes) in { 205 | (".sql",), 206 | (".sql", ".jinja"), 207 | }: 208 | table_ref = TableRef.from_path( 209 | scripts_dir=scripts_dir, 210 | relative_path=diff_path.relative_to(absolute_scripts_dir), 211 | project_name=project_name, 212 | ) 213 | table_refs.add(table_ref) 214 | 215 | return table_refs 216 | -------------------------------------------------------------------------------- /lea/databases.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dataclasses 4 | import datetime as dt 5 | import hashlib 6 | import typing 7 | import urllib.parse 8 | from pathlib import Path 9 | 10 | import duckdb 11 | import pandas as pd 12 | import requests 13 | import rich 14 | from google.cloud import bigquery 15 | from google.oauth2 import service_account 16 | 17 | import lea 18 | from lea import scripts 19 | from lea.dialects import BigQueryDialect, DuckDBDialect 20 | from lea.table_ref import TableRef 21 | 22 | 23 | class DatabaseJob(typing.Protocol): 24 | @property 25 | def is_done(self) -> bool: 26 | pass 27 | 28 | def stop(self): 29 | pass 30 | 31 | @property 32 | def result(self) -> pd.DataFrame: 33 | pass 34 | 35 | @property 36 | def exception(self) -> Exception: 37 | pass 38 | 39 | @property 40 | def billed_dollars(self) -> float: 41 | pass 42 | 43 | @property 44 | def statistics(self) -> TableStats | None: 45 | pass 46 | 47 | @property 48 | def metadata(self) -> list[str]: 49 | return [] 50 | 51 | def conclude(self): 52 | pass 53 | 54 | 55 | class DatabaseClient(typing.Protocol): 56 | def create_dataset(self, dataset_name: str): 57 | pass 58 | 59 | def delete_dataset(self, dataset_name: str): 60 | pass 61 | 62 | def materialize_script(self, script: scripts.Script) -> DatabaseJob: 63 | pass 64 | 65 | def query_script(self, script: scripts.Script) -> DatabaseJob: 66 | pass 67 | 68 | def clone_table( 69 | self, from_table_ref: scripts.TableRef, to_table_ref: scripts.TableRef 70 | ) -> DatabaseJob: 71 | pass 72 | 73 | def delete_and_insert( 74 | self, 
from_table_ref: scripts.TableRef, to_table_ref: scripts.TableRef, on: str 75 | ) -> DatabaseJob: 76 | pass 77 | 78 | def delete_table(self, table_ref: scripts.TableRef) -> DatabaseJob: 79 | pass 80 | 81 | def list_table_stats(self, dataset_name: str) -> dict[scripts.TableRef, TableStats]: 82 | pass 83 | 84 | def list_table_fields(self, dataset_name: str) -> dict[scripts.TableRef, list[scripts.Field]]: 85 | pass 86 | 87 | 88 | @dataclasses.dataclass 89 | class BigQueryJob: 90 | client: BigQueryClient 91 | query_job: bigquery.QueryJob 92 | destination: bigquery.TableReference | None = None 93 | script: scripts.SQLScript | None = None 94 | 95 | @property 96 | def is_done(self) -> bool: 97 | return self.query_job.done() 98 | 99 | @property 100 | def billed_dollars(self) -> float: 101 | bytes_billed = ( 102 | self.query_job.total_bytes_processed 103 | if self.client.dry_run 104 | else self.query_job.total_bytes_billed 105 | ) 106 | if bytes_billed is None: 107 | return 0.0 108 | return self.client.estimate_cost_in_dollars(bytes_billed) 109 | 110 | @property 111 | def statistics(self) -> TableStats | None: 112 | if self.client.dry_run or self.destination is None: 113 | return None 114 | table = self.client.client.get_table( 115 | self.destination, retry=bigquery.DEFAULT_RETRY.with_deadline(10) 116 | ) 117 | return TableStats(n_rows=table.num_rows, n_bytes=table.num_bytes, updated_at=table.modified) 118 | 119 | def stop(self): 120 | self.client.client.cancel_job(self.query_job.job_id) 121 | 122 | @property 123 | def result(self) -> pd.DataFrame: 124 | return self.query_job.result().to_dataframe() 125 | 126 | @property 127 | def exception(self) -> Exception: 128 | return self.query_job.exception() 129 | 130 | @property 131 | def is_using_reservation(self) -> bool: 132 | return ( 133 | self.query_job._properties.get("statistics", {}) 134 | .get("reservationUsage", [{}])[0] 135 | .get("name") 136 | ) is not None 137 | 138 | @property 139 | def metadata(self) -> list[str]: 140 | billing_model = ("reservation" if self.is_using_reservation else "on-demand") + " billing" 141 | return [billing_model] 142 | 143 | def conclude(self): 144 | if self.client.big_blue_pick_api is not None and self.script is not None: 145 | self.client.big_blue_pick_api.record_job_for_script( 146 | script=self.script, job=self.query_job 147 | ) 148 | 149 | 150 | @dataclasses.dataclass(frozen=True) 151 | class TableStats: 152 | n_rows: int 153 | n_bytes: int | None 154 | updated_at: dt.datetime 155 | 156 | 157 | class BigBluePickAPI: 158 | """Big Blue Pick API implementation. 159 | 160 | https://biq.blue/blog/compute/how-to-implement-bigquery-autoscaling-reservation-in-10-minutes 161 | 162 | Parameters 163 | ---------- 164 | on_demand_project_id 165 | The project ID of the on-demand BigQuery project. 166 | reservation_project_id 167 | The project ID of the reservation BigQuery project. 168 | default_project_id 169 | The project ID of the default BigQuery project. This is used if something with the 170 | BigBlue Pick API fails. 
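    api_url
        Base URL of the Big Blue Pick API. call_pick_api sends POST requests to paths under
        this URL.
    api_key
        API key for the Big Blue Pick API, sent as a bearer token in the Authorization header.

    Examples
    --------
    A minimal sketch of how this class might be instantiated; the URL, key and project IDs
    below are placeholders rather than real values:

        pick_api = BigBluePickAPI(
            api_url="https://api.example.com",
            api_key="YOUR_API_KEY",
            on_demand_project_id="my-on-demand-project",
            reservation_project_id="my-reservation-project",
            default_project_id="my-default-project",
        )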
171 | 172 | """ 173 | 174 | def __init__( 175 | self, 176 | api_url: str, 177 | api_key: str, 178 | on_demand_project_id: str, 179 | reservation_project_id: str, 180 | default_project_id: str, 181 | ): 182 | self.api_url = api_url 183 | self.api_key = api_key 184 | self.on_demand_project_id = on_demand_project_id 185 | self.reservation_project_id = reservation_project_id 186 | self.default_project_id = default_project_id 187 | 188 | def call_pick_api(self, path, body): 189 | try: 190 | response = requests.post( 191 | urllib.parse.urljoin(self.api_url, path), 192 | json=body, 193 | headers={ 194 | "Content-Type": "application/json", 195 | "Authorization": f"Bearer {self.api_key}", 196 | }, 197 | ) 198 | response.raise_for_status() 199 | return response.json() 200 | except requests.exceptions.RequestException as e: 201 | lea.log.warning(f"Big Blue Pick API call failed: {e}") 202 | return None 203 | 204 | @staticmethod 205 | def hash_script(script: scripts.SQLScript) -> str: 206 | return hashlib.sha256( 207 | str(script.table_ref.replace_dataset("FREEZE").replace_project("FREEZE")).encode() 208 | ).hexdigest() 209 | 210 | def pick_project_id_for_script(self, script: scripts.SQLScript) -> str: 211 | response = self.call_pick_api( 212 | path="/pick", 213 | body={"hash": self.hash_script(script)}, 214 | ) 215 | if not response or not (pick := response.get("pick")): 216 | lea.log.warning("Big Blue Pick API call failed, using default project ID") 217 | elif pick not in {"ON-DEMAND", "RESERVATION"}: 218 | lea.log.warning( 219 | f"Big Blue Pick API returned unexpected choice {response['pick']!r}, using default project ID" 220 | ) 221 | elif pick == "ON-DEMAND": 222 | return self.on_demand_project_id 223 | elif pick == "RESERVATION": 224 | return self.reservation_project_id 225 | return self.default_project_id 226 | 227 | def pick_client( 228 | self, script: scripts.SQLScript, credentials: service_account.Credentials, location: str 229 | ) -> DatabaseClient: 230 | project_id = self.pick_project_id_for_script(script=script) 231 | return bigquery.Client(project=project_id, credentials=credentials, location=location) 232 | 233 | def record_job_for_script(self, script: scripts.SQLScript, job: bigquery.QueryJob): 234 | self.call_pick_api( 235 | path="/write", 236 | # https://github.com/biqblue/docs/blob/1ec0eae06ccfabb339cf11bc19dbcbe04b404373/examples/python/pick.py#L42 237 | body={ 238 | "hash": self.hash_script(script), 239 | "job_id": job.job_id, 240 | "creation_time": str(int(job.created.timestamp() * 1000)), 241 | "start_time": str(int(job.started.timestamp() * 1000)), 242 | "end_time": str(int(job.ended.timestamp() * 1000)), 243 | "total_slot_ms": job.slot_millis, 244 | "total_bytes_billed": job.total_bytes_billed, 245 | "total_bytes_processed": job.total_bytes_processed, 246 | "bi_engine_mode": getattr(job, "bi_engine_statistics", {}).get( 247 | "bi_engine_mode", "" 248 | ), 249 | "reservation_id": ( 250 | job._properties.get("statistics", {}) 251 | .get("reservationUsage", [{}])[0] 252 | .get("name", "") 253 | ), 254 | }, 255 | ) 256 | 257 | 258 | class BigQueryClient(BigBluePickAPI): 259 | def __init__( 260 | self, 261 | credentials: service_account.Credentials, 262 | location: str, 263 | write_project_id: str, 264 | compute_project_id: str, 265 | storage_billing_model: str = "PHYSICAL", 266 | dry_run: bool = False, 267 | print_mode: bool = False, 268 | default_clustering_fields: list[str] = None, 269 | big_blue_pick_api_url: str = None, 270 | big_blue_pick_api_key: str = None, 271 | 
big_blue_pick_api_on_demand_project_id: str = None, 272 | big_blue_pick_api_reservation_project_id: str = None, 273 | ): 274 | self.credentials = credentials 275 | self.write_project_id = write_project_id 276 | self.compute_project_id = compute_project_id 277 | self.storage_billing_model = storage_billing_model 278 | self.location = location 279 | self.client = bigquery.Client( 280 | project=self.compute_project_id, 281 | credentials=self.credentials, 282 | location=self.location, 283 | ) 284 | self.dry_run = dry_run 285 | self.print_mode = print_mode 286 | self.default_clustering_fields = default_clustering_fields or [] 287 | 288 | self.big_blue_pick_api = ( 289 | BigBluePickAPI( 290 | api_url=big_blue_pick_api_url, 291 | api_key=big_blue_pick_api_key, 292 | on_demand_project_id=big_blue_pick_api_on_demand_project_id, 293 | reservation_project_id=big_blue_pick_api_reservation_project_id, 294 | default_project_id=self.write_project_id, 295 | ) 296 | if ( 297 | big_blue_pick_api_url is not None 298 | and big_blue_pick_api_key is not None 299 | and big_blue_pick_api_on_demand_project_id is not None 300 | and big_blue_pick_api_reservation_project_id is not None 301 | ) 302 | else None 303 | ) 304 | 305 | def create_dataset(self, dataset_name: str): 306 | dataset_ref = bigquery.DatasetReference( 307 | project=self.write_project_id, dataset_id=dataset_name 308 | ) 309 | dataset = bigquery.Dataset(dataset_ref) 310 | dataset.location = self.location 311 | dataset.storage_billing_model = self.storage_billing_model 312 | dataset = self.client.create_dataset(dataset, exists_ok=True) 313 | 314 | def delete_dataset(self, dataset_name: str): 315 | self.client.delete_dataset( 316 | dataset=f"{self.write_project_id}.{dataset_name}", 317 | delete_contents=True, 318 | not_found_ok=True, 319 | ) 320 | 321 | @staticmethod 322 | def estimate_cost_in_dollars(bytes_billed: int) -> float: 323 | cost_per_tb = 5 324 | return (bytes_billed / 10**12) * cost_per_tb 325 | 326 | def materialize_script(self, script: scripts.Script) -> BigQueryJob: 327 | if isinstance(script, scripts.SQLScript): 328 | return self.materialize_sql_script(sql_script=script) 329 | raise ValueError("Unsupported script type") 330 | 331 | def materialize_sql_script(self, sql_script: scripts.SQLScript) -> BigQueryJob: 332 | destination = BigQueryDialect.convert_table_ref_to_bigquery_table_reference( 333 | table_ref=sql_script.table_ref, project=self.write_project_id 334 | ) 335 | clustering_fields = ( 336 | ( 337 | [ 338 | clustering_field 339 | for clustering_field in self.default_clustering_fields 340 | if clustering_field in {field.name for field in sql_script.fields} 341 | ] 342 | ) 343 | if self.default_clustering_fields 344 | else None 345 | ) 346 | job_config = self.make_job_config( 347 | script=sql_script, 348 | destination=destination, 349 | write_disposition="WRITE_TRUNCATE", 350 | clustering_fields=clustering_fields 351 | if clustering_fields and not sql_script.table_ref.is_test 352 | else None, 353 | ) 354 | 355 | client = ( 356 | self.big_blue_pick_api.pick_client( 357 | script=sql_script, 358 | credentials=self.credentials, 359 | location=self.location, 360 | ) 361 | if self.big_blue_pick_api is not None 362 | else self.client 363 | ) 364 | 365 | return BigQueryJob( 366 | client=self, 367 | query_job=client.query( 368 | query=sql_script.code, job_config=job_config, location=self.location 369 | ), 370 | destination=destination, 371 | script=sql_script, 372 | ) 373 | 374 | def query_script(self, script: scripts.Script) -> 
BigQueryJob: 375 | if isinstance(script, scripts.SQLScript): 376 | return self.query_sql_script(sql_script=script) 377 | raise ValueError("Unsupported script type") 378 | 379 | def query_sql_script(self, sql_script: scripts.SQLScript) -> BigQueryJob: 380 | job_config = self.make_job_config(script=sql_script) 381 | client = ( 382 | self.big_blue_pick_api.pick_client( 383 | script=sql_script, 384 | credentials=self.credentials, 385 | location=self.location, 386 | ) 387 | if self.big_blue_pick_api is not None 388 | else self.client 389 | ) 390 | return BigQueryJob( 391 | client=self, 392 | query_job=client.query( 393 | query=sql_script.code, job_config=job_config, location=self.location 394 | ), 395 | script=sql_script, 396 | ) 397 | 398 | def clone_table( 399 | self, from_table_ref: scripts.TableRef, to_table_ref: scripts.TableRef 400 | ) -> BigQueryJob: 401 | destination = BigQueryDialect.convert_table_ref_to_bigquery_table_reference( 402 | table_ref=to_table_ref, project=self.write_project_id 403 | ) 404 | source = BigQueryDialect.convert_table_ref_to_bigquery_table_reference( 405 | table_ref=from_table_ref, project=self.write_project_id 406 | ) 407 | clone_code = f""" 408 | CREATE OR REPLACE TABLE {destination} 409 | CLONE {source} 410 | """ 411 | job_config = self.make_job_config( 412 | script=scripts.SQLScript( 413 | table_ref=to_table_ref, code=clone_code, sql_dialect=BigQueryDialect, fields=[] 414 | ) 415 | ) 416 | return BigQueryJob( 417 | client=self, 418 | query_job=self.client.query(clone_code, job_config=job_config, location=self.location), 419 | destination=destination, 420 | ) 421 | 422 | def delete_and_insert( 423 | self, from_table_ref: scripts.TableRef, to_table_ref: scripts.TableRef, on: str 424 | ) -> BigQueryJob: 425 | source = BigQueryDialect.convert_table_ref_to_bigquery_table_reference( 426 | table_ref=from_table_ref, project=self.write_project_id 427 | ) 428 | destination = BigQueryDialect.convert_table_ref_to_bigquery_table_reference( 429 | table_ref=to_table_ref, project=self.write_project_id 430 | ) 431 | # TODO: the following could instead be done with a MERGE statement. 
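        # In the WAP flow, this is what promotes an incremental audit table (see
        # Session.promote_audit_table): destination rows whose {on} value appears in the source
        # (audit) table are deleted, and then every row from the source is inserted, so rows
        # for untouched {on} values are preserved.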
432 | delete_and_insert_code = f""" 433 | BEGIN TRANSACTION; 434 | 435 | -- Delete existing data 436 | DELETE FROM {destination} 437 | WHERE {on} IN (SELECT DISTINCT {on} FROM {source}); 438 | 439 | -- Insert new data 440 | INSERT INTO {destination} 441 | SELECT * FROM {source}; 442 | 443 | COMMIT TRANSACTION; 444 | """ 445 | job_config = self.make_job_config( 446 | script=scripts.SQLScript( 447 | table_ref=to_table_ref, 448 | code=delete_and_insert_code, 449 | sql_dialect=BigQueryDialect, 450 | fields=[], 451 | ) 452 | ) 453 | return BigQueryJob( 454 | client=self, 455 | query_job=self.client.query( 456 | delete_and_insert_code, job_config=job_config, location=self.location 457 | ), 458 | destination=destination, 459 | ) 460 | 461 | def delete_table(self, table_ref: scripts.TableRef) -> BigQueryJob: 462 | table_reference = BigQueryDialect.convert_table_ref_to_bigquery_table_reference( 463 | table_ref=table_ref, project=self.write_project_id 464 | ) 465 | delete_code = f""" 466 | DROP TABLE IF EXISTS {table_reference} 467 | """ 468 | job_config = self.make_job_config( 469 | script=scripts.SQLScript( 470 | table_ref=table_ref, 471 | code=delete_code, 472 | sql_dialect=BigQueryDialect, 473 | fields=[], 474 | ) 475 | ) 476 | return BigQueryJob( 477 | client=self, 478 | query_job=self.client.query(delete_code, job_config=job_config, location=self.location), 479 | ) 480 | 481 | def list_table_stats(self, dataset_name: str) -> dict[scripts.TableRef, TableStats]: 482 | query = f""" 483 | SELECT table_id, row_count, size_bytes, last_modified_time 484 | FROM `{self.write_project_id}.{dataset_name}.__TABLES__` 485 | """ 486 | job = self.client.query(query, location=self.location) 487 | return { 488 | BigQueryDialect.parse_table_ref( 489 | f"{self.write_project_id}.{dataset_name}.{row['table_id']}" 490 | ): TableStats( 491 | n_rows=row["row_count"], 492 | n_bytes=row["size_bytes"], 493 | updated_at=( 494 | dt.datetime.fromtimestamp(row["last_modified_time"] // 1000, tz=dt.timezone.utc) 495 | ), 496 | ) 497 | for row in job.result() 498 | } 499 | 500 | def list_table_fields(self, dataset_name: str) -> dict[scripts.TableRef, set[scripts.Field]]: 501 | query = f""" 502 | SELECT table_name, column_name 503 | FROM `{self.write_project_id}.{dataset_name}.INFORMATION_SCHEMA.COLUMNS` 504 | """ 505 | job = self.client.query(query, location=self.location) 506 | return { 507 | BigQueryDialect.parse_table_ref( 508 | f"{self.write_project_id}.{dataset_name}.{table_name}" 509 | ): [scripts.Field(name=row["column_name"]) for _, row in rows.iterrows()] 510 | for table_name, rows in job.result() 511 | .to_dataframe() 512 | .sort_values(["table_name", "column_name"]) 513 | .groupby("table_name") 514 | } 515 | 516 | def make_job_config(self, script: scripts.SQLScript, **kwargs) -> bigquery.QueryJobConfig: 517 | if self.print_mode: 518 | rich.print(script) 519 | return bigquery.QueryJobConfig( 520 | priority=bigquery.QueryPriority.INTERACTIVE, 521 | use_query_cache=False, 522 | dry_run=self.dry_run, 523 | **kwargs, 524 | ) 525 | 526 | 527 | @dataclasses.dataclass 528 | class DuckDBJob: 529 | query: str 530 | connection: duckdb.DuckDBPyConnection 531 | destination: str | None = None 532 | exception: str | None = None 533 | 534 | def execute(self): 535 | self.connection.execute(self.query) 536 | 537 | @property 538 | def is_done(self) -> bool: 539 | try: 540 | self.execute() 541 | except Exception as e: 542 | self.exception = repr(e) 543 | raise e 544 | else: 545 | return True 546 | 547 | def stop(self): 548 | pass # No 
support for stopping queries in DuckDB 549 | 550 | @property 551 | def result(self) -> pd.DataFrame: 552 | return self.connection.execute(self.query).fetchdf() 553 | 554 | @property 555 | def billed_dollars(self) -> float: 556 | return None # DuckDB is free to use 557 | 558 | @property 559 | def statistics(self) -> TableStats | None: 560 | query = f"SELECT COUNT(*) AS n_rows, MAX(_materialized_timestamp) AS updated_at FROM {self.destination}" 561 | table = self.connection.execute(query).fetchdf().iloc[0] 562 | return TableStats( 563 | n_rows=int(table["n_rows"]), 564 | n_bytes=None, 565 | updated_at=table["updated_at"], 566 | ) 567 | 568 | 569 | class DuckDBClient: 570 | def __init__(self, database_path: Path, dry_run: bool = False, print_mode: bool = False): 571 | self.database_path = database_path 572 | if self.database_path == "": 573 | raise ValueError("DuckDB path not configured") 574 | self.dry_run = dry_run 575 | self.print_mode = print_mode 576 | 577 | @property 578 | def connection(self) -> duckdb.DuckDBPyConnection: 579 | return duckdb.connect(database=str(self.database_path)) 580 | 581 | @property 582 | def dataset(self) -> str: 583 | return self.database_path.stem 584 | 585 | def create_dataset(self, dataset_name: str): 586 | self.database_path = self.database_path.with_stem(dataset_name) 587 | 588 | def create_schema(self, table_ref: scripts.TableRef): 589 | self.connection.execute(f"CREATE SCHEMA IF NOT EXISTS {table_ref.schema[0]}") 590 | 591 | def materialize_script(self, script: scripts.Script) -> DuckDBJob: 592 | if isinstance(script, scripts.SQLScript): 593 | return self.materialize_sql_script(sql_script=script) 594 | raise ValueError("Unsupported script type") 595 | 596 | def materialize_sql_script(self, sql_script: scripts.SQLScript) -> DuckDBJob: 597 | destination = DuckDBDialect.convert_table_ref_to_duckdb_table_reference( 598 | table_ref=sql_script.table_ref 599 | ) 600 | # We need to materialize the script with a timestamp to keep track of when it was materialized. 601 | # DuckDB does not provide a metadata table, so we need to create one with a technical column. 602 | # Bear in mind that this is a workaround and not a best practice. Any change done outside 603 | # lea will not be reflected in the metadata column and could break the orchestration mechanism.
604 | materialize_code = f""" 605 | CREATE OR REPLACE TABLE {destination} AS ( 606 | WITH logic_table AS ({sql_script.code}), 607 | materialized_infos AS (SELECT CURRENT_LOCALTIMESTAMP() AS _materialized_timestamp) 608 | SELECT * FROM logic_table, materialized_infos 609 | ); 610 | """ 611 | return self.make_job_config( 612 | script=scripts.SQLScript( 613 | table_ref=sql_script.table_ref, 614 | code=materialize_code, 615 | sql_dialect=DuckDBDialect, 616 | fields=[], 617 | ), 618 | destination=destination, 619 | ) 620 | 621 | def query_script(self, script: scripts.Script) -> DuckDBJob: 622 | if isinstance(script, scripts.SQLScript): 623 | job = self.make_job_config(script=script) 624 | return job 625 | raise ValueError("Unsupported script type") 626 | 627 | def clone_table( 628 | self, from_table_ref: scripts.TableRef, to_table_ref: scripts.TableRef 629 | ) -> DuckDBJob: 630 | destination = DuckDBDialect.convert_table_ref_to_duckdb_table_reference( 631 | table_ref=to_table_ref 632 | ) 633 | source = DuckDBDialect.convert_table_ref_to_duckdb_table_reference(table_ref=from_table_ref) 634 | clone_code = f""" 635 | CREATE OR REPLACE TABLE {destination} AS SELECT * FROM {source} 636 | """ 637 | job = self.make_job_config( 638 | script=scripts.SQLScript( 639 | table_ref=to_table_ref, code=clone_code, sql_dialect=DuckDBDialect, fields=[] 640 | ), 641 | destination=destination, 642 | ) 643 | return job 644 | 645 | def delete_and_insert( 646 | self, from_table_ref: scripts.TableRef, to_table_ref: scripts.TableRef, on: str 647 | ) -> DuckDBJob: 648 | to_table_reference = DuckDBDialect.convert_table_ref_to_duckdb_table_reference( 649 | table_ref=to_table_ref 650 | ) 651 | from_table_reference = DuckDBDialect.convert_table_ref_to_duckdb_table_reference( 652 | table_ref=from_table_ref 653 | ) 654 | 655 | delete_and_insert_code = f""" 656 | DELETE FROM {to_table_reference} WHERE {on} IN (SELECT DISTINCT {on} FROM {from_table_reference}); 657 | INSERT INTO {to_table_reference} SELECT * FROM {from_table_reference}; 658 | """ 659 | job = self.make_job_config( 660 | script=scripts.SQLScript( 661 | table_ref=to_table_ref, 662 | code=delete_and_insert_code, 663 | sql_dialect=DuckDBDialect, 664 | fields=[], 665 | ), 666 | destination=to_table_reference, 667 | ) 668 | job.execute() 669 | return job 670 | 671 | def delete_table(self, table_ref: scripts.TableRef) -> DuckDBJob: 672 | table_reference = DuckDBDialect.convert_table_ref_to_duckdb_table_reference( 673 | table_ref=table_ref 674 | ) 675 | delete_code = f"DROP TABLE IF EXISTS {table_reference}" 676 | job = self.make_job_config( 677 | script=scripts.SQLScript( 678 | table_ref=table_ref, code=delete_code, sql_dialect=DuckDBDialect, fields=[] 679 | ) 680 | ) 681 | job.execute() 682 | return job 683 | 684 | def list_table_stats(self, dataset_name: str) -> dict[TableRef, TableStats]: 685 | tables_query = """ 686 | SELECT table_name, schema_name, estimated_size 687 | FROM duckdb_tables(); 688 | """ 689 | tables_result = self.connection.execute(tables_query).fetchdf() 690 | 691 | table_stats = {} 692 | for _, row in tables_result.iterrows(): 693 | table_name = row["table_name"] 694 | table_schema = row["schema_name"] 695 | n_rows = int(row["estimated_size"]) 696 | stats_query = f""" 697 | SELECT 698 | MAX(_materialized_timestamp) AS last_modified 699 | FROM {table_schema}.{table_name} 700 | """ 701 | result = self.connection.execute(stats_query).fetchdf().dropna() 702 | if result.empty: 703 | updated_at = dt.datetime.now(dt.timezone.utc) 704 | else: 705 | 
updated_at = dt.datetime.fromtimestamp( 706 | result.iloc[0]["last_modified"].to_pydatetime().timestamp(), 707 | tz=dt.timezone.utc, 708 | ) 709 | table_stats[ 710 | DuckDBDialect.parse_table_ref(f"{table_schema}.{table_name}").replace_dataset( 711 | dataset_name 712 | ) 713 | ] = TableStats( 714 | n_rows=n_rows, 715 | n_bytes=None, 716 | updated_at=updated_at, 717 | ) 718 | return table_stats 719 | 720 | def list_table_fields(self, dataset_name: str) -> dict[scripts.TableRef, list[scripts.Field]]: 721 | query = f""" 722 | SELECT table_name, column_name 723 | FROM information_schema.columns 724 | WHERE table_schema = '{dataset_name}' 725 | """ 726 | result = self.connection.execute(query).fetchdf() 727 | return { 728 | scripts.TableRef(name=table_name): [ 729 | scripts.Field(name=row["column_name"]) for _, row in rows.iterrows() 730 | ] 731 | for table_name, rows in result.groupby("table_name") 732 | } 733 | 734 | def make_job_config( 735 | self, script: scripts.SQLScript, destination: str | None = None 736 | ) -> DuckDBJob: 737 | if self.print_mode: 738 | rich.print(script) 739 | job = DuckDBJob(query=script.code, connection=self.connection, destination=destination) 740 | return job 741 | -------------------------------------------------------------------------------- /lea/dialects.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pathlib 4 | import re 5 | import textwrap 6 | 7 | import jinja2 8 | import sqlglot 9 | from google.cloud import bigquery 10 | 11 | from lea.field import FieldTag 12 | from lea.table_ref import TableRef 13 | 14 | 15 | class SQLDialect: 16 | sqlglot_dialect: sqlglot.dialects.Dialects | None = None 17 | 18 | @staticmethod 19 | def parse_table_ref(table_ref: str) -> TableRef: 20 | raise NotImplementedError 21 | 22 | @staticmethod 23 | def format_table_ref(table_ref: TableRef) -> str: 24 | raise NotImplementedError 25 | 26 | def make_column_test_unique(self, table_ref: TableRef, field_name: str) -> str: 27 | table_ref_str = self.format_table_ref(table_ref) 28 | return load_assertion_test_template(FieldTag.UNIQUE).render( 29 | table=table_ref_str, column=field_name 30 | ) 31 | 32 | def make_column_test_unique_by(self, table_ref: TableRef, field_name: str, by: str) -> str: 33 | table_ref_str = self.format_table_ref(table_ref) 34 | return load_assertion_test_template(FieldTag.UNIQUE_BY).render( 35 | table=table_ref_str, 36 | column=field_name, 37 | by=by, 38 | ) 39 | 40 | def make_column_test_no_nulls(self, table_ref: TableRef, field_name: str) -> str: 41 | table_ref_str = self.format_table_ref(table_ref) 42 | return load_assertion_test_template(FieldTag.NO_NULLS).render( 43 | table=table_ref_str, column=field_name 44 | ) 45 | 46 | def make_column_test_set(self, table_ref: TableRef, field_name: str, elements: set[str]) -> str: 47 | table_ref_str = self.format_table_ref(table_ref) 48 | return load_assertion_test_template(FieldTag.SET).render( 49 | table=table_ref_str, 50 | column=field_name, 51 | elements=elements, 52 | ) 53 | 54 | @classmethod 55 | def add_dependency_filters( 56 | cls, 57 | code: str, 58 | incremental_field_name: str, 59 | incremental_field_values: set[str], 60 | dependencies_to_filter: set[TableRef], 61 | ) -> str: 62 | code = remove_comment_lines(code) 63 | incremental_field_values_str = ", ".join(f"'{value}'" for value in incremental_field_values) 64 | for dependency in dependencies_to_filter: 65 | dependency_str = cls.format_table_ref(dependency) 66 | code = 
re.sub( 67 | # We could use \b, but it doesn't work with backticks 68 | rf"(? str: 86 | code = remove_comment_lines(code) 87 | incremental_field_values_str = ", ".join(f"'{value}'" for value in incremental_field_values) 88 | for ( 89 | dependency_without_wap_suffix, 90 | dependency_with_wap_suffix, 91 | ) in incremental_dependencies.items(): 92 | dependency_without_wap_suffix_str = cls.format_table_ref(dependency_without_wap_suffix) 93 | dependency_with_wap_suffix_str = cls.format_table_ref(dependency_with_wap_suffix) 94 | code = re.sub( 95 | # We could use \b, but it doesn't work with backticks 96 | rf"(? str: 112 | return "\n".join(line for line in code.split("\n") if not line.strip().startswith("--")) 113 | 114 | 115 | def load_assertion_test_template(tag: str) -> jinja2.Template: 116 | return jinja2.Template( 117 | (pathlib.Path(__file__).parent / "assertions" / f"{tag.lstrip('#')}.sql.jinja").read_text() 118 | ) 119 | 120 | 121 | class BigQueryDialect(SQLDialect): 122 | sqlglot_dialect = sqlglot.dialects.Dialects.BIGQUERY 123 | 124 | @staticmethod 125 | def parse_table_ref(table_ref: str) -> TableRef: 126 | """ 127 | 128 | >>> BigQueryDialect.parse_table_ref("my_dataset.my_schema__my_table") 129 | TableRef(dataset='my_dataset', schema=('my_schema',), name='my_table', project=None) 130 | 131 | >>> BigQueryDialect.parse_table_ref("my_dataset.my_table") 132 | TableRef(dataset='my_dataset', schema=(), name='my_table', project=None) 133 | 134 | >>> BigQueryDialect.parse_table_ref("my_dataset.my_schema__my_table___audit") 135 | TableRef(dataset='my_dataset', schema=('my_schema',), name='my_table___audit', project=None) 136 | 137 | >>> BigQueryDialect.parse_table_ref("my_project.my_dataset.my_schema__my_table___audit") 138 | TableRef(dataset='my_dataset', schema=('my_schema',), name='my_table___audit', project='my_project') 139 | 140 | >>> BigQueryDialect.parse_table_ref("`carbonfact-gsheet`.hubspot.company") 141 | TableRef(dataset='hubspot', schema=(), name='company', project='carbonfact-gsheet') 142 | 143 | """ 144 | project, dataset, leftover = None, *tuple(table_ref.rsplit(".", 1)) 145 | if "." in dataset: 146 | project, dataset = dataset.split(".") 147 | *schema, name = tuple(re.split(r"(? str: 157 | table_ref_str = "" 158 | if table_ref.project: 159 | table_ref_str += f"{table_ref.project}." 160 | if table_ref.dataset: 161 | table_ref_str += f"{table_ref.dataset}." 162 | table_ref_str += f"{'__'.join([*table_ref.schema, table_ref.name])}" 163 | return table_ref_str 164 | 165 | @staticmethod 166 | def convert_table_ref_to_bigquery_table_reference( 167 | table_ref: TableRef, project: str 168 | ) -> bigquery.TableReference: 169 | return bigquery.TableReference( 170 | dataset_ref=bigquery.DatasetReference(project=project, dataset_id=table_ref.dataset), 171 | table_id=f"{'__'.join([*table_ref.schema, table_ref.name])}", 172 | ) 173 | 174 | 175 | class DuckDBDialect(SQLDialect): 176 | sqlglot_dialect = sqlglot.dialects.Dialects.DUCKDB 177 | 178 | @staticmethod 179 | def parse_table_ref(table_ref: str) -> TableRef: 180 | """ 181 | Parses a DuckDB table reference string into a TableRef object. 
182 | 183 | >>> DuckDBDialect.parse_table_ref("my_schema.my_table") 184 | TableRef(dataset=None, schema=('my_schema',), name='my_table', project=None) 185 | 186 | >>> DuckDBDialect.parse_table_ref("my_schema.my_subschema__my_table") 187 | TableRef(dataset=None, schema=('my_schema', 'my_subschema'), name='my_table', project=None) 188 | 189 | >>> DuckDBDialect.parse_table_ref("my_table") 190 | TableRef(dataset=None, schema=(), name='my_table', project=None) 191 | """ 192 | if "." in table_ref: 193 | project, schema, leftover = None, *tuple(table_ref.rsplit(".", 1)) 194 | *subschema, name = tuple(re.split(r"(? str: 206 | """ 207 | Formats a TableRef object into a DuckDB table reference string. 208 | 209 | >>> DuckDBDialect.format_table_ref(TableRef(dataset=None, schema=('my_schema',), name='my_table', project=None)) 210 | 'my_schema.my_table' 211 | 212 | >>> DuckDBDialect.format_table_ref(TableRef(dataset=None, schema=('my_schema', 'my_subschema'), name='my_table', project=None)) 213 | 'my_schema.my_subschema__my_table' 214 | 215 | >>> DuckDBDialect.format_table_ref(TableRef(dataset=None, schema=(), name='my_table', project=None)) 216 | 'my_table' 217 | """ 218 | if len(table_ref.schema) > 0: 219 | schema = table_ref.schema[0] 220 | if len(table_ref.schema) > 1: 221 | full_table_ref = f"{schema}.{'__'.join([*table_ref.schema[1:], table_ref.name])}" 222 | else: 223 | full_table_ref = f"{schema}.{table_ref.name}" 224 | return full_table_ref 225 | return table_ref.name 226 | 227 | @staticmethod 228 | def convert_table_ref_to_duckdb_table_reference(table_ref: TableRef) -> str: 229 | return DuckDBDialect.format_table_ref(table_ref) 230 | 231 | 232 | def strip_quotes(x: str) -> str: 233 | return x.strip('"').strip("`") 234 | -------------------------------------------------------------------------------- /lea/field.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dataclasses 4 | import enum 5 | 6 | 7 | @dataclasses.dataclass(frozen=True) 8 | class Field: 9 | name: str 10 | tags: set[FieldTag] = dataclasses.field(default_factory=set) 11 | description: str | None = None 12 | 13 | @property 14 | def is_unique(self): 15 | return FieldTag.UNIQUE in self.tags 16 | 17 | 18 | class FieldTag(enum.StrEnum): 19 | NO_NULLS = "#NO_NULLS" 20 | UNIQUE = "#UNIQUE" 21 | UNIQUE_BY = "#UNIQUE_BY" 22 | SET = "#SET" 23 | INCREMENTAL = "#INCREMENTAL" 24 | -------------------------------------------------------------------------------- /lea/job.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dataclasses 4 | import datetime as dt 5 | import enum 6 | 7 | from lea.databases import DatabaseJob 8 | from lea.table_ref import TableRef 9 | 10 | 11 | class JobStatus(enum.Enum): 12 | RUNNING = "RUNNING" 13 | SUCCESS = "[green]SUCCESS[/green]" 14 | ERRORED = "[red]ERRORED[/red]" 15 | STOPPED = "[yellow]STOPPED[/yellow]" 16 | 17 | def __str__(self): 18 | return self.value 19 | 20 | 21 | @dataclasses.dataclass 22 | class Job: 23 | table_ref: TableRef 24 | is_test: bool 25 | database_job: DatabaseJob 26 | started_at: dt.datetime = dataclasses.field(default_factory=dt.datetime.now) 27 | ended_at: dt.datetime | None = None 28 | status: JobStatus = JobStatus.RUNNING 29 | 30 | def __hash__(self): 31 | return hash(self.table_ref) 32 | -------------------------------------------------------------------------------- /lea/scripts.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dataclasses 4 | import datetime as dt 5 | import functools 6 | import os 7 | import pathlib 8 | import re 9 | import textwrap 10 | 11 | import jinja2 12 | import rich.syntax 13 | import sqlglot 14 | import sqlglot.optimizer 15 | 16 | from .comment import extract_comments 17 | from .dialects import SQLDialect 18 | from .field import Field, FieldTag 19 | from .table_ref import TableRef 20 | 21 | 22 | @dataclasses.dataclass(frozen=True) 23 | class SQLScript: 24 | table_ref: TableRef 25 | code: str 26 | sql_dialect: SQLDialect 27 | fields: list[Field] | None = dataclasses.field(default=None) 28 | updated_at: dt.datetime | None = None 29 | 30 | def __post_init__(self): 31 | """ 32 | 33 | This part is a bit tricky. We extract fields from each script for different reasons. For 34 | instance, the fields are used to generate assertion tests. 35 | 36 | The logic to extract fields is based on SQLGlot. The latter usually works well, but it 37 | sometimes fail for complex queries. For instance, in incremental mode, we have to edit 38 | the queries to filter their dependencies. These queries are not always parsed correctly by 39 | SQLGlot. 40 | 41 | To circumvent this issue, we extract fields, and cache them. This way, whenever we call 42 | dataclasses.replace, they won't have to be recomputed. This makes sense because the scripts 43 | are never edited to add or remove fields. They are only edited to change the filtering 44 | conditions. 45 | 46 | """ 47 | if self.fields is not None: 48 | return 49 | field_names = self.ast.named_selects 50 | field_comments = extract_comments( 51 | code=self.code, expected_field_names=field_names, sql_dialect=self.sql_dialect 52 | ) 53 | fields = [ 54 | Field( 55 | name=name, 56 | tags={ 57 | comment.text 58 | for comment in field_comments.get(name, []) 59 | if comment.text.startswith("#") 60 | }, 61 | description=" ".join( 62 | comment.text 63 | for comment in field_comments.get(name, []) 64 | if not comment.text.startswith("#") 65 | ), 66 | ) 67 | for name in field_names 68 | if name != "*" 69 | ] 70 | # https://stackoverflow.com/a/54119384 71 | object.__setattr__(self, "fields", fields) 72 | 73 | @classmethod 74 | def from_path( 75 | cls, 76 | scripts_dir: pathlib.Path, 77 | relative_path: pathlib.Path, 78 | sql_dialect: SQLDialect, 79 | project_name: str, 80 | ) -> SQLScript: 81 | # Either the file is a Jinja template 82 | if relative_path.suffixes == [".sql", ".jinja"]: 83 | loader = jinja2.FileSystemLoader(scripts_dir) 84 | environment = jinja2.Environment(loader=loader) 85 | template = environment.get_template(str(relative_path)) 86 | code = template.render(env=os.environ) 87 | # Or it's a regular SQL file 88 | else: 89 | code = (scripts_dir / relative_path).read_text().rstrip().rstrip(";") 90 | 91 | return cls( 92 | table_ref=TableRef.from_path( 93 | scripts_dir=scripts_dir, relative_path=relative_path, project_name=project_name 94 | ), 95 | code=code, 96 | sql_dialect=sql_dialect, 97 | updated_at=dt.datetime.fromtimestamp( 98 | (scripts_dir / relative_path).stat().st_mtime, tz=dt.timezone.utc 99 | ), 100 | ) 101 | 102 | @property 103 | def is_test(self) -> bool: 104 | return self.table_ref.is_test 105 | 106 | @functools.cached_property 107 | def ast(self): 108 | ast = sqlglot.parse_one(self.code, dialect=self.sql_dialect.sqlglot_dialect) 109 | try: 110 | return sqlglot.optimizer.qualify.qualify(ast) 111 | except 
sqlglot.errors.OptimizeError: 112 | return ast 113 | 114 | @functools.cached_property 115 | def dependencies(self) -> set[TableRef]: 116 | def add_default_project(table_ref: TableRef) -> TableRef: 117 | if table_ref.project is None: 118 | return table_ref.replace_project(self.table_ref.project) 119 | return table_ref 120 | 121 | dependencies = set() 122 | 123 | for scope in sqlglot.optimizer.scope.traverse_scope(self.ast): 124 | for table in scope.tables: 125 | if ( 126 | not isinstance(table.this, sqlglot.exp.Func) 127 | and sqlglot.exp.table_name(table) not in scope.cte_sources 128 | ): 129 | try: 130 | table_ref = self.sql_dialect.parse_table_ref( 131 | table_ref=sqlglot.exp.table_name(table) 132 | ) 133 | except ValueError as e: 134 | raise ValueError( 135 | f"Unable to parse table reference {sqlglot.exp.table_name(table)!r} " 136 | f"in {self.table_ref.replace_project(None)}" 137 | ) from e 138 | dependencies.add(add_default_project(table_ref)) 139 | 140 | return dependencies 141 | 142 | @property 143 | def assertion_tests(self) -> list[SQLScript]: 144 | """ 145 | 146 | Assertion tests are gleaned from the comments in the script. They are used to test the 147 | quality of the data. The following tags are supported: 148 | 149 | - #NO_NULLS: Asserts that the column has no null values. 150 | - #UNIQUE: Asserts that the column has unique values. 151 | - #UNIQUE_BY(field): Asserts that the column has unique values when grouped by field. 152 | - #SET{value1, value2, ...}: Asserts that the column only contains the specified elements. 153 | 154 | """ 155 | 156 | def make_table_ref(field, tag): 157 | return TableRef( 158 | dataset=self.table_ref.dataset, 159 | schema=("tests",), 160 | name=f"{'__'.join(self.table_ref.schema)}__{self.table_ref.name}__{field.name}___{tag.lower().lstrip('#')}", 161 | project=self.table_ref.project, 162 | ) 163 | 164 | def make_assertion_test(table_ref, field, tag): 165 | if tag == FieldTag.NO_NULLS: 166 | return SQLScript( 167 | table_ref=make_table_ref(field, FieldTag.NO_NULLS), 168 | code=self.sql_dialect.make_column_test_no_nulls(table_ref, field.name), 169 | sql_dialect=self.sql_dialect, 170 | ) 171 | elif tag == FieldTag.UNIQUE: 172 | return SQLScript( 173 | table_ref=make_table_ref(field, FieldTag.UNIQUE), 174 | code=self.sql_dialect.make_column_test_unique(table_ref, field.name), 175 | sql_dialect=self.sql_dialect, 176 | ) 177 | elif unique_by := re.fullmatch(FieldTag.UNIQUE_BY + r"\((?P<by>.+)\)", tag): 178 | by = unique_by.group("by") 179 | return SQLScript( 180 | table_ref=make_table_ref(field, FieldTag.UNIQUE_BY), 181 | code=self.sql_dialect.make_column_test_unique_by(table_ref, field.name, by), 182 | sql_dialect=self.sql_dialect, 183 | ) 184 | elif set_ := re.fullmatch( 185 | FieldTag.SET + r"\{(?P<elements>'[^']+'(?:,\s*'[^']+')*)\}", tag 186 | ): 187 | elements = {element.strip() for element in set_.group("elements").split(",")} 188 | return SQLScript( 189 | table_ref=make_table_ref(field, FieldTag.SET), 190 | code=self.sql_dialect.make_column_test_set(table_ref, field.name, elements), 191 | sql_dialect=self.sql_dialect, 192 | ) 193 | else: 194 | raise ValueError(f"Unhandled tag: {tag}") 195 | 196 | return [ 197 | # We don't include the target table_ref's project in the assertion test, because that would 198 | # hard-code the project into the code generated by the SQL dialect. This isn't necessary: 199 | # the project will be set downstream in each script anyway.
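            # For instance (hypothetical example), a #UNIQUE tag on an `id` field in a script
            # located at core/users.sql yields an assertion test whose table ref uses the
            # ("tests",) schema and is named "core__users__id___unique".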
200 | make_assertion_test(self.table_ref.replace_project(None), field, tag) 201 | for field in self.fields or [] 202 | for tag in field.tags 203 | if tag not in {FieldTag.INCREMENTAL} 204 | ] 205 | 206 | def replace_table_ref(self, table_ref: TableRef) -> SQLScript: 207 | return dataclasses.replace(self, table_ref=table_ref) 208 | 209 | def __rich__(self): 210 | code = textwrap.dedent(self.code).strip() 211 | code_with_table_ref = f"""-- {self.table_ref}\n\n{code}\n""" 212 | return rich.syntax.Syntax(code_with_table_ref, "sql", line_numbers=False, theme="ansi_dark") 213 | 214 | 215 | Script = SQLScript 216 | 217 | 218 | def read_scripts( 219 | scripts_dir: pathlib.Path, sql_dialect: SQLDialect, dataset_name: str, project_name: str 220 | ) -> list[Script]: 221 | def read_script(path: pathlib.Path) -> Script: 222 | match tuple(path.suffixes): 223 | case (".sql",) | (".sql", ".jinja"): 224 | return SQLScript.from_path( 225 | scripts_dir=scripts_dir, 226 | relative_path=path.relative_to(scripts_dir), 227 | sql_dialect=sql_dialect, 228 | project_name=project_name, 229 | ) 230 | case _: 231 | raise ValueError(f"Unsupported script type: {path}") 232 | 233 | def set_dataset(script: Script) -> Script: 234 | return script.replace_table_ref(script.table_ref.replace_dataset(dataset=dataset_name)) 235 | 236 | return [ 237 | set_dataset(read_script(path)) 238 | for path in scripts_dir.rglob("*") 239 | if not path.is_dir() 240 | and tuple(path.suffixes) in {(".sql",), (".sql", ".jinja"), (".json",)} 241 | and not path.name.startswith("_") 242 | and path.stat().st_size > 0 243 | ] 244 | -------------------------------------------------------------------------------- /lea/session.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import concurrent.futures 4 | import dataclasses 5 | import datetime as dt 6 | import re 7 | import threading 8 | import time 9 | from collections.abc import Callable 10 | 11 | import lea 12 | from lea.databases import DatabaseClient, TableStats 13 | from lea.field import FieldTag 14 | from lea.job import Job, JobStatus 15 | from lea.scripts import Script 16 | from lea.table_ref import TableRef 17 | 18 | 19 | class Session: 20 | def __init__( 21 | self, 22 | database_client: DatabaseClient, 23 | base_dataset: str, 24 | write_dataset: str, 25 | scripts: dict[TableRef, Script], 26 | selected_table_refs: set[TableRef], 27 | unselected_table_refs: set[TableRef], 28 | existing_tables: dict[TableRef, TableStats], 29 | existing_audit_tables: dict[TableRef, TableStats], 30 | incremental_field_name=None, 31 | incremental_field_values=None, 32 | ): 33 | self.database_client = database_client 34 | self.base_dataset = base_dataset 35 | self.write_dataset = write_dataset 36 | self.scripts = scripts 37 | self.selected_table_refs = selected_table_refs 38 | self.unselected_table_refs = unselected_table_refs 39 | self.existing_tables = existing_tables 40 | self.existing_audit_tables = existing_audit_tables 41 | self.incremental_field_name = incremental_field_name 42 | self.incremental_field_values = incremental_field_values 43 | 44 | self.jobs: list[Job] = [] 45 | self.started_at = dt.datetime.now() 46 | self.ended_at: dt.datetime | None = None 47 | self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=None) 48 | self.run_script_futures: dict = {} 49 | self.run_script_futures_complete: dict = {} 50 | self.promote_audit_tables_futures: dict = {} 51 | self.stop_event = threading.Event() 52 | 53 | if 
self.incremental_field_name is not None: 54 | self.filterable_table_refs = { 55 | table_ref.replace_dataset(self.write_dataset) 56 | for table_ref in scripts 57 | if any( 58 | field.name == incremental_field_name 59 | for field in scripts[table_ref].fields or [] 60 | ) 61 | } 62 | self.incremental_table_refs = { 63 | table_ref.replace_dataset(self.write_dataset).remove_audit_suffix() 64 | for table_ref in selected_table_refs | set(existing_audit_tables) 65 | if any( 66 | field.name == incremental_field_name and FieldTag.INCREMENTAL in field.tags 67 | for field in scripts[ 68 | table_ref.remove_audit_suffix().replace_dataset(self.base_dataset) 69 | ].fields 70 | or [] 71 | ) 72 | } 73 | else: 74 | self.filterable_table_refs = set() 75 | self.incremental_table_refs = set() 76 | 77 | def add_write_context_to_table_ref(self, table_ref: TableRef) -> TableRef: 78 | table_ref = table_ref.replace_dataset(self.write_dataset) 79 | table_ref = table_ref.add_audit_suffix() 80 | return table_ref 81 | 82 | def remove_write_context_from_table_ref(self, table_ref: TableRef) -> TableRef: 83 | table_ref = table_ref.replace_dataset(self.base_dataset) 84 | table_ref = table_ref.remove_audit_suffix() 85 | return table_ref 86 | 87 | def add_context_to_script(self, script: Script) -> Script: 88 | def add_context_to_dependency(dependency: TableRef) -> TableRef | None: 89 | # We don't modify the project if is has been deliberately set 90 | if dependency.project is not None and dependency.project != script.table_ref.project: 91 | return None 92 | 93 | if ( 94 | dependency.replace_dataset(self.base_dataset) 95 | in self.selected_table_refs 96 | | { 97 | self.remove_write_context_from_table_ref(table_ref) 98 | for table_ref in self.existing_audit_tables 99 | } 100 | and dependency.replace_dataset(self.base_dataset) in self.scripts 101 | ): 102 | dependency = dependency.add_audit_suffix() 103 | 104 | dependency = dependency.replace_dataset(self.write_dataset) 105 | 106 | return dependency 107 | 108 | script = replace_script_dependencies(script=script, replace_func=add_context_to_dependency) 109 | 110 | # If a script is marked as incremental, it implies that it can be run incrementally. This 111 | # means that we have to filter the script's dependencies, as well as filter the output. 112 | # This logic is implemented by the script's SQL dialect. 113 | if script.table_ref.replace_dataset(self.write_dataset) in self.incremental_table_refs: 114 | script = dataclasses.replace( 115 | script, 116 | code=script.sql_dialect.add_dependency_filters( 117 | code=script.code, 118 | incremental_field_name=self.incremental_field_name, 119 | incremental_field_values=self.incremental_field_values, 120 | # One caveat is the dependencies which are not incremental do not have to be 121 | # filtered. Indeed, they are already filtered by the fact that they are 122 | # incremental. 123 | dependencies_to_filter=self.filterable_table_refs - self.incremental_table_refs, 124 | ), 125 | ) 126 | 127 | # If the script is not incremental, we're not out of the woods! All scripts are 128 | # materialized into side-tables which we call "audit" tables. This is the WAP pattern. 129 | # Therefore, if a script is not incremental, but it depends on an incremental script, we 130 | # have to modify the script to use both the incremental and non-incremental versions of 131 | # the dependency. This is handled by the script's SQL dialect. 
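        # For example, in test_big_query.py a reference to core__users inside a non-incremental
        # downstream script ends up being rewritten along these lines:
        #
        #   SELECT *
        #   FROM test_project.write.core__users___audit
        #   WHERE name IN ('Alice')
        #
        #   UNION ALL
        #
        #   SELECT *
        #   FROM test_project.write.core__users
        #   WHERE name NOT IN ('Alice')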
132 | elif self.incremental_table_refs: 133 | script = dataclasses.replace( 134 | script, 135 | code=script.sql_dialect.handle_incremental_dependencies( 136 | code=script.code, 137 | incremental_field_name=self.incremental_field_name, 138 | incremental_field_values=self.incremental_field_values, 139 | incremental_dependencies={ 140 | incremental_table_ref: incremental_table_ref.add_audit_suffix() 141 | for incremental_table_ref in self.incremental_table_refs 142 | }, 143 | ), 144 | ) 145 | 146 | return script.replace_table_ref(self.add_write_context_to_table_ref(script.table_ref)) 147 | 148 | def run_script(self, script: Script): 149 | # If the script is a test, we don't materialize it, we just query it. A test fails if it 150 | # returns any rows. 151 | if script.is_test: 152 | database_job = self.database_client.query_script(script=script) 153 | # If the script is not a test, it's a regular table, so we materialize it. Instead of 154 | # directly materializing it to the destination table, we materialize it to a side-table 155 | # which we call an "audit" table. Once all the scripts have run successfully, we will 156 | # promote the audit tables to the destination tables. This is the WAP pattern. 157 | else: 158 | database_job = self.database_client.materialize_script(script=script) 159 | 160 | job = Job(table_ref=script.table_ref, is_test=script.is_test, database_job=database_job) 161 | self.jobs.append(job) 162 | 163 | msg = f"{job.status} {script.table_ref}" 164 | 165 | if script.table_ref.remove_audit_suffix() in self.incremental_table_refs: 166 | msg += " (incremental)" 167 | lea.log.info(msg) 168 | 169 | self.monitor_job(job) 170 | 171 | def monitor_job(self, job: Job): 172 | # We're going to do exponential backoff. This is because we don't want to overload 173 | # whatever API is used to check whether a database job is over or not. We're going to 174 | # check every second, then every two seconds, then every four seconds, etc. until we 175 | # reach a maximum delay of 10 seconds. 176 | base_delay = 1 177 | max_delay = 10 178 | retries = 0 179 | checked_at = dt.datetime.now() 180 | 181 | while not self.stop_event.is_set(): 182 | if not job.database_job.is_done: 183 | delay = min(max_delay, base_delay * (2**retries)) 184 | retries += 1 185 | if (now := dt.datetime.now()) - checked_at >= dt.timedelta(seconds=10): 186 | duration_str = str(now - job.started_at).split(".")[0] 187 | lea.log.info(f"{job.status} {job.table_ref} after {duration_str}") 188 | checked_at = now 189 | time.sleep(delay) 190 | continue 191 | 192 | # Case 1: the job raised an exception 193 | if (exception := job.database_job.exception) is not None: 194 | job.status = JobStatus.ERRORED 195 | lea.log.error(f"{job.status} {job.table_ref}\n{exception}") 196 | 197 | # Case 2: the job succeeded, but it's a test and there are negative cases 198 | elif job.is_test and not (dataframe := job.database_job.result).empty: 199 | job.status = JobStatus.ERRORED 200 | lea.log.error(f"{job.status} {job.table_ref}\n{dataframe.head()}") 201 | 202 | # Case 3: the job succeeded! 203 | else: 204 | job.status = JobStatus.SUCCESS 205 | msg = f"{job.status} {job.table_ref}" 206 | job.ended_at = dt.datetime.now() 207 | # Depending on the warehouse in use, jobs may have a conclude() method, for example 208 | # for recording job statistics. 
209 | job.database_job.conclude() 210 | duration_str = str(job.ended_at - job.started_at).split(".")[0] 211 | if job.ended_at - job.started_at >= dt.timedelta(seconds=1): 212 | msg += f", took {duration_str}" 213 | if job.database_job.billed_dollars is not None: 214 | msg += f", cost ${job.database_job.billed_dollars:.2f}" 215 | if not job.is_test: 216 | if (stats := job.database_job.statistics) is not None: 217 | msg += f", contains {stats.n_rows:,d} rows" 218 | if stats.n_bytes is not None: 219 | msg += f", weighs {format_bytes(stats.n_bytes)}" 220 | if job.database_job.metadata: 221 | msg += f" ({', '.join(job.database_job.metadata)})" 222 | lea.log.info(msg) 223 | 224 | return 225 | 226 | def promote_audit_table(self, table_ref: TableRef): 227 | from_table_ref = table_ref 228 | to_table_ref = table_ref.remove_audit_suffix() 229 | 230 | is_incremental = ( 231 | self.incremental_field_name is not None and to_table_ref in self.incremental_table_refs 232 | ) 233 | if is_incremental: 234 | database_job = self.database_client.delete_and_insert( 235 | from_table_ref=from_table_ref, 236 | to_table_ref=to_table_ref, 237 | on=self.incremental_field_name, 238 | ) 239 | else: 240 | database_job = self.database_client.clone_table( 241 | from_table_ref=from_table_ref, to_table_ref=to_table_ref 242 | ) 243 | 244 | job = Job(table_ref=to_table_ref, is_test=False, database_job=database_job) 245 | self.jobs.append(job) 246 | lea.log.info(f"{job.status} {job.table_ref}" + (" (incremental)" if is_incremental else "")) 247 | 248 | self.monitor_job(job) 249 | 250 | def end(self): 251 | lea.log.info("😴 Ending session") 252 | self.stop_event.set() 253 | for job in self.jobs: 254 | if job.status == JobStatus.RUNNING: 255 | job.database_job.stop() 256 | job.status = JobStatus.STOPPED 257 | lea.log.info(f"{job.status} {job.table_ref}") 258 | self.executor.shutdown() 259 | self.ended_at = dt.datetime.now() 260 | 261 | @property 262 | def any_error_has_occurred(self) -> bool: 263 | return any(job.status == JobStatus.ERRORED for job in self.jobs) or any( 264 | future.exception() is not None for future in self.run_script_futures_complete 265 | ) 266 | 267 | @property 268 | def total_billed_dollars(self) -> float: 269 | return sum( 270 | job.database_job.billed_dollars 271 | for job in self.jobs 272 | if job.database_job.billed_dollars is not None 273 | ) 274 | 275 | 276 | def replace_script_dependencies( 277 | script: Script, replace_func: Callable[[TableRef], TableRef] 278 | ) -> Script: 279 | """ 280 | 281 | It's often necessary to edit the dependencies of a script. For example, we might want 282 | to change the dataset of a dependency. Or we might want to append a suffix a table name 283 | when we're doing a write/audit/publish operation. 284 | 285 | """ 286 | code = script.code 287 | 288 | for dependency_to_edit in script.dependencies: 289 | new_dependency = replace_func(dependency_to_edit) 290 | if new_dependency is None: 291 | continue 292 | 293 | dependency_to_edit_without_project_str = script.sql_dialect.format_table_ref( 294 | dependency_to_edit.replace_project(None) 295 | ) 296 | new_dependency_str = script.sql_dialect.format_table_ref(new_dependency) 297 | code = re.sub( 298 | rf"\b{dependency_to_edit_without_project_str}\b", 299 | new_dependency_str, 300 | code, 301 | ) 302 | 303 | # We also have to handle the case where the table is referenced to access a field. 
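        # For instance, with the BigQuery dialect a dependency that appears as
        # my_dataset.core__users in a FROM clause can also be referenced as core__users.some_field
        # when a column is qualified with the table name (my_dataset and some_field are just
        # placeholders). The second substitution below covers that case by dropping the dataset.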
304 | # TODO: refactor this with the above 305 | dependency_to_edit_without_dataset = dataclasses.replace( 306 | dependency_to_edit, dataset="", project=None 307 | ) 308 | dependency_to_edit_without_dataset_str = script.sql_dialect.format_table_ref( 309 | dependency_to_edit_without_dataset 310 | ) 311 | new_dependency_without_dataset = dataclasses.replace( 312 | new_dependency, dataset="", project=None 313 | ) 314 | new_dependency_without_dataset_str = script.sql_dialect.format_table_ref( 315 | new_dependency_without_dataset 316 | ) 317 | code = re.sub( 318 | rf"\b{dependency_to_edit_without_dataset_str}\b", 319 | new_dependency_without_dataset_str, 320 | code, 321 | ) 322 | 323 | return dataclasses.replace(script, code=code) 324 | 325 | 326 | def format_bytes(size: float) -> str: 327 | # Define the size units in ascending order 328 | power = 1024 329 | n = 0 330 | units = ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"] 331 | 332 | # Convert bytes to the highest possible unit 333 | while size >= power and n < len(units) - 1: 334 | size /= power 335 | n += 1 336 | 337 | # Format the result with two decimal places 338 | return f"{size:.0f}{units[n]}" 339 | -------------------------------------------------------------------------------- /lea/table_ref.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dataclasses 4 | import pathlib 5 | import re 6 | 7 | AUDIT_TABLE_SUFFIX = "___audit" 8 | 9 | 10 | @dataclasses.dataclass(eq=True, frozen=True) 11 | class TableRef: 12 | dataset: str 13 | schema: tuple[str, ...] 14 | name: str 15 | project: str | None 16 | 17 | def __str__(self): 18 | return ".".join(filter(None, [self.project, self.dataset, *self.schema, self.name])) 19 | 20 | @classmethod 21 | def from_path( 22 | cls, scripts_dir: pathlib.Path, relative_path: pathlib.Path, project_name: str 23 | ) -> TableRef: 24 | parts = list(filter(None, relative_path.parts)) 25 | *schema, filename = parts 26 | return cls( 27 | dataset=scripts_dir.name, 28 | schema=tuple(schema), 29 | name=filename.split(".")[0], # remove the extension 30 | project=project_name, 31 | ) 32 | 33 | def replace_dataset(self, dataset: str) -> TableRef: 34 | return dataclasses.replace(self, dataset=dataset) 35 | 36 | def replace_project(self, project: str) -> TableRef: 37 | return dataclasses.replace(self, project=project) 38 | 39 | def add_audit_suffix(self) -> TableRef: 40 | if self.is_audit_table: 41 | return self 42 | return dataclasses.replace(self, name=f"{self.name}{AUDIT_TABLE_SUFFIX}") 43 | 44 | def remove_audit_suffix(self) -> TableRef: 45 | if self.is_audit_table: 46 | return dataclasses.replace(self, name=re.sub(rf"{AUDIT_TABLE_SUFFIX}$", "", self.name)) 47 | return self 48 | 49 | @property 50 | def is_audit_table(self) -> bool: 51 | return self.name.endswith(AUDIT_TABLE_SUFFIX) 52 | 53 | @property 54 | def is_test(self) -> bool: 55 | return len(self.schema) > 0 and self.schema[0] == "tests" 56 | -------------------------------------------------------------------------------- /lea/test_big_query.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import re 4 | 5 | import pytest 6 | from google.auth.credentials import AnonymousCredentials 7 | 8 | from lea.conductor import Session 9 | from lea.databases import BigQueryClient, TableStats 10 | from lea.dialects import BigQueryDialect 11 | from lea.scripts import Script, TableRef 12 | 13 | 
DUMMY_TABLE_STATS = TableStats(n_rows=0, n_bytes=0, updated_at=None) 14 | 15 | 16 | @pytest.fixture 17 | def scripts() -> dict[TableRef, Script]: 18 | return { 19 | script.table_ref: script 20 | for script in [ 21 | Script( 22 | table_ref=TableRef("read", ("raw",), "users", "test_project"), 23 | code=""" 24 | SELECT * FROM UNNEST([ 25 | STRUCT(1 AS id, 'Alice' AS name, 30 AS age), 26 | STRUCT(2 AS id, 'Bob' AS name, 25 AS age), 27 | STRUCT(3 AS id, 'Charlie' AS name, 35 AS age) 28 | ]) 29 | """, 30 | sql_dialect=BigQueryDialect(), 31 | ), 32 | Script( 33 | table_ref=TableRef("read", ("core",), "users", "test_project"), 34 | code=""" 35 | SELECT 36 | id, 37 | -- #INCREMENTAL 38 | name, 39 | age 40 | FROM read.raw__users 41 | """, 42 | sql_dialect=BigQueryDialect(), 43 | ), 44 | Script( 45 | table_ref=TableRef("read", ("analytics",), "n_users", "test_project"), 46 | code=""" 47 | SELECT COUNT(*) 48 | FROM read.core__users 49 | """, 50 | sql_dialect=BigQueryDialect(), 51 | ), 52 | Script( 53 | table_ref=TableRef("read", ("analytics",), "n_users_with_unnest", "test_project"), 54 | code=""" 55 | SELECT COUNT(*) 56 | FROM read.core__users, UNNEST([1, 2, 3]) AS n 57 | """, 58 | sql_dialect=BigQueryDialect(), 59 | ), 60 | ] 61 | } 62 | 63 | 64 | def assert_queries_are_equal(query1: str, query2: str): 65 | normalized_query1 = re.sub(r"\s+", " ", query1).strip() 66 | normalized_query2 = re.sub(r"\s+", " ", query2).strip() 67 | assert normalized_query1 == normalized_query2 68 | 69 | 70 | def test_simple_run(scripts): 71 | session = Session( 72 | database_client=None, 73 | base_dataset="read", 74 | write_dataset="write", 75 | scripts=scripts, 76 | selected_table_refs=scripts.keys(), 77 | unselected_table_refs=set(), 78 | existing_tables={}, 79 | existing_audit_tables={}, 80 | ) 81 | 82 | assert_queries_are_equal( 83 | session.add_context_to_script( 84 | scripts[TableRef("read", ("raw",), "users", "test_project")] 85 | ).code, 86 | """ 87 | SELECT * FROM UNNEST([ 88 | STRUCT(1 AS id, 'Alice' AS name, 30 AS age), 89 | STRUCT(2 AS id, 'Bob' AS name, 25 AS age), 90 | STRUCT(3 AS id, 'Charlie' AS name, 35 AS age) 91 | ]) 92 | """, 93 | ) 94 | assert_queries_are_equal( 95 | session.add_context_to_script( 96 | scripts[TableRef("read", ("analytics",), "n_users", "test_project")] 97 | ).code, 98 | """ 99 | SELECT COUNT(*) 100 | FROM test_project.write.core__users___audit 101 | """, 102 | ) 103 | 104 | 105 | def test_incremental_field(scripts): 106 | session = Session( 107 | database_client=None, 108 | base_dataset="read", 109 | write_dataset="write", 110 | scripts=scripts, 111 | selected_table_refs=scripts.keys(), 112 | unselected_table_refs=set(), 113 | existing_tables={}, 114 | existing_audit_tables={}, 115 | incremental_field_name="name", 116 | incremental_field_values={"Alice"}, 117 | ) 118 | 119 | assert_queries_are_equal( 120 | session.add_context_to_script( 121 | scripts[TableRef("read", ("core",), "users", "test_project")] 122 | ).code, 123 | """ 124 | SELECT * 125 | FROM ( 126 | SELECT id, name, age 127 | FROM test_project.write.raw__users___audit 128 | ) 129 | WHERE name IN ('Alice') 130 | """, 131 | ) 132 | 133 | assert_queries_are_equal( 134 | session.add_context_to_script( 135 | scripts[TableRef("read", ("analytics",), "n_users", "test_project")] 136 | ).code, 137 | """ 138 | SELECT COUNT(*) FROM ( 139 | SELECT * 140 | FROM test_project.write.core__users___audit 141 | WHERE name IN ('Alice') 142 | 143 | UNION ALL 144 | 145 | SELECT * 146 | FROM test_project.write.core__users 147 | WHERE name NOT 
IN ('Alice') 148 | ) 149 | """, 150 | ) 151 | 152 | 153 | def test_incremental_field_with_comma(scripts): 154 | session = Session( 155 | database_client=None, 156 | base_dataset="read", 157 | write_dataset="write", 158 | scripts=scripts, 159 | selected_table_refs=scripts.keys(), 160 | unselected_table_refs=set(), 161 | existing_tables={}, 162 | existing_audit_tables={}, 163 | incremental_field_name="name", 164 | incremental_field_values={"Alice"}, 165 | ) 166 | 167 | assert_queries_are_equal( 168 | session.add_context_to_script( 169 | scripts[TableRef("read", ("core",), "users", "test_project")] 170 | ).code, 171 | """ 172 | SELECT * 173 | FROM ( 174 | SELECT id, name, age 175 | FROM test_project.write.raw__users___audit 176 | ) 177 | WHERE name IN ('Alice') 178 | """, 179 | ) 180 | 181 | assert_queries_are_equal( 182 | session.add_context_to_script( 183 | scripts[TableRef("read", ("analytics",), "n_users_with_unnest", "test_project")] 184 | ).code, 185 | """ 186 | SELECT COUNT(*) FROM ( 187 | SELECT * 188 | FROM test_project.write.core__users___audit 189 | WHERE name IN ('Alice') 190 | 191 | UNION ALL 192 | 193 | SELECT * 194 | FROM test_project.write.core__users 195 | WHERE name NOT IN ('Alice') 196 | ) , UNNEST([1, 2, 3]) AS n 197 | """, 198 | ) 199 | 200 | 201 | def test_incremental_field_but_no_incremental_table_selected(scripts): 202 | session = Session( 203 | database_client=None, 204 | base_dataset="read", 205 | write_dataset="write", 206 | scripts=scripts, 207 | selected_table_refs={TableRef("read", ("analytics",), "n_users", "test_project")}, 208 | unselected_table_refs=set(), 209 | existing_tables={}, 210 | existing_audit_tables={}, 211 | incremental_field_name="name", 212 | incremental_field_values={"Alice"}, 213 | ) 214 | 215 | assert_queries_are_equal( 216 | session.add_context_to_script( 217 | scripts[TableRef("read", ("core",), "users", "test_project")] 218 | ).code, 219 | """ 220 | SELECT 221 | id, 222 | -- #INCREMENTAL 223 | name, 224 | age 225 | FROM test_project.write.raw__users 226 | """, 227 | ) 228 | 229 | 230 | def test_incremental_field_with_just_incremental_table_selected(scripts): 231 | session = Session( 232 | database_client=None, 233 | base_dataset="read", 234 | write_dataset="write", 235 | scripts=scripts, 236 | selected_table_refs={TableRef("read", ("core",), "users", "test_project")}, 237 | unselected_table_refs=set(), 238 | existing_tables={}, 239 | existing_audit_tables={}, 240 | incremental_field_name="name", 241 | incremental_field_values={"Alice"}, 242 | ) 243 | 244 | assert_queries_are_equal( 245 | session.add_context_to_script( 246 | scripts[TableRef("read", ("core",), "users", "test_project")] 247 | ).code, 248 | """ 249 | SELECT * 250 | FROM ( 251 | SELECT id, name, age 252 | FROM test_project.write.raw__users 253 | ) 254 | WHERE name IN ('Alice') 255 | """, 256 | ) 257 | 258 | 259 | def test_incremental_field_with_just_incremental_table_selected_and_materialized_dependency( 260 | scripts, 261 | ): 262 | session = Session( 263 | database_client=None, 264 | base_dataset="read", 265 | write_dataset="write", 266 | scripts=scripts, 267 | selected_table_refs={TableRef("read", ("core",), "users", "test_project")}, 268 | unselected_table_refs=set(), 269 | existing_tables={}, 270 | existing_audit_tables={ 271 | TableRef("read", ("raw",), "users", "test_project"): DUMMY_TABLE_STATS 272 | }, 273 | incremental_field_name="name", 274 | incremental_field_values={"Alice"}, 275 | ) 276 | 277 | assert_queries_are_equal( 278 | session.add_context_to_script( 279 | 
scripts[TableRef("read", ("core",), "users", "test_project")] 280 | ).code, 281 | """ 282 | SELECT * 283 | FROM ( 284 | SELECT id, name, age 285 | FROM test_project.write.raw__users___audit 286 | ) 287 | WHERE name IN ('Alice') 288 | """, 289 | ) 290 | 291 | 292 | def test_incremental_field_but_no_incremental_table_selected_and_yet_dependency_is_materialized( 293 | scripts, 294 | ): 295 | session = Session( 296 | database_client=None, 297 | base_dataset="read", 298 | write_dataset="write", 299 | scripts=scripts, 300 | selected_table_refs={TableRef("read", ("analytics",), "n_users", "test_project")}, 301 | unselected_table_refs=set(), 302 | existing_tables={}, 303 | existing_audit_tables={ 304 | TableRef("read", ("core",), "users", "test_project"): DUMMY_TABLE_STATS, 305 | }, 306 | incremental_field_name="name", 307 | incremental_field_values={"Alice"}, 308 | ) 309 | 310 | assert_queries_are_equal( 311 | session.add_context_to_script( 312 | scripts[TableRef("read", ("analytics",), "n_users", "test_project")] 313 | ).code, 314 | """ 315 | SELECT COUNT(*) 316 | FROM ( 317 | SELECT * 318 | FROM test_project.write.core__users___audit 319 | WHERE name IN ('Alice') 320 | 321 | UNION ALL 322 | 323 | SELECT * 324 | FROM test_project.write.core__users 325 | WHERE name NOT IN ('Alice') 326 | ) 327 | """, 328 | ) 329 | 330 | 331 | def test_incremental_field_but_no_incremental_table_selected_and_yet_dependency_is_materialized_with_client( 332 | scripts, 333 | ): 334 | session = Session( 335 | database_client=BigQueryClient( 336 | credentials=AnonymousCredentials(), 337 | location="EU", 338 | write_project_id="write-project-id", 339 | compute_project_id="compute-project-id", 340 | ), 341 | base_dataset="read", 342 | write_dataset="write", 343 | scripts=scripts, 344 | selected_table_refs={TableRef("read", ("analytics",), "n_users", "test_project")}, 345 | unselected_table_refs=set(), 346 | existing_tables={}, 347 | existing_audit_tables={ 348 | TableRef("read", ("core",), "users", "test_project"): DUMMY_TABLE_STATS, 349 | }, 350 | incremental_field_name="name", 351 | incremental_field_values={"Alice"}, 352 | ) 353 | 354 | assert_queries_are_equal( 355 | session.add_context_to_script( 356 | scripts[TableRef("read", ("analytics",), "n_users", "test_project")] 357 | ).code, 358 | """ 359 | SELECT COUNT(*) 360 | FROM ( 361 | SELECT * 362 | FROM test_project.write.core__users___audit 363 | WHERE name IN ('Alice') 364 | 365 | UNION ALL 366 | 367 | SELECT * 368 | FROM test_project.write.core__users 369 | WHERE name NOT IN ('Alice') 370 | ) 371 | """, 372 | ) 373 | -------------------------------------------------------------------------------- /lea/test_duckdb.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import re 4 | from pathlib import Path 5 | 6 | import pytest 7 | 8 | from lea.conductor import Session 9 | from lea.databases import DuckDBClient, TableStats 10 | from lea.dialects import DuckDBDialect 11 | from lea.scripts import Script, TableRef 12 | 13 | DUMMY_TABLE_STATS = TableStats(n_rows=0, n_bytes=0, updated_at=None) 14 | 15 | 16 | @pytest.fixture 17 | def scripts() -> dict[TableRef, Script]: 18 | return { 19 | script.table_ref: script 20 | for script in [ 21 | Script( 22 | table_ref=TableRef("read", ("raw",), "users", "test_project"), 23 | code=""" 24 | SELECT * FROM ( 25 | SELECT UNNEST( 26 | [ 27 | {'id': 1, 'name': 'Alice', 'age': 30}, 28 | {'id': 2, 'name': 'Bob', 'age': 25}, 29 | {'id': 3, 'name': 'Charlie', 
'age': 35} 30 | ], max_depth => 2 31 | ) 32 | ) 33 | """, 34 | sql_dialect=DuckDBDialect(), 35 | ), 36 | Script( 37 | table_ref=TableRef("read", ("core",), "users", "test_project"), 38 | code=""" 39 | SELECT 40 | id, 41 | -- #INCREMENTAL 42 | name, 43 | age 44 | FROM raw.users 45 | """, 46 | sql_dialect=DuckDBDialect(), 47 | ), 48 | Script( 49 | table_ref=TableRef("read", ("analytics",), "n_users", "test_project"), 50 | code=""" 51 | SELECT COUNT(*) 52 | FROM core.users 53 | """, 54 | sql_dialect=DuckDBDialect(), 55 | ), 56 | ] 57 | } 58 | 59 | 60 | def assert_queries_are_equal(query1: str, query2: str): 61 | normalized_query1 = re.sub(r"\s+", " ", query1).strip() 62 | normalized_query2 = re.sub(r"\s+", " ", query2).strip() 63 | assert normalized_query1 == normalized_query2 64 | 65 | 66 | def test_simple_run(scripts): 67 | session = Session( 68 | database_client=None, 69 | base_dataset="read", 70 | write_dataset="write", 71 | scripts=scripts, 72 | selected_table_refs=scripts.keys(), 73 | unselected_table_refs=set(), 74 | existing_tables={}, 75 | existing_audit_tables={}, 76 | ) 77 | 78 | assert_queries_are_equal( 79 | session.add_context_to_script( 80 | scripts[TableRef("read", ("raw",), "users", "test_project")] 81 | ).code, 82 | """ 83 | SELECT * FROM ( 84 | SELECT UNNEST( 85 | [ 86 | {'id': 1, 'name': 'Alice', 'age': 30}, 87 | {'id': 2, 'name': 'Bob', 'age': 25}, 88 | {'id': 3, 'name': 'Charlie', 'age': 35} 89 | ], max_depth => 2 90 | ) 91 | ) 92 | """, 93 | ) 94 | assert_queries_are_equal( 95 | session.add_context_to_script( 96 | scripts[TableRef("read", ("analytics",), "n_users", "test_project")] 97 | ).code, 98 | """ 99 | SELECT COUNT(*) 100 | FROM core.users___audit 101 | """, 102 | ) 103 | 104 | 105 | def test_incremental_field(scripts): 106 | session = Session( 107 | database_client=None, 108 | base_dataset="read", 109 | write_dataset="write", 110 | scripts=scripts, 111 | selected_table_refs=scripts.keys(), 112 | unselected_table_refs=set(), 113 | existing_tables={}, 114 | existing_audit_tables={}, 115 | incremental_field_name="name", 116 | incremental_field_values={"Alice"}, 117 | ) 118 | 119 | assert_queries_are_equal( 120 | session.add_context_to_script( 121 | scripts[TableRef("read", ("core",), "users", "test_project")] 122 | ).code, 123 | """ 124 | SELECT * 125 | FROM ( 126 | SELECT id, name, age 127 | FROM raw.users___audit 128 | ) 129 | WHERE name IN ('Alice') 130 | """, 131 | ) 132 | 133 | assert_queries_are_equal( 134 | session.add_context_to_script( 135 | scripts[TableRef("read", ("analytics",), "n_users", "test_project")] 136 | ).code, 137 | """ 138 | SELECT COUNT(*) FROM ( 139 | SELECT * 140 | FROM core.users___audit 141 | WHERE name IN ('Alice') 142 | 143 | UNION ALL 144 | 145 | SELECT * 146 | FROM core.users 147 | WHERE name NOT IN ('Alice') 148 | ) 149 | """, 150 | ) 151 | 152 | 153 | def test_incremental_field_but_no_incremental_table_selected(scripts): 154 | session = Session( 155 | database_client=None, 156 | base_dataset="read", 157 | write_dataset="write", 158 | scripts=scripts, 159 | selected_table_refs={TableRef("read", ("analytics",), "n_users", "test_project")}, 160 | unselected_table_refs=set(), 161 | existing_tables={}, 162 | existing_audit_tables={}, 163 | incremental_field_name="name", 164 | incremental_field_values={"Alice"}, 165 | ) 166 | 167 | assert_queries_are_equal( 168 | session.add_context_to_script( 169 | scripts[TableRef("read", ("core",), "users", "test_project")] 170 | ).code, 171 | """ 172 | SELECT 173 | id, 174 | -- #INCREMENTAL 175 | 
name, 176 | age 177 | FROM raw.users 178 | """, 179 | ) 180 | 181 | 182 | @pytest.mark.duckdb 183 | def test_incremental_field_with_just_incremental_table_selected(scripts): 184 | session = Session( 185 | database_client=None, 186 | base_dataset="read", 187 | write_dataset="write", 188 | scripts=scripts, 189 | selected_table_refs={TableRef("read", ("core",), "users", "test_project")}, 190 | unselected_table_refs=set(), 191 | existing_tables={}, 192 | existing_audit_tables={}, 193 | incremental_field_name="name", 194 | incremental_field_values={"Alice"}, 195 | ) 196 | 197 | assert_queries_are_equal( 198 | session.add_context_to_script( 199 | scripts[TableRef("read", ("core",), "users", "test_project")] 200 | ).code, 201 | """ 202 | SELECT * 203 | FROM ( 204 | SELECT id, name, age 205 | FROM raw.users 206 | ) 207 | WHERE name IN ('Alice') 208 | """, 209 | ) 210 | 211 | 212 | def test_incremental_field_with_just_incremental_table_selected_and_materialized_dependency( 213 | scripts, 214 | ): 215 | session = Session( 216 | database_client=None, 217 | base_dataset="read", 218 | write_dataset="write", 219 | scripts=scripts, 220 | selected_table_refs={TableRef("read", ("core",), "users", "test_project")}, 221 | unselected_table_refs=set(), 222 | existing_tables={}, 223 | existing_audit_tables={ 224 | TableRef("read", ("raw",), "users", "test_project"): DUMMY_TABLE_STATS 225 | }, 226 | incremental_field_name="name", 227 | incremental_field_values={"Alice"}, 228 | ) 229 | 230 | assert_queries_are_equal( 231 | session.add_context_to_script( 232 | scripts[TableRef("read", ("core",), "users", "test_project")] 233 | ).code, 234 | """ 235 | SELECT * 236 | FROM ( 237 | SELECT id, name, age 238 | FROM raw.users___audit 239 | ) 240 | WHERE name IN ('Alice') 241 | """, 242 | ) 243 | 244 | 245 | def test_incremental_field_but_no_incremental_table_selected_and_yet_dependency_is_materialized( 246 | scripts, 247 | ): 248 | session = Session( 249 | database_client=None, 250 | base_dataset="read", 251 | write_dataset="write", 252 | scripts=scripts, 253 | selected_table_refs={TableRef("read", ("analytics",), "n_users", "test_project")}, 254 | unselected_table_refs=set(), 255 | existing_tables={}, 256 | existing_audit_tables={ 257 | TableRef("read", ("core",), "users", "test_project"): DUMMY_TABLE_STATS, 258 | }, 259 | incremental_field_name="name", 260 | incremental_field_values={"Alice"}, 261 | ) 262 | 263 | assert_queries_are_equal( 264 | session.add_context_to_script( 265 | scripts[TableRef("read", ("analytics",), "n_users", "test_project")] 266 | ).code, 267 | """ 268 | SELECT COUNT(*) 269 | FROM ( 270 | SELECT * 271 | FROM core.users___audit 272 | WHERE name IN ('Alice') 273 | 274 | UNION ALL 275 | 276 | SELECT * 277 | FROM core.users 278 | WHERE name NOT IN ('Alice') 279 | ) 280 | """, 281 | ) 282 | 283 | 284 | def test_incremental_field_but_no_incremental_table_selected_and_yet_dependency_is_materialized_with_client( 285 | scripts, 286 | ): 287 | session = Session( 288 | database_client=DuckDBClient( 289 | database_path=Path("./test_duckdb"), 290 | dry_run=False, 291 | print_mode=False, 292 | ), 293 | base_dataset="read", 294 | write_dataset="write", 295 | scripts=scripts, 296 | selected_table_refs={TableRef("read", ("analytics",), "n_users", "test_project")}, 297 | unselected_table_refs=set(), 298 | existing_tables={}, 299 | existing_audit_tables={ 300 | TableRef("read", ("core",), "users", "test_project"): DUMMY_TABLE_STATS, 301 | }, 302 | incremental_field_name="name", 303 | 
incremental_field_values={"Alice"}, 304 | ) 305 | 306 | assert_queries_are_equal( 307 | session.add_context_to_script( 308 | scripts[TableRef("read", ("analytics",), "n_users", "test_project")] 309 | ).code, 310 | """ 311 | SELECT COUNT(*) 312 | FROM ( 313 | SELECT * 314 | FROM core.users___audit 315 | WHERE name IN ('Alice') 316 | 317 | UNION ALL 318 | 319 | SELECT * 320 | FROM core.users 321 | WHERE name NOT IN ('Alice') 322 | ) 323 | """, 324 | ) 325 | -------------------------------------------------------------------------------- /lea/test_table_ref.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pathlib 4 | 5 | import pytest 6 | 7 | from lea.table_ref import TableRef 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "table_ref, expected", 12 | [ 13 | pytest.param(table_ref, expected, id=str(table_ref)) 14 | for table_ref, expected in [ 15 | ( 16 | TableRef("my_dataset", ("my_schema",), "my_table", "my_project"), 17 | "my_project.my_dataset.my_schema.my_table", 18 | ), 19 | ( 20 | TableRef("my_dataset", (), "my_table", "my_project"), 21 | "my_project.my_dataset.my_table", 22 | ), 23 | ( 24 | TableRef("my_dataset", ("my_schema", "my_subschema"), "my_table", "my_project"), 25 | "my_project.my_dataset.my_schema.my_subschema.my_table", 26 | ), 27 | ] 28 | ], 29 | ) 30 | def test_str(table_ref, expected): 31 | assert str(table_ref) == expected 32 | 33 | 34 | @pytest.mark.parametrize( 35 | "table_ref, expected", 36 | [ 37 | pytest.param(table_ref, expected, id=str(table_ref)) 38 | for table_ref, expected in [ 39 | ( 40 | TableRef("my_dataset", ("my_schema",), "my_table", None), 41 | "TableRef(dataset='my_dataset', schema=('my_schema',), name='my_table', project=None)", 42 | ), 43 | ( 44 | TableRef("my_dataset", (), "my_table", None), 45 | "TableRef(dataset='my_dataset', schema=(), name='my_table', project=None)", 46 | ), 47 | ( 48 | TableRef("my_dataset", ("my_schema", "my_subschema"), "my_table", "my_project"), 49 | "TableRef(dataset='my_dataset', schema=('my_schema', 'my_subschema'), name='my_table', project='my_project')", 50 | ), 51 | ] 52 | ], 53 | ) 54 | def test_repr(table_ref, expected): 55 | assert repr(table_ref) == expected 56 | 57 | 58 | def test_from_path(): 59 | scripts_dir = pathlib.Path("my_dataset") 60 | relative_path = pathlib.Path("my_schema/my_table.sql") 61 | table_ref = TableRef.from_path(scripts_dir, relative_path, "my_project") 62 | assert table_ref == TableRef("my_dataset", ("my_schema",), "my_table", "my_project") 63 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | authors = ["Max Halford "] 3 | description = "A minimalist alternative to dbt" 4 | name = "lea-cli" 5 | packages = [ 6 | {include = "lea", from = "."}, 7 | ] 8 | version = "0.10.3" 9 | 10 | [tool.poetry.dependencies] 11 | click = "^8.1.7" 12 | Jinja2 = "^3.1.2" 13 | db-dtypes = "^1.1.1" 14 | duckdb = "^1.0.0" 15 | gitpython = "^3.1.43" 16 | google-cloud-bigquery = "^3.11.4" 17 | pandas = "^2.1.3" 18 | python = ">=3.10,<4" 19 | python-dotenv = "^1.0.0" 20 | rich = ">=13.5.3,<15.0.0" 21 | sqlglot = "^26.0.0" 22 | rsa = "^4.7" 23 | google-cloud-bigquery-storage = "^2.27.0" 24 | requests = "^2.32.3" 25 | 26 | [tool.poetry.group.dev.dependencies] 27 | ipykernel = "^6.21.2" 28 | pre-commit = ">=3.5,<5.0" 29 | pytest = ">=7.4.2,<9.0.0" 30 | ruff = ">=0.1,<0.12" 31 | 32 | 
[build-system] 33 | build-backend = "poetry.core.masonry.api" 34 | requires = ["poetry-core>=1.0.0"] 35 | 36 | [tool.poetry.scripts] 37 | lea = "lea.cli:app" 38 | 39 | [tool.ruff] 40 | lint.ignore = ["E501"] 41 | line-length = 100 42 | lint.select = ["E", "F", "I", "UP"] # https://beta.ruff.rs/docs/rules/ 43 | target-version = 'py310' 44 | 45 | [tool.ruff.lint.isort] 46 | required-imports = ["from __future__ import annotations"] 47 | 48 | [tool.pytest.ini_options] 49 | addopts = [ 50 | "--doctest-modules", 51 | "--doctest-glob=README.md", 52 | "--ignore=examples", 53 | "--verbose", 54 | "--color=yes", 55 | ] 56 | markers = [ 57 | "duckdb: quack quack" 58 | ] 59 | --------------------------------------------------------------------------------