├── .github
│   ├── actions
│   │   └── install-env
│   │       └── action.yml
│   ├── dependabot.yml
│   └── workflows
│       ├── code-quality.yml
│       └── unit-tests.yml
├── .gitignore
├── .gitmodules
├── .pre-commit-config.yaml
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── examples
│   ├── diff
│   │   ├── README.md
│   │   └── views
│   │       ├── dev
│   │       │   ├── analytics
│   │       │   │   └── kpis.sql
│   │       │   ├── core
│   │       │   │   └── orders.sql.jinja
│   │       │   └── staging
│   │       │       ├── customers.py
│   │       │       ├── orders.py
│   │       │       └── payments.py
│   │       └── prod
│   │           ├── core
│   │           │   ├── customers.sql
│   │           │   └── orders.sql.jinja
│   │           └── staging
│   │               ├── customers.py
│   │               ├── orders.py
│   │               └── payments.py
│   ├── incremental
│   │   ├── README.md
│   │   ├── scripts_today
│   │   │   └── core
│   │   │       └── events.sql
│   │   └── scripts_tomorrow
│   │       └── core
│   │           └── events.sql
│   ├── jaffle_shop
│   │   ├── README.md
│   │   ├── docs
│   │   │   ├── README.md
│   │   │   ├── analytics
│   │   │   │   └── README.md
│   │   │   ├── core
│   │   │   │   └── README.md
│   │   │   └── staging
│   │   │       └── README.md
│   │   └── scripts
│   │       ├── analytics
│   │       │   ├── finance
│   │       │   │   └── kpis.sql
│   │       │   └── kpis.sql
│   │       ├── core
│   │       │   ├── customers.sql
│   │       │   └── orders.sql.jinja
│   │       ├── staging
│   │       │   ├── customers.sql
│   │       │   ├── orders.sql
│   │       │   └── payments.sql
│   │       └── tests
│   │           └── orders_are_dated.sql
│   ├── motherduck
│   │   └── README.md
│   └── school
│       ├── README.md
│       ├── scripts
│       │   ├── analytics
│       │   │   ├── finance
│       │   │   │   └── expenses.sql
│       │   │   ├── major.sql
│       │   │   └── scholarship_award.sql
│       │   ├── core
│       │   │   └── yearly_results.sql
│       │   ├── staging
│       │   │   ├── grades.sql
│       │   │   └── students.sql
│       │   └── tests
│       │       └── budget.sql
│       └── seeds
│           ├── raw_grades.csv
│           └── raw_students.csv
├── lea
│   ├── __init__.py
│   ├── assertions
│   │   ├── NO_NULLS.sql.jinja
│   │   ├── SET.sql.jinja
│   │   ├── UNIQUE.sql.jinja
│   │   └── UNIQUE_BY.sql.jinja
│   ├── cli.py
│   ├── comment.py
│   ├── conductor.py
│   ├── dag.py
│   ├── databases.py
│   ├── dialects.py
│   ├── field.py
│   ├── job.py
│   ├── scripts.py
│   ├── session.py
│   ├── table_ref.py
│   ├── test_big_query.py
│   ├── test_duckdb.py
│   └── test_table_ref.py
├── poetry.lock
└── pyproject.toml
/.github/actions/install-env/action.yml:
--------------------------------------------------------------------------------
1 | name: Install env
2 | runs:
3 | using: "composite"
4 | steps:
5 | - name: Check out repository
6 | uses: actions/checkout@v4
7 | with:
8 | submodules: true
9 |
10 | - name: Set up python
11 | id: set-up-python
12 | uses: actions/setup-python@v4
13 | with:
14 | python-version: 3.11
15 |
16 | - name: Load cached Poetry installation
17 | uses: actions/cache@v3
18 | with:
19 | path: ~/.local
20 | key: poetry-0
21 |
22 | - name: Install poetry
23 | uses: snok/install-poetry@v1
24 | with:
25 | virtualenvs-create: true
26 | virtualenvs-in-project: true
27 | installer-parallel: true
28 |
29 | - name: Load cached virtual env
30 | uses: actions/cache@v3
31 | with:
32 | path: .venv
33 | key: venv-${{ runner.os }}-${{ steps.set-up-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
34 |
35 | - name: Install dependencies
36 | shell: bash
37 | if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
38 | run: poetry install --no-interaction --no-ansi
39 |
40 | - name: Activate environment
41 | shell: bash
42 | run: source $VENV
43 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: pip
4 | directory: "/"
5 | schedule:
6 | interval: "weekly"
7 | day: "tuesday"
8 | reviewers:
9 | - MaxHalford
10 | groups:
11 | python-packages:
12 | patterns:
13 | - "*"
14 |
--------------------------------------------------------------------------------
/.github/workflows/code-quality.yml:
--------------------------------------------------------------------------------
1 | name: Code quality
2 |
3 | on:
4 | pull_request:
5 | branches:
6 | - "*"
7 | push:
8 | branches:
9 | - main
10 |
11 | jobs:
12 | run:
13 | runs-on: ubuntu-latest
14 | steps:
15 | - uses: actions/checkout@v3
16 | - uses: ./.github/actions/install-env
17 | - name: Run pre-commit on all files
18 | run: poetry run pre-commit run --all-files
19 |
--------------------------------------------------------------------------------
/.github/workflows/unit-tests.yml:
--------------------------------------------------------------------------------
1 | name: Unit tests
2 |
3 | on:
4 | pull_request:
5 | branches:
6 | - "*"
7 | push:
8 | branches:
9 | - main
10 |
11 | jobs:
12 | run:
13 | runs-on: ubuntu-latest
14 | steps:
15 | - uses: actions/checkout@v3
16 | - uses: ./.github/actions/install-env
17 | - name: Run pytest
18 | run: poetry run pytest
19 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | *.pyc
3 | *.db
4 | .env
5 | dist/
6 | /*.ipynb
7 | .DS_Store
8 | *.wal
9 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "examples/jaffle_shop/jaffle_shop"]
2 | path = examples/jaffle_shop/jaffle_shop
3 | url = https://github.com/dbt-labs/jaffle_shop/
4 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | files: .
2 | repos:
3 | - repo: https://github.com/pre-commit/pre-commit-hooks
4 | rev: v4.4.0
5 | hooks:
6 | - id: check-json
7 | - id: check-yaml
8 | - id: trailing-whitespace
9 | - id: mixed-line-ending
10 |
11 | - repo: https://github.com/astral-sh/ruff-pre-commit
12 | rev: v0.1.7
13 | hooks:
14 | - id: ruff
15 | - id: ruff-format
16 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | ## Setup
4 |
5 | Start by cloning the repository:
6 |
7 | ```sh
8 | git clone https://github.com/carbonfact/lea
9 | ```
10 |
11 | There are submodules in this repository, so you'll need to fetch/update them:
12 |
13 | ```sh
14 | git submodule init
15 | git submodule update
16 | ```
17 |
18 | Next, you'll need a Python environment:
19 |
20 | ```sh
21 | pyenv install -v 3.11
22 | ```
23 |
24 | You'll also need [Poetry](https://python-poetry.org/):
25 |
26 | ```sh
27 | curl -sSL https://install.python-poetry.org | python3 -
28 | poetry install
29 | poetry shell
30 | ```
31 |
32 | ## Testing
33 |
34 | You can run tests once the environment is set up:
35 |
36 | ```sh
37 | pytest
38 | ```
39 |
40 | ## Code quality
41 |
42 | Install the code quality routine so that it runs each time you try to push your commits.
43 |
44 | ```sh
45 | pre-commit install --hook-type pre-push
46 | ```
47 |
48 | You can also run the code quality routine ad-hoc.
49 |
50 | ```sh
51 | pre-commit run --all-files
52 | ```
53 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2023 Carbonfact
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | fomo:
2 | git fetch && git rebase origin/main
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # lea
2 |
27 | lea is a minimalist alternative to SQL orchestrators like [dbt](https://www.getdbt.com/) and [SQLMesh](https://sqlmesh.com/).
28 |
29 | lea aims to be simple and provides sane defaults. We happily use it every day at [Carbonfact](https://www.carbonfact.com/) to manage our BigQuery data warehouse. We will actively maintain it and add features, while welcoming contributions.
30 |
31 | - [Examples](#examples)
32 | - [Installation](#installation)
33 | - [Configuration](#configuration)
34 | - [DuckDB](#duckdb)
35 | - [BigQuery](#bigquery)
36 | - [Usage](#usage)
37 | - [`lea run`](#lea-run)
38 | - [File structure](#file-structure)
39 | - [Jinja templating](#jinja-templating)
40 | - [Development vs. production](#development-vs-production)
41 | - [Selecting scripts](#selecting-scripts)
42 | - [Write-Audit-Publish (WAP)](#write-audit-publish-wap)
43 | - [Testing while running](#testing-while-running)
44 | - [Skipping unmodified scripts during development](#skipping-unmodified-scripts-during-development)
45 | - [Warehouse specific features](#warehouse-specific-features)
46 | - [BigQuery](#bigquery-1)
47 | - [Default clustering](#default-clustering)
48 | - [Big Blue Pick API](#big-blue-pick-api)
49 | - [Contributing](#contributing)
50 | - [License](#license)
51 |
52 | ## Examples
53 |
54 | - [Jaffle shop 🥪](examples/jaffle_shop/)
55 | - [Incremental 🕐](examples/incremental)
56 | - [School 🏫](examples/school/)
57 | - [Compare development to production 👯♀️](examples/diff/)
58 | - [Using MotherDuck 🦆](examples/motherduck/)
59 |
60 | ## Installation
61 |
62 | Install lea from PyPI:
63 |
64 | ```sh
65 | pip install lea-cli
66 | ```
67 |
68 | This installs the `lea` command. It also makes the `lea` Python library available.
69 |
70 | ## Configuration
71 |
72 | lea is configured via environment variables.
73 |
74 | ### DuckDB
75 |
76 | ```sh
77 | LEA_WAREHOUSE=duckdb
78 | LEA_DUCKDB_PATH=duckdb.db
79 | ```
80 |
81 | ### BigQuery
82 |
83 | ```sh
84 | # Required
85 | LEA_WAREHOUSE=bigquery
86 | # Required
87 | LEA_BQ_LOCATION=EU
88 | # Required
89 | LEA_BQ_DATASET_NAME=kaya
90 | # Required, the project where the dataset is located
91 | LEA_BQ_PROJECT_ID=carbonfact-dwh
92 | # Optional, allows using a different project for compute
93 | LEA_BQ_COMPUTE_PROJECT_ID=carbonfact-dwh-compute
94 | # Not necessary if you're logged in with the gcloud CLI
95 | LEA_BQ_SERVICE_ACCOUNT= # not a path ⚠️
96 | # Defaults to https://www.googleapis.com/auth/bigquery
97 | LEA_BQ_SCOPES=https://www.googleapis.com/auth/bigquery,https://www.googleapis.com/auth/drive
98 | # LOGICAL or PHYSICAL, defaults to PHYSICAL
99 | LEA_BQ_STORAGE_BILLING_MODEL=PHYSICAL
100 | ```
101 |
102 | ## Usage
103 |
104 | These parameters can be provided in an `.env` file, or directly in the shell. Each command also has an `--env` flag to provide a path to an `.env` file.
105 |
106 | ### `lea run`
107 |
108 | This is the main command. It runs SQL queries stored in the `scripts` directory:
109 |
110 | ```sh
111 | lea run
112 | ```
113 |
114 | You can indicate the directory where the scripts are stored:
115 |
116 | ```sh
117 | lea run --scripts /path/to/scripts
118 | ```
119 |
120 | The scripts are run concurrently. They are organized in a DAG, which is traversed in topological order. The DAG's structure is determined [automatically](https://maxhalford.github.io/blog/dbt-ref-rant/) by analyzing the dependencies between queries.
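
For instance, dependencies are inferred from the table references that appear in each query. As a minimal sketch, consider an abridged version of the `core/customers.sql` script from the examples: because it selects from `staging.orders`, lea infers that it depends on the `staging/orders.sql` script, with no `ref()`-style annotations required.

```sql
-- core/customers.sql (abridged)
SELECT
    customer_id,
    COUNT(order_id) AS number_of_orders
FROM staging.orders
GROUP BY customer_id
```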
121 |
122 | ### File structure
123 |
124 | Each query is expected to be placed under a schema, represented by a directory. Schemas can have sub-schemas. Here's an example:
125 |
126 | ```
127 | scripts/
128 | schema_1/
129 | table_1.sql
130 | table_2.sql
131 | schema_2/
132 | table_3.sql
133 | table_4.sql
134 | sub_schema_2_1/
135 | table_5.sql
136 | table_6.sql
137 | ```
138 |
139 | Each script is materialized into a table. The table is named according to the script's name, following the warehouse convention.
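
For example, with the jaffle shop layout used in this repository, scripts are materialized into tables that downstream queries reference as `schema.table`:

```
scripts/core/customers.sql   →   core.customers
scripts/staging/orders.sql   →   staging.orders
```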
140 |
141 | #### Jinja templating
142 |
143 | SQL queries can be templated with [Jinja](https://jinja.palletsprojects.com/en/3.1.x/). A `.sql.jinja` extension is necessary for lea to recognise them.
144 |
145 | You have access to an `env` variable within the template context, which is simply an access point to `os.environ`.
146 |
147 | ### Development vs. production
148 |
149 | By default, lea creates an isolation layer with production. The way this is done depends on your warehouse:
150 |
151 | - BigQuery: by appending a `_<username>` suffix to schema names
152 | - DuckDB: by adding a `_<username>` suffix to the database file name
153 |
154 | In other words, a development environment is used by default. Use the `--production` flag when executing `lea run` to disable this behaviour, and instead target the production environment.
155 |
156 | ```sh
157 | lea run --production
158 | ```
159 |
160 | The `<username>` is determined automatically from the [login name](https://docs.python.org/3/library/getpass.html#getpass.getuser). It can be overridden by setting the `LEA_USERNAME` environment variable.
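
As an illustration, for a user named `max` (the username used throughout the examples):

```
BigQuery : the core schema is written to core_max instead of core
DuckDB   : jaffle_shop.db becomes jaffle_shop_max.db
```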
161 |
162 | ### Selecting scripts
163 |
164 | A single script can be run:
165 |
166 | ```sh
167 | lea run --select core.users
168 | ```
169 |
170 | Several scripts can be run:
171 |
172 | ```sh
173 | lea run --select core.users --select core.orders
174 | ```
175 |
176 | Similar to dbt, lea also supports graph operators:
177 |
178 | ```sh
179 | lea run --select core.users+ # users and everything that depends on it
180 | lea run --select +core.users # users and everything it depends on
181 | lea run --select +core.users+ # users with its ancestors and descendants
182 | ```
183 |
184 | You can select all scripts in a schema:
185 |
186 | ```sh
187 | lea run --select core/ # the trailing slash matters
188 | ```
189 |
190 | This also works with sub-schemas:
191 |
192 | ```sh
193 | lea run --select analytics.finance/
194 | ```
195 |
196 | There are thus 8 possible operators:
197 |
198 | ```
199 | schema.table (table by itself)
200 | schema.table+ (table with its descendants)
201 | +schema.table (table with its ancestors)
202 | +schema.table+ (table with its ancestors and descendants)
203 | schema/ (all tables in schema)
204 | schema/+ (all tables in schema with their descendants)
205 | +schema/ (all tables in schema with their ancestors)
206 | +schema/+ (all tables in schema with their ancestors and descendants)
207 | ```
208 |
209 | Combinations are possible:
210 |
211 | ```sh
212 | lea run --select core.users+ --select +core.orders
213 | ```
214 |
215 | There's an Easter egg that allows choosing scripts that have been committed or modified in the current Git branch:
216 |
217 | ```sh
218 | lea run --select git
219 | lea run --select git+ # includes all descendants
220 | ```
221 |
222 | This becomes very handy when using lea in continuous integration.
223 |
224 | ### Write-Audit-Publish (WAP)
225 |
226 | [WAP](https://lakefs.io/blog/data-engineering-patterns-write-audit-publish/) is a data engineering pattern that ensures data consistency and reliability. It's the data engineering equivalent of [blue-green deployment](https://en.wikipedia.org/wiki/Blue%E2%80%93green_deployment) in the software engineering world.
227 |
228 | lea follows the WAP pattern by default. When you execute `lea run`, it actually creates temporary tables that have an `___audit` suffix. The latter tables are promoted to replace the existing tables, once they have all been materialized without errors.
229 |
230 | This is a good default behavior. Let's say you refresh table `foo`. Then you refresh table `bar` that depends on `foo`. If the refresh of `bar` fails, you're left with a corrupt state. This is what the WAP pattern solves. In WAP mode, when you run `foo`'s script, it creates a `foo___audit` table. If `bar`'s script fails, then the run stops and `foo` is not modified.
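
Concretely, while a run is in progress you would see both the live table and its audit counterpart:

```
foo           <- live table, untouched while the run is in progress
foo___audit   <- audit table written by the run, promoted to foo once everything succeeds
```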
231 |
232 | ### Testing while running
233 |
234 | There is no `lea test` command. Tests are run together with the regular scripts when `lea run` is executed. The run stops whenever a test fails.
235 |
236 | There are two types of tests:
237 |
238 | - Singular tests — these are queries which return failing rows. They are stored in a `tests` directory (an example is shown after the annotated query below).
239 | - Assertion tests — these are comment annotations in the queries themselves:
240 |   - `#NO_NULLS` — checks that all values in a column are not null.
241 |   - `#UNIQUE` — checks that a column's values are unique.
242 |   - `#UNIQUE_BY(<column>)` — checks that a column's values are unique within a group.
243 |   - `#SET{<values>}` — checks that a column's values are in a set of values.
244 |
245 | Here's an example of a query annotated with assertion tests:
246 |
247 | ```sql
248 | SELECT
249 | -- #UNIQUE
250 | -- #NO_NULLS
251 | user_id,
252 | -- #NO_NULLS
253 | address,
254 | -- #UNIQUE_BY(address)
255 | full_name,
256 | -- #SET{'A', 'B', 'AB', 'O'}
257 | blood_type
258 | FROM core.users
259 | ```
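
For comparison, a singular test is just a query that returns the offending rows, stored under `tests/`. Here's the `orders_are_dated.sql` test from the jaffle shop example:

```sql
-- tests/orders_are_dated.sql
SELECT *
FROM core.orders
WHERE order_date IS NULL
```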
260 |
261 | You can run a single test via the `--select` flag:
262 |
263 | ```sh
264 | lea run --select tests.check_n_users
265 | ```
266 |
267 | Or even run all the tests, as so:
268 |
269 | ```sh
270 | lea run --select tests/ # the trailing slash matters
271 | ```
272 |
273 | ☝️ When you run a script that is not a test, all the applicable tests are run as well. For instance, the following command will run the `core.users` script and all the tests that are applicable to it:
274 |
275 | ```sh
276 | lea run --select core.users
277 | ```
278 |
279 | You may decide to run all scripts without executing tests, which is obviously not advisable:
280 |
281 | ```sh
282 | lea run --unselect tests/
283 | lea run --select core.users --unselect tests/
284 | ```
285 |
286 | ### Skipping unmodified scripts during development
287 |
288 | When you call `lea run`, it generates audit tables, which are then promoted to replace the original tables. This is done to ensure that the data is consistent and reliable. lea doesn't run a script when its audit table already exists and the script hasn't been modified since that audit table was created. This avoids unnecessary re-runs of scripts that haven't changed.
289 |
290 | For instance:
291 |
292 | 1. You execute `lea run` to sync all tables from sources: there are no errors, and all tables are materialized.
293 | 2. You modify a script named `core/expenses.sql`, which depends on `staging/customers.sql` and `staging/orders.sql`.
294 | 3. You execute `lea run --select core.expenses+` to rerun all impacted tables.
295 | 4. `core__expenses___audit` is materialized in your data warehouse, but the `-- #NO_NULLS` assertion test on one of its columns fails.
296 | 5. After reviewing the data in `core__expenses___audit`, you edit `core/expenses.sql` to filter out the rows where NULLs appear.
297 | 6. You execute `lea run`.
298 | 7. The `staging/customers.sql` and `staging/orders.sql` scripts are skipped, because they were last modified before `staging__customers` and `staging__orders` were last materialized.
299 | 8. The `core/expenses.sql` script is run, because it was modified after `core__expenses` was last materialized.
300 | 9. All audit tables are wiped from the database, as the whole DAG has run successfully! 🎉
301 |
302 | You can disable this behavior altogether:
303 |
304 | ```sh
305 | lea run --restart
306 | ```
307 |
308 | ## Warehouse specific features
309 |
310 | ### BigQuery
311 |
312 | #### Default clustering
313 |
314 | At Carbonfact, we cluster most of our tables by customer. This is done to optimize query performance and reduce costs. lea allows you to automatically cluster tables that contain a given field:
315 |
316 | ```sh
317 | LEA_BQ_DEFAULT_CLUSTERING_FIELDS=account_slug
318 | ```
319 |
320 | You can also specify multiple fields, meaning that tables which contain both fields will be clustered:
321 |
322 | ```sh
323 | LEA_BQ_DEFAULT_CLUSTERING_FIELDS=account_slug,brand_slug
324 | ```
325 |
326 | For each table, lea will use the clustering fields it can and ignore the others. With the previous configuration, if your table defines `account_slug` and not `brand_slug`, it will cluster by `account_slug`.
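
To make the effect concrete, clustering a table by `account_slug` is comparable to issuing the following BigQuery DDL (a sketch with a hypothetical table, not the exact statement lea generates):

```sql
-- Hypothetical table whose query produces an account_slug column
CREATE OR REPLACE TABLE core.orders_by_account
CLUSTER BY account_slug
AS
SELECT account_slug, COUNT(*) AS n_orders
FROM core.orders
GROUP BY account_slug
```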
327 |
328 | #### Big Blue Pick API
329 |
330 | [Big Blue](https://biq.blue/) is a SaaS product to monitor and optimize BigQuery costs. As part of their offering, they provide a [Pick API](https://biq.blue/blog/compute/how-to-implement-bigquery-autoscaling-reservation-in-10-minutes). The idea is that some queries should be run on-demand, while others should be run on a reservation. Big Blue's Pick API suggests which billing model to use for each query.
331 |
332 | We use this at Carbonfact, and so this API is available out of the box in lea. You can enable it by setting the following environment variables:
333 |
334 | ```sh
335 | LEA_BQ_BIG_BLUE_PICK_API_KEY=
336 | LEA_BQ_BIG_BLUE_PICK_API_URL=https://pick.biq.blue
337 | LEA_BQ_BIG_BLUE_PICK_API_ON_DEMAND_PROJECT_ID=on-demand-compute-project-id
338 | LEA_BQ_BIG_BLUE_PICK_API_REVERVATION_PROJECT_ID=reservation-compute-project-id
339 | ```
340 |
341 | ## Contributing
342 |
343 | Feel free to reach out to [max@carbonfact.com](mailto:max@carbonfact.com) if you want to know more and/or contribute 😊
344 |
345 | We have suggested [some issues](https://github.com/carbonfact/lea/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc+label%3A%22good+first+issue%22) as good places to get started.
346 |
347 | ## License
348 |
349 | lea is free and open-source software licensed under the Apache License, Version 2.0.
350 |
--------------------------------------------------------------------------------
/examples/diff/README.md:
--------------------------------------------------------------------------------
1 | # Compare development to production
2 |
3 | The first thing to do is create an `.env` file, as so:
4 |
5 | ```sh
6 | echo "
7 | LEA_USERNAME=max
8 | LEA_WAREHOUSE=duckdb
9 | LEA_DUCKDB_PATH=jaffle_shop.db
10 | " > .env
11 | ```
12 |
13 | This example is about comparing data in development to what's in production. For the purpose of this example, there's a `views/prod` directory and a `views/dev` directory.
14 |
15 | Let's start by running the views in production. First, the schemas need to be created:
16 |
17 | ```sh
18 | lea prepare views/prod --production
19 | ```
20 |
21 | ```
22 | Created schema staging
23 | Created schema core
24 | ```
25 |
26 | The views can now be run in production:
27 |
28 | ```sh
29 | lea run views/prod --production
30 | ```
31 |
32 | Now let's say we're working in development. We would start by creating the schemas:
33 |
34 | ```sh
35 | lea prepare views/dev
36 | ```
37 |
38 | ```
39 | Created schema staging
40 | Created schema core
41 | Created schema analytics
42 | ```
43 |
44 | We do some changes by editing the `views/dev` directory. Then we can run the views in development:
45 |
46 | ```sh
47 | lea run views/dev
48 | ```
49 |
50 | Now we can compare the data in development to the data in production:
51 |
52 | ```sh
53 | lea diff
54 | ```
55 |
56 | ```diff
57 | + analytics.kpis
58 | + 1 rows
59 | + metric
60 | + value
61 |
62 | - core.customers
63 | - 100 rows
64 | - customer_id
65 | - customer_lifetime_value
66 | - first_name
67 | - first_order
68 | - last_name
69 | - most_recent_order
70 | - number_of_orders
71 |
72 | core.orders
73 | - 29 rows
74 | ```
75 |
76 | The diff shows several things:
77 |
78 | - The `customers` view got dropped.
79 | - The `orders` view didn't get dropped, but it lost some rows. This is because we added a `WHERE` clause to the underlying SQL.
80 | - The `kpis` view got added, and it contains a single row.
81 |
82 | The nice thing is that `lea diff` prints out a neat summary. This output can be highlighted on GitHub, which is what we've done above, by using a `diff` code block.
83 |
84 | In a pull request, an automated message can be posted with the diff. Here is an example of a GitHub action that does this:
85 |
86 | ````yaml
87 | name: Branch tests
88 |
89 | on:
90 | pull_request:
91 | branches:
92 | - "*"
93 |
94 | jobs:
95 | run:
96 | runs-on: ubuntu-latest
97 | env:
98 | LEA_WAREHOUSE: bigquery
99 | LEA_BQ_SERVICE_ACCOUNT: ${{ secrets.LEA_BQ_SERVICE_ACCOUNT }}
100 | LEA_BQ_LOCATION: EU
101 | LEA_BQ_PROJECT_ID: carbonlytics
102 | LEA_SCHEMA: kaya
103 | steps:
104 | - uses: actions/checkout@v4
105 | - uses: ./.github/actions/install-env
106 |
107 | - name: Check code quality
108 | run: poetry run pre-commit run --all-files
109 |
110 | - name: Set environment variables
111 | run: |
112 | export PR_NUMBER=$(cut -d'/' -f3 <<< "$GITHUB_REF")
113 | export LEA_USERNAME="pr$PR_NUMBER"
114 | echo "LEA_USERNAME=$LEA_USERNAME" >> $GITHUB_ENV
115 |
116 | - name: Create BigQuery dataset for this pull request
117 | run: poetry run lea prepare
118 |
119 | - name: Refresh views
120 | run: poetry run lea run --raise-exceptions
121 |
122 | - name: Calculate diff
123 | run: |
124 | export DIFF=$(poetry run lea diff kaya_$LEA_USERNAME kaya)
125 | DIFF=$(echo "$DIFF" | sed '1d')
126 | EOF=$(dd if=/dev/urandom bs=15 count=1 status=none | base64)
127 | echo "DIFF<<$EOF" >> "$GITHUB_ENV"
128 | echo "$DIFF" >> "$GITHUB_ENV"
129 | echo "$EOF" >> "$GITHUB_ENV"
130 |
131 | - name: Comment PR with execution number
132 | uses: thollander/actions-comment-pull-request@v2
133 | with:
134 | message: |
135 | ```diff
136 | ${{ env.DIFF }}
137 | ```
138 | comment_tag: execution
139 |
140 | - name: Run tests
141 | run: poetry run lea test --raise-exceptions
142 | ````
143 |
--------------------------------------------------------------------------------
/examples/diff/views/dev/analytics/kpis.sql:
--------------------------------------------------------------------------------
1 | SELECT
2 | 'n_orders' AS metric,
3 | COUNT(*) AS value
4 | FROM
5 | core.orders
6 |
--------------------------------------------------------------------------------
/examples/diff/views/dev/core/orders.sql.jinja:
--------------------------------------------------------------------------------
1 | {% set payment_methods = ['credit_card', 'coupon', 'bank_transfer', 'gift_card'] %}
2 |
3 | with order_payments as (
4 |
5 | select
6 | order_id,
7 |
8 | {% for payment_method in payment_methods -%}
9 | sum(case when payment_method = '{{ payment_method }}' then amount else 0 end) as {{ payment_method }}_amount,
10 | {% endfor -%}
11 |
12 | sum(amount) as total_amount
13 |
14 | from staging.payments
15 |
16 | group by order_id
17 |
18 | )
19 |
20 | select
21 | orders.order_id,
22 | orders.customer_id,
23 | orders.order_date,
24 | orders.status,
25 |
26 | {% for payment_method in payment_methods -%}
27 |
28 | order_payments.{{ payment_method }}_amount,
29 |
30 | {% endfor -%}
31 |
32 | order_payments.total_amount as amount
33 |
34 | from staging.orders
35 | left join order_payments
36 | on orders.order_id = order_payments.order_id
37 |
38 | where date_part('month', cast(order_date AS date)) > 1
39 |
--------------------------------------------------------------------------------
/examples/diff/views/dev/staging/customers.py:
--------------------------------------------------------------------------------
1 | """Docstring for the customers view."""
2 |
3 | from __future__ import annotations
4 |
5 | import pathlib
6 |
7 | import pandas as pd
8 |
9 | here = pathlib.Path(__file__).parent
10 | customers = pd.read_csv(
11 | here.parents[3] / "jaffle_shop" / "jaffle_shop" / "seeds" / "raw_customers.csv"
12 | )
13 | customers = customers.rename(columns={"id": "customer_id"})
14 |
--------------------------------------------------------------------------------
/examples/diff/views/dev/staging/orders.py:
--------------------------------------------------------------------------------
1 | """Docstring for the orders view."""
2 | from __future__ import annotations
3 |
4 | import pathlib
5 |
6 | import pandas as pd
7 |
8 | here = pathlib.Path(__file__).parent
9 | orders = pd.read_csv(here.parents[3] / "jaffle_shop" / "jaffle_shop" / "seeds" / "raw_orders.csv")
10 | orders = orders.rename(columns={"id": "order_id", "user_id": "customer_id"})
11 |
--------------------------------------------------------------------------------
/examples/diff/views/dev/staging/payments.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pathlib
4 |
5 | import pandas as pd
6 |
7 | here = pathlib.Path(__file__).parent
8 | payments = pd.read_csv(
9 | here.parents[3] / "jaffle_shop" / "jaffle_shop" / "seeds" / "raw_payments.csv"
10 | )
11 | payments = payments.rename(columns={"id": "payment_id"})
12 | payments["amount"] = payments["amount"]
13 |
--------------------------------------------------------------------------------
/examples/diff/views/prod/core/customers.sql:
--------------------------------------------------------------------------------
1 | with customer_orders as (
2 |
3 | select
4 | customer_id,
5 |
6 | min(order_date) as first_order,
7 | max(order_date) as most_recent_order,
8 | count(order_id) as number_of_orders
9 | from staging.orders
10 |
11 | group by customer_id
12 |
13 | ),
14 |
15 | customer_payments as (
16 |
17 | select
18 | orders.customer_id,
19 | sum(amount) as total_amount
20 |
21 | from staging.payments
22 |
23 | left join staging.orders on
24 | payments.order_id = orders.order_id
25 |
26 | group by orders.customer_id
27 |
28 | )
29 |
30 | select
31 | -- #UNIQUE
32 | customers.customer_id,
33 | customers.first_name,
34 | customers.last_name,
35 | customer_orders.first_order,
36 | customer_orders.most_recent_order,
37 | customer_orders.number_of_orders,
38 | customer_payments.total_amount as customer_lifetime_value
39 |
40 | from staging.customers
41 |
42 | left join customer_orders
43 | on customers.customer_id = customer_orders.customer_id
44 |
45 | left join customer_payments
46 | on customers.customer_id = customer_payments.customer_id
47 |
--------------------------------------------------------------------------------
/examples/diff/views/prod/core/orders.sql.jinja:
--------------------------------------------------------------------------------
1 | {% set payment_methods = ['credit_card', 'coupon', 'bank_transfer', 'gift_card'] %}
2 |
3 | with order_payments as (
4 |
5 | select
6 | order_id,
7 |
8 | {% for payment_method in payment_methods -%}
9 | sum(case when payment_method = '{{ payment_method }}' then amount else 0 end) as {{ payment_method }}_amount,
10 | {% endfor -%}
11 |
12 | sum(amount) as total_amount
13 |
14 | from staging.payments
15 |
16 | group by order_id
17 |
18 | )
19 |
20 | select
21 | orders.order_id,
22 | orders.customer_id,
23 | orders.order_date,
24 | orders.status,
25 |
26 | {% for payment_method in payment_methods -%}
27 |
28 | order_payments.{{ payment_method }}_amount,
29 |
30 | {% endfor -%}
31 |
32 | order_payments.total_amount as amount
33 |
34 | from staging.orders orders
35 |
36 |
37 | left join order_payments
38 | on orders.order_id = order_payments.order_id
39 |
--------------------------------------------------------------------------------
/examples/diff/views/prod/staging/customers.py:
--------------------------------------------------------------------------------
1 | """Docstring for the customers view."""
2 |
3 | from __future__ import annotations
4 |
5 | import pathlib
6 |
7 | import pandas as pd
8 |
9 | here = pathlib.Path(__file__).parent
10 | customers = pd.read_csv(
11 | here.parents[3] / "jaffle_shop" / "jaffle_shop" / "seeds" / "raw_customers.csv"
12 | )
13 | customers = customers.rename(columns={"id": "customer_id"})
14 |
--------------------------------------------------------------------------------
/examples/diff/views/prod/staging/orders.py:
--------------------------------------------------------------------------------
1 | """Docstring for the orders view."""
2 | from __future__ import annotations
3 |
4 | import pathlib
5 |
6 | import pandas as pd
7 |
8 | here = pathlib.Path(__file__).parent
9 | orders = pd.read_csv(here.parents[3] / "jaffle_shop" / "jaffle_shop" / "seeds" / "raw_orders.csv")
10 | orders = orders.rename(columns={"id": "order_id", "user_id": "customer_id"})
11 |
--------------------------------------------------------------------------------
/examples/diff/views/prod/staging/payments.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pathlib
4 |
5 | import pandas as pd
6 |
7 | here = pathlib.Path(__file__).parent
8 | payments = pd.read_csv(
9 | here.parents[3] / "jaffle_shop" / "jaffle_shop" / "seeds" / "raw_payments.csv"
10 | )
11 | payments = payments.rename(columns={"id": "payment_id"})
12 | payments["amount"] = payments["amount"] / 100 # convert cents to dollars
13 |
--------------------------------------------------------------------------------
/examples/incremental/README.md:
--------------------------------------------------------------------------------
1 | # Incremental scripts
2 |
3 | Let's start with creating the database in the usual way:
4 |
5 | ```sh
6 | echo "
7 | LEA_USERNAME=max
8 | LEA_WAREHOUSE=duckdb
9 | LEA_DUCKDB_PATH=incremental.db
10 | " > .env
11 | ```
12 |
13 | There are two `scripts` folders to simulate two days with different amounts of data. Let's say it's the 4th of January, and we run our views:
14 |
15 | ```sh
16 | lea run --scripts scripts_today
17 | ```
18 |
19 | ```sh
20 | python -c "import duckdb; print(duckdb.connect('incremental_max.db').execute('SELECT created_at, day_of_year FROM core.events').df())"
21 | ```
22 |
23 | ```
24 | created_at day_of_year
25 | 0 2023-01-02 2
26 | 1 2023-01-03 3
27 | 2 2023-01-04 4
28 | ```
29 |
30 | The next day, there's new data. When we refresh, we don't want to start from scratch. We want to keep the data from the previous day and only add the new data. This will happen automatically because the view is tagged with a `#INCREMENTAL` comment.
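
For reference, the tag is a comment placed above the column that drives the incremental logic, as in the `scripts_tomorrow/core/events.sql` script included in this example:

```sql
SELECT
    DATE '2023-01-01' + INTERVAL (i) DAY AS created_at,
    -- #INCREMENTAL
    i + 1 AS day_of_year
FROM GENERATE_SERIES(0, 4) AS t(i)
```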
31 |
32 | ```sh
33 | lea run --scripts scripts_tomorrow --select core.events --incremental day_of_year 5
34 | ```
35 |
36 | ```sh
37 | python -c "import duckdb; print(duckdb.connect('incremental_max.db').execute('SELECT created_at, day_of_year FROM core.events').df())"
38 | ```
39 |
40 | ```
41 | created_at day_of_year
42 | 0 2023-01-02 2
43 | 1 2023-01-03 3
44 | 2 2023-01-04 4
45 | 3 2023-01-05 5
46 | ```
47 |
48 | We can see the new event from the 5th of January. However, in this case there is an event from the 1st of January that is missing. This is because the event arrived with a delay. In such cases, we can force a full refresh by omitting the `--incremental` flag:
49 |
50 | ```sh
51 | lea run --scripts scripts_tomorrow
52 | ```
53 |
54 | ```sh
55 | python -c "import duckdb; print(duckdb.connect('incremental_max.db').execute('SELECT * FROM core.events').df())"
56 | ```
57 |
58 | ```
59 | created_at day_of_year
60 | 0 2023-01-01 1
61 | 1 2023-01-02 2
62 | 2 2023-01-03 3
63 | 3 2023-01-04 4
64 | 4 2023-01-05 5
65 | ```
66 |
--------------------------------------------------------------------------------
/examples/incremental/scripts_today/core/events.sql:
--------------------------------------------------------------------------------
1 | SELECT
2 | DATE '2023-01-01' + INTERVAL (i) DAY AS created_at,
3 | i + 1 AS day_of_year
4 | FROM GENERATE_SERIES(1, 3) AS t(i)
5 |
--------------------------------------------------------------------------------
/examples/incremental/scripts_tomorrow/core/events.sql:
--------------------------------------------------------------------------------
1 | SELECT
2 | DATE '2023-01-01' + INTERVAL (i) DAY AS created_at,
3 | -- #INCREMENTAL
4 | i + 1 AS day_of_year
5 | FROM GENERATE_SERIES(0, 4) AS t(i)
6 |
--------------------------------------------------------------------------------
/examples/jaffle_shop/README.md:
--------------------------------------------------------------------------------
1 | # Jaffle shop example
2 |
3 | This example is taken from the [`jaffle_shop` example](https://github.com/dbt-labs/jaffle_shop/) from dbt. Here is the scripts file structure:
4 |
5 | ```
6 | scripts
7 | ├── analytics
8 | │ ├── finance
9 | │ │ └── kpis.sql
10 | │ └── kpis.sql
11 | ├── core
12 | │ ├── customers.sql
13 | │ └── orders.sql.jinja
14 | ├── staging
15 | │ ├── customers.sql
16 | │ ├── orders.sql
17 | │ └── payments.sql
18 | └── tests
19 | └── orders_are_dated.sql
20 | ```
21 |
22 | The first thing to do is create an `.env` file, as so:
23 |
24 | ```sh
25 | echo "
26 | LEA_USERNAME=max
27 | LEA_WAREHOUSE=duckdb
28 | LEA_DUCKDB_PATH=jaffle_shop.db
29 | " > .env
30 | ```
31 |
32 | This example uses DuckDB as the data warehouse. With lea, the convention when using DuckDB is to use a separate `.db` file per environment. For instance, in production, the file would be called `jaffle_shop.db`. In development, the file would be called `jaffle_shop_max.db`. The `max` suffix is the username from the `.env` file.
33 |
34 | You can run the scripts:
35 |
36 | ```sh
37 | lea run
38 | ```
39 |
40 | lea will create audit tables, run tests against them, and promote them to regular tables if everything succeeds.
41 |
42 | There are a couple of cool things:
43 |
44 | 1. The staging schema is populated using SQL scripts and native DuckDB parsing of CSV files (see the snippet below).
45 | 2. The `core.orders` table is created using a Jinja SQL script. lea will automatically run the script through Jinja, and then execute the resulting SQL.
46 | 3. The skip feature helps speed up the development cycle when following the WAP pattern. If a table fails its audit, tables that have already been materialized won't be run again as long as their associated SQL scripts haven't changed.
47 | If a script has changed, its audit table is generated again, along with all of its children in the DAG.
48 |
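Here's what the first point looks like in practice; this is `scripts/staging/customers.sql`, where DuckDB reads the seed CSV directly:

```sql
WITH raw_customers AS (
    SELECT * FROM 'jaffle_shop/seeds/raw_customers.csv'
)

SELECT
    id AS customer_id,
    first_name,
    last_name
FROM raw_customers;
```
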
49 | Let's take the example given in the main [README.md](../../README.md).
50 |
51 | - Tables are already materialized, since you ran `lea run` earlier.
52 |
53 | ## Write
54 |
55 | - Add a new script `core/expenses.sql`
56 |
57 | ```sh
58 | echo '''
59 | with customer_orders as (
60 |
61 | select
62 | customer_id,
63 |
64 | min(order_date) as first_order,
65 | max(order_date) as most_recent_order,
66 | count(order_id) as number_of_orders
67 | from staging.orders
68 |
69 | group by customer_id
70 |
71 | ),
72 |
73 | customer_payments as (
74 |
75 | select
76 | orders.customer_id,
77 | sum(payments.amount) as total_amount
78 |
79 | from staging.payments as payments
80 |
81 | left join staging.orders as orders
82 | on payments.order_id = orders.order_id
83 |
84 | group by orders.customer_id
85 |
86 | ),
87 |
88 | expenses as (
89 | select
90 | -- #UNIQUE
91 | customers.customer_id,
92 | customers.first_name,
93 | customers.last_name,
94 | customer_orders.first_order,
95 | customer_orders.most_recent_order,
96 | customer_orders.number_of_orders,
97 | -- #NO_NULLS
98 | customer_payments.total_amount as customer_lifetime_value
99 | from staging.customers as customers --comment here
100 | left join customer_orders --comment here
101 | on customers.customer_id = customer_orders.customer_id --comment here
102 | -- FROM customer_orders --uncomment here
103 | -- left join staging.customers as customers --uncomment here
104 | -- on customer_orders.customer_id = customers.customer_id --uncomment here
105 | left join customer_payments
106 | on customers.customer_id = customer_payments.customer_id
107 | )
108 |
109 | select * from expenses
110 | ''' > scripts/core/expenses.sql
111 | ```
112 |
113 | ## Audit
114 |
115 | - Run the scripts with `lea run`: the `lea_duckdb_max.tests.core__expenses__customer_lifetime_value___no_nulls___audit` test fails ❌
116 | - Comment and uncomment the marked lines to reverse the join order, so that customers absent from the orders table are excluded.
117 |
118 | ```sh
119 | sed -i '' '/--comment here/s/^/--/' scripts/core/expenses.sql
120 | sed -i '' '/--uncomment here/s/-- //' scripts/core/expenses.sql
121 | ```
122 |
123 | - Run the scripts again: you should see that the staging audit tables are not executed again.
124 | - `core.expenses` is executed, as lea detected a modification to the script.
125 | - All tests are now passing 🎉
126 | - Audit tables are wiped from the development warehouse.
127 |
128 | ## Publish
129 |
130 | - As all tests passed, the tables are materialized in the development warehouse.
131 | - If you now want to run against the production warehouse rather than the development one, add the `--production` flag:
132 |
133 | ```sh
134 | lea run --production
135 | ```
136 |
--------------------------------------------------------------------------------
/examples/jaffle_shop/docs/README.md:
--------------------------------------------------------------------------------
1 | # Views
2 |
3 | ## Schemas
4 |
5 | - [`analytics`](./analytics)
6 | - [`core`](./core)
7 | - [`staging`](./staging)
8 |
9 | ## Schema flowchart
10 |
11 | ```mermaid
12 | %%{init: {"flowchart": {"defaultRenderer": "elk"}} }%%
13 | flowchart TB
14 | analytics(analytics)
15 | core(core)
16 | staging(staging)
17 | core --> analytics
18 | staging --> core
19 | ```
20 |
21 | ## Flowchart
22 |
23 | ```mermaid
24 | %%{init: {"flowchart": {"defaultRenderer": "elk"}} }%%
25 | flowchart TB
26 |
27 | subgraph analytics
28 |
29 | subgraph finance
30 | analytics.finance.kpis(kpis)
31 | end
32 |
33 | analytics.kpis(kpis)
34 | end
35 |
36 |
37 | subgraph core
38 | core.customers(customers)
39 | core.orders(orders)
40 | end
41 |
42 |
43 | subgraph staging
44 | staging.customers(customers)
45 | staging.orders(orders)
46 | staging.payments(payments)
47 | end
48 |
49 | core.orders --> analytics.finance.kpis
50 | core.customers --> analytics.kpis
51 | core.orders --> analytics.kpis
52 | staging.customers --> core.customers
53 | staging.orders --> core.customers
54 | staging.payments --> core.customers
55 | staging.orders --> core.orders
56 | staging.payments --> core.orders
57 | ```
58 |
59 |
--------------------------------------------------------------------------------
/examples/jaffle_shop/docs/analytics/README.md:
--------------------------------------------------------------------------------
1 | # analytics
2 |
3 | ## Table of contents
4 |
5 | - [analytics.finance.kpis](#analyticsfinancekpis)
6 | - [analytics.kpis](#analyticskpis)
7 |
8 | ## Views
9 |
10 | ### analytics.finance.kpis
11 |
12 | ```sql
13 | SELECT *
14 | FROM analytics.finance__kpis
15 | ```
16 |
17 | | Column | Description | Unique |
18 | |:--------------------|:--------------|:---------|
19 | | total_order_value | | |
20 | | average_order_value | | |
21 |
22 | ### analytics.kpis
23 |
24 | ```sql
25 | SELECT *
26 | FROM analytics.kpis
27 | ```
28 |
29 | | Column | Description | Unique |
30 | |:---------|:--------------|:---------|
31 | | metric | | |
32 | | value | | |
33 |
34 |
--------------------------------------------------------------------------------
/examples/jaffle_shop/docs/core/README.md:
--------------------------------------------------------------------------------
1 | # core
2 |
3 | ## Table of contents
4 |
5 | - [core.customers](#corecustomers)
6 | - [core.orders](#coreorders)
7 |
8 | ## Views
9 |
10 | ### core.customers
11 |
12 | ```sql
13 | SELECT *
14 | FROM core.customers
15 | ```
16 |
17 | | Column | Description | Unique |
18 | |:------------------------|:--------------|:---------|
19 | | customer_id | | ✅ |
20 | | first_name | | |
21 | | last_name | | |
22 | | first_order | | |
23 | | most_recent_order | | |
24 | | number_of_orders | | |
25 | | customer_lifetime_value | | |
26 |
27 | ### core.orders
28 |
29 | ```sql
30 | SELECT *
31 | FROM core.orders
32 | ```
33 |
34 | | Column | Description | Unique |
35 | |:---------------------|:--------------|:---------|
36 | | order_id | | |
37 | | customer_id | | |
38 | | order_date | | |
39 | | status | | |
40 | | credit_card_amount | | |
41 | | coupon_amount | | |
42 | | bank_transfer_amount | | |
43 | | gift_card_amount | | |
44 | | amount | | |
45 |
46 |
--------------------------------------------------------------------------------
/examples/jaffle_shop/docs/staging/README.md:
--------------------------------------------------------------------------------
1 | # staging
2 |
3 | ## Table of contents
4 |
5 | - [staging.customers](#stagingcustomers)
6 | - [staging.orders](#stagingorders)
7 | - [staging.payments](#stagingpayments)
8 |
9 | ## Views
10 |
11 | ### staging.customers
12 |
13 | Docstring for the customers view.
14 |
15 | ```sql
16 | SELECT *
17 | FROM staging.customers
18 | ```
19 |
20 | | Column | Description | Unique |
21 | |----------|---------------|----------|
22 |
23 | ### staging.orders
24 |
25 | Docstring for the orders view.
26 |
27 | ```sql
28 | SELECT *
29 | FROM staging.orders
30 | ```
31 |
32 | | Column | Description | Unique |
33 | |----------|---------------|----------|
34 |
35 | ### staging.payments
36 |
37 | ```sql
38 | SELECT *
39 | FROM staging.payments
40 | ```
41 |
42 | | Column | Description | Unique |
43 | |----------|---------------|----------|
44 |
45 |
--------------------------------------------------------------------------------
/examples/jaffle_shop/scripts/analytics/finance/kpis.sql:
--------------------------------------------------------------------------------
1 | SELECT
2 | SUM(amount) AS total_order_value,
3 | AVG(amount) AS average_order_value
4 | FROM core.orders
5 |
--------------------------------------------------------------------------------
/examples/jaffle_shop/scripts/analytics/kpis.sql:
--------------------------------------------------------------------------------
1 | SELECT
2 | 'n_customers' AS metric,
3 | COUNT(*) AS value
4 | FROM
5 | core.customers
6 |
7 | UNION ALL
8 |
9 | SELECT
10 | 'n_orders' AS metric,
11 | COUNT(*) AS value
12 | FROM
13 | core.orders
14 |
--------------------------------------------------------------------------------
/examples/jaffle_shop/scripts/core/customers.sql:
--------------------------------------------------------------------------------
1 | with customer_orders as (
2 |
3 | select
4 | customer_id,
5 |
6 | min(order_date) as first_order,
7 | max(order_date) as most_recent_order,
8 | count(order_id) as number_of_orders
9 | from staging.orders
10 |
11 | group by customer_id
12 |
13 | ),
14 |
15 | customer_payments as (
16 |
17 | select
18 | orders.customer_id,
19 | sum(amount) as total_amount
20 |
21 | from staging.payments
22 |
23 | left join staging.orders orders using (order_id)
24 |
25 | group by orders.customer_id
26 |
27 | )
28 |
29 | select
30 | -- #UNIQUE
31 | customers.customer_id,
32 | customers.first_name,
33 | customers.last_name,
34 | customer_orders.first_order,
35 | customer_orders.most_recent_order,
36 | customer_orders.number_of_orders,
37 | customer_payments.total_amount as customer_lifetime_value
38 |
39 | from staging.customers customers
40 |
41 | left join customer_orders
42 | on customers.customer_id = customer_orders.customer_id
43 |
44 | left join customer_payments
45 | on customers.customer_id = customer_payments.customer_id
46 |
--------------------------------------------------------------------------------
/examples/jaffle_shop/scripts/core/orders.sql.jinja:
--------------------------------------------------------------------------------
1 | {% set payment_methods = ['credit_card', 'coupon', 'bank_transfer', 'gift_card'] %}
2 |
3 | with order_payments as (
4 |
5 | select
6 | order_id,
7 |
8 | {% for payment_method in payment_methods -%}
9 | sum(case when payment_method = '{{ payment_method }}' then amount else 0 end) as {{ payment_method }}_amount,
10 | {% endfor -%}
11 |
12 | sum(amount) as total_amount
13 |
14 | from staging.payments
15 |
16 | group by order_id
17 |
18 | )
19 |
20 | select
21 | orders.order_id,
22 | orders.customer_id,
23 | orders.order_date,
24 | orders.status,
25 |
26 | {% for payment_method in payment_methods -%}
27 |
28 | order_payments.{{ payment_method }}_amount,
29 |
30 | {% endfor -%}
31 |
32 | order_payments.total_amount as amount
33 |
34 | from staging.orders orders
35 |
36 |
37 | left join order_payments
38 | on orders.order_id = order_payments.order_id
39 |
--------------------------------------------------------------------------------
/examples/jaffle_shop/scripts/staging/customers.sql:
--------------------------------------------------------------------------------
1 | WITH raw_customers AS (
2 | SELECT * FROM 'jaffle_shop/seeds/raw_customers.csv'
3 | )
4 |
5 | SELECT
6 | id AS customer_id,
7 | first_name,
8 | last_name
9 | FROM raw_customers;
10 |
--------------------------------------------------------------------------------
/examples/jaffle_shop/scripts/staging/orders.sql:
--------------------------------------------------------------------------------
1 | WITH raw_orders AS (
2 | SELECT
3 | id,
4 | user_id,
5 | order_date,
6 | status
7 | FROM 'jaffle_shop/seeds/raw_orders.csv'
8 | )
9 |
10 | SELECT
11 | id AS order_id,
12 | user_id AS customer_id,
13 | order_date,
14 | status
15 | FROM raw_orders;
16 |
--------------------------------------------------------------------------------
/examples/jaffle_shop/scripts/staging/payments.sql:
--------------------------------------------------------------------------------
1 | WITH raw_payments AS (SELECT * FROM 'jaffle_shop/seeds/raw_payments.csv')
2 |
3 | SELECT
4 | id AS payments_id,
5 | order_id,
6 | payment_method,
7 | amount / 100 AS amount
8 | FROM raw_payments;
9 |
--------------------------------------------------------------------------------
/examples/jaffle_shop/scripts/tests/orders_are_dated.sql:
--------------------------------------------------------------------------------
1 | SELECT *
2 | FROM core.orders
3 | WHERE order_date IS NULL
4 |
--------------------------------------------------------------------------------
/examples/motherduck/README.md:
--------------------------------------------------------------------------------
1 | # Using MotherDuck
2 |
3 | lea works with DuckDB, and thus can be used with [MotherDuck](https://motherduck.com/) too.
4 |
5 | Here is how to create an example `.env` file:
6 |
7 | ```sh
8 | echo "
9 | LEA_USERNAME=max
10 | LEA_WAREHOUSE=duckdb
11 | LEA_DUCKDB_PATH=md:jaffle_shop
12 | MOTHERDUCK_TOKEN=
13 | " > .env
14 | ```
15 |
16 | The token can be obtained by logging into MotherDuck from the terminal, as documented [here](https://motherduck.com/docs/getting-started/connect-query-from-python/installation-authentication#authenticating-to-motherduck).
17 |
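If you want to sanity-check the connection before running lea, you can open it directly from Python. This is a minimal, optional sketch: it assumes the `MOTHERDUCK_TOKEN` value in the `.env` file is filled in, and relies on DuckDB picking the token up from the environment when connecting to an `md:` path.

```python
import os

import dotenv
import duckdb

# Load the MOTHERDUCK_TOKEN defined in the .env file above
# (lea itself loads .env the same way, via python-dotenv).
dotenv.load_dotenv(".env")
assert os.environ.get("MOTHERDUCK_TOKEN"), "MOTHERDUCK_TOKEN is not set"

# DuckDB reads MOTHERDUCK_TOKEN from the environment when given an md: path.
con = duckdb.connect("md:jaffle_shop")
print(con.execute("SELECT current_database()").fetchone())
```
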
18 | Then, you can run the usual commands. For the sake of example, let's re-use the jaffle shop views:
19 |
20 | ```sh
21 | lea prepare ../jaffle_shop/views
22 | ```
23 |
24 | ```
25 | Created schema analytics
26 | Created schema staging
27 | Created schema core
28 | ```
29 |
30 | ```sh
31 | lea run ../jaffle_shop/views
32 | ```
33 |
34 | You should see the views in your MotherDuck UI.
35 |
--------------------------------------------------------------------------------
/examples/school/README.md:
--------------------------------------------------------------------------------
1 | # School example
2 |
3 | Let's go back to school with an example that demonstrates the use of tests.
4 |
5 | ## Bootstrapping
6 |
7 | First, the usual database bootstrapping for `lea`, which starts with creating a `.env` file:
8 |
9 | ```sh
10 | echo "
11 | LEA_USERNAME=max
12 | LEA_WAREHOUSE=duckdb
13 | LEA_DUCKDB_PATH=school.db
14 | " > .env
15 | ```
16 |
17 | This example uses DuckDB as the data warehouse.
18 |
19 | You can run the scripts:
20 |
21 | ```sh
22 | lea run
23 | ```
24 |
25 | lea will create the schemas in DuckDB and build audit tables based on the script definitions.
26 | Once the audit tables are generated, lea runs the tests against them.
27 |
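If you're curious about what ended up in the DuckDB file, you can list the schemas and tables once the run has finished. This is a convenience snippet rather than a lea command; it assumes the run wrote to `school_max.db`, i.e. the dataset name suffixed with the `LEA_USERNAME` from the `.env` file.

```python
import duckdb

# List every table that was materialized in the user database.
con = duckdb.connect("school_max.db")
print(con.execute("SHOW ALL TABLES").df()[["database", "schema", "name"]])
```
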
28 | Let's review some tests together.
29 |
30 | ## Exploration
31 |
32 | Let's visualize the students in this school:
33 |
34 | ```sh
35 | python -c "import duckdb; print(duckdb.connect('school_max.db').execute('SELECT student_id, first_name, last_name, university FROM staging.students').df())"
36 | ```
37 |
38 | ```
39 | student_id first_name last_name university
40 | 0 1 Lauren Levine Stanford University
41 | 1 2 Daniel Lopez Massachusetts Institute of Technology
42 | 2 3 Melanie Foster University of California Berkeley
43 | 3 4 Gabriel Cooke Harvard University
44 | 4 5 Anne Porter Harvard University
45 | 5 6 Amy Lee Princeton University
46 | 6 7 Rebecca Chavez Princeton University
47 | ```
48 |
49 | Now that you've seen some students, let's review their grades!
50 |
51 | ```sh
52 | python -c "import duckdb; print(duckdb.connect('school_max.db').execute('SELECT student_id, student_name, class_name, semester, average_grade FROM core.yearly_results USING SAMPLE 5').df())"
53 | ```
54 |
55 | ```
56 | student_id student_name class_name semester average_grade
57 | 0 6 Amy Lee Mathematics Semester 1 59.0
58 | 1 5 Anne Porter Literature Semester 2 100.0
59 | 2 5 Anne Porter Physics Semester 2 46.0
60 | 3 1 Lauren Levine Biology Semester 1 28.5
61 | 4 1 Lauren Levine Literature Semester 2 52.5
62 | ```
63 |
64 | ## Tests
65 |
66 | Awesome! These are pretty good students. Now let's review some of the tests, starting with `scripts/staging/students.sql`:
67 |
68 | ```sql
69 | WITH raw_students AS (
70 | SELECT * FROM './seeds/raw_students.csv'
71 | )
72 |
73 | SELECT
74 | -- #UNIQUE
75 | -- #NO_NULLS
76 | id AS student_id,
77 | first_name,
78 | -- #UNIQUE_BY(first_name)
79 | last_name,
80 | -- #SET{'Stanford University', 'University of California Berkeley', 'Princeton University', 'Harvard University', 'Massachusetts Institute of Technology'}
81 | university,
82 | FROM raw_students;
83 | ```
84 |
85 | With the Write-Audit-Publish (WAP) pattern, these column tags become assertion tests that enforce data quality.
86 | 
87 | Here, for instance, the audit step for this staging model will ensure that (the sketch after this list shows roughly what such a check looks like):
88 | 
89 | - `student_id` values are not null and are unique
90 | - `last_name` values are unique within each `first_name`
91 | - `university` values belong to the allowed set
92 |
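To make this concrete, here is roughly the kind of query the `#UNIQUE` tag expands to (see `lea/assertions/UNIQUE.sql.jinja` in the repository): the check passes when it returns no rows. The snippet below simply runs an equivalent query by hand against the materialized table; it is an illustration, not how lea invokes assertions internally.

```python
import duckdb

# Hand-rolled equivalent of the #UNIQUE assertion on staging.students.student_id:
# the assertion fails if any student_id appears more than once.
con = duckdb.connect("school_max.db")
duplicates = con.execute(
    """
    SELECT student_id, COUNT(*) AS n
    FROM staging.students
    GROUP BY student_id
    HAVING n > 1
    """
).df()
assert duplicates.empty, f"Duplicate student_id values found:\n{duplicates}"
```
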
93 | ## WAP pattern in action - break during auditing
94 |
95 | Let's break a test on purpose, for the sake of demonstration.
96 | 
97 | Under `seeds/raw_students.csv`, let's add a new student:
98 |
99 | ```sh
100 | echo "8,Andy,Bernard,Cornell University,23" >> seeds/raw_students.csv
101 | ```
102 |
103 | Let's run the scripts again:
104 |
105 | ```sh
106 | lea run
107 | ```
108 |
109 | Cornell University is not allowed here, so the run fails:
110 |
111 | ```
112 | ✋ Early ending because an error occurred
113 | 😴 Ending session
114 | STOPPED school_max.core.yearly_results___audit
115 | SUCCESS school_max.core.yearly_results___audit, contains 112 rows
116 | ERRORED school_max.tests.staging__students__university___set___audit
117 | university
118 | 0 Cornell University
119 | ❌ Finished, took less than a second 🚀
120 | ```
121 |
122 | Remove the last line that was added, in order to restore the source file:
123 |
124 | ```sh
125 | sed -i '' '$d' seeds/raw_students.csv
126 | ```
127 |
128 | Because the audit step caught the issue before anything was promoted, your
129 | production tables are still healthy.
130 |
131 | ## Restart Feature demo - Get a fresh environment
132 |
133 | However, our audit tables are now messy and out of sync with the source, so let's rebuild them from scratch:
134 |
135 | ```sh
136 | lea run --restart
137 | ```
138 |
139 | This flushes the audit tables, as if starting from a clean slate.
140 |
141 | ## Skipping feature demonstration
142 |
143 | You might be wondering: "hey, how does lea know which audit tables need to be run
144 | again and which don't?". That's an excellent question!
145 | 
146 | Have you noticed that `lea` automatically skips tables that don't need to be processed again during audits?
147 | 
148 | Let's take a closer look with an example!
149 |
150 | First, let's visualize the scholarship award winners.
151 | 
152 | The top performing student in each domain gets a $1,000 grant, and the runner-up gets a $500 grant.
153 | 
154 | You can see this year's winners in `Economics`:
155 |
156 | ```sh
157 | python -c "import duckdb; print(duckdb.connect('school_max.db').execute('SELECT student_name, domain, scholarship_amount FROM analytics.scholarship_award WHERE domain = \'Economics\'').df())"
158 | ```
159 |
160 | ```
161 | student_name domain scholarship_amount
162 | 0 Daniel Lopez Economics 1000
163 | 1 Gabriel Cooke Economics 500
164 | ```
165 |
166 | You can review the total amount of money spent:
167 |
168 | ```sh
169 | lea run --select analytics.finance.expenses
170 | python -c "import duckdb; print(duckdb.connect('school_max.db').execute('SELECT total_expenses FROM analytics.finance__expenses').df())"
171 | ```
172 |
173 | ```
174 | total_expenses
175 | 0 12000.0
176 | ```
177 |
178 | Let's modify a script to demonstrate that lea only re-runs the scripts that have been modified.
179 | 
180 | Good news: the academy got twice the budget this year! A scholarship award can now be
181 | delivered to the top performing students **each semester**.
182 | 
183 | To apply this change, uncomment all the lines in `analytics.scholarship_award` marked with `--uncomment here` and comment out the ones marked `--comment here`:
184 |
185 | ```sh
186 | sed -i '' '/--comment here/s/^/--/' scripts/analytics/scholarship_award.sql
187 | sed -i '' '/--uncomment here/s/-- //' scripts/analytics/scholarship_award.sql
188 | ```
189 |
190 | Then run the finance script again:
191 |
192 | ```sh
193 | lea run --select analytics.finance.expenses
194 | ```
195 |
196 | Oh no, the budget test is failing! Update the threshold in `scripts/tests/budget.sql` in the same way:
197 |
198 | ```sh
199 | sed -i '' '/--comment here/s/^/--/' scripts/tests/budget.sql
200 | sed -i '' '/--uncomment here/s/-- //' scripts/tests/budget.sql
201 | ```
202 |
203 | Now let's run the scripts again:
204 |
205 | ```sh
206 | lea run
207 | ```
208 |
209 | Everything passes 🎉
210 |
211 | Look closely: the audit table `school_max.core.yearly_results___audit` hasn't been materialized again,
212 | because it already existed and the script's modification date was **earlier** than the materialization date!
213 | 
214 | It would have been executed again if the script had been modified **after** the last table materialization.
215 |
216 | You can check the table materialization date with:
217 |
218 | ```sh
219 | python -c "import duckdb; print(duckdb.connect('school_max.db').execute('SELECT MAX(_materialized_timestamp) AS last_materialized FROM analytics.scholarship_award').df())"
220 | ```
221 | 
222 | ```
223 | last_materialized
224 | 0 2025-03-14 00:31:28.114
225 | ```
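
In other words, the skip rule boils down to comparing two timestamps: the script file's last modification time and the table's materialization time. Here is a minimal sketch of that comparison, assuming the `_materialized_timestamp` column shown above; it illustrates the idea and is not lea's actual implementation.

```python
import pathlib

import duckdb

script = pathlib.Path("scripts/analytics/scholarship_award.sql")
con = duckdb.connect("school_max.db")

# When the script was last edited, and when its table was last materialized,
# both expressed as seconds since the Unix epoch.
modified_at = script.stat().st_mtime
materialized_at = con.execute(
    "SELECT epoch(MAX(_materialized_timestamp)) FROM analytics.scholarship_award"
).fetchone()[0]

# The script (and its descendants) only needs re-running if it changed afterwards.
print("needs re-running:", modified_at > materialized_at)
```
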
226 | Now that the school has extra budget, you can view the new scholarship award winners!
227 | 
228 | There are twice as many winners now, two per semester:
229 |
230 | ```sh
231 | python -c "import duckdb; print(duckdb.connect('school_max.db').execute('SELECT student_name, domain, semester, scholarship_amount FROM analytics.scholarship_award WHERE domain = \'Economics\'').df())"
232 | ```
233 |
234 | ```
235 | student_name domain semester scholarship_amount
236 | 0 Lauren Levine Economics Semester 2 1000
237 | 1 Gabriel Cooke Economics Semester 2 500
238 | 2 Daniel Lopez Economics Semester 1 1000
239 | 3 Gabriel Cooke Economics Semester 1 500
240 | ```
241 |
242 | As you can see, the expenses have doubled:
243 |
244 | ```sh
245 | lea run --select analytics.finance.expenses
246 | python -c "import duckdb; print(duckdb.connect('school_max.db').execute('SELECT total_expenses FROM analytics.finance__expenses').df())"
247 | ```
248 |
249 | ```
250 | total_expenses
251 | 0 24000.0
252 | ```
253 |
--------------------------------------------------------------------------------
/examples/school/scripts/analytics/finance/expenses.sql:
--------------------------------------------------------------------------------
1 | SELECT
2 | SUM(scholarship_amount) AS total_expenses
3 | FROM
4 | analytics.scholarship_award
5 |
--------------------------------------------------------------------------------
/examples/school/scripts/analytics/major.sql:
--------------------------------------------------------------------------------
1 | WITH ordered_yearly_results AS (
2 | SELECT
3 | student_name,
4 | AVG(average_grade) AS total_grade
5 | FROM
6 | core.yearly_results
7 | GROUP BY student_name
8 | )
9 | SELECT
10 | student_name,
11 | total_grade
12 | FROM ordered_yearly_results
13 | ORDER BY total_grade DESC
14 | LIMIT 1;
15 |
--------------------------------------------------------------------------------
/examples/school/scripts/analytics/scholarship_award.sql:
--------------------------------------------------------------------------------
1 | WITH ordered_results AS (
2 | SELECT
3 | student_name,
4 | class_name,
5 | -- semester, --uncomment here
6 | -- row_number() OVER (PARTITION BY class_name, semester --uncomment here
7 | row_number() OVER (PARTITION BY class_name --comment here
8 | ORDER BY average_grade DESC
9 | ) AS ranking
10 | FROM
11 | core.yearly_results
12 | )
13 | SELECT
14 | student_name,
15 | -- semester, --uncomment here
16 | class_name AS domain,
17 | CASE
18 | WHEN ranking = 1 THEN 1000
19 | WHEN ranking = 2 THEN 500
20 | ELSE 0
21 | END AS scholarship_amount
22 | FROM ordered_results
23 | WHERE ranking <= 2;
24 |
--------------------------------------------------------------------------------
/examples/school/scripts/core/yearly_results.sql:
--------------------------------------------------------------------------------
1 | WITH grades_per_class_per_semester AS (
2 | SELECT
3 | student_id,
4 | class_name,
5 | grade,
6 | CASE
7 | WHEN datepart('month', exam_date) BETWEEN 1 AND 6 THEN 'Semester 1'
8 | ELSE 'Semester 2'
9 | END AS semester
10 | FROM staging.grades
11 | ),
12 | avg_grades_per_class AS (
13 | SELECT
14 | student_id,
15 | class_name,
16 | semester,
17 | AVG(grade) AS average_grade
18 | FROM grades_per_class_per_semester
19 | GROUP BY class_name, semester, student_id
20 | )
21 | SELECT
22 | students.student_id,
23 | CONCAT(students.first_name, ' ', students.last_name) AS student_name,
24 | grades_per_class.class_name,
25 | grades_per_class.semester,
26 | grades_per_class.average_grade,
27 | students.university
28 | FROM avg_grades_per_class AS grades_per_class
29 | LEFT JOIN staging.students AS students
30 | ON grades_per_class.student_id = students.student_id
31 | ORDER BY student_name, class_name;
32 |
--------------------------------------------------------------------------------
/examples/school/scripts/staging/grades.sql:
--------------------------------------------------------------------------------
1 | WITH raw_grades AS (
2 | SELECT * FROM './seeds/raw_grades.csv'
3 | )
4 |
5 | SELECT
6 | -- #NO_NULLS
7 | student_id,
8 | -- #NO_NULLS
9 | class_name,
10 | -- #NO_NULLS
11 | grade,
12 | -- #NO_NULLS
13 | strptime(exam_date, '%m-%Y') AS exam_date,
14 | FROM raw_grades;
15 |
--------------------------------------------------------------------------------
/examples/school/scripts/staging/students.sql:
--------------------------------------------------------------------------------
1 | WITH raw_students AS (
2 | SELECT * FROM './seeds/raw_students.csv'
3 | )
4 |
5 | SELECT
6 | -- #UNIQUE
7 | -- #NO_NULLS
8 | id AS student_id,
9 | first_name,
10 | -- #UNIQUE_BY(first_name)
11 | last_name,
12 | -- #SET{'Stanford University', 'University of California Berkeley', 'Princeton University', 'Harvard University', 'Massachusetts Institute of Technology'}
13 | university,
14 | FROM raw_students;
15 |
--------------------------------------------------------------------------------
/examples/school/scripts/tests/budget.sql:
--------------------------------------------------------------------------------
1 | SELECT
2 | *
3 | FROM analytics.finance__expenses
4 | -- WHERE total_expenses > 24000; --uncomment here
5 | WHERE total_expenses > 12000; --comment here
6 |
--------------------------------------------------------------------------------
/examples/school/seeds/raw_grades.csv:
--------------------------------------------------------------------------------
1 | student_id,class_name,grade,exam_date
2 | 1,Mathematics,88,5-2025
3 | 1,Mathematics,57,10-2025
4 | 1,Mathematics,9,6-2025
5 | 1,Mathematics,80,9-2025
6 | 1,Physics,52,3-2025
7 | 1,Physics,81,10-2025
8 | 1,Physics,100,2-2025
9 | 1,Physics,99,9-2025
10 | 1,Chemistry,49,6-2025
11 | 1,Chemistry,1,11-2025
12 | 1,Chemistry,45,5-2025
13 | 1,Chemistry,0,11-2025
14 | 1,Biology,37,3-2025
15 | 1,Biology,88,9-2025
16 | 1,Biology,20,5-2025
17 | 1,Biology,32,10-2025
18 | 1,Computer Science,1,2-2025
19 | 1,Computer Science,72,9-2025
20 | 1,Computer Science,48,3-2025
21 | 1,Computer Science,70,7-2025
22 | 1,History,14,6-2025
23 | 1,History,100,7-2025
24 | 1,History,57,6-2025
25 | 1,History,58,12-2025
26 | 1,Literature,23,6-2025
27 | 1,Literature,67,10-2025
28 | 1,Literature,48,5-2025
29 | 1,Literature,38,7-2025
30 | 1,Economics,18,5-2025
31 | 1,Economics,94,11-2025
32 | 1,Economics,97,4-2025
33 | 1,Economics,20,7-2025
34 | 2,Mathematics,16,4-2025
35 | 2,Mathematics,38,10-2025
36 | 2,Mathematics,93,5-2025
37 | 2,Mathematics,73,10-2025
38 | 2,Physics,23,3-2025
39 | 2,Physics,83,10-2025
40 | 2,Physics,15,5-2025
41 | 2,Physics,35,11-2025
42 | 2,Chemistry,88,2-2025
43 | 2,Chemistry,96,7-2025
44 | 2,Chemistry,98,3-2025
45 | 2,Chemistry,63,8-2025
46 | 2,Biology,17,1-2025
47 | 2,Biology,32,7-2025
48 | 2,Biology,66,1-2025
49 | 2,Biology,52,8-2025
50 | 2,Computer Science,53,1-2025
51 | 2,Computer Science,53,7-2025
52 | 2,Computer Science,28,3-2025
53 | 2,Computer Science,97,11-2025
54 | 2,History,42,4-2025
55 | 2,History,23,9-2025
56 | 2,History,24,4-2025
57 | 2,History,100,8-2025
58 | 2,Literature,0,2-2025
59 | 2,Literature,56,7-2025
60 | 2,Literature,78,1-2025
61 | 2,Literature,37,10-2025
62 | 2,Economics,52,1-2025
63 | 2,Economics,55,7-2025
64 | 2,Economics,74,4-2025
65 | 2,Economics,20,10-2025
66 | 3,Mathematics,9,6-2025
67 | 3,Mathematics,3,10-2025
68 | 3,Mathematics,67,6-2025
69 | 3,Mathematics,100,11-2025
70 | 3,Physics,78,5-2025
71 | 3,Physics,100,12-2025
72 | 3,Physics,69,4-2025
73 | 3,Physics,97,8-2025
74 | 3,Chemistry,64,1-2025
75 | 3,Chemistry,58,9-2025
76 | 3,Chemistry,94,2-2025
77 | 3,Chemistry,100,8-2025
78 | 3,Biology,96,4-2025
79 | 3,Biology,23,10-2025
80 | 3,Biology,13,5-2025
81 | 3,Biology,89,8-2025
82 | 3,Computer Science,92,1-2025
83 | 3,Computer Science,100,10-2025
84 | 3,Computer Science,100,1-2025
85 | 3,Computer Science,27,9-2025
86 | 3,History,2,6-2025
87 | 3,History,100,7-2025
88 | 3,History,62,6-2025
89 | 3,History,100,11-2025
90 | 3,Literature,78,6-2025
91 | 3,Literature,100,12-2025
92 | 3,Literature,69,6-2025
93 | 3,Literature,16,8-2025
94 | 3,Economics,37,5-2025
95 | 3,Economics,85,8-2025
96 | 3,Economics,42,6-2025
97 | 3,Economics,2,7-2025
98 | 4,Mathematics,99,2-2025
99 | 4,Mathematics,56,11-2025
100 | 4,Mathematics,68,6-2025
101 | 4,Mathematics,43,8-2025
102 | 4,Physics,97,4-2025
103 | 4,Physics,52,9-2025
104 | 4,Physics,16,4-2025
105 | 4,Physics,58,10-2025
106 | 4,Chemistry,71,6-2025
107 | 4,Chemistry,64,12-2025
108 | 4,Chemistry,52,6-2025
109 | 4,Chemistry,88,9-2025
110 | 4,Biology,24,3-2025
111 | 4,Biology,75,7-2025
112 | 4,Biology,77,5-2025
113 | 4,Biology,11,7-2025
114 | 4,Computer Science,46,2-2025
115 | 4,Computer Science,48,8-2025
116 | 4,Computer Science,1,3-2025
117 | 4,Computer Science,93,8-2025
118 | 4,History,60,5-2025
119 | 4,History,69,10-2025
120 | 4,History,35,6-2025
121 | 4,History,63,10-2025
122 | 4,Literature,41,2-2025
123 | 4,Literature,100,8-2025
124 | 4,Literature,42,4-2025
125 | 4,Literature,11,9-2025
126 | 4,Economics,100,5-2025
127 | 4,Economics,100,8-2025
128 | 4,Economics,24,1-2025
129 | 4,Economics,13,10-2025
130 | 5,Mathematics,92,3-2025
131 | 5,Mathematics,73,9-2025
132 | 5,Mathematics,92,5-2025
133 | 5,Mathematics,100,9-2025
134 | 5,Physics,99,5-2025
135 | 5,Physics,17,7-2025
136 | 5,Physics,29,1-2025
137 | 5,Physics,75,9-2025
138 | 5,Chemistry,35,6-2025
139 | 5,Chemistry,60,10-2025
140 | 5,Chemistry,60,3-2025
141 | 5,Chemistry,29,9-2025
142 | 5,Biology,49,4-2025
143 | 5,Biology,65,10-2025
144 | 5,Biology,55,2-2025
145 | 5,Biology,35,11-2025
146 | 5,Computer Science,40,6-2025
147 | 5,Computer Science,77,11-2025
148 | 5,Computer Science,47,1-2025
149 | 5,Computer Science,75,10-2025
150 | 5,History,22,5-2025
151 | 5,History,33,9-2025
152 | 5,History,62,4-2025
153 | 5,History,75,9-2025
154 | 5,Literature,18,5-2025
155 | 5,Literature,100,9-2025
156 | 5,Literature,84,2-2025
157 | 5,Literature,100,10-2025
158 | 5,Economics,63,3-2025
159 | 5,Economics,40,10-2025
160 | 5,Economics,18,6-2025
161 | 5,Economics,37,8-2025
162 | 6,Mathematics,100,4-2025
163 | 6,Mathematics,39,11-2025
164 | 6,Mathematics,18,4-2025
165 | 6,Mathematics,43,12-2025
166 | 6,Physics,96,3-2025
167 | 6,Physics,67,10-2025
168 | 6,Physics,3,3-2025
169 | 6,Physics,37,8-2025
170 | 6,Chemistry,38,2-2025
171 | 6,Chemistry,29,11-2025
172 | 6,Chemistry,62,2-2025
173 | 6,Chemistry,4,11-2025
174 | 6,Biology,89,4-2025
175 | 6,Biology,100,10-2025
176 | 6,Biology,26,1-2025
177 | 6,Biology,100,11-2025
178 | 6,Computer Science,66,1-2025
179 | 6,Computer Science,62,11-2025
180 | 6,Computer Science,12,1-2025
181 | 6,Computer Science,51,12-2025
182 | 6,History,56,3-2025
183 | 6,History,21,12-2025
184 | 6,History,97,3-2025
185 | 6,History,18,9-2025
186 | 6,Literature,12,1-2025
187 | 6,Literature,31,7-2025
188 | 6,Literature,56,6-2025
189 | 6,Literature,100,10-2025
190 | 6,Economics,60,3-2025
191 | 6,Economics,100,7-2025
192 | 6,Economics,62,3-2025
193 | 6,Economics,1,11-2025
194 | 7,Mathematics,10,3-2025
195 | 7,Mathematics,65,8-2025
196 | 7,Mathematics,21,1-2025
197 | 7,Mathematics,19,9-2025
198 | 7,Physics,30,6-2025
199 | 7,Physics,0,9-2025
200 | 7,Physics,2,1-2025
201 | 7,Physics,91,11-2025
202 | 7,Chemistry,28,3-2025
203 | 7,Chemistry,6,12-2025
204 | 7,Chemistry,7,1-2025
205 | 7,Chemistry,5,7-2025
206 | 7,Biology,96,4-2025
207 | 7,Biology,38,9-2025
208 | 7,Biology,17,5-2025
209 | 7,Biology,87,7-2025
210 | 7,Computer Science,80,6-2025
211 | 7,Computer Science,41,12-2025
212 | 7,Computer Science,79,3-2025
213 | 7,Computer Science,94,9-2025
214 | 7,History,15,3-2025
215 | 7,History,57,9-2025
216 | 7,History,15,2-2025
217 | 7,History,0,7-2025
218 | 7,Literature,51,3-2025
219 | 7,Literature,64,8-2025
220 | 7,Literature,1,1-2025
221 | 7,Literature,57,10-2025
222 | 7,Economics,1,5-2025
223 | 7,Economics,22,8-2025
224 | 7,Economics,42,4-2025
225 | 7,Economics,78,10-2025
226 |
--------------------------------------------------------------------------------
/examples/school/seeds/raw_students.csv:
--------------------------------------------------------------------------------
1 | id,first_name,last_name,university,age
2 | 1,Lauren,Levine,Stanford University,22
3 | 2,Daniel,Lopez,Massachusetts Institute of Technology,24
4 | 3,Melanie,Foster,University of California Berkeley,20
5 | 4,Gabriel,Cooke,Harvard University,19
6 | 5,Anne,Porter,Harvard University,23
7 | 6,Amy,Lee,Princeton University,24
8 | 7,Rebecca,Chavez,Princeton University,25
9 |
--------------------------------------------------------------------------------
/lea/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import logging
4 |
5 | import click
6 | from rich.logging import RichHandler
7 |
8 | from lea import cli, databases
9 | from lea.conductor import Conductor
10 |
11 | logging.basicConfig(
12 | level=logging.INFO,
13 | format="%(message)s",
14 | datefmt="[%X]",
15 | handlers=[
16 | RichHandler(
17 | rich_tracebacks=True,
18 | show_level=False,
19 | show_path=False,
20 | markup=True,
21 | tracebacks_suppress=[click],
22 | )
23 | ],
24 | )
25 |
26 | log = logging.getLogger("rich")
27 |
28 |
29 | __all__ = ["cli", "log", "Conductor", "databases"]
30 |
--------------------------------------------------------------------------------
/lea/assertions/NO_NULLS.sql.jinja:
--------------------------------------------------------------------------------
1 | SELECT ROW_NUMBER() OVER () AS row_number
2 | FROM {{ table }}
3 | WHERE {{ column }} IS NULL
4 |
--------------------------------------------------------------------------------
/lea/assertions/SET.sql.jinja:
--------------------------------------------------------------------------------
1 | SELECT DISTINCT({{ column }}) AS {{ column }}
2 | FROM {{ table }}
3 | WHERE {{ column }} NOT IN ({{ ', '.join(elements) }})
4 |
--------------------------------------------------------------------------------
/lea/assertions/UNIQUE.sql.jinja:
--------------------------------------------------------------------------------
1 | SELECT
2 | {{ column }},
3 | COUNT(*) AS n
4 | FROM {{ table }}
5 | GROUP BY {{ column }}
6 | HAVING n > 1
7 |
--------------------------------------------------------------------------------
/lea/assertions/UNIQUE_BY.sql.jinja:
--------------------------------------------------------------------------------
1 | SELECT
2 | {{ by }},
3 | COUNT(*) AS n,
4 | COUNT(DISTINCT {{ column }}) AS n_distinct
5 | FROM {{ table }}
6 | GROUP BY {{ by }}
7 | HAVING n != n_distinct
8 |
--------------------------------------------------------------------------------
/lea/cli.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import collections
4 | import pathlib
5 |
6 | import click
7 |
8 | import lea
9 |
10 |
11 | @click.group()
12 | def app():
13 | ...
14 |
15 |
16 | @app.command()
17 | @click.option("--select", "-m", multiple=True, default=["*"], help="Scripts to materialize.")
18 | @click.option("--unselect", "-m", multiple=True, default=[], help="Scripts to unselect.")
19 | @click.option("--dataset", default=None, help="Name of the base dataset.")
20 | @click.option("--scripts", default="scripts", help="Directory where the scripts are located.")
21 | @click.option(
22 | "--incremental", nargs=2, type=str, multiple=True, help="Incremental field name and value."
23 | )
24 | @click.option("--dry", is_flag=True, default=False, help="Whether to run in dry mode.")
25 | @click.option("--print", is_flag=True, default=False, help="Whether to print the SQL code.")
26 | @click.option(
27 | "--production", is_flag=True, default=False, help="Whether to run the scripts in production."
28 | )
29 | @click.option("--restart", is_flag=True, default=False, help="Whether to restart from scratch.")
30 | def run(select, unselect, dataset, scripts, incremental, dry, print, production, restart):
31 | if select in {"", "Ø"}:
32 | select = []
33 |
34 | if not pathlib.Path(scripts).is_dir():
35 | raise click.ClickException(f"Directory {scripts} does not exist")
36 |
37 | # Handle incremental option
38 | incremental_field_values = collections.defaultdict(set)
39 | for field, value in incremental:
40 | incremental_field_values[field].add(value)
41 | if len(incremental_field_values) > 1:
42 | raise click.ClickException("Specifying multiple incremental fields is not supported")
43 | incremental_field_name = next(iter(incremental_field_values), None)
44 | incremental_field_values = incremental_field_values[incremental_field_name]
45 |
46 | conductor = lea.Conductor(scripts_dir=scripts, dataset_name=dataset)
47 | conductor.run(
48 | select=select,
49 | unselect=unselect,
50 | production=production,
51 | dry_run=dry,
52 | restart=restart,
53 | incremental_field_name=incremental_field_name,
54 | incremental_field_values=incremental_field_values,
55 | print_mode=print,
56 | )
57 |
--------------------------------------------------------------------------------
/lea/comment.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import collections
4 | import dataclasses
5 |
6 | import sqlglot
7 |
8 | from .dialects import SQLDialect
9 |
10 |
11 | @dataclasses.dataclass
12 | class Comment:
13 | line: int
14 | text: str
15 |
16 |
17 | class CommentBlock(collections.UserList):
18 | def __init__(self, comments: list[Comment]):
19 | super().__init__(sorted(comments, key=lambda c: c.line))
20 |
21 | @property
22 | def first_line(self):
23 | return self[0].line
24 |
25 | @property
26 | def last_line(self):
27 | return self[-1].line
28 |
29 |
30 | def extract_comments(
31 | code: str, expected_field_names: list[str], sql_dialect: SQLDialect
32 | ) -> dict[str, CommentBlock]:
33 | dialect = sqlglot.Dialect.get_or_raise(sql_dialect.sqlglot_dialect.value)
34 | tokens = dialect.tokenizer_class().tokenize(code)
35 |
36 | # Extract comments, which are lines that start with --
37 | comments = [
38 | Comment(line=line, text=comment.replace("--", "").strip())
39 | for line, comment in enumerate(code.splitlines(), start=1)
40 | if comment.strip().startswith("--")
41 | ]
42 |
43 | # Pack comments into CommentBlock objects
44 | comment_blocks = merge_adjacent_comments(comments)
45 |
46 | # We assume the tokens are sorted. Therefore, by looping over them and building a dictionary,
47 | # each key will be unique and the last value will be the last variable in the line.
48 | var_tokens = [
49 | token
50 | for token in tokens
51 | if token.token_type.value == "VAR" and token.text in expected_field_names
52 | ]
53 |
54 | def is_var_line(line):
55 | line_tokens = [t for t in tokens if t.line == line and t.token_type.value != "COMMA"]
56 | return line_tokens[-1].token_type.value == "VAR"
57 |
58 | last_var_per_line = {token.line: token.text for token in var_tokens if is_var_line(token.line)}
59 |
60 | # Now assign each comment block to a variable
61 | var_comments = {}
62 | for comment_block in comment_blocks:
63 | adjacent_var = next(
64 | (var for line, var in last_var_per_line.items() if comment_block.last_line == line - 1),
65 | None,
66 | )
67 | if adjacent_var:
68 | var_comments[adjacent_var] = comment_block
69 |
70 | return var_comments
71 |
72 |
73 | def merge_adjacent_comments(comments: list[Comment]) -> list[CommentBlock]:
74 | if not comments:
75 | return []
76 |
77 | # Sort comments by their line number
78 | comments.sort(key=lambda c: c.line)
79 |
80 | merged_blocks = []
81 | current_block = [comments[0]]
82 |
83 | # Iterate through comments and group adjacent ones
84 | for i in range(1, len(comments)):
85 | if comments[i].line == comments[i - 1].line + 1: # Check if adjacent
86 | current_block.append(comments[i])
87 | else:
88 | # Create a CommentBlock for the current group
89 | merged_blocks.append(CommentBlock(current_block))
90 | # Start a new block
91 | current_block = [comments[i]]
92 |
93 | # Add the last block
94 | merged_blocks.append(CommentBlock(current_block))
95 |
96 | return merged_blocks
97 |
--------------------------------------------------------------------------------
/lea/conductor.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import concurrent.futures
4 | import datetime as dt
5 | import getpass
6 | import json
7 | import os
8 | import pathlib
9 | import sys
10 |
11 | import dotenv
12 |
13 | import lea
14 | from lea import databases
15 | from lea.dag import DAGOfScripts
16 | from lea.databases import DatabaseClient, TableStats
17 | from lea.dialects import BigQueryDialect, DuckDBDialect
18 | from lea.session import Session
19 | from lea.table_ref import AUDIT_TABLE_SUFFIX, TableRef
20 |
21 |
22 | class Conductor:
23 | def __init__(
24 | self, scripts_dir: str, dataset_name: str | None = None, project_name: str | None = None
25 | ):
26 | # Load environment variables from .env file
27 | # TODO: is it Pythonic to do this here?
28 | dotenv.load_dotenv(".env", verbose=True)
29 |
30 | self.warehouse = os.environ["LEA_WAREHOUSE"].lower()
31 |
32 | self.scripts_dir = pathlib.Path(scripts_dir)
33 | if not self.scripts_dir.is_dir():
34 | raise ValueError(f"Directory {self.scripts_dir} not found")
35 |
36 | if dataset_name is None:
37 | if self.warehouse == "bigquery":
38 | dataset_name = os.environ.get("LEA_BQ_DATASET_NAME")
39 |
40 | if self.warehouse == "duckdb":
41 | duckdb_path = pathlib.Path(os.environ.get("LEA_DUCKDB_PATH", ""))
42 | dataset_name = duckdb_path.stem
43 | if dataset_name is None:
44 | raise ValueError("Dataset name could not be inferred")
45 | self.dataset_name = dataset_name
46 |
47 | if project_name is None:
48 | if self.warehouse == "bigquery":
49 | project_name = os.environ.get("LEA_BQ_PROJECT_ID")
50 | if self.warehouse == "duckdb":
51 | project_name = dataset_name
52 | if project_name is None:
53 | raise ValueError("Project name could not be inferred")
54 | self.project_name = project_name
55 |
56 | lea.log.info("📝 Reading scripts")
57 |
58 | if self.warehouse == "bigquery":
59 | self.dag = DAGOfScripts.from_directory(
60 | scripts_dir=self.scripts_dir,
61 | sql_dialect=BigQueryDialect(),
62 | dataset_name=self.dataset_name,
63 | project_name=self.project_name if self.warehouse == "bigquery" else None,
64 | )
65 | if self.warehouse == "duckdb":
66 | self.dag = DAGOfScripts.from_directory(
67 | scripts_dir=self.scripts_dir,
68 | sql_dialect=DuckDBDialect(),
69 | dataset_name=self.dataset_name,
70 | project_name=None,
71 | )
72 | lea.log.info(f"{sum(1 for s in self.dag.scripts if not s.is_test):,d} table scripts")
73 | lea.log.info(f"{sum(1 for s in self.dag.scripts if s.is_test):,d} test scripts")
74 |
75 | def run(
76 | self,
77 | select: list[str],
78 | unselect: list[str],
79 | production: bool = False,
80 | dry_run: bool = False,
81 | restart: bool = False,
82 | incremental_field_name: str | None = None,
83 | incremental_field_values: list[str] | None = None,
84 | print_mode: bool = False,
85 | ):
86 | session = self.prepare_session(
87 | select=select,
88 | unselect=unselect,
89 | production=production,
90 | dry_run=dry_run,
91 | incremental_field_name=incremental_field_name,
92 | incremental_field_values=incremental_field_values,
93 | print_mode=print_mode,
94 | )
95 |
96 | try:
97 | self.run_session(session, restart=restart, dry_run=dry_run)
98 | if session.any_error_has_occurred:
99 | return sys.exit(1)
100 | except KeyboardInterrupt:
101 | lea.log.error("🛑 Keyboard interrupt")
102 | session.end()
103 | return sys.exit(1)
104 |
105 | def prepare_session(
106 | self,
107 | select: list[str],
108 | unselect: list[str],
109 | production: bool = False,
110 | dry_run: bool = False,
111 | incremental_field_name: str | None = None,
112 | incremental_field_values: list[str] | None = None,
113 | print_mode: bool = False,
114 | ) -> Session:
115 | # We need a database client to run scripts
116 | database_client = self.make_client(dry_run=dry_run, print_mode=print_mode)
117 |
118 | # We need to select the scripts we want to run. We do this by querying the DAG.
119 | selected_table_refs = self.dag.select(*select)
120 | unselected_table_refs = self.dag.select(*unselect)
121 | if not selected_table_refs - unselected_table_refs:
122 | msg = "Nothing found for select " + ", ".join(select)
123 | if unselect:
124 | msg += " and unselect: " + ", ".join(unselect)
125 | lea.log.error(msg)
126 | return sys.exit(1)
127 |
128 | # We need a dataset to materialize the scripts. If we're in production mode, we use the
129 | # base dataset. If we're in user mode, we use a dataset named after the user.
130 | write_dataset = self.dataset_name if production else self.name_user_dataset()
131 | database_client.create_dataset(write_dataset)
132 |
133 | # When using DuckDB, we need to create schema for the tables
134 | if self.warehouse == "duckdb":
135 | lea.log.info("🔩 Creating schemas")
136 | for table_ref in selected_table_refs - unselected_table_refs:
137 | database_client.create_schema(table_ref)
138 |
139 | # When the scripts run, they are materialized into side-tables which we call "audit"
140 | # tables. When a run stops because of an error, the audit tables are left behind. If we
141 | # want to start fresh, we have to delete the audit tables. If not, the materialized tables
142 | # can be skipped.
143 | existing_tables = self.list_existing_tables(
144 | database_client=database_client, dataset=write_dataset
145 | )
146 | lea.log.info(f"{len(existing_tables):,d} tables already exist")
147 | existing_audit_tables = self.list_existing_audit_tables(
148 | database_client=database_client, dataset=write_dataset
149 | )
150 |
151 | lea.log.info(f"{len(existing_audit_tables):,d} audit tables already exist")
152 |
153 | session = Session(
154 | database_client=database_client,
155 | base_dataset=self.dataset_name,
156 | write_dataset=write_dataset,
157 | scripts=self.dag.scripts,
158 | selected_table_refs=selected_table_refs,
159 | unselected_table_refs=unselected_table_refs,
160 | existing_tables=existing_tables,
161 | existing_audit_tables=existing_audit_tables,
162 | incremental_field_name=incremental_field_name,
163 | incremental_field_values=incremental_field_values,
164 | )
165 |
166 | return session
167 |
168 | def run_session(self, session: Session, restart: bool, dry_run: bool):
169 | if restart:
170 | delete_audit_tables(session)
171 |
172 | # Loop over table references in topological order
173 | materialize_scripts(dag=self.dag, session=session)
174 |
175 | # At this point, the scripts have been materialized into side-tables which we call "audit"
176 | # tables. We can now take care of promoting the audit tables to production.
177 | if not session.any_error_has_occurred and not dry_run:
178 | promote_audit_tables(session)
179 |
180 | # If all the scripts succeeded, we can delete the audit tables.
181 | if not session.any_error_has_occurred and not dry_run:
182 | delete_audit_tables(session)
183 |
184 | # Let's also delete orphan tables, which are tables that exist but whose scripts have
185 | # been deleted.
186 | delete_orphan_tables(session)
187 |
188 | # Regardless of whether all the jobs succeeded or not, we want to summarize the session.
189 | session.end()
190 | duration_str = str(session.ended_at - session.started_at).split(".")[0] # type: ignore[operator]
191 | emoji = "✅" if not session.any_error_has_occurred else "❌"
192 | msg = f"{emoji} Finished"
193 | if session.ended_at - session.started_at > dt.timedelta(seconds=1):
194 | msg += f", took {duration_str}"
195 | else:
196 | msg += ", took less than a second 🚀"
197 | if session.total_billed_dollars > 0:
198 | msg += f", cost ${session.total_billed_dollars:.2f}"
199 | lea.log.info(msg)
200 |
201 | def make_client(self, dry_run: bool = False, print_mode: bool = False) -> DatabaseClient:
202 | if self.warehouse.lower() == "bigquery":
203 | # Do imports here to avoid loading them all the time
204 | from google.oauth2 import service_account
205 |
206 | scopes_str = os.environ.get("LEA_BQ_SCOPES", "https://www.googleapis.com/auth/bigquery")
207 | scopes = scopes_str.split(",")
208 | scopes = [scope.strip() for scope in scopes]
209 |
210 | credentials = (
211 | service_account.Credentials.from_service_account_info(
212 | json.loads(bq_service_account_info_str, strict=False), scopes=scopes
213 | )
214 | if (bq_service_account_info_str := os.environ.get("LEA_BQ_SERVICE_ACCOUNT"))
215 | is not None
216 | else None
217 | )
218 | client = databases.BigQueryClient(
219 | credentials=credentials,
220 | location=os.environ["LEA_BQ_LOCATION"],
221 | write_project_id=os.environ["LEA_BQ_PROJECT_ID"],
222 | compute_project_id=os.environ.get(
223 | "LEA_BQ_COMPUTE_PROJECT_ID",
224 | credentials.project_id if credentials is not None else None,
225 | ),
226 | storage_billing_model=os.environ.get("LEA_BQ_STORAGE_BILLING_MODEL"),
227 | dry_run=dry_run,
228 | print_mode=print_mode,
229 | default_clustering_fields=[
230 | clustering_field.strip()
231 | for clustering_field in os.environ.get(
232 | "LEA_BQ_DEFAULT_CLUSTERING_FIELDS", ""
233 | ).split(",")
234 | if clustering_field.strip()
235 | ],
236 | big_blue_pick_api_url=os.environ.get("LEA_BQ_BIG_BLUE_PICK_API_URL"),
237 | big_blue_pick_api_key=os.environ.get("LEA_BQ_BIG_BLUE_PICK_API_KEY"),
238 | big_blue_pick_api_on_demand_project_id=os.environ.get(
239 | "LEA_BQ_BIG_BLUE_PICK_API_ON_DEMAND_PROJECT_ID"
240 | ),
241 | big_blue_pick_api_reservation_project_id=os.environ.get(
242 | "LEA_BQ_BIG_BLUE_PICK_API_REVERVATION_PROJECT_ID"
243 | ),
244 | )
245 | if client.big_blue_pick_api is not None:
246 | lea.log.info("🧔♂️ Using Big Blue Pick API")
247 | return client
248 |
249 | if self.warehouse.lower() == "duckdb":
250 | return databases.DuckDBClient(
251 | database_path=pathlib.Path(os.environ.get("LEA_DUCKDB_PATH", "")),
252 | dry_run=dry_run,
253 | print_mode=print_mode,
254 | )
255 |
256 | raise ValueError(f"Unsupported warehouse {self.warehouse!r}")
257 |
258 | def name_user_dataset(self) -> str:
259 | username = os.environ.get("LEA_USERNAME", getpass.getuser())
260 | return f"{self.dataset_name}_{username}"
261 |
262 | def list_existing_tables(
263 | self, database_client: DatabaseClient, dataset: str
264 | ) -> dict[TableRef, TableStats]:
265 | existing_tables = database_client.list_table_stats(dataset)
266 | existing_tables = {
267 | table_ref: stats
268 | for table_ref, stats in existing_tables.items()
269 | if not table_ref.name.endswith(AUDIT_TABLE_SUFFIX)
270 | }
271 | return existing_tables
272 |
273 | def list_existing_audit_tables(
274 | self, database_client: DatabaseClient, dataset: str
275 | ) -> dict[TableRef, TableStats]:
276 | existing_audit_tables = database_client.list_table_stats(dataset)
277 | existing_audit_tables = {
278 | table_ref: stats
279 | for table_ref, stats in existing_audit_tables.items()
280 | if table_ref.name.endswith(AUDIT_TABLE_SUFFIX)
281 | }
282 | return existing_audit_tables
283 |
284 |
285 | def materialize_scripts(dag: DAGOfScripts, session: Session):
286 | table_refs_to_run = determine_table_refs_to_run(
287 | selected_table_refs=session.selected_table_refs,
288 | unselected_table_refs=session.unselected_table_refs,
289 | existing_audit_tables=session.existing_audit_tables,
290 | dag=dag,
291 | base_dataset=session.base_dataset,
292 | )
293 | if not table_refs_to_run:
294 | lea.log.info("✅ Nothing needs materializing")
295 | return
296 | lea.log.info(f"🔵 Running {len(table_refs_to_run):,d} scripts")
297 | dag.prepare()
298 | while dag.is_active():
299 | # If we're in early end mode, we need to check if any script errored, in which case we
300 | # have to stop everything.
301 | if session.any_error_has_occurred:
302 | lea.log.error("✋ Early ending because an error occurred")
303 | break
304 |
305 | # Start available jobs
306 | for script_to_run in dag.iter_scripts(table_refs_to_run):
307 | # Before executing a script, we need to contextualize it. We have to edit its
308 | # dependencies, add incremental logic, and set the write context.
309 | script_to_run = session.add_context_to_script(script_to_run)
310 | # 🔨 if you're developing on lea, you can call session.run_script(script_to_run) here
311 | # to get a better stack trace. This is because the executor will run the script in a
312 | # different thread, and the exception will be raised in that thread, not in the main
313 | # thread.
314 | future = session.executor.submit(session.run_script, script_to_run)
315 | session.run_script_futures[future] = script_to_run
316 |
317 | # Check for scripts that have finished
318 | done, _ = concurrent.futures.wait(
319 | session.run_script_futures, return_when=concurrent.futures.FIRST_COMPLETED
320 | )
321 | for future in done:
322 | script_done = session.run_script_futures[future]
323 | if exception := future.exception():
324 | lea.log.error(f"Failed running {script_done.table_ref}\n{exception}")
325 | table_ref = session.remove_write_context_from_table_ref(script_done.table_ref)
326 | session.run_script_futures_complete[future] = session.run_script_futures.pop(future)
327 | dag.done(table_ref)
328 |
329 |
330 | def promote_audit_tables(session: Session):
331 | lea.log.info("🟢 Promoting audit tables")
332 | # Ideally, we would like to do this automatically, but BigQuery does not support DDL
333 | # statements in a transaction. So we do it concurrently. This isn't ideal, but it's the
334 | # best we can do for now. There's a very small chance that at least one promotion job will
335 | # fail.
336 | # https://hiflylabs.com/blog/2022/11/22/dbt-deployment-best-practices
337 | # https://calogica.com/sql/bigquery/dbt/2020/05/24/dbt-bigquery-blue-green-wap.html
338 | # https://calogica.com/assets/wap_dbt_bigquery.pdf
339 | # Note: it's important for the following loop to be a list comprehension. If we used a
340 | # generator expression, the loop would be infinite because jobs are being added to
341 | # session.jobs when session.promote is called.
342 | for selected_table_ref in session.selected_table_refs:
343 | if selected_table_ref.is_test:
344 | continue
345 | selected_table_ref = session.add_write_context_to_table_ref(selected_table_ref)
346 | future = session.executor.submit(session.promote_audit_table, selected_table_ref)
347 | session.promote_audit_tables_futures[future] = selected_table_ref
348 |
349 | # Wait for all promotion jobs to finish
350 | for future in concurrent.futures.as_completed(session.promote_audit_tables_futures):
351 | if (exception := future.exception()) is not None:
352 | lea.log.error(f"Promotion failed\n{exception}")
353 |
354 |
355 | def delete_audit_tables(session: Session):
356 | # Depending on when delete_audit_tables is called, there might be new audit tables that have
357 | # been created. We need to delete them too. We do this by adding the write context to the
358 | # table references. This will add the audit suffix to the table reference, which will make
359 | # it match the audit tables that have been created.
360 | table_refs_to_delete = set(session.existing_audit_tables) | {
361 | session.add_write_context_to_table_ref(table_ref)
362 | for table_ref in session.selected_table_refs
363 | }
364 | if table_refs_to_delete:
365 | lea.log.info("🧹 Deleting audit tables")
366 | delete_table_refs(
367 | table_refs=table_refs_to_delete,
368 | database_client=session.database_client,
369 | executor=concurrent.futures.ThreadPoolExecutor(max_workers=None),
370 | verbose=False,
371 | )
372 | session.existing_audit_tables = {}
373 |
374 |
375 | def delete_orphan_tables(session: Session):
376 | table_refs_to_delete = set(session.existing_tables) - {
377 | session.add_write_context_to_table_ref(table_ref).remove_audit_suffix()
378 | for table_ref in session.scripts
379 | }
380 | if table_refs_to_delete:
381 | lea.log.info("🧹 Deleting orphan tables")
382 | delete_table_refs(
383 | table_refs=table_refs_to_delete,
384 | database_client=session.database_client,
385 | executor=concurrent.futures.ThreadPoolExecutor(max_workers=None),
386 | verbose=True,
387 | )
388 | session.existing_audit_tables = {}
389 |
390 |
391 | def delete_table_refs(
392 | table_refs: set[TableRef],
393 | database_client: DatabaseClient,
394 | executor: concurrent.futures.ThreadPoolExecutor,
395 | verbose: bool,
396 | ):
397 | futures: dict[concurrent.futures.Future, TableRef] = {}
398 | for table_ref in table_refs:
399 | future = executor.submit(database_client.delete_table, table_ref)
400 | futures[future] = table_ref
401 |
402 | for future in concurrent.futures.as_completed(futures):
403 | if (exception := future.exception()) is not None:
404 | lea.log.error(exception)
405 | continue
406 | if verbose:
407 | lea.log.info(f"Deleted {futures[future]}")
408 |
409 |
410 | def determine_table_refs_to_run(
411 | selected_table_refs: set[TableRef],
412 | unselected_table_refs: set[TableRef],
413 | existing_audit_tables: dict[TableRef, TableStats],
414 | dag: DAGOfScripts,
415 | base_dataset: str,
416 | ) -> set[TableRef]:
417 | """Determine which table references need to be run.
418 |
419 | We want to:
420 |
421 | 1. Run tables that have been selected. This is obtained from the DAGOfScripts.select method.
422 | 2. Skip tables that already exist. This is obtained from the database client.
423 | 3. Don't skip tables that have been edited since last being run. This is obtained from the
424 | scripts themselves.
425 |
426 | This last requirement is why we need an extra method to determine which table references need
427 | to be run. We compare the updated_at of the script with the updated_at of the corresponding
428 | table (if it exists): a script that has been modified since the last time it was run needs to
429 | be run again. All the descendants of this script also need to be run.
430 |
431 | On top of this, we also include each test script that is associated with the selected table
432 | references. We do this because it's a good default behavior.
433 |
434 | """
435 | table_refs_to_run = selected_table_refs.copy()
436 |
437 | # By default, we do not run scripts that have an audit table materialized. We will determine
438 | # afterwards, based on each script's modified_at, if we need to run them again.
439 | existing_audit_table_refs = {
440 | table_ref.remove_audit_suffix().replace_dataset(base_dataset): stats
441 | for table_ref, stats in existing_audit_tables.items()
442 | }
443 | table_refs_to_run -= set(existing_audit_table_refs)
444 |
445 | # Now we check if any of the audit tables have had their script modified since the last time
446 | # they were materialized. If so, we need to run them again, as well as their descendants.
447 | for table_ref in selected_table_refs & set(existing_audit_table_refs):
448 | script = dag.scripts[table_ref]
449 | if script.updated_at > existing_audit_table_refs[table_ref].updated_at:
450 | lea.log.info(f"📝 {table_ref} was modified, re-materializing it")
451 | table_refs_to_run.add(table_ref)
452 | table_refs_to_run |= set(dag.iter_descendants(table_ref)) & selected_table_refs
453 |
454 | # Include applicable tests. That is, test scripts whose dependencies are all in the set of
455 | # selected table references.
456 | applicable_test_scripts_table_refs = {
457 | script.table_ref
458 | for script in dag.scripts.values()
459 | if script.is_test
460 | and all(dependency in table_refs_to_run for dependency in script.dependencies)
461 | }
462 | table_refs_to_run |= applicable_test_scripts_table_refs
463 |
464 | # Now we remove the unselected table references from the set of table references to run. We do
465 | # this at the very end, because of the above logic which adds table references to the set of
466 | # table references to run. For instance, if we run
467 | #
468 | # lea --select core.accounts --unselect tests
469 | #
470 | # we don't want the tests which are applicable to core.accounts to be run.
471 | table_refs_to_run -= unselected_table_refs
472 |
473 | return table_refs_to_run
474 |
--------------------------------------------------------------------------------
/lea/dag.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import graphlib
4 | import pathlib
5 | import re
6 | from collections.abc import Iterator
7 |
8 | import git
9 |
10 | from .dialects import SQLDialect
11 | from .scripts import Script, read_scripts
12 | from .table_ref import TableRef
13 |
14 |
15 | class DAGOfScripts(graphlib.TopologicalSorter):
16 | def __init__(
17 | self,
18 | dependency_graph: dict[TableRef, set[TableRef]],
19 | scripts: list[Script],
20 | scripts_dir: pathlib.Path,
21 | dataset_name: str,
22 | project_name: str | None,
23 | ):
24 | graphlib.TopologicalSorter.__init__(self, dependency_graph)
25 | self.dependency_graph = dependency_graph
26 | self.scripts = {script.table_ref: script for script in scripts}
27 | self.scripts_dir = scripts_dir
28 | self.dataset_name = dataset_name
29 | self.project_name = project_name
30 |
31 | @classmethod
32 | def from_directory(
33 | cls,
34 | scripts_dir: pathlib.Path,
35 | sql_dialect: SQLDialect,
36 | dataset_name: str,
37 | project_name: str | None,
38 | ) -> DAGOfScripts:
39 | scripts = read_scripts(
40 | scripts_dir=scripts_dir,
41 | sql_dialect=sql_dialect,
42 | dataset_name=dataset_name,
43 | project_name=project_name,
44 | )
45 |
46 | # Fields in the script's code may contain tags. These tags induce assertion tests, which
47 | # are also scripts. We need to include these assertion tests in the dependency graph.
48 | for script in scripts:
49 | scripts.extend(script.assertion_tests)
50 |
51 | # TODO: the following is quite slow. This is because parsing dependencies from each script
52 | # is slow. There are several optimizations that could be done.
53 | dependency_graph = {
54 | script.table_ref: {
55 | dependency.replace_dataset(dataset_name) for dependency in script.dependencies
56 | }
57 | for script in scripts
58 | }
59 |
60 | return cls(
61 | dependency_graph=dependency_graph,
62 | scripts=scripts,
63 | scripts_dir=scripts_dir,
64 | dataset_name=dataset_name,
65 | project_name=project_name,
66 | )
67 |
68 | def select(self, *queries: str) -> set[TableRef]:
69 | """Select a subset of the views in the DAG."""
70 |
71 | def _select(
72 | query: str,
73 | include_ancestors: bool = False,
74 | include_descendants: bool = False,
75 | ):
76 | if query == "*":
77 | yield from self.scripts.keys()
78 | return
79 |
80 | # It's possible to query views via git. For example:
81 | # * `git` will select all the views that have been modified compared to the main branch.
82 | # * `git+` will select all the modified views, and their descendants.
83 | # * `+git` will select all the modified views, and their ancestors.
84 | # * `+git+` will select all the modified views, with their ancestors and descendants.
85 | if m := re.match(r"(?P<ancestors>\+?)git(?P<descendants>\+?)", query):
86 | include_ancestors = include_ancestors or m.group("ancestors") == "+"
87 | include_descendants = include_descendants or m.group("descendants") == "+"
88 | for table_ref in list_table_refs_that_changed(
89 | scripts_dir=self.scripts_dir, project_name=self.project_name
90 | ):
91 | yield from _select(
92 | ".".join([*table_ref.schema, table_ref.name]),
93 | include_ancestors=include_ancestors,
94 | include_descendants=include_descendants,
95 | )
96 | return
97 |
98 | if query.endswith("+"):
99 | yield from _select(
100 | query=query[:-1],
101 | include_ancestors=include_ancestors,
102 | include_descendants=True,
103 | )
104 | return
105 |
106 | if query.startswith("+"):
107 | yield from _select(
108 | query=query[1:],
109 | include_ancestors=True,
110 | include_descendants=include_descendants,
111 | )
112 | return
113 |
114 | if "/" in query:
115 | schema = tuple(query.strip("/").split("/"))
116 | for table_ref in self.dependency_graph:
117 | if table_ref.schema == schema:
118 | yield from _select(
119 | ".".join([*table_ref.schema, table_ref.name]),
120 | include_ancestors=include_ancestors,
121 | include_descendants=include_descendants,
122 | )
123 | return
124 |
125 | *schema, name = query.split(".")
126 | table_ref = TableRef(
127 | dataset=self.dataset_name,
128 | schema=tuple(schema),
129 | name=name,
130 | project=self.project_name,
131 | )
132 | yield table_ref
133 | if include_ancestors:
134 | yield from self.iter_ancestors(node=table_ref)
135 | if include_descendants:
136 | yield from self.iter_descendants(node=table_ref)
137 |
138 | all_selected_table_refs = set()
139 | for query in queries:
140 | selected_table_refs = set(_select(query))
141 | all_selected_table_refs.update(selected_table_refs)
142 |
143 | return {
144 | table_ref
145 | for table_ref in all_selected_table_refs
146 | # Some nodes in the graph are not part of the views, such as external dependencies
147 | if table_ref in self.scripts
148 | }
149 |
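    # Illustrative sketch, not part of the original module: how the selection grammar handled by
    # `select` is typically combined. The `dag` instance and view names are made-up assumptions;
    # only the query syntax ("*", "+" prefixes/suffixes, "git", schema paths, dotted names) comes
    # from the code and comments above.
    #
    #     dag.select("*")              # every view
    #     dag.select("core.orders")    # a single view
    #     dag.select("core.orders+")   # the view and its descendants
    #     dag.select("+core.orders")   # the view and its ancestors
    #     dag.select("staging/")       # every view in the staging schema
    #     dag.select("git+")           # views modified with respect to main, plus their descendants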
150 | def iter_scripts(self, table_refs: set[TableRef]) -> Iterator[Script]:
151 | """Loop over scripts in topological order.
152 |
153 | This method does not have the responsibility of calling .prepare() and .done() when a
154 | script terminates. This is the responsibility of the caller.
155 |
156 | """
157 |
158 | for table_ref in self.get_ready():
159 | if (
160 | # The DAG contains all the scripts as well as all the dependencies of each script.
161 | # Not all of these dependencies are scripts. We need to filter out the non-script
162 | # dependencies.
163 | table_ref not in self.scripts
164 | # We also need to filter out the scripts that are not part of the selected table
165 | # refs.
166 | or table_ref not in table_refs
167 | ):
168 | self.done(table_ref)
169 | continue
170 |
171 | yield self.scripts[table_ref]
172 |
173 | def iter_ancestors(self, node: TableRef):
174 | for child in self.dependency_graph.get(node, []):
175 | yield child
176 | yield from self.iter_ancestors(node=child)
177 |
178 | def iter_descendants(self, node: TableRef):
179 | for potential_child in self.dependency_graph:
180 | if node in self.dependency_graph[potential_child]:
181 | yield potential_child
182 | yield from self.iter_descendants(node=potential_child)
183 |
184 |
185 | def list_table_refs_that_changed(scripts_dir: pathlib.Path, project_name: str) -> set[TableRef]:
186 | repo = git.Repo(search_parent_directories=True)
187 | repo_root = pathlib.Path(repo.working_tree_dir)
188 |
189 | absolute_scripts_dir = scripts_dir.resolve()
190 |
191 | # Changes that have been committed
192 | staged_diffs = repo.index.diff(
193 | repo.refs.main.commit
194 | # repo.remotes.origin.refs.main.commit
195 | )
196 | # Changes that have not been committed
197 |     unstaged_diffs = repo.head.commit.diff(None)
198 |
199 | table_refs = set()
200 |     for diff in staged_diffs + unstaged_diffs:
201 | # One thing to note is that we don't filter out deleted views. This is because
202 | # these views will get filtered out by dag.select anyway.
203 | diff_path = pathlib.Path(repo_root / diff.a_path).resolve()
204 | if diff_path.is_relative_to(absolute_scripts_dir) and tuple(diff_path.suffixes) in {
205 | (".sql",),
206 | (".sql", ".jinja"),
207 | }:
208 | table_ref = TableRef.from_path(
209 | scripts_dir=scripts_dir,
210 | relative_path=diff_path.relative_to(absolute_scripts_dir),
211 | project_name=project_name,
212 | )
213 | table_refs.add(table_ref)
214 |
215 | return table_refs
216 |
--------------------------------------------------------------------------------
/lea/databases.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import dataclasses
4 | import datetime as dt
5 | import hashlib
6 | import typing
7 | import urllib.parse
8 | from pathlib import Path
9 |
10 | import duckdb
11 | import pandas as pd
12 | import requests
13 | import rich
14 | from google.cloud import bigquery
15 | from google.oauth2 import service_account
16 |
17 | import lea
18 | from lea import scripts
19 | from lea.dialects import BigQueryDialect, DuckDBDialect
20 | from lea.table_ref import TableRef
21 |
22 |
23 | class DatabaseJob(typing.Protocol):
24 | @property
25 | def is_done(self) -> bool:
26 | pass
27 |
28 | def stop(self):
29 | pass
30 |
31 | @property
32 | def result(self) -> pd.DataFrame:
33 | pass
34 |
35 | @property
36 | def exception(self) -> Exception:
37 | pass
38 |
39 | @property
40 | def billed_dollars(self) -> float:
41 | pass
42 |
43 | @property
44 | def statistics(self) -> TableStats | None:
45 | pass
46 |
47 | @property
48 | def metadata(self) -> list[str]:
49 | return []
50 |
51 | def conclude(self):
52 | pass
53 |
54 |
55 | class DatabaseClient(typing.Protocol):
56 | def create_dataset(self, dataset_name: str):
57 | pass
58 |
59 | def delete_dataset(self, dataset_name: str):
60 | pass
61 |
62 | def materialize_script(self, script: scripts.Script) -> DatabaseJob:
63 | pass
64 |
65 | def query_script(self, script: scripts.Script) -> DatabaseJob:
66 | pass
67 |
68 | def clone_table(
69 | self, from_table_ref: scripts.TableRef, to_table_ref: scripts.TableRef
70 | ) -> DatabaseJob:
71 | pass
72 |
73 | def delete_and_insert(
74 | self, from_table_ref: scripts.TableRef, to_table_ref: scripts.TableRef, on: str
75 | ) -> DatabaseJob:
76 | pass
77 |
78 | def delete_table(self, table_ref: scripts.TableRef) -> DatabaseJob:
79 | pass
80 |
81 | def list_table_stats(self, dataset_name: str) -> dict[scripts.TableRef, TableStats]:
82 | pass
83 |
84 | def list_table_fields(self, dataset_name: str) -> dict[scripts.TableRef, list[scripts.Field]]:
85 | pass
86 |
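# A minimal sketch, not part of the original module, of how a caller is expected to drive the
# DatabaseClient/DatabaseJob protocol pair. The real polling loop, with exponential backoff and
# richer error handling, lives in Session.monitor_job (lea/session.py); this helper and its name
# are purely illustrative.
def _example_query(client: DatabaseClient, script: scripts.Script) -> pd.DataFrame:
    job = client.query_script(script)  # client.materialize_script(script) is driven the same way
    while not job.is_done:
        continue  # a real caller sleeps between checks instead of busy-waiting
    if (exception := job.exception) is not None:
        raise exception
    return job.result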
87 |
88 | @dataclasses.dataclass
89 | class BigQueryJob:
90 | client: BigQueryClient
91 | query_job: bigquery.QueryJob
92 | destination: bigquery.TableReference | None = None
93 | script: scripts.SQLScript | None = None
94 |
95 | @property
96 | def is_done(self) -> bool:
97 | return self.query_job.done()
98 |
99 | @property
100 | def billed_dollars(self) -> float:
101 | bytes_billed = (
102 | self.query_job.total_bytes_processed
103 | if self.client.dry_run
104 | else self.query_job.total_bytes_billed
105 | )
106 | if bytes_billed is None:
107 | return 0.0
108 | return self.client.estimate_cost_in_dollars(bytes_billed)
109 |
110 | @property
111 | def statistics(self) -> TableStats | None:
112 | if self.client.dry_run or self.destination is None:
113 | return None
114 | table = self.client.client.get_table(
115 | self.destination, retry=bigquery.DEFAULT_RETRY.with_deadline(10)
116 | )
117 | return TableStats(n_rows=table.num_rows, n_bytes=table.num_bytes, updated_at=table.modified)
118 |
119 | def stop(self):
120 | self.client.client.cancel_job(self.query_job.job_id)
121 |
122 | @property
123 | def result(self) -> pd.DataFrame:
124 | return self.query_job.result().to_dataframe()
125 |
126 | @property
127 | def exception(self) -> Exception:
128 | return self.query_job.exception()
129 |
130 | @property
131 | def is_using_reservation(self) -> bool:
132 | return (
133 | self.query_job._properties.get("statistics", {})
134 | .get("reservationUsage", [{}])[0]
135 | .get("name")
136 | ) is not None
137 |
138 | @property
139 | def metadata(self) -> list[str]:
140 | billing_model = ("reservation" if self.is_using_reservation else "on-demand") + " billing"
141 | return [billing_model]
142 |
143 | def conclude(self):
144 | if self.client.big_blue_pick_api is not None and self.script is not None:
145 | self.client.big_blue_pick_api.record_job_for_script(
146 | script=self.script, job=self.query_job
147 | )
148 |
149 |
150 | @dataclasses.dataclass(frozen=True)
151 | class TableStats:
152 | n_rows: int
153 | n_bytes: int | None
154 | updated_at: dt.datetime
155 |
156 |
157 | class BigBluePickAPI:
158 | """Big Blue Pick API implementation.
159 |
160 | https://biq.blue/blog/compute/how-to-implement-bigquery-autoscaling-reservation-in-10-minutes
161 |
162 | Parameters
163 | ----------
164 | on_demand_project_id
165 | The project ID of the on-demand BigQuery project.
166 | reservation_project_id
167 | The project ID of the reservation BigQuery project.
168 | default_project_id
169 |         The project ID of the default BigQuery project. This is used as a fallback whenever a
170 |         Big Blue Pick API call fails.
171 |
172 | """
173 |
174 | def __init__(
175 | self,
176 | api_url: str,
177 | api_key: str,
178 | on_demand_project_id: str,
179 | reservation_project_id: str,
180 | default_project_id: str,
181 | ):
182 | self.api_url = api_url
183 | self.api_key = api_key
184 | self.on_demand_project_id = on_demand_project_id
185 | self.reservation_project_id = reservation_project_id
186 | self.default_project_id = default_project_id
187 |
188 | def call_pick_api(self, path, body):
189 | try:
190 | response = requests.post(
191 | urllib.parse.urljoin(self.api_url, path),
192 | json=body,
193 | headers={
194 | "Content-Type": "application/json",
195 | "Authorization": f"Bearer {self.api_key}",
196 | },
197 | )
198 | response.raise_for_status()
199 | return response.json()
200 | except requests.exceptions.RequestException as e:
201 | lea.log.warning(f"Big Blue Pick API call failed: {e}")
202 | return None
203 |
204 | @staticmethod
205 | def hash_script(script: scripts.SQLScript) -> str:
206 | return hashlib.sha256(
207 | str(script.table_ref.replace_dataset("FREEZE").replace_project("FREEZE")).encode()
208 | ).hexdigest()
209 |
210 | def pick_project_id_for_script(self, script: scripts.SQLScript) -> str:
211 | response = self.call_pick_api(
212 | path="/pick",
213 | body={"hash": self.hash_script(script)},
214 | )
215 | if not response or not (pick := response.get("pick")):
216 | lea.log.warning("Big Blue Pick API call failed, using default project ID")
217 | elif pick not in {"ON-DEMAND", "RESERVATION"}:
218 | lea.log.warning(
219 | f"Big Blue Pick API returned unexpected choice {response['pick']!r}, using default project ID"
220 | )
221 | elif pick == "ON-DEMAND":
222 | return self.on_demand_project_id
223 | elif pick == "RESERVATION":
224 | return self.reservation_project_id
225 | return self.default_project_id
226 |
227 | def pick_client(
228 | self, script: scripts.SQLScript, credentials: service_account.Credentials, location: str
229 | ) -> DatabaseClient:
230 | project_id = self.pick_project_id_for_script(script=script)
231 | return bigquery.Client(project=project_id, credentials=credentials, location=location)
232 |
233 | def record_job_for_script(self, script: scripts.SQLScript, job: bigquery.QueryJob):
234 | self.call_pick_api(
235 | path="/write",
236 | # https://github.com/biqblue/docs/blob/1ec0eae06ccfabb339cf11bc19dbcbe04b404373/examples/python/pick.py#L42
237 | body={
238 | "hash": self.hash_script(script),
239 | "job_id": job.job_id,
240 | "creation_time": str(int(job.created.timestamp() * 1000)),
241 | "start_time": str(int(job.started.timestamp() * 1000)),
242 | "end_time": str(int(job.ended.timestamp() * 1000)),
243 | "total_slot_ms": job.slot_millis,
244 | "total_bytes_billed": job.total_bytes_billed,
245 | "total_bytes_processed": job.total_bytes_processed,
246 | "bi_engine_mode": getattr(job, "bi_engine_statistics", {}).get(
247 | "bi_engine_mode", ""
248 | ),
249 | "reservation_id": (
250 | job._properties.get("statistics", {})
251 | .get("reservationUsage", [{}])[0]
252 | .get("name", "")
253 | ),
254 | },
255 | )
256 |
257 |
258 | class BigQueryClient:
259 | def __init__(
260 | self,
261 | credentials: service_account.Credentials,
262 | location: str,
263 | write_project_id: str,
264 | compute_project_id: str,
265 | storage_billing_model: str = "PHYSICAL",
266 | dry_run: bool = False,
267 | print_mode: bool = False,
268 |         default_clustering_fields: list[str] | None = None,
269 |         big_blue_pick_api_url: str | None = None,
270 |         big_blue_pick_api_key: str | None = None,
271 |         big_blue_pick_api_on_demand_project_id: str | None = None,
272 |         big_blue_pick_api_reservation_project_id: str | None = None,
273 | ):
274 | self.credentials = credentials
275 | self.write_project_id = write_project_id
276 | self.compute_project_id = compute_project_id
277 | self.storage_billing_model = storage_billing_model
278 | self.location = location
279 | self.client = bigquery.Client(
280 | project=self.compute_project_id,
281 | credentials=self.credentials,
282 | location=self.location,
283 | )
284 | self.dry_run = dry_run
285 | self.print_mode = print_mode
286 | self.default_clustering_fields = default_clustering_fields or []
287 |
288 | self.big_blue_pick_api = (
289 | BigBluePickAPI(
290 | api_url=big_blue_pick_api_url,
291 | api_key=big_blue_pick_api_key,
292 | on_demand_project_id=big_blue_pick_api_on_demand_project_id,
293 | reservation_project_id=big_blue_pick_api_reservation_project_id,
294 | default_project_id=self.write_project_id,
295 | )
296 | if (
297 | big_blue_pick_api_url is not None
298 | and big_blue_pick_api_key is not None
299 | and big_blue_pick_api_on_demand_project_id is not None
300 | and big_blue_pick_api_reservation_project_id is not None
301 | )
302 | else None
303 | )
304 |
305 | def create_dataset(self, dataset_name: str):
306 | dataset_ref = bigquery.DatasetReference(
307 | project=self.write_project_id, dataset_id=dataset_name
308 | )
309 | dataset = bigquery.Dataset(dataset_ref)
310 | dataset.location = self.location
311 | dataset.storage_billing_model = self.storage_billing_model
312 | dataset = self.client.create_dataset(dataset, exists_ok=True)
313 |
314 | def delete_dataset(self, dataset_name: str):
315 | self.client.delete_dataset(
316 | dataset=f"{self.write_project_id}.{dataset_name}",
317 | delete_contents=True,
318 | not_found_ok=True,
319 | )
320 |
321 | @staticmethod
322 | def estimate_cost_in_dollars(bytes_billed: int) -> float:
323 | cost_per_tb = 5
324 | return (bytes_billed / 10**12) * cost_per_tb
325 |
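    # At the $5 per TB rate above: 10**12 bytes processed costs $5.00 and 250 GB
    # (2.5 * 10**11 bytes) costs $1.25.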
326 | def materialize_script(self, script: scripts.Script) -> BigQueryJob:
327 | if isinstance(script, scripts.SQLScript):
328 | return self.materialize_sql_script(sql_script=script)
329 | raise ValueError("Unsupported script type")
330 |
331 | def materialize_sql_script(self, sql_script: scripts.SQLScript) -> BigQueryJob:
332 | destination = BigQueryDialect.convert_table_ref_to_bigquery_table_reference(
333 | table_ref=sql_script.table_ref, project=self.write_project_id
334 | )
335 | clustering_fields = (
336 | (
337 | [
338 | clustering_field
339 | for clustering_field in self.default_clustering_fields
340 | if clustering_field in {field.name for field in sql_script.fields}
341 | ]
342 | )
343 | if self.default_clustering_fields
344 | else None
345 | )
346 | job_config = self.make_job_config(
347 | script=sql_script,
348 | destination=destination,
349 | write_disposition="WRITE_TRUNCATE",
350 | clustering_fields=clustering_fields
351 | if clustering_fields and not sql_script.table_ref.is_test
352 | else None,
353 | )
354 |
355 | client = (
356 | self.big_blue_pick_api.pick_client(
357 | script=sql_script,
358 | credentials=self.credentials,
359 | location=self.location,
360 | )
361 | if self.big_blue_pick_api is not None
362 | else self.client
363 | )
364 |
365 | return BigQueryJob(
366 | client=self,
367 | query_job=client.query(
368 | query=sql_script.code, job_config=job_config, location=self.location
369 | ),
370 | destination=destination,
371 | script=sql_script,
372 | )
373 |
374 | def query_script(self, script: scripts.Script) -> BigQueryJob:
375 | if isinstance(script, scripts.SQLScript):
376 | return self.query_sql_script(sql_script=script)
377 | raise ValueError("Unsupported script type")
378 |
379 | def query_sql_script(self, sql_script: scripts.SQLScript) -> BigQueryJob:
380 | job_config = self.make_job_config(script=sql_script)
381 | client = (
382 | self.big_blue_pick_api.pick_client(
383 | script=sql_script,
384 | credentials=self.credentials,
385 | location=self.location,
386 | )
387 | if self.big_blue_pick_api is not None
388 | else self.client
389 | )
390 | return BigQueryJob(
391 | client=self,
392 | query_job=client.query(
393 | query=sql_script.code, job_config=job_config, location=self.location
394 | ),
395 | script=sql_script,
396 | )
397 |
398 | def clone_table(
399 | self, from_table_ref: scripts.TableRef, to_table_ref: scripts.TableRef
400 | ) -> BigQueryJob:
401 | destination = BigQueryDialect.convert_table_ref_to_bigquery_table_reference(
402 | table_ref=to_table_ref, project=self.write_project_id
403 | )
404 | source = BigQueryDialect.convert_table_ref_to_bigquery_table_reference(
405 | table_ref=from_table_ref, project=self.write_project_id
406 | )
407 | clone_code = f"""
408 | CREATE OR REPLACE TABLE {destination}
409 | CLONE {source}
410 | """
411 | job_config = self.make_job_config(
412 | script=scripts.SQLScript(
413 | table_ref=to_table_ref, code=clone_code, sql_dialect=BigQueryDialect, fields=[]
414 | )
415 | )
416 | return BigQueryJob(
417 | client=self,
418 | query_job=self.client.query(clone_code, job_config=job_config, location=self.location),
419 | destination=destination,
420 | )
421 |
422 | def delete_and_insert(
423 | self, from_table_ref: scripts.TableRef, to_table_ref: scripts.TableRef, on: str
424 | ) -> BigQueryJob:
425 | source = BigQueryDialect.convert_table_ref_to_bigquery_table_reference(
426 | table_ref=from_table_ref, project=self.write_project_id
427 | )
428 | destination = BigQueryDialect.convert_table_ref_to_bigquery_table_reference(
429 | table_ref=to_table_ref, project=self.write_project_id
430 | )
431 | # TODO: the following could instead be done with a MERGE statement.
432 | delete_and_insert_code = f"""
433 | BEGIN TRANSACTION;
434 |
435 | -- Delete existing data
436 | DELETE FROM {destination}
437 | WHERE {on} IN (SELECT DISTINCT {on} FROM {source});
438 |
439 | -- Insert new data
440 | INSERT INTO {destination}
441 | SELECT * FROM {source};
442 |
443 | COMMIT TRANSACTION;
444 | """
445 | job_config = self.make_job_config(
446 | script=scripts.SQLScript(
447 | table_ref=to_table_ref,
448 | code=delete_and_insert_code,
449 | sql_dialect=BigQueryDialect,
450 | fields=[],
451 | )
452 | )
453 | return BigQueryJob(
454 | client=self,
455 | query_job=self.client.query(
456 | delete_and_insert_code, job_config=job_config, location=self.location
457 | ),
458 | destination=destination,
459 | )
460 |
461 | def delete_table(self, table_ref: scripts.TableRef) -> BigQueryJob:
462 | table_reference = BigQueryDialect.convert_table_ref_to_bigquery_table_reference(
463 | table_ref=table_ref, project=self.write_project_id
464 | )
465 | delete_code = f"""
466 | DROP TABLE IF EXISTS {table_reference}
467 | """
468 | job_config = self.make_job_config(
469 | script=scripts.SQLScript(
470 | table_ref=table_ref,
471 | code=delete_code,
472 | sql_dialect=BigQueryDialect,
473 | fields=[],
474 | )
475 | )
476 | return BigQueryJob(
477 | client=self,
478 | query_job=self.client.query(delete_code, job_config=job_config, location=self.location),
479 | )
480 |
481 | def list_table_stats(self, dataset_name: str) -> dict[scripts.TableRef, TableStats]:
482 | query = f"""
483 | SELECT table_id, row_count, size_bytes, last_modified_time
484 | FROM `{self.write_project_id}.{dataset_name}.__TABLES__`
485 | """
486 | job = self.client.query(query, location=self.location)
487 | return {
488 | BigQueryDialect.parse_table_ref(
489 | f"{self.write_project_id}.{dataset_name}.{row['table_id']}"
490 | ): TableStats(
491 | n_rows=row["row_count"],
492 | n_bytes=row["size_bytes"],
493 | updated_at=(
494 | dt.datetime.fromtimestamp(row["last_modified_time"] // 1000, tz=dt.timezone.utc)
495 | ),
496 | )
497 | for row in job.result()
498 | }
499 |
500 | def list_table_fields(self, dataset_name: str) -> dict[scripts.TableRef, set[scripts.Field]]:
501 | query = f"""
502 | SELECT table_name, column_name
503 | FROM `{self.write_project_id}.{dataset_name}.INFORMATION_SCHEMA.COLUMNS`
504 | """
505 | job = self.client.query(query, location=self.location)
506 | return {
507 | BigQueryDialect.parse_table_ref(
508 | f"{self.write_project_id}.{dataset_name}.{table_name}"
509 | ): [scripts.Field(name=row["column_name"]) for _, row in rows.iterrows()]
510 | for table_name, rows in job.result()
511 | .to_dataframe()
512 | .sort_values(["table_name", "column_name"])
513 | .groupby("table_name")
514 | }
515 |
516 | def make_job_config(self, script: scripts.SQLScript, **kwargs) -> bigquery.QueryJobConfig:
517 | if self.print_mode:
518 | rich.print(script)
519 | return bigquery.QueryJobConfig(
520 | priority=bigquery.QueryPriority.INTERACTIVE,
521 | use_query_cache=False,
522 | dry_run=self.dry_run,
523 | **kwargs,
524 | )
525 |
526 |
527 | @dataclasses.dataclass
528 | class DuckDBJob:
529 | query: str
530 | connection: duckdb.DuckDBPyConnection
531 | destination: str | None = None
532 | exception: str | None = None
533 |
534 | def execute(self):
535 | self.connection.execute(self.query)
536 |
537 | @property
538 | def is_done(self) -> bool:
539 | try:
540 | self.execute()
541 | except Exception as e:
542 | self.exception = repr(e)
543 | raise e
544 | else:
545 | return True
546 |
547 | def stop(self):
548 | pass # No support for stopping queries in DuckDB
549 |
550 | @property
551 | def result(self) -> pd.DataFrame:
552 | return self.connection.execute(self.query).fetchdf()
553 |
554 | @property
555 |     def billed_dollars(self) -> float | None:
556 | return None # DuckDB is free to use
557 |
558 | @property
559 | def statistics(self) -> TableStats | None:
560 | query = f"SELECT COUNT(*) AS n_rows, MAX(_materialized_timestamp) AS updated_at FROM {self.destination}"
561 | table = self.connection.execute(query).fetchdf().iloc[0]
562 | return TableStats(
563 | n_rows=int(table["n_rows"]),
564 | n_bytes=None,
565 | updated_at=table["updated_at"],
566 | )
567 |
568 |
569 | class DuckDBClient:
570 | def __init__(self, database_path: Path, dry_run: bool = False, print_mode: bool = False):
571 | self.database_path = database_path
572 |         if str(self.database_path) == "":
573 | raise ValueError("DuckDB path not configured")
574 | self.dry_run = dry_run
575 | self.print_mode = print_mode
576 |
577 | @property
578 | def connection(self) -> duckdb.DuckDBPyConnection:
579 | return duckdb.connect(database=str(self.database_path))
580 |
581 | @property
582 | def dataset(self) -> str:
583 | return self.database_path.stem
584 |
585 | def create_dataset(self, dataset_name: str):
586 | self.database_path = self.database_path.with_stem(dataset_name)
587 |
588 | def create_schema(self, table_ref: scripts.TableRef):
589 | self.connection.execute(f"CREATE SCHEMA IF NOT EXISTS {table_ref.schema[0]}")
590 |
591 | def materialize_script(self, script: scripts.Script) -> DuckDBJob:
592 | if isinstance(script, scripts.SQLScript):
593 | return self.materialize_sql_script(sql_script=script)
594 | raise ValueError("Unsupported script type")
595 |
596 | def materialize_sql_script(self, sql_script: scripts.SQLScript) -> DuckDBJob:
597 | destination = DuckDBDialect.convert_table_ref_to_duckdb_table_reference(
598 | table_ref=sql_script.table_ref
599 | )
600 |         # We materialize the script together with a timestamp, to keep track of when it was materialized.
601 |         # DuckDB does not provide this metadata out of the box, so we add a technical column ourselves.
602 |         # Bear in mind that this is a workaround rather than a best practice: any change made outside
603 |         # of lea will not be reflected in this column and could break the orchestration mechanism.
604 | materialize_code = f"""
605 | CREATE OR REPLACE TABLE {destination} AS (
606 | WITH logic_table AS ({sql_script.code}),
607 | materialized_infos AS (SELECT CURRENT_LOCALTIMESTAMP() AS _materialized_timestamp)
608 | SELECT * FROM logic_table, materialized_infos
609 | );
610 | """
611 | return self.make_job_config(
612 | script=scripts.SQLScript(
613 | table_ref=sql_script.table_ref,
614 | code=materialize_code,
615 | sql_dialect=DuckDBDialect,
616 | fields=[],
617 | ),
618 | destination=destination,
619 | )
620 |
621 | def query_script(self, script: scripts.Script) -> DuckDBJob:
622 | if isinstance(script, scripts.SQLScript):
623 | job = self.make_job_config(script=script)
624 | return job
625 | raise ValueError("Unsupported script type")
626 |
627 | def clone_table(
628 | self, from_table_ref: scripts.TableRef, to_table_ref: scripts.TableRef
629 | ) -> DuckDBJob:
630 | destination = DuckDBDialect.convert_table_ref_to_duckdb_table_reference(
631 | table_ref=to_table_ref
632 | )
633 | source = DuckDBDialect.convert_table_ref_to_duckdb_table_reference(table_ref=from_table_ref)
634 | clone_code = f"""
635 | CREATE OR REPLACE TABLE {destination} AS SELECT * FROM {source}
636 | """
637 | job = self.make_job_config(
638 | script=scripts.SQLScript(
639 | table_ref=to_table_ref, code=clone_code, sql_dialect=DuckDBDialect, fields=[]
640 | ),
641 | destination=destination,
642 | )
643 | return job
644 |
645 | def delete_and_insert(
646 | self, from_table_ref: scripts.TableRef, to_table_ref: scripts.TableRef, on: str
647 | ) -> DuckDBJob:
648 | to_table_reference = DuckDBDialect.convert_table_ref_to_duckdb_table_reference(
649 | table_ref=to_table_ref
650 | )
651 | from_table_reference = DuckDBDialect.convert_table_ref_to_duckdb_table_reference(
652 | table_ref=from_table_ref
653 | )
654 |
655 | delete_and_insert_code = f"""
656 | DELETE FROM {to_table_reference} WHERE {on} IN (SELECT DISTINCT {on} FROM {from_table_reference});
657 | INSERT INTO {to_table_reference} SELECT * FROM {from_table_reference};
658 | """
659 | job = self.make_job_config(
660 | script=scripts.SQLScript(
661 | table_ref=to_table_ref,
662 | code=delete_and_insert_code,
663 | sql_dialect=DuckDBDialect,
664 | fields=[],
665 | ),
666 | destination=to_table_reference,
667 | )
668 | job.execute()
669 | return job
670 |
671 | def delete_table(self, table_ref: scripts.TableRef) -> DuckDBJob:
672 | table_reference = DuckDBDialect.convert_table_ref_to_duckdb_table_reference(
673 | table_ref=table_ref
674 | )
675 | delete_code = f"DROP TABLE IF EXISTS {table_reference}"
676 | job = self.make_job_config(
677 | script=scripts.SQLScript(
678 | table_ref=table_ref, code=delete_code, sql_dialect=DuckDBDialect, fields=[]
679 | )
680 | )
681 | job.execute()
682 | return job
683 |
684 | def list_table_stats(self, dataset_name: str) -> dict[TableRef, TableStats]:
685 | tables_query = """
686 | SELECT table_name, schema_name, estimated_size
687 | FROM duckdb_tables();
688 | """
689 | tables_result = self.connection.execute(tables_query).fetchdf()
690 |
691 | table_stats = {}
692 | for _, row in tables_result.iterrows():
693 | table_name = row["table_name"]
694 | table_schema = row["schema_name"]
695 | n_rows = int(row["estimated_size"])
696 | stats_query = f"""
697 | SELECT
698 | MAX(_materialized_timestamp) AS last_modified
699 | FROM {table_schema}.{table_name}
700 | """
701 | result = self.connection.execute(stats_query).fetchdf().dropna()
702 | if result.empty:
703 | updated_at = dt.datetime.now(dt.timezone.utc)
704 | else:
705 | updated_at = dt.datetime.fromtimestamp(
706 | result.iloc[0]["last_modified"].to_pydatetime().timestamp(),
707 | tz=dt.timezone.utc,
708 | )
709 | table_stats[
710 | DuckDBDialect.parse_table_ref(f"{table_schema}.{table_name}").replace_dataset(
711 | dataset_name
712 | )
713 | ] = TableStats(
714 | n_rows=n_rows,
715 | n_bytes=None,
716 | updated_at=updated_at,
717 | )
718 | return table_stats
719 |
720 | def list_table_fields(self, dataset_name: str) -> dict[scripts.TableRef, list[scripts.Field]]:
721 | query = f"""
722 | SELECT table_name, column_name
723 | FROM information_schema.columns
724 | WHERE table_schema = '{dataset_name}'
725 | """
726 | result = self.connection.execute(query).fetchdf()
727 | return {
728 |             scripts.TableRef(dataset=dataset_name, schema=(), name=table_name, project=None): [
729 | scripts.Field(name=row["column_name"]) for _, row in rows.iterrows()
730 | ]
731 | for table_name, rows in result.groupby("table_name")
732 | }
733 |
734 | def make_job_config(
735 | self, script: scripts.SQLScript, destination: str | None = None
736 | ) -> DuckDBJob:
737 | if self.print_mode:
738 | rich.print(script)
739 | job = DuckDBJob(query=script.code, connection=self.connection, destination=destination)
740 | return job
741 |
--------------------------------------------------------------------------------
/lea/dialects.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pathlib
4 | import re
5 | import textwrap
6 |
7 | import jinja2
8 | import sqlglot
9 | from google.cloud import bigquery
10 |
11 | from lea.field import FieldTag
12 | from lea.table_ref import TableRef
13 |
14 |
15 | class SQLDialect:
16 | sqlglot_dialect: sqlglot.dialects.Dialects | None = None
17 |
18 | @staticmethod
19 | def parse_table_ref(table_ref: str) -> TableRef:
20 | raise NotImplementedError
21 |
22 | @staticmethod
23 | def format_table_ref(table_ref: TableRef) -> str:
24 | raise NotImplementedError
25 |
26 | def make_column_test_unique(self, table_ref: TableRef, field_name: str) -> str:
27 | table_ref_str = self.format_table_ref(table_ref)
28 | return load_assertion_test_template(FieldTag.UNIQUE).render(
29 | table=table_ref_str, column=field_name
30 | )
31 |
32 | def make_column_test_unique_by(self, table_ref: TableRef, field_name: str, by: str) -> str:
33 | table_ref_str = self.format_table_ref(table_ref)
34 | return load_assertion_test_template(FieldTag.UNIQUE_BY).render(
35 | table=table_ref_str,
36 | column=field_name,
37 | by=by,
38 | )
39 |
40 | def make_column_test_no_nulls(self, table_ref: TableRef, field_name: str) -> str:
41 | table_ref_str = self.format_table_ref(table_ref)
42 | return load_assertion_test_template(FieldTag.NO_NULLS).render(
43 | table=table_ref_str, column=field_name
44 | )
45 |
46 | def make_column_test_set(self, table_ref: TableRef, field_name: str, elements: set[str]) -> str:
47 | table_ref_str = self.format_table_ref(table_ref)
48 | return load_assertion_test_template(FieldTag.SET).render(
49 | table=table_ref_str,
50 | column=field_name,
51 | elements=elements,
52 | )
53 |
54 | @classmethod
55 | def add_dependency_filters(
56 | cls,
57 | code: str,
58 | incremental_field_name: str,
59 | incremental_field_values: set[str],
60 | dependencies_to_filter: set[TableRef],
61 | ) -> str:
62 | code = remove_comment_lines(code)
63 | incremental_field_values_str = ", ".join(f"'{value}'" for value in incremental_field_values)
64 | for dependency in dependencies_to_filter:
65 | dependency_str = cls.format_table_ref(dependency)
66 | code = re.sub(
67 | # We could use \b, but it doesn't work with backticks
68 | rf"(? str:
86 | code = remove_comment_lines(code)
87 | incremental_field_values_str = ", ".join(f"'{value}'" for value in incremental_field_values)
88 | for (
89 | dependency_without_wap_suffix,
90 | dependency_with_wap_suffix,
91 | ) in incremental_dependencies.items():
92 | dependency_without_wap_suffix_str = cls.format_table_ref(dependency_without_wap_suffix)
93 | dependency_with_wap_suffix_str = cls.format_table_ref(dependency_with_wap_suffix)
94 | code = re.sub(
95 | # We could use \b, but it doesn't work with backticks
96 | rf"(? str:
112 | return "\n".join(line for line in code.split("\n") if not line.strip().startswith("--"))
113 |
114 |
115 | def load_assertion_test_template(tag: str) -> jinja2.Template:
116 | return jinja2.Template(
117 | (pathlib.Path(__file__).parent / "assertions" / f"{tag.lstrip('#')}.sql.jinja").read_text()
118 | )
119 |
120 |
121 | class BigQueryDialect(SQLDialect):
122 | sqlglot_dialect = sqlglot.dialects.Dialects.BIGQUERY
123 |
124 | @staticmethod
125 | def parse_table_ref(table_ref: str) -> TableRef:
126 | """
127 |
128 | >>> BigQueryDialect.parse_table_ref("my_dataset.my_schema__my_table")
129 | TableRef(dataset='my_dataset', schema=('my_schema',), name='my_table', project=None)
130 |
131 | >>> BigQueryDialect.parse_table_ref("my_dataset.my_table")
132 | TableRef(dataset='my_dataset', schema=(), name='my_table', project=None)
133 |
134 | >>> BigQueryDialect.parse_table_ref("my_dataset.my_schema__my_table___audit")
135 | TableRef(dataset='my_dataset', schema=('my_schema',), name='my_table___audit', project=None)
136 |
137 | >>> BigQueryDialect.parse_table_ref("my_project.my_dataset.my_schema__my_table___audit")
138 | TableRef(dataset='my_dataset', schema=('my_schema',), name='my_table___audit', project='my_project')
139 |
140 | >>> BigQueryDialect.parse_table_ref("`carbonfact-gsheet`.hubspot.company")
141 | TableRef(dataset='hubspot', schema=(), name='company', project='carbonfact-gsheet')
142 |
143 | """
144 | project, dataset, leftover = None, *tuple(table_ref.rsplit(".", 1))
145 | if "." in dataset:
146 | project, dataset = dataset.split(".")
147 |         *schema, name = tuple(re.split(r"(?<!_)__(?!_)", leftover))
148 |         return TableRef(
149 |             dataset=strip_quotes(dataset),
150 |             schema=tuple(strip_quotes(s) for s in schema),
151 |             name=strip_quotes(name),
152 |             project=strip_quotes(project) if project is not None else None,
153 |         )
154 | 
155 |     @staticmethod
156 |     def format_table_ref(table_ref: TableRef) -> str:
157 | table_ref_str = ""
158 | if table_ref.project:
159 | table_ref_str += f"{table_ref.project}."
160 | if table_ref.dataset:
161 | table_ref_str += f"{table_ref.dataset}."
162 | table_ref_str += f"{'__'.join([*table_ref.schema, table_ref.name])}"
163 | return table_ref_str
164 |
165 | @staticmethod
166 | def convert_table_ref_to_bigquery_table_reference(
167 | table_ref: TableRef, project: str
168 | ) -> bigquery.TableReference:
169 | return bigquery.TableReference(
170 | dataset_ref=bigquery.DatasetReference(project=project, dataset_id=table_ref.dataset),
171 | table_id=f"{'__'.join([*table_ref.schema, table_ref.name])}",
172 | )
173 |
174 |
175 | class DuckDBDialect(SQLDialect):
176 | sqlglot_dialect = sqlglot.dialects.Dialects.DUCKDB
177 |
178 | @staticmethod
179 | def parse_table_ref(table_ref: str) -> TableRef:
180 | """
181 | Parses a DuckDB table reference string into a TableRef object.
182 |
183 | >>> DuckDBDialect.parse_table_ref("my_schema.my_table")
184 | TableRef(dataset=None, schema=('my_schema',), name='my_table', project=None)
185 |
186 | >>> DuckDBDialect.parse_table_ref("my_schema.my_subschema__my_table")
187 | TableRef(dataset=None, schema=('my_schema', 'my_subschema'), name='my_table', project=None)
188 |
189 | >>> DuckDBDialect.parse_table_ref("my_table")
190 | TableRef(dataset=None, schema=(), name='my_table', project=None)
191 | """
192 | if "." in table_ref:
193 | project, schema, leftover = None, *tuple(table_ref.rsplit(".", 1))
194 |             *subschema, name = tuple(re.split(r"(?<!_)__(?!_)", leftover))
195 |             return TableRef(
196 |                 dataset=None,
197 |                 schema=(schema, *subschema),
198 |                 name=name,
199 |                 project=project,
200 |             )
201 | 
202 |         return TableRef(dataset=None, schema=(), name=table_ref, project=None)
203 | 
204 |     @staticmethod
205 |     def format_table_ref(table_ref: TableRef) -> str:
206 | """
207 | Formats a TableRef object into a DuckDB table reference string.
208 |
209 | >>> DuckDBDialect.format_table_ref(TableRef(dataset=None, schema=('my_schema',), name='my_table', project=None))
210 | 'my_schema.my_table'
211 |
212 | >>> DuckDBDialect.format_table_ref(TableRef(dataset=None, schema=('my_schema', 'my_subschema'), name='my_table', project=None))
213 | 'my_schema.my_subschema__my_table'
214 |
215 | >>> DuckDBDialect.format_table_ref(TableRef(dataset=None, schema=(), name='my_table', project=None))
216 | 'my_table'
217 | """
218 | if len(table_ref.schema) > 0:
219 | schema = table_ref.schema[0]
220 | if len(table_ref.schema) > 1:
221 | full_table_ref = f"{schema}.{'__'.join([*table_ref.schema[1:], table_ref.name])}"
222 | else:
223 | full_table_ref = f"{schema}.{table_ref.name}"
224 | return full_table_ref
225 | return table_ref.name
226 |
227 | @staticmethod
228 | def convert_table_ref_to_duckdb_table_reference(table_ref: TableRef) -> str:
229 | return DuckDBDialect.format_table_ref(table_ref)
230 |
231 |
232 | def strip_quotes(x: str) -> str:
233 | return x.strip('"').strip("`")
234 |
--------------------------------------------------------------------------------
/lea/field.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import dataclasses
4 | import enum
5 |
6 |
7 | @dataclasses.dataclass(frozen=True)
8 | class Field:
9 | name: str
10 | tags: set[FieldTag] = dataclasses.field(default_factory=set)
11 | description: str | None = None
12 |
13 | @property
14 | def is_unique(self):
15 | return FieldTag.UNIQUE in self.tags
16 |
17 |
18 | class FieldTag(enum.StrEnum):
19 | NO_NULLS = "#NO_NULLS"
20 | UNIQUE = "#UNIQUE"
21 | UNIQUE_BY = "#UNIQUE_BY"
22 | SET = "#SET"
23 | INCREMENTAL = "#INCREMENTAL"
24 |
--------------------------------------------------------------------------------
/lea/job.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import dataclasses
4 | import datetime as dt
5 | import enum
6 |
7 | from lea.databases import DatabaseJob
8 | from lea.table_ref import TableRef
9 |
10 |
11 | class JobStatus(enum.Enum):
12 | RUNNING = "RUNNING"
13 | SUCCESS = "[green]SUCCESS[/green]"
14 | ERRORED = "[red]ERRORED[/red]"
15 | STOPPED = "[yellow]STOPPED[/yellow]"
16 |
17 | def __str__(self):
18 | return self.value
19 |
20 |
21 | @dataclasses.dataclass
22 | class Job:
23 | table_ref: TableRef
24 | is_test: bool
25 | database_job: DatabaseJob
26 | started_at: dt.datetime = dataclasses.field(default_factory=dt.datetime.now)
27 | ended_at: dt.datetime | None = None
28 | status: JobStatus = JobStatus.RUNNING
29 |
30 | def __hash__(self):
31 | return hash(self.table_ref)
32 |
--------------------------------------------------------------------------------
/lea/scripts.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import dataclasses
4 | import datetime as dt
5 | import functools
6 | import os
7 | import pathlib
8 | import re
9 | import textwrap
10 |
11 | import jinja2
12 | import rich.syntax
13 | import sqlglot
14 | import sqlglot.optimizer
15 |
16 | from .comment import extract_comments
17 | from .dialects import SQLDialect
18 | from .field import Field, FieldTag
19 | from .table_ref import TableRef
20 |
21 |
22 | @dataclasses.dataclass(frozen=True)
23 | class SQLScript:
24 | table_ref: TableRef
25 | code: str
26 | sql_dialect: SQLDialect
27 | fields: list[Field] | None = dataclasses.field(default=None)
28 | updated_at: dt.datetime | None = None
29 |
30 | def __post_init__(self):
31 | """
32 |
33 | This part is a bit tricky. We extract fields from each script for different reasons. For
34 | instance, the fields are used to generate assertion tests.
35 |
36 | The logic to extract fields is based on SQLGlot. The latter usually works well, but it
37 |         sometimes fails for complex queries. For instance, in incremental mode, we have to edit
38 | the queries to filter their dependencies. These queries are not always parsed correctly by
39 | SQLGlot.
40 |
41 |         To circumvent this issue, we extract the fields once and cache them. This way, whenever we call
42 | dataclasses.replace, they won't have to be recomputed. This makes sense because the scripts
43 | are never edited to add or remove fields. They are only edited to change the filtering
44 | conditions.
45 |
46 | """
47 | if self.fields is not None:
48 | return
49 | field_names = self.ast.named_selects
50 | field_comments = extract_comments(
51 | code=self.code, expected_field_names=field_names, sql_dialect=self.sql_dialect
52 | )
53 | fields = [
54 | Field(
55 | name=name,
56 | tags={
57 | comment.text
58 | for comment in field_comments.get(name, [])
59 | if comment.text.startswith("#")
60 | },
61 | description=" ".join(
62 | comment.text
63 | for comment in field_comments.get(name, [])
64 | if not comment.text.startswith("#")
65 | ),
66 | )
67 | for name in field_names
68 | if name != "*"
69 | ]
70 | # https://stackoverflow.com/a/54119384
71 | object.__setattr__(self, "fields", fields)
72 |
73 | @classmethod
74 | def from_path(
75 | cls,
76 | scripts_dir: pathlib.Path,
77 | relative_path: pathlib.Path,
78 | sql_dialect: SQLDialect,
79 | project_name: str,
80 | ) -> SQLScript:
81 | # Either the file is a Jinja template
82 | if relative_path.suffixes == [".sql", ".jinja"]:
83 | loader = jinja2.FileSystemLoader(scripts_dir)
84 | environment = jinja2.Environment(loader=loader)
85 | template = environment.get_template(str(relative_path))
86 | code = template.render(env=os.environ)
87 | # Or it's a regular SQL file
88 | else:
89 | code = (scripts_dir / relative_path).read_text().rstrip().rstrip(";")
90 |
91 | return cls(
92 | table_ref=TableRef.from_path(
93 | scripts_dir=scripts_dir, relative_path=relative_path, project_name=project_name
94 | ),
95 | code=code,
96 | sql_dialect=sql_dialect,
97 | updated_at=dt.datetime.fromtimestamp(
98 | (scripts_dir / relative_path).stat().st_mtime, tz=dt.timezone.utc
99 | ),
100 | )
101 |
102 | @property
103 | def is_test(self) -> bool:
104 | return self.table_ref.is_test
105 |
106 | @functools.cached_property
107 | def ast(self):
108 | ast = sqlglot.parse_one(self.code, dialect=self.sql_dialect.sqlglot_dialect)
109 | try:
110 | return sqlglot.optimizer.qualify.qualify(ast)
111 | except sqlglot.errors.OptimizeError:
112 | return ast
113 |
114 | @functools.cached_property
115 | def dependencies(self) -> set[TableRef]:
116 | def add_default_project(table_ref: TableRef) -> TableRef:
117 | if table_ref.project is None:
118 | return table_ref.replace_project(self.table_ref.project)
119 | return table_ref
120 |
121 | dependencies = set()
122 |
123 | for scope in sqlglot.optimizer.scope.traverse_scope(self.ast):
124 | for table in scope.tables:
125 | if (
126 | not isinstance(table.this, sqlglot.exp.Func)
127 | and sqlglot.exp.table_name(table) not in scope.cte_sources
128 | ):
129 | try:
130 | table_ref = self.sql_dialect.parse_table_ref(
131 | table_ref=sqlglot.exp.table_name(table)
132 | )
133 | except ValueError as e:
134 | raise ValueError(
135 | f"Unable to parse table reference {sqlglot.exp.table_name(table)!r} "
136 | f"in {self.table_ref.replace_project(None)}"
137 | ) from e
138 | dependencies.add(add_default_project(table_ref))
139 |
140 | return dependencies
141 |
142 | @property
143 | def assertion_tests(self) -> list[SQLScript]:
144 | """
145 |
146 | Assertion tests are gleaned from the comments in the script. They are used to test the
147 | quality of the data. The following tags are supported:
148 |
149 | - #NO_NULLS: Asserts that the column has no null values.
150 | - #UNIQUE: Asserts that the column has unique values.
151 | - #UNIQUE_BY(field): Asserts that the column has unique values when grouped by field.
152 |         - #SET{'value1', 'value2', ...}: Asserts that the column only contains the specified elements.
153 |
154 | """
155 |
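        # Illustrative note, not part of the original docstring: with the naming scheme implemented
        # below, a field `customer_id` tagged #UNIQUE on a table `analytics.kpis` (made-up names)
        # yields an assertion test placed in the `tests` schema and named
        # `analytics__kpis__customer_id___unique`.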
156 | def make_table_ref(field, tag):
157 | return TableRef(
158 | dataset=self.table_ref.dataset,
159 | schema=("tests",),
160 | name=f"{'__'.join(self.table_ref.schema)}__{self.table_ref.name}__{field.name}___{tag.lower().lstrip('#')}",
161 | project=self.table_ref.project,
162 | )
163 |
164 | def make_assertion_test(table_ref, field, tag):
165 | if tag == FieldTag.NO_NULLS:
166 | return SQLScript(
167 | table_ref=make_table_ref(field, FieldTag.NO_NULLS),
168 | code=self.sql_dialect.make_column_test_no_nulls(table_ref, field.name),
169 | sql_dialect=self.sql_dialect,
170 | )
171 | elif tag == FieldTag.UNIQUE:
172 | return SQLScript(
173 | table_ref=make_table_ref(field, FieldTag.UNIQUE),
174 | code=self.sql_dialect.make_column_test_unique(table_ref, field.name),
175 | sql_dialect=self.sql_dialect,
176 | )
177 |             elif unique_by := re.fullmatch(FieldTag.UNIQUE_BY + r"\((?P<by>.+)\)", tag):
178 | by = unique_by.group("by")
179 | return SQLScript(
180 | table_ref=make_table_ref(field, FieldTag.UNIQUE_BY),
181 | code=self.sql_dialect.make_column_test_unique_by(table_ref, field.name, by),
182 | sql_dialect=self.sql_dialect,
183 | )
184 | elif set_ := re.fullmatch(
185 |                 FieldTag.SET + r"\{(?P<elements>'[^']+'(?:,\s*'[^']+')*)\}", tag
186 | ):
187 | elements = {element.strip() for element in set_.group("elements").split(",")}
188 | return SQLScript(
189 | table_ref=make_table_ref(field, FieldTag.SET),
190 | code=self.sql_dialect.make_column_test_set(table_ref, field.name, elements),
191 | sql_dialect=self.sql_dialect,
192 | )
193 | else:
194 | raise ValueError(f"Unhandled tag: {tag}")
195 |
196 | return [
197 | # We don't need to include the target table_ref's project in the assertion test,
198 | # because that would include the project in the code generated by the SQL dialect.
199 | # This is not needed, because the project will be set downstream in each script anyway.
200 | make_assertion_test(self.table_ref.replace_project(None), field, tag)
201 | for field in self.fields or []
202 | for tag in field.tags
203 | if tag not in {FieldTag.INCREMENTAL}
204 | ]
205 |
206 | def replace_table_ref(self, table_ref: TableRef) -> SQLScript:
207 | return dataclasses.replace(self, table_ref=table_ref)
208 |
209 | def __rich__(self):
210 | code = textwrap.dedent(self.code).strip()
211 | code_with_table_ref = f"""-- {self.table_ref}\n\n{code}\n"""
212 | return rich.syntax.Syntax(code_with_table_ref, "sql", line_numbers=False, theme="ansi_dark")
213 |
214 |
215 | Script = SQLScript
216 |
217 |
218 | def read_scripts(
219 | scripts_dir: pathlib.Path, sql_dialect: SQLDialect, dataset_name: str, project_name: str
220 | ) -> list[Script]:
221 | def read_script(path: pathlib.Path) -> Script:
222 | match tuple(path.suffixes):
223 | case (".sql",) | (".sql", ".jinja"):
224 | return SQLScript.from_path(
225 | scripts_dir=scripts_dir,
226 | relative_path=path.relative_to(scripts_dir),
227 | sql_dialect=sql_dialect,
228 | project_name=project_name,
229 | )
230 | case _:
231 | raise ValueError(f"Unsupported script type: {path}")
232 |
233 | def set_dataset(script: Script) -> Script:
234 | return script.replace_table_ref(script.table_ref.replace_dataset(dataset=dataset_name))
235 |
236 | return [
237 | set_dataset(read_script(path))
238 | for path in scripts_dir.rglob("*")
239 | if not path.is_dir()
240 | and tuple(path.suffixes) in {(".sql",), (".sql", ".jinja"), (".json",)}
241 | and not path.name.startswith("_")
242 | and path.stat().st_size > 0
243 | ]
244 |
--------------------------------------------------------------------------------
/lea/session.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import concurrent.futures
4 | import dataclasses
5 | import datetime as dt
6 | import re
7 | import threading
8 | import time
9 | from collections.abc import Callable
10 |
11 | import lea
12 | from lea.databases import DatabaseClient, TableStats
13 | from lea.field import FieldTag
14 | from lea.job import Job, JobStatus
15 | from lea.scripts import Script
16 | from lea.table_ref import TableRef
17 |
18 |
19 | class Session:
20 | def __init__(
21 | self,
22 | database_client: DatabaseClient,
23 | base_dataset: str,
24 | write_dataset: str,
25 | scripts: dict[TableRef, Script],
26 | selected_table_refs: set[TableRef],
27 | unselected_table_refs: set[TableRef],
28 | existing_tables: dict[TableRef, TableStats],
29 | existing_audit_tables: dict[TableRef, TableStats],
30 | incremental_field_name=None,
31 | incremental_field_values=None,
32 | ):
33 | self.database_client = database_client
34 | self.base_dataset = base_dataset
35 | self.write_dataset = write_dataset
36 | self.scripts = scripts
37 | self.selected_table_refs = selected_table_refs
38 | self.unselected_table_refs = unselected_table_refs
39 | self.existing_tables = existing_tables
40 | self.existing_audit_tables = existing_audit_tables
41 | self.incremental_field_name = incremental_field_name
42 | self.incremental_field_values = incremental_field_values
43 |
44 | self.jobs: list[Job] = []
45 | self.started_at = dt.datetime.now()
46 | self.ended_at: dt.datetime | None = None
47 | self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=None)
48 | self.run_script_futures: dict = {}
49 | self.run_script_futures_complete: dict = {}
50 | self.promote_audit_tables_futures: dict = {}
51 | self.stop_event = threading.Event()
52 |
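        # Two sets drive the incremental logic below: `filterable_table_refs` holds the scripts
        # that expose the incremental field (and can therefore be filtered on it), while
        # `incremental_table_refs` holds the selected (or already audited) tables whose field is
        # actually tagged #INCREMENTAL, i.e. the ones that are materialized incrementally.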
53 | if self.incremental_field_name is not None:
54 | self.filterable_table_refs = {
55 | table_ref.replace_dataset(self.write_dataset)
56 | for table_ref in scripts
57 | if any(
58 | field.name == incremental_field_name
59 | for field in scripts[table_ref].fields or []
60 | )
61 | }
62 | self.incremental_table_refs = {
63 | table_ref.replace_dataset(self.write_dataset).remove_audit_suffix()
64 | for table_ref in selected_table_refs | set(existing_audit_tables)
65 | if any(
66 | field.name == incremental_field_name and FieldTag.INCREMENTAL in field.tags
67 | for field in scripts[
68 | table_ref.remove_audit_suffix().replace_dataset(self.base_dataset)
69 | ].fields
70 | or []
71 | )
72 | }
73 | else:
74 | self.filterable_table_refs = set()
75 | self.incremental_table_refs = set()
76 |
77 | def add_write_context_to_table_ref(self, table_ref: TableRef) -> TableRef:
78 | table_ref = table_ref.replace_dataset(self.write_dataset)
79 | table_ref = table_ref.add_audit_suffix()
80 | return table_ref
81 |
82 | def remove_write_context_from_table_ref(self, table_ref: TableRef) -> TableRef:
83 | table_ref = table_ref.replace_dataset(self.base_dataset)
84 | table_ref = table_ref.remove_audit_suffix()
85 | return table_ref
86 |
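    # Illustrative note, not part of the original module: assuming base_dataset="analytics" and
    # write_dataset="analytics_dev" (made-up names), the two helpers above map
    # analytics.core.orders to analytics_dev.core.orders___audit and back again.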
87 | def add_context_to_script(self, script: Script) -> Script:
88 | def add_context_to_dependency(dependency: TableRef) -> TableRef | None:
89 |             # We don't modify the project if it has been deliberately set
90 | if dependency.project is not None and dependency.project != script.table_ref.project:
91 | return None
92 |
93 | if (
94 | dependency.replace_dataset(self.base_dataset)
95 | in self.selected_table_refs
96 | | {
97 | self.remove_write_context_from_table_ref(table_ref)
98 | for table_ref in self.existing_audit_tables
99 | }
100 | and dependency.replace_dataset(self.base_dataset) in self.scripts
101 | ):
102 | dependency = dependency.add_audit_suffix()
103 |
104 | dependency = dependency.replace_dataset(self.write_dataset)
105 |
106 | return dependency
107 |
108 | script = replace_script_dependencies(script=script, replace_func=add_context_to_dependency)
109 |
110 | # If a script is marked as incremental, it implies that it can be run incrementally. This
111 | # means that we have to filter the script's dependencies, as well as filter the output.
112 | # This logic is implemented by the script's SQL dialect.
113 | if script.table_ref.replace_dataset(self.write_dataset) in self.incremental_table_refs:
114 | script = dataclasses.replace(
115 | script,
116 | code=script.sql_dialect.add_dependency_filters(
117 | code=script.code,
118 | incremental_field_name=self.incremental_field_name,
119 | incremental_field_values=self.incremental_field_values,
120 |                     # One caveat: dependencies which are themselves incremental do not have
121 |                     # to be filtered here. Indeed, they are already filtered by virtue of
122 |                     # being incremental.
123 | dependencies_to_filter=self.filterable_table_refs - self.incremental_table_refs,
124 | ),
125 | )
126 |
127 | # If the script is not incremental, we're not out of the woods! All scripts are
128 | # materialized into side-tables which we call "audit" tables. This is the WAP pattern.
129 | # Therefore, if a script is not incremental, but it depends on an incremental script, we
130 | # have to modify the script to use both the incremental and non-incremental versions of
131 | # the dependency. This is handled by the script's SQL dialect.
132 | elif self.incremental_table_refs:
133 | script = dataclasses.replace(
134 | script,
135 | code=script.sql_dialect.handle_incremental_dependencies(
136 | code=script.code,
137 | incremental_field_name=self.incremental_field_name,
138 | incremental_field_values=self.incremental_field_values,
139 | incremental_dependencies={
140 | incremental_table_ref: incremental_table_ref.add_audit_suffix()
141 | for incremental_table_ref in self.incremental_table_refs
142 | },
143 | ),
144 | )
145 |
146 | return script.replace_table_ref(self.add_write_context_to_table_ref(script.table_ref))
147 |
148 | def run_script(self, script: Script):
149 | # If the script is a test, we don't materialize it, we just query it. A test fails if it
150 | # returns any rows.
151 | if script.is_test:
152 | database_job = self.database_client.query_script(script=script)
153 | # If the script is not a test, it's a regular table, so we materialize it. Instead of
154 | # directly materializing it to the destination table, we materialize it to a side-table
155 | # which we call an "audit" table. Once all the scripts have run successfully, we will
156 | # promote the audit tables to the destination tables. This is the WAP pattern.
157 | else:
158 | database_job = self.database_client.materialize_script(script=script)
159 |
160 | job = Job(table_ref=script.table_ref, is_test=script.is_test, database_job=database_job)
161 | self.jobs.append(job)
162 |
163 | msg = f"{job.status} {script.table_ref}"
164 |
165 | if script.table_ref.remove_audit_suffix() in self.incremental_table_refs:
166 | msg += " (incremental)"
167 | lea.log.info(msg)
168 |
169 | self.monitor_job(job)
170 |
171 | def monitor_job(self, job: Job):
172 | # We're going to do exponential backoff. This is because we don't want to overload
173 | # whatever API is used to check whether a database job is over or not. We're going to
174 | # check every second, then every two seconds, then every four seconds, etc. until we
175 | # reach a maximum delay of 10 seconds.
176 | base_delay = 1
177 | max_delay = 10
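        # With these settings, the successive polling delays are 1s, 2s, 4s, 8s, then 10s onwards.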
178 | retries = 0
179 | checked_at = dt.datetime.now()
180 |
181 | while not self.stop_event.is_set():
182 | if not job.database_job.is_done:
183 | delay = min(max_delay, base_delay * (2**retries))
184 | retries += 1
185 | if (now := dt.datetime.now()) - checked_at >= dt.timedelta(seconds=10):
186 | duration_str = str(now - job.started_at).split(".")[0]
187 | lea.log.info(f"{job.status} {job.table_ref} after {duration_str}")
188 | checked_at = now
189 | time.sleep(delay)
190 | continue
191 |
192 | # Case 1: the job raised an exception
193 | if (exception := job.database_job.exception) is not None:
194 | job.status = JobStatus.ERRORED
195 | lea.log.error(f"{job.status} {job.table_ref}\n{exception}")
196 |
197 | # Case 2: the job succeeded, but it's a test and there are negative cases
198 | elif job.is_test and not (dataframe := job.database_job.result).empty:
199 | job.status = JobStatus.ERRORED
200 | lea.log.error(f"{job.status} {job.table_ref}\n{dataframe.head()}")
201 |
202 | # Case 3: the job succeeded!
203 | else:
204 | job.status = JobStatus.SUCCESS
205 | msg = f"{job.status} {job.table_ref}"
206 | job.ended_at = dt.datetime.now()
207 | # Depending on the warehouse in use, jobs may have a conclude() method, for example
208 | # for recording job statistics.
209 | job.database_job.conclude()
210 | duration_str = str(job.ended_at - job.started_at).split(".")[0]
211 | if job.ended_at - job.started_at >= dt.timedelta(seconds=1):
212 | msg += f", took {duration_str}"
213 | if job.database_job.billed_dollars is not None:
214 | msg += f", cost ${job.database_job.billed_dollars:.2f}"
215 | if not job.is_test:
216 | if (stats := job.database_job.statistics) is not None:
217 | msg += f", contains {stats.n_rows:,d} rows"
218 | if stats.n_bytes is not None:
219 | msg += f", weighs {format_bytes(stats.n_bytes)}"
220 | if job.database_job.metadata:
221 | msg += f" ({', '.join(job.database_job.metadata)})"
222 | lea.log.info(msg)
223 |
224 | return
225 |
226 | def promote_audit_table(self, table_ref: TableRef):
227 | from_table_ref = table_ref
228 | to_table_ref = table_ref.remove_audit_suffix()
229 |
230 | is_incremental = (
231 | self.incremental_field_name is not None and to_table_ref in self.incremental_table_refs
232 | )
233 | if is_incremental:
234 | database_job = self.database_client.delete_and_insert(
235 | from_table_ref=from_table_ref,
236 | to_table_ref=to_table_ref,
237 | on=self.incremental_field_name,
238 | )
239 | else:
240 | database_job = self.database_client.clone_table(
241 | from_table_ref=from_table_ref, to_table_ref=to_table_ref
242 | )
243 |
244 | job = Job(table_ref=to_table_ref, is_test=False, database_job=database_job)
245 | self.jobs.append(job)
246 | lea.log.info(f"{job.status} {job.table_ref}" + (" (incremental)" if is_incremental else ""))
247 |
248 | self.monitor_job(job)
249 |
250 | def end(self):
251 | lea.log.info("😴 Ending session")
252 | self.stop_event.set()
253 | for job in self.jobs:
254 | if job.status == JobStatus.RUNNING:
255 | job.database_job.stop()
256 | job.status = JobStatus.STOPPED
257 | lea.log.info(f"{job.status} {job.table_ref}")
258 | self.executor.shutdown()
259 | self.ended_at = dt.datetime.now()
260 |
261 | @property
262 | def any_error_has_occurred(self) -> bool:
263 | return any(job.status == JobStatus.ERRORED for job in self.jobs) or any(
264 | future.exception() is not None for future in self.run_script_futures_complete
265 | )
266 |
267 | @property
268 | def total_billed_dollars(self) -> float:
269 | return sum(
270 | job.database_job.billed_dollars
271 | for job in self.jobs
272 | if job.database_job.billed_dollars is not None
273 | )
274 |
275 |
276 | def replace_script_dependencies(
277 | script: Script, replace_func: Callable[[TableRef], TableRef]
278 | ) -> Script:
279 | """
280 |
281 | It's often necessary to edit the dependencies of a script. For example, we might want
282 | to change the dataset of a dependency. Or we might want to append a suffix a table name
283 | when we're doing a write/audit/publish operation.
284 |
285 | """
286 | code = script.code
287 |
288 | for dependency_to_edit in script.dependencies:
289 | new_dependency = replace_func(dependency_to_edit)
290 | if new_dependency is None:
291 | continue
292 |
293 | dependency_to_edit_without_project_str = script.sql_dialect.format_table_ref(
294 | dependency_to_edit.replace_project(None)
295 | )
296 | new_dependency_str = script.sql_dialect.format_table_ref(new_dependency)
297 | code = re.sub(
298 | rf"\b{dependency_to_edit_without_project_str}\b",
299 | new_dependency_str,
300 | code,
301 | )
302 |
303 | # We also have to handle the case where the table is referenced to access a field.
304 | # TODO: refactor this with the above
305 | dependency_to_edit_without_dataset = dataclasses.replace(
306 | dependency_to_edit, dataset="", project=None
307 | )
308 | dependency_to_edit_without_dataset_str = script.sql_dialect.format_table_ref(
309 | dependency_to_edit_without_dataset
310 | )
311 | new_dependency_without_dataset = dataclasses.replace(
312 | new_dependency, dataset="", project=None
313 | )
314 | new_dependency_without_dataset_str = script.sql_dialect.format_table_ref(
315 | new_dependency_without_dataset
316 | )
317 | code = re.sub(
318 | rf"\b{dependency_to_edit_without_dataset_str}\b",
319 | new_dependency_without_dataset_str,
320 | code,
321 | )
322 |
323 | return dataclasses.replace(script, code=code)
324 |
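# A minimal usage sketch (hypothetical, not part of the original module): pointing every
# dependency of a script at its audit counterpart before a write/audit/publish run,
# assuming `script` is any Script instance.
#
#     audited_script = replace_script_dependencies(
#         script=script,
#         replace_func=lambda table_ref: table_ref.add_audit_suffix(),
#     )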
325 |
326 | def format_bytes(size: float) -> str:
327 | # Define the size units in ascending order
328 | power = 1024
329 | n = 0
330 | units = ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"]
331 |
332 | # Convert bytes to the highest possible unit
333 | while size >= power and n < len(units) - 1:
334 | size /= power
335 | n += 1
336 |
337 | # Format the result as a whole number of the chosen unit
338 | return f"{size:.0f}{units[n]}"
339 |
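# A few hand-checked examples of the formatting behaviour above:
#
#     format_bytes(123)           -> "123B"
#     format_bytes(1024)          -> "1KB"
#     format_bytes(10 * 1024**2)  -> "10MB"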
--------------------------------------------------------------------------------
/lea/table_ref.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import dataclasses
4 | import pathlib
5 | import re
6 |
7 | AUDIT_TABLE_SUFFIX = "___audit"
8 |
9 |
10 | @dataclasses.dataclass(eq=True, frozen=True)
11 | class TableRef:
12 | dataset: str
13 | schema: tuple[str, ...]
14 | name: str
15 | project: str | None
16 |
17 | def __str__(self):
18 | return ".".join(filter(None, [self.project, self.dataset, *self.schema, self.name]))
19 |
20 | @classmethod
21 | def from_path(
22 | cls, scripts_dir: pathlib.Path, relative_path: pathlib.Path, project_name: str
23 | ) -> TableRef:
24 | parts = list(filter(None, relative_path.parts))
25 | *schema, filename = parts
26 | return cls(
27 | dataset=scripts_dir.name,
28 | schema=tuple(schema),
29 | name=filename.split(".")[0], # remove the extension
30 | project=project_name,
31 | )
32 |
33 | def replace_dataset(self, dataset: str) -> TableRef:
34 | return dataclasses.replace(self, dataset=dataset)
35 |
36 | def replace_project(self, project: str | None) -> TableRef:
37 | return dataclasses.replace(self, project=project)
38 |
39 | def add_audit_suffix(self) -> TableRef:
40 | if self.is_audit_table:
41 | return self
42 | return dataclasses.replace(self, name=f"{self.name}{AUDIT_TABLE_SUFFIX}")
43 |
44 | def remove_audit_suffix(self) -> TableRef:
45 | if self.is_audit_table:
46 | return dataclasses.replace(self, name=re.sub(rf"{AUDIT_TABLE_SUFFIX}$", "", self.name))
47 | return self
48 |
49 | @property
50 | def is_audit_table(self) -> bool:
51 | return self.name.endswith(AUDIT_TABLE_SUFFIX)
52 |
53 | @property
54 | def is_test(self) -> bool:
55 | return len(self.schema) > 0 and self.schema[0] == "tests"
56 |
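# A short illustration of the audit-suffix helpers (example values, not taken from the tests):
#
#     ref = TableRef(dataset="analytics", schema=("core",), name="orders", project=None)
#     str(ref)                               -> "analytics.core.orders"
#     str(ref.add_audit_suffix())            -> "analytics.core.orders___audit"
#     ref.add_audit_suffix().is_audit_table  -> True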
--------------------------------------------------------------------------------
/lea/test_big_query.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import re
4 |
5 | import pytest
6 | from google.auth.credentials import AnonymousCredentials
7 |
8 | from lea.conductor import Session
9 | from lea.databases import BigQueryClient, TableStats
10 | from lea.dialects import BigQueryDialect
11 | from lea.scripts import Script, TableRef
12 |
13 | DUMMY_TABLE_STATS = TableStats(n_rows=0, n_bytes=0, updated_at=None)
14 |
15 |
16 | @pytest.fixture
17 | def scripts() -> dict[TableRef, Script]:
18 | return {
19 | script.table_ref: script
20 | for script in [
21 | Script(
22 | table_ref=TableRef("read", ("raw",), "users", "test_project"),
23 | code="""
24 | SELECT * FROM UNNEST([
25 | STRUCT(1 AS id, 'Alice' AS name, 30 AS age),
26 | STRUCT(2 AS id, 'Bob' AS name, 25 AS age),
27 | STRUCT(3 AS id, 'Charlie' AS name, 35 AS age)
28 | ])
29 | """,
30 | sql_dialect=BigQueryDialect(),
31 | ),
32 | Script(
33 | table_ref=TableRef("read", ("core",), "users", "test_project"),
34 | code="""
35 | SELECT
36 | id,
37 | -- #INCREMENTAL
38 | name,
39 | age
40 | FROM read.raw__users
41 | """,
42 | sql_dialect=BigQueryDialect(),
43 | ),
44 | Script(
45 | table_ref=TableRef("read", ("analytics",), "n_users", "test_project"),
46 | code="""
47 | SELECT COUNT(*)
48 | FROM read.core__users
49 | """,
50 | sql_dialect=BigQueryDialect(),
51 | ),
52 | Script(
53 | table_ref=TableRef("read", ("analytics",), "n_users_with_unnest", "test_project"),
54 | code="""
55 | SELECT COUNT(*)
56 | FROM read.core__users, UNNEST([1, 2, 3]) AS n
57 | """,
58 | sql_dialect=BigQueryDialect(),
59 | ),
60 | ]
61 | }
62 |
63 |
64 | def assert_queries_are_equal(query1: str, query2: str):
65 | normalized_query1 = re.sub(r"\s+", " ", query1).strip()
66 | normalized_query2 = re.sub(r"\s+", " ", query2).strip()
67 | assert normalized_query1 == normalized_query2
68 |
69 |
70 | def test_simple_run(scripts):
71 | session = Session(
72 | database_client=None,
73 | base_dataset="read",
74 | write_dataset="write",
75 | scripts=scripts,
76 | selected_table_refs=scripts.keys(),
77 | unselected_table_refs=set(),
78 | existing_tables={},
79 | existing_audit_tables={},
80 | )
81 |
82 | assert_queries_are_equal(
83 | session.add_context_to_script(
84 | scripts[TableRef("read", ("raw",), "users", "test_project")]
85 | ).code,
86 | """
87 | SELECT * FROM UNNEST([
88 | STRUCT(1 AS id, 'Alice' AS name, 30 AS age),
89 | STRUCT(2 AS id, 'Bob' AS name, 25 AS age),
90 | STRUCT(3 AS id, 'Charlie' AS name, 35 AS age)
91 | ])
92 | """,
93 | )
94 | assert_queries_are_equal(
95 | session.add_context_to_script(
96 | scripts[TableRef("read", ("analytics",), "n_users", "test_project")]
97 | ).code,
98 | """
99 | SELECT COUNT(*)
100 | FROM test_project.write.core__users___audit
101 | """,
102 | )
103 |
104 |
105 | def test_incremental_field(scripts):
106 | session = Session(
107 | database_client=None,
108 | base_dataset="read",
109 | write_dataset="write",
110 | scripts=scripts,
111 | selected_table_refs=scripts.keys(),
112 | unselected_table_refs=set(),
113 | existing_tables={},
114 | existing_audit_tables={},
115 | incremental_field_name="name",
116 | incremental_field_values={"Alice"},
117 | )
118 |
119 | assert_queries_are_equal(
120 | session.add_context_to_script(
121 | scripts[TableRef("read", ("core",), "users", "test_project")]
122 | ).code,
123 | """
124 | SELECT *
125 | FROM (
126 | SELECT id, name, age
127 | FROM test_project.write.raw__users___audit
128 | )
129 | WHERE name IN ('Alice')
130 | """,
131 | )
132 |
133 | assert_queries_are_equal(
134 | session.add_context_to_script(
135 | scripts[TableRef("read", ("analytics",), "n_users", "test_project")]
136 | ).code,
137 | """
138 | SELECT COUNT(*) FROM (
139 | SELECT *
140 | FROM test_project.write.core__users___audit
141 | WHERE name IN ('Alice')
142 |
143 | UNION ALL
144 |
145 | SELECT *
146 | FROM test_project.write.core__users
147 | WHERE name NOT IN ('Alice')
148 | )
149 | """,
150 | )
151 |
152 |
153 | def test_incremental_field_with_comma(scripts):
154 | session = Session(
155 | database_client=None,
156 | base_dataset="read",
157 | write_dataset="write",
158 | scripts=scripts,
159 | selected_table_refs=scripts.keys(),
160 | unselected_table_refs=set(),
161 | existing_tables={},
162 | existing_audit_tables={},
163 | incremental_field_name="name",
164 | incremental_field_values={"Alice"},
165 | )
166 |
167 | assert_queries_are_equal(
168 | session.add_context_to_script(
169 | scripts[TableRef("read", ("core",), "users", "test_project")]
170 | ).code,
171 | """
172 | SELECT *
173 | FROM (
174 | SELECT id, name, age
175 | FROM test_project.write.raw__users___audit
176 | )
177 | WHERE name IN ('Alice')
178 | """,
179 | )
180 |
181 | assert_queries_are_equal(
182 | session.add_context_to_script(
183 | scripts[TableRef("read", ("analytics",), "n_users_with_unnest", "test_project")]
184 | ).code,
185 | """
186 | SELECT COUNT(*) FROM (
187 | SELECT *
188 | FROM test_project.write.core__users___audit
189 | WHERE name IN ('Alice')
190 |
191 | UNION ALL
192 |
193 | SELECT *
194 | FROM test_project.write.core__users
195 | WHERE name NOT IN ('Alice')
196 | ) , UNNEST([1, 2, 3]) AS n
197 | """,
198 | )
199 |
200 |
201 | def test_incremental_field_but_no_incremental_table_selected(scripts):
202 | session = Session(
203 | database_client=None,
204 | base_dataset="read",
205 | write_dataset="write",
206 | scripts=scripts,
207 | selected_table_refs={TableRef("read", ("analytics",), "n_users", "test_project")},
208 | unselected_table_refs=set(),
209 | existing_tables={},
210 | existing_audit_tables={},
211 | incremental_field_name="name",
212 | incremental_field_values={"Alice"},
213 | )
214 |
215 | assert_queries_are_equal(
216 | session.add_context_to_script(
217 | scripts[TableRef("read", ("core",), "users", "test_project")]
218 | ).code,
219 | """
220 | SELECT
221 | id,
222 | -- #INCREMENTAL
223 | name,
224 | age
225 | FROM test_project.write.raw__users
226 | """,
227 | )
228 |
229 |
230 | def test_incremental_field_with_just_incremental_table_selected(scripts):
231 | session = Session(
232 | database_client=None,
233 | base_dataset="read",
234 | write_dataset="write",
235 | scripts=scripts,
236 | selected_table_refs={TableRef("read", ("core",), "users", "test_project")},
237 | unselected_table_refs=set(),
238 | existing_tables={},
239 | existing_audit_tables={},
240 | incremental_field_name="name",
241 | incremental_field_values={"Alice"},
242 | )
243 |
244 | assert_queries_are_equal(
245 | session.add_context_to_script(
246 | scripts[TableRef("read", ("core",), "users", "test_project")]
247 | ).code,
248 | """
249 | SELECT *
250 | FROM (
251 | SELECT id, name, age
252 | FROM test_project.write.raw__users
253 | )
254 | WHERE name IN ('Alice')
255 | """,
256 | )
257 |
258 |
259 | def test_incremental_field_with_just_incremental_table_selected_and_materialized_dependency(
260 | scripts,
261 | ):
262 | session = Session(
263 | database_client=None,
264 | base_dataset="read",
265 | write_dataset="write",
266 | scripts=scripts,
267 | selected_table_refs={TableRef("read", ("core",), "users", "test_project")},
268 | unselected_table_refs=set(),
269 | existing_tables={},
270 | existing_audit_tables={
271 | TableRef("read", ("raw",), "users", "test_project"): DUMMY_TABLE_STATS
272 | },
273 | incremental_field_name="name",
274 | incremental_field_values={"Alice"},
275 | )
276 |
277 | assert_queries_are_equal(
278 | session.add_context_to_script(
279 | scripts[TableRef("read", ("core",), "users", "test_project")]
280 | ).code,
281 | """
282 | SELECT *
283 | FROM (
284 | SELECT id, name, age
285 | FROM test_project.write.raw__users___audit
286 | )
287 | WHERE name IN ('Alice')
288 | """,
289 | )
290 |
291 |
292 | def test_incremental_field_but_no_incremental_table_selected_and_yet_dependency_is_materialized(
293 | scripts,
294 | ):
295 | session = Session(
296 | database_client=None,
297 | base_dataset="read",
298 | write_dataset="write",
299 | scripts=scripts,
300 | selected_table_refs={TableRef("read", ("analytics",), "n_users", "test_project")},
301 | unselected_table_refs=set(),
302 | existing_tables={},
303 | existing_audit_tables={
304 | TableRef("read", ("core",), "users", "test_project"): DUMMY_TABLE_STATS,
305 | },
306 | incremental_field_name="name",
307 | incremental_field_values={"Alice"},
308 | )
309 |
310 | assert_queries_are_equal(
311 | session.add_context_to_script(
312 | scripts[TableRef("read", ("analytics",), "n_users", "test_project")]
313 | ).code,
314 | """
315 | SELECT COUNT(*)
316 | FROM (
317 | SELECT *
318 | FROM test_project.write.core__users___audit
319 | WHERE name IN ('Alice')
320 |
321 | UNION ALL
322 |
323 | SELECT *
324 | FROM test_project.write.core__users
325 | WHERE name NOT IN ('Alice')
326 | )
327 | """,
328 | )
329 |
330 |
331 | def test_incremental_field_but_no_incremental_table_selected_and_yet_dependency_is_materialized_with_client(
332 | scripts,
333 | ):
334 | session = Session(
335 | database_client=BigQueryClient(
336 | credentials=AnonymousCredentials(),
337 | location="EU",
338 | write_project_id="write-project-id",
339 | compute_project_id="compute-project-id",
340 | ),
341 | base_dataset="read",
342 | write_dataset="write",
343 | scripts=scripts,
344 | selected_table_refs={TableRef("read", ("analytics",), "n_users", "test_project")},
345 | unselected_table_refs=set(),
346 | existing_tables={},
347 | existing_audit_tables={
348 | TableRef("read", ("core",), "users", "test_project"): DUMMY_TABLE_STATS,
349 | },
350 | incremental_field_name="name",
351 | incremental_field_values={"Alice"},
352 | )
353 |
354 | assert_queries_are_equal(
355 | session.add_context_to_script(
356 | scripts[TableRef("read", ("analytics",), "n_users", "test_project")]
357 | ).code,
358 | """
359 | SELECT COUNT(*)
360 | FROM (
361 | SELECT *
362 | FROM test_project.write.core__users___audit
363 | WHERE name IN ('Alice')
364 |
365 | UNION ALL
366 |
367 | SELECT *
368 | FROM test_project.write.core__users
369 | WHERE name NOT IN ('Alice')
370 | )
371 | """,
372 | )
373 |
--------------------------------------------------------------------------------
/lea/test_duckdb.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import re
4 | from pathlib import Path
5 |
6 | import pytest
7 |
8 | from lea.conductor import Session
9 | from lea.databases import DuckDBClient, TableStats
10 | from lea.dialects import DuckDBDialect
11 | from lea.scripts import Script, TableRef
12 |
13 | DUMMY_TABLE_STATS = TableStats(n_rows=0, n_bytes=0, updated_at=None)
14 |
15 |
16 | @pytest.fixture
17 | def scripts() -> dict[TableRef, Script]:
18 | return {
19 | script.table_ref: script
20 | for script in [
21 | Script(
22 | table_ref=TableRef("read", ("raw",), "users", "test_project"),
23 | code="""
24 | SELECT * FROM (
25 | SELECT UNNEST(
26 | [
27 | {'id': 1, 'name': 'Alice', 'age': 30},
28 | {'id': 2, 'name': 'Bob', 'age': 25},
29 | {'id': 3, 'name': 'Charlie', 'age': 35}
30 | ], max_depth => 2
31 | )
32 | )
33 | """,
34 | sql_dialect=DuckDBDialect(),
35 | ),
36 | Script(
37 | table_ref=TableRef("read", ("core",), "users", "test_project"),
38 | code="""
39 | SELECT
40 | id,
41 | -- #INCREMENTAL
42 | name,
43 | age
44 | FROM raw.users
45 | """,
46 | sql_dialect=DuckDBDialect(),
47 | ),
48 | Script(
49 | table_ref=TableRef("read", ("analytics",), "n_users", "test_project"),
50 | code="""
51 | SELECT COUNT(*)
52 | FROM core.users
53 | """,
54 | sql_dialect=DuckDBDialect(),
55 | ),
56 | ]
57 | }
58 |
59 |
60 | def assert_queries_are_equal(query1: str, query2: str):
61 | normalized_query1 = re.sub(r"\s+", " ", query1).strip()
62 | normalized_query2 = re.sub(r"\s+", " ", query2).strip()
63 | assert normalized_query1 == normalized_query2
64 |
65 |
66 | def test_simple_run(scripts):
67 | session = Session(
68 | database_client=None,
69 | base_dataset="read",
70 | write_dataset="write",
71 | scripts=scripts,
72 | selected_table_refs=scripts.keys(),
73 | unselected_table_refs=set(),
74 | existing_tables={},
75 | existing_audit_tables={},
76 | )
77 |
78 | assert_queries_are_equal(
79 | session.add_context_to_script(
80 | scripts[TableRef("read", ("raw",), "users", "test_project")]
81 | ).code,
82 | """
83 | SELECT * FROM (
84 | SELECT UNNEST(
85 | [
86 | {'id': 1, 'name': 'Alice', 'age': 30},
87 | {'id': 2, 'name': 'Bob', 'age': 25},
88 | {'id': 3, 'name': 'Charlie', 'age': 35}
89 | ], max_depth => 2
90 | )
91 | )
92 | """,
93 | )
94 | assert_queries_are_equal(
95 | session.add_context_to_script(
96 | scripts[TableRef("read", ("analytics",), "n_users", "test_project")]
97 | ).code,
98 | """
99 | SELECT COUNT(*)
100 | FROM core.users___audit
101 | """,
102 | )
103 |
104 |
105 | def test_incremental_field(scripts):
106 | session = Session(
107 | database_client=None,
108 | base_dataset="read",
109 | write_dataset="write",
110 | scripts=scripts,
111 | selected_table_refs=scripts.keys(),
112 | unselected_table_refs=set(),
113 | existing_tables={},
114 | existing_audit_tables={},
115 | incremental_field_name="name",
116 | incremental_field_values={"Alice"},
117 | )
118 |
119 | assert_queries_are_equal(
120 | session.add_context_to_script(
121 | scripts[TableRef("read", ("core",), "users", "test_project")]
122 | ).code,
123 | """
124 | SELECT *
125 | FROM (
126 | SELECT id, name, age
127 | FROM raw.users___audit
128 | )
129 | WHERE name IN ('Alice')
130 | """,
131 | )
132 |
133 | assert_queries_are_equal(
134 | session.add_context_to_script(
135 | scripts[TableRef("read", ("analytics",), "n_users", "test_project")]
136 | ).code,
137 | """
138 | SELECT COUNT(*) FROM (
139 | SELECT *
140 | FROM core.users___audit
141 | WHERE name IN ('Alice')
142 |
143 | UNION ALL
144 |
145 | SELECT *
146 | FROM core.users
147 | WHERE name NOT IN ('Alice')
148 | )
149 | """,
150 | )
151 |
152 |
153 | def test_incremental_field_but_no_incremental_table_selected(scripts):
154 | session = Session(
155 | database_client=None,
156 | base_dataset="read",
157 | write_dataset="write",
158 | scripts=scripts,
159 | selected_table_refs={TableRef("read", ("analytics",), "n_users", "test_project")},
160 | unselected_table_refs=set(),
161 | existing_tables={},
162 | existing_audit_tables={},
163 | incremental_field_name="name",
164 | incremental_field_values={"Alice"},
165 | )
166 |
167 | assert_queries_are_equal(
168 | session.add_context_to_script(
169 | scripts[TableRef("read", ("core",), "users", "test_project")]
170 | ).code,
171 | """
172 | SELECT
173 | id,
174 | -- #INCREMENTAL
175 | name,
176 | age
177 | FROM raw.users
178 | """,
179 | )
180 |
181 |
182 | @pytest.mark.duckdb
183 | def test_incremental_field_with_just_incremental_table_selected(scripts):
184 | session = Session(
185 | database_client=None,
186 | base_dataset="read",
187 | write_dataset="write",
188 | scripts=scripts,
189 | selected_table_refs={TableRef("read", ("core",), "users", "test_project")},
190 | unselected_table_refs=set(),
191 | existing_tables={},
192 | existing_audit_tables={},
193 | incremental_field_name="name",
194 | incremental_field_values={"Alice"},
195 | )
196 |
197 | assert_queries_are_equal(
198 | session.add_context_to_script(
199 | scripts[TableRef("read", ("core",), "users", "test_project")]
200 | ).code,
201 | """
202 | SELECT *
203 | FROM (
204 | SELECT id, name, age
205 | FROM raw.users
206 | )
207 | WHERE name IN ('Alice')
208 | """,
209 | )
210 |
211 |
212 | def test_incremental_field_with_just_incremental_table_selected_and_materialized_dependency(
213 | scripts,
214 | ):
215 | session = Session(
216 | database_client=None,
217 | base_dataset="read",
218 | write_dataset="write",
219 | scripts=scripts,
220 | selected_table_refs={TableRef("read", ("core",), "users", "test_project")},
221 | unselected_table_refs=set(),
222 | existing_tables={},
223 | existing_audit_tables={
224 | TableRef("read", ("raw",), "users", "test_project"): DUMMY_TABLE_STATS
225 | },
226 | incremental_field_name="name",
227 | incremental_field_values={"Alice"},
228 | )
229 |
230 | assert_queries_are_equal(
231 | session.add_context_to_script(
232 | scripts[TableRef("read", ("core",), "users", "test_project")]
233 | ).code,
234 | """
235 | SELECT *
236 | FROM (
237 | SELECT id, name, age
238 | FROM raw.users___audit
239 | )
240 | WHERE name IN ('Alice')
241 | """,
242 | )
243 |
244 |
245 | def test_incremental_field_but_no_incremental_table_selected_and_yet_dependency_is_materialized(
246 | scripts,
247 | ):
248 | session = Session(
249 | database_client=None,
250 | base_dataset="read",
251 | write_dataset="write",
252 | scripts=scripts,
253 | selected_table_refs={TableRef("read", ("analytics",), "n_users", "test_project")},
254 | unselected_table_refs=set(),
255 | existing_tables={},
256 | existing_audit_tables={
257 | TableRef("read", ("core",), "users", "test_project"): DUMMY_TABLE_STATS,
258 | },
259 | incremental_field_name="name",
260 | incremental_field_values={"Alice"},
261 | )
262 |
263 | assert_queries_are_equal(
264 | session.add_context_to_script(
265 | scripts[TableRef("read", ("analytics",), "n_users", "test_project")]
266 | ).code,
267 | """
268 | SELECT COUNT(*)
269 | FROM (
270 | SELECT *
271 | FROM core.users___audit
272 | WHERE name IN ('Alice')
273 |
274 | UNION ALL
275 |
276 | SELECT *
277 | FROM core.users
278 | WHERE name NOT IN ('Alice')
279 | )
280 | """,
281 | )
282 |
283 |
284 | def test_incremental_field_but_no_incremental_table_selected_and_yet_dependency_is_materialized_with_client(
285 | scripts,
286 | ):
287 | session = Session(
288 | database_client=DuckDBClient(
289 | database_path=Path("./test_duckdb"),
290 | dry_run=False,
291 | print_mode=False,
292 | ),
293 | base_dataset="read",
294 | write_dataset="write",
295 | scripts=scripts,
296 | selected_table_refs={TableRef("read", ("analytics",), "n_users", "test_project")},
297 | unselected_table_refs=set(),
298 | existing_tables={},
299 | existing_audit_tables={
300 | TableRef("read", ("core",), "users", "test_project"): DUMMY_TABLE_STATS,
301 | },
302 | incremental_field_name="name",
303 | incremental_field_values={"Alice"},
304 | )
305 |
306 | assert_queries_are_equal(
307 | session.add_context_to_script(
308 | scripts[TableRef("read", ("analytics",), "n_users", "test_project")]
309 | ).code,
310 | """
311 | SELECT COUNT(*)
312 | FROM (
313 | SELECT *
314 | FROM core.users___audit
315 | WHERE name IN ('Alice')
316 |
317 | UNION ALL
318 |
319 | SELECT *
320 | FROM core.users
321 | WHERE name NOT IN ('Alice')
322 | )
323 | """,
324 | )
325 |
--------------------------------------------------------------------------------
/lea/test_table_ref.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pathlib
4 |
5 | import pytest
6 |
7 | from lea.table_ref import TableRef
8 |
9 |
10 | @pytest.mark.parametrize(
11 | "table_ref, expected",
12 | [
13 | pytest.param(table_ref, expected, id=str(table_ref))
14 | for table_ref, expected in [
15 | (
16 | TableRef("my_dataset", ("my_schema",), "my_table", "my_project"),
17 | "my_project.my_dataset.my_schema.my_table",
18 | ),
19 | (
20 | TableRef("my_dataset", (), "my_table", "my_project"),
21 | "my_project.my_dataset.my_table",
22 | ),
23 | (
24 | TableRef("my_dataset", ("my_schema", "my_subschema"), "my_table", "my_project"),
25 | "my_project.my_dataset.my_schema.my_subschema.my_table",
26 | ),
27 | ]
28 | ],
29 | )
30 | def test_str(table_ref, expected):
31 | assert str(table_ref) == expected
32 |
33 |
34 | @pytest.mark.parametrize(
35 | "table_ref, expected",
36 | [
37 | pytest.param(table_ref, expected, id=str(table_ref))
38 | for table_ref, expected in [
39 | (
40 | TableRef("my_dataset", ("my_schema",), "my_table", None),
41 | "TableRef(dataset='my_dataset', schema=('my_schema',), name='my_table', project=None)",
42 | ),
43 | (
44 | TableRef("my_dataset", (), "my_table", None),
45 | "TableRef(dataset='my_dataset', schema=(), name='my_table', project=None)",
46 | ),
47 | (
48 | TableRef("my_dataset", ("my_schema", "my_subschema"), "my_table", "my_project"),
49 | "TableRef(dataset='my_dataset', schema=('my_schema', 'my_subschema'), name='my_table', project='my_project')",
50 | ),
51 | ]
52 | ],
53 | )
54 | def test_repr(table_ref, expected):
55 | assert repr(table_ref) == expected
56 |
57 |
58 | def test_from_path():
59 | scripts_dir = pathlib.Path("my_dataset")
60 | relative_path = pathlib.Path("my_schema/my_table.sql")
61 | table_ref = TableRef.from_path(scripts_dir, relative_path, "my_project")
62 | assert table_ref == TableRef("my_dataset", ("my_schema",), "my_table", "my_project")
63 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | authors = ["Max Halford "]
3 | description = "A minimalist alternative to dbt"
4 | name = "lea-cli"
5 | packages = [
6 | {include = "lea", from = "."},
7 | ]
8 | version = "0.10.3"
9 |
10 | [tool.poetry.dependencies]
11 | click = "^8.1.7"
12 | Jinja2 = "^3.1.2"
13 | db-dtypes = "^1.1.1"
14 | duckdb = "^1.0.0"
15 | gitpython = "^3.1.43"
16 | google-cloud-bigquery = "^3.11.4"
17 | pandas = "^2.1.3"
18 | python = ">=3.10,<4"
19 | python-dotenv = "^1.0.0"
20 | rich = ">=13.5.3,<15.0.0"
21 | sqlglot = "^26.0.0"
22 | rsa = "^4.7"
23 | google-cloud-bigquery-storage = "^2.27.0"
24 | requests = "^2.32.3"
25 |
26 | [tool.poetry.group.dev.dependencies]
27 | ipykernel = "^6.21.2"
28 | pre-commit = ">=3.5,<5.0"
29 | pytest = ">=7.4.2,<9.0.0"
30 | ruff = ">=0.1,<0.12"
31 |
32 | [build-system]
33 | build-backend = "poetry.core.masonry.api"
34 | requires = ["poetry-core>=1.0.0"]
35 |
36 | [tool.poetry.scripts]
37 | lea = "lea.cli:app"
38 |
39 | [tool.ruff]
40 | lint.ignore = ["E501"]
41 | line-length = 100
42 | lint.select = ["E", "F", "I", "UP"] # https://beta.ruff.rs/docs/rules/
43 | target-version = 'py310'
44 |
45 | [tool.ruff.lint.isort]
46 | required-imports = ["from __future__ import annotations"]
47 |
48 | [tool.pytest.ini_options]
49 | addopts = [
50 | "--doctest-modules",
51 | "--doctest-glob=README.md",
52 | "--ignore=examples",
53 | "--verbose",
54 | "--color=yes",
55 | ]
56 | markers = [
57 | "duckdb: quack quack"
58 | ]
59 |
--------------------------------------------------------------------------------