├── .coveragerc
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows
│       └── onpush.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CODEOWNERS
├── CONTRIBUTING.md
├── CONTRIBUTORS.md
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── brickflow
│   ├── __init__.py
│   ├── bundles
│   │   ├── __init__.py
│   │   └── model.py
│   ├── cli
│   │   ├── __init__.py
│   │   ├── bundles.py
│   │   ├── commands.py
│   │   ├── configure.py
│   │   ├── constants.py
│   │   ├── entrypoint.template
│   │   ├── gitignore_template.txt
│   │   └── projects.py
│   ├── codegen
│   │   ├── __init__.py
│   │   └── databricks_bundle.py
│   ├── context
│   │   ├── __init__.py
│   │   └── context.py
│   ├── engine
│   │   ├── __init__.py
│   │   ├── compute.py
│   │   ├── hooks.py
│   │   ├── project.py
│   │   ├── task.py
│   │   ├── utils.py
│   │   └── workflow.py
│   ├── hints
│   │   ├── __init__.py
│   │   ├── hint.py
│   │   └── py.typed
│   └── resolver
│       └── __init__.py
├── brickflow_plugins
│   ├── __init__.py
│   ├── airflow
│   │   ├── __init__.py
│   │   ├── brickflow_task_plugin.py
│   │   ├── context
│   │   │   └── __init__.py
│   │   ├── cronhelper.py
│   │   ├── operators
│   │   │   ├── __init__.py
│   │   │   ├── external_tasks.py
│   │   │   ├── external_tasks_tableau.py
│   │   │   └── native_operators.py
│   │   └── vendor
│   │       ├── __init__.py
│   │       ├── context.py
│   │       ├── timetable.py
│   │       └── timezone.py
│   ├── databricks
│   │   ├── __init__.py
│   │   ├── box_operator.py
│   │   ├── run_job.py
│   │   ├── sla_sensor.py
│   │   ├── uc_to_snowflake_operator.py
│   │   └── workflow_dependency_sensor.py
│   └── secrets
│       └── __init__.py
├── docs
│   ├── api
│   │   ├── airflow_external_task_dependency.md
│   │   ├── airflow_native_operators.md
│   │   ├── airflow_tableau_operators.md
│   │   ├── box_operator.md
│   │   ├── cli.md
│   │   ├── compute.md
│   │   ├── context.md
│   │   ├── project.md
│   │   ├── secrets.md
│   │   ├── sla_sensor.md
│   │   ├── task.md
│   │   ├── uc_to_snowflake_operator.md
│   │   ├── workflow.md
│   │   └── workflow_dependency_sensor.md
│   ├── bundles-quickstart.md
│   ├── cli
│   │   └── reference.md
│   ├── css
│   │   └── custom.css
│   ├── environment-variables.md
│   ├── faq
│   │   └── faq.md
│   ├── highlevel.md
│   ├── how-imports-work.md
│   ├── img
│   │   ├── bf_logo.png
│   │   ├── bf_logo_1.png
│   │   ├── maintainance.png
│   │   └── workflow.png
│   ├── index.md
│   ├── projects.md
│   ├── tasks.md
│   ├── upgrades
│   │   └── upgrade-pre-0-10-0-to-0-10-0.md
│   └── workflows.md
├── examples
│   ├── brickflow_examples
│   │   ├── .brickflow-project-root.yml
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── brickflow-multi-project.yml
│   │   ├── notebooks
│   │   │   ├── __init__.py
│   │   │   └── example_notebook.py
│   │   ├── src
│   │   │   ├── __init__.py
│   │   │   ├── python
│   │   │   │   ├── __init__.py
│   │   │   │   ├── lending_data_show.py
│   │   │   │   └── setup_data.py
│   │   │   └── sql
│   │   │       └── sample.sql
│   │   └── workflows
│   │       ├── __init__.py
│   │       ├── demo_wf.py
│   │       └── entrypoint.py
│   ├── brickflow_for_each_task_examples
│   │   ├── .brickflow-project-root.yml
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── brickflow-multi-project.yml
│   │   ├── notebooks
│   │   │   ├── __init__.py
│   │   │   └── example_notebook.py
│   │   ├── src
│   │   │   ├── __init__.py
│   │   │   └── python
│   │   │       ├── __init__.py
│   │   │       └── print_args.py
│   │   └── workflows
│   │       ├── __init__.py
│   │       ├── entrypoint.py
│   │       └── for_each_task_wf.py
│   └── brickflow_serverless_examples
│       ├── .brickflow-project-root.yml
│       ├── .gitignore
│       ├── README.md
│       ├── __init__.py
│       ├── brickflow-multi-project.yml
│       ├── notebooks
│       │   ├── __init__.py
│       │   └── example_notebook.py
│       ├── src
│       │   ├── __init__.py
│       │   └── python
│       │       ├── __init__.py
│       │       └── example.py
│       └── workflows
│           ├── __init__.py
│           ├── demo_serverless_wf.py
│           └── entrypoint.py
├── mkdocs.yml
├── poetry.lock
├── prospector.yaml
├── pyproject.toml
├── tests
│   ├── __init__.py
│   ├── airflow_plugins
│   │   ├── __init__.py
│   │   ├── test_autosys.py
│   │   ├── test_tableau.py
│   │   └── test_task_dependency.py
│   ├── cli
│   │   ├── __init__.py
│   │   ├── sample_yaml_project
│   │   │   ├── .brickflow-project-root.yaml
│   │   │   └── brickflow-multi-project.yaml
│   │   ├── sample_yml_project
│   │   │   ├── .brickflow-project-root.yml
│   │   │   └── brickflow-multi-project.yml
│   │   ├── test_bundles.py
│   │   ├── test_cli.py
│   │   └── test_projects.py
│   ├── codegen
│   │   ├── __init__.py
│   │   ├── expected_bundles
│   │   │   ├── dev_bundle_monorepo.yml
│   │   │   ├── dev_bundle_polyrepo.yml
│   │   │   ├── dev_bundle_polyrepo_with_auto_libs.yml
│   │   │   ├── local_bundle.yml
│   │   │   ├── local_bundle_continuous_schedule.yml
│   │   │   ├── local_bundle_foreach_task.yml
│   │   │   ├── local_bundle_prefix_suffix.yml
│   │   │   └── local_serverless_bundle.yml
│   │   ├── sample_serverless_workflow.py
│   │   ├── sample_workflows.py
│   │   └── test_databricks_bundle.py
│   ├── context
│   │   ├── __init__.py
│   │   └── test_context.py
│   ├── databricks_plugins
│   │   ├── __init__.py
│   │   ├── test_box_operator.py
│   │   ├── test_run_job.py
│   │   ├── test_sla_sensor.py
│   │   ├── test_workflow_dependency_sensor.py
│   │   └── test_workflow_task_dependency_sensor.py
│   ├── engine
│   │   ├── __init__.py
│   │   ├── sample_workflow.py
│   │   ├── sample_workflow_2.py
│   │   ├── test_compute.py
│   │   ├── test_engine.py
│   │   ├── test_project.py
│   │   ├── test_task.py
│   │   ├── test_utils.py
│   │   └── test_workflow.py
│   ├── resolver
│   │   └── test_resolver.py
│   ├── sample_workflows
│   │   ├── __init__.py
│   │   ├── sample_workflow_1.py
│   │   └── sample_workflow_2.py
│   ├── test_brickflow.py
│   └── test_plugins.py
└── tools
    ├── README.md
    ├── gen-bundle.sh
    ├── install_databricks_cli.py
    ├── modify_model.py
    └── modify_schema.py
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit =
3 | *tests*
4 | brickflow/tf/*
5 | '*/.local/*',
6 | '**',
7 | 'tests/*',
8 | '*/tests/*',
9 | # omit anything in a .venv directory anywhere
10 | '.venv/*',
11 | "*/site-packages/*"
12 |
13 | [html]
14 | skip_empty = true
15 |
16 | [report]
17 | skip_empty = true
18 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: "[BUG] Please add your bug title here"
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Cloud Information**
27 |
28 |
29 | - [ ] AWS
30 | - [ ] Azure
31 | - [ ] GCP
32 | - [ ] Other
33 |
34 | **Desktop (please complete the following information):**
35 | - OS: [e.g. iOS]
36 | - Browser [e.g. chrome, safari]
37 | - Version [e.g. 22]
38 |
39 | **Additional context**
40 | Add any other context about the problem here.
41 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: "[FEATURE] Please add your feature request title"
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Cloud Information**
14 |
15 |
16 | - [ ] AWS
17 | - [ ] Azure
18 | - [ ] GCP
19 | - [ ] Other
20 |
21 | **Describe the solution you'd like**
22 | A clear and concise description of what you want to happen.
23 |
24 | **Describe alternatives you've considered**
25 | A clear and concise description of any alternative solutions or features you've considered.
26 |
27 | **Additional context**
28 | Add any other context or screenshots about the feature request here.
29 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Description
4 |
5 |
6 | ## Related Issue
7 |
8 |
9 |
10 |
11 |
12 | ## Motivation and Context
13 |
14 |
15 | ## How Has This Been Tested?
16 |
17 |
18 |
19 |
20 | ## Screenshots (if appropriate):
21 |
22 | ## Types of changes
23 |
24 | - [ ] Bug fix (non-breaking change which fixes an issue)
25 | - [ ] New feature (non-breaking change which adds functionality)
26 | - [ ] Breaking change (fix or feature that would cause existing functionality to change)
27 |
28 | ## Checklist:
29 |
30 |
31 | - [ ] My code follows the code style of this project.
32 | - [ ] My change requires a change to the documentation.
33 | - [ ] I have updated the documentation accordingly.
34 | - [ ] I have read the **CONTRIBUTING** document.
35 | - [ ] I have added tests to cover my changes.
36 | - [ ] All new and existing tests passed.
37 |
--------------------------------------------------------------------------------
/.github/workflows/onpush.yml:
--------------------------------------------------------------------------------
1 | name: build
2 |
3 | on:
4 | pull_request:
5 | types: [ opened, synchronize ]
6 | push:
7 | branches: [ main ]
8 | release:
9 | types: [ created ]
10 |
11 | jobs:
12 | test-pipeline:
13 | runs-on: ${{ matrix.os }}
14 | container:
15 | image: python:${{ matrix.python-version }}
16 | options: --user 1001 # run as the runner user instead of root
17 | strategy:
18 | max-parallel: 2
19 | matrix:
20 | python-version: [ '3.9' ]
21 | os: [ ubuntu-latest ]
22 |
23 | steps:
24 | - uses: actions/checkout@v3
25 |
26 | - name: Set up JDK # used for py4j for cronutils
27 | uses: actions/setup-java@v3
28 | with:
29 | java-version: '8'
30 | distribution: 'adopt'
31 |
32 | - name: Install pip
33 | run: python -m pip install --upgrade pip
34 |
35 | - name: Install and configure Poetry
36 | uses: snok/install-poetry@v1
37 |
38 | - name: Install poetry and build tools
39 | run: |
40 | export PATH=$PATH:$HOME/.local/bin
41 | poetry self add "poetry-dynamic-versioning[plugin]"
42 |
43 | - name: Install dependencies
44 | run: |
45 | export PATH=$PATH:$HOME/.local/bin
46 | make poetry
47 |
48 | - name: Install, lint and test
49 | run: |
50 | export PATH=$PATH:$HOME/.local/bin
51 | export GITHUB_ACTIONS=true
52 | make cov
53 |
54 | - name: Publish test coverage
55 | uses: codecov/codecov-action@v3
56 | with:
57 | token: ${{ secrets.CODECOV_TOKEN }}
58 | files: coverage.xml
59 |
60 | deploy:
61 | name: Deploy to PyPi
62 | runs-on: ${{ matrix.os }}
63 | container:
64 | image: python:${{ matrix.python-version }}
65 | options: --user 1001 # run as the runner user instead of root
66 | strategy:
67 | max-parallel: 2
68 | matrix:
69 | python-version: [ '3.9' ]
70 | os: [ ubuntu-latest ]
71 | needs:
72 | - test-pipeline
73 | if: github.event_name == 'release'
74 | steps:
75 | - uses: actions/checkout@v3 # use latest version of the checkout action
76 |
77 | - name: Set up JDK # used for py4j for cronutils
78 | uses: actions/setup-java@v3
79 | with:
80 | java-version: '8'
81 | distribution: 'adopt'
82 |
83 | - name: Install pip
84 | run: python -m pip install --upgrade pip
85 |
86 | - name: Install and configure Poetry
87 | uses: snok/install-poetry@v1
88 |
89 | - name: Install build tools
90 | run: |
91 | export PATH=$PATH:$HOME/.local/bin
92 | poetry self add "poetry-dynamic-versioning[plugin]"
93 |
94 | - name: Install dependencies
95 | run: |
96 | export PATH=$PATH:$HOME/.local/bin
97 | make poetry
98 |
99 | - name: Install wheel and twine
100 | run: python -m pip install wheel twine
101 |
102 | - name: Build and publish
103 | env:
104 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
105 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
106 | run: |
107 | export PATH=$PATH:$HOME/.local/bin
108 | make build
109 | twine upload dist/*
110 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.toptal.com/developers/gitignore/api/terraform,pycharm+all,macos,windows
2 | # Edit at https://www.toptal.com/developers/gitignore?templates=terraform,pycharm+all,macos,windows
3 |
4 | ### macOS ###
5 | # General
6 | .DS_Store
7 | .AppleDouble
8 | .LSOverride
9 |
10 | # Icon must end with two \r
11 | Icon
12 |
13 |
14 | # Thumbnails
15 | ._*
16 |
17 | # Files that might appear in the root of a volume
18 | .DocumentRevisions-V100
19 | .fseventsd
20 | .Spotlight-V100
21 | .TemporaryItems
22 | .Trashes
23 | .VolumeIcon.icns
24 | .com.apple.timemachine.donotpresent
25 |
26 | # Directories potentially created on remote AFP share
27 | .AppleDB
28 | .AppleDesktop
29 | Network Trash Folder
30 | Temporary Items
31 | .apdisk
32 |
33 | ### macOS Patch ###
34 | # iCloud generated files
35 | *.icloud
36 |
37 | ### PyCharm+all ###
38 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
39 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
40 |
41 | # User-specific stuff
42 | .idea/**/workspace.xml
43 | .idea/**/tasks.xml
44 | .idea/**/usage.statistics.xml
45 | .idea/**/dictionaries
46 | .idea/**/shelf
47 |
48 | # AWS User-specific
49 | .idea/**/aws.xml
50 |
51 | # Generated files
52 | .idea/**/contentModel.xml
53 |
54 | # Sensitive or high-churn files
55 | .idea/**/dataSources/
56 | .idea/**/dataSources.ids
57 | .idea/**/dataSources.local.xml
58 | .idea/**/sqlDataSources.xml
59 | .idea/**/dynamic.xml
60 | .idea/**/uiDesigner.xml
61 | .idea/**/dbnavigator.xml
62 |
63 | # Gradle
64 | .idea/**/gradle.xml
65 | .idea/**/libraries
66 |
67 | # Gradle and Maven with auto-import
68 | # When using Gradle or Maven with auto-import, you should exclude module files,
69 | # since they will be recreated, and may cause churn. Uncomment if using
70 | # auto-import.
71 | # .idea/artifacts
72 | # .idea/compiler.xml
73 | # .idea/jarRepositories.xml
74 | # .idea/modules.xml
75 | # .idea/*.iml
76 | # .idea/modules
77 | # *.iml
78 | # *.ipr
79 |
80 | # CMake
81 | cmake-build-*/
82 |
83 | # Mongo Explorer plugin
84 | .idea/**/mongoSettings.xml
85 |
86 | # File-based project format
87 | *.iws
88 |
89 | # IntelliJ
90 | out/
91 |
92 | # mpeltonen/sbt-idea plugin
93 | .idea_modules/
94 |
95 | # JIRA plugin
96 | atlassian-ide-plugin.xml
97 |
98 | # Cursive Clojure plugin
99 | .idea/replstate.xml
100 |
101 | # SonarLint plugin
102 | .idea/sonarlint/
103 |
104 | # Crashlytics plugin (for Android Studio and IntelliJ)
105 | com_crashlytics_export_strings.xml
106 | crashlytics.properties
107 | crashlytics-build.properties
108 | fabric.properties
109 |
110 | # Editor-based Rest Client
111 | .idea/httpRequests
112 |
113 | # Android studio 3.1+ serialized cache file
114 | .idea/caches/build_file_checksums.ser
115 |
116 | ### PyCharm+all Patch ###
117 | # Ignore everything but code style settings and run configurations
118 | # that are supposed to be shared within teams.
119 |
120 | .idea/*
121 |
122 | ### Terraform ###
123 | # Local .terraform directories
124 | **/.terraform/*
125 |
126 | # .tfstate files
127 | *.tfstate
128 | *.tfstate.*
129 |
130 | # Crash log files
131 | crash.log
132 | crash.*.log
133 |
134 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as
135 | # password, private keys, and other secrets. These should not be part of version
136 | # control as they are data points which are potentially sensitive and subject
137 | # to change depending on the environment.
138 | *.tfvars
139 | *.tfvars.json
140 |
141 | # Ignore override files as they are usually used to override resources locally and so
142 | # are not checked in
143 | override.tf
144 | override.tf.json
145 | *_override.tf
146 | *_override.tf.json
147 |
148 | # Include override files you do wish to add to version control using negated pattern
149 | # !example_override.tf
150 |
151 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
152 | # example: *tfplan*
153 |
154 | # Ignore CLI configuration files
155 | .terraformrc
156 | terraform.rc
157 |
158 | ### Windows ###
159 | # Windows thumbnail cache files
160 | Thumbs.db
161 | Thumbs.db:encryptable
162 | ehthumbs.db
163 | ehthumbs_vista.db
164 |
165 | # Dump file
166 | *.stackdump
167 |
168 | # Folder config file
169 | [Dd]esktop.ini
170 |
171 | # Recycle Bin used on file shares
172 | $RECYCLE.BIN/
173 |
174 | # Windows Installer files
175 | *.cab
176 | *.msi
177 | *.msix
178 | *.msm
179 | *.msp
180 |
181 | # Windows shortcuts
182 | *.lnk
183 |
184 | # End of https://www.toptal.com/developers/gitignore/api/terraform,pycharm+all,macos,windows
185 |
186 | # BUILD
187 |
188 | brickflow.egg-info
189 | .eggs
190 | dist
191 | build
192 |
193 | # SAMPLES / TESTING
194 | brickflow/sample_dags
195 | main*.py
196 |
197 | # Coverage related
198 | .coverage
199 | coverage.xml
200 | site
201 | scripts
202 | __pycache__
203 | integration_workflows
204 |
205 | *venv
206 |
207 | # VScode
208 | .vscode
209 |
210 | # GENERATED BY BRICKFLOW CLI --START--
211 |
212 | ### Terraform ###
213 | # Local .terraform directories
214 | **/.terraform/*
215 |
216 | # .tfstate files
217 | *.tfstate
218 | *.tfstate.*
219 |
220 | # Crash log files
221 | crash.log
222 | crash.*.log
223 |
224 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as
225 | # password, private keys, and other secrets. These should not be part of version
226 | # control as they are data points which are potentially sensitive and subject
227 | # to change depending on the environment.
228 | *.tfvars
229 | *.tfvars.json
230 |
231 | # Ignore override files as they are usually used to override resources locally and so
232 | # are not checked in
233 | override.tf
234 | override.tf.json
235 | *_override.tf
236 | *_override.tf.json
237 |
238 | # Include override files you do wish to add to version control using negated pattern
239 | # !example_override.tf
240 |
241 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
242 | # example: *tfplan*
243 |
244 | # Ignore CLI configuration files
245 | .terraformrc
246 | terraform.rc
247 |
248 | # GENERATED BY BRICKFLOW CLI --END--
249 |
250 | bundle.yml
251 |
252 | brickflow/bundles/schema.json
253 | brickflow/bundles/transformed_schema.json
254 | .databricks
255 | cdktf.out
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: local
3 | hooks:
4 | - id: make-check
5 | name: Running Lint Checks
6 | entry: make check
7 | language: system
8 | files: '\.py$'
9 | pass_filenames: false
10 | always_run: true
11 | stages: [commit]
12 | - id: make-cov
13 | name: Running Lint Checks & Test Suite
14 | entry: make cov
15 | language: system
16 | files: '\.py$'
17 | pass_filenames: false
18 | always_run: true
19 | stages: [push]
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # This is a comment.
2 | # Each line is a file pattern followed by one or more owners.
3 |
4 | # These owners will be the default owners for everything in
5 | # the repo. Unless a later match takes precedence,
6 | # @Nike-Inc/brickflow-dev will be requested for
7 | # review when someone opens a pull request.
8 | * @Nike-Inc/brickflow-dev @asingamaneni @stikkireddy @newfront
9 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to contribute
2 |
3 | There are a few guidelines that we need contributors to follow so that we are able to process requests as efficiently as possible. If you have any questions or concerns please feel free to contact us at [opensource@nike.com](mailto:opensource@nike.com).
4 |
5 | ## Getting Started
6 |
7 | * Review our [Code of Conduct](https://github.com/Nike-Inc/nike-inc.github.io/blob/master/CONDUCT.md)
8 | * Submit the [Individual Contributor License Agreement](https://www.clahub.com/agreements/Nike-Inc/fastbreak)
9 | * Make sure you have a [GitHub account](https://github.com/signup/free)
10 | * Submit a ticket for your issue, assuming one does not already exist.
11 | * Clearly describe the issue including steps to reproduce when it is a bug.
12 | * Make sure you fill in the earliest version that you know has the issue.
13 | * Fork the repository on GitHub
14 |
15 | ## Making Changes
16 |
17 | * Create a feature branch off of `main` before you start your work.
18 | * Please avoid working directly on the `main` branch.
19 | * Set up the required package manager [poetry](#-package-manager)
20 | * Set up the dev environment [see below](#-dev-environment-setup)
21 | * Make commits of logical units.
22 | * You may be asked to squash unnecessary commits down to logical units.
23 | * Check for unnecessary whitespace with `git diff --check` before committing.
24 | * Write meaningful, descriptive commit messages.
25 | * Please follow existing code conventions when working on a file
26 | * Make sure to check the standards on the code [see below](#-linting-and-standards)
27 | * Install Java 11, since it is required for the unit tests run by `make test`
28 | * Make sure to test the code before you push changes [see below](#-testing)
29 |
30 | ## 🤝 Submitting Changes
31 |
32 | * Push your changes to a topic branch in your fork of the repository.
33 | * Submit a pull request to the repository in the Nike-Inc organization.
34 | * After feedback has been given we expect responses within two weeks. After two weeks we may close the pull request
35 | if it isn't showing any activity.
36 | * Bug fixes or features that lack appropriate tests may not be considered for merge.
37 | * Changes that lower test coverage may not be considered for merge.
38 |
39 | ### 📦 Package manager
40 |
41 | We use `make` for managing different steps of setup and maintenance in the project. You can install make by following
42 | the instructions [here](https://formulae.brew.sh/formula/make)
43 |
44 | We use `poetry` as our package manager.
45 |
46 | Please DO NOT use pip or conda to install the dependencies. Instead, use poetry:
47 |
48 | ```bash
49 | make poetry-install
50 | ```
51 |
52 | ### 📌 Dev Environment Setup
53 |
54 | To ensure our standards, make sure to install the required packages.
55 |
56 | ```bash
57 | make dev
58 | ```
59 |
60 | ### 🧹 Linting and Standards
61 |
62 | We use `pylint`, `black` and `mypy` to maintain standards in the codebase
63 |
64 | ```bash
65 | make check
66 | ```
67 |
68 | Make sure that the linter does not report any errors or warnings before submitting a pull request.
69 |
70 | ### 🧪 Testing
71 |
72 | We use `pytest` to test our code. You can run the tests by running the following command:
73 |
74 | ```bash
75 | make test
76 | ```
77 |
78 | #### 🧪 Integration Testing
79 | * Once you add a feature or a bug fix to Brickflow, create a wheel file from your feature branch:
80 |   * Run `poetry build` to generate the wheel (`.whl`) file under the `dist` folder.
81 | * Install Brickflow from the wheel file:
82 |   * `pip install <path to whl file>`
83 | * Upload the wheel file to your Databricks workspace:
84 |   * Databricks Workspace --> Add --> Library
85 | * Copy the path of the uploaded wheel file and reference it in `entrypoint.py` as a wheel library,
86 |   for example: `libraries=[WheelTaskLibrary("dbfs:/FileStore/jars/dummy.whl")]`
87 |   (a fuller sketch follows this list).
88 | * Create a workflow and deploy it to make sure the feature or bug fix works as expected.
89 |
90 |
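For reference, here is a minimal sketch of what `entrypoint.py` might look like with the uploaded wheel attached. It mirrors the structure of the generated entrypoint template; the project name, git URL, package name, and `dbfs:/` path are placeholders, and `WheelTaskLibrary` is assumed to be importable from the top-level `brickflow` package like the other task libraries.

```python
# Hypothetical entrypoint.py for integration-testing a locally built wheel.
# The dbfs:/ path is a placeholder for wherever you uploaded the wheel.
from brickflow import Project, WheelTaskLibrary

import workflows  # placeholder: the package containing your workflow modules


def main() -> None:
    """Project entrypoint"""
    with Project(
        "brickflow-integration-test",
        git_repo="https://github.com/Nike-Inc/brickflow.git",
        provider="github",
        libraries=[
            # Attach the wheel built from your feature branch to every task.
            WheelTaskLibrary("dbfs:/FileStore/jars/dummy.whl"),
        ],
    ) as f:
        f.add_pkg(workflows)


if __name__ == "__main__":
    main()
```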
91 | Make sure that all tests pass before submitting a pull request.
92 |
93 | ## 🚀 Release Process
94 |
95 | At the moment, the release process is manual. We try to make frequent releases. Usually, we release a new version when we have a new feature or bugfix. A developer with admin rights to the repository will create a new release on GitHub, and then publish the new version to PyPI.
96 |
97 | # Additional Resources
98 |
99 | * [General GitHub documentation](https://help.github.com/)
100 | * [GitHub pull request documentation](https://help.github.com/send-pull-requests/)
101 | * [Nike's Code of Conduct](https://github.com/Nike-Inc/nike-inc.github.io/blob/master/CONDUCT.md)
102 | * [Nike's Individual Contributor License Agreement](https://www.clahub.com/agreements/Nike-Inc/fastbreak)
103 | * [Nike OSS](https://nike-inc.github.io/)
--------------------------------------------------------------------------------
/CONTRIBUTORS.md:
--------------------------------------------------------------------------------
1 | # Authors
2 | * [Ashok Singamaneni](https://www.linkedin.com/in/ashok-singamaneni-193b1a32/)
3 | * [Sriharsha Tikkireddy](https://www.linkedin.com/in/sriharsha-tikkireddy/)
4 |
5 | # Contributors
6 | Thanks to the contributors who helped on this project apart from the authors
7 | * [Danny Meijer](https://www.linkedin.com/in/dannydatascientist/)
8 | * [Pariksheet Marotrao Barapatre](https://www.linkedin.com/in/pari-data-products/)
9 | * [Bhargav Sangars](https://www.linkedin.com/in/bhargav-sangars-a4b61037/)
10 | * [Brend Braeckmans](https://www.linkedin.com/in/brendbraeckmans/)
11 | * [Rebecca Raj Shree](https://www.linkedin.com/in/rebecca-raj-shree/)
12 | * [Brent (Johnson) Spetner](https://www.linkedin.com/in/brentjohnsoneng/)
13 | * [Dmitrii Grigorev](https://www.linkedin.com/in/dmitrii-grigorev-074739135/)
14 | * [Chanukya Konuganti](https://www.linkedin.com/in/chanukyakonuganti/)
15 | * [Maxim Mityutko](https://www.linkedin.com/in/mityutko/)
16 | * [Raju Gujjalapati](https://in.linkedin.com/in/raju-gujjalapati-470a88171)
17 | * [Madhusudan Koukutla](https://www.linkedin.com/in/madhusudan-reddy/)
18 | * [Surya Teja Jagatha](https://www.linkedin.com/in/surya-teja-jagatha/)
19 | * [Iris Meerman](https://www.linkedin.com/in/iris-meerman-92694675/)
20 | * [Michael Espiritu](https://www.linkedin.com/in/michaelespiritu92/)
21 | * [Riccardo Iacomini](https://www.linkedin.com/in/riccardo-iacomini-b757b6118/)
22 |
23 | # Honorary Mentions
24 | Thanks to the team below for invaluable insights and support throughout the initial release of this project
25 |
26 | * [Joe Hollow](https://www.linkedin.com/in/joe-hollow-23088b1/)
27 | * [Aditya Chaturvedi](https://www.linkedin.com/in/chaturvediaditya/)
28 | * [Scott Haines](https://www.linkedin.com/in/scotthaines/)
29 | * [Arijit Banerjee](https://www.linkedin.com/in/massborn/)
30 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:20.04
2 | USER root
3 |
4 | # DO NOT ADD AS ENV:
5 | # debconf noninteractive
6 | # This is the anti-frontend. It never interacts with you at all,
7 | # and makes the default answers be used for all questions. It
8 | # might mail error messages to root, but that's it; otherwise it
9 | # is completely silent and unobtrusive, a perfect frontend for
10 | # automatic installs. If you are using this front-end, and require
11 | # non-default answers to questions, you will need to preseed the
12 | # debconf database; see the section below on Unattended Package
13 | # Installation for more details.
14 |
15 | RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections && \
16 | apt-get update -y && apt-get install -y git curl wget unzip software-properties-common
17 | SHELL ["/bin/bash", "-c"]
18 |
19 | ENV NODE_VERSION 18.14.0
20 |
21 | RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections \
22 | && curl https://raw.githubusercontent.com/creationix/nvm/master/install.sh | bash \
23 | && . $HOME/.nvm/nvm.sh \
24 | && nvm install $NODE_VERSION \
25 | && nvm use $NODE_VERSION \
26 | && npm install --global cdktf-cli@latest
27 |
28 | ENV NODE_PATH /root/.nvm/versions/node/v$NODE_VERSION/lib/node_modules
29 | ENV PATH /root/.nvm/versions/node/v$NODE_VERSION/bin:$PATH
30 | ENV NVM_DIR /root/.nvm
31 |
32 | RUN add-apt-repository ppa:deadsnakes/ppa
33 | RUN apt-get install -y python3.9 python3-pip python3.9-distutils && ln -s /usr/bin/python3.9 /usr/bin/python
34 |
35 | ARG CACHEBUST=1
36 |
37 | RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
38 | python3.9 get-pip.py && \
39 | ln -s /usr/local/bin/pip3.9 /usr/bin/pip3 && \
40 | ln -s /usr/local/bin/pip3.9 /usr/bin/pip
41 |
42 | RUN python -m pip install -U pip && pip install -U setuptools poetry
43 |
44 | WORKDIR /brickflow
45 |
46 | COPY . .
47 |
48 | VOLUME ["/brickflow", "$(pwd)"]
49 |
50 | RUN poetry install
51 |
52 | CMD ["/bin/bash"]
53 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # Include the license file
2 | include LICENSE.txt
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | black-check:
2 | @poetry run black --check .
3 |
4 | fmt:
5 | @poetry run black .
6 |
7 | check: black-check mypy
8 | @poetry run prospector --profile prospector.yaml --no-autodetect
9 |
10 | mypy:
11 | @poetry run mypy
12 |
13 | cov: check
14 | @poetry run coverage run --source=brickflow --omit "brickflow/sample_dags/*,sample_workflows/*,brickflow/tf/*" -m pytest && \
15 | poetry run coverage report -m && \
16 | poetry run coverage xml
17 |
18 | gen-bundle-sdk:
19 | @pip install . --force-reinstall
20 | @./tools/gen-bundle.sh
21 |
22 | dev:
23 | @poetry install --all-extras --with dev
24 | @poetry run pre-commit install
25 | @poetry run pre-commit install --hook-type pre-push
26 |
27 | deploy_env_setup:
28 | @poetry install --all-extras --with dev
29 |
30 | test:
31 | @poetry run coverage run --source=brickflow --omit "brickflow/bundles/*,brickflow/sample_dags/*,sample_workflows/*,brickflow/tf/*" -m pytest && \
32 | poetry run coverage report -m && \
33 | poetry run coverage html
34 |
35 | clean:
36 | @rm -rf dist
37 |
38 | build: clean
39 | @poetry build
40 |
41 | poetry:
42 | @poetry install --all-extras --with dev
43 |
44 | coverage: check test
45 |
46 | docs:
47 | @poetry run mike deploy -u dev latest
48 | @poetry run mike set-default latest
49 | @poetry run mike serve
50 |
51 | deploy-docs:
52 | @poetry run mike deploy --push --update-aliases $(version) latest
53 |
54 | docker-local:
55 | docker build -t brickflow:latest --build-arg CACHEBUST="$(shell date +%s)" .
56 |
57 | poetry-install:
58 | @pip install --upgrade setuptools && pip install poetry && poetry self add "poetry-dynamic-versioning[plugin]"
59 |
60 | get-version:
61 | @poetry version
62 |
63 | requirements:
64 | @poetry export -f requirements.txt --output requirements.txt --with dev --without-hashes
65 |
66 | docker-build:
67 | @docker build -t brickflow-local .
68 |
69 | docker: docker-build
70 | @docker run -it -v "$(shell pwd)":/brickflow brickflow-local /bin/bash
71 |
72 | .PHONY: docs
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Brickflow
2 |
3 | [//]: # ([CodeQL](https://github.com/Nike-Inc/brickflow/actions/workflows/codeql-analysis.yml))
4 | [build](https://github.com/Nike-Inc/brickflow/actions/workflows/onpush.yml)
5 | [codecov](https://codecov.io/gh/Nike-Inc/brickflow)
6 | [code style: black](https://github.com/psf/black)
7 | [checked with mypy](http://mypy-lang.org/)
8 | [license: Apache 2.0](https://opensource.org/licenses/Apache-2.0)
9 |
10 |
11 |
12 |
13 |
14 | Brickflow is designed to enable the development of Databricks workflows in Python, streamlining the
15 | process through a command-line interface (CLI) tool.
16 |
17 |
18 |
19 |
20 | ---
21 |
22 | ### Contributors
23 |
24 | Thanks to all the [contributors](https://github.com/Nike-Inc/brickflow/blob/main/CONTRIBUTORS.md) who have helped ideate, develop and bring Brickflow to its current state.
25 |
26 | ### Contributing
27 |
28 | We're delighted that you're interested in contributing to our project! To get started,
29 | please carefully read and follow the guidelines provided in our [contributing](https://github.com/Nike-Inc/brickflow/blob/main/CONTRIBUTING.md) document.
30 |
31 | ### Documentation
32 |
33 | Brickflow documentation can be found [here](https://engineering.nike.com/brickflow/).
34 |
35 | ### Getting Started
36 |
37 | #### Prerequisites
38 | 1. Install brickflows
39 |
40 | ```shell
41 | pip install brickflows
42 | ```
43 |
44 | 2. Install [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/databricks-cli.html)
45 |
46 | ```shell
47 | curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sudo sh
48 | ```
49 |
50 | 3. Configure the Databricks CLI with a workspace token. This creates or updates your `~/.databrickscfg` file, as shown below.
51 |
52 | ```shell
53 | databricks configure --token
54 | ```
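For reference, after running the command above, `~/.databrickscfg` typically contains a profile like the following (the host and token values are placeholders):

```ini
[DEFAULT]
host  = https://<your-workspace>.cloud.databricks.com
token = <your-personal-access-token>
```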
55 |
56 | #### Hello World workflow
57 | 1. Create your first workflow using brickflow
58 | ```shell
59 | mkdir hello-world-brickflow
60 | cd hello-world-brickflow
61 | brickflow projects add
62 | ```
63 |
64 | 2. Provide the following inputs
65 | ```shell
66 | Project name: hello-world-brickflow
67 | Path from repo root to project root (optional) [.]: .
68 | Path from project root to workflows dir: workflows
69 | Git https url: https://github.com/Nike-Inc/brickflow.git
70 | Brickflow version [auto]:
71 | Spark expectations version [0.5.0]: 0.8.0
72 | Skip entrypoint [y/N]: N
73 | ```
74 | _Note: You can provide your own GitHub repo URL._
75 |
76 | 3. Create a new file hello_world_wf.py in the workflows directory
77 | ```shell
78 | touch workflows/hello_world_wf.py
79 | ```
80 |
81 | 4. Copy the following code into the hello_world_wf.py file
82 | ```python
83 | from brickflow import (
84 | ctx,
85 | Cluster,
86 | Workflow,
87 | NotebookTask,
88 | )
89 | from airflow.operators.bash import BashOperator
90 |
91 |
92 | cluster = Cluster(
93 | name="job_cluster",
94 | node_type_id="m6gd.xlarge",
95 | spark_version="13.3.x-scala2.12",
96 | min_workers=1,
97 | max_workers=2,
98 | )
99 |
100 | wf = Workflow(
101 | "hello_world_workflow",
102 | default_cluster=cluster,
103 | tags={
104 | "product_id": "brickflow_demo",
105 | },
106 | common_task_parameters={
107 | "catalog": "",
108 | "database": "",
109 | },
110 | )
111 |
112 | @wf.task
113 | # this task does nothing but explains the use of context object
114 | def start():
115 | print(f"Environment: {ctx.env}")
116 |
117 | @wf.notebook_task
118 | # this task runs a databricks notebook
119 | def example_notebook():
120 | return NotebookTask(
121 | notebook_path="notebooks/example_notebook.py",
122 | base_parameters={
123 | "some_parameter": "some_value", # in the notebook access these via dbutils.widgets.get("some_parameter")
124 | },
125 | )
126 |
127 |
128 | @wf.task(depends_on=[start, example_notebook])
129 | # this task runs a bash command
130 | def list_lending_club_data_files():
131 | return BashOperator(
132 | task_id=list_lending_club_data_files.__name__,
133 | bash_command="ls -lrt /dbfs/databricks-datasets/samples/lending_club/parquet/",
134 | )
135 |
136 | @wf.task(depends_on=list_lending_club_data_files)
137 | # this task runs the pyspark code
138 | def lending_data_ingest():
139 | ctx.spark.sql(
140 | f"""
141 | CREATE TABLE IF NOT EXISTS
142 | {ctx.dbutils_widget_get_or_else(key="catalog", debug="development")}.\
143 | {ctx.dbutils_widget_get_or_else(key="database", debug="dummy_database")}.\
144 | {ctx.dbutils_widget_get_or_else(key="brickflow_env", debug="local")}_lending_data_ingest
145 | USING DELTA -- this is default just for explicit purpose
146 | SELECT * FROM parquet.`dbfs:/databricks-datasets/samples/lending_club/parquet/`
147 | """
148 | )
149 | ```
150 | _Note: Modify the values of catalog/database for common_task_parameters._
151 |
152 |
153 | 5. Create a new file example_notebook.py in the notebooks directory
154 | ```shell
155 | mkdir notebooks
156 | touch notebooks/example_notebook.py
157 | ```
158 | 6. Copy the following code into the example_notebook.py file
159 | ```python
160 | # Databricks notebook source
161 |
162 | print("hello world")
163 | ```
164 |
165 | #### Deploy the workflow to Databricks
166 | ```shell
167 | brickflow projects deploy --project hello-world-brickflow -e local
168 | ```
169 |
170 | ### Run the demo workflow
171 | 1. Log in to the Databricks workspace
172 | 2. Go to Workflows and select the workflow
173 |
174 |
175 | 3. Click on the Run button
176 |
177 | ### Examples
178 | Refer to the [examples](https://github.com/Nike-Inc/brickflow/tree/main/examples/brickflow_examples) for more examples.
179 |
180 |
181 |
--------------------------------------------------------------------------------
/brickflow/bundles/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/brickflow/bundles/__init__.py
--------------------------------------------------------------------------------
/brickflow/cli/commands.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | import subprocess
5 | from typing import Optional, Union, Tuple, List
6 |
7 | from click import ClickException
8 |
9 | from brickflow import _ilog
10 |
11 |
12 | def exec_command(
13 | path_to_executable: str,
14 | base_command: Optional[str],
15 | args: Union[Tuple[str] | List[str]],
16 | capture_output: bool = False,
17 | ) -> Optional[str]:
18 | os.environ["PYTHONPATH"] = os.getcwd()
19 | my_env = os.environ.copy()
20 | try:
21 | _args = list(args)
22 | # add a base command if its provided for proxying for brickflow deploy
23 | if base_command is not None:
24 | _args = [base_command] + _args
25 | _ilog.info("Executing command: %s", " ".join([path_to_executable, *_args]))
26 |
27 | if capture_output is True:
28 | res = subprocess.run(
29 | [path_to_executable, *_args],
30 | check=True,
31 | env=my_env,
32 | capture_output=True,
33 | text=True,
34 | )
35 | return res.stdout.strip()
36 |
37 | subprocess.run([path_to_executable, *_args], check=True, env=my_env)
38 | except subprocess.CalledProcessError as e:
39 | raise ClickException(str(e))
40 |
41 | return None
42 |
--------------------------------------------------------------------------------
/brickflow/cli/configure.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import importlib
4 | import os
5 | import re
6 | import sys
7 | from pathlib import Path
8 | from typing import Callable, Any, Optional
9 |
10 | import click
11 | from jinja2 import Environment, BaseLoader
12 |
13 | from brickflow import _ilog, BrickflowProjectConstants, get_entrypoint_python
14 | from brickflow.cli.commands import exec_command
15 |
16 | PWD = Path(__file__).parent.absolute()
17 | GITIGNORE_TEMPLATE = PWD / "gitignore_template.txt"
18 | GIT_PATH = Path(".git")
19 |
20 |
21 | class GitNotFoundError(Exception):
22 | pass
23 |
24 |
25 | class GitIgnoreNotFoundError(Exception):
26 | pass
27 |
28 |
29 | def _gitignore_exists() -> bool:
30 | return os.path.exists(".gitignore") and os.path.isfile(".gitignore")
31 |
32 |
33 | def _create_gitignore_if_not_exists() -> None:
34 | if _gitignore_exists() is False:
35 | Path(".gitignore").touch(mode=0o755)
36 |
37 |
38 | def _get_gitignore() -> str:
39 | return Path(".gitignore").read_text(encoding="utf-8")
40 |
41 |
42 | def _get_gitignore_template() -> str:
43 | return GITIGNORE_TEMPLATE.read_text()
44 |
45 |
46 | def _write_gitignore(data: str) -> None:
47 | Path(".gitignore").write_text(encoding="utf-8", data=data)
48 |
49 |
50 | def _update_gitignore() -> None:
51 | search_regex = re.compile(
52 | r"(# GENERATED BY BRICKFLOW CLI --START--(.|\n)*# GENERATED BY BRICKFLOW CLI --END--)"
53 | )
54 |
55 | git_ignore_data = _get_gitignore()
56 | git_ignore_template = _get_gitignore_template()
57 | search = search_regex.findall(git_ignore_data)
58 | if len(search) > 0:
59 | search_match = search[0][0]
60 | gitignore_file_data = git_ignore_data.replace(search_match, git_ignore_template)
61 | else:
62 | gitignore_file_data = "\n\n".join([git_ignore_data, git_ignore_template])
63 | _write_gitignore(gitignore_file_data)
64 |
65 |
66 | def _validate_package(path_str: str) -> str:
67 | folder_path: Path = Path(path_str)
68 |
69 | if not folder_path.exists():
70 | raise ImportError(f"Invalid pkg error: {folder_path.as_posix()}")
71 |
72 | sys.path.append(os.getcwd())
73 | folder_pkg_path: str = folder_path.as_posix().replace("/", ".")
74 |
75 | for module in folder_path.glob("**/*.py"): # only find python files
76 | # ignore __init__.py
77 | if module.name == "__init__.py":
78 | continue
79 | module_name = module.as_posix().replace(".py", "").replace("/", ".")
80 | # import all the modules into the mod object and not actually import them using __import__
81 | mod = importlib.import_module(module_name)
82 | click.echo(f"Scanned module: {mod.__name__}")
83 |
84 | return folder_pkg_path
85 |
86 |
87 | def render_template(**kwargs) -> str: # type: ignore
88 | template = Path(__file__).parent.absolute() / "entrypoint.template"
89 | with template.open("r") as f:
90 | data = f.read()
91 | return Environment(loader=BaseLoader()).from_string(data).render(**kwargs)
92 |
93 |
94 | def create_entry_point(working_dir: str, data: str) -> None:
95 | path = Path(working_dir) / "entrypoint.py"
96 | if path.exists():
97 | click.echo(f"Path: {str(path.absolute())} already exists...")
98 | # path = Path(working_dir) / "entrypoint.py.new"
99 | else:
100 | click.echo(f"Creating file in path: {str(path.absolute())}...")
101 | path.write_text(data)
102 |
103 |
104 | def create_brickflow_project_root_marker() -> None:
105 | path = Path(
106 | f"{BrickflowProjectConstants.DEFAULT_MULTI_PROJECT_ROOT_FILE_NAME.value}."
107 | f"{BrickflowProjectConstants.DEFAULT_CONFIG_FILE_TYPE.value}"
108 | )
109 | if path.exists():
110 | click.echo(f"Path: {str(path.absolute())} already exists...")
111 | # path = Path(working_dir) / "entrypoint.py.new"
112 | else:
113 | click.echo(f"Creating file in path: {str(path.absolute())}...")
114 | path.write_text(
115 | "# DO NOT MODIFY THIS FILE - IT IS AUTO GENERATED BY BRICKFLOW AND RESERVED FOR FUTURE USAGE",
116 | encoding="utf-8",
117 | )
118 |
119 |
120 | def bind_env_var(env_var: str) -> Callable:
121 | def callback(
122 | ctx: click.Context, # noqa
123 | param: str, # noqa
124 | value: Any,
125 | ) -> None:
126 | # pylint: disable=unused-argument
127 | if value is not None and len(value) > 0:
128 | _ilog.info("Setting env var: %s to %s...", env_var, value)
129 | if isinstance(value, list):
130 | os.environ[env_var] = ",".join(value)
131 |             elif isinstance(value, tuple):
132 | os.environ[env_var] = ",".join(value)
133 | elif isinstance(value, bool):
134 | os.environ[env_var] = str(value).lower()
135 | else:
136 | os.environ[env_var] = value
137 |
138 | return callback
139 |
140 |
141 | def get_entrypoint(**kwargs: Any) -> str:
142 | wd: Optional[str] = kwargs.get("workflows_dir")
143 | if wd is None:
144 | raise ValueError(
145 | "workflows_dir not set, please set it using --workflows-dir or -wd"
146 | )
147 | return str(Path(wd) / "entrypoint.py")
148 |
149 |
150 | def log_important_versions(bundle_cli: str) -> None:
151 | version = exec_command(bundle_cli, "--version", [], capture_output=True)
152 | _ilog.info("Using bundle version: %s", version)
153 | log_python_version()
154 |
155 |
156 | def log_python_version() -> None:
157 | version = exec_command(
158 | get_entrypoint_python(), "--version", [], capture_output=True
159 | )
160 | _ilog.info("Using python version: %s", version)
161 |
--------------------------------------------------------------------------------
/brickflow/cli/constants.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from enum import Enum
4 |
5 | from decouple import config
6 |
7 | from brickflow import BrickflowEnvVars
8 |
9 |
10 | class BrickflowDeployMode(Enum):
11 | BUNDLE = "bundle"
12 |
13 |
14 | INTERACTIVE_MODE = config(
15 | BrickflowEnvVars.BRICKFLOW_INTERACTIVE_MODE.value, default=True, cast=bool
16 | )
17 |
--------------------------------------------------------------------------------
/brickflow/cli/entrypoint.template:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 |
3 | from brickflow import Project, PypiTaskLibrary, MavenTaskLibrary # make sure brickflow imports are at the top
4 |
5 | import {{ pkg }}
6 |
7 | def main() -> None:
8 | """Project entrypoint"""
9 | with Project(
10 | "{{ project_name }}",
11 | git_repo="{{ git_https_url }}",
12 | provider="{{ git_provider }}",
13 | libraries=[
14 | # PypiTaskLibrary(package="spark-expectations=={{spark_expectations_version}}"), # Uncomment if spark-expectations is needed
15 | ],
16 | ) as f:
17 | f.add_pkg({{pkg}})
18 |
19 |
20 | if __name__ == "__main__":
21 | main()
22 |
23 |
--------------------------------------------------------------------------------
/brickflow/cli/gitignore_template.txt:
--------------------------------------------------------------------------------
1 | # GENERATED BY BRICKFLOW CLI --START--
2 |
3 | ### Terraform ###
4 | # Local .terraform directories
5 | **/.terraform/*
6 |
7 | # .tfstate files
8 | *.tfstate
9 | *.tfstate.*
10 |
11 | # Crash log files
12 | crash.log
13 | crash.*.log
14 |
15 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as
16 | # password, private keys, and other secrets. These should not be part of version
17 | # control as they are data points which are potentially sensitive and subject
18 | # to change depending on the environment.
19 | *.tfvars
20 | *.tfvars.json
21 |
22 | # Ignore override files as they are usually used to override resources locally and so
23 | # are not checked in
24 | override.tf
25 | override.tf.json
26 | *_override.tf
27 | *_override.tf.json
28 |
29 | # Include override files you do wish to add to version control using negated pattern
30 | # !example_override.tf
31 |
32 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
33 | # example: *tfplan*
34 |
35 | # Ignore CLI configuration files
36 | .terraformrc
37 | terraform.rc
38 |
39 | # GENERATED BY BRICKFLOW CLI --END--
--------------------------------------------------------------------------------
/brickflow/codegen/__init__.py:
--------------------------------------------------------------------------------
1 | import abc
2 | from enum import Enum
3 | from pathlib import Path
4 |
5 | from typing import TYPE_CHECKING, Optional, Dict, Any
6 |
7 | from decouple import config
8 |
9 | from brickflow import get_brickflow_version, BrickflowEnvVars, BrickflowDefaultEnvs
10 |
11 | if TYPE_CHECKING:
12 | from brickflow.engine.project import _Project
13 |
14 |
15 | class CodegenInterface(abc.ABC):
16 | def __init__(
17 | self, project: "_Project", id_: str, env: str, **_: Any
18 | ) -> None: # noqa
19 | self.env: str = env
20 | self.project: "_Project" = project
21 | self.id_ = id_
22 |
23 | @abc.abstractmethod
24 | def synth(self) -> None:
25 | pass
26 |
27 |
28 | class DatabricksDefaultClusterTagKeys(Enum):
29 | ENVIRONMENT = "environment"
30 | DEPLOYED_BY = "deployed_by"
31 | DEPLOYED_AT = "deployed_at"
32 | BRICKFLOW_PROJECT_NAME = "brickflow_project_name"
33 | BRICKFLOW_DEPLOYMENT_MODE = "brickflow_deployment_mode"
34 | DATABRICKS_TF_PROVIDER_VERSION = "databricks_tf_provider_version"
35 | BRICKFLOW_VERSION = "brickflow_version"
36 |
37 |
38 | BRICKFLOW_BUILTIN_DEPLOY_TAGS = {
39 | "brickflow_version": get_brickflow_version()
40 |     or "undefined",  # in certain scenarios get_brickflow_version may return None
41 | }
42 |
43 |
44 | def get_brickflow_tags(
45 | user_defined_tags: Optional[Dict[str, str]], other_tags: Dict[str, str]
46 | ) -> Dict[str, str]:
47 | return {**(user_defined_tags or {}), **other_tags, **BRICKFLOW_BUILTIN_DEPLOY_TAGS}
48 |
49 |
50 | def handle_mono_repo_path(project: "_Project", env: str) -> str:
51 | base_path = config(
52 | BrickflowEnvVars.BRICKFLOW_MONOREPO_PATH_TO_BUNDLE_ROOT.value, None
53 | )
54 |
55 | if project.entry_point_path is None:
56 | raise ValueError("project.entry_point_path is None")
57 |
58 | if base_path is None or env == BrickflowDefaultEnvs.LOCAL.value:
59 | return project.entry_point_path
60 | else:
61 | return str(Path(base_path) / project.entry_point_path)
62 |
--------------------------------------------------------------------------------
/brickflow/context/__init__.py:
--------------------------------------------------------------------------------
1 | from .context import (
2 | ctx,
3 | Context,
4 | BrickflowTaskComs,
5 | BRANCH_SKIP_EXCEPT,
6 | SKIP_EXCEPT_HACK,
7 | RETURN_VALUE_KEY,
8 | BrickflowInternalVariables,
9 | BrickflowBuiltInTaskVariables,
10 | BrickflowTaskComsObject,
11 | TaskComsObjectResult,
12 | )
13 |
14 | __all__ = [
15 | "ctx",
16 | "Context",
17 | "BrickflowTaskComs",
18 | "BRANCH_SKIP_EXCEPT",
19 | "SKIP_EXCEPT_HACK",
20 | "RETURN_VALUE_KEY",
21 | "BrickflowInternalVariables",
22 | "BrickflowBuiltInTaskVariables",
23 | "BrickflowTaskComsObject",
24 | "TaskComsObjectResult",
25 | ]
26 |
--------------------------------------------------------------------------------
/brickflow/engine/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import functools
4 | import logging
5 | import subprocess
6 | import sys
7 | from typing import Callable
8 |
9 | from brickflow import log, get_default_log_handler
10 |
11 |
12 | def _call(cmd: str, **kwargs: bool) -> bytes:
13 | return subprocess.check_output( # type: ignore
14 | [
15 | cmd,
16 | ],
17 | **kwargs,
18 | )
19 |
20 |
21 | def get_current_commit() -> str:
22 | p = _call('git log -n 1 --pretty=format:"%H"', shell=True)
23 | return p.strip().decode("utf-8")
24 |
25 |
26 | def with_brickflow_logger(f: Callable) -> Callable:
27 | @functools.wraps(f)
28 | def func(*args, **kwargs): # type: ignore
29 | _self = args[0]
30 | log.handlers = []
31 | logger_handler = logging.StreamHandler(
32 | stream=sys.stdout
33 | ) # Handler for the logger
34 | # First, generic formatter:
35 | logger_handler.setFormatter(
36 | logging.Formatter(
37 | f"[%(asctime)s] [%(levelname)s] [brickflow:{_self.name}] "
38 | "{%(module)s.py:%(funcName)s:%(lineno)d} - %(message)s"
39 | )
40 | )
41 | log.addHandler(logger_handler)
42 | resp = f(*args, **kwargs)
43 |
44 | log.handlers = [get_default_log_handler()]
45 |
46 | return resp
47 |
48 | return func
49 |
50 |
51 | ROOT_NODE = "root"
52 |
--------------------------------------------------------------------------------
/brickflow/engine/hooks.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | import pluggy
4 |
5 | if TYPE_CHECKING:
6 | from brickflow.engine.task import Task, TaskResponse # pragma: no cover
7 | from brickflow.engine.workflow import Workflow # pragma: no cover
8 |
9 | BRICKFLOW_TASK_PLUGINS = "brickflow_task_plugins"
10 |
11 | brickflow_plugin_spec = pluggy.HookspecMarker(BRICKFLOW_TASK_PLUGINS)
12 |
13 |
14 | class BrickflowTaskPluginSpec:
15 | @staticmethod
16 | def handle_user_result_errors(resp: "TaskResponse") -> None:
17 |         """Re-raise user code errors, prepending Brickflow context to the message."""
18 | if resp.user_code_error is not None:
19 | original_message = str(resp.user_code_error)
20 | additional_info = (
21 | "BRICKFLOW_USER_OR_DBR_ERROR: This is an error thrown in user code. \n"
22 | f"BRICKFLOW_INPUT_ARGS: {resp.input_kwargs}\n"
23 | "Original Exception Message: "
24 | )
25 | new_message = additional_info + original_message
26 | resp.user_code_error.args = (new_message,)
27 | raise resp.user_code_error
28 |
29 | @staticmethod
30 | @brickflow_plugin_spec(firstresult=True)
31 | def task_execute(task: "Task", workflow: "Workflow") -> "TaskResponse":
32 | """Custom execute method that is able to be plugged in."""
33 | raise NotImplementedError("task_execute must be implemented by a plugin")
34 |
35 | @staticmethod
36 | @brickflow_plugin_spec(firstresult=True)
37 | def handle_results(
38 | resp: "TaskResponse", task: "Task", workflow: "Workflow"
39 | ) -> "TaskResponse":
40 |         """Custom result-handling hook that can be implemented by a plugin."""
41 | raise NotImplementedError("handle_results must be implemented by a plugin")
42 |
--------------------------------------------------------------------------------
/brickflow/engine/utils.py:
--------------------------------------------------------------------------------
1 | import functools
2 | from typing import Callable, Type, List, Iterator, Union
3 | import pathlib
4 | import os
5 |
6 | from pydantic import SecretStr
7 | from databricks.sdk import WorkspaceClient
8 |
9 | from brickflow.context import ctx
10 | from brickflow.hints import propagate_hint
11 |
12 |
13 | @propagate_hint
14 | def wraps_keyerror(error_class: Type[Exception], msg: str) -> Callable:
15 | def wrapper(f: Callable) -> Callable:
16 | @functools.wraps(f)
17 | def func(*args, **kwargs): # type: ignore
18 | try:
19 | return f(*args, **kwargs)
20 | except KeyError as e:
21 | raise error_class(
22 | f"{msg}; err: {str(e)}; args: {args}; kwargs: {kwargs}"
23 | )
24 |
25 | return func
26 |
27 | return wrapper
28 |
29 |
30 | def get_properties(some_obj: Type) -> List[str]:
31 | def _property_iter() -> Iterator[str]:
32 | for k, v in some_obj.__dict__.items():
33 | if isinstance(v, property):
34 | yield k
35 |
36 | return list(_property_iter())
37 |
38 |
39 | def get_job_id(
40 | job_name: str, host: Union[str, None] = None, token: Union[str, SecretStr] = None
41 | ) -> Union[float, None]:
42 | """
43 | Get the job id from the specified Databricks workspace for a given job name.
44 |
45 | Parameters
46 | ----------
47 | job_name: str
48 | Job name (case-insensitive)
49 | host: str
50 | Databricks workspace URL
51 | token: str
52 | Databricks API token
53 |
54 | Returns
55 | -------
56 | str
57 | Databricks job id
58 | """
59 | ctx.log.info("Searching job id for job name: %s", job_name)
60 |
61 | if host:
62 | host = host.rstrip("/")
63 | token = token.get_secret_value() if isinstance(token, SecretStr) else token
64 |
65 | workspace_obj = WorkspaceClient(host=host, token=token)
66 | jobs_list = workspace_obj.jobs.list(name=job_name)
67 |
68 | try:
69 | for job in jobs_list:
70 | ctx.log.info("Job id for job '%s' is %s", job_name, job.job_id)
71 | return job.job_id
72 | else: # pylint: disable=useless-else-on-loop
73 | raise ValueError
74 | except ValueError:
75 | raise ValueError(f"No job found with name {job_name}")
76 | except Exception as e:
77 | ctx.log.info("An error occurred: %s", e)
78 |
79 | return None
80 |
81 |
82 | def get_bf_project_root() -> pathlib.Path:
83 |     """Returns the root directory of the current Brickflow project.
84 |
85 |     The root is taken as the parent of the current working directory;
86 |     the function takes no parameters.
87 |
88 |     Returns:
89 |         pathlib.Path: Brickflow project root directory
90 |     """
91 | try:
92 | _file_name = os.getcwd()
93 | _project_root = pathlib.Path(_file_name).resolve().parents[0]
94 | ctx.log.info("Setting Brickflow project root as %s", _project_root)
95 | return _project_root
96 | except Exception as e:
97 | ctx.log.info("An error occurred: %s", e)
98 | raise e
99 |
--------------------------------------------------------------------------------
/brickflow/hints/__init__.py:
--------------------------------------------------------------------------------
1 | from brickflow.hints.hint import propagate_hint
2 |
3 | __all__ = ["propagate_hint"]
4 |
--------------------------------------------------------------------------------
/brickflow/hints/hint.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 |
3 |
4 | # propagate type hints for decorated functions
5 | def propagate_hint(decorator: Callable) -> Callable:
6 | return decorator
7 |
--------------------------------------------------------------------------------
/brickflow/hints/py.typed:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 |
3 | def propagate_hint(decorator: Callable) -> Callable: ...
--------------------------------------------------------------------------------
/brickflow/resolver/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import inspect
4 | import os
5 | import sys
6 | from pathlib import Path
7 | from typing import Union, Any, List, Optional
8 | import pathlib
9 |
10 | from brickflow import BrickflowProjectConstants, _ilog, ctx
11 |
12 |
13 | def add_to_sys_path(directory: Union[str, pathlib.Path]) -> None:
14 | dir_str = str(directory)
15 | if dir_str not in sys.path and os.path.isdir(dir_str):
16 | sys.path.append(dir_str)
17 |
18 |
19 | def get_caller_file_paths() -> List[str]:
20 | caller_file_paths = []
21 | frames = inspect.stack()[1:] # Exclude the current frame
22 |
23 | for frame in frames:
24 | caller_file_paths.append(frame.filename)
25 |
26 | return list(set(caller_file_paths))
27 |
28 |
29 | class BrickflowRootNotFound(Exception):
30 | pass
31 |
32 |
33 | def go_up_till_brickflow_root(cur_path: str) -> str:
34 | if cur_path.startswith("<"):
35 | raise BrickflowRootNotFound("Invalid brickflow root.")
36 |
37 | path = pathlib.Path(cur_path).resolve()
38 |
39 | valid_roots = [
40 | f"{BrickflowProjectConstants.DEFAULT_MULTI_PROJECT_ROOT_FILE_NAME.value}."
41 | f"{BrickflowProjectConstants.DEFAULT_CONFIG_FILE_TYPE.value}",
42 | f"{BrickflowProjectConstants.DEFAULT_MULTI_PROJECT_CONFIG_FILE_NAME.value}."
43 | f"{BrickflowProjectConstants.DEFAULT_CONFIG_FILE_TYPE.value}",
44 | ]
45 |
46 | # recurse to see if there is a brickflow root and return the path
47 | while not path.is_dir() or not any(
48 | file.name in valid_roots for file in path.iterdir()
49 | ):
50 | path = path.parent
51 |
52 | if path == path.parent:
53 | raise BrickflowRootNotFound(
54 | "Brickflow root directory not found in path hierarchy."
55 | )
56 |
57 | return str(path.resolve())
58 |
59 |
60 | def get_relative_path_to_brickflow_root() -> None:
61 | paths = get_caller_file_paths()
62 | _ilog.info("Brickflow setting up python path resolution...")
63 | # if inside notebook also get that path
64 | notebook_path = get_notebook_ws_path(ctx.dbutils)
65 | if notebook_path is not None:
66 | paths.append(notebook_path)
67 |
68 | for path in paths:
69 | try:
70 | resolved_path = go_up_till_brickflow_root(path)
71 | _ilog.info("Brickflow root input path - %s", path)
72 | _ilog.info("Brickflow root found - %s", resolved_path)
73 | add_to_sys_path(resolved_path)
74 | _ilog.info("Sys path set to: %s", str(sys.path))
75 | except BrickflowRootNotFound:
76 | _ilog.info("Unable to find brickflow root for path: %s", path)
77 | except PermissionError:
78 | _ilog.info("Most likely not accessible due to shared cluster: %s", path)
79 |
80 |
81 | def get_notebook_ws_path(dbutils: Optional[Any]) -> Optional[str]:
82 | if dbutils is not None:
83 | return str(
84 | "/Workspace"
85 | / Path(
86 | dbutils.notebook.entry_point.getDbutils()
87 | .notebook()
88 | .getContext()
89 | .notebookPath()
90 | .get()
91 | .lstrip("/")
92 | )
93 | )
94 | return None
95 |
--------------------------------------------------------------------------------
/brickflow_plugins/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import List, Optional
3 |
4 | import pluggy
5 |
6 | from brickflow import get_default_log_handler
7 |
8 |
9 | def setup_logger():
10 | _log = logging.getLogger(__name__) # Logger
11 | _log.setLevel(logging.INFO)
12 | logger_handler = get_default_log_handler("brickflow-plugins")
13 | _log.addHandler(logger_handler)
14 | _log.propagate = False
15 | return _log
16 |
17 |
18 | log = setup_logger()
19 |
20 | from brickflow_plugins.airflow.operators.external_tasks import (
21 | TaskDependencySensor,
22 | AutosysSensor,
23 | AirflowProxyOktaClusterAuth,
24 | )
25 | from brickflow_plugins.airflow.operators.external_tasks_tableau import (
26 | TableauRefreshDataSourceOperator,
27 | TableauRefreshWorkBookOperator,
28 | )
29 | from brickflow_plugins.airflow.operators.native_operators import (
30 | BashOperator,
31 | BranchPythonOperator,
32 | ShortCircuitOperator,
33 | )
34 | from brickflow_plugins.databricks.workflow_dependency_sensor import (
35 | WorkflowDependencySensor,
36 | WorkflowTaskDependencySensor,
37 | )
38 | from brickflow_plugins.databricks.uc_to_snowflake_operator import (
39 | SnowflakeOperator,
40 | UcToSnowflakeOperator,
41 | )
42 | from brickflow_plugins.databricks.box_operator import (
43 | BoxToVolumesOperator,
44 | VolumesToBoxOperator,
45 | BoxOperator,
46 | )
47 | from brickflow_plugins.databricks.sla_sensor import SLASensor
48 |
49 |
50 | def load_plugins(cache_bust: Optional[pluggy.PluginManager] = None) -> None:
51 | from brickflow.engine.task import get_plugin_manager
52 | from brickflow_plugins.airflow.brickflow_task_plugin import (
53 | AirflowOperatorBrickflowTaskPluginImpl,
54 | )
55 |
56 | if cache_bust is not None:
57 | cache_bust.register(
58 | AirflowOperatorBrickflowTaskPluginImpl(), name="airflow-plugin"
59 | )
60 | return
61 |
62 | get_plugin_manager().register(AirflowOperatorBrickflowTaskPluginImpl())
63 |
64 |
65 | def ensure_installation():
66 | """Ensures that brickflow_plugins and its airflow dependencies are importable in the current environment."""
67 | from brickflow_plugins.airflow.cronhelper import cron_helper # noqa
68 | import airflow # noqa
69 |
70 |
71 | __all__: List[str] = [
72 | "TaskDependencySensor",
73 | "AutosysSensor",
74 | "AirflowProxyOktaClusterAuth",
75 | "BashOperator",
76 | "BranchPythonOperator",
77 | "ShortCircuitOperator",
78 | "WorkflowDependencySensor",
79 | "WorkflowTaskDependencySensor",
80 | "SnowflakeOperator",
81 | "UcToSnowflakeOperator",
82 | "TableauRefreshDataSourceOperator",
83 | "TableauRefreshWorkBookOperator",
84 | "BoxToVolumesOperator",
85 | "VolumesToBoxOperator",
86 | "BoxOperator",
87 | "SLASensor",
88 | "load_plugins",
89 | "ensure_installation",
90 | ]
91 |
--------------------------------------------------------------------------------
/brickflow_plugins/airflow/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/brickflow_plugins/airflow/__init__.py
--------------------------------------------------------------------------------
/brickflow_plugins/airflow/brickflow_task_plugin.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | import datetime
3 | import pendulum
4 |
5 | try:
6 | from airflow import macros
7 | from airflow.models import BaseOperator
8 | from airflow.utils.context import Context
9 | except ImportError:
10 | raise ImportError(
11 | "You must install airflow to use airflow plugins, "
12 | "please try pip install brickflow[apache-airflow]"
13 | )
14 |
15 | from jinja2 import Environment
16 | from brickflow.context import ctx
17 | from brickflow.engine.hooks import BrickflowTaskPluginSpec
18 | from brickflow.engine.task import brickflow_task_plugin_impl, Task, TaskResponse
19 | from brickflow.engine.workflow import Workflow
20 |
21 | from brickflow_plugins import log
22 | from brickflow_plugins.airflow.context import get_task_context
23 | from brickflow_plugins.airflow.operators import get_modifier_chain
24 | from brickflow_plugins.secrets import BrickflowSecretsBackend
25 |
26 |
27 | def epoch_to_pendulum_datetime(epoch_str: Optional[str]):
28 | if epoch_str is None:
29 | return None
30 | return pendulum.instance(datetime.datetime.fromtimestamp(int(epoch_str) / 1000))
31 |
32 |
33 | class AirflowOperatorBrickflowTaskPluginImpl(BrickflowTaskPluginSpec):
34 | @staticmethod
35 | @brickflow_task_plugin_impl(tryfirst=True)
36 | def handle_results(
37 | resp: "TaskResponse", task: "Task", workflow: "Workflow"
38 | ) -> "TaskResponse":
39 | log.info(
40 | "using AirflowOperatorBrickflowTaskPlugin for handling results for task: %s",
41 | task.task_id,
42 | )
43 |
44 | BrickflowTaskPluginSpec.handle_user_result_errors(resp)
45 |
46 | _operator = resp.response
47 |
48 | if not isinstance(_operator, BaseOperator):
49 | return resp
50 |
51 | operator_modifier_chain = get_modifier_chain()
52 | # modify any functionality of operators and then
53 | _operator = operator_modifier_chain.modify(_operator, task, workflow)
54 |
55 | if hasattr(_operator, "log"):
56 | # overwrite the operator logger if it has one to the brickflow logger
57 | setattr(_operator, "_log", ctx.log)
58 |
59 | context: Context = get_task_context(
60 | task.task_id,
61 | _operator,
62 | workflow.schedule_quartz_expression,
63 | epoch_to_pendulum_datetime(ctx.start_time(debug=None)),
64 | tz=workflow.timezone,
65 | )
66 |
67 | env: Optional[Environment] = Environment()
68 | env.globals.update({"macros": macros, "ti": context})
69 | with BrickflowSecretsBackend():
70 | _operator.render_template_fields(context, jinja_env=env)
71 | op_resp = _operator.execute(context)
72 | return TaskResponse(
73 | response=op_resp,
74 | push_return_value=_operator.do_xcom_push,
75 | )
76 |
--------------------------------------------------------------------------------
/brickflow_plugins/airflow/context/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | try:
4 | from airflow.models import BaseOperator
5 | from airflow.utils.context import Context
6 | except ImportError:
7 | raise ImportError(
8 | "You must install airflow to use airflow plugins, "
9 | "please try pip install brickflow[apache-airflow]"
10 | )
11 |
12 | from pendulum import DateTime
13 | from brickflow.context import ctx, RETURN_VALUE_KEY
14 | from brickflow_plugins.airflow.cronhelper import cron_helper
15 | from brickflow_plugins.airflow.vendor.timetable import create_timetable
16 | from brickflow_plugins.airflow.vendor.timezone import TIMEZONE
17 |
18 |
19 | class CrossDagXComsNotSupportedError(Exception):
20 | pass
21 |
22 |
23 | class XComsPullMultipleTaskIdsError(Exception):
24 | pass
25 |
26 |
27 | class FakeTaskInstance(object):
28 | def __init__(
29 | self,
30 | task_id: str,
31 | operator: BaseOperator,
32 | execution_date: str,
33 | ):
34 | self._operator = operator
35 | self._execution_date = execution_date
36 | self._task_id = task_id
37 |
38 | def xcom_push(self, key, value):
39 | ctx.task_coms.put(task_id=self._task_id, key=key, value=value)
40 |
41 | def xcom_pull(self, task_ids, key=RETURN_VALUE_KEY, dag_id=None):
42 | if dag_id is not None:
43 | raise CrossDagXComsNotSupportedError(
44 | "Cross-DAG xcoms are not supported in the framework; please raise a feature request."
45 | )
46 | if isinstance(task_ids, list) and len(task_ids) > 1:
47 | raise XComsPullMultipleTaskIdsError(
48 | "Currently xcom pull only supports one task_id; please raise a feature "
49 | "request."
50 | )
51 | task_id = task_ids[0] if isinstance(task_ids, list) else task_ids
52 | return ctx.task_coms.get(task_id, key)
53 |
54 | @property
55 | def execution_date(self):
56 | return self._execution_date
57 |
58 | @property
59 | def operator(self):
60 | return self._operator
61 |
62 |
63 | def execution_timestamp(
64 | quartz_cron_statement: Optional[str] = None,
65 | ts: Optional[DateTime] = None,
66 | tz=TIMEZONE,
67 | ) -> DateTime:
68 | if quartz_cron_statement is None:
69 | return DateTime.utcnow()
70 | if ts is None:
71 | ts = DateTime.utcnow()
72 | cron = cron_helper.quartz_to_unix(quartz_cron_statement)
73 | tt = create_timetable(cron, tz)
74 | return tt.align_to_prev(ts)
75 |
76 |
77 | def get_task_context(
78 | task_id, operator: BaseOperator, quartz_cron_statement, ts, tz=TIMEZONE
79 | ) -> Context:
80 | execution_ts = execution_timestamp(quartz_cron_statement, ts, tz)
81 | return Context(
82 | **{
83 | "execution_date": str(execution_ts),
84 | "ds": execution_ts.strftime("%Y-%m-%d"),
85 | "ds_nodash": execution_ts.strftime("%Y%m%d"),
86 | "ts": str(execution_ts),
87 | "ts_nodash": execution_ts.strftime("%Y%m%d%H%M%S"),
88 | "ti": FakeTaskInstance(task_id, operator, str(execution_ts)),
89 | }
90 | )
91 |
--------------------------------------------------------------------------------
/brickflow_plugins/airflow/cronhelper.py:
--------------------------------------------------------------------------------
1 | import re
2 | import functools
3 |
4 | from brickflow_plugins import log
5 |
6 |
7 | class CronHelper:
8 | EVERY_X_UNITS_REPLACE_PLACEHOLDER = "%s"
9 | QUARTZ_EVERY_X_UNITS_REGEX = re.compile(r"^0/(\d+)$") # For handling 0/5 units
10 | UNIX_EVERY_X_UNITS_REGEX = re.compile(r"^\*/(\d+)$") # For handling */5 units
11 | QUARTZ_EVERY_X_UNITS_REPLACE_PATTERN = f"0/{EVERY_X_UNITS_REPLACE_PLACEHOLDER}"
12 | UNIX_EVERY_X_UNITS_REPLACE_PATTERN = f"*/{EVERY_X_UNITS_REPLACE_PLACEHOLDER}"
13 |
14 | @staticmethod
15 | def __get_expression_parts(expression: str) -> list:
16 | parts = [part.strip() for part in expression.split(" ")]
17 |
18 | # Unix cron expressions have 5 parts, Quartz cron expressions have 6 or 7 parts
19 | if len(parts) in [5, 7]:
20 | return parts
21 | # Year is an optional part in a Quartz cron expression; add the extra element to mimic a 7-part Quartz expression
22 | if len(parts) == 6:
23 | parts.append("*")
24 | return parts
25 |
26 | raise ValueError("Invalid cron expression!")
27 |
28 | @staticmethod
29 | def convert_interval_parts(part: str, is_quartz: bool = False) -> str:
30 | every_x_units_pattern = (
31 | CronHelper.QUARTZ_EVERY_X_UNITS_REGEX
32 | if is_quartz
33 | else CronHelper.UNIX_EVERY_X_UNITS_REGEX
34 | )
35 | matches = every_x_units_pattern.match(part)
36 | every_x_units_replace_pattern = (
37 | CronHelper.QUARTZ_EVERY_X_UNITS_REPLACE_PATTERN
38 | if is_quartz
39 | else CronHelper.UNIX_EVERY_X_UNITS_REPLACE_PATTERN
40 | )
41 |
42 | if matches:
43 | return every_x_units_replace_pattern.replace(
44 | CronHelper.EVERY_X_UNITS_REPLACE_PLACEHOLDER, matches.group(1)
45 | )
46 |
47 | return part
48 |
49 | @functools.lru_cache(maxsize=128) # cron expression conversion will not change
50 | def unix_to_quartz(self, unix_cron: str) -> str:
51 | parts = self.__get_expression_parts(expression=unix_cron)
52 |
53 | if len(parts) != 5:
54 | raise ValueError("Invalid Unix cron expression")
55 |
56 | minute, hour, dom, month, dow = map(self.convert_interval_parts, parts)
57 |
58 | # Converting Unix DOW to Quartz DOW
59 | def shift_days(day: str) -> str:
60 | """
61 | Quartz DOW starts from 1 (Sunday) while Unix DOW starts from 0 (Sunday)
62 | """
63 | if "-" in day:
64 | return "-".join([shift_days(day=d) for d in day.split("-")])
65 |
66 | # Unix cron Sunday can be represented as 0 or 7, but only as 1 in Quartz cron
67 | if day in ["0", "7"]:
68 | return "1"
69 | if day in ["SUN", "MON", "TUE", "WED", "THU", "FRI", "SAT"]:
70 | return day
71 | return str(int(day) + 1)
72 |
73 | if "," in dow:
74 | quartz_dow = ",".join([shift_days(day=day) for day in dow.split(",")])
75 | elif dow == "*":
76 | quartz_dow = dow
77 | else:
78 | quartz_dow = shift_days(day=dow)
79 |
80 | quartz_dom = dom
81 |
82 | if dom != "*" and dow == "*":
83 | quartz_dow = "?"
84 | elif dom == "*":
85 | quartz_dom = "?"
86 |
87 | quartz_cron = f"0 {minute} {hour} {quartz_dom} {month} {quartz_dow} *"
88 | log.info("Converted unix cron %s to quartz cron %s", unix_cron, quartz_cron)
89 | return quartz_cron
90 |
91 | @functools.lru_cache(maxsize=128) # cron expression conversion will not change
92 | def quartz_to_unix(self, quartz_cron: str) -> str:
93 | parts = self.__get_expression_parts(expression=quartz_cron)
94 |
95 | if len(parts) != 7:
96 | raise ValueError("Invalid Quartz cron expression")
97 |
98 | if "L" in quartz_cron or "W" in quartz_cron or "#" in quartz_cron:
99 | raise ValueError("Support for 'L, W, #' in Quartz cron is not implemented")
100 |
101 | # Unix cron expression does not support '?'
102 | parts = [part.replace("?", "*") for part in parts]
103 |
104 | _, minute, hour, dom, month, dow, _ = map(
105 | lambda part: self.convert_interval_parts(part, True), parts
106 | )
107 |
108 | # Converting Quartz DOW to Unix DOW
109 | def shift_days(day: str) -> str:
110 | """
111 | Quartz DOW starts from 1 (Sunday) while Unix DOW starts from 0 (Sunday)
112 | """
113 | if "-" in day:
114 | return "-".join([shift_days(day=d) for d in day.split("-")])
115 | if day in ["SUN", "MON", "TUE", "WED", "THU", "FRI", "SAT"]:
116 | return day
117 |
118 | return str(int(day) - 1)
119 |
120 | if "," in dow:
121 | unix_dow = ",".join([shift_days(day=day) for day in dow.split(",")])
122 | elif dow == "*":
123 | unix_dow = "*"
124 | else:
125 | unix_dow = shift_days(day=dow)
126 |
127 | unix_dom = dom
128 |
129 | unix_cron = f"{minute} {hour} {unix_dom} {month} {unix_dow}"
130 | log.info("Converted quartz cron %s to unix cron %s", quartz_cron, unix_cron)
131 | return unix_cron
132 |
133 |
134 | cron_helper = CronHelper()
135 |
--------------------------------------------------------------------------------
/brickflow_plugins/airflow/operators/__init__.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import os
3 | from abc import abstractmethod, ABCMeta
4 | from typing import Optional
5 |
6 | try:
7 | from airflow.models import BaseOperator, Pool
8 | from airflow.utils.weight_rule import WeightRule
9 | except ImportError:
10 | raise ImportError(
11 | "You must install airflow to use airflow plugins, "
12 | "please try pip install brickflow[apache-airflow]"
13 | )
14 |
15 | from brickflow.engine.task import Task
16 | from brickflow.engine.workflow import Workflow
17 |
18 |
19 | class AirflowTaskDoesNotExistError(Exception):
20 | pass
21 |
22 |
23 | class UnsupportedAirflowTaskFieldError(Exception):
24 | pass
25 |
26 |
27 | class UnsupportedAirflowOperatorError(Exception):
28 | pass
29 |
30 |
31 | class AbstractOperatorModifier(metaclass=ABCMeta):
32 | @abstractmethod
33 | def set_next(
34 | self, op_handler: "AbstractOperatorModifier"
35 | ) -> "AbstractOperatorModifier":
36 | pass
37 |
38 | @abstractmethod
39 | def modify(
40 | self, operator: BaseOperator, task: Task, workflow: Workflow
41 | ) -> "BaseOperator":
42 | pass
43 |
44 |
45 | class OperatorModifier(AbstractOperatorModifier):
46 | def __init__(self):
47 | self._next_handler: Optional[AbstractOperatorModifier] = None
48 |
49 | def set_next(
50 | self, op_handler: "AbstractOperatorModifier"
51 | ) -> "AbstractOperatorModifier":
52 | self._next_handler = op_handler
53 | return op_handler
54 |
55 | @abstractmethod
56 | def modify(
57 | self, operator: BaseOperator, task: Task, workflow: Workflow
58 | ) -> Optional["BaseOperator"]:
59 | if self._next_handler is not None:
60 | return self._next_handler.modify(operator, task, workflow)
61 |
62 | return None
63 |
64 |
65 | class InvalidFieldChecker(OperatorModifier):
66 | UNSUPPORTED_TASK_NONE_FIELDS = {
67 | "email_on_retry": True,
68 | "email_on_failure": True,
69 | "sla": None,
70 | "execution_timeout": None,
71 | "on_failure_callback": None,
72 | "on_success_callback": None,
73 | "on_retry_callback": None,
74 | "inlets": [],
75 | "outlets": [],
76 | "task_concurrency": None,
77 | "max_active_tis_per_dag": None,
78 | "run_as_user": None,
79 | "depends_on_past": False,
80 | "wait_for_downstream": False,
81 | "max_retry_delay": None,
82 | "priority_weight": 1,
83 | "weight_rule": WeightRule.DOWNSTREAM,
84 | "pool": Pool.DEFAULT_POOL_NAME,
85 | "pool_slots": 1,
86 | "resources": None,
87 | "executor_config": {},
88 | "email": None,
89 | }
90 |
91 | def _validate_task_fields(self, operator: BaseOperator, task: Task) -> None:
92 | unsupported_fields = []
93 | for field, default_value in self.UNSUPPORTED_TASK_NONE_FIELDS.items():
94 | if hasattr(operator, field) is False:
95 | continue
96 | value = getattr(operator, field)
97 | if value != default_value:
98 | unsupported_fields.append(field)
99 | if unsupported_fields:
100 | raise UnsupportedAirflowTaskFieldError(
101 | f"Unsupported fields: {unsupported_fields} for task: {task.task_id}"
102 | )
103 |
104 | def modify(
105 | self, operator: BaseOperator, task: Task, workflow: Workflow
106 | ) -> Optional["BaseOperator"]:
107 | if isinstance(operator, BaseOperator):
108 | self._validate_task_fields(operator, task)
109 | return super().modify(operator, task, workflow)
110 |
111 |
112 | class CatchAllOperatorModifier(OperatorModifier):
113 | SUPPORTED_OPERATORS = [
114 | "BranchPythonOperator",
115 | "PythonOperator",
116 | "BashOperator",
117 | "ShortCircuitOperator",
118 | "TaskDependencySensor",
119 | "AutosysSensor",
120 | "TableauRefreshDataSourceOperator",
121 | "TableauRefreshWorkBookOperator",
122 | ]
123 |
124 | def _validate_operators(self, operator: BaseOperator, task: Task) -> None:
125 | if (
126 | issubclass(operator.__class__, BaseOperator)
127 | and operator.__class__.__name__ in self.SUPPORTED_OPERATORS
128 | ):
129 | return
130 | raise UnsupportedAirflowOperatorError(
131 | f"Unsupported airflow operator: {type(task)} for task: {task.task_id}"
132 | )
133 |
134 | def modify(
135 | self, operator: BaseOperator, task: Task, workflow: Workflow
136 | ) -> Optional["BaseOperator"]:
137 | if isinstance(operator, BaseOperator):
138 | self._validate_operators(operator, task)
139 | return operator
140 |
141 |
142 | def get_modifier_chain():
143 | from brickflow_plugins.airflow import operators
144 | import importlib
145 | import inspect
146 |
147 | start_chain = InvalidFieldChecker()
148 | next_node = start_chain
149 | pkg = operators
150 | file_name = pkg.__file__
151 | for module in os.listdir(os.path.dirname(file_name)):
152 | # only find python files and ignore __init__.py
153 | if module == "__init__.py" or module[-3:] != ".py":
154 | continue
155 | module_name = module.replace(".py", "")
156 | # import all the modules into the mod object and not actually import them using __import__
157 | mod = importlib.import_module(f"{pkg.__name__}.{module_name}")
158 | for obj in dir(mod):
159 | module_item = getattr(mod, obj)
160 | # if issubclass(module_item, OperatorModifier):
161 | if (
162 | inspect.isclass(module_item)
163 | and module_item != operators.OperatorModifier
164 | and issubclass(module_item, operators.OperatorModifier)
165 | ):
166 | # print(module_item)
167 | next_node = next_node.set_next(module_item())
168 |
169 | next_node.set_next(CatchAllOperatorModifier())
170 | return start_chain
171 |
172 |
173 | def check_if(klass):
174 | def outer(f):
175 | @functools.wraps(f)
176 | def inner(*args, **kwargs) -> Optional["BaseOperator"]:
177 | self, operator = args[0], args[1]
178 | super_func = getattr(super(type(self), self), f.__name__)
179 | if not isinstance(operator, klass):
180 | # super function won't accept self
181 | # this is to go along the chain
182 | return super_func(*args[1:], **kwargs)
183 | return f(*args, **kwargs)
184 |
185 | return inner
186 |
187 | return outer
188 |
--------------------------------------------------------------------------------
/brickflow_plugins/airflow/operators/native_operators.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import sys
4 | import tempfile
5 | import time
6 | import types
7 | from typing import Optional
8 |
9 | from airflow.operators.bash import BashOperator
10 | from airflow.operators.python import BranchPythonOperator, ShortCircuitOperator
11 |
12 | from brickflow.context import BRANCH_SKIP_EXCEPT, SKIP_EXCEPT_HACK
13 | from brickflow.engine.task import Task
14 | from brickflow.engine.workflow import Workflow
15 | from brickflow_plugins import log
16 | from brickflow_plugins.airflow.operators import OperatorModifier, check_if
17 |
18 |
19 | def _bash_execute(self, context): # pylint:disable=unused-argument
20 | p = None
21 | returncode = None
22 | start = time.time()
23 | env = self.env
24 | if env is None:
25 | env = os.environ.copy()
26 |
27 | # log.info("Command: %s", self.bash_command)
28 |
29 | with tempfile.TemporaryDirectory(prefix="airflowtmp") as tmp_dir:
30 | try:
31 | p = subprocess.Popen( # pylint:disable=consider-using-with
32 | self.bash_command,
33 | shell=True,
34 | cwd=tmp_dir,
35 | executable="/bin/bash",
36 | stderr=subprocess.STDOUT,
37 | stdout=subprocess.PIPE,
38 | universal_newlines=True,
39 | env=env,
40 | )
41 | for line in iter(p.stdout.readline, ""):
42 | resp = line
43 | log.info("[STDOUT]: %s", line.rstrip())
44 | returncode = p.wait()
45 | p = None
46 | sys.stdout.flush()
47 | if returncode != 0:
48 | raise subprocess.CalledProcessError(returncode, self.bash_command)
49 | finally:
50 | end = time.time()
51 | if p is not None:
52 | p.terminate()
53 | p.wait()
54 | log.info("Command: exited with return code %s", returncode)
55 | log.info("Command took %s seconds", end - start)
56 |
57 | if self.do_xcom_push is True:
58 | return resp[:-1] # skip newline char at end
59 | return
60 |
61 |
62 | def _bash_empty_on_kill(self): # pylint:disable=unused-argument
63 | pass
64 |
65 |
66 | def _skip_all_except(
67 | self, ti: "FakeTaskInstance", branch_task_ids
68 | ): # pylint:disable=unused-argument
69 | log.info("Skipping all tasks except: %s", branch_task_ids)
70 | ti.xcom_push(BRANCH_SKIP_EXCEPT, branch_task_ids)
71 |
72 |
73 | def _short_circuit_execute(self, context):
74 | condition = super(ShortCircuitOperator, self).execute(context)
75 | log.info("Condition result is %s", condition)
76 |
77 | if condition:
78 | log.info("Proceeding with downstream tasks...")
79 | return
80 |
81 | # log
82 | log.info("Skipping downstream tasks...")
83 | ti = context["ti"]
84 | ti.xcom_push(BRANCH_SKIP_EXCEPT, SKIP_EXCEPT_HACK)
85 |
86 |
87 | class BashOperatorModifier(OperatorModifier):
88 | @check_if(BashOperator)
89 | def modify(
90 | self, operator: BashOperator, task: Task, workflow: Workflow
91 | ) -> Optional["BashOperator"]:
92 | f = types.MethodType(_bash_execute, operator)
93 | operator.execute = f
94 | operator.on_kill = _bash_empty_on_kill
95 | return operator
96 |
97 |
98 | class BranchPythonOperatorModifier(OperatorModifier):
99 | @check_if(BranchPythonOperator)
100 | def modify(
101 | self, operator: BranchPythonOperator, task: Task, workflow: Workflow
102 | ) -> Optional["BranchPythonOperator"]:
103 | f = types.MethodType(_skip_all_except, operator)
104 | operator.skip_all_except = f
105 | return operator
106 |
107 |
108 | class ShortCircuitOperatorModifier(OperatorModifier):
109 | @check_if(ShortCircuitOperator)
110 | def modify(
111 | self, operator: ShortCircuitOperator, task: Task, workflow: Workflow
112 | ) -> Optional["ShortCircuitOperator"]:
113 | f = types.MethodType(_short_circuit_execute, operator)
114 | operator.execute = f
115 | return operator
116 |
--------------------------------------------------------------------------------
/brickflow_plugins/airflow/vendor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/brickflow_plugins/airflow/vendor/__init__.py
--------------------------------------------------------------------------------
/brickflow_plugins/airflow/vendor/context.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import contextlib
4 | import copy
5 | from typing import MutableMapping, Any, Iterator, KeysView, ItemsView, ValuesView
6 |
7 |
8 | class Context(MutableMapping[str, Any]):
9 | """Jinja2 template context for task rendering.
10 |
11 | This is a mapping (dict-like) class that can lazily emit warnings when
12 | (and only when) deprecated context keys are accessed.
13 | """
14 |
15 | _DEPRECATION_REPLACEMENTS: dict[str, list[str]] = {
16 | "execution_date": ["data_interval_start", "logical_date"],
17 | "next_ds": ["{{ data_interval_end | ds }}"],
18 | "next_ds_nodash": ["{{ data_interval_end | ds_nodash }}"],
19 | "next_execution_date": ["data_interval_end"],
20 | "prev_ds": [],
21 | "prev_ds_nodash": [],
22 | "prev_execution_date": [],
23 | "prev_execution_date_success": ["prev_data_interval_start_success"],
24 | "tomorrow_ds": [],
25 | "tomorrow_ds_nodash": [],
26 | "yesterday_ds": [],
27 | "yesterday_ds_nodash": [],
28 | }
29 |
30 | def __init__(
31 | self, context: MutableMapping[str, Any] | None = None, **kwargs: Any
32 | ) -> None:
33 | self._context: MutableMapping[str, Any] = context or {}
34 | if kwargs:
35 | self._context.update(kwargs)
36 | self._deprecation_replacements = self._DEPRECATION_REPLACEMENTS.copy()
37 |
38 | def __repr__(self) -> str:
39 | return repr(self._context)
40 |
41 | def __reduce_ex__(self, protocol: int) -> tuple[Any, ...]:
42 | """Pickle the context as a dict.
43 |
44 | We are intentionally going through ``__getitem__`` in this function,
45 | instead of using ``items()``, to trigger deprecation warnings.
46 | """
47 | items = [(key, self[key]) for key in self._context]
48 | return dict, (items,)
49 |
50 | def __copy__(self) -> Context:
51 | new = type(self)(copy.copy(self._context))
52 | new._deprecation_replacements = self._deprecation_replacements.copy()
53 | return new
54 |
55 | def __getitem__(self, key: str) -> Any:
56 | # with contextlib.suppress(KeyError):
57 | # warnings.warn(_create_deprecation_warning(key, self._deprecation_replacements[key]))
58 | with contextlib.suppress(KeyError):
59 | return self._context[key]
60 | raise KeyError(key)
61 |
62 | def __setitem__(self, key: str, value: Any) -> None:
63 | self._deprecation_replacements.pop(key, None)
64 | self._context[key] = value
65 |
66 | def __delitem__(self, key: str) -> None:
67 | self._deprecation_replacements.pop(key, None)
68 | del self._context[key]
69 |
70 | def __contains__(self, key: object) -> bool:
71 | return key in self._context
72 |
73 | def __iter__(self) -> Iterator[str]:
74 | return iter(self._context)
75 |
76 | def __len__(self) -> int:
77 | return len(self._context)
78 |
79 | def __eq__(self, other: Any) -> bool:
80 | if not isinstance(other, Context):
81 | return NotImplemented
82 | return self._context == other._context
83 |
84 | def __ne__(self, other: Any) -> bool:
85 | if not isinstance(other, Context):
86 | return NotImplemented
87 | return self._context != other._context
88 |
89 | def keys(self) -> KeysView[str]:
90 | return self._context.keys()
91 |
92 | def items(self):
93 | return ItemsView(self._context)
94 |
95 | def values(self):
96 | return ValuesView(self._context)
97 |
--------------------------------------------------------------------------------
/brickflow_plugins/databricks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/brickflow_plugins/databricks/__init__.py
--------------------------------------------------------------------------------
/brickflow_plugins/databricks/run_job.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | from pydantic import SecretStr
3 |
4 | from databricks.sdk import WorkspaceClient
5 | from brickflow.context import ctx
6 | from brickflow.engine.utils import get_job_id
7 |
8 |
9 | class RunJobInRemoteWorkspace:
10 | """
11 | Currently Databricks does not natively support running a job in a remote workspace via the RunJobTask.
12 | This plugin adds this functionality. However, it aims to be a temporary solution until Databricks adds this
13 | functionality natively.
14 | The plugin supports neither passing parameters to the remote job nor waiting for the job to finish.
15 |
16 | Examples
17 | --------
18 | service_principle_pat = ctx.dbutils.secrets.get("scope", "service_principle_id")
19 | RunJobInRemoteWorkspace(
20 | databricks_host="https://your_workspace_url.cloud.databricks.com",
21 | databricks_token=service_principle_pat,
22 | job_name="foo",
23 | )
24 | In the above snippet, Databricks secrets are used as a secure store for the Databricks token.
25 | If you get your token from another secret management service, like AWS Secrets Manager, GCP Secret Manager
26 | or Azure Key Vault, just pass it in the databricks_token argument.
27 | """
28 |
29 | def __init__(
30 | self,
31 | databricks_host: str,
32 | databricks_token: Union[str, SecretStr],
33 | job_name: str,
34 | ):
35 | self.databricks_host = databricks_host
36 | self.databricks_token = (
37 | databricks_token
38 | if isinstance(databricks_token, SecretStr)
39 | else SecretStr(databricks_token)
40 | )
41 | self.job_name = job_name
42 | self._workspace_obj = WorkspaceClient(
43 | host=self.databricks_host, token=self.databricks_token.get_secret_value()
44 | )
45 |
46 | def execute(self):
47 | job_id = get_job_id(
48 | host=self.databricks_host,
49 | token=self.databricks_token,
50 | job_name=self.job_name,
51 | )
52 | # TODO: add support for passing parameters to the remote job
53 | # TODO: wait for the job to finish
54 | run = self._workspace_obj.jobs.run_now(job_id)
55 | ctx.log.info("Job run status: %s", run.response)
56 |
--------------------------------------------------------------------------------
/brickflow_plugins/secrets/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import abc
4 | import base64
5 | import functools
6 | import os
7 | from typing import Optional, Tuple, Union, List
8 | from urllib.parse import urlparse, ParseResult
9 |
10 | import pluggy
11 |
12 | try:
13 | from airflow.secrets import BaseSecretsBackend
14 | except ImportError:
15 | raise ImportError(
16 | "You must install airflow to use airflow plugins, "
17 | "please try pip install brickflow[apache-airflow]"
18 | )
19 |
20 | from brickflow_plugins import log
21 |
22 | BRICKFLOW_SECRETS_BACKEND = "brickflow_secrets_backend"
23 |
24 | brickflow_secrets_plugin_spec = pluggy.HookspecMarker(BRICKFLOW_SECRETS_BACKEND)
25 |
26 |
27 | class BrickflowSecretPluginSpec:
28 | @staticmethod
29 | @brickflow_secrets_plugin_spec(firstresult=True)
30 | def get_secret_value(url_parsed_result: ParseResult) -> Optional["str"]:
31 | """Custom execute method that is able to be plugged in."""
32 |
33 |
34 | @functools.lru_cache
35 | def get_brickflow_tasks_hook() -> BrickflowSecretPluginSpec:
36 | pm = pluggy.PluginManager(BRICKFLOW_SECRETS_BACKEND)
37 | pm.add_hookspecs(BrickflowSecretPluginSpec)
38 | pm.load_setuptools_entrypoints(BRICKFLOW_SECRETS_BACKEND)
39 | pm.register(CerberusBrickflowSecretPluginImpl())
40 | pm.register(Base64BrickflowSecretPluginImpl())
41 | for name, plugin_instance in pm.list_name_plugin():
42 | log.info(
43 | "Loaded plugin with name: %s and class: %s",
44 | name,
45 | plugin_instance.__class__.__name__,
46 | )
47 | return pm.hook
48 |
49 |
50 | brickflow_secrets_backend_plugin_impl = pluggy.HookimplMarker(BRICKFLOW_SECRETS_BACKEND)
51 |
52 |
53 | class AbstractSecretsHelper(abc.ABC):
54 | PROTOCOL_STARTS_WITH: Optional[Union[str, List[str]]] = None
55 |
56 | def get_secret_value_from_url(self, url_parsed_result: ParseResult):
57 | allowed_protocols = (
58 | [self.PROTOCOL_STARTS_WITH]
59 | if isinstance(self.PROTOCOL_STARTS_WITH, str)
60 | else self.PROTOCOL_STARTS_WITH
61 | )
62 | if self.PROTOCOL_STARTS_WITH is not None and not any(
63 | [
64 | url_parsed_result.scheme.lower().startswith(protocol)
65 | for protocol in allowed_protocols
66 | ]
67 | ):
68 | return None
69 | return self._get_secret_value_from_url(url_parsed_result)
70 |
71 | @staticmethod
72 | @abc.abstractmethod
73 | def _get_secret_value_from_url(url_parsed_result: ParseResult) -> str:
74 | pass
75 |
76 |
77 | class B64SecretsHelper(AbstractSecretsHelper):
78 | PROTOCOL_STARTS_WITH = ["base64", "b64"]
79 |
80 | @staticmethod
81 | def _get_secret_value_from_url(url_parsed_result: ParseResult) -> str:
82 | b64data = url_parsed_result.netloc.encode("utf-8")
83 | return base64.b64decode(b64data).decode("utf-8")
84 |
85 |
86 | class CerberusSecretsHelper(AbstractSecretsHelper):
87 | PROTOCOL_STARTS_WITH = "cerberus"
88 |
89 | @staticmethod
90 | def parse_path_and_key(path: Optional[str]) -> Optional[Tuple[str, str]]:
91 | if path is not None:
92 | _cleaned_path = path.lstrip("/").rstrip("/")
93 | return "/".join(_cleaned_path.split("/")[:-1]), _cleaned_path.split("/")[-1]
94 | return None
95 |
96 | @staticmethod
97 | def _get_secret_value_from_url(url_parsed_result: ParseResult) -> str:
98 | try:
99 | from cerberus.client import CerberusClient
100 | except ImportError:
101 | raise ImportError(
102 | "You must install cerberus-client to use the cerberus secrets backend, "
103 | "please try pip install brickflow[cerberus]"
104 | )
105 | parts = url_parsed_result.scheme.lower().split("+")
106 | protocol = "https"
107 | if len(parts) == 2:
108 | protocol = parts[1]
109 | _client = CerberusClient(f"{protocol}://{url_parsed_result.netloc}")
110 | _path, _key = CerberusSecretsHelper.parse_path_and_key(url_parsed_result.path)
111 | data = _client.get_secrets_data(_path)
112 | return data[_key]
113 |
114 |
115 | class CerberusBrickflowSecretPluginImpl(BrickflowSecretPluginSpec):
116 | @staticmethod
117 | @brickflow_secrets_backend_plugin_impl
118 | def get_secret_value(url_parsed_result: ParseResult) -> Optional["str"]:
119 | return CerberusSecretsHelper().get_secret_value_from_url(url_parsed_result)
120 |
121 |
122 | class Base64BrickflowSecretPluginImpl(BrickflowSecretPluginSpec):
123 | @staticmethod
124 | @brickflow_secrets_backend_plugin_impl
125 | def get_secret_value(url_parsed_result: ParseResult) -> Optional["str"]:
126 | return B64SecretsHelper().get_secret_value_from_url(url_parsed_result)
127 |
128 |
129 | class DatabricksSecretsBrickflowSecretPluginImpl(BrickflowSecretPluginSpec):
130 | @staticmethod
131 | @brickflow_secrets_backend_plugin_impl
132 | def get_secret_value(url_parsed_result: ParseResult) -> Optional["str"]:
133 | # not implemented yet
134 | return None
135 |
136 |
137 | class BrickflowSecretsBackend(BaseSecretsBackend): # noqa
138 | def __enter__(self):
139 | self.set_backend_env()
140 | return self
141 |
142 | def __exit__(self, exc_type, exc_val, exc_tb):
143 | self.unset_backend_env()
144 |
145 | def get_conn_value(self, conn_id: str) -> str | None:
146 | parsed_url = urlparse(conn_id)
147 | return get_brickflow_tasks_hook().get_secret_value(url_parsed_result=parsed_url)
148 |
149 | def _get_secrets_backend_env(self):
150 | return {
151 | "AIRFLOW__SECRETS__BACKEND": f"{self.__class__.__module__}.{self.__class__.__name__}",
152 | "AIRFLOW__SECRETS__BACKEND_KWARGS": "",
153 | }
154 |
155 | def set_backend_env(self):
156 | for k, v in self._get_secrets_backend_env().items():
157 | os.environ[k] = v
158 |
159 | def unset_backend_env(self):
160 | for k in self._get_secrets_backend_env().keys():
161 | os.environ.pop(k, None)
162 |
--------------------------------------------------------------------------------
/docs/api/airflow_external_task_dependency.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 | exclude: true
4 | ---
5 |
6 | ::: brickflow_plugins.airflow.operators.external_tasks
7 | handler: python
8 | options:
9 | filters:
10 | - "!^_[^_]"
11 | - "!^__[^__]"
12 |
--------------------------------------------------------------------------------
/docs/api/airflow_native_operators.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 | exclude: true
4 | ---
5 |
6 | ::: brickflow_plugins.airflow.operators.native_operators
7 | handler: python
8 | options:
9 | filters:
10 | - "!^_[^_]"
11 | - "!^__[^__]"
12 |
--------------------------------------------------------------------------------
/docs/api/airflow_tableau_operators.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 | exclude: true
4 | ---
5 |
6 | ::: brickflow_plugins.airflow.operators.external_tasks_tableau
7 | handler: python
8 | options:
9 | filters:
10 | - "!^_[^_]"
11 | - "!^__[^__]"
12 |
--------------------------------------------------------------------------------
/docs/api/box_operator.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 | exclude: true
4 | ---
5 |
6 | ::: brickflow_plugins.databricks.box_operator
7 | handler: python
8 | options:
9 | filters:
10 | - "!^_[^_]"
11 | - "!^__[^__]"
12 |
--------------------------------------------------------------------------------
/docs/api/cli.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 | exclude: true
4 | ---
5 |
6 | ::: brickflow.cli
7 | handler: python
8 | options:
9 | filters:
10 | - "!^_[^_]"
11 |
12 |
--------------------------------------------------------------------------------
/docs/api/compute.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 | exclude: true
4 | ---
5 |
6 | ::: brickflow.engine.compute
7 | handler: python
8 | options:
9 | members:
10 | - Cluster
11 | - Runtimes
12 | filters:
13 | - "!^_[^_]"
14 |
15 |
--------------------------------------------------------------------------------
/docs/api/context.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 | exclude: true
4 | ---
5 |
6 | ::: brickflow.context.context
7 | handler: python
8 | options:
9 | filters:
10 | - "!^_[^_]"
11 | - "!^__[^__]"
12 |
13 |
--------------------------------------------------------------------------------
/docs/api/project.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 | exclude: true
4 | ---
5 |
6 | ::: brickflow.engine.project
7 | handler: python
8 | options:
9 | members:
10 | - Project
11 | - BrickFlowEnvVars
12 | filters:
13 | - "!^_[^_]"
14 |
15 |
--------------------------------------------------------------------------------
/docs/api/secrets.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 | exclude: true
4 | ---
5 |
6 | ::: brickflow_plugins.secrets
7 | handler: python
8 | options:
9 | filters:
10 | - "!^_[^_]"
11 | - "!^__[^__]"
12 |
--------------------------------------------------------------------------------
/docs/api/sla_sensor.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 | exclude: true
4 | ---
5 |
6 | ::: brickflow_plugins.databricks.sla_sensor
7 | handler: python
8 | options:
9 | members:
10 | - SLASensor
11 | filters:
12 | - "!^_[^_]"
13 | - "!^__[^__]"
14 |
--------------------------------------------------------------------------------
/docs/api/task.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 | exclude: true
4 | ---
5 |
6 | ::: brickflow.engine.task
7 | handler: python
8 | options:
9 | members:
10 | - Task
11 | - EmailNotifications
12 | - JarTaskLibrary
13 | - EggTaskLibrary
14 | - WheelTaskLibrary
15 | - PypiTaskLibrary
16 | - MavenTaskLibrary
17 | - CranTaskLibrary
18 | - BrickflowTriggerRule
19 | - BrickflowTaskEnvVars
20 | - TaskSettings
21 | - TaskType
22 | filters:
23 | - "!^_[^_]"
24 | - "!^__[^__]"
25 |
26 |
--------------------------------------------------------------------------------
/docs/api/uc_to_snowflake_operator.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 | exclude: true
4 | ---
5 |
6 | ::: brickflow_plugins.databricks.uc_to_snowflake_operator
7 | handler: python
8 | options:
9 | filters:
10 | - "!^_[^_]"
11 | - "!^__[^__]"
12 |
--------------------------------------------------------------------------------
/docs/api/workflow.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 | exclude: true
4 | ---
5 |
6 | ::: brickflow.engine.workflow
7 | handler: python
8 | options:
9 | members:
10 | - Workflow
11 | - WorkspacePermissions
12 | - User
13 | - Group
14 | - ServicePrincipal
15 | filters:
16 | - "!^_[^_]"
17 | - "!^__[^__]"
18 |
19 |
--------------------------------------------------------------------------------
/docs/api/workflow_dependency_sensor.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 | exclude: true
4 | ---
5 |
6 | ::: brickflow_plugins.databricks.workflow_dependency_sensor
7 | handler: python
8 | options:
9 | members:
10 | - WorkflowDependencySensor
11 | - WorkflowTaskDependencySensor
12 | filters:
13 | - "!^_[^_]"
14 | - "!^__[^__]"
15 |
--------------------------------------------------------------------------------
/docs/bundles-quickstart.md:
--------------------------------------------------------------------------------
1 | # BrickFlow v1.3.1 Quickstart Guide
2 |
3 | This guide will help you get started with BrickFlow v1.3.1, walking you through project setup and deployment.
4 |
5 | ## Prerequisites
6 |
7 | 1. Local environment setup:
8 | - Python >= 3.8
9 | - Databricks CLI configured with access token
10 | - BrickFlow CLI
11 |
12 | ### Installation Steps
13 |
14 | 1. Install Databricks CLI and configure it:
15 | ```bash
16 | pip install databricks-cli
17 | databricks configure -t
18 | ```
19 |
20 | 2. Install BrickFlow CLI:
21 | ```bash
22 | pip install brickflows
23 | ```
24 |
25 | 3. Verify your installation:
26 | ```bash
27 | bf --help
28 | databricks workspace list / # Add --profile if using specific profile
29 | ```
30 |
31 | ## Creating Your First Project
32 |
33 | 1. Navigate to your repository root (where `.git` folder is located)
34 |
35 | 2. Initialize a new BrickFlow project:
36 | ```bash
37 | bf projects add
38 | ```
39 |
40 | 3. Follow the prompts:
41 | - Project Name: Enter your desired project name
42 | - Path from repo root to project root: Press Enter for default (`.`) or specify path
43 | - Path from project root to workflows dir: Enter the directory for your workflows
44 | - Git https url: Enter your repository URL
45 | - Brickflow version: Enter `1.3.1` (or press Enter for `auto`)
46 | - Spark expectations version: Press Enter for default (`0.8.0`)
47 | - Skip entrypoint: Choose `N` unless you have a specific reason to skip
48 |
49 | 4. Update your `.gitignore` file:
50 | ```
51 | **/bundle.yml
52 | .databricks/
53 | ```
54 |
55 | ## Project Structure
56 |
57 | Your project will follow either a monorepo or polyrepo style:
58 |
59 | ### Monorepo Structure Example:
60 | ```
61 | repo-root/
62 | ├── .git
63 | ├── projects/
64 | │ ├── project_abc/
65 | │ │ ├── lib/
66 | │ │ │ ├── __init__.py
67 | │ │ │ └── shared_functions.py
68 | │ │ ├── workflows/
69 | │ │ │ ├── __init__.py
70 | │ │ │ ├── entrypoint.py
71 | │ │ │ └── workflow_abc.py
72 | │ │ └── .brickflow-project-root.yml
73 | ```
74 |
75 | ### Polyrepo Structure Example:
76 | ```
77 | repo-root/
78 | ├── .git
79 | ├── src/
80 | │ ├── lib/
81 | │ │ ├── __init__.py
82 | │ │ └── shared_functions.py
83 | │ ├── workflows/
84 | │ │ ├── __init__.py
85 | │ │ ├── entrypoint.py
86 | │ │ └── workflow.py
87 | ├── .brickflow-project-root.yml
88 | ```
89 |
90 | ## Validating Your Project
91 |
92 | 1. Synthesize your project configuration:
93 | ```bash
94 | bf projects synth --project <project-name> --profile <profile-name>
95 | ```
96 |
97 | 2. Verify the output shows:
98 | ```
99 | SUCCESSFULLY SYNTHESIZED BUNDLE.YML FOR PROJECT:
100 | ```
101 |
102 | ## Deploying Your Project
103 |
104 | ### Development Deployment
105 | ```bash
106 | bf projects deploy --project <project-name> -p <profile-name> --force-acquire-lock
107 | ```
108 |
109 | ### Environment-Specific Deployments
110 | ```bash
111 | # Dev environment
112 | bf projects deploy --project <project-name> -p <profile-name> -e dev --force-acquire-lock
113 | 
114 | # Test environment
115 | bf projects deploy --project <project-name> -p <profile-name> -e test --force-acquire-lock
116 | 
117 | # Production environment
118 | bf projects deploy --project <project-name> -p <profile-name> -e prod --force-acquire-lock
119 | ```
120 |
121 | ### Release Candidate Deployments
122 | For testing specific versions or pull requests:
123 |
124 | ```bash
125 | # Deploy RC version
126 | BRICKFLOW_WORKFLOW_SUFFIX="1.3.1-rc1" bf projects deploy --project <project-name> -p <profile-name> -e test --force-acquire-lock
127 | 
128 | # Deploy PR version
129 | BRICKFLOW_WORKFLOW_SUFFIX="1.3.1-pr34" bf projects deploy --project <project-name> -p <profile-name> -e test --force-acquire-lock
130 | ```
131 |
132 | ## Cleaning Up
133 |
134 | ### Destroying Deployments
135 | ```bash
136 | # Destroy main deployment
137 | bf projects destroy --project <project-name> -p <profile-name> --force-acquire-lock
138 | 
139 | # Destroy RC deployment
140 | BRICKFLOW_WORKFLOW_SUFFIX="1.3.1-rc1" bf projects destroy --project <project-name> -p <profile-name> -e test --force-acquire-lock
141 | 
142 | # Destroy PR deployment
143 | BRICKFLOW_WORKFLOW_SUFFIX="1.3.1-pr34" bf projects destroy --project <project-name> -p <profile-name> -e test --force-acquire-lock
144 | ```
145 |
146 | ## Troubleshooting
147 |
148 | 1. If synthesis fails:
149 | - Verify you're in the repository root directory
150 | - Check that all paths in configuration files are correct
151 | - Ensure all required `__init__.py` files exist
152 |
153 | 2. If deployment fails:
154 | - Verify Databricks CLI configuration
155 | - Check permissions in your Databricks workspace
156 | - Verify environment variables are set correctly
157 |
158 | ## Next Steps
159 |
160 | After successful deployment:
161 | 1. Monitor your workflows in the Databricks workspace
162 | 2. Set up CI/CD pipelines for automated deployments
163 | 3. Configure environment-specific variables
164 | 4. Set up monitoring and alerting
--------------------------------------------------------------------------------
/docs/cli/reference.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 | exclude: true
4 | ---
5 |
6 | This page provides documentation for our command line tools.
7 |
8 |
9 | ::: mkdocs-click
10 | :module: brickflow.cli
11 | :command: cli
12 | :prog_name: bf
13 | :depth: 1
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/docs/css/custom.css:
--------------------------------------------------------------------------------
1 | .md-footer-nav { display: none; }
2 |
3 | .md-footer__inner:not([hidden]) {
4 | display: none
5 | }
6 |
7 | /* Indentation. */
8 | div.doc-contents:not(.first) {
9 | padding-left: 25px;
10 | border-left: .05rem solid var(--md-typeset-table-color);
11 | }
12 |
13 | /* Mark external links as such. */
14 | a.autorefs-external::after {
15 | /* https://primer.style/octicons/arrow-up-right-24 */
16 | background-image: url('data:image/svg+xml,');
17 | content: ' ';
18 |
19 | display: inline-block;
20 | position: relative;
21 | top: 0.1em;
22 | margin-left: 0.2em;
23 | margin-right: 0.1em;
24 |
25 | height: 1em;
26 | width: 1em;
27 | border-radius: 100%;
28 | background-color: var(--md-typeset-a-color);
29 | }
30 | a.autorefs-external:hover::after {
31 | background-color: var(--md-accent-fg-color);
32 | }
--------------------------------------------------------------------------------
/docs/highlevel.md:
--------------------------------------------------------------------------------
1 | ## Brickflow Overview
2 |
3 | The objective of Brickflow is to provide a thin layer on top of Databricks Workflows that helps deploy
4 | and manage workflows in Databricks. It also provides plugins/extras that make it possible to run Airflow
5 | operators directly inside the workflows.
6 |
7 | ## Brickflow to Airflow Term Mapping
8 |
9 | | Object | Airflow | Brickflow |
10 | |-------------------------------------------|-----------------------------------|---------------------------------------------------|
11 | | Collection of Workflows | Airflow Cluster (Airflow Dag Bag) | Project/Entrypoint |
12 | | Workflow | Airflow Dag | Workflow |
13 | | Task | Airflow Operator | Task |
14 | | Schedule | Unix Cron | Quartz Cron |
15 | | Inter Task Communication | XComs | Task Values |
16 | | Managing Connections to External Services | Airflow Connections | Mocked Airflow connections or Databricks Secrets |
17 | | Variables to Tasks | Variables | Task Parameters [ctx.get_parameter(key, default)] |
18 | | Context values (execution_date, etc.) | Airflow Macros, context["ti"] | ctx. |
19 |
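To make the last two rows concrete, here is a minimal sketch (the parameter key and the cron string are illustrative placeholders; `ctx.get_parameter` is the call named in the table, and `cron_helper` ships with `brickflow_plugins`):

```python
from brickflow import ctx
from brickflow_plugins.airflow.cronhelper import cron_helper

# Read a task parameter with a default, the Brickflow counterpart of an Airflow Variable.
run_date = ctx.get_parameter("run_date", "2024-01-01")  # "run_date" is a placeholder key

# Databricks schedules use Quartz cron; convert a familiar unix cron expression.
quartz_expr = cron_helper.unix_to_quartz("0 10 * * 1")  # -> "0 0 10 ? * 2 *"
```

Task values, the XComs counterpart, travel through the same context object (`ctx.task_coms` in the plugin sources).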
--------------------------------------------------------------------------------
/docs/how-imports-work.md:
--------------------------------------------------------------------------------
1 | ### How do imports work?
2 |
3 | !!! warning
4 |
5 | **This is very important to understand how imports work for mono repos. Please read this carefully. Otherwise you might run into issues during deployments.**
6 |
7 | When using brickflow projects, every project will have a `.brickflow-project-root.yml` file. When you import brickflow,
8 | which you will do
9 | in your entrypoint or workflows, brickflow inspects the paths of all stack frames during the import and recursively walks
10 | up each path until it finds the `.brickflow-project-root.yml` file.
11 | The directory containing the first `.brickflow-project-root.yml` found is added to `sys.path` to help with module imports.
12 |
13 | Let us take a quick example of how to get imports to properly work!
14 |
15 | Let us say you have a project structure like this:
16 |
17 | ```
18 | repo-root/
19 | ├── .git
20 | ├── projects/
21 | │ ├── project_abc/
22 | │ │ ├── lib/
23 | │ │ │ ├── __init__.py
24 | │ │ │ └── shared_functions.py
25 | │ │ ├── workflows/
26 | │ │ │ ├── __init__.py
27 | │ │ │ ├── entrypoint.py
28 | │ │ │ └── workflow_abc.py
29 | │ │ ├── setup.py
30 | │ │ └── .brickflow-project-root.yml
31 | │ └── project_xyz/
32 | │ ├── workflows_geo_b/
33 | │ │ ├── entrypoint.py
34 | │ │ └── workflow_xyz.py
35 | │ ├── workflows_geo_a/
36 | │ │ ├── entrypoint.py
37 | │ │ └── workflow_xyz.py
38 | │ └── .brickflow-project-root.yml
39 | ├── .gitignore
40 | ├── brickflow-multi-project.yml
41 | └── README.md
42 | ```
43 |
44 | Say you want to import from `lib` inside `workflow_abc.py`; you would write:
45 |
46 | ```python
47 | from lib import shared_functions
48 | 
49 | shared_functions.some_function(...)
50 | ```
51 |
52 | Since, in this project structure, the `.brickflow-project-root.yml` is at `repo-root/projects/project_abc`, everything
53 | in that `project_abc` folder is
54 | added to `sys.path` in Python, so you can import any of the folders under it.
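
If an import still fails at runtime, a quick way to check what was added is to print `sys.path` right after importing brickflow (a debugging sketch only, not part of the project template):

```python
import sys

import brickflow  # noqa: F401  # importing brickflow triggers the root resolution described above

# The directory that holds .brickflow-project-root.yml should now be on the path.
print("\n".join(sys.path))
```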
--------------------------------------------------------------------------------
/docs/img/bf_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/docs/img/bf_logo.png
--------------------------------------------------------------------------------
/docs/img/bf_logo_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/docs/img/bf_logo_1.png
--------------------------------------------------------------------------------
/docs/img/maintainance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/docs/img/maintainance.png
--------------------------------------------------------------------------------
/docs/img/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/docs/img/workflow.png
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | hide:
3 | - navigation
4 | ---
5 |
6 | # BrickFlow
7 |
8 | BrickFlow is a CLI tool for developing and deploying Python-based Databricks Workflows in a declarative way.
9 |
10 | ## Concept
11 |
12 | `brickflow` aims to improve the development experience for building pipelines on Databricks by:
13 | 
14 | - Providing a declarative way to describe workflows via decorators (see the sketch below)
15 | - Providing intelligent defaults for compute targets
16 | - Providing a code-first, git-first approach to managing and deploying workflows
17 | - Using Databricks Asset Bundles to deploy workflows seamlessly; bundles are powered by Terraform, which helps manage state
18 |   across deployments
19 | - Offering a CLI tool that facilitates setting up projects
20 | - Providing additional functionality through the context library for working with workflows at runtime
21 |
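As a rough illustration of the decorator-driven style, a workflow and one task might look like the sketch below (the names and the existing cluster id are placeholders, and the snippet assumes the top-level `Workflow`/`Cluster` exports and the `@wf.task` decorator used in the project's examples; see the workflows and tasks pages for the full API):

```python
from brickflow import Workflow, Cluster

wf = Workflow(
    "demo-workflow",  # placeholder workflow name
    default_cluster=Cluster.from_existing_cluster("your-existing-cluster-id"),
)


@wf.task()
def say_hello() -> str:
    # task bodies are plain Python functions
    return "hello from brickflow"
```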
22 |
23 | ## Feedback
24 |
25 | Issues with `brickflow`? Found a :octicons-bug-24: bug?
26 | Have a great idea for an addition? Want to improve the documentation? Please feel
27 | free to file an [issue](https://github.com/Nike-Inc/brickflow/issues/new/choose).
28 |
29 | ## Contributing
30 |
31 | To contribute please fork and create a pull request. Here is
32 | a [guide](https://github.com/Nike-Inc/brickflow/blob/main/CONTRIBUTING.md) to help you through this process.
--------------------------------------------------------------------------------
/docs/projects.md:
--------------------------------------------------------------------------------
1 | A project is similar to an Airflow cluster: it can be composed of various different workflows or DAGs.
2 |
3 |
4 | Here is an example of an entrypoint.
5 | Click the plus buttons to understand all the parts of the entrypoint file.
6 |
7 | ```python title="entrypoint.py"
8 | # Databricks notebook source (1)
9 |
10 | import examples.brickflow_examples.workflows
11 |
12 | from brickflow import Project, PypiTaskLibrary, MavenTaskLibrary
13 |
14 |
15 | def main() -> None:
16 | """Project entrypoint"""
17 | with Project(
18 | "brickflow-demo", # (3)!
19 | git_repo="https://github.com/nike-inc/brickflow", # (4)!
20 | provider="github", # (5)!
21 | libraries=[ # (6)!
22 | PypiTaskLibrary(package="networkx"),
23 | ],
24 | ) as f:
25 | f.add_pkg(examples.brickflow_examples.workflows) # (7)!
26 |
27 |
28 | if __name__ == "__main__": # (2)!
29 | main()
30 | ```
31 |
32 |
33 | 1. Uploading this Python file to Databricks with this comment on the first line makes Databricks treat the file
34 | as a notebook.
35 | 2. This ensures `main()` only runs when the file is executed via `python entrypoint.py`.
36 | 3. This is the project name you provided when you ran `bf projects add`.
37 | 4. This is the git repo that is introspected when running `bf projects add`.
38 | 5. This is the git provider (for example `github`) that you decide on.
39 | 6. You can provide a list of packages that need to be installed on all of your clusters when running ETL.
40 | 7. You can add multiple packages from your project in which you define workflows (see the sketch below).
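41 | 
42 | For reference, a minimal sketch of registering more than one workflow package; the second package name below is hypothetical:
43 | 
44 | ```python
45 | from brickflow import Project
46 | 
47 | import examples.brickflow_examples.workflows
48 | import examples.brickflow_examples.more_workflows  # hypothetical second package of workflows
49 | 
50 | with Project(
51 |     "brickflow-demo",
52 |     git_repo="https://github.com/nike-inc/brickflow",
53 |     provider="github",
54 | ) as f:
55 |     f.add_pkg(examples.brickflow_examples.workflows)
56 |     f.add_pkg(examples.brickflow_examples.more_workflows)
57 | ```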
--------------------------------------------------------------------------------
/docs/upgrades/upgrade-pre-0-10-0-to-0-10-0.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 | boost: 2
4 | ---
5 |
6 | ## Upgrade checklist
7 |
8 | * [x] The package has been renamed from `brickflow` to `brickflows`. Please run:
9 |
10 | ```
11 | pip uninstall brickflow
12 | ```
13 |
14 | and then
15 |
16 | ```
17 | pip install brickflows>=0.10.0
18 | bf --version
19 | ```
20 |
21 | * [x] If you are upgrading from a CDKTF version of brickflow, do not worry: your existing workflows will be imported as long as you do
22 | not change their names.
23 |
24 | * [x] Start using project configurations following the [quickstart guide](../../bundles-quickstart/#brickflow-projects-setup).
25 |
26 | * [x] Confirm the existence of the following files:
27 |
28 | * brickflow-multi-project.yml
29 | * .brickflow-project-root.yml
30 | * Please reference [concepts](../../bundles-quickstart/#concepts)
31 | and [initialize project](../../bundles-quickstart/#initialize-project) for more details.
32 |
33 | * [x] RelativePathPackageResolver has been removed from the project; imports now resolve seamlessly
34 | as long as you import brickflow at the top.
35 |
36 | * [x] Ensure the import for brickflow is at the top of your entrypoint.py
37 |
40 |
41 | * [x] Ensure your entrypoint looks like this. **Make sure to click the plus buttons and read the highlighted sections**:
42 |
43 | ```python linenums="1" hl_lines="5 7 15 18"
44 | # Databricks notebook source
45 |
46 | # COMMAND ----------
47 |
48 | from brickflow import Project # (1)!
49 |
50 | import workflows # (2)!
51 |
52 | def main() -> None:
53 | """Project entrypoint"""
54 | with Project(
55 | "product_abc_workflows_2",
56 | git_repo="https://github.com/stikkireddy/mono-repo-test",
57 | provider="github",
58 | libraries=[ # (3)!
59 | # PypiTaskLibrary(package="spark-expectations==0.5.0"), # Uncomment if spark-expectations is needed
60 | ],
61 | enable_plugins=True, # (4)!
62 | ) as f:
63 | f.add_pkg(workflows)
64 |
65 |
66 | if __name__ == "__main__":
67 | main()
68 | ```
69 |
70 | 1. Make sure brickflow is at the top of your imports! This will help resolve paths and allow other libraries to be
71 | imported correctly.
72 | 2. Import your modules after brickflow has been imported! Make sure your IDE's optimize-imports feature doesn't reorder your imports!
73 | 3. Make sure you remove brickflow, brickflow plugins, and cron utils from this list.
74 | 4. Make sure you have enable_plugins=True. This will enable the plugins to be loaded to support airflow operators, etc.
75 | Disable this if you don't want to install airflow.
76 |
77 |
78 |
--------------------------------------------------------------------------------
/examples/brickflow_examples/.brickflow-project-root.yml:
--------------------------------------------------------------------------------
1 | # DO NOT MODIFY THIS FILE - IT IS AUTO GENERATED BY BRICKFLOW AND RESERVED FOR FUTURE USAGE
2 | projects:
3 | brickflow-demo:
4 | brickflow_version: auto
5 | deployment_mode: bundle
6 | enable_plugins: true
7 | name: brickflow-demo
8 | path_from_repo_root_to_project_root: .
9 | path_project_root_to_workflows_dir: workflows
10 | version: v1
11 |
--------------------------------------------------------------------------------
/examples/brickflow_examples/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 |
132 | # GENERATED BY BRICKFLOW CLI --START--
133 |
134 | ### Terraform ###
135 | # Local .terraform directories
136 | **/.terraform/*
137 |
138 | # .tfstate files
139 | *.tfstate
140 | *.tfstate.*
141 |
142 | # Crash log files
143 | crash.log
144 | crash.*.log
145 |
146 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as
147 | # password, private keys, and other secrets. These should not be part of version
148 | # control as they are data points which are potentially sensitive and subject
149 | # to change depending on the environment.
150 | *.tfvars
151 | *.tfvars.json
152 |
153 | # Ignore override files as they are usually used to override resources locally and so
154 | # are not checked in
155 | override.tf
156 | override.tf.json
157 | *_override.tf
158 | *_override.tf.json
159 |
160 | # Include override files you do wish to add to version control using negated pattern
161 | # !example_override.tf
162 |
163 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
164 | # example: *tfplan*
165 |
166 | # Ignore CLI configuration files
167 | .terraformrc
168 | terraform.rc
169 |
170 | # GENERATED BY BRICKFLOW CLI --END--
171 |
172 | .idea
173 | bundle.yml
--------------------------------------------------------------------------------
/examples/brickflow_examples/README.md:
--------------------------------------------------------------------------------
1 | # brickflow-examples
2 | This repository consists of examples for brickflow
3 |
4 | ## Getting Started
5 |
6 | ### Prerequisites
7 | 1. Install brickflows
8 |
9 | ```shell
10 | pip install brickflows
11 | ```
12 |
13 | 2. Install [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/databricks-cli.html)
14 |
15 | ```shell
16 | curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sudo sh
17 | ```
18 |
19 | 3. Configure the Databricks CLI with a workspace token. This configures your `~/.databrickscfg` file.
20 |
21 | ```shell
22 | databricks configure --token
23 | ```
24 |
25 | ### Clone the repository
26 |
27 | ```shell
28 | git clone https://github.com/Nike-Inc/brickflow.git
29 | cd brickflow/examples/brickflow_examples
30 | ```
31 |
32 | ### Hello World workflow
33 | - Create your first workflow using brickflow
34 | - Create a new file hello_world_workflow.py in the workflows directory
35 | - Add the following code to the file
36 | ```python
37 | from brickflow import (
38 | Cluster,
39 | Workflow,
40 | NotebookTask,
41 | )
42 | from brickflow.context import ctx
43 | from airflow.operators.bash import BashOperator
44 |
45 |
46 | cluster = Cluster(
47 | name="job_cluster",
48 | node_type_id="m6gd.xlarge",
49 | spark_version="13.3.x-scala2.12",
50 | min_workers=1,
51 | max_workers=2,
52 | )
53 |
54 | wf = Workflow(
55 | "hello_world_workflow",
56 | default_cluster=cluster,
57 | tags={
58 | "product_id": "brickflow_demo",
59 | },
60 | common_task_parameters={
61 | "catalog": "",
62 | "database": "",
63 | },
64 | )
65 |
66 | @wf.task
67 | # this task does nothing but explains the use of context object
68 | def start():
69 | print(f"Environment: {ctx.env}")
70 |
71 | @wf.notebook_task
72 | # this task runs a databricks notebook
73 | def example_notebook():
74 | return NotebookTask(
75 | notebook_path="notebooks/example_notebook.py",
76 | base_parameters={
77 | "some_parameter": "some_value", # in the notebook access these via dbutils.widgets.get("some_parameter")
78 | },
79 | )
80 |
81 |
82 | @wf.task(depends_on=[start, example_notebook])
83 | # this task runs a bash command
84 | def list_lending_club_data_files():
85 | return BashOperator(
86 | task_id=list_lending_club_data_files.__name__,
87 | bash_command="ls -lrt /dbfs/databricks-datasets/samples/lending_club/parquet/",
88 | )
89 |
90 | @wf.task(depends_on=list_lending_club_data_files)
91 | # this task runs the pyspark code
92 | def lending_data_ingest():
93 | ctx.spark.sql(
94 | f"""
95 | CREATE TABLE IF NOT EXISTS
96 | {ctx.dbutils_widget_get_or_else(key="catalog", debug="development")}.\
97 | {ctx.dbutils_widget_get_or_else(key="database", debug="dummy_database")}.\
98 | {ctx.dbutils_widget_get_or_else(key="brickflow_env", debug="local")}_lending_data_ingest
99 | USING DELTA -- this is default just for explicit purpose
100 | SELECT * FROM parquet.`dbfs:/databricks-datasets/samples/lending_club/parquet/`
101 | """
102 | )
103 | ```
104 | _Note: Modify the values of catalog/database for common_task_parameters._
105 |
106 | ### Update demo_wf.py
107 | - demo_wf.py explains the various tasks and options available for the tasks
108 | - You can remove demo_wf.py in case you just want to run hello_world_workflow.py
109 | - In case you want to run demo_wf.py, update the below params with your values (a sketch with illustrative values is at the end of this README)
110 | - default_cluster
111 | - common_task_parameters
112 | - permissions
113 | - default_task_settings
114 |
115 | ### Deploy the workflow to databricks
116 | ```shell
117 | brickflow projects deploy --project brickflow-demo -e local
118 | ```
119 |
120 | ### Run the demo workflow
121 | - login to databricks workspace
122 | - go to the workflows and select the workflow
123 | 
124 | - click on the run button
125 |
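126 | ### Illustrative values for demo_wf.py
127 | For reference, a minimal sketch of the settings listed under "Update demo_wf.py"; every value below is illustrative and must be replaced with ones valid in your workspace:
128 | 
129 | ```python
130 | from brickflow import Cluster, User, Workflow, WorkflowPermissions
131 | 
132 | wf = Workflow(
133 |     "demo_wf",
134 |     default_cluster=Cluster(
135 |         name="job_cluster",
136 |         node_type_id="m6gd.xlarge",  # illustrative instance type
137 |         spark_version="13.3.x-scala2.12",
138 |         min_workers=1,
139 |         max_workers=2,
140 |     ),
141 |     common_task_parameters={
142 |         "catalog": "development",  # illustrative catalog
143 |         "database": "my_database",  # illustrative database
144 |     },
145 |     permissions=WorkflowPermissions(
146 |         can_manage=[User("first.last@example.com")]  # illustrative user
147 |     ),
148 |     # default_task_settings can be adjusted in the same way if your tasks need it
149 | )
150 | ```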
--------------------------------------------------------------------------------
/examples/brickflow_examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_examples/__init__.py
--------------------------------------------------------------------------------
/examples/brickflow_examples/brickflow-multi-project.yml:
--------------------------------------------------------------------------------
1 | project_roots:
2 | brickflow-demo:
3 | root_yaml_rel_path: .
4 | version: v1
5 |
--------------------------------------------------------------------------------
/examples/brickflow_examples/notebooks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_examples/notebooks/__init__.py
--------------------------------------------------------------------------------
/examples/brickflow_examples/notebooks/example_notebook.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 |
3 | print("hello world")
4 |
--------------------------------------------------------------------------------
/examples/brickflow_examples/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_examples/src/__init__.py
--------------------------------------------------------------------------------
/examples/brickflow_examples/src/python/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_examples/src/python/__init__.py
--------------------------------------------------------------------------------
/examples/brickflow_examples/src/python/lending_data_show.py:
--------------------------------------------------------------------------------
1 | from brickflow.context import ctx
2 |
3 |
4 | def lending_data_print():
5 | ctx.spark.sql(
6 | """
7 | SELECT
8 | addr_state, *
9 | FROM
10 | parquet.`dbfs:/databricks-datasets/samples/lending_club/parquet/` limit 10
11 | """
12 | ).show(truncate=False)
13 |
14 |
15 | if __name__ == "__main__":
16 | lending_data_print()
17 |
--------------------------------------------------------------------------------
/examples/brickflow_examples/src/python/setup_data.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # MAGIC %fs ls dbfs:/databricks-datasets/samples/lending_club/parquet/
3 |
4 | # COMMAND ----------
5 |
6 | # MAGIC %sql
7 | # MAGIC SELECT
8 | # MAGIC addr_state, *
9 | # MAGIC FROM
10 | # MAGIC parquet.`dbfs:/databricks-datasets/samples/lending_club/parquet/`
11 |
12 | # COMMAND ----------
13 |
14 |
15 | # -- ingest step
16 | catalog = "development"
17 | database = "team_databricks_sme"
18 | spark.sql(
19 | f"""
20 | CREATE TABLE IF NOT EXISTS {catalog}.{database}.lending_data
21 | USING DELTA -- this is default just for explicit purpose
22 | SELECT * FROM parquet.`dbfs:/databricks-datasets/samples/lending_club/parquet/`
23 | """
24 | )
25 |
26 | # COMMAND ----------
27 |
28 | # Step 2
29 | catalog = "development"
30 | database = "team_databricks_sme"
31 | spark.sql(
32 | f"""
33 | OPTIMIZE {catalog}.{database}.lending_data;
34 | """
35 | )
36 |
37 | # COMMAND ----------
38 |
39 | # MAGIC %sql
40 | # MAGIC SELECT distinct addr_state FROM development.team_databricks_sme.lending_data
41 |
42 | # COMMAND ----------
43 |
44 |
45 | # -- T&S 1 process AZ data
46 | catalog = "development"
47 | database = "team_databricks_sme"
48 | spark.sql(
49 | f"""
50 | CREATE OR REPLACE TABLE {catalog}.{database}.lending_data_az_geo
51 | USING DELTA -- this is default just for explicit purpose
52 | SELECT * FROM {catalog}.{database}.lending_data where addr_state = 'AZ'
53 | """
54 | )
55 |
56 | # COMMAND ----------
57 |
58 | # -- T&S 2 process CA data
59 | catalog = "development"
60 | database = "team_databricks_sme"
61 | spark.sql(
62 | f"""
63 | CREATE OR REPLACE TABLE {catalog}.{database}.lending_data_ca_geo
64 | USING DELTA -- this is default just for explicit purpose
65 | SELECT * FROM {catalog}.{database}.lending_data where addr_state = 'CA'
66 | """
67 | )
68 |
69 | # COMMAND ----------
70 |
71 | # -- T&S 3 process IL data
72 | catalog = "development"
73 | database = "team_databricks_sme"
74 | spark.sql(
75 | f"""
76 | CREATE OR REPLACE TABLE {catalog}.{database}.lending_data_il_geo
77 | USING DELTA -- this is default just for explicit purpose
78 | SELECT * FROM {catalog}.{database}.lending_data where addr_state = 'IL'
79 | """
80 | )
81 |
82 | # COMMAND ----------
83 |
84 | # -- Union Data Together
85 | catalog = "development"
86 | database = "team_databricks_sme"
87 | spark.sql(
88 | f"""
89 | CREATE OR REPLACE TABLE {catalog}.{database}.lending_data_az_ca_il_geo
90 | USING DELTA -- this is default just for explicit purpose
91 | SELECT * FROM {catalog}.{database}.lending_data_az_geo
92 | UNION ALL
93 | SELECT * FROM {catalog}.{database}.lending_data_ca_geo
94 | UNION ALL
95 | SELECT * FROM {catalog}.{database}.lending_data_il_geo
96 | """
97 | )
98 |
99 | # COMMAND ----------
100 |
101 | # -- Union Data Together
102 | catalog = "development"
103 | database = "team_databricks_sme"
104 | spark.sql(
105 | f"""
106 | SELECT * FROM {catalog}.{database}.lending_data_az_ca_il_geo
107 | """
108 | ).limit(10).toPandas().to_csv("data.csv")
109 | with open("data.csv", "r") as f:
110 | print(f.read())
111 |
112 | # COMMAND ----------
113 |
--------------------------------------------------------------------------------
/examples/brickflow_examples/src/sql/sample.sql:
--------------------------------------------------------------------------------
1 | create or replace table $database.$schema.sample as
2 | select * from $database.$schema.source
--------------------------------------------------------------------------------
/examples/brickflow_examples/workflows/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_examples/workflows/__init__.py
--------------------------------------------------------------------------------
/examples/brickflow_examples/workflows/entrypoint.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 |
3 | import brickflow
4 | from brickflow import Project, PypiTaskLibrary
5 | import workflows
6 |
7 |
8 | def main() -> None:
9 | with Project(
10 | "brickflow-demo",
11 | git_repo="https://github.com/Nike-Inc/brickflow",
12 | provider="github",
13 | libraries=[
14 | PypiTaskLibrary(
15 | package="spark-expectations==0.8.0"
16 | ), # comment if spark-expectations is not needed
17 | ],
18 | ) as f:
19 | f.add_pkg(workflows)
20 |
21 |
22 | if __name__ == "__main__":
23 | main()
24 |
--------------------------------------------------------------------------------
/examples/brickflow_for_each_task_examples/.brickflow-project-root.yml:
--------------------------------------------------------------------------------
1 | # DO NOT MODIFY THIS FILE - IT IS AUTO GENERATED BY BRICKFLOW AND RESERVED FOR FUTURE USAGE
2 | projects:
3 | for_each_task_examples:
4 | brickflow_version: auto
5 | deployment_mode: bundle
6 | enable_plugins: true
7 | name: for_each_task_examples
8 | path_from_repo_root_to_project_root: examples/brickflow_for_each_task_examples
9 | path_project_root_to_workflows_dir: workflows
10 | version: v1
11 |
--------------------------------------------------------------------------------
/examples/brickflow_for_each_task_examples/README.md:
--------------------------------------------------------------------------------
1 | # Brickflow for each task examples
2 | This repository contains some examples of how to use the for each task type in brickflow.
3 |
4 | ## Getting Started
5 |
6 | ### Prerequisites
7 | 1. Install brickflows
8 |
9 | ```shell
10 | pip install brickflows
11 | ```
12 |
13 | 2. Install [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/databricks-cli.html)
14 |
15 | ```shell
16 | curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sudo sh
17 | ```
18 |
19 | 3. Configure the Databricks CLI with a workspace token. This configures your `~/.databrickscfg` file.
20 |
21 | ```shell
22 | databricks configure --token
23 | ```
24 |
25 | ### Clone the repository
26 |
27 | ```shell
28 | git clone https://github.com/Nike-Inc/brickflow.git
29 | cd brickflow/examples/brickflow_for_each_task_examples
30 | ```
31 |
32 | ### Customize the workflow
33 |
34 | Replace all the placeholders in `workflows/for_each_task_wf.py` with configuration values compatible with your Databricks workspace (see the sketch at the end of this README).
35 |
36 |
37 | ### Deploy the workflow to databricks
38 | ```shell
39 | brickflow projects deploy --project for_each_task_examples -e local
40 | ```
41 |
42 | ### Run the demo workflow
43 | - Log in to the Databricks workspace
44 | - Go to Workflows and select the workflow
45 | - Click on the run button
46 |
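47 | ### Illustrative placeholder values
48 | For reference, a minimal sketch of the placeholders in `workflows/for_each_task_wf.py`; every value below is illustrative and must be replaced with ones valid in your workspace:
49 | 
50 | ```python
51 | from brickflow import Cluster, JarTaskLibrary, User, WorkflowPermissions
52 | 
53 | cluster = Cluster(
54 |     name="job_cluster_for_each_task_examples",
55 |     driver_node_type_id="r7g.large",
56 |     node_type_id="r7g.large",
57 |     spark_version="13.3.x-scala2.12",
58 |     min_workers=1,
59 |     max_workers=1,
60 |     policy_id="E0123456789ABCDE",  # illustrative: an existing cluster policy id
61 | )
62 | 
63 | permissions = WorkflowPermissions(
64 |     can_manage=[User("first.last@example.com")]  # illustrative: an existing workspace user
65 | )
66 | 
67 | jar = JarTaskLibrary(jar="dbfs:/FileStore/jars/my_library.jar")  # illustrative jar path
68 | 
69 | # The SQL task additionally needs a query_id and warehouse_id that exist in your workspace.
70 | ```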
--------------------------------------------------------------------------------
/examples/brickflow_for_each_task_examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_for_each_task_examples/__init__.py
--------------------------------------------------------------------------------
/examples/brickflow_for_each_task_examples/brickflow-multi-project.yml:
--------------------------------------------------------------------------------
1 | project_roots:
2 | for_each_task_examples:
3 | root_yaml_rel_path: .
4 | version: v1
5 |
--------------------------------------------------------------------------------
/examples/brickflow_for_each_task_examples/notebooks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_for_each_task_examples/notebooks/__init__.py
--------------------------------------------------------------------------------
/examples/brickflow_for_each_task_examples/notebooks/example_notebook.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 |
3 | param = dbutils.widgets.get("looped_parameter")
4 | print(f"Hey this is a nested notebook running with inputs: {param}")
5 |
--------------------------------------------------------------------------------
/examples/brickflow_for_each_task_examples/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_for_each_task_examples/src/__init__.py
--------------------------------------------------------------------------------
/examples/brickflow_for_each_task_examples/src/python/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_for_each_task_examples/src/python/__init__.py
--------------------------------------------------------------------------------
/examples/brickflow_for_each_task_examples/src/python/print_args.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | if __name__ == "__main__":
4 | print(f"Hello, running with input {sys.argv}")
5 |
--------------------------------------------------------------------------------
/examples/brickflow_for_each_task_examples/workflows/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_for_each_task_examples/workflows/__init__.py
--------------------------------------------------------------------------------
/examples/brickflow_for_each_task_examples/workflows/entrypoint.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 |
3 | import brickflow
4 | from brickflow import Project
5 | import workflows
6 |
7 |
8 | def main() -> None:
9 | with Project(
10 | "for_each_task_examples",
11 | git_repo="https://github.com/Nike-Inc/brickflow",
12 | provider="github",
13 | ) as f:
14 | f.add_pkg(workflows)
15 |
16 |
17 | if __name__ == "__main__":
18 | main()
19 |
--------------------------------------------------------------------------------
/examples/brickflow_for_each_task_examples/workflows/for_each_task_wf.py:
--------------------------------------------------------------------------------
1 | from brickflow import (
2 | Workflow,
3 | WorkflowPermissions,
4 | User,
5 | NotebookTask,
6 | Cluster,
7 | JarTaskLibrary,
8 | SparkJarTask,
9 | SparkPythonTask,
10 | SqlTask,
11 | )
12 |
13 | from brickflow.context import ctx
14 | from brickflow.engine.task import JobsTasksForEachTaskConfigs
15 |
16 | cluster = Cluster(
17 | name=f"job_cluster_for_each_task_examples",
18 | driver_node_type_id="r7g.large",
19 | node_type_id="r7g.large",
20 | spark_version="13.3.x-scala2.12",
21 | min_workers=1,
22 | max_workers=1,
23 | policy_id="", # replace with an existing policy id
24 | )
25 |
26 | wf = Workflow(
27 | "for_each_task_examples_wf",
28 | default_cluster=cluster,
29 | permissions=WorkflowPermissions(
30 | can_manage=[
31 | User(
32 | "" # replace email with existing users' email on databricks
33 | )
34 | ],
35 | ),
36 | )
37 |
38 |
39 | @wf.task
40 | def example_task():
41 | print("This is a dependant task!")
42 |
43 |
44 | @wf.for_each_task(
45 | depends_on=example_task,
46 | for_each_task_conf=JobsTasksForEachTaskConfigs(
47 | # Inputs can be provided by either a python iterable or a json-string
48 | inputs=[
49 | "AZ",
50 | "CA",
51 | "IL",
52 | ],
53 | concurrency=3,
54 | ),
55 | )
56 | def example_notebook():
57 | return NotebookTask(
58 | notebook_path="notebooks/example_notebook.py",
59 | base_parameters={"looped_parameter": "{{input}}"},
60 | )
61 |
62 |
63 | @wf.for_each_task(
64 | depends_on=example_task,
65 | for_each_task_conf=JobsTasksForEachTaskConfigs(
66 | inputs='["1", "2", "3"]', concurrency=3
67 | ),
68 | )
69 | def example_brickflow_task(*, test_param="{{input}}"):
70 | print(f"Test param: {test_param}")
71 | param = ctx.get_parameter("looped_parameter")
72 | print(f"Nested brickflow task running with input: {param}")
73 |
74 |
75 | @wf.for_each_task(
76 | depends_on=example_task,
77 | libraries=[
78 | JarTaskLibrary(
79 | jar=""
80 | ) # Replace with actual jar path
81 | ],
82 | for_each_task_conf=JobsTasksForEachTaskConfigs(
83 | inputs="[1,2,3]",
84 | concurrency=1,
85 | ),
86 | )
87 | def for_each_spark_jar():
88 | return SparkJarTask(
89 | main_class_name="com.example.MainClass", # Replace with actual main class name
90 | parameters=["{{input}}"],
91 | )
92 |
93 |
94 | @wf.for_each_task(
95 | depends_on=example_task,
96 | for_each_task_conf=JobsTasksForEachTaskConfigs(
97 | inputs="[1,2,3]",
98 | concurrency=1,
99 | ),
100 | )
101 | def for_each_spark_python():
102 | return SparkPythonTask(
103 | python_file="examples/brickflow_for_each_task_examples/src/python/print_args.py",
104 | source="WORKSPACE",
105 | parameters=["{{input}}"],
106 | )
107 |
108 |
109 | @wf.for_each_task(
110 | depends_on=example_notebook,
111 | for_each_task_conf=JobsTasksForEachTaskConfigs(
112 | inputs="[1,2,3]",
113 | concurrency=1,
114 | ),
115 | )
116 | def for_each_sql_task() -> any:
117 | return SqlTask(
118 | query_id="", # Replace with actual query id
119 | warehouse_id="", # Replace with actual warehouse id
120 | parameters={"looped_parameter": "{{input}}"},
121 | )
122 |
--------------------------------------------------------------------------------
/examples/brickflow_serverless_examples/.brickflow-project-root.yml:
--------------------------------------------------------------------------------
1 | # DO NOT MODIFY THIS FILE - IT IS AUTO GENERATED BY BRICKFLOW AND RESERVED FOR FUTURE USAGE
2 | projects:
3 | brickflow-serverless-demo:
4 | brickflow_version: auto
5 | deployment_mode: bundle
6 | enable_plugins: true
7 | name: brickflow-serverless-demo
8 | path_from_repo_root_to_project_root: .
9 | path_project_root_to_workflows_dir: workflows
10 | version: v1
11 |
--------------------------------------------------------------------------------
/examples/brickflow_serverless_examples/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 |
132 | # GENERATED BY BRICKFLOW CLI --START--
133 |
134 | ### Terraform ###
135 | # Local .terraform directories
136 | **/.terraform/*
137 |
138 | # .tfstate files
139 | *.tfstate
140 | *.tfstate.*
141 |
142 | # Crash log files
143 | crash.log
144 | crash.*.log
145 |
146 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as
147 | # password, private keys, and other secrets. These should not be part of version
148 | # control as they are data points which are potentially sensitive and subject
149 | # to change depending on the environment.
150 | *.tfvars
151 | *.tfvars.json
152 |
153 | # Ignore override files as they are usually used to override resources locally and so
154 | # are not checked in
155 | override.tf
156 | override.tf.json
157 | *_override.tf
158 | *_override.tf.json
159 |
160 | # Include override files you do wish to add to version control using negated pattern
161 | # !example_override.tf
162 |
163 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
164 | # example: *tfplan*
165 |
166 | # Ignore CLI configuration files
167 | .terraformrc
168 | terraform.rc
169 |
170 | # GENERATED BY BRICKFLOW CLI --END--
171 |
172 | .idea
173 | bundle.yml
--------------------------------------------------------------------------------
/examples/brickflow_serverless_examples/README.md:
--------------------------------------------------------------------------------
1 | # Brickflows Serverless Example
2 | This project contains an example of a serverless workflow that includes:
3 | - notebook task
4 | - python task
5 | - native Brickflow entrypoint task
6 |
7 | Note that for the notebook task and the entrypoint task, the dependencies are set through magic `%pip install` commands within
8 | the notebook (a minimal sketch is at the end of this README).
9 |
10 | ## Getting Started
11 |
12 | ### Prerequisites
13 | 1. Install brickflows
14 |
15 | ```shell
16 | pip install brickflows
17 | ```
18 |
19 | 2. Install [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/databricks-cli.html)
20 |
21 | ```shell
22 | curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sudo sh
23 | ```
24 |
25 | 3. Configure the Databricks CLI with a workspace token. This configures your `~/.databrickscfg` file.
26 |
27 | ```shell
28 | databricks configure --token
29 | ```
30 |
31 | ### Clone the repository
32 |
33 | ```shell
34 | git clone https://github.com/Nike-Inc/brickflow.git
35 | cd brickflow/examples/brickflow_serverless_examples
36 | ```
37 |
38 | ### Deploy the workflow to databricks
39 | ```shell
40 | brickflow projects deploy --project brickflow-serverless-demo -e local
41 | ```
42 |
43 | ### Run the demo workflow
44 | - Log in to the Databricks workspace
45 | - Go to Workflows and select the workflow
46 | - Click on the run button
47 |
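48 | ### How the dependencies are installed
49 | For reference, a minimal sketch of the notebook magics that declare the dependencies for the serverless tasks; the versions mirror the ones used elsewhere in this example:
50 | 
51 | ```python
52 | # Databricks notebook source
53 | # MAGIC %pip install brickflows==1.2.1
54 | # MAGIC %pip install pytz==2024.2
55 | # MAGIC %restart_python
56 | 
57 | # COMMAND ----------
58 | # the task / entrypoint code follows in the next cells
59 | ```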
--------------------------------------------------------------------------------
/examples/brickflow_serverless_examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_serverless_examples/__init__.py
--------------------------------------------------------------------------------
/examples/brickflow_serverless_examples/brickflow-multi-project.yml:
--------------------------------------------------------------------------------
1 | project_roots:
2 | brickflow-serverless-demo:
3 | root_yaml_rel_path: .
4 | version: v1
5 |
--------------------------------------------------------------------------------
/examples/brickflow_serverless_examples/notebooks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_serverless_examples/notebooks/__init__.py
--------------------------------------------------------------------------------
/examples/brickflow_serverless_examples/notebooks/example_notebook.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # MAGIC %pip install pytz==2024.2
3 |
4 | # COMMAND ----------
5 | import pytz
6 | from datetime import datetime
7 |
8 |
9 | def get_current_time_in_timezone(timezone_str):
10 | # Get the timezone object
11 | timezone = pytz.timezone(timezone_str)
12 | # Get the current time in the specified timezone
13 | current_time = datetime.now(timezone)
14 | return current_time
15 |
16 |
17 | # Example usage
18 | timezones = ["UTC", "Europe/Amsterdam", "Asia/Tokyo", "America/New_York"]
19 | for tz in timezones:
20 | print(f"Current time in {tz}: {get_current_time_in_timezone(tz)}")
21 |
--------------------------------------------------------------------------------
/examples/brickflow_serverless_examples/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_serverless_examples/src/__init__.py
--------------------------------------------------------------------------------
/examples/brickflow_serverless_examples/src/python/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_serverless_examples/src/python/__init__.py
--------------------------------------------------------------------------------
/examples/brickflow_serverless_examples/src/python/example.py:
--------------------------------------------------------------------------------
1 | import pytz
2 | from datetime import datetime
3 | import argparse
4 |
5 |
6 | def get_current_time_in_timezone(timezone_str):
7 | # Get the timezone object
8 | timezone = pytz.timezone(timezone_str)
9 | # Get the current time in the specified timezone
10 | current_time = datetime.now(timezone)
11 | return current_time
12 |
13 |
14 | if __name__ == "__main__":
15 | parser = argparse.ArgumentParser(
16 | description="Get the current time in a specified timezone."
17 | )
18 | parser.add_argument(
19 | "--timezone",
20 | type=str,
21 | required=True,
22 | help="The timezone to get the current time for.",
23 | )
24 | args = parser.parse_args()
25 |
26 | try:
27 | current_time = get_current_time_in_timezone(args.timezone)
28 | print(f"Current time in {args.timezone}: {current_time}")
29 | except pytz.UnknownTimeZoneError:
30 | print(f"Unknown timezone: {args.timezone}")
31 |
--------------------------------------------------------------------------------
/examples/brickflow_serverless_examples/workflows/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_serverless_examples/workflows/__init__.py
--------------------------------------------------------------------------------
/examples/brickflow_serverless_examples/workflows/demo_serverless_wf.py:
--------------------------------------------------------------------------------
1 | from brickflow import (
2 | Workflow,
3 | NotebookTask,
4 | SparkPythonTask,
5 | )
6 | from brickflow.engine.task import PypiTaskLibrary
7 |
8 | wf = Workflow(
9 | "brickflow-serverless-demo",
10 | schedule_quartz_expression="0 0/20 0 ? * * *",
11 | libraries=[
12 | PypiTaskLibrary(package="pytz==2024.2"),
13 | # Custom repositories are not supported for serverless workloads, due to Databricks CLI limitations.
14 | # Refer to: https://github.com/databricks/cli/pull/1842. This will be fixed in future releases; use a wheel instead.
15 | # PypiTaskLibrary(
16 | # package="my-lib==1.2.3", repo="https://artifactory.my-org.com/api/pypi/python-virtual/simple"
17 | # ),
18 | ],
19 | )
20 |
21 |
22 | @wf.task
23 | def entrypoint_task():
24 | pass
25 |
26 |
27 | @wf.notebook_task
28 | def notebook_task():
29 | return NotebookTask(
30 | notebook_path="notebooks/example_notebook.py",
31 | base_parameters={
32 | "some_parameter": "some_value", # in the notebook access these via dbutils.widgets.get("some_parameter")
33 | },
34 | ) # type: ignore
35 |
36 |
37 | @wf.spark_python_task
38 | def spark_python_task():
39 | return SparkPythonTask(
40 | python_file="/src/python/example.py",
41 | source="GIT",
42 | parameters=["--timezone", "UTC"],
43 | ) # type: ignore
44 |
--------------------------------------------------------------------------------
/examples/brickflow_serverless_examples/workflows/entrypoint.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # This should point to the `brickflows` version with serverless support or the wheel file with the same
3 | # MAGIC %pip install brickflows==1.2.1
4 | # MAGIC %pip install koheesio==0.8.1
5 | # MAGIC %restart_python
6 |
7 | # COMMAND ----------
8 | import brickflow
9 | from brickflow import Project, PypiTaskLibrary
10 | import workflows
11 |
12 |
13 | def main() -> None:
14 | with Project(
15 | "brickflow-serverless-demo",
16 | git_repo="https://github.com/Nike-Inc/brickflow",
17 | provider="github",
18 | ) as f:
19 | f.add_pkg(workflows)
20 |
21 |
22 | if __name__ == "__main__":
23 | main()
24 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: BrickFlow
2 | site_description: Brickflow is a tool for managing and deploying scalable workflows on Databricks.
3 | site_url: https://brickflow.readthedocs.io/en/latest/
4 |
5 | theme:
6 | name: material
7 | palette:
8 | - scheme: default
9 | primary: indigo
10 | accent: indigo
11 | toggle:
12 | icon: material/brightness-7
13 | name: Switch to dark mode
14 | - scheme: slate
15 | primary: indigo
16 | accent: indigo
17 | toggle:
18 | icon: material/brightness-4
19 | name: Switch to light mode
20 | features:
21 | # - announce.dismiss
22 | - content.code.annotate
23 | # - content.tabs.link
24 | - content.tooltips
25 | - content.code.copy
26 | # - header.autohide
27 | # - navigation.expand
28 | - navigation.indexes
29 | - navigation.instant
30 | # - navigation.prune
31 | # - navigation.sections
32 | - navigation.tabs
33 | - navigation.tabs.sticky
34 | - navigation.top
35 | - navigation.tracking
36 | - navigation.expand
37 | - search.highlight
38 | - search.share
39 | - search.suggest
40 | - toc.follow
41 | font:
42 | text: Roboto
43 | code: Roboto Mono
44 | logo: img/bf_logo.png
45 | favicon: img/bf_logo.png
46 | language: en
47 |
48 | repo_name: nike/brickflow
49 | repo_url: https://github.com/Nike-Inc/brickflow
50 |
51 | plugins:
52 | - search:
53 | lang: en
54 | - mkdocstrings:
55 | handlers:
56 | python:
57 | paths: [ "brickflow" ] # search packages in the src folder
58 | options:
59 | show_source: true
60 | show_root_heading: false
61 | heading_level: 1
62 | merge_init_into_class: true
63 | show_if_no_docstring: true
64 | show_root_full_path: true
65 | show_root_members_full_path: true
66 | show_root_toc_entry: false
67 | show_category_heading: true
68 | show_signature_annotations: true
69 | separate_signature: false
70 |
71 | markdown_extensions:
72 | - abbr
73 | - admonition
74 | - mkdocs-click
75 | - attr_list
76 | - def_list
77 | - footnotes
78 | - md_in_html
79 | - toc:
80 | permalink: true
81 | - pymdownx.arithmatex:
82 | generic: true
83 | - pymdownx.betterem:
84 | smart_enable: all
85 | - pymdownx.caret
86 | - pymdownx.details
87 | - pymdownx.emoji:
88 | emoji_generator: !!python/name:materialx.emoji.to_svg
89 | emoji_index: !!python/name:materialx.emoji.twemoji
90 | - pymdownx.highlight:
91 | anchor_linenums: true
92 | - pymdownx.inlinehilite
93 | - pymdownx.keys
94 | - pymdownx.magiclink:
95 | repo_url_shorthand: true
96 | user: squidfunk
97 | repo: mkdocs-material
98 | - pymdownx.mark
99 | - pymdownx.smartsymbols
100 | - pymdownx.superfences:
101 | custom_fences:
102 | - name: mermaid
103 | class: mermaid
104 | format: !!python/name:pymdownx.superfences.fence_code_format
105 | - pymdownx.tabbed:
106 | alternate_style: true
107 | - pymdownx.tasklist:
108 | custom_checkbox: true
109 | - pymdownx.tilde
110 |
111 | watch:
112 | - brickflow
113 | - brickflow_plugins
114 |
115 | extra_css:
116 | - css/custom.css
117 |
118 | nav:
119 | - Home: index.md
120 | - Quickstart:
121 | - Brickflow Projects: bundles-quickstart.md
122 | - Upgrading Versions:
123 | - Upgrading to v0.10.x: upgrades/upgrade-pre-0-10-0-to-0-10-0.md
124 | - Concepts:
125 | - HighLevel: highlevel.md
126 | - Workflows: workflows.md
127 | - Tasks: tasks.md
128 | - Projects: projects.md
129 | - ENV Variables: environment-variables.md
130 | - Importing Modules: how-imports-work.md
131 | - FAQ: faq/faq.md
132 | - CLI:
133 | - Commands: cli/reference.md
134 | - Python API:
135 | - Engine:
136 | - Project: api/project.md
137 | - Workflow: api/workflow.md
138 | - Compute: api/compute.md
139 | - Task: api/task.md
140 | - Context: api/context.md
141 | - CLI: api/cli.md
142 | - Brickflow Plugins:
143 | - AirflowTaskDependencySensor: api/airflow_external_task_dependency.md
144 | - AirflowNativeOperators: api/airflow_native_operators.md
145 | - WorkflowDependencySensor: api/workflow_dependency_sensor.md
146 | - SnowflakeOperator: api/uc_to_snowflake_operator.md
147 | - UcToSnowflakeOperator: api/uc_to_snowflake_operator.md
148 | - Secrets: api/secrets.md
149 | - TableauRefreshDataSourceOperator: api/airflow_tableau_operators.md
150 | - TableauRefreshWorkbookOperator: api/airflow_tableau_operators.md
151 | - BoxToVolumeOperator: api/box_operator.md
152 | - VolumeToBoxOperator: api/box_operator.md
153 | - BoxOperator: api/box_operator.md
154 |
155 |
156 | extra:
157 | generator: false
158 | version:
159 | provider: mike
160 | default: latest
--------------------------------------------------------------------------------
/prospector.yaml:
--------------------------------------------------------------------------------
1 | strictness: high
2 | test-warnings: True
3 | doc-warnings: false
4 |
5 | ignore-paths:
6 | - build
7 | - venv
8 | - venv3
9 | - venv2
10 | - site
11 | - docs
12 | - tests/engine/sample_workflows.py
13 | - tools
14 | - .databricks
15 | - .mypy_cache
16 | - brickflow/bundles
17 | - brickflow/sample_dags
18 | - main.py
19 | - main2.py
20 | - .eggs
21 | - htmlcov
22 | - sample_workflows
23 | - integration_workflows
24 | - scripts
25 | - tests/test_brickflow.py
26 | - examples
27 | - brickflow_plugins # will eventually need to remove once there are tests and linting logic is applied
28 |
29 | max-line-length: 120
30 |
31 | pylint:
32 | disable:
33 | - too-many-branches
34 | - too-many-statements
35 | - too-many-instance-attributes
36 | - cyclic-import
37 | - len-as-condition
38 | - invalid-name
39 | - no-else-return
40 | - no-self-use
41 | - protected-access
42 | - too-many-arguments
43 | - too-many-locals # TBD: this rule is actually a good one, we need to enable it and refactor code
44 | - inconsistent-return-statements
45 | - import-outside-toplevel
46 | - consider-using-set-comprehension
47 | - useless-object-inheritance
48 | - unnecessary-pass
49 | - raise-missing-from # pretty strange requirement with acquaint logic
50 | - broad-except
51 | - arguments-differ
52 |
53 | pycodestyle:
54 | # W293: disabled because we have newlines in docstrings
55 | # E203: disabled because pep8 and black disagree on whitespace before colon in some cases
56 | disable: W293,E203 # conflicts with black formatting
57 |
58 | pyflakes:
59 | disable:
60 | - F821 # ignore undefined name errors
61 |
62 | mccabe:
63 | disable:
64 | - MC0001
65 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "brickflows"
3 | version = "0.11.0a0"
4 | description = "Deploy scalable workflows to databricks using python"
5 | authors = ["Ashok Singamaneni, Sriharsha Tikkireddy"]
6 | readme = "README.md"
7 | license = "Apache License 2.0"
8 | homepage = "https://github.com/Nike-Inc/brickflow"
9 | repository = "https://github.com/Nike-Inc/brickflow"
10 | packages = [{ include = "brickflow" }, { include = "brickflow_plugins" }]
11 | include = ["LICENSE", "entrypoint.template", "gitignore_template.txt"]
12 | exclude = ["sample_workflows", "tests"]
13 |
14 | [tool.black]
15 | line-length = 88
16 | target-version = ['py39', 'py310']
17 | include = '\.pyi?$'
18 | extend-exclude = '''
19 | /(
20 | # The following are specific to Black, you probably don't want those.
21 | | brickflow/tf
22 | | venv
23 | | brickflow.egg-info
24 | | dist
25 | | brickflow/bundles
26 | )/
27 | '''
28 |
29 | [tool.poetry.dependencies]
30 | python = ">=3.9,<3.12" # pyspark <3.5 does not play happily with Python 3.11. The latest Databricks Runtime (15.4) ships with Python 3.11.
31 | Jinja2 = ">=3.1.5"
32 | click = "^8.1.3"
33 | databricks-sdk = ">=0.1.8 <1.0.0"
34 | networkx = "3.1"
35 | pendulum = "2.1.2"
36 | pluggy = "^1.0.0"
37 | pydantic = ">=2.0.0 <3.0.0"
38 | python-decouple = "3.8"
39 | pyyaml = "^6.0"
40 | requests = ">=2.28.2 <3.0.0"
41 | # cerberus-python-client = {version = "~2.5.4", optional = true } # Users might have to manually install cerberus-python-client if required
42 | # tableauserverclient = {version = "~0.25", optional = true } # Users might have to manually install tableauserverclient if required
43 |
44 |
45 | [tool.poetry.scripts]
46 | bf = "brickflow.cli:cli"
47 | brickflow = "brickflow.cli:cli"
48 |
49 | [tool.poetry.group.dev.dependencies]
50 | black = "^24.3.0"
51 | coverage = "^7.2.5"
52 | datamodel-code-generator = "^0.25.2"
53 | deepdiff = "^6.3.0"
54 | mypy = "^1.3.0"
55 | pre-commit = "^3.3.1"
56 | prospector = "^1.10.3"
57 | py4j = "^0.10.9.7"
58 | pytest = ">=7.3.1 <8.0.0"
59 | pytest-mock = "^3.10.0"
60 | types-PyYAML = "*" # only for development purposes no need to make installation req
61 | types-requests = ">=2.28.11.16 <3.0.0.0" # only for development purposes no need to make installation req
62 | apache-airflow = "^2.7.3"
63 | snowflake = "^0.6.0"
64 | tableauserverclient = "^0.25"
65 | boxsdk = "^3.9.2"
66 | cerberus-python-client = "^2.5.4"
67 | watchdog = "<4.0.0"
68 | requests-mock = "1.12.1"
69 | pyspark = "^3.0.0"
70 | apache-airflow-providers-fab = ">=1.5.2"
71 |
72 | [tool.poetry.group.docs.dependencies]
73 | mdx-include = "^1.4.2"
74 | mike = "^2.1.3"
75 | mkdocs-click = "^0.8.1"
76 | mkdocs-material = "^9.5.49"
77 | mkdocstrings = { extras = ["python"], version = "^0.27.0" }
78 |
79 | [build-system]
80 | requires = ["poetry-core", "poetry-dynamic-versioning"]
81 | build-backend = "poetry_dynamic_versioning.backend"
82 |
83 | [tool.poetry-dynamic-versioning]
84 | enable = true
85 | vcs = "git"
86 | bump = true
87 | style = "semver"
88 |
89 | [tool.coverage]
90 | [tool.coverage.run]
91 | omit = [
92 | # omit anything in a .local directory anywhere
93 | '*/.local/*',
94 | '**',
95 | 'tests/*',
96 | '*/tests/*',
97 | # omit anything in a .venv directory anywhere
98 | '.venv/*',
99 | "*/site-packages/*",
100 | ]
101 |
102 | [tool.coverage.report]
103 | skip_empty = true
104 |
105 | [tool.mypy]
106 | disallow_untyped_defs = true
107 | ignore_missing_imports = true
108 | files = [
109 | "brickflow/context/*.py",
110 | "brickflow/cli/*.py",
111 | "brickflow/hints/*.py",
112 | "brickflow/engine/*.py",
113 | "brickflow/resolver/*.py",
114 | "brickflow/codegen/*.py",
115 | ]
116 | follow_imports = "skip"
117 |
118 | [tool.pylint.main]
119 | fail-under = 9.0
120 |
121 |
122 | [tool.pylint."messages control"]
123 | disable = ["too-many-lines", "too-many-positional-arguments"]
124 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/tests/__init__.py
--------------------------------------------------------------------------------
/tests/airflow_plugins/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/tests/airflow_plugins/__init__.py
--------------------------------------------------------------------------------
/tests/airflow_plugins/test_autosys.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from requests.exceptions import HTTPError
3 | from requests_mock.mocker import Mocker as RequestsMocker
4 |
5 | from brickflow_plugins.airflow.operators.external_tasks import AutosysSensor
6 |
7 |
8 | class TestAutosysSensor:
9 | @pytest.fixture(autouse=True, name="api", scope="class")
10 | def mock_api(self):
11 | rm = RequestsMocker()
12 | rm.register_uri(
13 | method="GET",
14 | url="https://42.autosys.my-org.com/foo",
15 | response_list=[
16 | # Test 1: Success
17 | {
18 | "json": {"status": "SU", "lastEndUTC": "2024-01-01T00:55:00Z"},
19 | "status_code": int(200),
20 | },
21 | # Test 2: Raise Error
22 | {
23 | "json": {},
24 | "status_code": int(404),
25 | },
26 | # Test 3: Poke 4 times until success
27 | {
28 | "json": {"status": "FA", "lastEndUTC": "2024-01-01T00:55:00Z"},
29 | "status_code": int(200),
30 | },
31 | {
32 | "json": {"status": "UNK", "lastEndUTC": None},
33 | "status_code": int(200),
34 | },
35 | {
36 | "json": {"status": "UNK", "lastEndUTC": ""},
37 | "status_code": int(200),
38 | },
39 | {
40 | "json": {"status": "SU", "lastEndUTC": "2024-01-01T01:55:00Z"},
41 | "status_code": int(200),
42 | },
43 | ],
44 | )
45 | yield rm
46 |
47 | @pytest.fixture()
48 | def sensor(self):
49 | yield AutosysSensor(
50 | task_id="test",
51 | url="https://42.autosys.my-org.com/",
52 | job_name="foo",
53 | poke_interval=1,
54 | time_delta={"hours": 1},
55 | )
56 |
57 | def test_success(self, api, caplog, sensor):
58 | with api:
59 | sensor.poke(context={"execution_date": "2024-01-01T01:00:00Z"})
60 | assert caplog.text.count("Poking again") == 0
61 | assert "Success criteria met. Exiting" in caplog.text
62 |
63 | def test_non_200(self, api, sensor):
64 | with pytest.raises(HTTPError):
65 | with api:
66 | sensor.poke(context={"execution_date": "2024-01-01T01:00:00Z"})
67 |
68 | def test_poking(self, api, caplog, sensor):
69 | with api:
70 | sensor.poke(context={"execution_date": "2024-01-01T02:00:00Z"})
71 | assert caplog.text.count("Poking again") == 3
72 | assert "Success criteria met. Exiting" in caplog.text
73 |
--------------------------------------------------------------------------------
/tests/cli/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/tests/cli/__init__.py
--------------------------------------------------------------------------------
/tests/cli/sample_yaml_project/.brickflow-project-root.yaml:
--------------------------------------------------------------------------------
1 | version: v1
2 | projects:
3 | test_cli_project:
4 | name: test_cli_project
5 | brickflow_version: 1.2.1
6 | deployment_mode: bundle
7 | enable_plugins: false
8 | path_from_repo_root_to_project_root: some/test/path
9 | path_project_root_to_workflows_dir: path/to/workflows
--------------------------------------------------------------------------------
/tests/cli/sample_yaml_project/brickflow-multi-project.yaml:
--------------------------------------------------------------------------------
1 | version: v1
2 | project_roots:
3 | test_cli_project:
4 | root_yaml_rel_path: .
5 |
--------------------------------------------------------------------------------
/tests/cli/sample_yml_project/.brickflow-project-root.yml:
--------------------------------------------------------------------------------
1 | version: v1
2 | projects:
3 | test_cli_project:
4 | name: test_cli_project
5 | brickflow_version: 1.2.1
6 | deployment_mode: bundle
7 | enable_plugins: false
8 | path_from_repo_root_to_project_root: some/test/path
9 | path_project_root_to_workflows_dir: path/to/workflows
--------------------------------------------------------------------------------
/tests/cli/sample_yml_project/brickflow-multi-project.yml:
--------------------------------------------------------------------------------
1 | version: v1
2 | project_roots:
3 | test_cli_project:
4 | root_yaml_rel_path: .
5 |
--------------------------------------------------------------------------------
/tests/cli/test_bundles.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | from typing import Optional
4 | from unittest.mock import patch, Mock
5 | from pytest import LogCaptureFixture
6 | import pytest
7 |
8 | from brickflow import BrickflowEnvVars, _ilog
9 | from brickflow.cli.bundles import bundle_deploy, bundle_destroy
10 |
11 |
12 | class TestBundles:
13 | @patch("brickflow.cli.bundles.should_deploy", return_value=True)
14 | @patch("brickflow.cli.bundles.exec_command")
15 | @patch.dict(
16 | os.environ, {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "0.203.0"}
17 | )
18 | def test_bundle_deploy_new_cli(self, mock_exec_command: Mock, _: Mock):
19 | mock_exec_command.side_effect = lambda *args, **kwargs: None
20 | mock_exec_command.return_value = None
21 | # workflows_dir needed to make the function work due to bundle sync
22 | bundle_deploy(
23 | force_acquire_lock=True,
24 | workflows_dir="somedir",
25 | debug=True,
26 | fail_on_active_runs=True,
27 | )
28 | bundle_cli = os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value]
29 | mock_exec_command.assert_called_with(
30 | bundle_cli,
31 | "bundle",
32 | [
33 | "deploy",
34 | "-t",
35 | "local",
36 | "--fail-on-active-runs",
37 | "--force-lock",
38 | "--debug",
39 | ],
40 | )
41 | bundle_destroy(force_acquire_lock=True, workflows_dir="somedir", debug=True)
42 | bundle_cli = os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value]
43 | mock_exec_command.assert_called_with(
44 | bundle_cli,
45 | "bundle",
46 | ["destroy", "-t", "local", "--force-lock", "--debug"],
47 | )
48 |
49 | @patch("brickflow.cli.bundles.should_deploy", return_value=True)
50 | @patch("brickflow.cli.bundles.exec_command")
51 | @patch.dict(
52 | os.environ,
53 | {
54 | BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "0.201.0",
55 | BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value: "databricks",
56 | },
57 | )
58 | def test_bundle_deploy_old_cli(self, mock_exec_command: Mock, _: Mock):
59 | mock_exec_command.side_effect = lambda *args, **kwargs: None
60 | mock_exec_command.return_value = None
61 | # workflows_dir needed to make the function work due to bundle sync
62 | bundle_deploy(force_acquire_lock=True, workflows_dir="somedir")
63 | bundle_cli = os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value]
64 | mock_exec_command.assert_called_with(
65 | bundle_cli,
66 | "bundle",
67 | ["deploy", "-t", "local", "--force"],
68 | )
69 | bundle_destroy(force_acquire_lock=True, workflows_dir="somedir")
70 | bundle_cli = os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value]
71 | mock_exec_command.assert_called_with(
72 | bundle_cli,
73 | "bundle",
74 | ["destroy", "-t", "local", "--force"],
75 | )
76 |
77 | @patch("brickflow.cli.bundles.exec_command")
78 | @patch.dict(
79 | os.environ, {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "0.203.0"}
80 | )
81 | def test_deploy_no_workflows(
82 | self, mock_exec_command: Mock, caplog: LogCaptureFixture
83 | ):
84 | mock_exec_command.side_effect = lambda *args, **kwargs: None
85 | mock_exec_command.return_value = None
86 |
87 | # Adjusting the log level and propagating it to the root logger to make sure it's captured by caplog
88 | _ilog.propagate = True
89 | _ilog.level = logging.WARN
90 |
91 | with caplog.at_level(logging.WARN):
92 | # running this should not fail but log a warning stating that no bundle has been found
93 | bundle_deploy(force_acquire_lock=True, workflows_dir="somedir")
94 |
95 | assert "No bundle.yml found, skipping deployment." in [
96 | rec.message for rec in caplog.records
97 | ]
98 |
99 | @pytest.mark.parametrize(
100 | "input_arch,expected_arch",
101 | [
102 | ("x86_64", "amd64"), # Test one x86_64 variant
103 | ("amd64", "amd64"), # Test alternative x86_64 name
104 | ("i386", "386"), # Test one 32-bit variant
105 | ("i686", "386"), # Test alternative 32-bit name
106 | ("arm64", "arm64"), # Test one ARM variant
107 | ("aarch64", "arm64"), # Test alternative ARM name
108 | ("X86_64", "amd64"), # Test case insensitivity
109 | ("unsupported_arch", None), # Test unsupported architecture
110 | ],
111 | )
112 | def test_get_arch_mappings(
113 | self, input_arch: str, expected_arch: Optional[str]
114 | ) -> None:
115 | from brickflow.cli.bundles import get_arch
116 |
117 | with patch("platform.machine") as mock_machine:
118 | mock_machine.return_value = input_arch
119 |
120 | if expected_arch is None:
121 | with pytest.raises(RuntimeError) as exc_info:
122 | get_arch()
123 | assert f"Unsupported architecture: {input_arch}" in str(exc_info.value)
124 | else:
125 | assert get_arch() == expected_arch
126 |
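Taken together, the parametrized cases above pin down the architecture-normalization behaviour under test. The following is a hedged sketch reconstructed purely from those expectations, not the actual `get_arch` implementation in `brickflow.cli.bundles`:

```python
# Hypothetical sketch inferred from the test expectations above;
# the real brickflow.cli.bundles.get_arch may differ in detail.
import platform

_ARCH_ALIASES = {
    "x86_64": "amd64",
    "amd64": "amd64",
    "i386": "386",
    "i686": "386",
    "arm64": "arm64",
    "aarch64": "arm64",
}


def get_arch_sketch() -> str:
    machine = platform.machine().lower()  # lower-cased, per the "X86_64" case
    try:
        return _ARCH_ALIASES[machine]
    except KeyError:
        raise RuntimeError(f"Unsupported architecture: {machine}") from None
```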
--------------------------------------------------------------------------------
/tests/cli/test_cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import traceback
4 | from unittest.mock import patch, Mock
5 |
6 | import click
7 | from click.testing import CliRunner
8 |
9 | from brickflow import BrickflowProjectDeploymentSettings, BrickflowEnvVars
10 | from brickflow.cli import (
11 | cli,
12 | exec_command,
13 | )
14 | from brickflow.cli.bundles import (
15 | bundle_download_path,
16 | download_and_unzip_databricks_cli,
17 | get_force_lock_flag,
18 | )
19 | from brickflow.cli.projects import handle_libraries
20 |
21 |
22 | def fake_run(*_, **__):
23 | click.echo("hello world")
24 |
25 |
26 | # TODO: Add more tests to the cli
27 | class TestCli:
28 | def test_no_command_error(self):
29 | runner = CliRunner()
30 | non_existent_command = "non_existent_command"
31 | result = runner.invoke(cli, ["non_existent_command"]) # noqa
32 | assert result.exit_code == 2
33 | assert result.output.strip().endswith(
34 | f"Error: No such command '{non_existent_command}'."
35 | )
36 |
37 | @patch("webbrowser.open")
38 | def test_docs(self, browser: Mock):
39 | runner = CliRunner()
40 | browser.return_value = None
41 | result = runner.invoke(cli, ["docs"]) # noqa
42 | assert result.exit_code == 0, traceback.print_exception(*result.exc_info)
43 | assert result.output.strip().startswith("Opening browser for docs...")
44 | browser.assert_called_once_with(
45 | "https://engineering.nike.com/brickflow/", new=2
46 | )
47 |
48 | def test_force_arg(self):
49 | with patch.dict(
50 | os.environ, {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "0.203.0"}
51 | ):
52 | assert get_force_lock_flag() == "--force-lock"
53 | with patch.dict(
54 | os.environ, {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "auto"}
55 | ):
56 | assert get_force_lock_flag() == "--force-lock"
57 | with patch.dict(
58 | os.environ,
59 | {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "something else"},
60 | ):
61 | assert get_force_lock_flag() == "--force-lock"
62 | with patch.dict(
63 | os.environ, {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "0.202.0"}
64 | ):
65 | assert get_force_lock_flag() == "--force"
66 |
67 | def test_install_cli(self):
68 | expected_version = "0.200.0"
69 | url = bundle_download_path(expected_version)
70 | file_path = download_and_unzip_databricks_cli(url, expected_version)
71 | assert url is not None
72 | version_value = exec_command(file_path, "--version", [], capture_output=True)
73 | assert (
74 | version_value.strip() == f"Databricks CLI v{expected_version}"
75 | ), version_value
76 | directory_path = ".databricks"
77 | if os.path.exists(directory_path):
78 | shutil.rmtree(directory_path)
79 |
80 | def test_projects_handle_libraries(self):
81 | bpd = BrickflowProjectDeploymentSettings()
82 | bpd.brickflow_auto_add_libraries = None
83 | handle_libraries(skip_libraries=True)
84 | assert bpd.brickflow_auto_add_libraries is False
85 | handle_libraries(skip_libraries=False)
86 | assert bpd.brickflow_auto_add_libraries is True
87 | bpd.brickflow_auto_add_libraries = None
88 |
--------------------------------------------------------------------------------
/tests/cli/test_projects.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import shutil
3 | import os
4 | import pytest
5 | from brickflow import ConfigFileType
6 | from brickflow.cli.projects import MultiProjectManager, get_brickflow_root
7 |
8 |
9 | @pytest.mark.parametrize(
10 | "project_folder,extension",
11 | [("sample_yml_project", "yml"), ("sample_yaml_project", "yaml")],
12 | )
13 | def test_get_brickflow_root(project_folder, extension):
14 | cwd = os.getcwd()
15 | test_folder = str(Path(__file__).parent)
16 |
17 | # Creating empty test directories
18 | os.makedirs(f"{test_folder}/{project_folder}/some/dummy/dir", exist_ok=True)
19 | os.chdir(f"{test_folder}/{project_folder}/some/dummy/dir")
20 |
21 | actual = get_brickflow_root()
22 | assert actual == Path(
23 | f"{test_folder}/{project_folder}/brickflow-multi-project.{extension}"
24 | )
25 |
26 | # Cleanup
27 | shutil.rmtree(f"{test_folder}/{project_folder}/some")
28 | os.chdir(cwd)
29 |
30 |
31 | @pytest.mark.parametrize(
32 | "project_folder, config_type",
33 | [
34 | ("sample_yml_project", ConfigFileType.YML),
35 | ("sample_yaml_project", ConfigFileType.YAML),
36 | ],
37 | )
38 | def test_multi_project_manager_yaml(project_folder, config_type):
39 | cwd = os.getcwd()
40 | test_folder = str(Path(__file__).parent)
41 | os.chdir(test_folder)
42 |
43 | config_file_name = (
44 | f"{test_folder}/{project_folder}/brickflow-multi-project.{config_type.value}"
45 | )
46 | manager = MultiProjectManager(
47 | config_file_name=config_file_name, file_type=config_type
48 | )
49 | assert manager._brickflow_multi_project_config.version == "v1"
50 | expected_project_config = {
51 | "version": "v1",
52 | "projects": {
53 | "test_cli_project": {
54 | "name": "test_cli_project",
55 | "path_from_repo_root_to_project_root": "some/test/path",
56 | "path_project_root_to_workflows_dir": "path/to/workflows",
57 | "deployment_mode": "bundle",
58 | "brickflow_version": "1.2.1",
59 | "enable_plugins": False,
60 | }
61 | },
62 | }
63 | assert manager._project_config_dict["."].model_dump() == expected_project_config
64 |
65 | os.chdir(cwd)
66 |
--------------------------------------------------------------------------------
/tests/codegen/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/tests/codegen/__init__.py
--------------------------------------------------------------------------------
/tests/codegen/expected_bundles/local_bundle_continuous_schedule.yml:
--------------------------------------------------------------------------------
1 | "bundle":
2 | "name": "test-project"
3 | "targets":
4 | "test-project-local":
5 | "resources":
6 | "jobs":
7 | "wf-test-2":
8 | "continuous":
9 | "pause_status": "PAUSED"
10 | "email_notifications": null
11 | "git_source": null
12 | "health":
13 | "rules":
14 | - "metric": "RUN_DURATION_SECONDS"
15 | "op": "GREATER_THAN"
16 | "value": 7200.0
17 | "job_clusters":
18 | - "job_cluster_key": "sample_job_cluster"
19 | "new_cluster":
20 | "aws_attributes": null
21 | "custom_tags":
22 | "brickflow_deployment_mode": "Databricks Asset Bundles"
23 | "brickflow_project_name": "test-project"
24 | "brickflow_version": "1.0.0"
25 | "deployed_at": "1704067200000"
26 | "deployed_by": "test_user"
27 | "environment": "local"
28 | "data_security_mode": "SINGLE_USER"
29 | "driver_instance_pool_id": null
30 | "driver_node_type_id": null
31 | "enable_elastic_disk": null
32 | "init_scripts": null
33 | "instance_pool_id": null
34 | "node_type_id": "m6gd.xlarge"
35 | "num_workers": 1.0
36 | "policy_id": null
37 | "runtime_engine": null
38 | "spark_conf": null
39 | "spark_env_vars": null
40 | "spark_version": "13.3.x-scala2.12"
41 | "max_concurrent_runs": 1.0
42 | "name": "test_user_wf-test-2"
43 | "notification_settings": null
44 | "permissions":
45 | - "level": "IS_OWNER"
46 | "user_name": "abc@abc.com"
47 | - "level": "CAN_MANAGE"
48 | "user_name": "abc@abc.com"
49 | - "level": "CAN_MANAGE_RUN"
50 | "user_name": "abc@abc.com"
51 | - "level": "CAN_VIEW"
52 | "user_name": "abc@abc.com"
53 | "run_as":
54 | "user_name": "abc@abc.com"
55 | "schedule": null
56 | "tags":
57 | "brickflow_deployment_mode": "Databricks Asset Bundles"
58 | "brickflow_project_name": "test-project"
59 | "brickflow_version": "1.0.0"
60 | "deployed_at": "1704067200000"
61 | "deployed_by": "test_user"
62 | "environment": "local"
63 | "test": "test2"
64 | "tasks":
65 | - "depends_on": []
66 | "email_notifications": {}
67 | "webhook_notifications": {}
68 | "job_cluster_key": "sample_job_cluster"
69 | "libraries": []
70 | "max_retries": null
71 | "min_retry_interval_millis": null
72 | "notebook_task":
73 | "base_parameters":
74 | "all_tasks1": "test"
75 | "all_tasks3": "123"
76 | "brickflow_env": "local"
77 | "brickflow_internal_only_run_tasks": ""
78 | "brickflow_internal_task_name": "{{task_key}}"
79 | "brickflow_internal_workflow_name": "wf-test-2"
80 | "brickflow_internal_workflow_prefix": ""
81 | "brickflow_internal_workflow_suffix": ""
82 | "brickflow_job_id": "{{job_id}}"
83 | "brickflow_parent_run_id": "{{parent_run_id}}"
84 | "brickflow_run_id": "{{run_id}}"
85 | "brickflow_start_date": "{{start_date}}"
86 | "brickflow_start_time": "{{start_time}}"
87 | "brickflow_task_key": "{{task_key}}"
88 | "brickflow_task_retry_count": "{{task_retry_count}}"
89 | "test": "var"
90 | "notebook_path": "test_databricks_bundle.py"
91 | "source": "WORKSPACE"
92 | "retry_on_timeout": null
93 | "task_key": "task_function2"
94 | "timeout_seconds": null
95 | "timeout_seconds": null
96 | "trigger": null
97 | "webhook_notifications": null
98 | "pipelines": {}
99 | "workspace":
100 | "file_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local/files"
101 | "root_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local"
102 | "state_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local/state"
103 | "workspace": {}
--------------------------------------------------------------------------------
/tests/codegen/expected_bundles/local_serverless_bundle.yml:
--------------------------------------------------------------------------------
1 | "bundle":
2 | "name": "test-project"
3 | "targets":
4 | "test-project-local":
5 | "resources":
6 | "jobs":
7 | "brickflow-serverless-demo":
8 | "continuous": null
9 | "email_notifications": null
10 | "environments":
11 | - "environment_key": "Default"
12 | "spec":
13 | "client": "1"
14 | "dependencies":
15 | - "pytz==2024.2"
16 | "health": {}
17 | "job_clusters": []
18 | "max_concurrent_runs": 1.0
19 | "name": "test_user_brickflow-serverless-demo"
20 | "notification_settings": null
21 | "parameters": null
22 | "permissions": null
23 | "schedule":
24 | "pause_status": "PAUSED"
25 | "quartz_cron_expression": "0 0/20 0 ? * * *"
26 | "timezone_id": "UTC"
27 | "tags":
28 | "brickflow_deployment_mode": "Databricks Asset Bundles"
29 | "brickflow_project_name": "test-project"
30 | "brickflow_version": "1.0.0"
31 | "deployed_at": "1704067200000"
32 | "deployed_by": "test_user"
33 | "environment": "local"
34 | "tasks":
35 | - "depends_on": []
36 | "email_notifications": {}
37 | "webhook_notifications": {}
38 | "max_retries": null
39 | "min_retry_interval_millis": null
40 | "notebook_task":
41 | "base_parameters":
42 | "brickflow_env": "local"
43 | "brickflow_internal_only_run_tasks": ""
44 | "brickflow_internal_task_name": "{{task_key}}"
45 | "brickflow_internal_workflow_name": "brickflow-serverless-demo"
46 | "brickflow_internal_workflow_prefix": ""
47 | "brickflow_internal_workflow_suffix": ""
48 | "brickflow_job_id": "{{job_id}}"
49 | "brickflow_parent_run_id": "{{parent_run_id}}"
50 | "brickflow_run_id": "{{run_id}}"
51 | "brickflow_start_date": "{{start_date}}"
52 | "brickflow_start_time": "{{start_time}}"
53 | "brickflow_task_key": "{{task_key}}"
54 | "brickflow_task_retry_count": "{{task_retry_count}}"
55 | "notebook_path": "test_databricks_bundle.py"
56 | "source": "WORKSPACE"
57 | "retry_on_timeout": null
58 | "task_key": "entrypoint_task"
59 | "timeout_seconds": null
60 | - "depends_on": []
61 | "email_notifications": {}
62 | "webhook_notifications": {}
63 | "max_retries": null
64 | "min_retry_interval_millis": null
65 | "notebook_task":
66 | "base_parameters":
67 | "some_parameter": "some_value"
68 | "notebook_path": "notebooks/example_notebook.py"
69 | "retry_on_timeout": null
70 | "task_key": "notebook_task"
71 | "timeout_seconds": null
72 | - "depends_on": []
73 | "email_notifications": {}
74 | "webhook_notifications": {}
75 | "environment_key": "Default"
76 | "max_retries": null
77 | "min_retry_interval_millis": null
78 | "retry_on_timeout": null
79 | "spark_python_task":
80 | "parameters":
81 | - "--timezone"
82 | - "UTC"
83 | "python_file": "/Workspace/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local/files/spark/python/src/run_task.py"
84 | "source": "WORKSPACE"
85 | "task_key": "spark_python_task"
86 | "timeout_seconds": null
87 | "timeout_seconds": null
88 | "trigger": null
89 | "webhook_notifications": null
90 | "pipelines": {}
91 | "workspace":
92 | "file_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local/files"
93 | "root_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local"
94 | "state_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local/state"
95 | "workspace": {}
96 |
--------------------------------------------------------------------------------
/tests/codegen/sample_serverless_workflow.py:
--------------------------------------------------------------------------------
1 | from brickflow import (
2 | Workflow,
3 | NotebookTask,
4 | SparkPythonTask,
5 | )
6 | from brickflow.engine.task import PypiTaskLibrary
7 |
8 | wf = Workflow(
9 | "brickflow-serverless-demo",
10 | schedule_quartz_expression="0 0/20 0 ? * * *",
11 | libraries=[PypiTaskLibrary(package="pytz==2024.2")],
12 | )
13 |
14 |
15 | @wf.task
16 | def entrypoint_task():
17 | pass
18 |
19 |
20 | @wf.notebook_task
21 | def notebook_task():
22 | return NotebookTask(
23 | notebook_path="notebooks/example_notebook.py",
24 | base_parameters={
25 | "some_parameter": "some_value", # in the notebook access these via dbutils.widgets.get("some_parameter")
26 | },
27 | ) # type: ignore
28 |
29 |
30 | @wf.spark_python_task
31 | def spark_python_task():
32 | return SparkPythonTask(
33 | python_file="./products/test-project/spark/python/src/run_task.py",
34 | source="GIT",
35 | parameters=["--timezone", "UTC"],
36 | ) # type: ignore
37 |
--------------------------------------------------------------------------------
/tests/context/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/tests/context/__init__.py
--------------------------------------------------------------------------------
/tests/databricks_plugins/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/tests/databricks_plugins/__init__.py
--------------------------------------------------------------------------------
/tests/databricks_plugins/test_run_job.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import pytest
4 | from requests_mock.mocker import Mocker as RequestsMocker
5 |
6 | from brickflow.engine.utils import ctx
7 | from brickflow_plugins.databricks.run_job import RunJobInRemoteWorkspace
8 |
9 |
10 | class TestRunJob:
11 | workspace_url = "https://42.cloud.databricks.com"
12 | endpoint_url = f"{workspace_url}/api/.*/jobs/run-now"
13 | response = {"run_id": 37, "number_in_job": 42}
14 |
15 | ctx.log.propagate = True
16 |
17 | @pytest.fixture(autouse=True)
18 | def mock_get_job_id(self, mocker):
19 | mocker.patch(
20 | "brickflow_plugins.databricks.run_job.get_job_id",
21 | return_value=1,
22 | )
23 |
24 | @pytest.fixture(autouse=True, name="api")
25 | def mock_api(self):
26 | rm = RequestsMocker()
27 | rm.post(re.compile(self.endpoint_url), json=self.response, status_code=int(200))
28 | yield rm
29 |
30 | def test_run_job(self, api, caplog):
31 | with api:
32 | RunJobInRemoteWorkspace(
33 | databricks_host=self.workspace_url,
34 | databricks_token="token",
35 | job_name="foo",
36 | ).execute()
37 |
38 | assert "RunNowResponse(number_in_job=42, run_id=37)" in caplog.text
39 |
--------------------------------------------------------------------------------
/tests/databricks_plugins/test_workflow_dependency_sensor.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | import pytest
4 | from requests_mock.mocker import Mocker as RequestsMocker
5 |
6 | from brickflow_plugins.databricks.workflow_dependency_sensor import (
7 | WorkflowDependencySensor,
8 | )
9 |
10 |
11 | class TestWorkflowDependencySensor:
12 | workspace_url = "https://42.cloud.databricks.com"
13 | endpoint_url = f"{workspace_url}/api/2.1/jobs/get"
14 | response = {}
15 |
16 | def test_sensor_failure_403(self):
17 | api = RequestsMocker()
18 | api.get(self.endpoint_url, json=self.response, status_code=int(403))
19 |
20 | # Databricks SDK will throw PermissionDenied exception if the job_id is not found or
21 | # user doesn't have permission
22 | from databricks.sdk.errors.platform import PermissionDenied
23 |
24 | with api:
25 | sensor = WorkflowDependencySensor(
26 | databricks_host=self.workspace_url,
27 | databricks_token="token",
28 | dependency_job_id="1",
29 | delta=timedelta(seconds=1),
30 | timeout_seconds=1,
31 | poke_interval_seconds=1,
32 | )
33 |
34 | with pytest.raises(PermissionDenied):
35 | sensor.execute()
36 |
--------------------------------------------------------------------------------
/tests/databricks_plugins/test_workflow_task_dependency_sensor.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | import pytest
4 | from requests_mock.mocker import Mocker as RequestsMocker
5 |
6 | from brickflow_plugins.databricks.workflow_dependency_sensor import (
7 | WorkflowTaskDependencySensor,
8 | WorkflowDependencySensorTimeOutException,
9 | )
10 |
11 |
12 | class TestWorkflowTaskDependencySensor:
13 | workspace_url = "https://42.cloud.databricks.com"
14 | endpoint_url = f"{workspace_url}/api/2.1/jobs/runs/list"
15 | response = {
16 | "runs": [
17 | {
18 | "job_id": 1,
19 | "run_id": 1,
20 | "start_time": 1704063600000,
21 | "state": {
22 | "result_state": "SUCCESS",
23 | },
24 | "tasks": [
25 | {
26 | "run_id": 100,
27 | "task_key": "foo",
28 | "state": {
29 | "result_state": "SUCCESS",
30 | },
31 | },
32 | {
33 | "run_id": 200,
34 | "task_key": "bar",
35 | "state": {
36 | "result_state": "FAILED",
37 | },
38 | },
39 | {
40 | "run_id": 300,
41 | "task_key": "baz",
42 | "state": {},
43 | },
44 | ],
45 | }
46 | ]
47 | }
48 |
49 | @pytest.fixture(autouse=True)
50 | def mock_get_execution_start_time_unix_milliseconds(self, mocker):
51 | mocker.patch.object(
52 | WorkflowTaskDependencySensor,
53 | "get_execution_start_time_unix_milliseconds",
54 | return_value=1704063600000,
55 | )
56 |
57 | @pytest.fixture(autouse=True)
58 | def mock_get_job_id(self, mocker):
59 | mocker.patch(
60 | "brickflow_plugins.databricks.workflow_dependency_sensor.get_job_id",
61 | return_value=1,
62 | )
63 |
64 | @pytest.fixture(autouse=True, name="api")
65 | def mock_api(self):
66 | rm = RequestsMocker()
67 | rm.get(self.endpoint_url, json=self.response, status_code=int(200))
68 | yield rm
69 |
70 | def test_sensor_success(self, caplog, api):
71 | with api:
72 | sensor = WorkflowTaskDependencySensor(
73 | databricks_host=self.workspace_url,
74 | databricks_token="token",
75 | dependency_job_name="job",
76 | dependency_task_name="foo",
77 | delta=timedelta(seconds=1),
78 | timeout_seconds=1,
79 | poke_interval_seconds=1,
80 | )
81 |
82 | sensor.execute()
83 |
84 | assert (
85 | "Found the run_id '1' and 'foo' task with state: SUCCESS" in caplog.text
86 | )
87 | assert "Found a successful run: 1" in caplog.text
88 |
89 | def test_sensor_failure(self, caplog, api):
90 | with api:
91 | sensor = WorkflowTaskDependencySensor(
92 | databricks_host=self.workspace_url,
93 | databricks_token="token",
94 | dependency_job_name="job",
95 | dependency_task_name="bar",
96 | delta=timedelta(seconds=1),
97 | timeout_seconds=1,
98 | poke_interval_seconds=1,
99 | )
100 |
101 | with pytest.raises(WorkflowDependencySensorTimeOutException):
102 | sensor.execute()
103 |
104 | assert (
105 | "Found the run_id '1' and 'bar' task with state: FAILED"
106 | in caplog.messages
107 | )
108 | assert "Didn't find a successful task run yet..." in caplog.messages
109 |
110 | def test_sensor_no_state(self, caplog, api):
111 | with api:
112 | sensor = WorkflowTaskDependencySensor(
113 | databricks_host=self.workspace_url,
114 | databricks_token="token",
115 | dependency_job_name="job",
116 | dependency_task_name="baz",
117 | delta=timedelta(seconds=1),
118 | timeout_seconds=1,
119 | poke_interval_seconds=1,
120 | )
121 |
122 | with pytest.raises(WorkflowDependencySensorTimeOutException):
123 | sensor.execute()
124 |
125 | assert (
126 | "Found the run_id '1' and 'baz' but the task has not started yet..."
127 | in caplog.messages
128 | )
129 | assert "Didn't find a successful task run yet..." in caplog.messages
130 |
--------------------------------------------------------------------------------
/tests/engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/tests/engine/__init__.py
--------------------------------------------------------------------------------
/tests/engine/sample_workflow.py:
--------------------------------------------------------------------------------
1 | from brickflow.engine.compute import Cluster
2 | from brickflow.engine.task import (
3 | BrickflowTriggerRule,
4 | TaskType,
5 | TaskResponse,
6 | DLTPipeline,
7 | RunJobTask,
8 | )
9 | from brickflow.engine.workflow import Workflow, WorkflowPermissions, User
10 |
11 | wf = Workflow(
12 | "test",
13 | default_cluster=Cluster.from_existing_cluster("existing_cluster_id"),
14 | schedule_quartz_expression="* * * * *",
15 | permissions=WorkflowPermissions(
16 | owner=User("abc@abc.com"),
17 | can_manage_run=[User("abc@abc.com")],
18 | can_view=[User("abc@abc.com")],
19 | can_manage=[User("abc@abc.com")],
20 | ),
21 | tags={"test": "test2"},
22 | common_task_parameters={"all_tasks1": "test", "all_tasks3": "123"}, # type: ignore
23 | health={
24 | "rules": [
25 | {"metric": "RUN_DURATION_SECONDS", "op": "GREATER_THAN", "value": 7200}
26 | ]
27 | },
28 | timeout_seconds=42,
29 | )
30 |
31 |
32 | @wf.task()
33 | def task_function(*, test="var"):
34 | return test
35 |
36 |
37 | @wf.task()
38 | def task_function_with_error(*, test="var"):
39 | raise ValueError("throwing random error")
40 |
41 |
42 | @wf.task
43 | def task_function_no_deco_args():
44 | return "hello world"
45 |
46 |
47 | @wf.dlt_task
48 | def dlt_pipeline():
49 | # pass
50 | return DLTPipeline(
51 | name="hello world",
52 | storage="123",
53 | language="PYTHON",
54 | configuration={},
55 | cluster=Cluster(
56 | "test",
57 | "someversion",
58 | "vm-node",
59 | custom_tags={"name": "test"},
60 | min_workers=2,
61 | max_workers=10,
62 | ),
63 | notebook_path="scripts/spark_script_1.py",
64 | )
65 |
66 |
67 | @wf.dlt_task
68 | def dlt_pipeline_2():
69 | # pass
70 | return DLTPipeline(
71 | name="hello world",
72 | storage="123",
73 | language="PYTHON",
74 | configuration={},
75 | notebook_path="scripts/spark_script_2.py",
76 | )
77 |
78 |
79 | @wf.task()
80 | def task_function_nokwargs():
81 | return "hello world"
82 |
83 |
84 | @wf.task(depends_on=task_function)
85 | def task_function_2():
86 | return "hello world"
87 |
88 |
89 | @wf.task(depends_on="task_function_2")
90 | def task_function_3():
91 | return "hello world"
92 |
93 |
94 | @wf.task(depends_on="task_function_3", trigger_rule=BrickflowTriggerRule.NONE_FAILED)
95 | def task_function_4():
96 | return "hello world"
97 |
98 |
99 | @wf.task(
100 | task_type=TaskType.CUSTOM_PYTHON_TASK,
101 | trigger_rule=BrickflowTriggerRule.NONE_FAILED,
102 | custom_execute_callback=lambda x: TaskResponse(x.name, push_return_value=True),
103 | )
104 | def custom_python_task_push():
105 | pass
106 |
107 |
108 | @wf.run_job_task()
109 | def run_job_task():
110 | return RunJobTask(job_name="foo", host="https://foo.cloud.databricks.com")
111 |
--------------------------------------------------------------------------------
/tests/engine/sample_workflow_2.py:
--------------------------------------------------------------------------------
1 | from brickflow import Cluster, Workflow
2 |
3 | wf = Workflow(
4 | "test1", default_cluster=Cluster.from_existing_cluster("existing_cluster_id")
5 | )
6 |
7 |
8 | @wf.task()
9 | def task_function(*, test="var"):
10 | return test
11 |
--------------------------------------------------------------------------------
/tests/engine/test_compute.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from brickflow.engine.compute import Cluster
4 |
5 |
6 | class TestCompute:
7 | def test_autoscale(self):
8 | workers = 1234
9 | cluster = Cluster(
10 | "name", "spark_version", "vm-node", min_workers=workers, max_workers=workers
11 | )
12 | assert cluster.autoscale() == {
13 | "autoscale": {
14 | "min_workers": workers,
15 | "max_workers": workers,
16 | }
17 | }
18 |
19 | cluster = Cluster("name", "spark_version", "vm-node")
20 | assert not cluster.autoscale()
21 |
22 | def test_job_task_field(self):
23 | cluster = Cluster.from_existing_cluster("existing_cluster_id")
24 | assert cluster.job_task_field_dict == {
25 | "existing_cluster_id": "existing_cluster_id"
26 | }
27 | cluster = Cluster("name", "spark_version", "vm-node")
28 | assert cluster.job_task_field_dict == {"job_cluster_key": "name"}
29 |
30 | def test_dict(self):
31 | cluster = Cluster.from_existing_cluster("existing_cluster_id")
32 | assert "existing_cluster_id" not in cluster.as_dict()
33 |
34 | def test_valid_cluster(self):
35 | with pytest.raises(AssertionError):
36 | Cluster(
37 | "some_name", "some_version", "some_vm", min_workers=8, max_workers=4
38 | )
39 |
40 | with pytest.raises(AssertionError):
41 | Cluster(
42 | "some_name",
43 | "some_version",
44 | "some_vm",
45 | num_workers=3,
46 | min_workers=2,
47 | max_workers=4,
48 | )
49 |
50 | with pytest.raises(AssertionError):
51 | Cluster("some_name", "some_version", "some_vm", max_workers=4)
52 |
53 | def test_node_type_or_instance_pool(self):
54 | assert (
55 | Cluster(
56 | "some_name",
57 | "some_version",
58 | node_type_id="some_vm",
59 | driver_node_type_id="other_vm",
60 | ).node_type_id
61 | == "some_vm"
62 | )
63 | assert (
64 | Cluster(
65 | "some_name", "some_version", instance_pool_id="some_instance_pool_id"
66 | ).instance_pool_id
67 | == "some_instance_pool_id"
68 | )
69 | with pytest.raises(
70 | AssertionError, match="Must specify either instance_pool_id or node_type_id"
71 | ):
72 | Cluster(
73 | "some_name",
74 | "some_version",
75 | )
76 |
77 | with pytest.raises(
78 | AssertionError,
79 | match="Cannot specify instance_pool_id if node_type_id has been specified",
80 | ):
81 | Cluster(
82 | "some_name",
83 | "some_version",
84 | node_type_id="some_vm",
85 | instance_pool_id="1234",
86 | )
87 | with pytest.raises(
88 | AssertionError,
89 | match=(
90 | "Cannot specify driver_node_type_id if instance_pool_id"
91 | " or driver_instance_pool_id has been specified"
92 | ),
93 | ):
94 | Cluster(
95 | "some_name",
96 | "some_version",
97 | driver_node_type_id="other_vm",
98 | instance_pool_id="1234",
99 | )
100 | with pytest.raises(
101 | AssertionError,
102 | match=(
103 | "Cannot specify driver_node_type_id if instance_pool_id"
104 | " or driver_instance_pool_id has been specified"
105 | ),
106 | ):
107 | Cluster(
108 | "some_name",
109 | "some_version",
110 | node_type_id="some_vm",
111 | driver_node_type_id="other_vm",
112 | driver_instance_pool_id="1234",
113 | )
114 | with pytest.raises(
115 | AssertionError,
116 | match=(
117 | "Cannot specify driver_node_type_id if instance_pool_id"
118 | " or driver_instance_pool_id has been specified"
119 | ),
120 | ):
121 | Cluster(
122 | "some_name",
123 | "some_version",
124 | driver_node_type_id="other_vm",
125 | instance_pool_id="1234",
126 | driver_instance_pool_id="12345",
127 | )
128 |
--------------------------------------------------------------------------------
/tests/engine/test_engine.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 |
3 | from brickflow.engine import (
4 | get_current_commit,
5 | )
6 |
7 |
8 | class TestEngine:
9 | def test_get_current_commit(self, mocker):
10 | branch = "some_random_sha"
11 | mocker.patch("subprocess.check_output")
12 | subprocess.check_output.return_value = branch.encode("utf-8")
13 | assert get_current_commit() == branch
14 | subprocess.check_output.assert_called_once_with(
15 | ['git log -n 1 --pretty=format:"%H"'], shell=True
16 | ) # noqa
17 |
--------------------------------------------------------------------------------
/tests/engine/test_utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | import pathlib
3 | import pytest
4 | from requests_mock.mocker import Mocker as RequestsMocker
5 |
6 | from pydantic import SecretStr
7 |
8 | from brickflow.engine.utils import get_job_id, ctx, get_bf_project_root
9 |
10 |
11 | class TestUtils:
12 | workspace_url = "https://42.cloud.databricks.com"
13 | endpoint_url = f"{workspace_url}/api/.*/jobs/list"
14 |
15 | ctx.log.propagate = True
16 |
17 | @pytest.fixture(autouse=True, name="api", scope="class")
18 | def mock_api(self):
19 | rm = RequestsMocker()
20 | rm.register_uri(
21 | method="GET",
22 | url=re.compile(self.endpoint_url),
23 | response_list=[
24 | {
25 | "json": {"jobs": [{"job_id": 1234, "settings": {"name": "foo"}}]},
26 | "status_code": int(200),
27 | },
28 | {
29 | "json": {"has_more": False},
30 | "status_code": int(200),
31 | },
32 | {
33 | "json": {},
34 | "status_code": int(404),
35 | },
36 | ],
37 | )
38 | yield rm
39 |
40 | def test_get_job_id_success(self, api):
41 | with api:
42 | job_id = get_job_id(
43 | job_name="foo",
44 | host=self.workspace_url,
45 | token=SecretStr("token"),
46 | )
47 | assert job_id == 1234
48 |
49 | def test_get_job_id_failure(self, api):
50 | with pytest.raises(ValueError):
51 | with api:
52 | get_job_id(job_name="bar", host=self.workspace_url, token="token")
53 |
54 | def test_get_job_id_non_200(self, caplog, api):
55 | with api:
56 | get_job_id(job_name="buz", host=self.workspace_url, token="token")
57 | assert "An error occurred: request failed" in caplog.text
58 |
59 | def test_get_bf_project_root(self):
60 | # Set up expected path which is the root of the repo
61 | expected_root = pathlib.Path.cwd().parents[0]
62 | # Execute the function
63 | actual_root = get_bf_project_root()
64 | # Assert the result
65 | assert actual_root == expected_root
66 |
--------------------------------------------------------------------------------
/tests/resolver/test_resolver.py:
--------------------------------------------------------------------------------
1 | # test_resolver.py
2 | from typing import Type
3 |
4 | import pytest
5 |
6 | import brickflow
7 | from brickflow.resolver import (
8 | BrickflowRootNotFound,
9 | )
10 |
11 |
12 | @pytest.fixture
13 | def default_mocks(mocker):
14 | # Create mocks for the three methods
15 | mocker.patch(
16 | "brickflow.resolver.get_caller_file_paths", return_value=["path1", "path2"]
17 | )
18 | mocker.patch(
19 | "brickflow.resolver.get_notebook_ws_path", return_value="/notebook/ws/path"
20 | )
21 |
22 |
23 | def test_resolver_methods(default_mocks, mocker): # noqa
24 | error_msg = "This is a test message"
25 |
26 | def make_exception_function(exc: Type[Exception]):
27 | def raise_exception(*args, **kwargs):
28 | raise exc(error_msg)
29 |
30 | return raise_exception
31 |
32 | # catch random error
33 | mocker.patch(
34 | "brickflow.resolver.go_up_till_brickflow_root",
35 | side_effect=make_exception_function(ValueError),
36 | )
37 | with pytest.raises(ValueError, match=error_msg):
38 | brickflow.resolver.get_relative_path_to_brickflow_root()
39 |
40 | mocker.patch(
41 | "brickflow.resolver.go_up_till_brickflow_root",
42 | side_effect=make_exception_function(BrickflowRootNotFound),
43 | )
44 |
45 | brickflow.resolver.get_relative_path_to_brickflow_root()
46 |
47 | mocker.patch(
48 | "brickflow.resolver.go_up_till_brickflow_root",
49 | side_effect=make_exception_function(PermissionError),
50 | )
51 |
52 | brickflow.resolver.get_relative_path_to_brickflow_root()
53 |
--------------------------------------------------------------------------------
/tests/sample_workflows/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/tests/sample_workflows/__init__.py
--------------------------------------------------------------------------------
/tests/sample_workflows/sample_workflow_1.py:
--------------------------------------------------------------------------------
1 | from brickflow.engine.compute import Cluster
2 | from brickflow.engine.task import BrickflowTriggerRule, TaskType, TaskResponse
3 | from brickflow.engine.workflow import Workflow
4 |
5 | wf = Workflow(
6 | "test",
7 | default_cluster=Cluster.from_existing_cluster("XXXX-XXXXXX-XXXXXXXX"),
8 | tags={"test": "test2"},
9 | common_task_parameters={"all_tasks1": "test", "all_tasks3": "123"}, # type: ignore
10 | )
11 |
12 |
13 | @wf.task()
14 | def task_function():
15 | return "hello world"
16 |
17 |
18 | @wf.task
19 | def task_function_no_deco_args():
20 | return "hello world"
21 |
22 |
23 | @wf.task()
24 | def task_function_nokwargs():
25 | return "hello world"
26 |
27 |
28 | @wf.task(depends_on=task_function)
29 | def task_function_2():
30 | return "hello world"
31 |
32 |
33 | @wf.task(depends_on="task_function_2")
34 | def task_function_3():
35 | return "hello world"
36 |
37 |
38 | @wf.task(depends_on="task_function_3", trigger_rule=BrickflowTriggerRule.NONE_FAILED)
39 | def task_function_4():
40 | return "hello world"
41 |
42 |
43 | @wf.task(
44 | task_type=TaskType.CUSTOM_PYTHON_TASK,
45 | trigger_rule=BrickflowTriggerRule.NONE_FAILED,
46 | custom_execute_callback=lambda x: TaskResponse(x.name, push_return_value=True),
47 | )
48 | def custom_python_task_push():
49 | pass
50 |
--------------------------------------------------------------------------------
/tests/sample_workflows/sample_workflow_2.py:
--------------------------------------------------------------------------------
1 | from brickflow.engine.compute import Cluster
2 | from brickflow.engine.task import BrickflowTriggerRule, TaskType, TaskResponse
3 | from brickflow.engine.workflow import Workflow
4 |
5 | wf = Workflow(
6 | "test2",
7 | default_cluster=Cluster.from_existing_cluster("XXXX-XXXXXX-XXXXXXXX"),
8 | tags={"test": "test2"},
9 | )
10 |
11 |
12 | @wf.task()
13 | def task_function():
14 | return "hello world"
15 |
16 |
17 | @wf.task
18 | def task_function_no_deco_args():
19 | return "hello world"
20 |
21 |
22 | @wf.task()
23 | def task_function_nokwargs():
24 | return "hello world"
25 |
26 |
27 | @wf.task(depends_on=task_function)
28 | def task_function_2():
29 | return "hello world"
30 |
31 |
32 | @wf.task(depends_on="task_function_2")
33 | def task_function_3():
34 | return "hello world"
35 |
36 |
37 | @wf.task(depends_on="task_function_3", trigger_rule=BrickflowTriggerRule.NONE_FAILED)
38 | def task_function_4():
39 | return "hello world"
40 |
41 |
42 | @wf.task(
43 | task_type=TaskType.CUSTOM_PYTHON_TASK,
44 | trigger_rule=BrickflowTriggerRule.NONE_FAILED,
45 | custom_execute_callback=lambda x: TaskResponse(x.name, push_return_value=True),
46 | )
47 | def custom_python_task_push():
48 | pass
49 |
--------------------------------------------------------------------------------
/tests/test_brickflow.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=unused-import
2 | import pytest
3 | from brickflow import get_config_file_type, ConfigFileType
4 |
5 |
6 | def test_imports():
7 | try:
8 | from brickflow import (
9 | log,
10 | _ilog,
11 | BrickflowEnvVars,
12 | BrickflowDefaultEnvs,
13 | ctx,
14 | Workflow,
15 | WorkflowPermissions,
16 | User,
17 | Group,
18 | ServicePrincipal,
19 | Task,
20 | TaskType,
21 | TaskResponse,
22 | BrickflowTriggerRule,
23 | BrickflowTaskEnvVars,
24 | StorageBasedTaskLibrary,
25 | JarTaskLibrary,
26 | EggTaskLibrary,
27 | WheelTaskLibrary,
28 | PypiTaskLibrary,
29 | MavenTaskLibrary,
30 | CranTaskLibrary,
31 | EmailNotifications,
32 | DLTPipeline,
33 | DLTEdition,
34 | DLTChannels,
35 | Cluster,
36 | Runtimes,
37 | Project,
38 | )
39 |
40 | print("All imports Succeeded")
41 | except ImportError as e:
42 | print(f"Import failed: {e}")
43 |
44 |
45 | @pytest.mark.parametrize(
46 | "config_file_name,expected_extension",
47 | [
48 | (".brickflow-project-root.yaml", ConfigFileType.YAML),
49 | (".brickflow-project-root.yml", ConfigFileType.YML),
50 | (".brickflow-project-root.json", ConfigFileType.YAML),
51 | ],
52 | )
53 | def test_get_config_type(config_file_name, expected_extension):
54 | actual = get_config_file_type(f"some/brickflow/root/{config_file_name}")
55 | assert actual == expected_extension
56 |
--------------------------------------------------------------------------------
/tests/test_plugins.py:
--------------------------------------------------------------------------------
1 | import copy
2 | from typing import List
3 | from unittest import mock
4 |
5 | import pluggy
6 | import pytest
7 |
8 | from brickflow.engine.task import get_plugin_manager, get_brickflow_tasks_hook
9 |
10 |
11 | def assert_plugin_manager(
12 | pm: pluggy.PluginManager, expected_plugins: List[str]
13 | ) -> None:
14 | num_expected_plugins = len(expected_plugins)
15 | assert (
16 | len(pm.get_plugins()) == num_expected_plugins
17 | ), f"import error should only {num_expected_plugins} plugins"
18 | for plugin in expected_plugins:
19 | assert pm.has_plugin(plugin), f"plugin manager should have {plugin} plugin"
20 |
21 | all_plugins = set([pm.get_name(plugin_impl) for plugin_impl in pm.get_plugins()])
22 | assert all_plugins == set(expected_plugins), (
23 | f"plugin manager should have {expected_plugins} " f"plugins and nothing more"
24 | )
25 |
26 |
27 | class TestBrickflowPlugins:
28 | def test_plugins_installed(self):
29 | pm = copy.deepcopy(get_plugin_manager())
30 | get_brickflow_tasks_hook(pm)
31 | assert_plugin_manager(pm, ["airflow-plugin", "default"])
32 |
33 | def test_plugins_load_plugins_import_error(self):
34 | with mock.patch("brickflow_plugins.load_plugins") as load_plugins_mock:
35 | load_plugins_mock.side_effect = ImportError
36 | pm = copy.deepcopy(get_plugin_manager())
37 | get_brickflow_tasks_hook(pm)
38 | assert_plugin_manager(pm, ["default"])
39 |
40 | def test_plugins_ensure_installation_import_error(self):
41 | with mock.patch("brickflow_plugins.ensure_installation") as load_plugins_mock:
42 | load_plugins_mock.side_effect = ImportError
43 | pm = copy.deepcopy(get_plugin_manager())
44 | get_brickflow_tasks_hook(pm)
45 | assert_plugin_manager(pm, ["default"])
46 |
47 | @pytest.mark.parametrize(
48 | "quartz_cron, expected_unix_cron",
49 | [
50 | ("0 * * ? * * *", "* * * * *"),
51 | ("0 */5 * ? * * *", "*/5 * * * *"),
52 | ("0 30 * ? * * *", "30 * * * *"),
53 | ("0 0 12 ? * * *", "0 12 * * *"),
54 | ("0 0 12 ? * 2 *", "0 12 * * 1"),
55 | ("0 0 0 10 * ? *", "0 0 10 * *"),
56 | ("0 0 0 1 1 ? *", "0 0 1 1 *"),
57 | ("0 0/5 14,18 * * ?", "0/5 14,18 * * *"),
58 | ("0 0 12 ? * 1,2,5-7 *", "0 12 * * 0,1,4-6"),
59 | ("0 0 12 ? * SUN,MON,THU-SAT *", "0 12 * * SUN,MON,THU-SAT"),
60 | ],
61 | )
62 | def test_cron_conversion(self, quartz_cron, expected_unix_cron):
63 | import brickflow_plugins.airflow.cronhelper as cronhelper # noqa
64 |
65 | converted_unix_cron = cronhelper.cron_helper.quartz_to_unix(quartz_cron)
66 | converted_quartz_cron = cronhelper.cron_helper.unix_to_quartz(
67 | converted_unix_cron
68 | )
69 | converted_unix_cron_second = cronhelper.cron_helper.quartz_to_unix(
70 | converted_quartz_cron
71 | )
72 |
73 | assert (
74 | converted_unix_cron == converted_unix_cron_second
75 | ), "cron conversion should be idempotent"
76 | assert converted_unix_cron == expected_unix_cron
77 |
78 | @pytest.mark.parametrize(
79 | "quartz_cron",
80 | [
81 | "0 0 12 ? * L *",
82 | "0 0 12 ? * 1L *",
83 | "0 0 12 ? * 1W *",
84 | "0 0 12 ? * 1#5 *",
85 | ],
86 | )
87 | def test_unsupported_cron_expressions(self, quartz_cron):
88 | import brickflow_plugins.airflow.cronhelper as cronhelper # noqa
89 |
90 | with pytest.raises(ValueError):
91 | cronhelper.cron_helper.quartz_to_unix(quartz_cron)
92 |
--------------------------------------------------------------------------------
/tools/README.md:
--------------------------------------------------------------------------------
1 | # Code generate tools
2 |
3 | Use this to code generate `brickflow/bundles/model.py`
4 |
5 | Make sure you are in the repository root and are using a *nix machine.
6 |
7 | ```shell
8 | ./tools/gen-bundle.sh # example: ./tools/gen-bundle.sh 0.201.0
9 | ```
10 |
11 | Please note that if no version argument is provided, the script falls back to the Databricks CLI version that brickflow itself defaults to.
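
For example (a sketch of the two invocation modes; the pinned version shown is illustrative only):

```shell
# Pin a specific Databricks CLI version for code generation
./tools/gen-bundle.sh 0.201.0

# Or omit the argument to fall back to brickflow's default CLI version
./tools/gen-bundle.sh
```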
--------------------------------------------------------------------------------
/tools/gen-bundle.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Check if the version argument is provided
4 | if [ $# -lt 1 ]; then
5 | echo "Usage: $0 "
6 | # exit 1
7 | fi
8 |
9 | set -e # Exit on any command failure
10 |
11 | # Set the provided version as an environment variable
12 | export BUNDLE_CODE_GEN_CLI_VERSION="$1"
13 |
14 | rm -rf .databricks/bin/cli/
15 | poetry install
16 | poetry run python tools/install_databricks_cli.py
17 | poetry run python tools/modify_schema.py
18 | poetry run datamodel-codegen --input brickflow/bundles/transformed_schema.json \
19 | --use-title-as-name \
20 | --disable-appending-item-suffix \
21 | --collapse-root-models \
22 | --capitalise-enum-members \
23 | --enum-field-as-literal all \
24 | --input-file-type jsonschema \
25 | --output brickflow/bundles/model.py
26 | echo "✅ Code generation completed successfully!"
27 | poetry run python tools/modify_model.py
28 | echo "✅ Updated and patched model successfully!"
29 | echo "# generated with Databricks CLI Version: $(.databricks/bin/cli/*/databricks --version)" | \
30 | cat - brickflow/bundles/model.py > /tmp/codegen && \
31 | mv /tmp/codegen brickflow/bundles/model.py
32 | echo "✅ Modified the front matter of the script!"
33 | poetry run python brickflow/bundles/model.py # validate python file
34 | echo "✅ Validated the file is proper python code!"
35 |
--------------------------------------------------------------------------------
/tools/install_databricks_cli.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | try:
4 | from brickflow import BrickflowEnvVars
5 | from brickflow.cli import bundle_cli_setup
6 | from brickflow.cli.bundles import get_valid_bundle_cli
7 | from brickflow.engine import _call
8 | except ImportError:
9 | raise ImportError("Please install brickflow to use this script")
10 |
11 | if __name__ == "__main__":
12 | cli_version = os.environ.get("BUNDLE_CODE_GEN_CLI_VERSION", None)
13 | if cli_version is not None and cli_version != "":
14 | os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value] = cli_version
15 |
16 | bundle_cli_setup()
17 | bundle_cli = get_valid_bundle_cli(
18 | os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value]
19 | )
20 | print(f"Using Databricks CLI: {bundle_cli}")
21 | print(_call(f"{bundle_cli} --version", shell=True).decode("utf-8"))
22 | _call(f"{bundle_cli} bundle schema > brickflow/bundles/schema.json", shell=True)
23 |
--------------------------------------------------------------------------------
/tools/modify_model.py:
--------------------------------------------------------------------------------
1 | if __name__ == "__main__":
2 | import re
3 |
4 | # string = "class Artifacts1(BaseModel)"
5 | regex_pattern = r"(?<=class\s)[A-Za-z]\w+"
6 | file_path = "brickflow/bundles/model.py"
7 |
8 | bad_class_names = {}
9 |
10 | def remove_number_from_end(string):
11 | match = re.search(r"\d+$", string)
12 | if match:
13 | number = match.group(0)
14 | string_without_number = string[: -len(number)]
15 | return string_without_number
16 | else:
17 | return None
18 |
19 | def remove_timestamp_line(input_code: str) -> str:
20 | return "\n".join(
21 | [
22 | _line
23 | for _line in input_code.split("\n")
24 | if not _line.startswith("# timestamp: ")
25 | ]
26 | )
27 |
28 | def replace_class_config_extras(input_code: str) -> str:
29 | pattern = r"extra\s*=\s*Extra\.forbid"
30 | return re.sub(
31 | pattern, 'extra = "forbid"\n protected_namespaces = ()', input_code
32 | )
33 |
34 | def replace_regex_with_pattern(input_code: str) -> str:
35 | pattern = r"regex="
36 | return re.sub(pattern, "pattern=", input_code)
37 |
38 | with open(file_path, "r") as f:
39 | lines = f.readlines()
40 | for line in lines:
41 | match = re.search(regex_pattern, line)
42 | if match:
43 | dynamic_value = match.group(0)
44 | if remove_number_from_end(dynamic_value):
45 | bad_class_names[dynamic_value] = remove_number_from_end(
46 | dynamic_value
47 | )
48 |
49 | with open(file_path, "r") as r:
50 | data = r.read()
51 |
52 | with open(file_path, "w") as w:
53 | for key, value in bad_class_names.items():
54 | data = data.replace(key, value)
55 | data = remove_timestamp_line(data)
56 | # remove extra config to remove deprecation warning
57 | data = replace_class_config_extras(data)
58 | # replace regex with pattern
59 | data = replace_regex_with_pattern(data)
60 | w.write(data)
61 |
--------------------------------------------------------------------------------