├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── onpush.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CODEOWNERS ├── CONTRIBUTING.md ├── CONTRIBUTORS.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── brickflow ├── __init__.py ├── bundles │ ├── __init__.py │ └── model.py ├── cli │ ├── __init__.py │ ├── bundles.py │ ├── commands.py │ ├── configure.py │ ├── constants.py │ ├── entrypoint.template │ ├── gitignore_template.txt │ └── projects.py ├── codegen │ ├── __init__.py │ └── databricks_bundle.py ├── context │ ├── __init__.py │ └── context.py ├── engine │ ├── __init__.py │ ├── compute.py │ ├── hooks.py │ ├── project.py │ ├── task.py │ ├── utils.py │ └── workflow.py ├── hints │ ├── __init__.py │ ├── hint.py │ └── py.typed └── resolver │ └── __init__.py ├── brickflow_plugins ├── __init__.py ├── airflow │ ├── __init__.py │ ├── brickflow_task_plugin.py │ ├── context │ │ └── __init__.py │ ├── cronhelper.py │ ├── operators │ │ ├── __init__.py │ │ ├── external_tasks.py │ │ ├── external_tasks_tableau.py │ │ └── native_operators.py │ └── vendor │ │ ├── __init__.py │ │ ├── context.py │ │ ├── timetable.py │ │ └── timezone.py ├── databricks │ ├── __init__.py │ ├── box_operator.py │ ├── run_job.py │ ├── sla_sensor.py │ ├── uc_to_snowflake_operator.py │ └── workflow_dependency_sensor.py └── secrets │ └── __init__.py ├── docs ├── api │ ├── airflow_external_task_dependency.md │ ├── airflow_native_operators.md │ ├── airflow_tableau_operators.md │ ├── box_operator.md │ ├── cli.md │ ├── compute.md │ ├── context.md │ ├── project.md │ ├── secrets.md │ ├── sla_sensor.md │ ├── task.md │ ├── uc_to_snowflake_operator.md │ ├── workflow.md │ └── workflow_dependency_sensor.md ├── bundles-quickstart.md ├── cli │ └── reference.md ├── css │ └── custom.css ├── environment-variables.md ├── faq │ └── faq.md ├── highlevel.md ├── how-imports-work.md ├── img │ ├── bf_logo.png │ ├── bf_logo_1.png │ ├── maintainance.png │ └── workflow.png ├── index.md ├── projects.md ├── tasks.md ├── upgrades │ └── upgrade-pre-0-10-0-to-0-10-0.md └── workflows.md ├── examples ├── brickflow_examples │ ├── .brickflow-project-root.yml │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── brickflow-multi-project.yml │ ├── notebooks │ │ ├── __init__.py │ │ └── example_notebook.py │ ├── src │ │ ├── __init__.py │ │ ├── python │ │ │ ├── __init__.py │ │ │ ├── lending_data_show.py │ │ │ └── setup_data.py │ │ └── sql │ │ │ └── sample.sql │ └── workflows │ │ ├── __init__.py │ │ ├── demo_wf.py │ │ └── entrypoint.py ├── brickflow_for_each_task_examples │ ├── .brickflow-project-root.yml │ ├── README.md │ ├── __init__.py │ ├── brickflow-multi-project.yml │ ├── notebooks │ │ ├── __init__.py │ │ └── example_notebook.py │ ├── src │ │ ├── __init__.py │ │ └── python │ │ │ ├── __init__.py │ │ │ └── print_args.py │ └── workflows │ │ ├── __init__.py │ │ ├── entrypoint.py │ │ └── for_each_task_wf.py └── brickflow_serverless_examples │ ├── .brickflow-project-root.yml │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── brickflow-multi-project.yml │ ├── notebooks │ ├── __init__.py │ └── example_notebook.py │ ├── src │ ├── __init__.py │ └── python │ │ ├── __init__.py │ │ └── example.py │ └── workflows │ ├── __init__.py │ ├── demo_serverless_wf.py │ └── entrypoint.py ├── mkdocs.yml ├── poetry.lock ├── prospector.yaml ├── pyproject.toml ├── tests ├── __init__.py ├── airflow_plugins │ ├── __init__.py │ ├── test_autosys.py │ ├── test_tableau.py │ 
└── test_task_dependency.py ├── cli │ ├── __init__.py │ ├── sample_yaml_project │ │ ├── .brickflow-project-root.yaml │ │ └── brickflow-multi-project.yaml │ ├── sample_yml_project │ │ ├── .brickflow-project-root.yml │ │ └── brickflow-multi-project.yml │ ├── test_bundles.py │ ├── test_cli.py │ └── test_projects.py ├── codegen │ ├── __init__.py │ ├── expected_bundles │ │ ├── dev_bundle_monorepo.yml │ │ ├── dev_bundle_polyrepo.yml │ │ ├── dev_bundle_polyrepo_with_auto_libs.yml │ │ ├── local_bundle.yml │ │ ├── local_bundle_continuous_schedule.yml │ │ ├── local_bundle_foreach_task.yml │ │ ├── local_bundle_prefix_suffix.yml │ │ └── local_serverless_bundle.yml │ ├── sample_serverless_workflow.py │ ├── sample_workflows.py │ └── test_databricks_bundle.py ├── context │ ├── __init__.py │ └── test_context.py ├── databricks_plugins │ ├── __init__.py │ ├── test_box_operator.py │ ├── test_run_job.py │ ├── test_sla_sensor.py │ ├── test_workflow_dependency_sensor.py │ └── test_workflow_task_dependency_sensor.py ├── engine │ ├── __init__.py │ ├── sample_workflow.py │ ├── sample_workflow_2.py │ ├── test_compute.py │ ├── test_engine.py │ ├── test_project.py │ ├── test_task.py │ ├── test_utils.py │ └── test_workflow.py ├── resolver │ └── test_resolver.py ├── sample_workflows │ ├── __init__.py │ ├── sample_workflow_1.py │ └── sample_workflow_2.py ├── test_brickflow.py └── test_plugins.py └── tools ├── README.md ├── gen-bundle.sh ├── install_databricks_cli.py ├── modify_model.py └── modify_schema.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | *tests* 4 | brickflow/tf/* 5 | '*/.local/*', 6 | '**', 7 | 'tests/*', 8 | '*/tests/*', 9 | # omit anything in a .venv directory anywhere 10 | '.venv/*', 11 | "*/site-packages/*" 12 | 13 | [html] 14 | skip_empty = true 15 | 16 | [report] 17 | skip_empty = true 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG] Please add your bug title here" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Cloud Information** 27 | 28 | 29 | - [ ] AWS 30 | - [ ] Azure 31 | - [ ] GCP 32 | - [ ] Other 33 | 34 | **Desktop (please complete the following information):** 35 | - OS: [e.g. iOS] 36 | - Browser [e.g. chrome, safari] 37 | - Version [e.g. 22] 38 | 39 | **Additional context** 40 | Add any other context about the problem here. 41 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE] Please add your feature request title" 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? 
Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Cloud Information** 14 | 15 | 16 | - [ ] AWS 17 | - [ ] Azure 18 | - [ ] GCP 19 | - [ ] Other 20 | 21 | **Describe the solution you'd like** 22 | A clear and concise description of what you want to happen. 23 | 24 | **Describe alternatives you've considered** 25 | A clear and concise description of any alternative solutions or features you've considered. 26 | 27 | **Additional context** 28 | Add any other context or screenshots about the feature request here. 29 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Description 4 | 5 | 6 | ## Related Issue 7 | 8 | 9 | 10 | 11 | 12 | ## Motivation and Context 13 | 14 | 15 | ## How Has This Been Tested? 16 | 17 | 18 | 19 | 20 | ## Screenshots (if appropriate): 21 | 22 | ## Types of changes 23 | 24 | - [ ] Bug fix (non-breaking change which fixes an issue) 25 | - [ ] New feature (non-breaking change which adds functionality) 26 | - [ ] Breaking change (fix or feature that would cause existing functionality to change) 27 | 28 | ## Checklist: 29 | 30 | 31 | - [ ] My code follows the code style of this project. 32 | - [ ] My change requires a change to the documentation. 33 | - [ ] I have updated the documentation accordingly. 34 | - [ ] I have read the **CONTRIBUTING** document. 35 | - [ ] I have added tests to cover my changes. 36 | - [ ] All new and existing tests passed. 37 | -------------------------------------------------------------------------------- /.github/workflows/onpush.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | pull_request: 5 | types: [ opened, synchronize ] 6 | push: 7 | branches: [ main ] 8 | release: 9 | types: [ created ] 10 | 11 | jobs: 12 | test-pipeline: 13 | runs-on: ${{ matrix.os }} 14 | container: 15 | image: python:${{ matrix.python-version }} 16 | options: --user 1001 # run as the runner user instead of root 17 | strategy: 18 | max-parallel: 2 19 | matrix: 20 | python-version: [ '3.9' ] 21 | os: [ ubuntu-latest ] 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | 26 | - name: Set up JDK # used for py4j for cronutils 27 | uses: actions/setup-java@v3 28 | with: 29 | java-version: '8' 30 | distribution: 'adopt' 31 | 32 | - name: Install pip 33 | run: python -m pip install --upgrade pip 34 | 35 | - name: Install and configure Poetry 36 | uses: snok/install-poetry@v1 37 | 38 | - name: Install poetry and build tools 39 | run: | 40 | export PATH=$PATH:$HOME/.local/bin 41 | poetry self add "poetry-dynamic-versioning[plugin]" 42 | 43 | - name: Install dependencies 44 | run: | 45 | export PATH=$PATH:$HOME/.local/bin 46 | make poetry 47 | 48 | - name: Install, lint and test 49 | run: | 50 | export PATH=$PATH:$HOME/.local/bin 51 | export GITHUB_ACTIONS=true 52 | make cov 53 | 54 | - name: Publish test coverage 55 | uses: codecov/codecov-action@v3 56 | with: 57 | token: ${{ secrets.CODECOV_TOKEN }} 58 | files: coverage.xml 59 | 60 | deploy: 61 | name: Deploy to PyPi 62 | runs-on: ${{ matrix.os }} 63 | container: 64 | image: python:${{ matrix.python-version }} 65 | options: --user 1001 # run as the runner user instead of root 66 | strategy: 67 | max-parallel: 2 68 | matrix: 69 | python-version: [ '3.9' ] 70 | os: [ ubuntu-latest ] 71 | needs: 72 | - test-pipeline 73 | 
if: github.event_name == 'release' 74 | steps: 75 | - uses: actions/checkout@v3 # use latest version of the checkout action 76 | 77 | - name: Set up JDK # used for py4j for cronutils 78 | uses: actions/setup-java@v3 79 | with: 80 | java-version: '8' 81 | distribution: 'adopt' 82 | 83 | - name: Install pip 84 | run: python -m pip install --upgrade pip 85 | 86 | - name: Install and configure Poetry 87 | uses: snok/install-poetry@v1 88 | 89 | - name: Install build tools 90 | run: | 91 | export PATH=$PATH:$HOME/.local/bin 92 | poetry self add "poetry-dynamic-versioning[plugin]" 93 | 94 | - name: Install dependencies 95 | run: | 96 | export PATH=$PATH:$HOME/.local/bin 97 | make poetry 98 | 99 | - name: Install wheel and twine 100 | run: python -m pip install wheel twine 101 | 102 | - name: Build and publish 103 | env: 104 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 105 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 106 | run: | 107 | export PATH=$PATH:$HOME/.local/bin 108 | make build 109 | twine upload dist/* 110 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/terraform,pycharm+all,macos,windows 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=terraform,pycharm+all,macos,windows 3 | 4 | ### macOS ### 5 | # General 6 | .DS_Store 7 | .AppleDouble 8 | .LSOverride 9 | 10 | # Icon must end with two 11 | Icon 12 | 13 | 14 | # Thumbnails 15 | ._* 16 | 17 | # Files that might appear in the root of a volume 18 | .DocumentRevisions-V100 19 | .fseventsd 20 | .Spotlight-V100 21 | .TemporaryItems 22 | .Trashes 23 | .VolumeIcon.icns 24 | .com.apple.timemachine.donotpresent 25 | 26 | # Directories potentially created on remote AFP share 27 | .AppleDB 28 | .AppleDesktop 29 | Network Trash Folder 30 | Temporary Items 31 | .apdisk 32 | 33 | ### macOS Patch ### 34 | # iCloud generated files 35 | *.icloud 36 | 37 | ### PyCharm+all ### 38 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 39 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 40 | 41 | # User-specific stuff 42 | .idea/**/workspace.xml 43 | .idea/**/tasks.xml 44 | .idea/**/usage.statistics.xml 45 | .idea/**/dictionaries 46 | .idea/**/shelf 47 | 48 | # AWS User-specific 49 | .idea/**/aws.xml 50 | 51 | # Generated files 52 | .idea/**/contentModel.xml 53 | 54 | # Sensitive or high-churn files 55 | .idea/**/dataSources/ 56 | .idea/**/dataSources.ids 57 | .idea/**/dataSources.local.xml 58 | .idea/**/sqlDataSources.xml 59 | .idea/**/dynamic.xml 60 | .idea/**/uiDesigner.xml 61 | .idea/**/dbnavigator.xml 62 | 63 | # Gradle 64 | .idea/**/gradle.xml 65 | .idea/**/libraries 66 | 67 | # Gradle and Maven with auto-import 68 | # When using Gradle or Maven with auto-import, you should exclude module files, 69 | # since they will be recreated, and may cause churn. Uncomment if using 70 | # auto-import. 
71 | # .idea/artifacts 72 | # .idea/compiler.xml 73 | # .idea/jarRepositories.xml 74 | # .idea/modules.xml 75 | # .idea/*.iml 76 | # .idea/modules 77 | # *.iml 78 | # *.ipr 79 | 80 | # CMake 81 | cmake-build-*/ 82 | 83 | # Mongo Explorer plugin 84 | .idea/**/mongoSettings.xml 85 | 86 | # File-based project format 87 | *.iws 88 | 89 | # IntelliJ 90 | out/ 91 | 92 | # mpeltonen/sbt-idea plugin 93 | .idea_modules/ 94 | 95 | # JIRA plugin 96 | atlassian-ide-plugin.xml 97 | 98 | # Cursive Clojure plugin 99 | .idea/replstate.xml 100 | 101 | # SonarLint plugin 102 | .idea/sonarlint/ 103 | 104 | # Crashlytics plugin (for Android Studio and IntelliJ) 105 | com_crashlytics_export_strings.xml 106 | crashlytics.properties 107 | crashlytics-build.properties 108 | fabric.properties 109 | 110 | # Editor-based Rest Client 111 | .idea/httpRequests 112 | 113 | # Android studio 3.1+ serialized cache file 114 | .idea/caches/build_file_checksums.ser 115 | 116 | ### PyCharm+all Patch ### 117 | # Ignore everything but code style settings and run configurations 118 | # that are supposed to be shared within teams. 119 | 120 | .idea/* 121 | 122 | ### Terraform ### 123 | # Local .terraform directories 124 | **/.terraform/* 125 | 126 | # .tfstate files 127 | *.tfstate 128 | *.tfstate.* 129 | 130 | # Crash log files 131 | crash.log 132 | crash.*.log 133 | 134 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as 135 | # password, private keys, and other secrets. These should not be part of version 136 | # control as they are data points which are potentially sensitive and subject 137 | # to change depending on the environment. 138 | *.tfvars 139 | *.tfvars.json 140 | 141 | # Ignore override files as they are usually used to override resources locally and so 142 | # are not checked in 143 | override.tf 144 | override.tf.json 145 | *_override.tf 146 | *_override.tf.json 147 | 148 | # Include override files you do wish to add to version control using negated pattern 149 | # !example_override.tf 150 | 151 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 152 | # example: *tfplan* 153 | 154 | # Ignore CLI configuration files 155 | .terraformrc 156 | terraform.rc 157 | 158 | ### Windows ### 159 | # Windows thumbnail cache files 160 | Thumbs.db 161 | Thumbs.db:encryptable 162 | ehthumbs.db 163 | ehthumbs_vista.db 164 | 165 | # Dump file 166 | *.stackdump 167 | 168 | # Folder config file 169 | [Dd]esktop.ini 170 | 171 | # Recycle Bin used on file shares 172 | $RECYCLE.BIN/ 173 | 174 | # Windows Installer files 175 | *.cab 176 | *.msi 177 | *.msix 178 | *.msm 179 | *.msp 180 | 181 | # Windows shortcuts 182 | *.lnk 183 | 184 | # End of https://www.toptal.com/developers/gitignore/api/terraform,pycharm+all,macos,windows 185 | 186 | # BUILD 187 | 188 | brickflow.egg-info 189 | .eggs 190 | dist 191 | build 192 | 193 | # SAMPLES / TESTING 194 | brickflow/sample_dags 195 | main*.py 196 | 197 | # Coverage related 198 | .coverage 199 | coverage.xml 200 | site 201 | scripts 202 | __pycache__ 203 | integration_workflows 204 | 205 | *venv 206 | 207 | # VScode 208 | .vscode 209 | 210 | # GENERATED BY BRICKFLOW CLI --START-- 211 | 212 | ### Terraform ### 213 | # Local .terraform directories 214 | **/.terraform/* 215 | 216 | # .tfstate files 217 | *.tfstate 218 | *.tfstate.* 219 | 220 | # Crash log files 221 | crash.log 222 | crash.*.log 223 | 224 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as 225 | # password, private keys, and other 
secrets. These should not be part of version 226 | # control as they are data points which are potentially sensitive and subject 227 | # to change depending on the environment. 228 | *.tfvars 229 | *.tfvars.json 230 | 231 | # Ignore override files as they are usually used to override resources locally and so 232 | # are not checked in 233 | override.tf 234 | override.tf.json 235 | *_override.tf 236 | *_override.tf.json 237 | 238 | # Include override files you do wish to add to version control using negated pattern 239 | # !example_override.tf 240 | 241 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 242 | # example: *tfplan* 243 | 244 | # Ignore CLI configuration files 245 | .terraformrc 246 | terraform.rc 247 | 248 | # GENERATED BY BRICKFLOW CLI --END-- 249 | 250 | bundle.yml 251 | 252 | brickflow/bundles/schema.json 253 | brickflow/bundles/transformed_schema.json 254 | .databricks 255 | cdktf.out -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: make-check 5 | name: Running Lint Checks 6 | entry: make check 7 | language: system 8 | files: '\.py$' 9 | pass_filenames: false 10 | always_run: true 11 | stages: [commit] 12 | - id: make-cov 13 | name: Running Lint Checks & Test Suite 14 | entry: make cov 15 | language: system 16 | files: '\.py$' 17 | pass_filenames: false 18 | always_run: true 19 | stages: [push] -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # This is a comment. 2 | # Each line is a file pattern followed by one or more owners. 3 | 4 | # These owners will be the default owners for everything in 5 | # the repo. Unless a later match takes precedence, 6 | # @Nike-Inc/brickflow-dev will be requested for 7 | # review when someone opens a pull request. 8 | * @Nike-Inc/brickflow-dev @asingamaneni @stikkireddy @newfront 9 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | There are a few guidelines that we need contributors to follow so that we are able to process requests as efficiently as possible. If you have any questions or concerns please feel free to contact us at [opensource@nike.com](mailto:opensource@nike.com). 4 | 5 | ## Getting Started 6 | 7 | * Review our [Code of Conduct](https://github.com/Nike-Inc/nike-inc.github.io/blob/master/CONDUCT.md) 8 | * Submit the [Individual Contributor License Agreement](https://www.clahub.com/agreements/Nike-Inc/fastbreak) 9 | * Make sure you have a [GitHub account](https://github.com/signup/free) 10 | * Submit a ticket for your issue, assuming one does not already exist. 11 | * Clearly describe the issue including steps to reproduce when it is a bug. 12 | * Make sure you fill in the earliest version that you know has the issue. 13 | * Fork the repository on GitHub 14 | 15 | ## Making Changes 16 | 17 | * Create a feature branch off of `main` before you start your work. 18 | * Please avoid working directly on the `main` branch. 19 | * Setup the required package manager [poetry](#-package-manager) 20 | * Setup the dev environment [see below](#-dev-environment-setup) 21 | * Make commits of logical units. 
22 | * You may be asked to squash unnecessary commits down to logical units. 23 | * Check for unnecessary whitespace with `git diff --check` before committing. 24 | * Write meaningful, descriptive commit messages. 25 | * Please follow existing code conventions when working on a file 26 | * Make sure to check the standards on the code [see below](#-linting-and-standards) 27 | * Install java 11 since it's required for unit tests while running 'make tests' 28 | * Make sure to test the code before you push changes [see below](#-testing) 29 | 30 | ## 🤝 Submitting Changes 31 | 32 | * Push your changes to a topic branch in your fork of the repository. 33 | * Submit a pull request to the repository in the Nike-Inc organization. 34 | * After feedback has been given we expect responses within two weeks. After two weeks we may close the pull request 35 | if it isn't showing any activity. 36 | * Bug fixes or features that lack appropriate tests may not be considered for merge. 37 | * Changes that lower test coverage may not be considered for merge. 38 | 39 | ### 📦 Package manager 40 | 41 | We use `make` for managing different steps of setup and maintenance in the project. You can install make by following 42 | the instructions [here](https://formulae.brew.sh/formula/make) 43 | 44 | We use `poetry` as our package manager. 45 | 46 | Please DO NOT use pip or conda to install the dependencies. Instead, use poetry: 47 | 48 | ```bash 49 | make poetry-install 50 | ``` 51 | 52 | ### 📌 Dev Environment Setup 53 | 54 | To ensure our standards, make sure to install the required packages. 55 | 56 | ```bash 57 | make dev 58 | ``` 59 | 60 | ### 🧹 Linting and Standards 61 | 62 | We use `pylint`, `black` and `mypy` to maintain standards in the codebase 63 | 64 | ```bash 65 | make check 66 | ``` 67 | 68 | Make sure that the linter does not report any errors or warnings before submitting a pull request. 69 | 70 | ### 🧪 Testing 71 | 72 | We use `pytest` to test our code. You can run the tests by running the following command: 73 | 74 | ```bash 75 | make test 76 | ``` 77 | 78 | #### 🧪 Integration Testing 79 | * Once you add a feature or a bug fix in brickflow, create a whl file from your feature branch 80 | * run 'poetry build' to generate the whl under the dist folder 81 | * Install brickflow from the whl file 82 | * pip install -whl file path- 83 | * Upload the whl file to Databricks workspace 84 | * Databricks Workspace --> Add --> Library 85 | * Copy the path of the uploaded whl file and paste it in the entrypoint.py as a Wheel Library 86 | * libraries=[ 87 | WheelTaskLibrary("dbfs:/FileStore/jars/dummy.whl") 88 | ], 89 | * Create a workflow and deploy it to make sure the feature or bug fix works as expected 90 | 91 | Make sure that all tests pass before submitting a pull request. 92 | 93 | ## 🚀 Release Process 94 | 95 | At the moment, the release process is manual. We try to make frequent releases. Usually, we release a new version when we have a new feature or bugfix. A developer with admin rights to the repository will create a new release on GitHub, and then publish the new version to PyPI. 
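As a companion to the Integration Testing steps above, here is a minimal entrypoint sketch showing where the uploaded wheel gets referenced. It is only an illustration: the project name and the `workflows` package are placeholders, the dbfs path should match wherever you uploaded your wheel, and it assumes `WheelTaskLibrary` is importable from the top-level `brickflow` package alongside the other task library helpers.

```python
from brickflow import Project, WheelTaskLibrary

import workflows  # placeholder: the package that holds your workflow modules


def main() -> None:
    with Project(
        "integration-test-project",  # placeholder project name
        libraries=[
            # Path of the wheel you uploaded to the Databricks workspace above
            WheelTaskLibrary("dbfs:/FileStore/jars/dummy.whl"),
        ],
    ) as f:
        f.add_pkg(workflows)


if __name__ == "__main__":
    main()
```

Deploying a workflow from an entrypoint like this installs your feature-branch wheel on the job cluster, so the new behaviour can be verified end to end.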
96 | 97 | # Additional Resources 98 | 99 | * [General GitHub documentation](https://help.github.com/) 100 | * [GitHub pull request documentation](https://help.github.com/send-pull-requests/) 101 | * [Nike's Code of Conduct](https://github.com/Nike-Inc/nike-inc.github.io/blob/master/CONDUCT.md) 102 | * [Nike's Individual Contributor License Agreement](https://www.clahub.com/agreements/Nike-Inc/fastbreak) 103 | * [Nike OSS](https://nike-inc.github.io/) -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | # Authors 2 | * [Ashok Singamaneni](https://www.linkedin.com/in/ashok-singamaneni-193b1a32/) 3 | * [Sriharsha Tikkireddy](https://www.linkedin.com/in/sriharsha-tikkireddy/) 4 | 5 | # Contributors 6 | Thanks to the contributors who helped on this project apart from the authors 7 | * [Danny Meijer](https://www.linkedin.com/in/dannydatascientist/) 8 | * [Pariksheet Marotrao Barapatre](https://www.linkedin.com/in/pari-data-products/) 9 | * [Bhargav Sangars](https://www.linkedin.com/in/bhargav-sangars-a4b61037/) 10 | * [Brend Braeckmans](https://www.linkedin.com/in/brendbraeckmans/) 11 | * [Rebecca Raj Shree](https://www.linkedin.com/in/rebecca-raj-shree/) 12 | * [Brent (Johnson) Spetner](https://www.linkedin.com/in/brentjohnsoneng/) 13 | * [Dmitrii Grigorev](https://www.linkedin.com/in/dmitrii-grigorev-074739135/) 14 | * [Chanukya Konuganti](https://www.linkedin.com/in/chanukyakonuganti/) 15 | * [Maxim Mityutko](https://www.linkedin.com/in/mityutko/) 16 | * [Raju Gujjalapati](https://in.linkedin.com/in/raju-gujjalapati-470a88171) 17 | * [Madhusudan Koukutla](https://www.linkedin.com/in/madhusudan-reddy/) 18 | * [Surya Teja Jagatha](https://www.linkedin.com/in/surya-teja-jagatha/) 19 | * [Iris Meerman](https://www.linkedin.com/in/iris-meerman-92694675/) 20 | * [Michael Espiritu](https://www.linkedin.com/in/michaelespiritu92/) 21 | * [Riccardo Iacomini](https://www.linkedin.com/in/riccardo-iacomini-b757b6118/) 22 | 23 | # Honorary Mentions 24 | Thanks to the team below for invaluable insights and support throughout the initial release of this project 25 | 26 | * [Joe Hollow](https://www.linkedin.com/in/joe-hollow-23088b1/) 27 | * [Aditya Chaturvedi](https://www.linkedin.com/in/chaturvediaditya/) 28 | * [Scott Haines](https://www.linkedin.com/in/scotthaines/) 29 | * [Arijit Banerjee](https://www.linkedin.com/in/massborn/) 30 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | USER root 3 | 4 | # DO NOT ADD AS ENV: 5 | # debconf noninteractive 6 | # This is the anti-frontend. It never interacts with you at all, 7 | # and makes the default answers be used for all questions. It 8 | # might mail error messages to root, but that's it; otherwise it 9 | # is completely silent and unobtrusive, a perfect frontend for 10 | # automatic installs. If you are using this front-end, and require 11 | # non-default answers to questions, you will need to preseed the 12 | # debconf database; see the section below on Unattended Package 13 | # Installation for more details. 
14 | 15 | RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections && \ 16 | apt-get update -y && apt-get install -y git curl wget unzip software-properties-common 17 | SHELL ["/bin/bash", "-c"] 18 | 19 | ENV NODE_VERSION 18.14.0 20 | 21 | RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections \ 22 | && curl https://raw.githubusercontent.com/creationix/nvm/master/install.sh | bash \ 23 | && . $HOME/.nvm/nvm.sh \ 24 | && nvm install $NODE_VERSION \ 25 | && nvm use $NODE_VERSION \ 26 | && npm install --global cdktf-cli@latest 27 | 28 | ENV NODE_PATH /root/.nvm/versions/node/v$NODE_VERSION/lib/node_modules 29 | ENV PATH /root/.nvm/versions/node/v$NODE_VERSION/bin:$PATH 30 | ENV NVM_DIR /root/.nvm 31 | 32 | RUN add-apt-repository ppa:deadsnakes/ppa 33 | RUN apt-get install -y python3.9 python3-pip python3.9-distutils && ln -s /usr/bin/python3.9 /usr/bin/python 34 | 35 | ARG CACHEBUST=1 36 | 37 | RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ 38 | python3.9 get-pip.py && \ 39 | ln -s /usr/local/bin/pip3.9 /usr/bin/pip3 && \ 40 | ln -s /usr/local/bin/pip3.9 /usr/bin/pip 41 | 42 | RUN python -m pip install -U pip && pip install -U setuptools poetry 43 | 44 | WORKDIR /brickflow 45 | 46 | COPY . . 47 | 48 | VOLUME ["/brickflow", "$(pwd)"] 49 | 50 | RUN poetry install 51 | 52 | CMD ["/bin/bash"] 53 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the license file 2 | include LICENSE.txt -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | black-check: 2 | @poetry run black --check . 3 | 4 | fmt: 5 | @poetry run black . 6 | 7 | check: black-check mypy 8 | @poetry run prospector --profile prospector.yaml --no-autodetect 9 | 10 | mypy: 11 | @poetry run mypy 12 | 13 | cov: check 14 | @poetry run coverage run --source=brickflow --omit "brickflow/sample_dags/*,sample_workflows/*,brickflow/tf/*" -m pytest && \ 15 | poetry run coverage report -m && \ 16 | poetry run coverage xml 17 | 18 | gen-bundle-sdk: 19 | @pip install . --force-reinstall 20 | @./tools/gen-bundle.sh 21 | 22 | dev: 23 | @poetry install --all-extras --with dev 24 | @poetry run pre-commit install 25 | @poetry run pre-commit install --hook-type pre-push 26 | 27 | deploy_env_setup: 28 | @poetry install --all-extras --with dev 29 | 30 | test: 31 | @poetry run coverage run --source=brickflow --omit "brickflow/bundles/*,brickflow/sample_dags/*,sample_workflows/*,brickflow/tf/*" -m pytest && \ 32 | poetry run coverage report -m && \ 33 | poetry run coverage html 34 | 35 | clean: 36 | @rm -rf dist 37 | 38 | build: clean 39 | @poetry build 40 | 41 | poetry: 42 | @poetry install --all-extras --with dev 43 | 44 | coverage: check test 45 | 46 | docs: 47 | @poetry run mike deploy -u dev latest 48 | @poetry run mike set-default latest 49 | @poetry run mike serve 50 | 51 | deploy-docs: 52 | @poetry run mike deploy --push --update-aliases $(version) latest 53 | 54 | docker-local: 55 | docker build -t brickflow:latest --build-arg CACHEBUST="$(shell date +%s)" . 
56 | 57 | poetry-install: 58 | @pip install --upgrade setuptools && pip install poetry && poetry self add "poetry-dynamic-versioning[plugin]" 59 | 60 | get-version: 61 | @poetry version 62 | 63 | requirements: 64 | @poetry export -f requirements.txt --output requirements.txt --with dev --without-hashes 65 | 66 | docker-build: 67 | @docker build -t brickflow-local . 68 | 69 | docker: docker-build 70 | @docker run -it -v "$(shell pwd)":/brickflow brickflow-local /bin/bash 71 | 72 | .PHONY: docs -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Brickflow 2 | 3 | [//]: # ([![CodeQL](https://github.com/Nike-Inc/brickflow/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/Nike-Inc/brickflow/actions/workflows/codeql-analysis.yml)) 4 | [![build](https://github.com/Nike-Inc/brickflow/actions/workflows/onpush.yml/badge.svg)](https://github.com/Nike-Inc/brickflow/actions/workflows/onpush.yml) 5 | [![codecov](https://codecov.io/gh/Nike-Inc/brickflow/branch/main/graph/badge.svg)](https://codecov.io/gh/Nike-Inc/brickflow) 6 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 7 | [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) 8 | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 9 | ![PYPI version](https://img.shields.io/pypi/v/brickflows.svg) 10 | ![PYPI - Downloads](https://static.pepy.tech/badge/brickflows) 11 | ![PYPI - Python Version](https://img.shields.io/pypi/pyversions/brickflows.svg) 12 | 13 |

14 | BrickFlow is specifically designed to enable the development of Databricks workflows using Python, streamlining the 15 | process through a command-line interface (CLI) tool.


19 | 20 | --- 21 | 22 | ### Contributors 23 | 24 | Thanks to all the [contributors](https://github.com/Nike-Inc/brickflow/blob/main/CONTRIBUTORS.md) who have helped ideate, develop and bring Brickflow to its current state. 25 | 26 | ### Contributing 27 | 28 | We're delighted that you're interested in contributing to our project! To get started, 29 | please carefully read and follow the guidelines provided in our [contributing](https://github.com/Nike-Inc/brickflow/blob/main/CONTRIBUTING.md) document. 30 | 31 | ### Documentation 32 | 33 | Brickflow documentation can be found [here](https://engineering.nike.com/brickflow/). 34 | 35 | ### Getting Started 36 | 37 | #### Prerequisites 38 | 1. Install brickflows 39 | 40 | ```shell 41 | pip install brickflows 42 | ``` 43 | 44 | 2. Install [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/databricks-cli.html) 45 | 46 | ```shell 47 | curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sudo sh 48 | ``` 49 | 50 | 3. Configure Databricks cli with workspace token. This configures your `~/.databrickscfg` file. 51 | 52 | ```shell 53 | databricks configure --token 54 | ``` 55 | 56 | #### Hello World workflow 57 | 1. Create your first workflow using brickflow 58 | ```shell 59 | mkdir hello-world-brickflow 60 | cd hello-world-brickflow 61 | brickflow projects add 62 | ``` 63 | 64 | 2. Provide the following inputs 65 | ```shell 66 | Project name: hello-world-brickflow 67 | Path from repo root to project root (optional) [.]: . 68 | Path from project root to workflows dir: workflows 69 | Git https url: https://github.com/Nike-Inc/brickflow.git 70 | Brickflow version [auto]: 71 | Spark expectations version [0.5.0]: 0.8.0 72 | Skip entrypoint [y/N]: N 73 | ``` 74 | _Note: You can provide your own github repo url._ 75 | 76 | 3. Create a new file hello_world_wf.py in the workflows directory 77 | ```shell 78 | touch workflows/hello_world_wf.py 79 | ``` 80 | 81 | 4. 
Copy the following code in hello_world_wf.py file 82 | ```python 83 | from brickflow import ( 84 | ctx, 85 | Cluster, 86 | Workflow, 87 | NotebookTask, 88 | ) 89 | from airflow.operators.bash import BashOperator 90 | 91 | 92 | cluster = Cluster( 93 | name="job_cluster", 94 | node_type_id="m6gd.xlarge", 95 | spark_version="13.3.x-scala2.12", 96 | min_workers=1, 97 | max_workers=2, 98 | ) 99 | 100 | wf = Workflow( 101 | "hello_world_workflow", 102 | default_cluster=cluster, 103 | tags={ 104 | "product_id": "brickflow_demo", 105 | }, 106 | common_task_parameters={ 107 | "catalog": "", 108 | "database": "", 109 | }, 110 | ) 111 | 112 | @wf.task 113 | # this task does nothing but explains the use of context object 114 | def start(): 115 | print(f"Environment: {ctx.env}") 116 | 117 | @wf.notebook_task 118 | # this task runs a databricks notebook 119 | def example_notebook(): 120 | return NotebookTask( 121 | notebook_path="notebooks/example_notebook.py", 122 | base_parameters={ 123 | "some_parameter": "some_value", # in the notebook access these via dbutils.widgets.get("some_parameter") 124 | }, 125 | ) 126 | 127 | 128 | @wf.task(depends_on=[start, example_notebook]) 129 | # this task runs a bash command 130 | def list_lending_club_data_files(): 131 | return BashOperator( 132 | task_id=list_lending_club_data_files.__name__, 133 | bash_command="ls -lrt /dbfs/databricks-datasets/samples/lending_club/parquet/", 134 | ) 135 | 136 | @wf.task(depends_on=list_lending_club_data_files) 137 | # this task runs the pyspark code 138 | def lending_data_ingest(): 139 | ctx.spark.sql( 140 | f""" 141 | CREATE TABLE IF NOT EXISTS 142 | {ctx.dbutils_widget_get_or_else(key="catalog", debug="development")}.\ 143 | {ctx.dbutils_widget_get_or_else(key="database", debug="dummy_database")}.\ 144 | {ctx.dbutils_widget_get_or_else(key="brickflow_env", debug="local")}_lending_data_ingest 145 | USING DELTA -- this is default just for explicit purpose 146 | SELECT * FROM parquet.`dbfs:/databricks-datasets/samples/lending_club/parquet/` 147 | """ 148 | ) 149 | ``` 150 | _Note: Modify the values of catalog/database for common_task_parameters._ 151 | 152 | 153 | 5. Create a new file example_notebook.py in the notebooks directory 154 | ```shell 155 | mkdir notebooks 156 | touch notebooks/example_notebook.py 157 | ``` 158 | 6. Copy the following code in the example_notebook.py file 159 | ```python 160 | # Databricks notebook source 161 | 162 | print("hello world") 163 | ``` 164 | 165 | #### Deploy the workflow to databricks 166 | ```shell 167 | brickflow projects deploy --project hello-world-brickflow -e local 168 | ``` 169 | 170 | ### Run the demo workflow 171 | 1. Login to databricks workspace 172 | 2. Go to the workflows and select the workflow 173 |


175 | 4. click on the run button 176 | 177 | ### Examples 178 | Refer to the [examples](https://github.com/Nike-Inc/brickflow/tree/main/examples/brickflow_examples) for more examples. 179 | 180 | 181 | -------------------------------------------------------------------------------- /brickflow/bundles/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/brickflow/bundles/__init__.py -------------------------------------------------------------------------------- /brickflow/cli/commands.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import subprocess 5 | from typing import Optional, Union, Tuple, List 6 | 7 | from click import ClickException 8 | 9 | from brickflow import _ilog 10 | 11 | 12 | def exec_command( 13 | path_to_executable: str, 14 | base_command: Optional[str], 15 | args: Union[Tuple[str] | List[str]], 16 | capture_output: bool = False, 17 | ) -> Optional[str]: 18 | os.environ["PYTHONPATH"] = os.getcwd() 19 | my_env = os.environ.copy() 20 | try: 21 | _args = list(args) 22 | # add a base command if its provided for proxying for brickflow deploy 23 | if base_command is not None: 24 | _args = [base_command] + _args 25 | _ilog.info("Executing command: %s", " ".join([path_to_executable, *_args])) 26 | 27 | if capture_output is True: 28 | res = subprocess.run( 29 | [path_to_executable, *_args], 30 | check=True, 31 | env=my_env, 32 | capture_output=True, 33 | text=True, 34 | ) 35 | return res.stdout.strip() 36 | 37 | subprocess.run([path_to_executable, *_args], check=True, env=my_env) 38 | except subprocess.CalledProcessError as e: 39 | raise ClickException(str(e)) 40 | 41 | return None 42 | -------------------------------------------------------------------------------- /brickflow/cli/configure.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import importlib 4 | import os 5 | import re 6 | import sys 7 | from pathlib import Path 8 | from typing import Callable, Any, Optional 9 | 10 | import click 11 | from jinja2 import Environment, BaseLoader 12 | 13 | from brickflow import _ilog, BrickflowProjectConstants, get_entrypoint_python 14 | from brickflow.cli.commands import exec_command 15 | 16 | PWD = Path(__file__).parent.absolute() 17 | GITIGNORE_TEMPLATE = PWD / "gitignore_template.txt" 18 | GIT_PATH = Path(".git") 19 | 20 | 21 | class GitNotFoundError(Exception): 22 | pass 23 | 24 | 25 | class GitIgnoreNotFoundError(Exception): 26 | pass 27 | 28 | 29 | def _gitignore_exists() -> bool: 30 | return os.path.exists(".gitignore") and os.path.isfile(".gitignore") 31 | 32 | 33 | def _create_gitignore_if_not_exists() -> None: 34 | if _gitignore_exists() is False: 35 | Path(".gitignore").touch(mode=0o755) 36 | 37 | 38 | def _get_gitignore() -> str: 39 | return Path(".gitignore").read_text(encoding="utf-8") 40 | 41 | 42 | def _get_gitignore_template() -> str: 43 | return GITIGNORE_TEMPLATE.read_text() 44 | 45 | 46 | def _write_gitignore(data: str) -> None: 47 | Path(".gitignore").write_text(encoding="utf-8", data=data) 48 | 49 | 50 | def _update_gitignore() -> None: 51 | search_regex = re.compile( 52 | r"(# GENERATED BY BRICKFLOW CLI --START--(.|\n)*# GENERATED BY BRICKFLOW CLI --END--)" 53 | ) 54 | 55 | git_ignore_data = _get_gitignore() 56 | git_ignore_template = 
_get_gitignore_template() 57 | search = search_regex.findall(git_ignore_data) 58 | if len(search) > 0: 59 | search_match = search[0][0] 60 | gitignore_file_data = git_ignore_data.replace(search_match, git_ignore_template) 61 | else: 62 | gitignore_file_data = "\n\n".join([git_ignore_data, git_ignore_template]) 63 | _write_gitignore(gitignore_file_data) 64 | 65 | 66 | def _validate_package(path_str: str) -> str: 67 | folder_path: Path = Path(path_str) 68 | 69 | if not folder_path.exists(): 70 | raise ImportError(f"Invalid pkg error: {folder_path.as_posix()}") 71 | 72 | sys.path.append(os.getcwd()) 73 | folder_pkg_path: str = folder_path.as_posix().replace("/", ".") 74 | 75 | for module in folder_path.glob("**/*.py"): # only find python files 76 | # ignore __init__.py 77 | if module.name == "__init__.py": 78 | continue 79 | module_name = module.as_posix().replace(".py", "").replace("/", ".") 80 | # import all the modules into the mod object and not actually import them using __import__ 81 | mod = importlib.import_module(module_name) 82 | click.echo(f"Scanned module: {mod.__name__}") 83 | 84 | return folder_pkg_path 85 | 86 | 87 | def render_template(**kwargs) -> str: # type: ignore 88 | template = Path(__file__).parent.absolute() / "entrypoint.template" 89 | with template.open("r") as f: 90 | data = f.read() 91 | return Environment(loader=BaseLoader()).from_string(data).render(**kwargs) 92 | 93 | 94 | def create_entry_point(working_dir: str, data: str) -> None: 95 | path = Path(working_dir) / "entrypoint.py" 96 | if path.exists(): 97 | click.echo(f"Path: {str(path.absolute())} already exists...") 98 | # path = Path(working_dir) / "entrypoint.py.new" 99 | else: 100 | click.echo(f"Creating file in path: {str(path.absolute())}...") 101 | path.write_text(data) 102 | 103 | 104 | def create_brickflow_project_root_marker() -> None: 105 | path = Path( 106 | f"{BrickflowProjectConstants.DEFAULT_MULTI_PROJECT_ROOT_FILE_NAME.value}." 
107 | f"{BrickflowProjectConstants.DEFAULT_CONFIG_FILE_TYPE.value}" 108 | ) 109 | if path.exists(): 110 | click.echo(f"Path: {str(path.absolute())} already exists...") 111 | # path = Path(working_dir) / "entrypoint.py.new" 112 | else: 113 | click.echo(f"Creating file in path: {str(path.absolute())}...") 114 | path.write_text( 115 | "# DO NOT MODIFY THIS FILE - IT IS AUTO GENERATED BY BRICKFLOW AND RESERVED FOR FUTURE USAGE", 116 | encoding="utf-8", 117 | ) 118 | 119 | 120 | def bind_env_var(env_var: str) -> Callable: 121 | def callback( 122 | ctx: click.Context, # noqa 123 | param: str, # noqa 124 | value: Any, 125 | ) -> None: 126 | # pylint: disable=unused-argument 127 | if value is not None and len(value) > 0: 128 | _ilog.info("Setting env var: %s to %s...", env_var, value) 129 | if isinstance(value, list): 130 | os.environ[env_var] = ",".join(value) 131 | if isinstance(value, tuple): 132 | os.environ[env_var] = ",".join(value) 133 | elif isinstance(value, bool): 134 | os.environ[env_var] = str(value).lower() 135 | else: 136 | os.environ[env_var] = value 137 | 138 | return callback 139 | 140 | 141 | def get_entrypoint(**kwargs: Any) -> str: 142 | wd: Optional[str] = kwargs.get("workflows_dir") 143 | if wd is None: 144 | raise ValueError( 145 | "workflows_dir not set, please set it using --workflows-dir or -wd" 146 | ) 147 | return str(Path(wd) / "entrypoint.py") 148 | 149 | 150 | def log_important_versions(bundle_cli: str) -> None: 151 | version = exec_command(bundle_cli, "--version", [], capture_output=True) 152 | _ilog.info("Using bundle version: %s", version) 153 | log_python_version() 154 | 155 | 156 | def log_python_version() -> None: 157 | version = exec_command( 158 | get_entrypoint_python(), "--version", [], capture_output=True 159 | ) 160 | _ilog.info("Using python version: %s", version) 161 | -------------------------------------------------------------------------------- /brickflow/cli/constants.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from enum import Enum 4 | 5 | from decouple import config 6 | 7 | from brickflow import BrickflowEnvVars 8 | 9 | 10 | class BrickflowDeployMode(Enum): 11 | BUNDLE = "bundle" 12 | 13 | 14 | INTERACTIVE_MODE = config( 15 | BrickflowEnvVars.BRICKFLOW_INTERACTIVE_MODE.value, default=True, cast=bool 16 | ) 17 | -------------------------------------------------------------------------------- /brickflow/cli/entrypoint.template: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | from brickflow import Project, PypiTaskLibrary, MavenTaskLibrary # make sure brickflow imports are at the top 4 | 5 | import {{ pkg }} 6 | 7 | def main() -> None: 8 | """Project entrypoint""" 9 | with Project( 10 | "{{ project_name }}", 11 | git_repo="{{ git_https_url }}", 12 | provider="{{ git_provider }}", 13 | libraries=[ 14 | # PypiTaskLibrary(package="spark-expectations=={{spark_expectations_version}}"), # Uncomment if spark-expectations is needed 15 | ], 16 | ) as f: 17 | f.add_pkg({{pkg}}) 18 | 19 | 20 | if __name__ == "__main__": 21 | main() 22 | 23 | -------------------------------------------------------------------------------- /brickflow/cli/gitignore_template.txt: -------------------------------------------------------------------------------- 1 | # GENERATED BY BRICKFLOW CLI --START-- 2 | 3 | ### Terraform ### 4 | # Local .terraform directories 5 | **/.terraform/* 6 | 7 | # .tfstate files 8 | *.tfstate 9 
| *.tfstate.* 10 | 11 | # Crash log files 12 | crash.log 13 | crash.*.log 14 | 15 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as 16 | # password, private keys, and other secrets. These should not be part of version 17 | # control as they are data points which are potentially sensitive and subject 18 | # to change depending on the environment. 19 | *.tfvars 20 | *.tfvars.json 21 | 22 | # Ignore override files as they are usually used to override resources locally and so 23 | # are not checked in 24 | override.tf 25 | override.tf.json 26 | *_override.tf 27 | *_override.tf.json 28 | 29 | # Include override files you do wish to add to version control using negated pattern 30 | # !example_override.tf 31 | 32 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 33 | # example: *tfplan* 34 | 35 | # Ignore CLI configuration files 36 | .terraformrc 37 | terraform.rc 38 | 39 | # GENERATED BY BRICKFLOW CLI --END-- -------------------------------------------------------------------------------- /brickflow/codegen/__init__.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from enum import Enum 3 | from pathlib import Path 4 | 5 | from typing import TYPE_CHECKING, Optional, Dict, Any 6 | 7 | from decouple import config 8 | 9 | from brickflow import get_brickflow_version, BrickflowEnvVars, BrickflowDefaultEnvs 10 | 11 | if TYPE_CHECKING: 12 | from brickflow.engine.project import _Project 13 | 14 | 15 | class CodegenInterface(abc.ABC): 16 | def __init__( 17 | self, project: "_Project", id_: str, env: str, **_: Any 18 | ) -> None: # noqa 19 | self.env: str = env 20 | self.project: "_Project" = project 21 | self.id_ = id_ 22 | 23 | @abc.abstractmethod 24 | def synth(self) -> None: 25 | pass 26 | 27 | 28 | class DatabricksDefaultClusterTagKeys(Enum): 29 | ENVIRONMENT = "environment" 30 | DEPLOYED_BY = "deployed_by" 31 | DEPLOYED_AT = "deployed_at" 32 | BRICKFLOW_PROJECT_NAME = "brickflow_project_name" 33 | BRICKFLOW_DEPLOYMENT_MODE = "brickflow_deployment_mode" 34 | DATABRICKS_TF_PROVIDER_VERSION = "databricks_tf_provider_version" 35 | BRICKFLOW_VERSION = "brickflow_version" 36 | 37 | 38 | BRICKFLOW_BUILTIN_DEPLOY_TAGS = { 39 | "brickflow_version": get_brickflow_version() 40 | or "undefined", # certain scenarios get_brickflow_version maybe None 41 | } 42 | 43 | 44 | def get_brickflow_tags( 45 | user_defined_tags: Optional[Dict[str, str]], other_tags: Dict[str, str] 46 | ) -> Dict[str, str]: 47 | return {**(user_defined_tags or {}), **other_tags, **BRICKFLOW_BUILTIN_DEPLOY_TAGS} 48 | 49 | 50 | def handle_mono_repo_path(project: "_Project", env: str) -> str: 51 | base_path = config( 52 | BrickflowEnvVars.BRICKFLOW_MONOREPO_PATH_TO_BUNDLE_ROOT.value, None 53 | ) 54 | 55 | if project.entry_point_path is None: 56 | raise ValueError("project.entry_point_path is None") 57 | 58 | if base_path is None or env == BrickflowDefaultEnvs.LOCAL.value: 59 | return project.entry_point_path 60 | else: 61 | return str(Path(base_path) / project.entry_point_path) 62 | -------------------------------------------------------------------------------- /brickflow/context/__init__.py: -------------------------------------------------------------------------------- 1 | from .context import ( 2 | ctx, 3 | Context, 4 | BrickflowTaskComs, 5 | BRANCH_SKIP_EXCEPT, 6 | SKIP_EXCEPT_HACK, 7 | RETURN_VALUE_KEY, 8 | BrickflowInternalVariables, 9 | BrickflowBuiltInTaskVariables, 10 | BrickflowTaskComsObject, 11 | 
TaskComsObjectResult, 12 | ) 13 | 14 | __all__ = [ 15 | "ctx", 16 | "Context", 17 | "BrickflowTaskComs", 18 | "BRANCH_SKIP_EXCEPT", 19 | "SKIP_EXCEPT_HACK", 20 | "RETURN_VALUE_KEY", 21 | "BrickflowInternalVariables", 22 | "BrickflowBuiltInTaskVariables", 23 | "BrickflowTaskComsObject", 24 | "TaskComsObjectResult", 25 | ] 26 | -------------------------------------------------------------------------------- /brickflow/engine/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import functools 4 | import logging 5 | import subprocess 6 | import sys 7 | from typing import Callable 8 | 9 | from brickflow import log, get_default_log_handler 10 | 11 | 12 | def _call(cmd: str, **kwargs: bool) -> bytes: 13 | return subprocess.check_output( # type: ignore 14 | [ 15 | cmd, 16 | ], 17 | **kwargs, 18 | ) 19 | 20 | 21 | def get_current_commit() -> str: 22 | p = _call('git log -n 1 --pretty=format:"%H"', shell=True) 23 | return p.strip().decode("utf-8") 24 | 25 | 26 | def with_brickflow_logger(f: Callable) -> Callable: 27 | @functools.wraps(f) 28 | def func(*args, **kwargs): # type: ignore 29 | _self = args[0] 30 | log.handlers = [] 31 | logger_handler = logging.StreamHandler( 32 | stream=sys.stdout 33 | ) # Handler for the logger 34 | # First, generic formatter: 35 | logger_handler.setFormatter( 36 | logging.Formatter( 37 | f"[%(asctime)s] [%(levelname)s] [brickflow:{_self.name}] " 38 | "{%(module)s.py:%(funcName)s:%(lineno)d} - %(message)s" 39 | ) 40 | ) 41 | log.addHandler(logger_handler) 42 | resp = f(*args, **kwargs) 43 | 44 | log.handlers = [get_default_log_handler()] 45 | 46 | return resp 47 | 48 | return func 49 | 50 | 51 | ROOT_NODE = "root" 52 | -------------------------------------------------------------------------------- /brickflow/engine/hooks.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | import pluggy 4 | 5 | if TYPE_CHECKING: 6 | from brickflow.engine.task import Task, TaskResponse # pragma: no cover 7 | from brickflow.engine.workflow import Workflow # pragma: no cover 8 | 9 | BRICKFLOW_TASK_PLUGINS = "brickflow_task_plugins" 10 | 11 | brickflow_plugin_spec = pluggy.HookspecMarker(BRICKFLOW_TASK_PLUGINS) 12 | 13 | 14 | class BrickflowTaskPluginSpec: 15 | @staticmethod 16 | def handle_user_result_errors(resp: "TaskResponse") -> None: 17 | """Custom execute method that is able to be plugged in.""" 18 | if resp.user_code_error is not None: 19 | original_message = str(resp.user_code_error) 20 | additional_info = ( 21 | "BRICKFLOW_USER_OR_DBR_ERROR: This is an error thrown in user code. 
\n" 22 | f"BRICKFLOW_INPUT_ARGS: {resp.input_kwargs}\n" 23 | "Original Exception Message: " 24 | ) 25 | new_message = additional_info + original_message 26 | resp.user_code_error.args = (new_message,) 27 | raise resp.user_code_error 28 | 29 | @staticmethod 30 | @brickflow_plugin_spec(firstresult=True) 31 | def task_execute(task: "Task", workflow: "Workflow") -> "TaskResponse": 32 | """Custom execute method that is able to be plugged in.""" 33 | raise NotImplementedError("task_execute must be implemented by a plugin") 34 | 35 | @staticmethod 36 | @brickflow_plugin_spec(firstresult=True) 37 | def handle_results( 38 | resp: "TaskResponse", task: "Task", workflow: "Workflow" 39 | ) -> "TaskResponse": 40 | """Custom execute method that is able to be plugged in.""" 41 | raise NotImplementedError("handle_results must be implemented by a plugin") 42 | -------------------------------------------------------------------------------- /brickflow/engine/utils.py: -------------------------------------------------------------------------------- 1 | import functools 2 | from typing import Callable, Type, List, Iterator, Union 3 | import pathlib 4 | import os 5 | 6 | from pydantic import SecretStr 7 | from databricks.sdk import WorkspaceClient 8 | 9 | from brickflow.context import ctx 10 | from brickflow.hints import propagate_hint 11 | 12 | 13 | @propagate_hint 14 | def wraps_keyerror(error_class: Type[Exception], msg: str) -> Callable: 15 | def wrapper(f: Callable) -> Callable: 16 | @functools.wraps(f) 17 | def func(*args, **kwargs): # type: ignore 18 | try: 19 | return f(*args, **kwargs) 20 | except KeyError as e: 21 | raise error_class( 22 | f"{msg}; err: {str(e)}; args: {args}; kwargs: {kwargs}" 23 | ) 24 | 25 | return func 26 | 27 | return wrapper 28 | 29 | 30 | def get_properties(some_obj: Type) -> List[str]: 31 | def _property_iter() -> Iterator[str]: 32 | for k, v in some_obj.__dict__.items(): 33 | if isinstance(v, property): 34 | yield k 35 | 36 | return list(_property_iter()) 37 | 38 | 39 | def get_job_id( 40 | job_name: str, host: Union[str, None] = None, token: Union[str, SecretStr] = None 41 | ) -> Union[float, None]: 42 | """ 43 | Get the job id from the specified Databricks workspace for a given job name. 
44 | 45 | Parameters 46 | ---------- 47 | job_name: str 48 | Job name (case-insensitive) 49 | host: str 50 | Databricks workspace URL 51 | token: str 52 | Databricks API token 53 | 54 | Returns 55 | ------- 56 | str 57 | Databricks job id 58 | """ 59 | ctx.log.info("Searching job id for job name: %s", job_name) 60 | 61 | if host: 62 | host = host.rstrip("/") 63 | token = token.get_secret_value() if isinstance(token, SecretStr) else token 64 | 65 | workspace_obj = WorkspaceClient(host=host, token=token) 66 | jobs_list = workspace_obj.jobs.list(name=job_name) 67 | 68 | try: 69 | for job in jobs_list: 70 | ctx.log.info("Job id for job '%s' is %s", job_name, job.job_id) 71 | return job.job_id 72 | else: # pylint: disable=useless-else-on-loop 73 | raise ValueError 74 | except ValueError: 75 | raise ValueError(f"No job found with name {job_name}") 76 | except Exception as e: 77 | ctx.log.info("An error occurred: %s", e) 78 | 79 | return None 80 | 81 | 82 | def get_bf_project_root() -> pathlib.Path: 83 | """Returns the root directory of the current Brickflow project 84 | 85 | Parameters: 86 | _file (str): file path where the function is called 87 | 88 | Returns: 89 | pathlib.Path: Brickflow project root directory 90 | """ 91 | try: 92 | _file_name = os.getcwd() 93 | _project_root = pathlib.Path(_file_name).resolve().parents[0] 94 | ctx.log.info("Setting Brickflow project root as %s", _project_root) 95 | return _project_root 96 | except Exception as e: 97 | ctx.log.info("An error occurred: %s", e) 98 | raise e 99 | -------------------------------------------------------------------------------- /brickflow/hints/__init__.py: -------------------------------------------------------------------------------- 1 | from brickflow.hints.hint import propagate_hint 2 | 3 | __all__ = ["propagate_hint"] 4 | -------------------------------------------------------------------------------- /brickflow/hints/hint.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | 4 | # propagate type hints for decorated functions 5 | def propagate_hint(decorator: Callable) -> Callable: 6 | return decorator 7 | -------------------------------------------------------------------------------- /brickflow/hints/py.typed: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | def propagate_hint(decorator: Callable) -> Callable: ... 
-------------------------------------------------------------------------------- /brickflow/resolver/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import inspect 4 | import os 5 | import sys 6 | from pathlib import Path 7 | from typing import Union, Any, List, Optional 8 | import pathlib 9 | 10 | from brickflow import BrickflowProjectConstants, _ilog, ctx 11 | 12 | 13 | def add_to_sys_path(directory: Union[str, pathlib.Path]) -> None: 14 | dir_str = str(directory) 15 | if dir_str not in sys.path and os.path.isdir(dir_str): 16 | sys.path.append(dir_str) 17 | 18 | 19 | def get_caller_file_paths() -> List[str]: 20 | caller_file_paths = [] 21 | frames = inspect.stack()[1:] # Exclude the current frame 22 | 23 | for frame in frames: 24 | caller_file_paths.append(frame.filename) 25 | 26 | return list(set(caller_file_paths)) 27 | 28 | 29 | class BrickflowRootNotFound(Exception): 30 | pass 31 | 32 | 33 | def go_up_till_brickflow_root(cur_path: str) -> str: 34 | if cur_path.startswith("<"): 35 | raise BrickflowRootNotFound("Invalid brickflow root.") 36 | 37 | path = pathlib.Path(cur_path).resolve() 38 | 39 | valid_roots = [ 40 | f"{BrickflowProjectConstants.DEFAULT_MULTI_PROJECT_ROOT_FILE_NAME.value}." 41 | f"{BrickflowProjectConstants.DEFAULT_CONFIG_FILE_TYPE.value}", 42 | f"{BrickflowProjectConstants.DEFAULT_MULTI_PROJECT_CONFIG_FILE_NAME.value}." 43 | f"{BrickflowProjectConstants.DEFAULT_CONFIG_FILE_TYPE.value}", 44 | ] 45 | 46 | # recurse to see if there is a brickflow root and return the path 47 | while not path.is_dir() or not any( 48 | file.name in valid_roots for file in path.iterdir() 49 | ): 50 | path = path.parent 51 | 52 | if path == path.parent: 53 | raise BrickflowRootNotFound( 54 | "Brickflow root directory not found in path hierarchy." 
55 | ) 56 | 57 | return str(path.resolve()) 58 | 59 | 60 | def get_relative_path_to_brickflow_root() -> None: 61 | paths = get_caller_file_paths() 62 | _ilog.info("Brickflow setting up python path resolution...") 63 | # if inside notebook also get that path 64 | notebook_path = get_notebook_ws_path(ctx.dbutils) 65 | if notebook_path is not None: 66 | paths.append(notebook_path) 67 | 68 | for path in paths: 69 | try: 70 | resolved_path = go_up_till_brickflow_root(path) 71 | _ilog.info("Brickflow root input path - %s", path) 72 | _ilog.info("Brickflow root found - %s", resolved_path) 73 | add_to_sys_path(resolved_path) 74 | _ilog.info("Sys path set to: %s", str(sys.path)) 75 | except BrickflowRootNotFound: 76 | _ilog.info("Unable to find for path: %s", path) 77 | except PermissionError: 78 | _ilog.info("Most likely not accessible due to shared cluster: %s", path) 79 | 80 | 81 | def get_notebook_ws_path(dbutils: Optional[Any]) -> Optional[str]: 82 | if dbutils is not None: 83 | return str( 84 | "/Workspace" 85 | / Path( 86 | dbutils.notebook.entry_point.getDbutils() 87 | .notebook() 88 | .getContext() 89 | .notebookPath() 90 | .get() 91 | .lstrip("/") 92 | ) 93 | ) 94 | return None 95 | -------------------------------------------------------------------------------- /brickflow_plugins/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Optional 3 | 4 | import pluggy 5 | 6 | from brickflow import get_default_log_handler 7 | 8 | 9 | def setup_logger(): 10 | _log = logging.getLogger(__name__) # Logger 11 | _log.setLevel(logging.INFO) 12 | logger_handler = get_default_log_handler("brickflow-plugins") 13 | _log.addHandler(logger_handler) 14 | _log.propagate = False 15 | return _log 16 | 17 | 18 | log = setup_logger() 19 | 20 | from brickflow_plugins.airflow.operators.external_tasks import ( 21 | TaskDependencySensor, 22 | AutosysSensor, 23 | AirflowProxyOktaClusterAuth, 24 | ) 25 | from brickflow_plugins.airflow.operators.external_tasks_tableau import ( 26 | TableauRefreshDataSourceOperator, 27 | TableauRefreshWorkBookOperator, 28 | ) 29 | from brickflow_plugins.airflow.operators.native_operators import ( 30 | BashOperator, 31 | BranchPythonOperator, 32 | ShortCircuitOperator, 33 | ) 34 | from brickflow_plugins.databricks.workflow_dependency_sensor import ( 35 | WorkflowDependencySensor, 36 | WorkflowTaskDependencySensor, 37 | ) 38 | from brickflow_plugins.databricks.uc_to_snowflake_operator import ( 39 | SnowflakeOperator, 40 | UcToSnowflakeOperator, 41 | ) 42 | from brickflow_plugins.databricks.box_operator import ( 43 | BoxToVolumesOperator, 44 | VolumesToBoxOperator, 45 | BoxOperator, 46 | ) 47 | from brickflow_plugins.databricks.sla_sensor import SLASensor 48 | 49 | 50 | def load_plugins(cache_bust: Optional[pluggy.PluginManager] = None) -> None: 51 | from brickflow.engine.task import get_plugin_manager 52 | from brickflow_plugins.airflow.brickflow_task_plugin import ( 53 | AirflowOperatorBrickflowTaskPluginImpl, 54 | ) 55 | 56 | if cache_bust is not None: 57 | cache_bust.register( 58 | AirflowOperatorBrickflowTaskPluginImpl(), name="airflow-plugin" 59 | ) 60 | return 61 | 62 | get_plugin_manager().register(AirflowOperatorBrickflowTaskPluginImpl()) 63 | 64 | 65 | def ensure_installation(): 66 | """Ensures that the brickflow_plugins package is installed in the current environment.""" 67 | from brickflow_plugins.airflow.cronhelper import cron_helper # noqa 68 | import airflow # noqa 69 | 70 | 71 | __all__: 
List[str] = [ 72 | "TaskDependencySensor", 73 | "AutosysSensor", 74 | "AirflowProxyOktaClusterAuth", 75 | "BashOperator", 76 | "BranchPythonOperator", 77 | "ShortCircuitOperator", 78 | "WorkflowDependencySensor", 79 | "WorkflowTaskDependencySensor", 80 | "SnowflakeOperator", 81 | "UcToSnowflakeOperator", 82 | "TableauRefreshDataSourceOperator", 83 | "TableauRefreshWorkBookOperator", 84 | "BoxToVolumesOperator", 85 | "VolumesToBoxOperator", 86 | "BoxOperator", 87 | "SLASensor", 88 | "load_plugins", 89 | "ensure_installation", 90 | ] 91 | -------------------------------------------------------------------------------- /brickflow_plugins/airflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/brickflow_plugins/airflow/__init__.py -------------------------------------------------------------------------------- /brickflow_plugins/airflow/brickflow_task_plugin.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import datetime 3 | import pendulum 4 | 5 | try: 6 | from airflow import macros 7 | from airflow.models import BaseOperator 8 | from airflow.utils.context import Context 9 | except ImportError: 10 | raise ImportError( 11 | "You must install airflow to use airflow plugins, " 12 | "please try pip install brickflow[apache-airflow]" 13 | ) 14 | 15 | from jinja2 import Environment 16 | from brickflow.context import ctx 17 | from brickflow.engine.hooks import BrickflowTaskPluginSpec 18 | from brickflow.engine.task import brickflow_task_plugin_impl, Task, TaskResponse 19 | from brickflow.engine.workflow import Workflow 20 | 21 | from brickflow_plugins import log 22 | from brickflow_plugins.airflow.context import get_task_context 23 | from brickflow_plugins.airflow.operators import get_modifier_chain 24 | from brickflow_plugins.secrets import BrickflowSecretsBackend 25 | 26 | 27 | def epoch_to_pendulum_datetime(epoch_str: Optional[str]): 28 | if epoch_str is None: 29 | return None 30 | return pendulum.instance(datetime.datetime.fromtimestamp(int(epoch_str) / 1000)) 31 | 32 | 33 | class AirflowOperatorBrickflowTaskPluginImpl(BrickflowTaskPluginSpec): 34 | @staticmethod 35 | @brickflow_task_plugin_impl(tryfirst=True) 36 | def handle_results( 37 | resp: "TaskResponse", task: "Task", workflow: "Workflow" 38 | ) -> "TaskResponse": 39 | log.info( 40 | "using AirflowOperatorBrickflowTaskPlugin for handling results for task: %s", 41 | task.task_id, 42 | ) 43 | 44 | BrickflowTaskPluginSpec.handle_user_result_errors(resp) 45 | 46 | _operator = resp.response 47 | 48 | if not isinstance(_operator, BaseOperator): 49 | return resp 50 | 51 | operator_modifier_chain = get_modifier_chain() 52 | # modify any functionality of operators and then 53 | _operator = operator_modifier_chain.modify(_operator, task, workflow) 54 | 55 | if hasattr(_operator, "log"): 56 | # overwrite the operator logger if it has one to the brickflow logger 57 | setattr(_operator, "_log", ctx.log) 58 | 59 | context: Context = get_task_context( 60 | task.task_id, 61 | _operator, 62 | workflow.schedule_quartz_expression, 63 | epoch_to_pendulum_datetime(ctx.start_time(debug=None)), 64 | tz=workflow.timezone, 65 | ) 66 | 67 | env: Optional[Environment] = Environment() 68 | env.globals.update({"macros": macros, "ti": context}) 69 | with BrickflowSecretsBackend(): 70 | _operator.render_template_fields(context, jinja_env=env) 71 | op_resp = 
_operator.execute(context) 72 | return TaskResponse( 73 | response=op_resp, 74 | push_return_value=_operator.do_xcom_push, 75 | ) 76 | -------------------------------------------------------------------------------- /brickflow_plugins/airflow/context/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | try: 4 | from airflow.models import BaseOperator 5 | from airflow.utils.context import Context 6 | except ImportError: 7 | raise ImportError( 8 | "You must install airflow to use airflow plugins, " 9 | "please try pip install brickflow[apache-airflow]" 10 | ) 11 | 12 | from pendulum import DateTime 13 | from brickflow.context import ctx, RETURN_VALUE_KEY 14 | from brickflow_plugins.airflow.cronhelper import cron_helper 15 | from brickflow_plugins.airflow.vendor.timetable import create_timetable 16 | from brickflow_plugins.airflow.vendor.timezone import TIMEZONE 17 | 18 | 19 | class CrossDagXComsNotSupportedError(Exception): 20 | pass 21 | 22 | 23 | class XComsPullMultipleTaskIdsError(Exception): 24 | pass 25 | 26 | 27 | class FakeTaskInstance(object): 28 | def __init__( 29 | self, 30 | task_id: str, 31 | operator: BaseOperator, 32 | execution_date: str, 33 | ): 34 | self._operator = operator 35 | self._execution_date = execution_date 36 | self._task_id = task_id 37 | 38 | def xcom_push(self, key, value): 39 | ctx.task_coms.put(task_id=self._task_id, key=key, value=value) 40 | 41 | def xcom_pull(self, task_ids, key=RETURN_VALUE_KEY, dag_id=None): 42 | if dag_id is not None: 43 | raise CrossDagXComsNotSupportedError( 44 | "Cross dag xcoms not supported in framework raise feature request." 45 | ) 46 | if isinstance(task_ids, list) and len(task_ids) > 1: 47 | raise XComsPullMultipleTaskIdsError( 48 | "Currently xcoms pull only supports one task_id please raise feature " 49 | "request." 
50 | ) 51 | task_id = task_ids[0] if isinstance(task_ids, list) else task_ids 52 | return ctx.task_coms.get(task_id, key) 53 | 54 | @property 55 | def execution_date(self): 56 | return self._execution_date 57 | 58 | @property 59 | def operator(self): 60 | return self._operator 61 | 62 | 63 | def execution_timestamp( 64 | quartz_cron_statement: Optional[str] = None, 65 | ts: Optional[DateTime] = None, 66 | tz=TIMEZONE, 67 | ) -> DateTime: 68 | if quartz_cron_statement is None: 69 | return DateTime.utcnow() 70 | if ts is None: 71 | ts = DateTime.utcnow() 72 | cron = cron_helper.quartz_to_unix(quartz_cron_statement) 73 | tt = create_timetable(cron, tz) 74 | return tt.align_to_prev(ts) 75 | 76 | 77 | def get_task_context( 78 | task_id, operator: BaseOperator, quartz_cron_statement, ts, tz=TIMEZONE 79 | ) -> Context: 80 | execution_ts = execution_timestamp(quartz_cron_statement, ts, tz) 81 | return Context( 82 | **{ 83 | "execution_date": str(execution_ts), 84 | "ds": execution_ts.strftime("%Y-%m-%d"), 85 | "ds_nodash": execution_ts.strftime("%Y%m%d"), 86 | "ts": str(execution_ts), 87 | "ts_nodash": execution_ts.strftime("%Y%m%d%H%M%S"), 88 | "ti": FakeTaskInstance(task_id, operator, str(execution_ts)), 89 | } 90 | ) 91 | -------------------------------------------------------------------------------- /brickflow_plugins/airflow/cronhelper.py: -------------------------------------------------------------------------------- 1 | import re 2 | import functools 3 | 4 | from brickflow_plugins import log 5 | 6 | 7 | class CronHelper: 8 | EVERY_X_UNITS_REPLACE_PLACEHOLDER = "%s" 9 | QUARTZ_EVERY_X_UNITS_REGEX = re.compile(r"^0/(\d+)$") # For handling 0/5 units 10 | UNIX_EVERY_X_UNITS_REGEX = re.compile(r"^\*/(\d+)$") # For handling */5 units 11 | QUARTZ_EVERY_X_UNITS_REPLACE_PATTERN = f"0/{EVERY_X_UNITS_REPLACE_PLACEHOLDER}" 12 | UNIX_EVERY_X_UNITS_REPLACE_PATTERN = f"*/{EVERY_X_UNITS_REPLACE_PLACEHOLDER}" 13 | 14 | @staticmethod 15 | def __get_expression_parts(expression: str) -> list: 16 | parts = [part.strip() for part in expression.split(" ")] 17 | 18 | # Unix cron expression have 5 parts, Quartz cron expression have 6 or 7 parts 19 | if len(parts) in [5, 7]: 20 | return parts 21 | # Year is an optional part in Quartz cron expression, adding the extra element to mimic 7 part Quartz expression 22 | if len(parts) == 6: 23 | parts.append("*") 24 | return parts 25 | 26 | raise ValueError("Invalid cron expression!") 27 | 28 | @staticmethod 29 | def convert_interval_parts(part: str, is_quartz: bool = False) -> str: 30 | every_x_units_pattern = ( 31 | CronHelper.QUARTZ_EVERY_X_UNITS_REGEX 32 | if is_quartz 33 | else CronHelper.UNIX_EVERY_X_UNITS_REGEX 34 | ) 35 | matches = every_x_units_pattern.match(part) 36 | every_x_units_replace_pattern = ( 37 | CronHelper.QUARTZ_EVERY_X_UNITS_REPLACE_PATTERN 38 | if is_quartz 39 | else CronHelper.UNIX_EVERY_X_UNITS_REPLACE_PATTERN 40 | ) 41 | 42 | if matches: 43 | return every_x_units_replace_pattern.replace( 44 | CronHelper.EVERY_X_UNITS_REPLACE_PLACEHOLDER, matches.group(1) 45 | ) 46 | 47 | return part 48 | 49 | @functools.lru_cache(maxsize=128) # cron expression conversion will not change 50 | def unix_to_quartz(self, unix_cron: str) -> str: 51 | parts = self.__get_expression_parts(expression=unix_cron) 52 | 53 | if len(parts) != 5: 54 | raise ValueError("Invalid Unix cron expression") 55 | 56 | minute, hour, dom, month, dow = map(self.convert_interval_parts, parts) 57 | 58 | # Converting Unix DOW to Quartz DOW 59 | def shift_days(day: str) -> str: 60 | """ 61 | 
Quartz DOW starts from 1 (Sunday) while Unix DOW starts from 0 (Sunday) 62 | """ 63 | if "-" in day: 64 | return "-".join([shift_days(day=d) for d in day.split("-")]) 65 | 66 | # Unix cron Sunday can be represented as 0 or 7, but only as 1 in Quartz cron 67 | if day in ["0", "7"]: 68 | return "1" 69 | if day in ["SUN", "MON", "TUE", "WED", "THU", "FRI", "SAT"]: 70 | return day 71 | return str(int(day) + 1) 72 | 73 | if "," in dow: 74 | quartz_dow = ",".join([shift_days(day=day) for day in dow.split(",")]) 75 | elif dow == "*": 76 | quartz_dow = dow 77 | else: 78 | quartz_dow = shift_days(day=dow) 79 | 80 | quartz_dom = dom 81 | 82 | if dom != "*" and dow == "*": 83 | quartz_dow = "?" 84 | elif dom == "*": 85 | quartz_dom = "?" 86 | 87 | quartz_cron = f"0 {minute} {hour} {quartz_dom} {month} {quartz_dow} *" 88 | log.info("Converted unix cron %s to quartz cron %s", unix_cron, quartz_cron) 89 | return quartz_cron 90 | 91 | @functools.lru_cache(maxsize=128) # cron expression conversion will not change 92 | def quartz_to_unix(self, quartz_cron: str) -> str: 93 | parts = self.__get_expression_parts(expression=quartz_cron) 94 | 95 | if len(parts) != 7: 96 | raise ValueError("Invalid Quartz cron expression") 97 | 98 | if "L" in quartz_cron or "W" in quartz_cron or "#" in quartz_cron: 99 | raise ValueError("Support for 'L, W, #' in Quartz cron is not implemented") 100 | 101 | # Unix cron expression does not support '?' 102 | parts = [part.replace("?", "*") for part in parts] 103 | 104 | _, minute, hour, dom, month, dow, _ = map( 105 | lambda part: self.convert_interval_parts(part, True), parts 106 | ) 107 | 108 | # Converting Quartz DOW to Unix DOW 109 | def shift_days(day: str) -> str: 110 | """ 111 | Quartz DOW starts from 1 (Sunday) while Unix DOW starts from 0 (Sunday) 112 | """ 113 | if "-" in day: 114 | return "-".join([shift_days(day=d) for d in day.split("-")]) 115 | if day in ["SUN", "MON", "TUE", "WED", "THU", "FRI", "SAT"]: 116 | return day 117 | 118 | return str(int(day) - 1) 119 | 120 | if "," in dow: 121 | unix_dow = ",".join([shift_days(day=day) for day in dow.split(",")]) 122 | elif dow == "*": 123 | unix_dow = "*" 124 | else: 125 | unix_dow = shift_days(day=dow) 126 | 127 | unix_dom = dom 128 | 129 | unix_cron = f"{minute} {hour} {unix_dom} {month} {unix_dow}" 130 | log.info("Converted quartz cron %s to unix cron %s", quartz_cron, unix_cron) 131 | return unix_cron 132 | 133 | 134 | cron_helper = CronHelper() 135 | -------------------------------------------------------------------------------- /brickflow_plugins/airflow/operators/__init__.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import os 3 | from abc import abstractmethod, ABCMeta 4 | from typing import Optional 5 | 6 | try: 7 | from airflow.models import BaseOperator, Pool 8 | from airflow.utils.weight_rule import WeightRule 9 | except ImportError: 10 | raise ImportError( 11 | "You must install airflow to use airflow plugins, " 12 | "please try pip install brickflow[apache-airflow]" 13 | ) 14 | 15 | from brickflow.engine.task import Task 16 | from brickflow.engine.workflow import Workflow 17 | 18 | 19 | class AirflowTaskDoesNotExistError(Exception): 20 | pass 21 | 22 | 23 | class UnsupportedAirflowTaskFieldError(Exception): 24 | pass 25 | 26 | 27 | class UnsupportedAirflowOperatorError(Exception): 28 | pass 29 | 30 | 31 | class AbstractOperatorModifier(metaclass=ABCMeta): 32 | @abstractmethod 33 | def set_next( 34 | self, op_handler: "AbstractOperatorModifier" 
35 | ) -> "AbstractOperatorModifier": 36 | pass 37 | 38 | @abstractmethod 39 | def modify( 40 | self, operator: BaseOperator, task: Task, workflow: Workflow 41 | ) -> "BaseOperator": 42 | pass 43 | 44 | 45 | class OperatorModifier(AbstractOperatorModifier): 46 | def __init__(self): 47 | self._next_handler: Optional[AbstractOperatorModifier] = None 48 | 49 | def set_next( 50 | self, op_handler: "AbstractOperatorModifier" 51 | ) -> "AbstractOperatorModifier": 52 | self._next_handler = op_handler 53 | return op_handler 54 | 55 | @abstractmethod 56 | def modify( 57 | self, operator: BaseOperator, task: Task, workflow: Workflow 58 | ) -> Optional["BaseOperator"]: 59 | if self._next_handler is not None: 60 | return self._next_handler.modify(operator, task, workflow) 61 | 62 | return None 63 | 64 | 65 | class InvalidFieldChecker(OperatorModifier): 66 | UNSUPPORTED_TASK_NONE_FIELDS = { 67 | "email_on_retry": True, 68 | "email_on_failure": True, 69 | "sla": None, 70 | "execution_timeout": None, 71 | "on_failure_callback": None, 72 | "on_success_callback": None, 73 | "on_retry_callback": None, 74 | "inlets": [], 75 | "outlets": [], 76 | "task_concurrency": None, 77 | "max_active_tis_per_dag": None, 78 | "run_as_user": None, 79 | "depends_on_past": False, 80 | "wait_for_downstream": False, 81 | "max_retry_delay": None, 82 | "priority_weight": 1, 83 | "weight_rule": WeightRule.DOWNSTREAM, 84 | "pool": Pool.DEFAULT_POOL_NAME, 85 | "pool_slots": 1, 86 | "resources": None, 87 | "executor_config": {}, 88 | "email": None, 89 | } 90 | 91 | def _validate_task_fields(self, operator: BaseOperator, task: Task) -> None: 92 | unsupported_fields = [] 93 | for field, default_value in self.UNSUPPORTED_TASK_NONE_FIELDS.items(): 94 | if hasattr(operator, field) is False: 95 | continue 96 | value = getattr(operator, field) 97 | if value != default_value: 98 | unsupported_fields.append(field) 99 | if unsupported_fields: 100 | raise UnsupportedAirflowTaskFieldError( 101 | f"Unsupported fields: {unsupported_fields} for task: {task.task_id}" 102 | ) 103 | 104 | def modify( 105 | self, operator: BaseOperator, task: Task, workflow: Workflow 106 | ) -> Optional["BaseOperator"]: 107 | if isinstance(operator, BaseOperator): 108 | self._validate_task_fields(operator, task) 109 | return super().modify(operator, task, workflow) 110 | 111 | 112 | class CatchAllOperatorModifier(OperatorModifier): 113 | SUPPORTED_OPERATORS = [ 114 | "BranchPythonOperator", 115 | "PythonOperator", 116 | "BashOperator", 117 | "ShortCircuitOperator", 118 | "TaskDependencySensor", 119 | "AutosysSensor", 120 | "TableauRefreshDataSourceOperator", 121 | "TableauRefreshWorkBookOperator", 122 | ] 123 | 124 | def _validate_operators(self, operator: BaseOperator, task: Task) -> None: 125 | if ( 126 | issubclass(operator.__class__, BaseOperator) 127 | and operator.__class__.__name__ in self.SUPPORTED_OPERATORS 128 | ): 129 | return 130 | raise UnsupportedAirflowOperatorError( 131 | f"Unsupported airflow operator: {type(task)} for task: {task.task_id}" 132 | ) 133 | 134 | def modify( 135 | self, operator: BaseOperator, task: Task, workflow: Workflow 136 | ) -> Optional["BaseOperator"]: 137 | if isinstance(operator, BaseOperator): 138 | self._validate_operators(operator, task) 139 | return operator 140 | 141 | 142 | def get_modifier_chain(): 143 | from brickflow_plugins.airflow import operators 144 | import importlib 145 | import inspect 146 | 147 | start_chain = InvalidFieldChecker() 148 | next_node = start_chain 149 | pkg = operators 150 | file_name = 
pkg.__file__ 151 | for module in os.listdir(os.path.dirname(file_name)): 152 | # only find python files and ignore __init__.py 153 | if module == "__init__.py" or module[-3:] != ".py": 154 | continue 155 | module_name = module.replace(".py", "") 156 | # import all the modules into the mod object and not actually import them using __import__ 157 | mod = importlib.import_module(f"{pkg.__name__}.{module_name}") 158 | for obj in dir(mod): 159 | module_item = getattr(mod, obj) 160 | # if issubclass(module_item, OperatorModifier): 161 | if ( 162 | inspect.isclass(module_item) 163 | and module_item != operators.OperatorModifier 164 | and issubclass(module_item, operators.OperatorModifier) 165 | ): 166 | # print(module_item) 167 | next_node = next_node.set_next(module_item()) 168 | 169 | next_node.set_next(CatchAllOperatorModifier()) 170 | return start_chain 171 | 172 | 173 | def check_if(klass): 174 | def outer(f): 175 | @functools.wraps(f) 176 | def inner(*args, **kwargs) -> Optional["BaseOperator"]: 177 | self, operator = args[0], args[1] 178 | super_func = getattr(super(type(self), self), f.__name__) 179 | if not isinstance(operator, klass): 180 | # super function won't accept self 181 | # this is to go along the chain 182 | return super_func(*args[1:], **kwargs) 183 | return f(*args, **kwargs) 184 | 185 | return inner 186 | 187 | return outer 188 | -------------------------------------------------------------------------------- /brickflow_plugins/airflow/operators/native_operators.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | import tempfile 5 | import time 6 | import types 7 | from typing import Optional 8 | 9 | from airflow.operators.bash import BashOperator 10 | from airflow.operators.python import BranchPythonOperator, ShortCircuitOperator 11 | 12 | from brickflow.context import BRANCH_SKIP_EXCEPT, SKIP_EXCEPT_HACK 13 | from brickflow.engine.task import Task 14 | from brickflow.engine.workflow import Workflow 15 | from brickflow_plugins import log 16 | from brickflow_plugins.airflow.operators import OperatorModifier, check_if 17 | 18 | 19 | def _bash_execute(self, context): # pylint:disable=unused-argument 20 | p = None 21 | returncode = None 22 | start = time.time() 23 | env = self.env 24 | if env is None: 25 | env = os.environ.copy() 26 | 27 | # log.info("Command: %s", self.bash_command) 28 | 29 | with tempfile.TemporaryDirectory(prefix="airflowtmp") as tmp_dir: 30 | try: 31 | p = subprocess.Popen( # pylint:disable=consider-using-with 32 | self.bash_command, 33 | shell=True, 34 | cwd=tmp_dir, 35 | executable="/bin/bash", 36 | stderr=subprocess.STDOUT, 37 | stdout=subprocess.PIPE, 38 | universal_newlines=True, 39 | env=env, 40 | ) 41 | for line in iter(p.stdout.readline, ""): 42 | resp = line 43 | log.info("[STDOUT]: %s", line.rstrip()) 44 | returncode = p.wait() 45 | p = None 46 | sys.stdout.flush() 47 | if returncode != 0: 48 | raise subprocess.CalledProcessError(returncode, self.bash_command) 49 | finally: 50 | end = time.time() 51 | if p is not None: 52 | p.terminate() 53 | p.wait() 54 | log.info("Command: exited with return code %s", returncode) 55 | log.info("Command took %s seconds", end - start) 56 | 57 | if self.do_xcom_push is True: 58 | return resp[:-1] # skip newline char at end 59 | return 60 | 61 | 62 | def _bash_empty_on_kill(self): # pylint:disable=unused-argument 63 | pass 64 | 65 | 66 | def _skip_all_except( 67 | self, ti: "FakeTaskInstance", branch_task_ids 68 | ): # 
pylint:disable=unused-argument 69 | log.info("Skipping all tasks except: %s", branch_task_ids) 70 | ti.xcom_push(BRANCH_SKIP_EXCEPT, branch_task_ids) 71 | 72 | 73 | def _short_circuit_execute(self, context): 74 | condition = super(ShortCircuitOperator, self).execute(context) 75 | log.info("Condition result is %s", condition) 76 | 77 | if condition: 78 | log.info("Proceeding with downstream tasks...") 79 | return 80 | 81 | # log 82 | log.info("Skipping downstream tasks...") 83 | ti = context["ti"] 84 | ti.xcom_push(BRANCH_SKIP_EXCEPT, SKIP_EXCEPT_HACK) 85 | 86 | 87 | class BashOperatorModifier(OperatorModifier): 88 | @check_if(BashOperator) 89 | def modify( 90 | self, operator: BashOperator, task: Task, workflow: Workflow 91 | ) -> Optional["BashOperator"]: 92 | f = types.MethodType(_bash_execute, operator) 93 | operator.execute = f 94 | operator.on_kill = _bash_empty_on_kill 95 | return operator 96 | 97 | 98 | class BranchPythonOperatorModifier(OperatorModifier): 99 | @check_if(BranchPythonOperator) 100 | def modify( 101 | self, operator: BranchPythonOperator, task: Task, workflow: Workflow 102 | ) -> Optional["BranchPythonOperator"]: 103 | f = types.MethodType(_skip_all_except, operator) 104 | operator.skip_all_except = f 105 | return operator 106 | 107 | 108 | class ShortCircuitOperatorModifier(OperatorModifier): 109 | @check_if(ShortCircuitOperator) 110 | def modify( 111 | self, operator: ShortCircuitOperator, task: Task, workflow: Workflow 112 | ) -> Optional["ShortCircuitOperator"]: 113 | f = types.MethodType(_short_circuit_execute, operator) 114 | operator.execute = f 115 | return operator 116 | -------------------------------------------------------------------------------- /brickflow_plugins/airflow/vendor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/brickflow_plugins/airflow/vendor/__init__.py -------------------------------------------------------------------------------- /brickflow_plugins/airflow/vendor/context.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import contextlib 4 | import copy 5 | from typing import MutableMapping, Any, Iterator, KeysView, ItemsView, ValuesView 6 | 7 | 8 | class Context(MutableMapping[str, Any]): 9 | """Jinja2 template context for task rendering. 10 | 11 | This is a mapping (dict-like) class that can lazily emit warnings when 12 | (and only when) deprecated context keys are accessed. 
13 | """ 14 | 15 | _DEPRECATION_REPLACEMENTS: dict[str, list[str]] = { 16 | "execution_date": ["data_interval_start", "logical_date"], 17 | "next_ds": ["{{ data_interval_end | ds }}"], 18 | "next_ds_nodash": ["{{ data_interval_end | ds_nodash }}"], 19 | "next_execution_date": ["data_interval_end"], 20 | "prev_ds": [], 21 | "prev_ds_nodash": [], 22 | "prev_execution_date": [], 23 | "prev_execution_date_success": ["prev_data_interval_start_success"], 24 | "tomorrow_ds": [], 25 | "tomorrow_ds_nodash": [], 26 | "yesterday_ds": [], 27 | "yesterday_ds_nodash": [], 28 | } 29 | 30 | def __init__( 31 | self, context: MutableMapping[str, Any] | None = None, **kwargs: Any 32 | ) -> None: 33 | self._context: MutableMapping[str, Any] = context or {} 34 | if kwargs: 35 | self._context.update(kwargs) 36 | self._deprecation_replacements = self._DEPRECATION_REPLACEMENTS.copy() 37 | 38 | def __repr__(self) -> str: 39 | return repr(self._context) 40 | 41 | def __reduce_ex__(self, protocol: int) -> tuple[Any, ...]: 42 | """Pickle the context as a dict. 43 | 44 | We are intentionally going through ``__getitem__`` in this function, 45 | instead of using ``items()``, to trigger deprecation warnings. 46 | """ 47 | items = [(key, self[key]) for key in self._context] 48 | return dict, (items,) 49 | 50 | def __copy__(self) -> Context: 51 | new = type(self)(copy.copy(self._context)) 52 | new._deprecation_replacements = self._deprecation_replacements.copy() 53 | return new 54 | 55 | def __getitem__(self, key: str) -> Any: 56 | # with contextlib.suppress(KeyError): 57 | # warnings.warn(_create_deprecation_warning(key, self._deprecation_replacements[key])) 58 | with contextlib.suppress(KeyError): 59 | return self._context[key] 60 | raise KeyError(key) 61 | 62 | def __setitem__(self, key: str, value: Any) -> None: 63 | self._deprecation_replacements.pop(key, None) 64 | self._context[key] = value 65 | 66 | def __delitem__(self, key: str) -> None: 67 | self._deprecation_replacements.pop(key, None) 68 | del self._context[key] 69 | 70 | def __contains__(self, key: object) -> bool: 71 | return key in self._context 72 | 73 | def __iter__(self) -> Iterator[str]: 74 | return iter(self._context) 75 | 76 | def __len__(self) -> int: 77 | return len(self._context) 78 | 79 | def __eq__(self, other: Any) -> bool: 80 | if not isinstance(other, Context): 81 | return NotImplemented 82 | return self._context == other._context 83 | 84 | def __ne__(self, other: Any) -> bool: 85 | if not isinstance(other, Context): 86 | return NotImplemented 87 | return self._context != other._context 88 | 89 | def keys(self) -> KeysView[str]: 90 | return self._context.keys() 91 | 92 | def items(self): 93 | return ItemsView(self._context) 94 | 95 | def values(self): 96 | return ValuesView(self._context) 97 | -------------------------------------------------------------------------------- /brickflow_plugins/databricks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/brickflow_plugins/databricks/__init__.py -------------------------------------------------------------------------------- /brickflow_plugins/databricks/run_job.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from pydantic import SecretStr 3 | 4 | from databricks.sdk import WorkspaceClient 5 | from brickflow.context import ctx 6 | from brickflow.engine.utils import get_job_id 7 | 8 | 9 | class 
RunJobInRemoteWorkspace: 10 | """ 11 | Currently Databricks does not natively support running a job in a remote workspace via the RunJobTask. 12 | This plugin adds this functionality. However, it aims to be a temporary solution until Databricks adds this 13 | functionality natively. 14 | The plugin supports neither passing parameters to the remote job nor waiting for the job to finish. 15 | 16 | Examples 17 | -------- 18 | service_principle_pat = ctx.dbutils.secrets.get("scope", "service_principle_id") 19 | RunJobInRemoteWorkspace( 20 | databricks_host="https://your_workspace_url.cloud.databricks.com", 21 | databricks_token=service_principle_pat, 22 | job_name="foo", 23 | ) 24 | In the above snippet, Databricks secrets are used as a secure service to store the Databricks token. 25 | If you get your token from another secret management service, like AWS Secrets Manager, GCP Secret Manager 26 | or Azure Key Vault, just pass it in the databricks_token argument. 27 | """ 28 | 29 | def __init__( 30 | self, 31 | databricks_host: str, 32 | databricks_token: Union[str, SecretStr], 33 | job_name: str, 34 | ): 35 | self.databricks_host = databricks_host 36 | self.databricks_token = ( 37 | databricks_token 38 | if isinstance(databricks_token, SecretStr) 39 | else SecretStr(databricks_token) 40 | ) 41 | self.job_name = job_name 42 | self._workspace_obj = WorkspaceClient( 43 | host=self.databricks_host, token=self.databricks_token.get_secret_value() 44 | ) 45 | 46 | def execute(self): 47 | job_id = get_job_id( 48 | host=self.databricks_host, 49 | token=self.databricks_token, 50 | job_name=self.job_name, 51 | ) 52 | # TODO: add support for passing parameters to the remote job 53 | # TODO: wait for the job to finish 54 | run = self._workspace_obj.jobs.run_now(job_id) 55 | ctx.log.info("Job run status: %s", run.response) 56 | -------------------------------------------------------------------------------- /brickflow_plugins/secrets/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import abc 4 | import base64 5 | import functools 6 | import os 7 | from typing import Optional, Tuple, Union, List 8 | from urllib.parse import urlparse, ParseResult 9 | 10 | import pluggy 11 | 12 | try: 13 | from airflow.secrets import BaseSecretsBackend 14 | except ImportError: 15 | raise ImportError( 16 | "You must install airflow to use airflow plugins, " 17 | "please try pip install brickflow[apache-airflow]" 18 | ) 19 | 20 | from brickflow_plugins import log 21 | 22 | BRICKFLOW_SECRETS_BACKEND = "brickflow_secrets_backend" 23 | 24 | brickflow_secrets_plugin_spec = pluggy.HookspecMarker(BRICKFLOW_SECRETS_BACKEND) 25 | 26 | 27 | class BrickflowSecretPluginSpec: 28 | @staticmethod 29 | @brickflow_secrets_plugin_spec(firstresult=True) 30 | def get_secret_value(url_parsed_result: ParseResult) -> Optional["str"]: 31 | """Custom execute method that is able to be plugged in.""" 32 | 33 | 34 | @functools.lru_cache 35 | def get_brickflow_tasks_hook() -> BrickflowSecretPluginSpec: 36 | pm = pluggy.PluginManager(BRICKFLOW_SECRETS_BACKEND) 37 | pm.add_hookspecs(BrickflowSecretPluginSpec) 38 | pm.load_setuptools_entrypoints(BRICKFLOW_SECRETS_BACKEND) 39 | pm.register(CerberusBrickflowSecretPluginImpl()) 40 | pm.register(Base64BrickflowSecretPluginImpl()) 41 | for name, plugin_instance in pm.list_name_plugin(): 42 | log.info( 43 | "Loaded plugin with name: %s and class: %s", 44 | name, 45 | plugin_instance.__class__.__name__, 46 | ) 47
| return pm.hook 48 | 49 | 50 | brickflow_secrets_backend_plugin_impl = pluggy.HookimplMarker(BRICKFLOW_SECRETS_BACKEND) 51 | 52 | 53 | class AbstractSecretsHelper(abc.ABC): 54 | PROTOCOL_STARTS_WITH: Optional[Union[str, List[str]]] = None 55 | 56 | def get_secret_value_from_url(self, url_parsed_result: ParseResult): 57 | allowed_protocols = ( 58 | [self.PROTOCOL_STARTS_WITH] 59 | if isinstance(self.PROTOCOL_STARTS_WITH, str) 60 | else self.PROTOCOL_STARTS_WITH 61 | ) 62 | if self.PROTOCOL_STARTS_WITH is not None and not any( 63 | [ 64 | url_parsed_result.scheme.lower().startswith(protocol) 65 | for protocol in allowed_protocols 66 | ] 67 | ): 68 | return None 69 | return self._get_secret_value_from_url(url_parsed_result) 70 | 71 | @staticmethod 72 | @abc.abstractmethod 73 | def _get_secret_value_from_url(url_parsed_result: ParseResult) -> str: 74 | pass 75 | 76 | 77 | class B64SecretsHelper(AbstractSecretsHelper): 78 | PROTOCOL_STARTS_WITH = ["base64", "b64"] 79 | 80 | @staticmethod 81 | def _get_secret_value_from_url(url_parsed_result: ParseResult) -> str: 82 | b64data = url_parsed_result.netloc.encode("utf-8") 83 | return base64.b64decode(b64data).decode("utf-8") 84 | 85 | 86 | class CerberusSecretsHelper(AbstractSecretsHelper): 87 | PROTOCOL_STARTS_WITH = "cerberus" 88 | 89 | @staticmethod 90 | def parse_path_and_key(path: Optional[str]) -> Optional[Tuple[str, str]]: 91 | if path is not None: 92 | _cleaned_path = path.lstrip("/").rstrip("/") 93 | return "/".join(_cleaned_path.split("/")[:-1]), _cleaned_path.split("/")[-1] 94 | return None 95 | 96 | @staticmethod 97 | def _get_secret_value_from_url(url_parsed_result: ParseResult) -> str: 98 | try: 99 | from cerberus.client import CerberusClient 100 | except ImportError: 101 | raise ImportError( 102 | "You must install cerberus-client to use the cerberus secrets backend, " 103 | "please try pip install brickflow[cerberus]" 104 | ) 105 | parts = url_parsed_result.scheme.lower().split("+") 106 | protocol = "https" 107 | if len(parts) == 2: 108 | protocol = parts[1] 109 | _client = CerberusClient(f"{protocol}://{url_parsed_result.netloc}") 110 | _path, _key = CerberusSecretsHelper.parse_path_and_key(url_parsed_result.path) 111 | data = _client.get_secrets_data(_path) 112 | return data[_key] 113 | 114 | 115 | class CerberusBrickflowSecretPluginImpl(BrickflowSecretPluginSpec): 116 | @staticmethod 117 | @brickflow_secrets_backend_plugin_impl 118 | def get_secret_value(url_parsed_result: ParseResult) -> Optional["str"]: 119 | return CerberusSecretsHelper().get_secret_value_from_url(url_parsed_result) 120 | 121 | 122 | class Base64BrickflowSecretPluginImpl(BrickflowSecretPluginSpec): 123 | @staticmethod 124 | @brickflow_secrets_backend_plugin_impl 125 | def get_secret_value(url_parsed_result: ParseResult) -> Optional["str"]: 126 | return B64SecretsHelper().get_secret_value_from_url(url_parsed_result) 127 | 128 | 129 | class DatabricksSecretsBrickflowSecretPluginImpl(BrickflowSecretPluginSpec): 130 | @staticmethod 131 | @brickflow_secrets_backend_plugin_impl 132 | def get_secret_value(url_parsed_result: ParseResult) -> Optional["str"]: 133 | # not implemented yet 134 | return None 135 | 136 | 137 | class BrickflowSecretsBackend(BaseSecretsBackend): # noqa 138 | def __enter__(self): 139 | self.set_backend_env() 140 | return self 141 | 142 | def __exit__(self, exc_type, exc_val, exc_tb): 143 | self.unset_backend_env() 144 | 145 | def get_conn_value(self, conn_id: str) -> str | None: 146 | parsed_url = urlparse(conn_id) 147 | return 
get_brickflow_tasks_hook().get_secret_value(url_parsed_result=parsed_url) 148 | 149 | def _get_secrets_backend_env(self): 150 | return { 151 | "AIRFLOW__SECRETS__BACKEND": f"{self.__class__.__module__}.{self.__class__.__name__}", 152 | "AIRFLOW__SECRETS__BACKEND_KWARGS": "", 153 | } 154 | 155 | def set_backend_env(self): 156 | for k, v in self._get_secrets_backend_env().items(): 157 | os.environ[k] = v 158 | 159 | def unset_backend_env(self): 160 | for k in self._get_secrets_backend_env().keys(): 161 | os.environ.pop(k, None) 162 | -------------------------------------------------------------------------------- /docs/api/airflow_external_task_dependency.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow_plugins.airflow.operators.external_tasks 7 | handler: python 8 | options: 9 | filters: 10 | - "!^_[^_]" 11 | - "!^__[^__]" 12 | -------------------------------------------------------------------------------- /docs/api/airflow_native_operators.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow_plugins.airflow.operators.native_operators 7 | handler: python 8 | options: 9 | filters: 10 | - "!^_[^_]" 11 | - "!^__[^__]" 12 | -------------------------------------------------------------------------------- /docs/api/airflow_tableau_operators.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow_plugins.airflow.operators.external_tasks_tableau 7 | handler: python 8 | options: 9 | filters: 10 | - "!^_[^_]" 11 | - "!^__[^__]" 12 | -------------------------------------------------------------------------------- /docs/api/box_operator.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow_plugins.databricks.box_operator 7 | handler: python 8 | options: 9 | filters: 10 | - "!^_[^_]" 11 | - "!^__[^__]" 12 | -------------------------------------------------------------------------------- /docs/api/cli.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow.cli 7 | handler: python 8 | options: 9 | filters: 10 | - "!^_[^_]" 11 | 12 | -------------------------------------------------------------------------------- /docs/api/compute.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow.engine.compute 7 | handler: python 8 | options: 9 | members: 10 | - Cluster 11 | - Runtimes 12 | filters: 13 | - "!^_[^_]" 14 | 15 | -------------------------------------------------------------------------------- /docs/api/context.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow.context.context 7 | handler: python 8 | options: 9 | filters: 10 | - "!^_[^_]" 11 | - "!^__[^__]" 12 | 13 | -------------------------------------------------------------------------------- /docs/api/project.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow.engine.project 7 | handler: python 8 | options: 9 | members: 10 | - Project 11 | - BrickFlowEnvVars 12 | 
filters: 13 | - "!^_[^_]" 14 | 15 | -------------------------------------------------------------------------------- /docs/api/secrets.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow_plugins.secrets 7 | handler: python 8 | options: 9 | filters: 10 | - "!^_[^_]" 11 | - "!^__[^__]" 12 | -------------------------------------------------------------------------------- /docs/api/sla_sensor.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow_plugins.databricks.sla_sensor 7 | handler: python 8 | options: 9 | members: 10 | - SLASensor 11 | filters: 12 | - "!^_[^_]" 13 | - "!^__[^__]" 14 | -------------------------------------------------------------------------------- /docs/api/task.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow.engine.task 7 | handler: python 8 | options: 9 | members: 10 | - Task 11 | - EmailNotifications 12 | - JarTaskLibrary 13 | - EggTaskLibrary 14 | - WheelTaskLibrary 15 | - PypiTaskLibrary 16 | - MavenTaskLibrary 17 | - CranTaskLibrary 18 | - BrickflowTriggerRule 19 | - BrickflowTaskEnvVars 20 | - TaskSettings 21 | - TaskType 22 | filters: 23 | - "!^_[^_]" 24 | - "!^__[^__]" 25 | 26 | -------------------------------------------------------------------------------- /docs/api/uc_to_snowflake_operator.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow_plugins.databricks.uc_to_snowflake_operator 7 | handler: python 8 | options: 9 | filters: 10 | - "!^_[^_]" 11 | - "!^__[^__]" 12 | -------------------------------------------------------------------------------- /docs/api/workflow.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow.engine.workflow 7 | handler: python 8 | options: 9 | members: 10 | - Workflow 11 | - WorkspacePermissions 12 | - User 13 | - Group 14 | - ServicePrincipal 15 | filters: 16 | - "!^_[^_]" 17 | - "!^__[^__]" 18 | 19 | -------------------------------------------------------------------------------- /docs/api/workflow_dependency_sensor.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow_plugins.databricks.workflow_dependency_sensor 7 | handler: python 8 | options: 9 | members: 10 | - WorkflowDependencySensor 11 | - WorkflowTaskDependencySensor 12 | filters: 13 | - "!^_[^_]" 14 | - "!^__[^__]" 15 | -------------------------------------------------------------------------------- /docs/bundles-quickstart.md: -------------------------------------------------------------------------------- 1 | # BrickFlow v1.3.1 Quickstart Guide 2 | 3 | This guide will help you get started with BrickFlow v1.3.1, walking you through project setup and deployment. 4 | 5 | ## Prerequisites 6 | 7 | 1. Local environment setup: 8 | - Python >= 3.8 9 | - Databricks CLI configured with access token 10 | - BrickFlow CLI 11 | 12 | ### Installation Steps 13 | 14 | 1. Install Databricks CLI and configure it: 15 | ```bash 16 | pip install databricks-cli 17 | databricks configure -t 18 | ``` 19 | 20 | 2. Install BrickFlow CLI: 21 | ```bash 22 | pip install brickflows 23 | ``` 24 | 25 | 3. 
Verify your installation: 26 | ```bash 27 | bf --help 28 | databricks workspace list / # Add --profile if using specific profile 29 | ``` 30 | 31 | ## Creating Your First Project 32 | 33 | 1. Navigate to your repository root (where `.git` folder is located) 34 | 35 | 2. Initialize a new BrickFlow project: 36 | ```bash 37 | bf projects add 38 | ``` 39 | 40 | 3. Follow the prompts: 41 | - Project Name: Enter your desired project name 42 | - Path from repo root to project root: Press Enter for default (`.`) or specify path 43 | - Path from project root to workflows dir: Enter the directory for your workflows 44 | - Git https url: Enter your repository URL 45 | - Brickflow version: Enter `1.3.1` (or press Enter for `auto`) 46 | - Spark expectations version: Press Enter for default (`0.8.0`) 47 | - Skip entrypoint: Choose `N` unless you have a specific reason to skip 48 | 49 | 4. Update your `.gitignore` file: 50 | ``` 51 | **/bundle.yml 52 | .databricks/ 53 | ``` 54 | 55 | ## Project Structure 56 | 57 | Your project will follow either a monorepo or polyrepo style: 58 | 59 | ### Monorepo Structure Example: 60 | ``` 61 | repo-root/ 62 | ├── .git 63 | ├── projects/ 64 | │ ├── project_abc/ 65 | │ │ ├── lib/ 66 | │ │ │ ├── __init__.py 67 | │ │ │ └── shared_functions.py 68 | │ │ ├── workflows/ 69 | │ │ │ ├── __init__.py 70 | │ │ │ ├── entrypoint.py 71 | │ │ │ └── workflow_abc.py 72 | │ │ └── .brickflow-project-root.yml 73 | ``` 74 | 75 | ### Polyrepo Structure Example: 76 | ``` 77 | repo-root/ 78 | ├── .git 79 | ├── src/ 80 | │ ├── lib/ 81 | │ │ ├── __init__.py 82 | │ │ └── shared_functions.py 83 | │ ├── workflows/ 84 | │ │ ├── __init__.py 85 | │ │ ├── entrypoint.py 86 | │ │ └── workflow.py 87 | ├── .brickflow-project-root.yml 88 | ``` 89 | 90 | ## Validating Your Project 91 | 92 | 1. Synthesize your project configuration: 93 | ```bash 94 | bf projects synth --project --profile 95 | ``` 96 | 97 | 2. Verify the output shows: 98 | ``` 99 | SUCCESSFULLY SYNTHESIZED BUNDLE.YML FOR PROJECT: 100 | ``` 101 | 102 | ## Deploying Your Project 103 | 104 | ### Development Deployment 105 | ```bash 106 | bf projects deploy --project -p --force-acquire-lock 107 | ``` 108 | 109 | ### Environment-Specific Deployments 110 | ```bash 111 | # Dev environment 112 | bf projects deploy --project -p -e dev --force-acquire-lock 113 | 114 | # Test environment 115 | bf projects deploy --project -p -e test --force-acquire-lock 116 | 117 | # Production environment 118 | bf projects deploy --project -p -e prod --force-acquire-lock 119 | ``` 120 | 121 | ### Release Candidate Deployments 122 | For testing specific versions or pull requests: 123 | 124 | ```bash 125 | # Deploy RC version 126 | BRICKFLOW_WORKFLOW_SUFFIX="1.3.1-rc1" bf projects deploy --project -p -e test --force-acquire-lock 127 | 128 | # Deploy PR version 129 | BRICKFLOW_WORKFLOW_SUFFIX="1.3.1-pr34" bf projects deploy --project -p -e test --force-acquire-lock 130 | ``` 131 | 132 | ## Cleaning Up 133 | 134 | ### Destroying Deployments 135 | ```bash 136 | # Destroy main deployment 137 | bf projects destroy --project -p --force-acquire-lock 138 | 139 | # Destroy RC deployment 140 | BRICKFLOW_WORKFLOW_SUFFIX="1.3.1-rc1" bf projects destroy --project -p -e test --force-acquire-lock 141 | 142 | # Destroy PR deployment 143 | BRICKFLOW_WORKFLOW_SUFFIX="1.3.1-pr34" bf projects destroy --project -p -e test --force-acquire-lock 144 | ``` 145 | 146 | ## Troubleshooting 147 | 148 | 1. 
If synthesis fails: 149 | - Verify you're in the repository root directory 150 | - Check that all paths in configuration files are correct 151 | - Ensure all required __init__.py files exist 152 | 153 | 2. If deployment fails: 154 | - Verify Databricks CLI configuration 155 | - Check permissions in your Databricks workspace 156 | - Verify environment variables are set correctly 157 | 158 | ## Next Steps 159 | 160 | After successful deployment: 161 | 1. Monitor your workflows in the Databricks workspace 162 | 2. Set up CI/CD pipelines for automated deployments 163 | 3. Configure environment-specific variables 164 | 4. Set up monitoring and alerting -------------------------------------------------------------------------------- /docs/cli/reference.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | This page provides documentation for our command line tools. 7 | 8 | 9 | ::: mkdocs-click 10 | :module: brickflow.cli 11 | :command: cli 12 | :prog_name: bf 13 | :depth: 1 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /docs/css/custom.css: -------------------------------------------------------------------------------- 1 | .md-footer-nav { display: none; } 2 | 3 | .md-footer__inner:not([hidden]) { 4 | display: none 5 | } 6 | 7 | /* Indentation. */ 8 | div.doc-contents:not(.first) { 9 | padding-left: 25px; 10 | border-left: .05rem solid var(--md-typeset-table-color); 11 | } 12 | 13 | /* Mark external links as such. */ 14 | a.autorefs-external::after { 15 | /* https://primer.style/octicons/arrow-up-right-24 */ 16 | background-image: url('data:image/svg+xml,'); 17 | content: ' '; 18 | 19 | display: inline-block; 20 | position: relative; 21 | top: 0.1em; 22 | margin-left: 0.2em; 23 | margin-right: 0.1em; 24 | 25 | height: 1em; 26 | width: 1em; 27 | border-radius: 100%; 28 | background-color: var(--md-typeset-a-color); 29 | } 30 | a.autorefs-external:hover::after { 31 | background-color: var(--md-accent-fg-color); 32 | } -------------------------------------------------------------------------------- /docs/highlevel.md: -------------------------------------------------------------------------------- 1 | ## Brickflow Overview 2 | 3 | The objective of Brickflow is to provide a thin layer on top of databricks workflows to help deploy 4 | and manage workflows in Databricks. It also provides plugins/extras to be able to run airflow 5 | operators directly in the workflows. 6 | 7 | ## Brickflow to Airflow Term Mapping 8 | 9 | | Object | Airflow | Brickflow | 10 | |-------------------------------------------|-----------------------------------|---------------------------------------------------| 11 | | Collection of Workflows | Airflow Cluster (Airflow Dag Bag) | Project/Entrypoint | 12 | | Workflow | Airflow Dag | Workflow | 13 | | Task | Airflow Operator | Task | 14 | | Schedule | Unix Cron | Quartz Cron | 15 | | Inter Task Communication | XComs | Task Values | 16 | | Managing Connections to External Services | Airflow Connections | Mocked Airflow connections or Databricks Secrets | 17 | | Variables to Tasks | Variables | Task Parameters [ctx.get_parameter(key, default)] | 18 | | Context values (execution_date, etc.) | Airflow Macros, context["ti"] | ctx. 
| 19 | -------------------------------------------------------------------------------- /docs/how-imports-work.md: -------------------------------------------------------------------------------- 1 | ### How do imports work? 2 | 3 | !!! warning 4 | 5 | **It is very important to understand how imports work for mono repos. Please read this carefully; otherwise you might run into issues during deployments.** 6 | 7 | When using brickflow projects, every project will have a `.brickflow-project-root.yml` file. When you import brickflow, 8 | which you will 9 | in your entrypoint or workflows, brickflow will inspect the paths of all stack frames during the import and recursively go 10 | up each path until it finds the `.brickflow-project-root.yml` file. 11 | The directory containing the first instance of `.brickflow-project-root.yml` will be added to `sys.path` to help with module imports. 12 | 13 | Let us take a quick example of how to get imports to work properly! 14 | 15 | Let us say you have a project structure like this: 16 | 17 | ``` 18 | repo-root/ 19 | ├── .git 20 | ├── projects/ 21 | │ ├── project_abc/ 22 | │ │ ├── lib/ 23 | │ │ │ ├── __init__.py 24 | │ │ │ └── shared_functions.py 25 | │ │ ├── workflows/ 26 | │ │ │ ├── __init__.py 27 | │ │ │ ├── entrypoint.py 28 | │ │ │ └── workflow_abc.py 29 | │ │ ├── setup.py 30 | │ │ └── .brickflow-project-root.yml 31 | │ └── project_xyz/ 32 | │ ├── workflows_geo_b/ 33 | │ │ ├── entrypoint.py 34 | │ │ └── workflow_xyz.py 35 | │ ├── workflows_geo_a/ 36 | │ │ ├── entrypoint.py 37 | │ │ └── workflow_xyz.py 38 | │ └── .brickflow-project-root.yml 39 | ├── .gitignore 40 | ├── brickflow-multi-project.yml 41 | └── README.md 42 | ``` 43 | 44 | Let us say you want to import from `lib` inside `workflow_abc.py`; you need to: 45 | 46 | ```python 47 | from lib import shared_functions 48 | 49 | shared_functions.some_function(....) 50 | ``` 51 | 52 | Since, in this project structure, `.brickflow-project-root.yml` is at `repo-root/projects/project_abc`, everything 53 | in that `project_abc` folder is 54 | added to `sys.path` in Python, so you can import any of the folders under it.
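
To sanity-check this at runtime, you can inspect `sys.path` right after importing brickflow. The snippet below is only an illustrative check (it is not part of the brickflow API); the `project_abc` path and `shared_functions` module refer to the example layout above:

```python
import sys

import brickflow  # importing brickflow triggers the root resolution described above

# The directory containing .brickflow-project-root.yml (projects/project_abc in
# the example layout) should now be present on sys.path:
print([p for p in sys.path if p.endswith("project_abc")])

from lib import shared_functions  # resolves against the project root
```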
-------------------------------------------------------------------------------- /docs/img/bf_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/docs/img/bf_logo.png -------------------------------------------------------------------------------- /docs/img/bf_logo_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/docs/img/bf_logo_1.png -------------------------------------------------------------------------------- /docs/img/maintainance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/docs/img/maintainance.png -------------------------------------------------------------------------------- /docs/img/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/docs/img/workflow.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - navigation 4 | --- 5 | 6 | # BrickFlow 7 | 8 | BrickFlow is a CLI tool for the development and deployment of Python-based Databricks workflows in a declarative way. 9 | 10 | ## Concept 11 | 12 | `brickflow` aims to improve the development experience for building pipelines on Databricks by: 13 | 14 | - Providing a declarative way to describe workflows via decorators 15 | - Providing intelligent defaults for compute targets 16 | - Providing a code- and git-first approach to managing and deploying workflows 17 | - Using Databricks Asset Bundles to deploy workflows seamlessly; bundles are powered by Terraform, which helps manage state 18 | across deployments. 19 | - Offering a CLI tool that helps facilitate setting up projects 20 | - Providing additional functionality through the context library for workflows. 21 | 22 | 23 | ## Feedback 24 | 25 | Issues with `brickflow`? Found a :octicons-bug-24: bug? 26 | Have a great idea for an addition? Want to improve the documentation? Please feel 27 | free to file an [issue](https://github.com/Nike-Inc/brickflow/issues/new/choose). 28 | 29 | ## Contributing 30 | 31 | To contribute, please fork and create a pull request. Here is 32 | a [guide](https://github.com/Nike-Inc/brickflow/blob/main/CONTRIBUTING.md) to help you through this process. -------------------------------------------------------------------------------- /docs/projects.md: -------------------------------------------------------------------------------- 1 | A project is similar to an Airflow cluster: it can be composed of various different workflows or DAGs. 2 | 3 | 4 | Here is an example of an entrypoint. 5 | Click the plus buttons to understand all the parts of the entrypoint file. 6 | 7 | ```python title="entrypoint.py" 8 | # Databricks notebook source (1) 9 | 10 | import examples.brickflow_examples.workflows 11 | 12 | from brickflow import Project, PypiTaskLibrary, MavenTaskLibrary 13 | 14 | 15 | def main() -> None: 16 | """Project entrypoint""" 17 | with Project( 18 | "brickflow-demo", # (3)! 19 | git_repo="https://github.com/nike-inc/brickflow", # (4)! 20 | provider="github", # (5)!
21 | libraries=[ # (6)! 22 | PypiTaskLibrary(package="networkx"), 23 | ], 24 | ) as f: 25 | f.add_pkg(examples.brickflow_examples.workflows) # (7)! 26 | 27 | 28 | if __name__ == "__main__": # (2)! 29 | main() 30 | ``` 31 | 32 | 33 | 1. Uploading this Python file into Databricks with this comment on the first line makes Databricks treat the Python file 34 | as a notebook. 35 | 2. This makes sure main() only runs when this file is executed via `python entrypoint.py` 36 | 3. This is the project name you provided when you ran `bf projects add` 37 | 4. This is the git repo that is introspected when running `bf projects add` 38 | 5. This is the git provider that you decide on, e.g. github. 39 | 6. You can provide a list of packages that need to be installed on all of your clusters when running ETL. 40 | 7. You can add multiple packages from your project in which workflows are defined. -------------------------------------------------------------------------------- /docs/upgrades/upgrade-pre-0-10-0-to-0-10-0.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | boost: 2 4 | --- 5 | 6 | ## Upgrade checklist 7 | 8 | * [x] The package has been renamed from `brickflow` to `brickflows`. Please run: 9 | 10 | ``` 11 | pip uninstall brickflow 12 | ``` 13 | 14 | and then 15 | 16 | ``` 17 | pip install brickflows>=0.10.0 18 | bf --version 19 | ``` 20 | 21 | * [x] If you are upgrading from a CDKTF version of brickflow, do not worry: your existing workflows will be imported as long as you do 22 | not change their names. 23 | 24 | * [x] Start using project configurations following the [quickstart guide](../../bundles-quickstart/#brickflow-projects-setup). 25 | 26 | * [x] Confirm the existence of the following files: 27 | 28 | * brickflow-multi-project.yml 29 | * .brickflow-project-root.yml 30 | * Please reference [concepts](../../bundles-quickstart/#concepts) 31 | and [initialize project](../../bundles-quickstart/#initialize-project) for more details. 32 | 33 | * [x] RelativePathPackageResolver has been removed from the project; imports now resolve seamlessly 34 | as long as you import brickflow at the top. 35 | 36 | * [x] Ensure the import for brickflow is at the top of your entrypoint.py 37 | 38 | 39 | 40 | 41 | * [x] Ensure your entrypoint looks like this. **Make sure to click the plus buttons and read the highlighted sections**: 42 | 43 | ```python linenums="1" hl_lines="5 7 15 18" 44 | # Databricks notebook source 45 | 46 | # COMMAND ---------- 47 | 48 | from brickflow import Project # (1)! 49 | 50 | import workflows # (2)! 51 | 52 | def main() -> None: 53 | """Project entrypoint""" 54 | with Project( 55 | "product_abc_workflows_2", 56 | git_repo="https://github.com/stikkireddy/mono-repo-test", 57 | provider="github", 58 | libraries=[ # (3)! 59 | # PypiTaskLibrary(package="spark-expectations==0.5.0"), # Uncomment if spark-expectations is needed 60 | ], 61 | enable_plugins=True, # (4)! 62 | ) as f: 63 | f.add_pkg(workflows) 64 | 65 | 66 | if __name__ == "__main__": 67 | main() 68 | ``` 69 | 70 | 1. Make sure brickflow is at the top of your imports! This will help resolve paths and allow other libraries to be 71 | imported correctly. 72 | 2. Import your modules after brickflow has been imported! Make sure your optimize-imports tooling doesn't reorder your imports! 73 | 3. Make sure you remove brickflow, brickflow plugins and cron utils from this list. 74 | 4. Make sure you have enable_plugins=True.
This will enable the plugins to be loaded to support airflow operators, etc. 75 | Disable this if you dont want to install airflow. 76 | 77 | 78 | -------------------------------------------------------------------------------- /examples/brickflow_examples/.brickflow-project-root.yml: -------------------------------------------------------------------------------- 1 | # DO NOT MODIFY THIS FILE - IT IS AUTO GENERATED BY BRICKFLOW AND RESERVED FOR FUTURE USAGE 2 | projects: 3 | brickflow-demo: 4 | brickflow_version: auto 5 | deployment_mode: bundle 6 | enable_plugins: true 7 | name: brickflow-demo 8 | path_from_repo_root_to_project_root: . 9 | path_project_root_to_workflows_dir: workflows 10 | version: v1 11 | -------------------------------------------------------------------------------- /examples/brickflow_examples/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | 132 | # GENERATED BY BRICKFLOW CLI --START-- 133 | 134 | ### Terraform ### 135 | # Local .terraform directories 136 | **/.terraform/* 137 | 138 | # .tfstate files 139 | *.tfstate 140 | *.tfstate.* 141 | 142 | # Crash log files 143 | crash.log 144 | crash.*.log 145 | 146 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as 147 | # password, private keys, and other secrets. These should not be part of version 148 | # control as they are data points which are potentially sensitive and subject 149 | # to change depending on the environment. 150 | *.tfvars 151 | *.tfvars.json 152 | 153 | # Ignore override files as they are usually used to override resources locally and so 154 | # are not checked in 155 | override.tf 156 | override.tf.json 157 | *_override.tf 158 | *_override.tf.json 159 | 160 | # Include override files you do wish to add to version control using negated pattern 161 | # !example_override.tf 162 | 163 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 164 | # example: *tfplan* 165 | 166 | # Ignore CLI configuration files 167 | .terraformrc 168 | terraform.rc 169 | 170 | # GENERATED BY BRICKFLOW CLI --END-- 171 | 172 | .idea 173 | bundle.yml -------------------------------------------------------------------------------- /examples/brickflow_examples/README.md: -------------------------------------------------------------------------------- 1 | # brickflow-examples 2 | This repository consists of examples for brickflow 3 | 4 | ## Getting Started 5 | 6 | ### Prerequisites 7 | 1.Install brickflows 8 | 9 | ```shell 10 | pip install brickflows 11 | ``` 12 | 13 | 2.Install [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/databricks-cli.html) 14 | 15 | ```shell 16 | curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sudo sh 17 | ``` 18 | 19 | 3.Configure Databricks cli with workspace token. This configures your `~/.databrickscfg` file. 
20 | 21 | ```shell 22 | databricks configure --token 23 | ``` 24 | 25 | ### Clone the repository 26 | 27 | ```shell 28 | git clone https://github.com/Nike-Inc/brickflow.git 29 | cd brickflow/examples/brickflow_examples 30 | ``` 31 | 32 | ### Hello World workflow 33 | - Create your first workflow using brickflow 34 | - Create a new file hello_world_workflow.py in the workflows directory 35 | - Add the following code to the file 36 | ```python 37 | from brickflow import ( 38 | Cluster, 39 | Workflow, 40 | NotebookTask, 41 | ) 42 | from brickflow.context import ctx 43 | from airflow.operators.bash import BashOperator 44 | 45 | 46 | cluster = Cluster( 47 | name="job_cluster", 48 | node_type_id="m6gd.xlarge", 49 | spark_version="13.3.x-scala2.12", 50 | min_workers=1, 51 | max_workers=2, 52 | ) 53 | 54 | wf = Workflow( 55 | "hello_world_workflow", 56 | default_cluster=cluster, 57 | tags={ 58 | "product_id": "brickflow_demo", 59 | }, 60 | common_task_parameters={ 61 | "catalog": "", 62 | "database": "", 63 | }, 64 | ) 65 | 66 | @wf.task 67 | # this task does nothing but explains the use of context object 68 | def start(): 69 | print(f"Environment: {ctx.env}") 70 | 71 | @wf.notebook_task 72 | # this task runs a databricks notebook 73 | def example_notebook(): 74 | return NotebookTask( 75 | notebook_path="notebooks/example_notebook.py", 76 | base_parameters={ 77 | "some_parameter": "some_value", # in the notebook access these via dbutils.widgets.get("some_parameter") 78 | }, 79 | ) 80 | 81 | 82 | @wf.task(depends_on=[start, example_notebook]) 83 | # this task runs a bash command 84 | def list_lending_club_data_files(): 85 | return BashOperator( 86 | task_id=list_lending_club_data_files.__name__, 87 | bash_command="ls -lrt /dbfs/databricks-datasets/samples/lending_club/parquet/", 88 | ) 89 | 90 | @wf.task(depends_on=list_lending_club_data_files) 91 | # this task runs the pyspark code 92 | def lending_data_ingest(): 93 | ctx.spark.sql( 94 | f""" 95 | CREATE TABLE IF NOT EXISTS 96 | {ctx.dbutils_widget_get_or_else(key="catalog", debug="development")}.\ 97 | {ctx.dbutils_widget_get_or_else(key="database", debug="dummy_database")}.\ 98 | {ctx.dbutils_widget_get_or_else(key="brickflow_env", debug="local")}_lending_data_ingest 99 | USING DELTA -- this is default just for explicit purpose 100 | SELECT * FROM parquet.`dbfs:/databricks-datasets/samples/lending_club/parquet/` 101 | """ 102 | ) 103 | ``` 104 | _Note: Modify the values of catalog/database for common_task_parameters._ 105 | 106 | ### Update demo_wf.py 107 | - demo_wf.py explains the various tasks and options available for the tasks 108 | - You can remove the demo_wf.py in case you just to run the hello_world_workflow.py 109 | - In case you want to run the demo_wf.py, update the below params with your values 110 | - default_cluster 111 | - common_task_parameters 112 | - permissions 113 | - default_task_settings 114 | 115 | ### Deploy the workflow to databricks 116 | ```shell 117 | brickflow projects deploy --project brickflow-demo -e local 118 | ``` 119 | 120 | ### Run the demo workflow 121 | - login to databricks workspace 122 | - go to the workflows and select the workflow 123 | ![img.png](../../docs/img/workflow.png) 124 | - click on the run button 125 | -------------------------------------------------------------------------------- /examples/brickflow_examples/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_examples/__init__.py -------------------------------------------------------------------------------- /examples/brickflow_examples/brickflow-multi-project.yml: -------------------------------------------------------------------------------- 1 | project_roots: 2 | brickflow-demo: 3 | root_yaml_rel_path: . 4 | version: v1 5 | -------------------------------------------------------------------------------- /examples/brickflow_examples/notebooks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_examples/notebooks/__init__.py -------------------------------------------------------------------------------- /examples/brickflow_examples/notebooks/example_notebook.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | print("hello world") 4 | -------------------------------------------------------------------------------- /examples/brickflow_examples/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_examples/src/__init__.py -------------------------------------------------------------------------------- /examples/brickflow_examples/src/python/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_examples/src/python/__init__.py -------------------------------------------------------------------------------- /examples/brickflow_examples/src/python/lending_data_show.py: -------------------------------------------------------------------------------- 1 | from brickflow.context import ctx 2 | 3 | 4 | def lending_data_print(): 5 | ctx.spark.sql( 6 | """ 7 | SELECT 8 | addr_state, * 9 | FROM 10 | parquet.`dbfs:/databricks-datasets/samples/lending_club/parquet/` limit 10 11 | """ 12 | ).show(truncate=False) 13 | 14 | 15 | if __name__ == "__main__": 16 | lending_data_print() 17 | -------------------------------------------------------------------------------- /examples/brickflow_examples/src/python/setup_data.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %fs ls dbfs:/databricks-datasets/samples/lending_club/parquet/ 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %sql 7 | # MAGIC SELECT 8 | # MAGIC addr_state, * 9 | # MAGIC FROM 10 | # MAGIC parquet.`dbfs:/databricks-datasets/samples/lending_club/parquet/` 11 | 12 | # COMMAND ---------- 13 | 14 | 15 | # -- ingest step 16 | catalog = "development" 17 | database = "team_databricks_sme" 18 | spark.sql( 19 | f""" 20 | CREATE TABLE IF NOT EXISTS {catalog}.{database}.lending_data 21 | USING DELTA -- this is default just for explicit purpose 22 | SELECT * FROM parquet.`dbfs:/databricks-datasets/samples/lending_club/parquet/` 23 | """ 24 | ) 25 | 26 | # COMMAND ---------- 27 | 28 | # Step 2 29 | catalog = "development" 30 | database = "team_databricks_sme" 31 | spark.sql( 32 | f""" 33 | OPTIMIZE {catalog}.{database}.lending_data; 34 | """ 35 | ) 36 | 37 | # COMMAND ---------- 38 | 39 | # MAGIC %sql 40 | # MAGIC SELECT distinct addr_state FROM 
development.team_databricks_sme.lending_data 41 | 42 | # COMMAND ---------- 43 | 44 | 45 | # -- T&S 1 process AZ data 46 | catalog = "development" 47 | database = "team_databricks_sme" 48 | spark.sql( 49 | f""" 50 | CREATE OR REPLACE TABLE {catalog}.{database}.lending_data_az_geo 51 | USING DELTA -- this is default just for explicit purpose 52 | SELECT * FROM {catalog}.{database}.lending_data where addr_state = 'AZ' 53 | """ 54 | ) 55 | 56 | # COMMAND ---------- 57 | 58 | # -- T&S 2 process CA data 59 | catalog = "development" 60 | database = "team_databricks_sme" 61 | spark.sql( 62 | f""" 63 | CREATE OR REPLACE TABLE {catalog}.{database}.lending_data_ca_geo 64 | USING DELTA -- this is default just for explicit purpose 65 | SELECT * FROM {catalog}.{database}.lending_data where addr_state = 'CA' 66 | """ 67 | ) 68 | 69 | # COMMAND ---------- 70 | 71 | # -- T&S 3 process IL data 72 | catalog = "development" 73 | database = "team_databricks_sme" 74 | spark.sql( 75 | f""" 76 | CREATE OR REPLACE TABLE {catalog}.{database}.lending_data_il_geo 77 | USING DELTA -- this is default just for explicit purpose 78 | SELECT * FROM {catalog}.{database}.lending_data where addr_state = 'IL' 79 | """ 80 | ) 81 | 82 | # COMMAND ---------- 83 | 84 | # -- Union Data Together 85 | catalog = "development" 86 | database = "team_databricks_sme" 87 | spark.sql( 88 | f""" 89 | CREATE OR REPLACE TABLE {catalog}.{database}.lending_data_az_ca_il_geo 90 | USING DELTA -- this is default just for explicit purpose 91 | SELECT * FROM {catalog}.{database}.lending_data_az_geo 92 | UNION ALL 93 | SELECT * FROM {catalog}.{database}.lending_data_ca_geo 94 | UNION ALL 95 | SELECT * FROM {catalog}.{database}.lending_data_il_geo 96 | """ 97 | ) 98 | 99 | # COMMAND ---------- 100 | 101 | # -- Read the unioned data and export a sample 102 | catalog = "development" 103 | database = "team_databricks_sme" 104 | spark.sql( 105 | f""" 106 | SELECT * FROM {catalog}.{database}.lending_data_az_ca_il_geo 107 | """ 108 | ).limit(10).toPandas().to_csv("data.csv") 109 | with open("data.csv", "r") as f: 110 | print(f.read()) 111 | 112 | # COMMAND ---------- 113 | -------------------------------------------------------------------------------- /examples/brickflow_examples/src/sql/sample.sql: -------------------------------------------------------------------------------- 1 | create or replace table $database.$schema.sample as 2 | select * from $database.$schema.source -------------------------------------------------------------------------------- /examples/brickflow_examples/workflows/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_examples/workflows/__init__.py -------------------------------------------------------------------------------- /examples/brickflow_examples/workflows/entrypoint.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | import brickflow 4 | from brickflow import Project, PypiTaskLibrary 5 | import workflows 6 | 7 | 8 | def main() -> None: 9 | with Project( 10 | "brickflow-demo", 11 | git_repo="https://github.com/Nike-Inc/brickflow", 12 | provider="github", 13 | libraries=[ 14 | PypiTaskLibrary( 15 | package="spark-expectations==0.8.0" 16 | ), # comment out if spark-expectations is not needed 17 | ], 18 | ) as f: 19 | f.add_pkg(workflows) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 |
-------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/.brickflow-project-root.yml: -------------------------------------------------------------------------------- 1 | # DO NOT MODIFY THIS FILE - IT IS AUTO GENERATED BY BRICKFLOW AND RESERVED FOR FUTURE USAGE 2 | projects: 3 | for_each_task_examples: 4 | brickflow_version: auto 5 | deployment_mode: bundle 6 | enable_plugins: true 7 | name: for_each_task_examples 8 | path_from_repo_root_to_project_root: examples/brickflow_for_each_task_examples 9 | path_project_root_to_workflows_dir: workflows 10 | version: v1 11 | -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/README.md: -------------------------------------------------------------------------------- 1 | # Brickflow for each task examples 2 | This repository contains some examples on how to use the fo each task type in brickflow. 3 | 4 | ## Getting Started 5 | 6 | ### Prerequisites 7 | 1.Install brickflows 8 | 9 | ```shell 10 | pip install brickflows 11 | ``` 12 | 13 | 2.Install [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/databricks-cli.html) 14 | 15 | ```shell 16 | curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sudo sh 17 | ``` 18 | 19 | 3.Configure Databricks cli with workspace token. This configures your `~/.databrickscfg` file. 20 | 21 | ```shell 22 | databricks configure --token 23 | ``` 24 | 25 | ### Clone the repository 26 | 27 | ```shell 28 | git clone https://github.com/Nike-Inc/brickflow.git 29 | cd brickflow/examples/brickflow_serverless_examples 30 | ``` 31 | 32 | ### Customize the workflow 33 | 34 | Replace all the placeholders in workflows/for_each_task_workflow.py with configuration values compatible with your databricks workspace 35 | 36 | 37 | ### Deploy the workflow to databricks 38 | ```shell 39 | brickflow projects deploy --project for_each_task_examples -e local 40 | ``` 41 | 42 | ### Run the demo workflow 43 | - login to databricks workspace 44 | - go to the workflows and select the workflow 45 | - click on the run button 46 | -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_for_each_task_examples/__init__.py -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/brickflow-multi-project.yml: -------------------------------------------------------------------------------- 1 | project_roots: 2 | for_each_task_examples: 3 | root_yaml_rel_path: . 
4 | version: v1 5 | -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/notebooks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_for_each_task_examples/notebooks/__init__.py -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/notebooks/example_notebook.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | param = dbutils.widgets.get("looped_parameter") 4 | print(f"Hey this is a nested notebook running with inputs: {param}") 5 | -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_for_each_task_examples/src/__init__.py -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/src/python/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_for_each_task_examples/src/python/__init__.py -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/src/python/print_args.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if __name__ == "__main__": 4 | print(f"Hello, running with input {sys.argv}") 5 | -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/workflows/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_for_each_task_examples/workflows/__init__.py -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/workflows/entrypoint.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | import brickflow 4 | from brickflow import Project 5 | import workflows 6 | 7 | 8 | def main() -> None: 9 | with Project( 10 | "for_each_task_examples", 11 | git_repo="https://github.com/Nike-Inc/brickflow", 12 | provider="github", 13 | ) as f: 14 | f.add_pkg(workflows) 15 | 16 | 17 | if __name__ == "__main__": 18 | main() 19 | -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/workflows/for_each_task_wf.py: -------------------------------------------------------------------------------- 1 | from brickflow import ( 2 | Workflow, 3 | WorkflowPermissions, 4 | User, 5 | NotebookTask, 6 | Cluster, 7 | JarTaskLibrary, 8 | SparkJarTask, 9 | SparkPythonTask, 10 | SqlTask, 11 | ) 12 | 13 | from brickflow.context import ctx 14 | from brickflow.engine.task import JobsTasksForEachTaskConfigs 15 | 16 | cluster = Cluster( 17 | name=f"job_cluster_for_each_task_examples", 18 | driver_node_type_id="r7g.large", 19 | 
node_type_id="r7g.large", 20 | spark_version="13.3.x-scala2.12", 21 | min_workers=1, 22 | max_workers=1, 23 | policy_id="", # replace with an existing policy id 24 | ) 25 | 26 | wf = Workflow( 27 | "for_each_task_examples_wf", 28 | default_cluster=cluster, 29 | permissions=WorkflowPermissions( 30 | can_manage=[ 31 | User( 32 | "" # replace email with existing users' email on databricks 33 | ) 34 | ], 35 | ), 36 | ) 37 | 38 | 39 | @wf.task 40 | def example_task(): 41 | print("This is a dependant task!") 42 | 43 | 44 | @wf.for_each_task( 45 | depends_on=example_task, 46 | for_each_task_conf=JobsTasksForEachTaskConfigs( 47 | # Inputs can be provided by either a python iterable or a json-string 48 | inputs=[ 49 | "AZ", 50 | "CA", 51 | "IL", 52 | ], 53 | concurrency=3, 54 | ), 55 | ) 56 | def example_notebook(): 57 | return NotebookTask( 58 | notebook_path="notebooks/example_notebook.py", 59 | base_parameters={"looped_parameter": "{{input}}"}, 60 | ) 61 | 62 | 63 | @wf.for_each_task( 64 | depends_on=example_task, 65 | for_each_task_conf=JobsTasksForEachTaskConfigs( 66 | inputs='["1", "2", "3"]', concurrency=3 67 | ), 68 | ) 69 | def example_brickflow_task(*, test_param="{{input}}"): 70 | print(f"Test param: {test_param}") 71 | param = ctx.get_parameter("looped_parameter") 72 | print(f"Nested brickflow task running with input: {param}") 73 | 74 | 75 | @wf.for_each_task( 76 | depends_on=example_task, 77 | libraries=[ 78 | JarTaskLibrary( 79 | jar="" 80 | ) # Replace with actual jar path 81 | ], 82 | for_each_task_conf=JobsTasksForEachTaskConfigs( 83 | inputs="[1,2,3]", 84 | concurrency=1, 85 | ), 86 | ) 87 | def for_each_spark_jar(): 88 | return SparkJarTask( 89 | main_class_name="com.example.MainClass", # Replace with actual main class name 90 | parameters=["{{input}}"], 91 | ) 92 | 93 | 94 | @wf.for_each_task( 95 | depends_on=example_task, 96 | for_each_task_conf=JobsTasksForEachTaskConfigs( 97 | inputs="[1,2,3]", 98 | concurrency=1, 99 | ), 100 | ) 101 | def for_each_spark_python(): 102 | return SparkPythonTask( 103 | python_file="examples/brickflow_for_each_task_examples/src/python/print_args.py", 104 | source="WORKSPACE", 105 | parameters=["{{input}}"], 106 | ) 107 | 108 | 109 | @wf.for_each_task( 110 | depends_on=example_notebook, 111 | for_each_task_conf=JobsTasksForEachTaskConfigs( 112 | inputs="[1,2,3]", 113 | concurrency=1, 114 | ), 115 | ) 116 | def for_each_sql_task() -> any: 117 | return SqlTask( 118 | query_id="", # Replace with actual query id 119 | warehouse_id="", # Replace with actual warehouse id 120 | parameters={"looped_parameter": "{{input}}"}, 121 | ) 122 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/.brickflow-project-root.yml: -------------------------------------------------------------------------------- 1 | # DO NOT MODIFY THIS FILE - IT IS AUTO GENERATED BY BRICKFLOW AND RESERVED FOR FUTURE USAGE 2 | projects: 3 | brickflow-serverless-demo: 4 | brickflow_version: auto 5 | deployment_mode: bundle 6 | enable_plugins: true 7 | name: brickflow-serverless-demo 8 | path_from_repo_root_to_project_root: . 
9 | path_project_root_to_workflows_dir: workflows 10 | version: v1 11 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | 132 | # GENERATED BY BRICKFLOW CLI --START-- 133 | 134 | ### Terraform ### 135 | # Local .terraform directories 136 | **/.terraform/* 137 | 138 | # .tfstate files 139 | *.tfstate 140 | *.tfstate.* 141 | 142 | # Crash log files 143 | crash.log 144 | crash.*.log 145 | 146 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as 147 | # password, private keys, and other secrets. These should not be part of version 148 | # control as they are data points which are potentially sensitive and subject 149 | # to change depending on the environment. 
150 | *.tfvars 151 | *.tfvars.json 152 | 153 | # Ignore override files as they are usually used to override resources locally and so 154 | # are not checked in 155 | override.tf 156 | override.tf.json 157 | *_override.tf 158 | *_override.tf.json 159 | 160 | # Include override files you do wish to add to version control using negated pattern 161 | # !example_override.tf 162 | 163 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 164 | # example: *tfplan* 165 | 166 | # Ignore CLI configuration files 167 | .terraformrc 168 | terraform.rc 169 | 170 | # GENERATED BY BRICKFLOW CLI --END-- 171 | 172 | .idea 173 | bundle.yml -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/README.md: -------------------------------------------------------------------------------- 1 | # Brickflows Serverless Example 2 | This project contains the example of the serverless workflow, that contains: 3 | - notebook task 4 | - python task 5 | - native Brickflow entrypoint task 6 | 7 | Note that in notebook task and entrypoint task the dependencies are set through magic `pip install` commands within 8 | the notebook. 9 | 10 | ## Getting Started 11 | 12 | ### Prerequisites 13 | 1.Install brickflows 14 | 15 | ```shell 16 | pip install brickflows 17 | ``` 18 | 19 | 2.Install [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/databricks-cli.html) 20 | 21 | ```shell 22 | curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sudo sh 23 | ``` 24 | 25 | 3.Configure Databricks cli with workspace token. This configures your `~/.databrickscfg` file. 26 | 27 | ```shell 28 | databricks configure --token 29 | ``` 30 | 31 | ### Clone the repository 32 | 33 | ```shell 34 | git clone https://github.com/Nike-Inc/brickflow.git 35 | cd brickflow/examples/brickflow_serverless_examples 36 | ``` 37 | 38 | ### Deploy the workflow to databricks 39 | ```shell 40 | brickflow projects deploy --project brickflow-serverless-demo -e local 41 | ``` 42 | 43 | ### Run the demo workflow 44 | - login to databricks workspace 45 | - go to the workflows and select the workflow 46 | - click on the run button 47 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_serverless_examples/__init__.py -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/brickflow-multi-project.yml: -------------------------------------------------------------------------------- 1 | project_roots: 2 | brickflow-serverless-demo: 3 | root_yaml_rel_path: . 
4 | version: v1 5 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/notebooks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_serverless_examples/notebooks/__init__.py -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/notebooks/example_notebook.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %pip install pytz==2024.2 3 | 4 | # COMMAND ---------- 5 | import pytz 6 | from datetime import datetime 7 | 8 | 9 | def get_current_time_in_timezone(timezone_str): 10 | # Get the timezone object 11 | timezone = pytz.timezone(timezone_str) 12 | # Get the current time in the specified timezone 13 | current_time = datetime.now(timezone) 14 | return current_time 15 | 16 | 17 | # Example usage 18 | timezones = ["UTC", "Europe/Amsterdam", "Asia/Tokyo", "America/New_York"] 19 | for tz in timezones: 20 | print(f"Current time in {tz}: {get_current_time_in_timezone(tz)}") 21 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_serverless_examples/src/__init__.py -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/src/python/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_serverless_examples/src/python/__init__.py -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/src/python/example.py: -------------------------------------------------------------------------------- 1 | import pytz 2 | from datetime import datetime 3 | import argparse 4 | 5 | 6 | def get_current_time_in_timezone(timezone_str): 7 | # Get the timezone object 8 | timezone = pytz.timezone(timezone_str) 9 | # Get the current time in the specified timezone 10 | current_time = datetime.now(timezone) 11 | return current_time 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser( 16 | description="Get the current time in a specified timezone." 
17 | ) 18 | parser.add_argument( 19 | "--timezone", 20 | type=str, 21 | required=True, 22 | help="The timezone to get the current time for.", 23 | ) 24 | args = parser.parse_args() 25 | 26 | try: 27 | current_time = get_current_time_in_timezone(args.timezone) 28 | print(f"Current time in {args.timezone}: {current_time}") 29 | except pytz.UnknownTimeZoneError: 30 | print(f"Unknown timezone: {args.timezone}") 31 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/workflows/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/examples/brickflow_serverless_examples/workflows/__init__.py -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/workflows/demo_serverless_wf.py: -------------------------------------------------------------------------------- 1 | from brickflow import ( 2 | Workflow, 3 | NotebookTask, 4 | SparkPythonTask, 5 | ) 6 | from brickflow.engine.task import PypiTaskLibrary 7 | 8 | wf = Workflow( 9 | "brickflow-serverless-demo", 10 | schedule_quartz_expression="0 0/20 0 ? * * *", 11 | libraries=[ 12 | PypiTaskLibrary(package="pytz==2024.2"), 13 | # Custom repositories are not supported for serverless workloads, due to Databricks CLI limitations. 14 | # Refer to: https://github.com/databricks/cli/pull/1842This will be fixed in the future releases, use wheel instead. 15 | # PypiTaskLibrary( 16 | # package="my-lib==1.2.3", repo="https://artifactory.my-org.com/api/pypi/python-virtual/simple" 17 | # ), 18 | ], 19 | ) 20 | 21 | 22 | @wf.task 23 | def entrypoint_task(): 24 | pass 25 | 26 | 27 | @wf.notebook_task 28 | def notebook_task(): 29 | return NotebookTask( 30 | notebook_path="notebooks/example_notebook.py", 31 | base_parameters={ 32 | "some_parameter": "some_value", # in the notebook access these via dbutils.widgets.get("some_parameter") 33 | }, 34 | ) # type: ignore 35 | 36 | 37 | @wf.spark_python_task 38 | def spark_python_task(): 39 | return SparkPythonTask( 40 | python_file="/src/python/example.py", 41 | source="GIT", 42 | parameters=["--timezone", "UTC"], 43 | ) # type: ignore 44 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/workflows/entrypoint.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # This should point to the `brickflows` version with serverless support or the wheel file with the same 3 | # MAGIC %pip install brickflows==1.2.1 4 | # MAGIC %pip install koheesio==0.8.1 5 | # MAGIC %restart_python 6 | 7 | # COMMAND ---------- 8 | import brickflow 9 | from brickflow import Project, PypiTaskLibrary 10 | import workflows 11 | 12 | 13 | def main() -> None: 14 | with Project( 15 | "brickflow-serverless-demo", 16 | git_repo="https://github.com/Nike-Inc/brickflow", 17 | provider="github", 18 | ) as f: 19 | f.add_pkg(workflows) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: BrickFlow 2 | site_description: Brickflow is a tool for managing and deploying scalable workflows on Databricks. 
3 | site_url: https://brickflow.readthedocs.io/en/latest/ 4 | 5 | theme: 6 | name: material 7 | palette: 8 | - scheme: default 9 | primary: indigo 10 | accent: indigo 11 | toggle: 12 | icon: material/brightness-7 13 | name: Switch to dark mode 14 | - scheme: slate 15 | primary: indigo 16 | accent: indigo 17 | toggle: 18 | icon: material/brightness-4 19 | name: Switch to light mode 20 | features: 21 | # - announce.dismiss 22 | - content.code.annotate 23 | # - content.tabs.link 24 | - content.tooltips 25 | - content.code.copy 26 | # - header.autohide 27 | # - navigation.expand 28 | - navigation.indexes 29 | - navigation.instant 30 | # - navigation.prune 31 | # - navigation.sections 32 | - navigation.tabs 33 | - navigation.tabs.sticky 34 | - navigation.top 35 | - navigation.tracking 36 | - navigation.expand 37 | - search.highlight 38 | - search.share 39 | - search.suggest 40 | - toc.follow 41 | font: 42 | text: Roboto 43 | code: Roboto Mono 44 | logo: img/bf_logo.png 45 | favicon: img/bf_logo.png 46 | language: en 47 | 48 | repo_name: nike/brickflow 49 | repo_url: https://github.com/Nike-Inc/brickflow 50 | 51 | plugins: 52 | - search: 53 | lang: en 54 | - mkdocstrings: 55 | handlers: 56 | python: 57 | paths: [ "brickflow" ] # search packages in the src folder 58 | options: 59 | show_source: true 60 | show_root_heading: false 61 | heading_level: 1 62 | merge_init_into_class: true 63 | show_if_no_docstring: true 64 | show_root_full_path: true 65 | show_root_members_full_path: true 66 | show_root_toc_entry: false 67 | show_category_heading: true 68 | show_signature_annotations: true 69 | separate_signature: false 70 | 71 | markdown_extensions: 72 | - abbr 73 | - admonition 74 | - mkdocs-click 75 | - attr_list 76 | - def_list 77 | - footnotes 78 | - md_in_html 79 | - toc: 80 | permalink: true 81 | - pymdownx.arithmatex: 82 | generic: true 83 | - pymdownx.betterem: 84 | smart_enable: all 85 | - pymdownx.caret 86 | - pymdownx.details 87 | - pymdownx.emoji: 88 | emoji_generator: !!python/name:materialx.emoji.to_svg 89 | emoji_index: !!python/name:materialx.emoji.twemoji 90 | - pymdownx.highlight: 91 | anchor_linenums: true 92 | - pymdownx.inlinehilite 93 | - pymdownx.keys 94 | - pymdownx.magiclink: 95 | repo_url_shorthand: true 96 | user: squidfunk 97 | repo: mkdocs-material 98 | - pymdownx.mark 99 | - pymdownx.smartsymbols 100 | - pymdownx.superfences: 101 | custom_fences: 102 | - name: mermaid 103 | class: mermaid 104 | format: !!python/name:pymdownx.superfences.fence_code_format 105 | - pymdownx.tabbed: 106 | alternate_style: true 107 | - pymdownx.tasklist: 108 | custom_checkbox: true 109 | - pymdownx.tilde 110 | 111 | watch: 112 | - brickflow 113 | - brickflow_plugins 114 | 115 | extra_css: 116 | - css/custom.css 117 | 118 | nav: 119 | - Home: index.md 120 | - Quickstart: 121 | - Brickflow Projects: bundles-quickstart.md 122 | - Upgrading Versions: 123 | - Upgrading to v0.10.x: upgrades/upgrade-pre-0-10-0-to-0-10-0.md 124 | - Concepts: 125 | - HighLevel: highlevel.md 126 | - Workflows: workflows.md 127 | - Tasks: tasks.md 128 | - Projects: projects.md 129 | - ENV Variables: environment-variables.md 130 | - Importing Modules: how-imports-work.md 131 | - FAQ: faq/faq.md 132 | - CLI: 133 | - Commands: cli/reference.md 134 | - Python API: 135 | - Engine: 136 | - Project: api/project.md 137 | - Workflow: api/workflow.md 138 | - Compute: api/compute.md 139 | - Task: api/task.md 140 | - Context: api/context.md 141 | - CLI: api/cli.md 142 | - Brickflow Plugins: 143 | - AirflowTaskDependencySensor: 
api/airflow_external_task_dependency.md 144 | - AirflowNativeOperators: api/airflow_native_operators.md 145 | - WorkflowDependencySensor: api/workflow_dependency_sensor.md 146 | - SnowflakeOperator: api/uc_to_snowflake_operator.md 147 | - UcToSnowflakeOperator: api/uc_to_snowflake_operator.md 148 | - Secrets: api/secrets.md 149 | - TableauRefreshDataSourceOperator: api/airflow_tableau_operators.md 150 | - TableauRefreshWorkbookOperator: api/airflow_tableau_operators.md 151 | - BoxToVolumeOperator: api/box_operator.md 152 | - VolumeToBoxOperator: api/box_operator.md 153 | - BoxOperator: api/box_operator.md 154 | 155 | 156 | extra: 157 | generator: false 158 | version: 159 | provider: mike 160 | default: latest -------------------------------------------------------------------------------- /prospector.yaml: -------------------------------------------------------------------------------- 1 | strictness: high 2 | test-warnings: True 3 | doc-warnings: false 4 | 5 | ignore-paths: 6 | - build 7 | - venv 8 | - venv3 9 | - venv2 10 | - site 11 | - docs 12 | - tests/engine/sample_workflows.py 13 | - tools 14 | - .databricks 15 | - .mypy_cache 16 | - brickflow/bundles 17 | - brickflow/sample_dags 18 | - main.py 19 | - main2.py 20 | - .eggs 21 | - htmlcov 22 | - sample_workflows 23 | - integration_workflows 24 | - scripts 25 | - tests/test_brickflow.py 26 | - examples 27 | - brickflow_plugins # will eventually need to remove once there are tests and linting logic is applied 28 | 29 | max-line-length: 120 30 | 31 | pylint: 32 | disable: 33 | - too-many-branches 34 | - too-many-statements 35 | - too-many-instance-attributes 36 | - cyclic-import 37 | - len-as-condition 38 | - invalid-name 39 | - no-else-return 40 | - no-self-use 41 | - protected-access 42 | - too-many-arguments 43 | - too-many-locals # TBD: this rule is actually a good one, we need to enable it and refactor code 44 | - inconsistent-return-statements 45 | - import-outside-toplevel 46 | - consider-using-set-comprehension 47 | - useless-object-inheritance 48 | - unnecessary-pass 49 | - raise-missing-from # pretty strange requirement with acquaint logic 50 | - broad-except 51 | - arguments-differ 52 | 53 | pycodestyle: 54 | # W293: disabled because we have newlines in docstrings 55 | # E203: disabled because pep8 and black disagree on whitespace before colon in some cases 56 | disable: W293,E203,E203 # conflicts with black formatting 57 | 58 | pyflakes: 59 | disable: 60 | - F821 # ignore undefined name errors 61 | 62 | mccabe: 63 | disable: 64 | - MC0001 65 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "brickflows" 3 | version = "0.11.0a0" 4 | description = "Deploy scalable workflows to databricks using python" 5 | authors = ["Ashok Singamaneni, Sriharsha Tikkireddy"] 6 | readme = "README.md" 7 | license = "Apache License 2.0" 8 | homepage = "https://github.com/Nike-Inc/brickflow" 9 | repository = "https://github.com/Nike-Inc/brickflow" 10 | packages = [{ include = "brickflow" }, { include = "brickflow_plugins" }] 11 | include = ["LICENSE", "entrypoint.template", "gitignore_template.txt"] 12 | exclude = ["sample_workflows", "tests"] 13 | 14 | [tool.black] 15 | line-length = 88 16 | target-version = ['py39', 'py310'] 17 | include = '\.pyi?$' 18 | extend-exclude = ''' 19 | /( 20 | # The following are specific to Black, you probably don't want those. 
21 | | brickflow/tf 22 | | venv 23 | | brickflow.egg-info 24 | | dist 25 | | brickflow/bundles 26 | )/ 27 | ''' 28 | 29 | [tool.poetry.dependencies] 30 | python = ">=3.9,<3.12" # pyspark <3.5 does not play happy with python 3.11. The latest DBRs Runtime (15.4) ships with Python 3.11. 31 | Jinja2 = ">=3.1.5" 32 | click = "^8.1.3" 33 | databricks-sdk = ">=0.1.8 <1.0.0" 34 | networkx = "3.1" 35 | pendulum = "2.1.2" 36 | pluggy = "^1.0.0" 37 | pydantic = ">=2.0.0 <3.0.0" 38 | python-decouple = "3.8" 39 | pyyaml = "^6.0" 40 | requests = ">=2.28.2 <3.0.0" 41 | # cerberus-python-client = {version = "~2.5.4", optional = true } # Users might have to manually install cerberus-python-client if required 42 | # tableauserverclient = {version = "~0.25", optional = true } # Users might have to manually install tableauserverclient if required 43 | 44 | 45 | [tool.poetry.scripts] 46 | bf = "brickflow.cli:cli" 47 | brickflow = "brickflow.cli:cli" 48 | 49 | [tool.poetry.group.dev.dependencies] 50 | black = "^24.3.0" 51 | coverage = "^7.2.5" 52 | datamodel-code-generator = "^0.25.2" 53 | deepdiff = "^6.3.0" 54 | mypy = "^1.3.0" 55 | pre-commit = "^3.3.1" 56 | prospector = "^1.10.3" 57 | py4j = "^0.10.9.7" 58 | pytest = ">=7.3.1 <8.0.0" 59 | pytest-mock = "^3.10.0" 60 | types-PyYAML = "*" # only for development purposes no need to make installation req 61 | types-requests = ">=2.28.11.16 <3.0.0.0" # only for development purposes no need to make installation req 62 | apache-airflow = "^2.7.3" 63 | snowflake = "^0.6.0" 64 | tableauserverclient = "^0.25" 65 | boxsdk = "^3.9.2" 66 | cerberus-python-client = "^2.5.4" 67 | watchdog = "<4.0.0" 68 | requests-mock = "1.12.1" 69 | pyspark = "^3.0.0" 70 | apache-airflow-providers-fab = ">=1.5.2" 71 | 72 | [tool.poetry.group.docs.dependencies] 73 | mdx-include = "^1.4.2" 74 | mike = "^2.1.3" 75 | mkdocs-click = "^0.8.1" 76 | mkdocs-material = "^9.5.49" 77 | mkdocstrings = { extras = ["python"], version = "^0.27.0" } 78 | 79 | [build-system] 80 | requires = ["poetry-core", "poetry-dynamic-versioning"] 81 | build-backend = "poetry_dynamic_versioning.backend" 82 | 83 | [tool.poetry-dynamic-versioning] 84 | enable = true 85 | vcs = "git" 86 | bump = true 87 | style = "semver" 88 | 89 | [tool.coverage] 90 | [tool.coverage.run] 91 | omit = [ 92 | # omit anything in a .local directory anywhere 93 | '*/.local/*', 94 | '**', 95 | 'tests/*', 96 | '*/tests/*', 97 | # omit anything in a .venv directory anywhere 98 | '.venv/*', 99 | "*/site-packages/*", 100 | ] 101 | 102 | [tool.coverage.report] 103 | skip_empty = true 104 | 105 | [tool.mypy] 106 | disallow_untyped_defs = true 107 | ignore_missing_imports = true 108 | files = [ 109 | "brickflow/context/*.py", 110 | "brickflow/cli/*.py", 111 | "brickflow/hints/*.py", 112 | "brickflow/engine/*.py", 113 | "brickflow/resolver/*.py", 114 | "brickflow/codegen/*.py", 115 | ] 116 | follow_imports = "skip" 117 | 118 | [tool.pylint.main] 119 | fail-under = 9.0 120 | 121 | 122 | [tool.pylint."messages control"] 123 | disable = ["too-many-lines", "too-many-positional-arguments"] 124 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/tests/__init__.py -------------------------------------------------------------------------------- /tests/airflow_plugins/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/tests/airflow_plugins/__init__.py -------------------------------------------------------------------------------- /tests/airflow_plugins/test_autosys.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from requests.exceptions import HTTPError 3 | from requests_mock.mocker import Mocker as RequestsMocker 4 | 5 | from brickflow_plugins.airflow.operators.external_tasks import AutosysSensor 6 | 7 | 8 | class TestAutosysSensor: 9 | @pytest.fixture(autouse=True, name="api", scope="class") 10 | def mock_api(self): 11 | rm = RequestsMocker() 12 | rm.register_uri( 13 | method="GET", 14 | url="https://42.autosys.my-org.com/foo", 15 | response_list=[ 16 | # Test 1: Success 17 | { 18 | "json": {"status": "SU", "lastEndUTC": "2024-01-01T00:55:00Z"}, 19 | "status_code": int(200), 20 | }, 21 | # Test 2: Raise Error 22 | { 23 | "json": {}, 24 | "status_code": int(404), 25 | }, 26 | # Test 3: Poke 4 times until success 27 | { 28 | "json": {"status": "FA", "lastEndUTC": "2024-01-01T00:55:00Z"}, 29 | "status_code": int(200), 30 | }, 31 | { 32 | "json": {"status": "UNK", "lastEndUTC": None}, 33 | "status_code": int(200), 34 | }, 35 | { 36 | "json": {"status": "UNK", "lastEndUTC": ""}, 37 | "status_code": int(200), 38 | }, 39 | { 40 | "json": {"status": "SU", "lastEndUTC": "2024-01-01T01:55:00Z"}, 41 | "status_code": int(200), 42 | }, 43 | ], 44 | ) 45 | yield rm 46 | 47 | @pytest.fixture() 48 | def sensor(self): 49 | yield AutosysSensor( 50 | task_id="test", 51 | url="https://42.autosys.my-org.com/", 52 | job_name="foo", 53 | poke_interval=1, 54 | time_delta={"hours": 1}, 55 | ) 56 | 57 | def test_success(self, api, caplog, sensor): 58 | with api: 59 | sensor.poke(context={"execution_date": "2024-01-01T01:00:00Z"}) 60 | assert caplog.text.count("Poking again") == 0 61 | assert "Success criteria met. Exiting" in caplog.text 62 | 63 | def test_non_200(self, api, sensor): 64 | with pytest.raises(HTTPError): 65 | with api: 66 | sensor.poke(context={"execution_date": "2024-01-01T01:00:00Z"}) 67 | 68 | def test_poking(self, api, caplog, sensor): 69 | with api: 70 | sensor.poke(context={"execution_date": "2024-01-01T02:00:00Z"}) 71 | assert caplog.text.count("Poking again") == 3 72 | assert "Success criteria met. 
Exiting" in caplog.text 73 | -------------------------------------------------------------------------------- /tests/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/tests/cli/__init__.py -------------------------------------------------------------------------------- /tests/cli/sample_yaml_project/.brickflow-project-root.yaml: -------------------------------------------------------------------------------- 1 | version: v1 2 | projects: 3 | test_cli_project: 4 | name: test_cli_project 5 | brickflow_version: 1.2.1 6 | deployment_mode: bundle 7 | enable_plugins: false 8 | path_from_repo_root_to_project_root: some/test/path 9 | path_project_root_to_workflows_dir: path/to/workflows -------------------------------------------------------------------------------- /tests/cli/sample_yaml_project/brickflow-multi-project.yaml: -------------------------------------------------------------------------------- 1 | version: v1 2 | project_roots: 3 | test_cli_project: 4 | root_yaml_rel_path: . 5 | -------------------------------------------------------------------------------- /tests/cli/sample_yml_project/.brickflow-project-root.yml: -------------------------------------------------------------------------------- 1 | version: v1 2 | projects: 3 | test_cli_project: 4 | name: test_cli_project 5 | brickflow_version: 1.2.1 6 | deployment_mode: bundle 7 | enable_plugins: false 8 | path_from_repo_root_to_project_root: some/test/path 9 | path_project_root_to_workflows_dir: path/to/workflows -------------------------------------------------------------------------------- /tests/cli/sample_yml_project/brickflow-multi-project.yml: -------------------------------------------------------------------------------- 1 | version: v1 2 | project_roots: 3 | test_cli_project: 4 | root_yaml_rel_path: . 
5 | -------------------------------------------------------------------------------- /tests/cli/test_bundles.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Optional 4 | from unittest.mock import patch, Mock 5 | from pytest import LogCaptureFixture 6 | import pytest 7 | 8 | from brickflow import BrickflowEnvVars, _ilog 9 | from brickflow.cli.bundles import bundle_deploy, bundle_destroy 10 | 11 | 12 | class TestBundles: 13 | @patch("brickflow.cli.bundles.should_deploy", return_value=True) 14 | @patch("brickflow.cli.bundles.exec_command") 15 | @patch.dict( 16 | os.environ, {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "0.203.0"} 17 | ) 18 | def test_bundle_deploy_new_cli(self, mock_exec_command: Mock, _: Mock): 19 | mock_exec_command.side_effect = lambda *args, **kwargs: None 20 | mock_exec_command.return_value = None 21 | # workflows_dir needed to make the function work due to bundle sync 22 | bundle_deploy( 23 | force_acquire_lock=True, 24 | workflows_dir="somedir", 25 | debug=True, 26 | fail_on_active_runs=True, 27 | ) 28 | bundle_cli = os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value] 29 | mock_exec_command.assert_called_with( 30 | bundle_cli, 31 | "bundle", 32 | [ 33 | "deploy", 34 | "-t", 35 | "local", 36 | "--fail-on-active-runs", 37 | "--force-lock", 38 | "--debug", 39 | ], 40 | ) 41 | bundle_destroy(force_acquire_lock=True, workflows_dir="somedir", debug=True) 42 | bundle_cli = os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value] 43 | mock_exec_command.assert_called_with( 44 | bundle_cli, 45 | "bundle", 46 | ["destroy", "-t", "local", "--force-lock", "--debug"], 47 | ) 48 | 49 | @patch("brickflow.cli.bundles.should_deploy", return_value=True) 50 | @patch("brickflow.cli.bundles.exec_command") 51 | @patch.dict( 52 | os.environ, 53 | { 54 | BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "0.201.0", 55 | BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value: "databricks", 56 | }, 57 | ) 58 | def test_bundle_deploy_old_cli(self, mock_exec_command: Mock, _: Mock): 59 | mock_exec_command.side_effect = lambda *args, **kwargs: None 60 | mock_exec_command.return_value = None 61 | # workflows_dir needed to make the function work due to bundle sync 62 | bundle_deploy(force_acquire_lock=True, workflows_dir="somedir") 63 | bundle_cli = os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value] 64 | mock_exec_command.assert_called_with( 65 | bundle_cli, 66 | "bundle", 67 | ["deploy", "-t", "local", "--force"], 68 | ) 69 | bundle_destroy(force_acquire_lock=True, workflows_dir="somedir") 70 | bundle_cli = os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value] 71 | mock_exec_command.assert_called_with( 72 | bundle_cli, 73 | "bundle", 74 | ["destroy", "-t", "local", "--force"], 75 | ) 76 | 77 | @patch("brickflow.cli.bundles.exec_command") 78 | @patch.dict( 79 | os.environ, {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "0.203.0"} 80 | ) 81 | def test_deploy_no_workflows( 82 | self, mock_exec_command: Mock, caplog: LogCaptureFixture 83 | ): 84 | mock_exec_command.side_effect = lambda *args, **kwargs: None 85 | mock_exec_command.return_value = None 86 | 87 | # Adjusting the log level and propagating it to the root logger to make sure it's captured by caplog 88 | _ilog.propagate = True 89 | _ilog.level = logging.WARN 90 | 91 | with caplog.at_level(logging.WARN): 92 | # running this should not fail but log a warning stating that no bundle has been found 93 | 
bundle_deploy(force_acquire_lock=True, workflows_dir="somedir") 94 | 95 | assert "No bundle.yml found, skipping deployment." in [ 96 | rec.message for rec in caplog.records 97 | ] 98 | 99 | @pytest.mark.parametrize( 100 | "input_arch,expected_arch", 101 | [ 102 | ("x86_64", "amd64"), # Test one x86_64 variant 103 | ("amd64", "amd64"), # Test alternative x86_64 name 104 | ("i386", "386"), # Test one 32-bit variant 105 | ("i686", "386"), # Test alternative 32-bit name 106 | ("arm64", "arm64"), # Test one ARM variant 107 | ("aarch64", "arm64"), # Test alternative ARM name 108 | ("X86_64", "amd64"), # Test case insensitivity 109 | ("unsupported_arch", None), # Test unsupported architecture 110 | ], 111 | ) 112 | def test_get_arch_mappings( 113 | self, input_arch: str, expected_arch: Optional[str] 114 | ) -> None: 115 | from brickflow.cli.bundles import get_arch 116 | 117 | with patch("platform.machine") as mock_machine: 118 | mock_machine.return_value = input_arch 119 | 120 | if expected_arch is None: 121 | with pytest.raises(RuntimeError) as exc_info: 122 | get_arch() 123 | assert f"Unsupported architecture: {input_arch}" in str(exc_info.value) 124 | else: 125 | assert get_arch() == expected_arch 126 | -------------------------------------------------------------------------------- /tests/cli/test_cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import traceback 4 | from unittest.mock import patch, Mock 5 | 6 | import click 7 | from click.testing import CliRunner 8 | 9 | from brickflow import BrickflowProjectDeploymentSettings, BrickflowEnvVars 10 | from brickflow.cli import ( 11 | cli, 12 | exec_command, 13 | ) 14 | from brickflow.cli.bundles import ( 15 | bundle_download_path, 16 | download_and_unzip_databricks_cli, 17 | get_force_lock_flag, 18 | ) 19 | from brickflow.cli.projects import handle_libraries 20 | 21 | 22 | def fake_run(*_, **__): 23 | click.echo("hello world") 24 | 25 | 26 | # TODO: Add more tests to the cli 27 | class TestCli: 28 | def test_no_command_error(self): 29 | runner = CliRunner() 30 | non_existent_command = "non_existent_command" 31 | result = runner.invoke(cli, ["non_existent_command"]) # noqa 32 | assert result.exit_code == 2 33 | assert result.output.strip().endswith( 34 | f"Error: No such command '{non_existent_command}'." 
35 | ) 36 | 37 | @patch("webbrowser.open") 38 | def test_docs(self, browser: Mock): 39 | runner = CliRunner() 40 | browser.return_value = None 41 | result = runner.invoke(cli, ["docs"]) # noqa 42 | assert result.exit_code == 0, traceback.print_exception(*result.exc_info) 43 | assert result.output.strip().startswith("Opening browser for docs...") 44 | browser.assert_called_once_with( 45 | "https://engineering.nike.com/brickflow/", new=2 46 | ) 47 | 48 | def test_force_arg(self): 49 | with patch.dict( 50 | os.environ, {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "0.203.0"} 51 | ): 52 | assert get_force_lock_flag() == "--force-lock" 53 | with patch.dict( 54 | os.environ, {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "auto"} 55 | ): 56 | assert get_force_lock_flag() == "--force-lock" 57 | with patch.dict( 58 | os.environ, 59 | {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "something else"}, 60 | ): 61 | assert get_force_lock_flag() == "--force-lock" 62 | with patch.dict( 63 | os.environ, {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "0.202.0"} 64 | ): 65 | assert get_force_lock_flag() == "--force" 66 | 67 | def test_install_cli(self): 68 | expected_version = "0.200.0" 69 | url = bundle_download_path(expected_version) 70 | file_path = download_and_unzip_databricks_cli(url, expected_version) 71 | assert url is not None 72 | version_value = exec_command(file_path, "--version", [], capture_output=True) 73 | assert ( 74 | version_value.strip() == f"Databricks CLI v{expected_version}" 75 | ), version_value 76 | directory_path = ".databricks" 77 | if os.path.exists(directory_path): 78 | shutil.rmtree(directory_path) 79 | 80 | def test_projects_handle_libraries(self): 81 | bpd = BrickflowProjectDeploymentSettings() 82 | bpd.brickflow_auto_add_libraries = None 83 | handle_libraries(skip_libraries=True) 84 | assert bpd.brickflow_auto_add_libraries is False 85 | handle_libraries(skip_libraries=False) 86 | assert bpd.brickflow_auto_add_libraries is True 87 | bpd.brickflow_auto_add_libraries = None 88 | -------------------------------------------------------------------------------- /tests/cli/test_projects.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import shutil 3 | import os 4 | import pytest 5 | from brickflow import ConfigFileType 6 | from brickflow.cli.projects import MultiProjectManager, get_brickflow_root 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "project_folder,extension", 11 | [("sample_yml_project", "yml"), ("sample_yaml_project", "yaml")], 12 | ) 13 | def test_get_brickflow_root(project_folder, extension): 14 | cwd = os.getcwd() 15 | test_folder = str(Path(__file__).parent) 16 | 17 | # Creating empty test directories 18 | os.makedirs(f"{test_folder}/{project_folder}/some/dummy/dir", exist_ok=True) 19 | os.chdir(f"{test_folder}/{project_folder}/some/dummy/dir") 20 | 21 | actual = get_brickflow_root() 22 | assert actual == Path( 23 | f"{test_folder}/{project_folder}/brickflow-multi-project.{extension}" 24 | ) 25 | 26 | # Cleanup 27 | shutil.rmtree(f"{test_folder}/{project_folder}/some") 28 | os.chdir(cwd) 29 | 30 | 31 | @pytest.mark.parametrize( 32 | "project_folder, config_type", 33 | [ 34 | ("sample_yml_project", ConfigFileType.YML), 35 | ("sample_yaml_project", ConfigFileType.YAML), 36 | ], 37 | ) 38 | def test_multi_project_manager_yaml(project_folder, config_type): 39 | cwd = os.getcwd() 40 | test_folder = str(Path(__file__).parent) 41 | os.chdir(test_folder) 42 | 43 | config_file_name 
= ( 44 | f"{test_folder}/{project_folder}/brickflow-multi-project.{config_type.value}" 45 | ) 46 | manager = MultiProjectManager( 47 | config_file_name=config_file_name, file_type=config_type 48 | ) 49 | assert manager._brickflow_multi_project_config.version == "v1" 50 | expected_project_config = { 51 | "version": "v1", 52 | "projects": { 53 | "test_cli_project": { 54 | "name": "test_cli_project", 55 | "path_from_repo_root_to_project_root": "some/test/path", 56 | "path_project_root_to_workflows_dir": "path/to/workflows", 57 | "deployment_mode": "bundle", 58 | "brickflow_version": "1.2.1", 59 | "enable_plugins": False, 60 | } 61 | }, 62 | } 63 | assert manager._project_config_dict["."].model_dump() == expected_project_config 64 | 65 | os.chdir(cwd) 66 | -------------------------------------------------------------------------------- /tests/codegen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/tests/codegen/__init__.py -------------------------------------------------------------------------------- /tests/codegen/expected_bundles/local_bundle_continuous_schedule.yml: -------------------------------------------------------------------------------- 1 | "bundle": 2 | "name": "test-project" 3 | "targets": 4 | "test-project-local": 5 | "resources": 6 | "jobs": 7 | "wf-test-2": 8 | "continuous": 9 | "pause_status": "PAUSED" 10 | "email_notifications": null 11 | "git_source": null 12 | "health": 13 | "rules": 14 | - "metric": "RUN_DURATION_SECONDS" 15 | "op": "GREATER_THAN" 16 | "value": 7200.0 17 | "job_clusters": 18 | - "job_cluster_key": "sample_job_cluster" 19 | "new_cluster": 20 | "aws_attributes": null 21 | "custom_tags": 22 | "brickflow_deployment_mode": "Databricks Asset Bundles" 23 | "brickflow_project_name": "test-project" 24 | "brickflow_version": "1.0.0" 25 | "deployed_at": "1704067200000" 26 | "deployed_by": "test_user" 27 | "environment": "local" 28 | "data_security_mode": "SINGLE_USER" 29 | "driver_instance_pool_id": null 30 | "driver_node_type_id": null 31 | "enable_elastic_disk": null 32 | "init_scripts": null 33 | "instance_pool_id": null 34 | "node_type_id": "m6gd.xlarge" 35 | "num_workers": 1.0 36 | "policy_id": null 37 | "runtime_engine": null 38 | "spark_conf": null 39 | "spark_env_vars": null 40 | "spark_version": "13.3.x-scala2.12" 41 | "max_concurrent_runs": 1.0 42 | "name": "test_user_wf-test-2" 43 | "notification_settings": null 44 | "permissions": 45 | - "level": "IS_OWNER" 46 | "user_name": "abc@abc.com" 47 | - "level": "CAN_MANAGE" 48 | "user_name": "abc@abc.com" 49 | - "level": "CAN_MANAGE_RUN" 50 | "user_name": "abc@abc.com" 51 | - "level": "CAN_VIEW" 52 | "user_name": "abc@abc.com" 53 | "run_as": 54 | "user_name": "abc@abc.com" 55 | "schedule": null 56 | "tags": 57 | "brickflow_deployment_mode": "Databricks Asset Bundles" 58 | "brickflow_project_name": "test-project" 59 | "brickflow_version": "1.0.0" 60 | "deployed_at": "1704067200000" 61 | "deployed_by": "test_user" 62 | "environment": "local" 63 | "test": "test2" 64 | "tasks": 65 | - "depends_on": [] 66 | "email_notifications": {} 67 | "webhook_notifications": {} 68 | "job_cluster_key": "sample_job_cluster" 69 | "libraries": [] 70 | "max_retries": null 71 | "min_retry_interval_millis": null 72 | "notebook_task": 73 | "base_parameters": 74 | "all_tasks1": "test" 75 | "all_tasks3": "123" 76 | "brickflow_env": "local" 77 | "brickflow_internal_only_run_tasks": "" 78 | 
"brickflow_internal_task_name": "{{task_key}}" 79 | "brickflow_internal_workflow_name": "wf-test-2" 80 | "brickflow_internal_workflow_prefix": "" 81 | "brickflow_internal_workflow_suffix": "" 82 | "brickflow_job_id": "{{job_id}}" 83 | "brickflow_parent_run_id": "{{parent_run_id}}" 84 | "brickflow_run_id": "{{run_id}}" 85 | "brickflow_start_date": "{{start_date}}" 86 | "brickflow_start_time": "{{start_time}}" 87 | "brickflow_task_key": "{{task_key}}" 88 | "brickflow_task_retry_count": "{{task_retry_count}}" 89 | "test": "var" 90 | "notebook_path": "test_databricks_bundle.py" 91 | "source": "WORKSPACE" 92 | "retry_on_timeout": null 93 | "task_key": "task_function2" 94 | "timeout_seconds": null 95 | "timeout_seconds": null 96 | "trigger": null 97 | "webhook_notifications": null 98 | "pipelines": {} 99 | "workspace": 100 | "file_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local/files" 101 | "root_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local" 102 | "state_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local/state" 103 | "workspace": {} -------------------------------------------------------------------------------- /tests/codegen/expected_bundles/local_serverless_bundle.yml: -------------------------------------------------------------------------------- 1 | "bundle": 2 | "name": "test-project" 3 | "targets": 4 | "test-project-local": 5 | "resources": 6 | "jobs": 7 | "brickflow-serverless-demo": 8 | "continuous": null 9 | "email_notifications": null 10 | "environments": 11 | - "environment_key": "Default" 12 | "spec": 13 | "client": "1" 14 | "dependencies": 15 | - "pytz==2024.2" 16 | "health": {} 17 | "job_clusters": [] 18 | "max_concurrent_runs": 1.0 19 | "name": "test_user_brickflow-serverless-demo" 20 | "notification_settings": null 21 | "parameters": null 22 | "permissions": null 23 | "schedule": 24 | "pause_status": "PAUSED" 25 | "quartz_cron_expression": "0 0/20 0 ? 
* * *" 26 | "timezone_id": "UTC" 27 | "tags": 28 | "brickflow_deployment_mode": "Databricks Asset Bundles" 29 | "brickflow_project_name": "test-project" 30 | "brickflow_version": "1.0.0" 31 | "deployed_at": "1704067200000" 32 | "deployed_by": "test_user" 33 | "environment": "local" 34 | "tasks": 35 | - "depends_on": [] 36 | "email_notifications": {} 37 | "webhook_notifications": {} 38 | "max_retries": null 39 | "min_retry_interval_millis": null 40 | "notebook_task": 41 | "base_parameters": 42 | "brickflow_env": "local" 43 | "brickflow_internal_only_run_tasks": "" 44 | "brickflow_internal_task_name": "{{task_key}}" 45 | "brickflow_internal_workflow_name": "brickflow-serverless-demo" 46 | "brickflow_internal_workflow_prefix": "" 47 | "brickflow_internal_workflow_suffix": "" 48 | "brickflow_job_id": "{{job_id}}" 49 | "brickflow_parent_run_id": "{{parent_run_id}}" 50 | "brickflow_run_id": "{{run_id}}" 51 | "brickflow_start_date": "{{start_date}}" 52 | "brickflow_start_time": "{{start_time}}" 53 | "brickflow_task_key": "{{task_key}}" 54 | "brickflow_task_retry_count": "{{task_retry_count}}" 55 | "notebook_path": "test_databricks_bundle.py" 56 | "source": "WORKSPACE" 57 | "retry_on_timeout": null 58 | "task_key": "entrypoint_task" 59 | "timeout_seconds": null 60 | - "depends_on": [] 61 | "email_notifications": {} 62 | "webhook_notifications": {} 63 | "max_retries": null 64 | "min_retry_interval_millis": null 65 | "notebook_task": 66 | "base_parameters": 67 | "some_parameter": "some_value" 68 | "notebook_path": "notebooks/example_notebook.py" 69 | "retry_on_timeout": null 70 | "task_key": "notebook_task" 71 | "timeout_seconds": null 72 | - "depends_on": [] 73 | "email_notifications": {} 74 | "webhook_notifications": {} 75 | "environment_key": "Default" 76 | "max_retries": null 77 | "min_retry_interval_millis": null 78 | "retry_on_timeout": null 79 | "spark_python_task": 80 | "parameters": 81 | - "--timezone" 82 | - "UTC" 83 | "python_file": "/Workspace/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local/files/spark/python/src/run_task.py" 84 | "source": "WORKSPACE" 85 | "task_key": "spark_python_task" 86 | "timeout_seconds": null 87 | "timeout_seconds": null 88 | "trigger": null 89 | "webhook_notifications": null 90 | "pipelines": {} 91 | "workspace": 92 | "file_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local/files" 93 | "root_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local" 94 | "state_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local/state" 95 | "workspace": {} 96 | -------------------------------------------------------------------------------- /tests/codegen/sample_serverless_workflow.py: -------------------------------------------------------------------------------- 1 | from brickflow import ( 2 | Workflow, 3 | NotebookTask, 4 | SparkPythonTask, 5 | ) 6 | from brickflow.engine.task import PypiTaskLibrary 7 | 8 | wf = Workflow( 9 | "brickflow-serverless-demo", 10 | schedule_quartz_expression="0 0/20 0 ? 
* * *", 11 | libraries=[PypiTaskLibrary(package="pytz==2024.2")], 12 | ) 13 | 14 | 15 | @wf.task 16 | def entrypoint_task(): 17 | pass 18 | 19 | 20 | @wf.notebook_task 21 | def notebook_task(): 22 | return NotebookTask( 23 | notebook_path="notebooks/example_notebook.py", 24 | base_parameters={ 25 | "some_parameter": "some_value", # in the notebook access these via dbutils.widgets.get("some_parameter") 26 | }, 27 | ) # type: ignore 28 | 29 | 30 | @wf.spark_python_task 31 | def spark_python_task(): 32 | return SparkPythonTask( 33 | python_file="./products/test-project/spark/python/src/run_task.py", 34 | source="GIT", 35 | parameters=["--timezone", "UTC"], 36 | ) # type: ignore 37 | -------------------------------------------------------------------------------- /tests/context/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/tests/context/__init__.py -------------------------------------------------------------------------------- /tests/databricks_plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/tests/databricks_plugins/__init__.py -------------------------------------------------------------------------------- /tests/databricks_plugins/test_run_job.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | from requests_mock.mocker import Mocker as RequestsMocker 5 | 6 | from brickflow.engine.utils import ctx 7 | from brickflow_plugins.databricks.run_job import RunJobInRemoteWorkspace 8 | 9 | 10 | class TestRunJob: 11 | workspace_url = "https://42.cloud.databricks.com" 12 | endpoint_url = f"{workspace_url}/api/.*/jobs/run-now" 13 | response = {"run_id": 37, "number_in_job": 42} 14 | 15 | ctx.log.propagate = True 16 | 17 | @pytest.fixture(autouse=True) 18 | def mock_get_job_id(self, mocker): 19 | mocker.patch( 20 | "brickflow_plugins.databricks.run_job.get_job_id", 21 | return_value=1, 22 | ) 23 | 24 | @pytest.fixture(autouse=True, name="api") 25 | def mock_api(self): 26 | rm = RequestsMocker() 27 | rm.post(re.compile(self.endpoint_url), json=self.response, status_code=int(200)) 28 | yield rm 29 | 30 | def test_run_job(self, api, caplog): 31 | with api: 32 | RunJobInRemoteWorkspace( 33 | databricks_host=self.workspace_url, 34 | databricks_token="token", 35 | job_name="foo", 36 | ).execute() 37 | 38 | assert "RunNowResponse(number_in_job=42, run_id=37)" in caplog.text 39 | -------------------------------------------------------------------------------- /tests/databricks_plugins/test_workflow_dependency_sensor.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | import pytest 4 | from requests_mock.mocker import Mocker as RequestsMocker 5 | 6 | from brickflow_plugins.databricks.workflow_dependency_sensor import ( 7 | WorkflowDependencySensor, 8 | ) 9 | 10 | 11 | class TestWorkflowDependencySensor: 12 | workspace_url = "https://42.cloud.databricks.com" 13 | endpoint_url = f"{workspace_url}/api/2.1/jobs/get" 14 | response = {} 15 | 16 | def test_sensor_failure_403(self): 17 | api = RequestsMocker() 18 | api.get(self.endpoint_url, json=self.response, status_code=int(403)) 19 | 20 | # Databricks SDK will throw PermissionDenied exception if the job_id is not found or 21 | # user 
doesn't have permission 22 | from databricks.sdk.errors.platform import PermissionDenied 23 | 24 | with api: 25 | sensor = WorkflowDependencySensor( 26 | databricks_host=self.workspace_url, 27 | databricks_token="token", 28 | dependency_job_id="1", 29 | delta=timedelta(seconds=1), 30 | timeout_seconds=1, 31 | poke_interval_seconds=1, 32 | ) 33 | 34 | with pytest.raises(PermissionDenied): 35 | sensor.execute() 36 | -------------------------------------------------------------------------------- /tests/databricks_plugins/test_workflow_task_dependency_sensor.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | import pytest 4 | from requests_mock.mocker import Mocker as RequestsMocker 5 | 6 | from brickflow_plugins.databricks.workflow_dependency_sensor import ( 7 | WorkflowTaskDependencySensor, 8 | WorkflowDependencySensorTimeOutException, 9 | ) 10 | 11 | 12 | class TestWorkflowTaskDependencySensor: 13 | workspace_url = "https://42.cloud.databricks.com" 14 | endpoint_url = f"{workspace_url}/api/2.1/jobs/runs/list" 15 | response = { 16 | "runs": [ 17 | { 18 | "job_id": 1, 19 | "run_id": 1, 20 | "start_time": 1704063600000, 21 | "state": { 22 | "result_state": "SUCCESS", 23 | }, 24 | "tasks": [ 25 | { 26 | "run_id": 100, 27 | "task_key": "foo", 28 | "state": { 29 | "result_state": "SUCCESS", 30 | }, 31 | }, 32 | { 33 | "run_id": 200, 34 | "task_key": "bar", 35 | "state": { 36 | "result_state": "FAILED", 37 | }, 38 | }, 39 | { 40 | "run_id": 300, 41 | "task_key": "baz", 42 | "state": {}, 43 | }, 44 | ], 45 | } 46 | ] 47 | } 48 | 49 | @pytest.fixture(autouse=True) 50 | def mock_get_execution_start_time_unix_milliseconds(self, mocker): 51 | mocker.patch.object( 52 | WorkflowTaskDependencySensor, 53 | "get_execution_start_time_unix_milliseconds", 54 | return_value=1704063600000, 55 | ) 56 | 57 | @pytest.fixture(autouse=True) 58 | def mock_get_job_id(self, mocker): 59 | mocker.patch( 60 | "brickflow_plugins.databricks.workflow_dependency_sensor.get_job_id", 61 | return_value=1, 62 | ) 63 | 64 | @pytest.fixture(autouse=True, name="api") 65 | def mock_api(self): 66 | rm = RequestsMocker() 67 | rm.get(self.endpoint_url, json=self.response, status_code=int(200)) 68 | yield rm 69 | 70 | def test_sensor_success(self, caplog, api): 71 | with api: 72 | sensor = WorkflowTaskDependencySensor( 73 | databricks_host=self.workspace_url, 74 | databricks_token="token", 75 | dependency_job_name="job", 76 | dependency_task_name="foo", 77 | delta=timedelta(seconds=1), 78 | timeout_seconds=1, 79 | poke_interval_seconds=1, 80 | ) 81 | 82 | sensor.execute() 83 | 84 | assert ( 85 | "Found the run_id '1' and 'foo' task with state: SUCCESS" in caplog.text 86 | ) 87 | assert "Found a successful run: 1" in caplog.text 88 | 89 | def test_sensor_failure(self, caplog, api): 90 | with api: 91 | sensor = WorkflowTaskDependencySensor( 92 | databricks_host=self.workspace_url, 93 | databricks_token="token", 94 | dependency_job_name="job", 95 | dependency_task_name="bar", 96 | delta=timedelta(seconds=1), 97 | timeout_seconds=1, 98 | poke_interval_seconds=1, 99 | ) 100 | 101 | with pytest.raises(WorkflowDependencySensorTimeOutException): 102 | sensor.execute() 103 | 104 | assert ( 105 | "Found the run_id '1' and 'bar' task with state: FAILED" 106 | in caplog.messages 107 | ) 108 | assert "Didn't find a successful task run yet..." 
in caplog.messages 109 | 110 | def test_sensor_no_state(self, caplog, api): 111 | with api: 112 | sensor = WorkflowTaskDependencySensor( 113 | databricks_host=self.workspace_url, 114 | databricks_token="token", 115 | dependency_job_name="job", 116 | dependency_task_name="baz", 117 | delta=timedelta(seconds=1), 118 | timeout_seconds=1, 119 | poke_interval_seconds=1, 120 | ) 121 | 122 | with pytest.raises(WorkflowDependencySensorTimeOutException): 123 | sensor.execute() 124 | 125 | assert ( 126 | "Found the run_id '1' and 'baz' but the task has not started yet..." 127 | in caplog.messages 128 | ) 129 | assert "Didn't find a successful task run yet..." in caplog.messages 130 | -------------------------------------------------------------------------------- /tests/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/tests/engine/__init__.py -------------------------------------------------------------------------------- /tests/engine/sample_workflow.py: -------------------------------------------------------------------------------- 1 | from brickflow.engine.compute import Cluster 2 | from brickflow.engine.task import ( 3 | BrickflowTriggerRule, 4 | TaskType, 5 | TaskResponse, 6 | DLTPipeline, 7 | RunJobTask, 8 | ) 9 | from brickflow.engine.workflow import Workflow, WorkflowPermissions, User 10 | 11 | wf = Workflow( 12 | "test", 13 | default_cluster=Cluster.from_existing_cluster("existing_cluster_id"), 14 | schedule_quartz_expression="* * * * *", 15 | permissions=WorkflowPermissions( 16 | owner=User("abc@abc.com"), 17 | can_manage_run=[User("abc@abc.com")], 18 | can_view=[User("abc@abc.com")], 19 | can_manage=[User("abc@abc.com")], 20 | ), 21 | tags={"test": "test2"}, 22 | common_task_parameters={"all_tasks1": "test", "all_tasks3": "123"}, # type: ignore 23 | health={ 24 | "rules": [ 25 | {"metric": "RUN_DURATION_SECONDS", "op": "GREATER_THAN", "value": 7200} 26 | ] 27 | }, 28 | timeout_seconds=42, 29 | ) 30 | 31 | 32 | @wf.task() 33 | def task_function(*, test="var"): 34 | return test 35 | 36 | 37 | @wf.task() 38 | def task_function_with_error(*, test="var"): 39 | raise ValueError("throwing random error") 40 | 41 | 42 | @wf.task 43 | def task_function_no_deco_args(): 44 | return "hello world" 45 | 46 | 47 | @wf.dlt_task 48 | def dlt_pipeline(): 49 | # pass 50 | return DLTPipeline( 51 | name="hello world", 52 | storage="123", 53 | language="PYTHON", 54 | configuration={}, 55 | cluster=Cluster( 56 | "test", 57 | "someversion", 58 | "vm-node", 59 | custom_tags={"name": "test"}, 60 | min_workers=2, 61 | max_workers=10, 62 | ), 63 | notebook_path="scripts/spark_script_1.py", 64 | ) 65 | 66 | 67 | @wf.dlt_task 68 | def dlt_pipeline_2(): 69 | # pass 70 | return DLTPipeline( 71 | name="hello world", 72 | storage="123", 73 | language="PYTHON", 74 | configuration={}, 75 | notebook_path="scripts/spark_script_2.py", 76 | ) 77 | 78 | 79 | @wf.task() 80 | def task_function_nokwargs(): 81 | return "hello world" 82 | 83 | 84 | @wf.task(depends_on=task_function) 85 | def task_function_2(): 86 | return "hello world" 87 | 88 | 89 | @wf.task(depends_on="task_function_2") 90 | def task_function_3(): 91 | return "hello world" 92 | 93 | 94 | @wf.task(depends_on="task_function_3", trigger_rule=BrickflowTriggerRule.NONE_FAILED) 95 | def task_function_4(): 96 | return "hello world" 97 | 98 | 99 | @wf.task( 100 | task_type=TaskType.CUSTOM_PYTHON_TASK, 101 | 
trigger_rule=BrickflowTriggerRule.NONE_FAILED, 102 | custom_execute_callback=lambda x: TaskResponse(x.name, push_return_value=True), 103 | ) 104 | def custom_python_task_push(): 105 | pass 106 | 107 | 108 | @wf.run_job_task() 109 | def run_job_task(): 110 | return RunJobTask(job_name="foo", host="https://foo.cloud.databricks.com") 111 | -------------------------------------------------------------------------------- /tests/engine/sample_workflow_2.py: -------------------------------------------------------------------------------- 1 | from brickflow import Cluster, Workflow 2 | 3 | wf = Workflow( 4 | "test1", default_cluster=Cluster.from_existing_cluster("existing_cluster_id") 5 | ) 6 | 7 | 8 | @wf.task() 9 | def task_function(*, test="var"): 10 | return test 11 | -------------------------------------------------------------------------------- /tests/engine/test_compute.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from brickflow.engine.compute import Cluster 4 | 5 | 6 | class TestCompute: 7 | def test_autoscale(self): 8 | workers = 1234 9 | cluster = Cluster( 10 | "name", "spark_version", "vm-node", min_workers=workers, max_workers=workers 11 | ) 12 | assert cluster.autoscale() == { 13 | "autoscale": { 14 | "min_workers": workers, 15 | "max_workers": workers, 16 | } 17 | } 18 | 19 | cluster = Cluster("name", "spark_version", "vm-node") 20 | assert not cluster.autoscale() 21 | 22 | def test_job_task_field(self): 23 | cluster = Cluster.from_existing_cluster("existing_cluster_id") 24 | assert cluster.job_task_field_dict == { 25 | "existing_cluster_id": "existing_cluster_id" 26 | } 27 | cluster = Cluster("name", "spark_version", "vm-node") 28 | assert cluster.job_task_field_dict == {"job_cluster_key": "name"} 29 | 30 | def test_dict(self): 31 | cluster = Cluster.from_existing_cluster("existing_cluster_id") 32 | assert "existing_cluster_id" not in cluster.as_dict() 33 | 34 | def test_valid_cluster(self): 35 | with pytest.raises(AssertionError): 36 | Cluster( 37 | "some_name", "some_version", "some_vm", min_workers=8, max_workers=4 38 | ) 39 | 40 | with pytest.raises(AssertionError): 41 | Cluster( 42 | "some_name", 43 | "some_version", 44 | "some_vm", 45 | num_workers=3, 46 | min_workers=2, 47 | max_workers=4, 48 | ) 49 | 50 | with pytest.raises(AssertionError): 51 | Cluster("some_name", "some_version", "some_vm", max_workers=4) 52 | 53 | def test_node_type_or_instance_pool(self): 54 | assert ( 55 | Cluster( 56 | "some_name", 57 | "some_version", 58 | node_type_id="some_vm", 59 | driver_node_type_id="other_vm", 60 | ).node_type_id 61 | == "some_vm" 62 | ) 63 | assert ( 64 | Cluster( 65 | "some_name", "some_version", instance_pool_id="some_instance_pool_id" 66 | ).instance_pool_id 67 | == "some_instance_pool_id" 68 | ) 69 | with pytest.raises( 70 | AssertionError, match="Must specify either instance_pool_id or node_type_id" 71 | ): 72 | Cluster( 73 | "some_name", 74 | "some_version", 75 | ) 76 | 77 | with pytest.raises( 78 | AssertionError, 79 | match="Cannot specify instance_pool_id if node_type_id has been specified", 80 | ): 81 | Cluster( 82 | "some_name", 83 | "some_version", 84 | node_type_id="some_vm", 85 | instance_pool_id="1234", 86 | ) 87 | with pytest.raises( 88 | AssertionError, 89 | match=( 90 | "Cannot specify driver_node_type_id if instance_pool_id" 91 | " or driver_instance_pool_id has been specified" 92 | ), 93 | ): 94 | Cluster( 95 | "some_name", 96 | "some_version", 97 | driver_node_type_id="other_vm", 98 | 
instance_pool_id="1234", 99 | ) 100 | with pytest.raises( 101 | AssertionError, 102 | match=( 103 | "Cannot specify driver_node_type_id if instance_pool_id" 104 | " or driver_instance_pool_id has been specified" 105 | ), 106 | ): 107 | Cluster( 108 | "some_name", 109 | "some_version", 110 | node_type_id="some_vm", 111 | driver_node_type_id="other_vm", 112 | driver_instance_pool_id="1234", 113 | ) 114 | with pytest.raises( 115 | AssertionError, 116 | match=( 117 | "Cannot specify driver_node_type_id if instance_pool_id" 118 | " or driver_instance_pool_id has been specified" 119 | ), 120 | ): 121 | Cluster( 122 | "some_name", 123 | "some_version", 124 | driver_node_type_id="other_vm", 125 | instance_pool_id="1234", 126 | driver_instance_pool_id="12345", 127 | ) 128 | -------------------------------------------------------------------------------- /tests/engine/test_engine.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | from brickflow.engine import ( 4 | get_current_commit, 5 | ) 6 | 7 | 8 | class TestEngine: 9 | def test_get_current_commit(self, mocker): 10 | branch = "some_random_sha" 11 | mocker.patch("subprocess.check_output") 12 | subprocess.check_output.return_value = branch.encode("utf-8") 13 | assert get_current_commit() == branch 14 | subprocess.check_output.assert_called_once_with( 15 | ['git log -n 1 --pretty=format:"%H"'], shell=True 16 | ) # noqa 17 | -------------------------------------------------------------------------------- /tests/engine/test_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pathlib 3 | import pytest 4 | from requests_mock.mocker import Mocker as RequestsMocker 5 | 6 | from pydantic import SecretStr 7 | 8 | from brickflow.engine.utils import get_job_id, ctx, get_bf_project_root 9 | 10 | 11 | class TestUtils: 12 | workspace_url = "https://42.cloud.databricks.com" 13 | endpoint_url = f"{workspace_url}/api/.*/jobs/list" 14 | 15 | ctx.log.propagate = True 16 | 17 | @pytest.fixture(autouse=True, name="api", scope="class") 18 | def mock_api(self): 19 | rm = RequestsMocker() 20 | rm.register_uri( 21 | method="GET", 22 | url=re.compile(self.endpoint_url), 23 | response_list=[ 24 | { 25 | "json": {"jobs": [{"job_id": 1234, "settings": {"name": "foo"}}]}, 26 | "status_code": int(200), 27 | }, 28 | { 29 | "json": {"has_more": False}, 30 | "status_code": int(200), 31 | }, 32 | { 33 | "json": {}, 34 | "status_code": int(404), 35 | }, 36 | ], 37 | ) 38 | yield rm 39 | 40 | def test_get_job_id_success(self, api): 41 | with api: 42 | job_id = get_job_id( 43 | job_name="foo", 44 | host=self.workspace_url, 45 | token=SecretStr("token"), 46 | ) 47 | assert job_id == 1234 48 | 49 | def test_get_job_id_failure(self, api): 50 | with pytest.raises(ValueError): 51 | with api: 52 | get_job_id(job_name="bar", host=self.workspace_url, token="token") 53 | 54 | def test_get_job_id_non_200(self, caplog, api): 55 | with api: 56 | get_job_id(job_name="buz", host=self.workspace_url, token="token") 57 | assert "An error occurred: request failed" in caplog.text 58 | 59 | def test_get_bf_project_root(self): 60 | # Set up expected path which is the root of the repo 61 | expected_root = pathlib.Path.cwd().parents[0] 62 | # Execute the function 63 | actual_root = get_bf_project_root() 64 | # Assert the result 65 | assert actual_root == expected_root 66 | -------------------------------------------------------------------------------- 
/tests/resolver/test_resolver.py: -------------------------------------------------------------------------------- 1 | # test_resolver.py 2 | from typing import Type 3 | 4 | import pytest 5 | 6 | import brickflow 7 | from brickflow.resolver import ( 8 | BrickflowRootNotFound, 9 | ) 10 | 11 | 12 | @pytest.fixture 13 | def default_mocks(mocker): 14 | # Create mocks for the three methods 15 | mocker.patch( 16 | "brickflow.resolver.get_caller_file_paths", return_value=["path1", "path2"] 17 | ) 18 | mocker.patch( 19 | "brickflow.resolver.get_notebook_ws_path", return_value="/notebook/ws/path" 20 | ) 21 | 22 | 23 | def test_resolver_methods(default_mocks, mocker): # noqa 24 | error_msg = "This is a test message" 25 | 26 | def make_exception_function(exc: Type[Exception]): 27 | def raise_exception(*args, **kwargs): 28 | raise exc(error_msg) 29 | 30 | return raise_exception 31 | 32 | # catch random error 33 | mocker.patch( 34 | "brickflow.resolver.go_up_till_brickflow_root", 35 | side_effect=make_exception_function(ValueError), 36 | ) 37 | with pytest.raises(ValueError, match=error_msg): 38 | brickflow.resolver.get_relative_path_to_brickflow_root() 39 | 40 | mocker.patch( 41 | "brickflow.resolver.go_up_till_brickflow_root", 42 | side_effect=make_exception_function(BrickflowRootNotFound), 43 | ) 44 | 45 | brickflow.resolver.get_relative_path_to_brickflow_root() 46 | 47 | mocker.patch( 48 | "brickflow.resolver.go_up_till_brickflow_root", 49 | side_effect=make_exception_function(PermissionError), 50 | ) 51 | 52 | brickflow.resolver.get_relative_path_to_brickflow_root() 53 | -------------------------------------------------------------------------------- /tests/sample_workflows/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/44560e2be1d3b27f587d916d9f6ef81b0edade3d/tests/sample_workflows/__init__.py -------------------------------------------------------------------------------- /tests/sample_workflows/sample_workflow_1.py: -------------------------------------------------------------------------------- 1 | from brickflow.engine.compute import Cluster 2 | from brickflow.engine.task import BrickflowTriggerRule, TaskType, TaskResponse 3 | from brickflow.engine.workflow import Workflow 4 | 5 | wf = Workflow( 6 | "test", 7 | default_cluster=Cluster.from_existing_cluster("XXXX-XXXXXX-XXXXXXXX"), 8 | tags={"test": "test2"}, 9 | common_task_parameters={"all_tasks1": "test", "all_tasks3": "123"}, # type: ignore 10 | ) 11 | 12 | 13 | @wf.task() 14 | def task_function(): 15 | return "hello world" 16 | 17 | 18 | @wf.task 19 | def task_function_no_deco_args(): 20 | return "hello world" 21 | 22 | 23 | @wf.task() 24 | def task_function_nokwargs(): 25 | return "hello world" 26 | 27 | 28 | @wf.task(depends_on=task_function) 29 | def task_function_2(): 30 | return "hello world" 31 | 32 | 33 | @wf.task(depends_on="task_function_2") 34 | def task_function_3(): 35 | return "hello world" 36 | 37 | 38 | @wf.task(depends_on="task_function_3", trigger_rule=BrickflowTriggerRule.NONE_FAILED) 39 | def task_function_4(): 40 | return "hello world" 41 | 42 | 43 | @wf.task( 44 | task_type=TaskType.CUSTOM_PYTHON_TASK, 45 | trigger_rule=BrickflowTriggerRule.NONE_FAILED, 46 | custom_execute_callback=lambda x: TaskResponse(x.name, push_return_value=True), 47 | ) 48 | def custom_python_task_push(): 49 | pass 50 | -------------------------------------------------------------------------------- /tests/sample_workflows/sample_workflow_2.py: 
-------------------------------------------------------------------------------- 1 | from brickflow.engine.compute import Cluster 2 | from brickflow.engine.task import BrickflowTriggerRule, TaskType, TaskResponse 3 | from brickflow.engine.workflow import Workflow 4 | 5 | wf = Workflow( 6 | "test2", 7 | default_cluster=Cluster.from_existing_cluster("XXXX-XXXXXX-XXXXXXXX"), 8 | tags={"test": "test2"}, 9 | ) 10 | 11 | 12 | @wf.task() 13 | def task_function(): 14 | return "hello world" 15 | 16 | 17 | @wf.task 18 | def task_function_no_deco_args(): 19 | return "hello world" 20 | 21 | 22 | @wf.task() 23 | def task_function_nokwargs(): 24 | return "hello world" 25 | 26 | 27 | @wf.task(depends_on=task_function) 28 | def task_function_2(): 29 | return "hello world" 30 | 31 | 32 | @wf.task(depends_on="task_function_2") 33 | def task_function_3(): 34 | return "hello world" 35 | 36 | 37 | @wf.task(depends_on="task_function_3", trigger_rule=BrickflowTriggerRule.NONE_FAILED) 38 | def task_function_4(): 39 | return "hello world" 40 | 41 | 42 | @wf.task( 43 | task_type=TaskType.CUSTOM_PYTHON_TASK, 44 | trigger_rule=BrickflowTriggerRule.NONE_FAILED, 45 | custom_execute_callback=lambda x: TaskResponse(x.name, push_return_value=True), 46 | ) 47 | def custom_python_task_push(): 48 | pass 49 | -------------------------------------------------------------------------------- /tests/test_brickflow.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=unused-import 2 | import pytest 3 | from brickflow import get_config_file_type, ConfigFileType 4 | 5 | 6 | def test_imports(): 7 | try: 8 | from brickflow import ( 9 | log, 10 | _ilog, 11 | BrickflowEnvVars, 12 | BrickflowDefaultEnvs, 13 | ctx, 14 | Workflow, 15 | WorkflowPermissions, 16 | User, 17 | Group, 18 | ServicePrincipal, 19 | Task, 20 | TaskType, 21 | TaskResponse, 22 | BrickflowTriggerRule, 23 | BrickflowTaskEnvVars, 24 | StorageBasedTaskLibrary, 25 | JarTaskLibrary, 26 | EggTaskLibrary, 27 | WheelTaskLibrary, 28 | PypiTaskLibrary, 29 | MavenTaskLibrary, 30 | CranTaskLibrary, 31 | EmailNotifications, 32 | DLTPipeline, 33 | DLTEdition, 34 | DLTChannels, 35 | Cluster, 36 | Runtimes, 37 | Project, 38 | ) 39 | 40 | print("All imports Succeeded") 41 | except ImportError as e: 42 | print(f"Import failed: {e}") 43 | 44 | 45 | @pytest.mark.parametrize( 46 | "config_file_name,expected_extension", 47 | [ 48 | (".brickflow-project-root.yaml", ConfigFileType.YAML), 49 | (".brickflow-project-root.yml", ConfigFileType.YML), 50 | (".brickflow-project-root.json", ConfigFileType.YAML), 51 | ], 52 | ) 53 | def test_get_config_type(config_file_name, expected_extension): 54 | actual = get_config_file_type(f"some/brickflow/root/{config_file_name}") 55 | assert actual == expected_extension 56 | -------------------------------------------------------------------------------- /tests/test_plugins.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import List 3 | from unittest import mock 4 | 5 | import pluggy 6 | import pytest 7 | 8 | from brickflow.engine.task import get_plugin_manager, get_brickflow_tasks_hook 9 | 10 | 11 | def assert_plugin_manager( 12 | pm: pluggy.PluginManager, expected_plugins: List[str] 13 | ) -> None: 14 | num_expected_plugins = len(expected_plugins) 15 | assert ( 16 | len(pm.get_plugins()) == num_expected_plugins 17 | ), f"import error should only {num_expected_plugins} plugins" 18 | for plugin in expected_plugins: 19 | assert 
pm.has_plugin(plugin), f"plugin manager should have {plugin} plugin" 20 | 21 | all_plugins = set([pm.get_name(plugin_impl) for plugin_impl in pm.get_plugins()]) 22 | assert all_plugins == set(expected_plugins), ( 23 | f"plugin manager should have {expected_plugins} " f"plugins and nothing more" 24 | ) 25 | 26 | 27 | class TestBrickflowPlugins: 28 | def test_plugins_installed(self): 29 | pm = copy.deepcopy(get_plugin_manager()) 30 | get_brickflow_tasks_hook(pm) 31 | assert_plugin_manager(pm, ["airflow-plugin", "default"]) 32 | 33 | def test_plugins_load_plugins_import_error(self): 34 | with mock.patch("brickflow_plugins.load_plugins") as load_plugins_mock: 35 | load_plugins_mock.side_effect = ImportError 36 | pm = copy.deepcopy(get_plugin_manager()) 37 | get_brickflow_tasks_hook(pm) 38 | assert_plugin_manager(pm, ["default"]) 39 | 40 | def test_plugins_ensure_installation_import_error(self): 41 | with mock.patch("brickflow_plugins.ensure_installation") as load_plugins_mock: 42 | load_plugins_mock.side_effect = ImportError 43 | pm = copy.deepcopy(get_plugin_manager()) 44 | get_brickflow_tasks_hook(pm) 45 | assert_plugin_manager(pm, ["default"]) 46 | 47 | @pytest.mark.parametrize( 48 | "quartz_cron, expected_unix_cron", 49 | [ 50 | ("0 * * ? * * *", "* * * * *"), 51 | ("0 */5 * ? * * *", "*/5 * * * *"), 52 | ("0 30 * ? * * *", "30 * * * *"), 53 | ("0 0 12 ? * * *", "0 12 * * *"), 54 | ("0 0 12 ? * 2 *", "0 12 * * 1"), 55 | ("0 0 0 10 * ? *", "0 0 10 * *"), 56 | ("0 0 0 1 1 ? *", "0 0 1 1 *"), 57 | ("0 0/5 14,18 * * ?", "0/5 14,18 * * *"), 58 | ("0 0 12 ? * 1,2,5-7 *", "0 12 * * 0,1,4-6"), 59 | ("0 0 12 ? * SUN,MON,THU-SAT *", "0 12 * * SUN,MON,THU-SAT"), 60 | ], 61 | ) 62 | def test_cron_conversion(self, quartz_cron, expected_unix_cron): 63 | import brickflow_plugins.airflow.cronhelper as cronhelper # noqa 64 | 65 | converted_unix_cron = cronhelper.cron_helper.quartz_to_unix(quartz_cron) 66 | converted_quartz_cron = cronhelper.cron_helper.unix_to_quartz( 67 | converted_unix_cron 68 | ) 69 | converted_unix_cron_second = cronhelper.cron_helper.quartz_to_unix( 70 | converted_quartz_cron 71 | ) 72 | 73 | assert ( 74 | converted_unix_cron == converted_unix_cron_second 75 | ), "cron conversion should be idempotent" 76 | assert converted_unix_cron == expected_unix_cron 77 | 78 | @pytest.mark.parametrize( 79 | "quartz_cron", 80 | [ 81 | "0 0 12 ? * L *", 82 | "0 0 12 ? * 1L *", 83 | "0 0 12 ? * 1W *", 84 | "0 0 12 ? * 1#5 *", 85 | ], 86 | ) 87 | def test_unsupported_cron_expressions(self, quartz_cron): 88 | import brickflow_plugins.airflow.cronhelper as cronhelper # noqa 89 | 90 | with pytest.raises(ValueError): 91 | cronhelper.cron_helper.quartz_to_unix(quartz_cron) 92 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # Code generate tools 2 | 3 | Use this to code generate `brickflow/bundles/model.py` 4 | 5 | Make sure you are in the repository root and are using a *nix machine. 6 | 7 | ```shell 8 | ./tools/gen-bundle.sh # example: ./tools/gen-bundle.sh 0.201.0 9 | ``` 10 | 11 | Please note the version defaults to what is defaulted in brickflow. 
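
The version argument is forwarded to the helper scripts as the `BUNDLE_CODE_GEN_CLI_VERSION` environment variable; when it is omitted, the scripts fall back to the CLI version pinned in brickflow. As a rough sketch (assuming the poetry environment is already installed, and using `0.203.0` purely as an illustrative version), the first two steps of the pipeline can also be run by hand:

```shell
# Illustrative only: gen-bundle.sh performs these steps (and more) for you.
export BUNDLE_CODE_GEN_CLI_VERSION="0.203.0"
poetry run python tools/install_databricks_cli.py   # sets up the Databricks CLI and dumps brickflow/bundles/schema.json
poetry run python tools/modify_schema.py             # prepares the transformed schema that datamodel-codegen consumes
```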
-------------------------------------------------------------------------------- /tools/gen-bundle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Check if the version argument is provided 4 | if [ $# -lt 1 ]; then 5 | echo "Usage: $0 <version>" 6 | # exit 1 7 | fi 8 | 9 | set -e # Exit on any command failure 10 | 11 | # Set the provided version as an environment variable 12 | export BUNDLE_CODE_GEN_CLI_VERSION="$1" 13 | 14 | rm -rf .databricks/bin/cli/ 15 | poetry install 16 | poetry run python tools/install_databricks_cli.py 17 | poetry run python tools/modify_schema.py 18 | poetry run datamodel-codegen --input brickflow/bundles/transformed_schema.json \ 19 | --use-title-as-name \ 20 | --disable-appending-item-suffix \ 21 | --collapse-root-models \ 22 | --capitalise-enum-members \ 23 | --enum-field-as-literal all \ 24 | --input-file-type jsonschema \ 25 | --output brickflow/bundles/model.py 26 | echo "✅ Code generation completed successfully!" 27 | poetry run python tools/modify_model.py 28 | echo "✅ Updated and patched model successfully!" 29 | echo "# generated with Databricks CLI Version: $(.databricks/bin/cli/*/databricks --version)" | \ 30 | cat - brickflow/bundles/model.py > /tmp/codegen && \ 31 | mv /tmp/codegen brickflow/bundles/model.py 32 | echo "✅ Modified the front matter of the script!" 33 | poetry run python brickflow/bundles/model.py # validate python file 34 | echo "✅ Validated the file is proper python code!" 35 | -------------------------------------------------------------------------------- /tools/install_databricks_cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | try: 4 | from brickflow import BrickflowEnvVars 5 | from brickflow.cli import bundle_cli_setup 6 | from brickflow.cli.bundles import get_valid_bundle_cli 7 | from brickflow.engine import _call 8 | except ImportError: 9 | raise ImportError("Please install brickflow to use this script") 10 | 11 | if __name__ == "__main__": 12 | cli_version = os.environ.get("BUNDLE_CODE_GEN_CLI_VERSION", None) 13 | if cli_version is not None and cli_version != "": 14 | os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value] = cli_version 15 | 16 | bundle_cli_setup() 17 | bundle_cli = get_valid_bundle_cli( 18 | os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value] 19 | ) 20 | print(f"Using Databricks CLI: {bundle_cli}") 21 | print(_call(f"{bundle_cli} --version", shell=True).decode("utf-8")) 22 | _call(f"{bundle_cli} bundle schema > brickflow/bundles/schema.json", shell=True) 23 | -------------------------------------------------------------------------------- /tools/modify_model.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | import re 3 | 4 | # string = "class Artifacts1(BaseModel)" 5 | regex_pattern = r"(?<=class\s)[A-Za-z]\w+" 6 | file_path = "brickflow/bundles/model.py" 7 | 8 | bad_class_names = {} 9 | 10 | def remove_number_from_end(string): 11 | match = re.search(r"\d+$", string) 12 | if match: 13 | number = match.group(0) 14 | string_without_number = string[: -len(number)] 15 | return string_without_number 16 | else: 17 | return None 18 | 19 | def remove_timestamp_line(input_code: str) -> str: 20 | return "\n".join( 21 | [ 22 | _line 23 | for _line in input_code.split("\n") 24 | if not _line.startswith("# timestamp: ") 25 | ] 26 | ) 27 | 28 | def replace_class_config_extras(input_code: str) -> str: 29 | pattern =
r"extra\s*=\s*Extra\.forbid" 30 | return re.sub( 31 | pattern, 'extra = "forbid"\n protected_namespaces = ()', input_code 32 | ) 33 | 34 | def replace_regex_with_pattern(input_code: str) -> str: 35 | pattern = r"regex=" 36 | return re.sub(pattern, "pattern=", input_code) 37 | 38 | with open(file_path, "r") as f: 39 | lines = f.readlines() 40 | for line in lines: 41 | match = re.search(regex_pattern, line) 42 | if match: 43 | dynamic_value = match.group(0) 44 | if remove_number_from_end(dynamic_value): 45 | bad_class_names[dynamic_value] = remove_number_from_end( 46 | dynamic_value 47 | ) 48 | 49 | with open(file_path, "r") as r: 50 | data = r.read() 51 | 52 | with open(file_path, "w") as w: 53 | for key, value in bad_class_names.items(): 54 | data = data.replace(key, value) 55 | data = remove_timestamp_line(data) 56 | # remove extra config to remove deprecation warning 57 | data = replace_class_config_extras(data) 58 | # replace regex with pattern 59 | data = replace_regex_with_pattern(data) 60 | w.write(data) 61 | --------------------------------------------------------------------------------