├── tests ├── __init__.py ├── cli │ ├── __init__.py │ ├── sample_yaml_project │ │ ├── brickflow-multi-project.yaml │ │ └── .brickflow-project-root.yaml │ ├── sample_yml_project │ │ ├── brickflow-multi-project.yml │ │ └── .brickflow-project-root.yml │ ├── conftest.py │ ├── test_projects.py │ ├── test_cli.py │ └── test_bundles.py ├── codegen │ ├── __init__.py │ ├── sample_serverless_workflow.py │ └── expected_bundles │ │ ├── local_serverless_bundle.yml │ │ └── local_bundle_continuous_schedule.yml ├── context │ └── __init__.py ├── engine │ ├── __init__.py │ ├── sample_workflow_2.py │ ├── test_engine.py │ ├── test_utils.py │ ├── sample_workflow.py │ └── test_compute.py ├── airflow_plugins │ ├── __init__.py │ └── test_autosys.py ├── databricks_plugins │ ├── __init__.py │ ├── test_run_job.py │ ├── test_workflow_dependency_sensor.py │ └── test_workflow_task_dependency_sensor.py ├── sample_workflows │ ├── __init__.py │ ├── sample_workflow_2.py │ └── sample_workflow_1.py ├── resolver │ └── test_resolver.py ├── test_brickflow.py └── test_plugins.py ├── brickflow ├── bundles │ └── __init__.py ├── hints │ ├── __init__.py │ ├── py.typed │ └── hint.py ├── cli │ ├── constants.py │ ├── entrypoint.template │ ├── gitignore_template.txt │ ├── commands.py │ └── configure.py ├── context │ └── __init__.py ├── templates │ └── injected_task_default.py.j2 ├── engine │ ├── __init__.py │ ├── hooks.py │ └── utils.py ├── codegen │ └── __init__.py └── resolver │ └── __init__.py ├── brickflow_plugins ├── airflow │ ├── __init__.py │ ├── vendor │ │ ├── __init__.py │ │ └── context.py │ ├── brickflow_task_plugin.py │ ├── context │ │ └── __init__.py │ ├── operators │ │ └── native_operators.py │ └── cronhelper.py ├── databricks │ ├── __init__.py │ └── run_job.py └── __init__.py ├── examples ├── brickflow_examples │ ├── __init__.py │ ├── src │ │ ├── __init__.py │ │ ├── python │ │ │ ├── __init__.py │ │ │ ├── lending_data_show.py │ │ │ └── setup_data.py │ │ └── sql │ │ │ └── sample.sql │ ├── notebooks │ │ ├── __init__.py │ │ └── example_notebook.py │ ├── workflows │ │ ├── __init__.py │ │ ├── entrypoint.py │ │ └── pattern_matching_example.py │ ├── brickflow-multi-project.yml │ ├── .brickflow-project-root.yml │ ├── .gitignore │ └── README.md ├── brickflow_for_each_task_examples │ ├── __init__.py │ ├── src │ │ ├── __init__.py │ │ └── python │ │ │ ├── __init__.py │ │ │ └── print_args.py │ ├── notebooks │ │ ├── __init__.py │ │ └── example_notebook.py │ ├── workflows │ │ ├── __init__.py │ │ ├── entrypoint.py │ │ └── for_each_task_wf.py │ ├── brickflow-multi-project.yml │ ├── .brickflow-project-root.yml │ └── README.md ├── brickflow_serverless_examples │ ├── __init__.py │ ├── src │ │ ├── __init__.py │ │ └── python │ │ │ ├── __init__.py │ │ │ └── example.py │ ├── notebooks │ │ ├── __init__.py │ │ └── example_notebook.py │ ├── workflows │ │ ├── __init__.py │ │ ├── entrypoint.py │ │ └── demo_serverless_wf.py │ ├── brickflow-multi-project.yml │ ├── .brickflow-project-root.yml │ ├── README.md │ └── .gitignore └── brickflow_task_injection_examples │ ├── src │ ├── __init__.py │ └── python │ │ ├── __init__.py │ │ └── helper.py │ ├── workflows │ ├── __init__.py │ ├── entrypoint.py │ └── demo_workflow.py │ ├── __init__.py │ ├── brickflow-multi-project.yml │ ├── templates │ ├── custom_logger.py.j2 │ └── data_validator.py.j2 │ └── config │ └── injected_tasks.yaml ├── MANIFEST.in ├── docs ├── img │ ├── bf_logo.png │ ├── bf_logo_1.png │ ├── workflow.png │ └── maintainance.png ├── api │ ├── cli.md │ ├── secrets.md │ ├── context.md │ ├── 
box_operator.md │ ├── airflow_native_operators.md │ ├── uc_to_snowflake_operator.md │ ├── airflow_external_task_dependency.md │ ├── airflow_tableau_operators.md │ ├── compute.md │ ├── project.md │ ├── sla_sensor.md │ ├── workflow_dependency_sensor.md │ ├── workflow.md │ └── task.md ├── cli │ └── reference.md ├── css │ └── custom.css ├── index.md ├── projects.md ├── highlevel.md ├── how-imports-work.md ├── upgrades │ └── upgrade-pre-0-10-0-to-0-10-0.md └── bundles-quickstart.md ├── .coveragerc ├── tools ├── README.md ├── install_databricks_cli.py ├── gen-bundle.sh └── modify_model.py ├── CODEOWNERS ├── .pre-commit-config.yaml ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── onpush.yml ├── prospector.yaml ├── CONTRIBUTORS.md ├── Makefile ├── Dockerfile ├── config └── injected_tasks.yaml ├── pyproject.toml ├── CONTRIBUTING.md └── mkdocs.yml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/codegen/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/context/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/engine/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /brickflow/bundles/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/airflow_plugins/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /brickflow_plugins/airflow/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/databricks_plugins/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/sample_workflows/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /brickflow_plugins/databricks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/brickflow_examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/brickflow_examples/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/brickflow_plugins/airflow/vendor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/brickflow_examples/notebooks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/brickflow_examples/src/python/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/brickflow_examples/workflows/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the license file 2 | include LICENSE.txt -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/notebooks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/workflows/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/notebooks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/src/python/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/workflows/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/src/python/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/brickflow_task_injection_examples/src/__init__.py: -------------------------------------------------------------------------------- 1 | # Source package 2 | 
-------------------------------------------------------------------------------- /examples/brickflow_task_injection_examples/workflows/__init__.py: -------------------------------------------------------------------------------- 1 | # Workflows package 2 | -------------------------------------------------------------------------------- /docs/img/bf_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/HEAD/docs/img/bf_logo.png -------------------------------------------------------------------------------- /docs/img/bf_logo_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/HEAD/docs/img/bf_logo_1.png -------------------------------------------------------------------------------- /docs/img/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/HEAD/docs/img/workflow.png -------------------------------------------------------------------------------- /examples/brickflow_task_injection_examples/__init__.py: -------------------------------------------------------------------------------- 1 | # Brickflow Task Injection Examples 2 | -------------------------------------------------------------------------------- /examples/brickflow_task_injection_examples/src/python/__init__.py: -------------------------------------------------------------------------------- 1 | # Python utilities package 2 | -------------------------------------------------------------------------------- /docs/img/maintainance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nike-Inc/brickflow/HEAD/docs/img/maintainance.png -------------------------------------------------------------------------------- /brickflow/hints/__init__.py: -------------------------------------------------------------------------------- 1 | from brickflow.hints.hint import propagate_hint 2 | 3 | __all__ = ["propagate_hint"] 4 | -------------------------------------------------------------------------------- /brickflow/hints/py.typed: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | def propagate_hint(decorator: Callable) -> Callable: ... -------------------------------------------------------------------------------- /examples/brickflow_examples/notebooks/example_notebook.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | print("hello world") 4 | -------------------------------------------------------------------------------- /examples/brickflow_examples/brickflow-multi-project.yml: -------------------------------------------------------------------------------- 1 | project_roots: 2 | brickflow-demo: 3 | root_yaml_rel_path: . 
4 | version: v1 5 | -------------------------------------------------------------------------------- /examples/brickflow_examples/src/sql/sample.sql: -------------------------------------------------------------------------------- 1 | create or replace table $database.$schema.sample as 2 | select * from $database.$schema.source -------------------------------------------------------------------------------- /tests/cli/sample_yaml_project/brickflow-multi-project.yaml: -------------------------------------------------------------------------------- 1 | version: v1 2 | project_roots: 3 | test_cli_project: 4 | root_yaml_rel_path: . 5 | -------------------------------------------------------------------------------- /tests/cli/sample_yml_project/brickflow-multi-project.yml: -------------------------------------------------------------------------------- 1 | version: v1 2 | project_roots: 3 | test_cli_project: 4 | root_yaml_rel_path: . 5 | -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/brickflow-multi-project.yml: -------------------------------------------------------------------------------- 1 | project_roots: 2 | for_each_task_examples: 3 | root_yaml_rel_path: . 4 | version: v1 5 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/brickflow-multi-project.yml: -------------------------------------------------------------------------------- 1 | project_roots: 2 | brickflow-serverless-demo: 3 | root_yaml_rel_path: . 4 | version: v1 5 | -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/src/python/print_args.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if __name__ == "__main__": 4 | print(f"Hello, running with input {sys.argv}") 5 | -------------------------------------------------------------------------------- /examples/brickflow_task_injection_examples/brickflow-multi-project.yml: -------------------------------------------------------------------------------- 1 | project_roots: 2 | brickflow-task-injection-demo: 3 | root_yaml_rel_path: . 
4 | version: v1 5 | 6 | 7 | -------------------------------------------------------------------------------- /brickflow/hints/hint.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | 4 | # propagate type hints for decorated functions 5 | def propagate_hint(decorator: Callable) -> Callable: 6 | return decorator 7 | -------------------------------------------------------------------------------- /docs/api/cli.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow.cli 7 | handler: python 8 | options: 9 | filters: 10 | - "!^_[^_]" 11 | 12 | -------------------------------------------------------------------------------- /docs/api/secrets.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow_plugins.secrets 7 | handler: python 8 | options: 9 | filters: 10 | - "!^_[^_]" 11 | - "!^__[^__]" 12 | -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/notebooks/example_notebook.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | param = dbutils.widgets.get("looped_parameter") 4 | print(f"Hey this is a nested notebook running with inputs: {param}") 5 | -------------------------------------------------------------------------------- /docs/api/context.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow.context.context 7 | handler: python 8 | options: 9 | filters: 10 | - "!^_[^_]" 11 | - "!^__[^__]" 12 | 13 | -------------------------------------------------------------------------------- /docs/api/box_operator.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow_plugins.databricks.box_operator 7 | handler: python 8 | options: 9 | filters: 10 | - "!^_[^_]" 11 | - "!^__[^__]" 12 | -------------------------------------------------------------------------------- /examples/brickflow_task_injection_examples/workflows/entrypoint.py: -------------------------------------------------------------------------------- 1 | """ 2 | Entrypoint for Brickflow Task Injection Examples 3 | 4 | This file is required for Brickflow to discover and load workflows. 
5 | """ 6 | 7 | from workflows import demo_workflow # noqa: F401 8 | -------------------------------------------------------------------------------- /docs/api/airflow_native_operators.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow_plugins.airflow.operators.native_operators 7 | handler: python 8 | options: 9 | filters: 10 | - "!^_[^_]" 11 | - "!^__[^__]" 12 | -------------------------------------------------------------------------------- /docs/api/uc_to_snowflake_operator.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow_plugins.databricks.uc_to_snowflake_operator 7 | handler: python 8 | options: 9 | filters: 10 | - "!^_[^_]" 11 | - "!^__[^__]" 12 | -------------------------------------------------------------------------------- /docs/api/airflow_external_task_dependency.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow_plugins.airflow.operators.external_tasks 7 | handler: python 8 | options: 9 | filters: 10 | - "!^_[^_]" 11 | - "!^__[^__]" 12 | -------------------------------------------------------------------------------- /tests/engine/sample_workflow_2.py: -------------------------------------------------------------------------------- 1 | from brickflow import Cluster, Workflow 2 | 3 | wf = Workflow( 4 | "test1", default_cluster=Cluster.from_existing_cluster("existing_cluster_id") 5 | ) 6 | 7 | 8 | @wf.task() 9 | def task_function(*, test="var"): 10 | return test 11 | -------------------------------------------------------------------------------- /docs/api/airflow_tableau_operators.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow_plugins.airflow.operators.external_tasks_tableau 7 | handler: python 8 | options: 9 | filters: 10 | - "!^_[^_]" 11 | - "!^__[^__]" 12 | -------------------------------------------------------------------------------- /docs/api/compute.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow.engine.compute 7 | handler: python 8 | options: 9 | members: 10 | - Cluster 11 | - Runtimes 12 | filters: 13 | - "!^_[^_]" 14 | 15 | -------------------------------------------------------------------------------- /docs/api/project.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow.engine.project 7 | handler: python 8 | options: 9 | members: 10 | - Project 11 | - BrickFlowEnvVars 12 | filters: 13 | - "!^_[^_]" 14 | 15 | -------------------------------------------------------------------------------- /docs/cli/reference.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | This page provides documentation for our command line tools. 
7 | 
8 | 
9 | ::: mkdocs-click
10 |     :module: brickflow.cli
11 |     :command: cli
12 |     :prog_name: bf
13 |     :depth: 1
14 | 
15 | 
16 | 
17 | 
--------------------------------------------------------------------------------
/docs/api/sla_sensor.md:
--------------------------------------------------------------------------------
1 | ---
2 | search:
3 |   exclude: true
4 | ---
5 | 
6 | ::: brickflow_plugins.databricks.sla_sensor
7 |     handler: python
8 |     options:
9 |       members:
10 |         - SLASensor
11 |       filters:
12 |         - "!^_[^_]"
13 |         - "!^__[^__]"
14 | 
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit =
3 |     *tests*
4 |     brickflow/tf/*
5 |     '*/.local/*',
6 |     '**',
7 |     'tests/*',
8 |     '*/tests/*',
9 |     # omit anything in a .venv directory anywhere
10 |     '.venv/*',
11 |     "*/site-packages/*"
12 | 
13 | [html]
14 | skip_empty = true
15 | 
16 | [report]
17 | skip_empty = true
18 | 
--------------------------------------------------------------------------------
/tests/cli/sample_yaml_project/.brickflow-project-root.yaml:
--------------------------------------------------------------------------------
1 | version: v1
2 | projects:
3 |   test_cli_project:
4 |     name: test_cli_project
5 |     brickflow_version: 1.2.1
6 |     deployment_mode: bundle
7 |     enable_plugins: false
8 |     path_from_repo_root_to_project_root: some/test/path
9 |     path_project_root_to_workflows_dir: path/to/workflows
--------------------------------------------------------------------------------
/tests/cli/sample_yml_project/.brickflow-project-root.yml:
--------------------------------------------------------------------------------
1 | version: v1
2 | projects:
3 |   test_cli_project:
4 |     name: test_cli_project
5 |     brickflow_version: 1.2.1
6 |     deployment_mode: bundle
7 |     enable_plugins: false
8 |     path_from_repo_root_to_project_root: some/test/path
9 |     path_project_root_to_workflows_dir: path/to/workflows
--------------------------------------------------------------------------------
/tools/README.md:
--------------------------------------------------------------------------------
1 | # Code generation tools
2 | 
3 | Use this to generate `brickflow/bundles/model.py`.
4 | 
5 | Make sure you are in the repository root and are using a *nix machine.
6 | 
7 | ```shell
8 | ./tools/gen-bundle.sh # example: ./tools/gen-bundle.sh 0.201.0
9 | ```
10 | 
11 | Please note that the Databricks CLI version defaults to the version configured in brickflow if none is provided.
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # This is a comment.
2 | # Each line is a file pattern followed by one or more owners.
3 | 
4 | # These owners will be the default owners for everything in
5 | # the repo. Unless a later match takes precedence,
6 | # @Nike-Inc/brickflow-dev will be requested for
7 | # review when someone opens a pull request.
8 | * @Nike-Inc/brickflow-dev @asingamaneni @stikkireddy @newfront 9 | -------------------------------------------------------------------------------- /brickflow/cli/constants.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from enum import Enum 4 | 5 | from decouple import config 6 | 7 | from brickflow import BrickflowEnvVars 8 | 9 | 10 | class BrickflowDeployMode(Enum): 11 | BUNDLE = "bundle" 12 | 13 | 14 | INTERACTIVE_MODE = config( 15 | BrickflowEnvVars.BRICKFLOW_INTERACTIVE_MODE.value, default=True, cast=bool 16 | ) 17 | -------------------------------------------------------------------------------- /docs/api/workflow_dependency_sensor.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow_plugins.databricks.workflow_dependency_sensor 7 | handler: python 8 | options: 9 | members: 10 | - WorkflowDependencySensor 11 | - WorkflowTaskDependencySensor 12 | filters: 13 | - "!^_[^_]" 14 | - "!^__[^__]" 15 | -------------------------------------------------------------------------------- /docs/api/workflow.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: brickflow.engine.workflow 7 | handler: python 8 | options: 9 | members: 10 | - Workflow 11 | - WorkspacePermissions 12 | - User 13 | - Group 14 | - ServicePrincipal 15 | filters: 16 | - "!^_[^_]" 17 | - "!^__[^__]" 18 | 19 | -------------------------------------------------------------------------------- /examples/brickflow_examples/.brickflow-project-root.yml: -------------------------------------------------------------------------------- 1 | # DO NOT MODIFY THIS FILE - IT IS AUTO GENERATED BY BRICKFLOW AND RESERVED FOR FUTURE USAGE 2 | projects: 3 | brickflow-demo: 4 | brickflow_version: auto 5 | deployment_mode: bundle 6 | enable_plugins: true 7 | name: brickflow-demo 8 | path_from_repo_root_to_project_root: . 9 | path_project_root_to_workflows_dir: workflows 10 | version: v1 11 | -------------------------------------------------------------------------------- /examples/brickflow_examples/src/python/lending_data_show.py: -------------------------------------------------------------------------------- 1 | from brickflow.context import ctx 2 | 3 | 4 | def lending_data_print(): 5 | ctx.spark.sql( 6 | """ 7 | SELECT 8 | addr_state, * 9 | FROM 10 | parquet.`dbfs:/databricks-datasets/samples/lending_club/parquet/` limit 10 11 | """ 12 | ).show(truncate=False) 13 | 14 | 15 | if __name__ == "__main__": 16 | lending_data_print() 17 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/.brickflow-project-root.yml: -------------------------------------------------------------------------------- 1 | # DO NOT MODIFY THIS FILE - IT IS AUTO GENERATED BY BRICKFLOW AND RESERVED FOR FUTURE USAGE 2 | projects: 3 | brickflow-serverless-demo: 4 | brickflow_version: auto 5 | deployment_mode: bundle 6 | enable_plugins: true 7 | name: brickflow-serverless-demo 8 | path_from_repo_root_to_project_root: . 
9 | path_project_root_to_workflows_dir: workflows 10 | version: v1 11 | -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/workflows/entrypoint.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | import brickflow 4 | from brickflow import Project 5 | import workflows 6 | 7 | 8 | def main() -> None: 9 | with Project( 10 | "for_each_task_examples", 11 | git_repo="https://github.com/Nike-Inc/brickflow", 12 | provider="github", 13 | ) as f: 14 | f.add_pkg(workflows) 15 | 16 | 17 | if __name__ == "__main__": 18 | main() 19 | -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/.brickflow-project-root.yml: -------------------------------------------------------------------------------- 1 | # DO NOT MODIFY THIS FILE - IT IS AUTO GENERATED BY BRICKFLOW AND RESERVED FOR FUTURE USAGE 2 | projects: 3 | for_each_task_examples: 4 | brickflow_version: auto 5 | deployment_mode: bundle 6 | enable_plugins: true 7 | name: for_each_task_examples 8 | path_from_repo_root_to_project_root: examples/brickflow_for_each_task_examples 9 | path_project_root_to_workflows_dir: workflows 10 | version: v1 11 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: make-check 5 | name: Running Lint Checks 6 | entry: make check 7 | language: system 8 | files: '\.py$' 9 | pass_filenames: false 10 | always_run: true 11 | stages: [commit] 12 | - id: make-cov 13 | name: Running Lint Checks & Test Suite 14 | entry: make cov 15 | language: system 16 | files: '\.py$' 17 | pass_filenames: false 18 | always_run: true 19 | stages: [push] -------------------------------------------------------------------------------- /tests/engine/test_engine.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | from brickflow.engine import ( 4 | get_current_commit, 5 | ) 6 | 7 | 8 | class TestEngine: 9 | def test_get_current_commit(self, mocker): 10 | branch = "some_random_sha" 11 | mocker.patch("subprocess.check_output") 12 | subprocess.check_output.return_value = branch.encode("utf-8") 13 | assert get_current_commit() == branch 14 | subprocess.check_output.assert_called_once_with( 15 | ['git log -n 1 --pretty=format:"%H"'], shell=True 16 | ) # noqa 17 | -------------------------------------------------------------------------------- /tests/cli/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from brickflow.cli.projects import MultiProjectManager 3 | 4 | 5 | @pytest.fixture(autouse=True) 6 | def reset_multi_project_manager_singleton(): 7 | """Reset the MultiProjectManager singleton before each test.""" 8 | # Clear the singleton instance if it exists 9 | if hasattr(MultiProjectManager, "instance"): 10 | delattr(MultiProjectManager, "instance") 11 | yield 12 | # Optionally clean up after test 13 | if hasattr(MultiProjectManager, "instance"): 14 | delattr(MultiProjectManager, "instance") 15 | -------------------------------------------------------------------------------- /docs/api/task.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | exclude: true 4 | --- 5 | 6 | ::: 
brickflow.engine.task 7 | handler: python 8 | options: 9 | members: 10 | - Task 11 | - EmailNotifications 12 | - JarTaskLibrary 13 | - EggTaskLibrary 14 | - WheelTaskLibrary 15 | - PypiTaskLibrary 16 | - MavenTaskLibrary 17 | - CranTaskLibrary 18 | - BrickflowTriggerRule 19 | - BrickflowTaskEnvVars 20 | - TaskSettings 21 | - TaskType 22 | filters: 23 | - "!^_[^_]" 24 | - "!^__[^__]" 25 | 26 | -------------------------------------------------------------------------------- /brickflow/context/__init__.py: -------------------------------------------------------------------------------- 1 | from .context import ( 2 | ctx, 3 | Context, 4 | BrickflowTaskComs, 5 | BRANCH_SKIP_EXCEPT, 6 | SKIP_EXCEPT_HACK, 7 | RETURN_VALUE_KEY, 8 | BrickflowInternalVariables, 9 | BrickflowBuiltInTaskVariables, 10 | BrickflowTaskComsObject, 11 | TaskComsObjectResult, 12 | ) 13 | 14 | __all__ = [ 15 | "ctx", 16 | "Context", 17 | "BrickflowTaskComs", 18 | "BRANCH_SKIP_EXCEPT", 19 | "SKIP_EXCEPT_HACK", 20 | "RETURN_VALUE_KEY", 21 | "BrickflowInternalVariables", 22 | "BrickflowBuiltInTaskVariables", 23 | "BrickflowTaskComsObject", 24 | "TaskComsObjectResult", 25 | ] 26 | -------------------------------------------------------------------------------- /examples/brickflow_examples/workflows/entrypoint.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | import brickflow 4 | from brickflow import Project, PypiTaskLibrary 5 | import workflows 6 | 7 | 8 | def main() -> None: 9 | with Project( 10 | "brickflow-demo", 11 | git_repo="https://github.com/Nike-Inc/brickflow", 12 | provider="github", 13 | libraries=[ 14 | PypiTaskLibrary( 15 | package="spark-expectations==0.8.0" 16 | ), # comment if spark-expectations is not needed 17 | ], 18 | ) as f: 19 | f.add_pkg(workflows) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/notebooks/example_notebook.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %pip install pytz==2024.2 3 | 4 | # COMMAND ---------- 5 | import pytz 6 | from datetime import datetime 7 | 8 | 9 | def get_current_time_in_timezone(timezone_str): 10 | # Get the timezone object 11 | timezone = pytz.timezone(timezone_str) 12 | # Get the current time in the specified timezone 13 | current_time = datetime.now(timezone) 14 | return current_time 15 | 16 | 17 | # Example usage 18 | timezones = ["UTC", "Europe/Amsterdam", "Asia/Tokyo", "America/New_York"] 19 | for tz in timezones: 20 | print(f"Current time in {tz}: {get_current_time_in_timezone(tz)}") 21 | -------------------------------------------------------------------------------- /brickflow/cli/entrypoint.template: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | from brickflow import Project, PypiTaskLibrary, MavenTaskLibrary # make sure brickflow imports are at the top 4 | 5 | import {{ pkg }} 6 | 7 | def main() -> None: 8 | """Project entrypoint""" 9 | with Project( 10 | "{{ project_name }}", 11 | git_repo="{{ git_https_url }}", 12 | provider="{{ git_provider }}", 13 | libraries=[ 14 | # PypiTaskLibrary(package="spark-expectations=={{spark_expectations_version}}"), # Uncomment if spark-expectations is needed 15 | ], 16 | ) as f: 17 | f.add_pkg({{pkg}}) 18 | 19 | 20 | if __name__ == "__main__": 21 | main() 22 | 23 
| -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/workflows/entrypoint.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # This should point to the `brickflows` version with serverless support or the wheel file with the same 3 | # MAGIC %pip install brickflows==1.2.1 4 | # MAGIC %pip install koheesio==0.8.1 5 | # MAGIC %restart_python 6 | 7 | # COMMAND ---------- 8 | import brickflow 9 | from brickflow import Project, PypiTaskLibrary 10 | import workflows 11 | 12 | 13 | def main() -> None: 14 | with Project( 15 | "brickflow-serverless-demo", 16 | git_repo="https://github.com/Nike-Inc/brickflow", 17 | provider="github", 18 | ) as f: 19 | f.add_pkg(workflows) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /brickflow/templates/injected_task_default.py.j2: -------------------------------------------------------------------------------- 1 | {# Default template for injected tasks #} 2 | {# This template can be used for simple tasks that just need to print or log information #} 3 | 4 | {% if imports %} 5 | {% for import_stmt in imports %} 6 | {{ import_stmt }} 7 | {% endfor %} 8 | 9 | {% endif %} 10 | # Injected task: {{ task_name | default('unknown_task') }} 11 | print("=" * 60) 12 | print("Executing injected task: {{ task_name | default('unknown_task') }}") 13 | print("=" * 60) 14 | 15 | {% if message %} 16 | print("{{ message }}") 17 | {% else %} 18 | print("This is a dynamically injected task.") 19 | {% endif %} 20 | 21 | {% if params %} 22 | # Task parameters: 23 | {% for key, value in params.items() %} 24 | print(f" {{ key }}: {{ value }}") 25 | {% endfor %} 26 | {% endif %} 27 | 28 | print("=" * 60) 29 | print("Task completed successfully!") 30 | print("=" * 60) 31 | 32 | result = "success" 33 | -------------------------------------------------------------------------------- /tools/install_databricks_cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | try: 4 | from brickflow import BrickflowEnvVars 5 | from brickflow.cli import bundle_cli_setup 6 | from brickflow.cli.bundles import get_valid_bundle_cli 7 | from brickflow.engine import _call 8 | except ImportError: 9 | raise ImportError("Please install brickflow to use this script") 10 | 11 | if __name__ == "__main__": 12 | cli_version = os.environ.get("BUNDLE_CODE_GEN_CLI_VERSION", None) 13 | if cli_version is not None and cli_version != "": 14 | os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value] = cli_version 15 | 16 | bundle_cli_setup() 17 | bundle_cli = get_valid_bundle_cli( 18 | os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value] 19 | ) 20 | print(f"Using Databricks CLI: {bundle_cli}") 21 | print(_call(f"{bundle_cli} --version", shell=True).decode("utf-8")) 22 | _call(f"{bundle_cli} bundle schema > brickflow/bundles/schema.json", shell=True) 23 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/src/python/example.py: -------------------------------------------------------------------------------- 1 | import pytz 2 | from datetime import datetime 3 | import argparse 4 | 5 | 6 | def get_current_time_in_timezone(timezone_str): 7 | # Get the timezone object 8 | timezone = pytz.timezone(timezone_str) 9 | # Get the current time in the specified timezone 10 | 
current_time = datetime.now(timezone) 11 | return current_time 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser( 16 | description="Get the current time in a specified timezone." 17 | ) 18 | parser.add_argument( 19 | "--timezone", 20 | type=str, 21 | required=True, 22 | help="The timezone to get the current time for.", 23 | ) 24 | args = parser.parse_args() 25 | 26 | try: 27 | current_time = get_current_time_in_timezone(args.timezone) 28 | print(f"Current time in {args.timezone}: {current_time}") 29 | except pytz.UnknownTimeZoneError: 30 | print(f"Unknown timezone: {args.timezone}") 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE] Please add your feature request title" 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Cloud Information** 14 | 15 | 16 | - [ ] AWS 17 | - [ ] Azure 18 | - [ ] GCP 19 | - [ ] Other 20 | 21 | **Describe the solution you'd like** 22 | A clear and concise description of what you want to happen. 23 | 24 | **Describe alternatives you've considered** 25 | A clear and concise description of any alternative solutions or features you've considered. 26 | 27 | **Additional context** 28 | Add any other context or screenshots about the feature request here. 29 | -------------------------------------------------------------------------------- /tests/codegen/sample_serverless_workflow.py: -------------------------------------------------------------------------------- 1 | from brickflow import ( 2 | Workflow, 3 | NotebookTask, 4 | SparkPythonTask, 5 | ) 6 | from brickflow.engine.task import PypiTaskLibrary 7 | 8 | wf = Workflow( 9 | "brickflow-serverless-demo", 10 | schedule_quartz_expression="0 0/20 0 ? * * *", 11 | libraries=[PypiTaskLibrary(package="pytz==2024.2")], 12 | ) 13 | 14 | 15 | @wf.task 16 | def entrypoint_task(): 17 | pass 18 | 19 | 20 | @wf.notebook_task 21 | def notebook_task(): 22 | return NotebookTask( 23 | notebook_path="notebooks/example_notebook.py", 24 | base_parameters={ 25 | "some_parameter": "some_value", # in the notebook access these via dbutils.widgets.get("some_parameter") 26 | }, 27 | ) # type: ignore 28 | 29 | 30 | @wf.spark_python_task 31 | def spark_python_task(): 32 | return SparkPythonTask( 33 | python_file="./products/test-project/spark/python/src/run_task.py", 34 | source="GIT", 35 | parameters=["--timezone", "UTC"], 36 | ) # type: ignore 37 | -------------------------------------------------------------------------------- /docs/css/custom.css: -------------------------------------------------------------------------------- 1 | .md-footer-nav { display: none; } 2 | 3 | .md-footer__inner:not([hidden]) { 4 | display: none 5 | } 6 | 7 | /* Indentation. */ 8 | div.doc-contents:not(.first) { 9 | padding-left: 25px; 10 | border-left: .05rem solid var(--md-typeset-table-color); 11 | } 12 | 13 | /* Mark external links as such. 
*/ 14 | a.autorefs-external::after { 15 | /* https://primer.style/octicons/arrow-up-right-24 */ 16 | background-image: url('data:image/svg+xml,'); 17 | content: ' '; 18 | 19 | display: inline-block; 20 | position: relative; 21 | top: 0.1em; 22 | margin-left: 0.2em; 23 | margin-right: 0.1em; 24 | 25 | height: 1em; 26 | width: 1em; 27 | border-radius: 100%; 28 | background-color: var(--md-typeset-a-color); 29 | } 30 | a.autorefs-external:hover::after { 31 | background-color: var(--md-accent-fg-color); 32 | } -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG] Please add your bug title here" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Cloud Information** 27 | 28 | 29 | - [ ] AWS 30 | - [ ] Azure 31 | - [ ] GCP 32 | - [ ] Other 33 | 34 | **Desktop (please complete the following information):** 35 | - OS: [e.g. iOS] 36 | - Browser [e.g. chrome, safari] 37 | - Version [e.g. 22] 38 | 39 | **Additional context** 40 | Add any other context about the problem here. 41 | -------------------------------------------------------------------------------- /brickflow/cli/gitignore_template.txt: -------------------------------------------------------------------------------- 1 | # GENERATED BY BRICKFLOW CLI --START-- 2 | 3 | ### Terraform ### 4 | # Local .terraform directories 5 | **/.terraform/* 6 | 7 | # .tfstate files 8 | *.tfstate 9 | *.tfstate.* 10 | 11 | # Crash log files 12 | crash.log 13 | crash.*.log 14 | 15 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as 16 | # password, private keys, and other secrets. These should not be part of version 17 | # control as they are data points which are potentially sensitive and subject 18 | # to change depending on the environment. 
19 | *.tfvars
20 | *.tfvars.json
21 | 
22 | # Ignore override files as they are usually used to override resources locally and so
23 | # are not checked in
24 | override.tf
25 | override.tf.json
26 | *_override.tf
27 | *_override.tf.json
28 | 
29 | # Include override files you do wish to add to version control using negated pattern
30 | # !example_override.tf
31 | 
32 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
33 | # example: *tfplan*
34 | 
35 | # Ignore CLI configuration files
36 | .terraformrc
37 | terraform.rc
38 | 
39 | # GENERATED BY BRICKFLOW CLI --END--
--------------------------------------------------------------------------------
/tests/sample_workflows/sample_workflow_2.py:
--------------------------------------------------------------------------------
1 | from brickflow.engine.compute import Cluster
2 | from brickflow.engine.task import BrickflowTriggerRule, TaskType, TaskResponse
3 | from brickflow.engine.workflow import Workflow
4 | 
5 | wf = Workflow(
6 |     "test2",
7 |     default_cluster=Cluster.from_existing_cluster("XXXX-XXXXXX-XXXXXXXX"),
8 |     tags={"test": "test2"},
9 | )
10 | 
11 | 
12 | @wf.task()
13 | def task_function():
14 |     return "hello world"
15 | 
16 | 
17 | @wf.task
18 | def task_function_no_deco_args():
19 |     return "hello world"
20 | 
21 | 
22 | @wf.task()
23 | def task_function_nokwargs():
24 |     return "hello world"
25 | 
26 | 
27 | @wf.task(depends_on=task_function)
28 | def task_function_2():
29 |     return "hello world"
30 | 
31 | 
32 | @wf.task(depends_on="task_function_2")
33 | def task_function_3():
34 |     return "hello world"
35 | 
36 | 
37 | @wf.task(depends_on="task_function_3", trigger_rule=BrickflowTriggerRule.NONE_FAILED)
38 | def task_function_4():
39 |     return "hello world"
40 | 
41 | 
42 | @wf.task(
43 |     task_type=TaskType.CUSTOM_PYTHON_TASK,
44 |     trigger_rule=BrickflowTriggerRule.NONE_FAILED,
45 |     custom_execute_callback=lambda x: TaskResponse(x.name, push_return_value=True),
46 | )
47 | def custom_python_task_push():
48 |     pass
49 | 
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | hide:
3 |   - navigation
4 | ---
5 | 
6 | # BrickFlow
7 | 
8 | BrickFlow is a CLI tool for developing and deploying Python-based Databricks Workflows in a declarative way.
9 | 
10 | ## Concept
11 | 
12 | `brickflow` aims to improve the development experience for building pipelines on Databricks by:
13 | 
14 | - Providing a declarative way to describe workflows via decorators (see the short sketch at the end of this page)
15 | - Providing intelligent defaults for compute targets
16 | - Providing a code-first, git-first approach to managing and deploying workflows
17 | - Using Databricks Asset Bundles to deploy workflows seamlessly; bundles are powered by Terraform, which helps manage state
18 |   across deployments.
19 | - Providing a CLI that helps with setting up projects
20 | - Providing additional functionality for workflows through the context library
21 | 
22 | 
23 | ## Feedback
24 | 
25 | Issues with `brickflow`? Found a :octicons-bug-24: bug?
26 | Have a great idea for an addition? Want to improve the documentation? Please feel
27 | free to file an [issue](https://github.com/Nike-Inc/brickflow/issues/new/choose).
28 | 
29 | ## Contributing
30 | 
31 | To contribute, please fork the repository and create a pull request. Here is
32 | a [guide](https://github.com/Nike-Inc/brickflow/blob/main/CONTRIBUTING.md) to help you through this process.
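33 | 
34 | ## A minimal example
35 | 
36 | The sketch below illustrates the decorator-driven style mentioned in the Concept section. It is only a sketch: the workflow name, the cluster id, and the task body are placeholders, not part of any shipped example.
37 | 
38 | ```python
39 | from brickflow import Cluster, Workflow
40 | 
41 | # Placeholder cluster id; point this at an existing cluster in your workspace.
42 | wf = Workflow(
43 |     "example-workflow",
44 |     default_cluster=Cluster.from_existing_cluster("existing_cluster_id"),
45 | )
46 | 
47 | 
48 | @wf.task()
49 | def say_hello():
50 |     return "hello world"
51 | ```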
-------------------------------------------------------------------------------- /tests/databricks_plugins/test_run_job.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | from requests_mock.mocker import Mocker as RequestsMocker 5 | 6 | from brickflow.engine.utils import ctx 7 | from brickflow_plugins.databricks.run_job import RunJobInRemoteWorkspace 8 | 9 | 10 | class TestRunJob: 11 | workspace_url = "https://42.cloud.databricks.com" 12 | endpoint_url = f"{workspace_url}/api/.*/jobs/run-now" 13 | response = {"run_id": 37, "number_in_job": 42} 14 | 15 | ctx.log.propagate = True 16 | 17 | @pytest.fixture(autouse=True) 18 | def mock_get_job_id(self, mocker): 19 | mocker.patch( 20 | "brickflow_plugins.databricks.run_job.get_job_id", 21 | return_value=1, 22 | ) 23 | 24 | @pytest.fixture(autouse=True, name="api") 25 | def mock_api(self): 26 | rm = RequestsMocker() 27 | rm.post(re.compile(self.endpoint_url), json=self.response, status_code=int(200)) 28 | yield rm 29 | 30 | def test_run_job(self, api, caplog): 31 | with api: 32 | RunJobInRemoteWorkspace( 33 | databricks_host=self.workspace_url, 34 | databricks_token="token", 35 | job_name="foo", 36 | ).execute() 37 | 38 | assert "RunNowResponse(number_in_job=42, run_id=37)" in caplog.text 39 | -------------------------------------------------------------------------------- /tools/gen-bundle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Check if the version argument is provided 4 | if [ $# -lt 1 ]; then 5 | echo "Usage: $0 " 6 | # exit 1 7 | fi 8 | 9 | set -e # Exit on any command failure 10 | 11 | # Set the provided version as an environment variable 12 | export BUNDLE_CODE_GEN_CLI_VERSION="$1" 13 | 14 | rm -rf .databricks/bin/cli/ 15 | poetry install 16 | poetry run python tools/install_databricks_cli.py 17 | poetry run python tools/modify_schema.py 18 | poetry run datamodel-codegen --input brickflow/bundles/transformed_schema.json \ 19 | --use-title-as-name \ 20 | --disable-appending-item-suffix \ 21 | --collapse-root-models \ 22 | --capitalise-enum-members \ 23 | --enum-field-as-literal all \ 24 | --input-file-type jsonschema \ 25 | --output brickflow/bundles/model.py 26 | echo "✅ Code generation completed successfully!" 27 | poetry run python tools/modify_model.py 28 | echo "✅ Updated and patched model successfully!" 29 | echo "# generated with Databricks CLI Version: $(.databricks/bin/cli/*/databricks --version)" | \ 30 | cat - brickflow/bundles/model.py > /tmp/codegen && \ 31 | mv /tmp/codegen brickflow/bundles/model.py 32 | echo "✅ Modified the front matter of the script!" 33 | poetry run python brickflow/bundles/model.py # validate python file 34 | echo "✅ Validated the file is proper python code!" 
35 | 
--------------------------------------------------------------------------------
/brickflow/cli/commands.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import os
4 | import subprocess
5 | from typing import Optional, Union, Tuple, List
6 | 
7 | from click import ClickException
8 | 
9 | from brickflow import _ilog
10 | 
11 | 
12 | def exec_command(
13 |     path_to_executable: str,
14 |     base_command: Optional[str],
15 |     args: Union[Tuple[str] | List[str]],
16 |     capture_output: bool = False,
17 | ) -> Optional[str]:
18 |     os.environ["PYTHONPATH"] = os.getcwd()
19 |     my_env = os.environ.copy()
20 |     try:
21 |         _args = list(args)
22 |         # add a base command if it is provided, for proxying brickflow deploy
23 |         if base_command is not None:
24 |             _args = [base_command] + _args
25 |         _ilog.info("Executing command: %s", " ".join([path_to_executable, *_args]))
26 | 
27 |         if capture_output is True:
28 |             res = subprocess.run(
29 |                 [path_to_executable, *_args],
30 |                 check=True,
31 |                 env=my_env,
32 |                 capture_output=True,
33 |                 text=True,
34 |             )
35 |             return res.stdout.strip()
36 | 
37 |         subprocess.run([path_to_executable, *_args], check=True, env=my_env)
38 |     except subprocess.CalledProcessError as e:
39 |         raise ClickException(str(e))
40 | 
41 |     return None
42 | 
--------------------------------------------------------------------------------
/tests/databricks_plugins/test_workflow_dependency_sensor.py:
--------------------------------------------------------------------------------
1 | import re
2 | from datetime import timedelta
3 | 
4 | import pytest
5 | from requests_mock.mocker import Mocker as RequestsMocker
6 | 
7 | from brickflow_plugins.databricks.workflow_dependency_sensor import (
8 |     WorkflowDependencySensor,
9 | )
10 | 
11 | 
12 | class TestWorkflowDependencySensor:
13 |     workspace_url = "https://42.cloud.databricks.com"
14 |     endpoint_url = f"{workspace_url}/api/.*/jobs/get"
15 |     response = {}
16 | 
17 |     def test_sensor_failure_403(self):
18 |         api = RequestsMocker()
19 |         api.get(re.compile(self.endpoint_url), json=self.response, status_code=int(403))
20 | 
21 |         # Databricks SDK will throw PermissionDenied exception if the job_id is not found or
22 |         # user doesn't have permission
23 |         from databricks.sdk.errors.platform import PermissionDenied
24 | 
25 |         with api:
26 |             sensor = WorkflowDependencySensor(
27 |                 databricks_host=self.workspace_url,
28 |                 databricks_token="token",
29 |                 dependency_job_id="1",
30 |                 delta=timedelta(seconds=1),
31 |                 timeout_seconds=1,
32 |                 poke_interval_seconds=1,
33 |             )
34 | 
35 |             with pytest.raises(PermissionDenied):
36 |                 sensor.execute()
37 | 
--------------------------------------------------------------------------------
/examples/brickflow_for_each_task_examples/README.md:
--------------------------------------------------------------------------------
1 | # Brickflow for-each task examples
2 | This repository contains examples of how to use the for-each task type in brickflow.
3 | 
4 | ## Getting Started
5 | 
6 | ### Prerequisites
7 | 1. Install brickflows
8 | 
9 | ```shell
10 | pip install brickflows
11 | ```
12 | 
13 | 2. Install [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/databricks-cli.html)
14 | 
15 | ```shell
16 | curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sudo sh
17 | ```
18 | 
19 | 3. Configure the Databricks CLI with a workspace token. This configures your `~/.databrickscfg` file.
20 | 
21 | ```shell
22 | databricks configure --token
23 | ```
24 | 
25 | ### Clone the repository
26 | 
27 | ```shell
28 | git clone https://github.com/Nike-Inc/brickflow.git
29 | cd brickflow/examples/brickflow_for_each_task_examples
30 | ```
31 | 
32 | ### Customize the workflow
33 | 
34 | Replace all the placeholders in workflows/for_each_task_wf.py with configuration values compatible with your Databricks workspace.
35 | 
36 | 
37 | ### Deploy the workflow to Databricks
38 | ```shell
39 | brickflow projects deploy --project for_each_task_examples -e local
40 | ```
41 | 
42 | ### Run the demo workflow
43 | - log in to the Databricks workspace
44 | - go to Workflows and select the workflow
45 | - click on the run button
46 | 
--------------------------------------------------------------------------------
/tests/sample_workflows/sample_workflow_1.py:
--------------------------------------------------------------------------------
1 | from brickflow.engine.compute import Cluster
2 | from brickflow.engine.task import BrickflowTriggerRule, TaskType, TaskResponse
3 | from brickflow.engine.workflow import Workflow
4 | 
5 | wf = Workflow(
6 |     "test",
7 |     default_cluster=Cluster.from_existing_cluster("XXXX-XXXXXX-XXXXXXXX"),
8 |     tags={"test": "test2"},
9 |     common_task_parameters={"all_tasks1": "test", "all_tasks3": "123"},  # type: ignore
10 | )
11 | 
12 | 
13 | @wf.task()
14 | def task_function():
15 |     return "hello world"
16 | 
17 | 
18 | @wf.task
19 | def task_function_no_deco_args():
20 |     return "hello world"
21 | 
22 | 
23 | @wf.task()
24 | def task_function_nokwargs():
25 |     return "hello world"
26 | 
27 | 
28 | @wf.task(depends_on=task_function)
29 | def task_function_2():
30 |     return "hello world"
31 | 
32 | 
33 | @wf.task(depends_on="task_function_2")
34 | def task_function_3():
35 |     return "hello world"
36 | 
37 | 
38 | @wf.task(depends_on="task_function_3", trigger_rule=BrickflowTriggerRule.NONE_FAILED)
39 | def task_function_4():
40 |     return "hello world"
41 | 
42 | 
43 | @wf.task(
44 |     task_type=TaskType.CUSTOM_PYTHON_TASK,
45 |     trigger_rule=BrickflowTriggerRule.NONE_FAILED,
46 |     custom_execute_callback=lambda x: TaskResponse(x.name, push_return_value=True),
47 | )
48 | def custom_python_task_push():
49 |     pass
50 | 
--------------------------------------------------------------------------------
/examples/brickflow_serverless_examples/README.md:
--------------------------------------------------------------------------------
1 | # Brickflows Serverless Example
2 | This project contains an example of a serverless workflow that contains:
3 | - notebook task
4 | - python task
5 | - native Brickflow entrypoint task
6 | 
7 | Note that for the notebook task and the entrypoint task, dependencies are set through magic `%pip install` commands within
8 | the notebook.
9 | 
10 | ## Getting Started
11 | 
12 | ### Prerequisites
13 | 1. Install brickflows
14 | 
15 | ```shell
16 | pip install brickflows
17 | ```
18 | 
19 | 2. Install [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/databricks-cli.html)
20 | 
21 | ```shell
22 | curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sudo sh
23 | ```
24 | 
25 | 3. Configure the Databricks CLI with a workspace token. This configures your `~/.databrickscfg` file.
26 | 27 | ```shell 28 | databricks configure --token 29 | ``` 30 | 31 | ### Clone the repository 32 | 33 | ```shell 34 | git clone https://github.com/Nike-Inc/brickflow.git 35 | cd brickflow/examples/brickflow_serverless_examples 36 | ``` 37 | 38 | ### Deploy the workflow to databricks 39 | ```shell 40 | brickflow projects deploy --project brickflow-serverless-demo -e local 41 | ``` 42 | 43 | ### Run the demo workflow 44 | - login to databricks workspace 45 | - go to the workflows and select the workflow 46 | - click on the run button 47 | -------------------------------------------------------------------------------- /brickflow/engine/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import functools 4 | import logging 5 | import subprocess 6 | import sys 7 | from typing import Callable 8 | 9 | from brickflow import log, get_default_log_handler 10 | 11 | 12 | def _call(cmd: str, **kwargs: bool) -> bytes: 13 | return subprocess.check_output( # type: ignore 14 | [ 15 | cmd, 16 | ], 17 | **kwargs, 18 | ) 19 | 20 | 21 | def get_current_commit() -> str: 22 | p = _call('git log -n 1 --pretty=format:"%H"', shell=True) 23 | return p.strip().decode("utf-8") 24 | 25 | 26 | def with_brickflow_logger(f: Callable) -> Callable: 27 | @functools.wraps(f) 28 | def func(*args, **kwargs): # type: ignore 29 | _self = args[0] 30 | log.handlers = [] 31 | logger_handler = logging.StreamHandler( 32 | stream=sys.stdout 33 | ) # Handler for the logger 34 | # First, generic formatter: 35 | logger_handler.setFormatter( 36 | logging.Formatter( 37 | f"[%(asctime)s] [%(levelname)s] [brickflow:{_self.name}] " 38 | "{%(module)s.py:%(funcName)s:%(lineno)d} - %(message)s" 39 | ) 40 | ) 41 | log.addHandler(logger_handler) 42 | resp = f(*args, **kwargs) 43 | 44 | log.handlers = [get_default_log_handler()] 45 | 46 | return resp 47 | 48 | return func 49 | 50 | 51 | ROOT_NODE = "root" 52 | -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/workflows/demo_serverless_wf.py: -------------------------------------------------------------------------------- 1 | from brickflow import ( 2 | Workflow, 3 | NotebookTask, 4 | SparkPythonTask, 5 | ) 6 | from brickflow.engine.task import PypiTaskLibrary 7 | 8 | wf = Workflow( 9 | "brickflow-serverless-demo", 10 | schedule_quartz_expression="0 0/20 0 ? * * *", 11 | libraries=[ 12 | PypiTaskLibrary(package="pytz==2024.2"), 13 | # Custom repositories are not supported for serverless workloads, due to Databricks CLI limitations. 14 | # Refer to: https://github.com/databricks/cli/pull/1842This will be fixed in the future releases, use wheel instead. 
15 | # PypiTaskLibrary( 16 | # package="my-lib==1.2.3", repo="https://artifactory.my-org.com/api/pypi/python-virtual/simple" 17 | # ), 18 | ], 19 | ) 20 | 21 | 22 | @wf.task 23 | def entrypoint_task(): 24 | pass 25 | 26 | 27 | @wf.notebook_task 28 | def notebook_task(): 29 | return NotebookTask( 30 | notebook_path="notebooks/example_notebook.py", 31 | base_parameters={ 32 | "some_parameter": "some_value", # in the notebook access these via dbutils.widgets.get("some_parameter") 33 | }, 34 | ) # type: ignore 35 | 36 | 37 | @wf.spark_python_task 38 | def spark_python_task(): 39 | return SparkPythonTask( 40 | python_file="/src/python/example.py", 41 | source="GIT", 42 | parameters=["--timezone", "UTC"], 43 | ) # type: ignore 44 | -------------------------------------------------------------------------------- /docs/projects.md: -------------------------------------------------------------------------------- 1 | A project is similar to an Airflow cluster (DAG bag): it can be composed of various different workflows or DAGs. 2 | 3 | 4 | Here is an example of an entrypoint. 5 | Click the plus buttons to understand all the parts of the entrypoint file. 6 | 7 | ```python title="entrypoint.py" 8 | # Databricks notebook source (1) 9 | 10 | import examples.brickflow_examples.workflows 11 | 12 | from brickflow import Project, PypiTaskLibrary, MavenTaskLibrary 13 | 14 | 15 | def main() -> None: 16 | """Project entrypoint""" 17 | with Project( 18 | "brickflow-demo", # (3)! 19 | git_repo="https://github.com/nike-inc/brickflow", # (4)! 20 | provider="github", # (5)! 21 | libraries=[ # (6)! 22 | PypiTaskLibrary(package="networkx"), 23 | ], 24 | ) as f: 25 | f.add_pkg(examples.brickflow_examples.workflows) # (7)! 26 | 27 | 28 | if __name__ == "__main__": # (2)! 29 | main() 30 | ``` 31 | 32 | 33 | 1. Uploading this Python file into Databricks with this comment on the first line treats the Python file 34 | as a notebook. 35 | 2. This makes sure the code only runs when the file is executed via `python entrypoint.py`. 36 | 3. This is the project name you provided when running `bf projects add`. 37 | 4. This is the git repo that is introspected when running `bf projects add`. 38 | 5. This is the Git provider that you decide on (e.g. `github`). 39 | 6. You can provide a list of packages that need to be installed on all of your clusters when running ETL. 40 | 7. You can add multiple packages from the project in which you define workflows; see the sketch below.
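For example, if your workflows live in more than one package, you can register each of them on the same project. A minimal sketch based on the entrypoint above (the second package, `more_workflows`, is purely illustrative and not part of this repo):

```python
import examples.brickflow_examples.workflows
import examples.brickflow_examples.more_workflows  # hypothetical second package of workflows

from brickflow import Project


def main() -> None:
    """Project entrypoint registering workflows from two packages"""
    with Project(
        "brickflow-demo",
        git_repo="https://github.com/nike-inc/brickflow",
        provider="github",
    ) as f:
        # each add_pkg call picks up every workflow defined in that package
        f.add_pkg(examples.brickflow_examples.workflows)
        f.add_pkg(examples.brickflow_examples.more_workflows)


if __name__ == "__main__":
    main()
```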
-------------------------------------------------------------------------------- /tests/resolver/test_resolver.py: -------------------------------------------------------------------------------- 1 | # test_resolver.py 2 | from typing import Type 3 | 4 | import pytest 5 | 6 | import brickflow 7 | from brickflow.resolver import ( 8 | BrickflowRootNotFound, 9 | ) 10 | 11 | 12 | @pytest.fixture 13 | def default_mocks(mocker): 14 | # Create mocks for the three methods 15 | mocker.patch( 16 | "brickflow.resolver.get_caller_file_paths", return_value=["path1", "path2"] 17 | ) 18 | mocker.patch( 19 | "brickflow.resolver.get_notebook_ws_path", return_value="/notebook/ws/path" 20 | ) 21 | 22 | 23 | def test_resolver_methods(default_mocks, mocker): # noqa 24 | error_msg = "This is a test message" 25 | 26 | def make_exception_function(exc: Type[Exception]): 27 | def raise_exception(*args, **kwargs): 28 | raise exc(error_msg) 29 | 30 | return raise_exception 31 | 32 | # catch random error 33 | mocker.patch( 34 | "brickflow.resolver.go_up_till_brickflow_root", 35 | side_effect=make_exception_function(ValueError), 36 | ) 37 | with pytest.raises(ValueError, match=error_msg): 38 | brickflow.resolver.get_relative_path_to_brickflow_root() 39 | 40 | mocker.patch( 41 | "brickflow.resolver.go_up_till_brickflow_root", 42 | side_effect=make_exception_function(BrickflowRootNotFound), 43 | ) 44 | 45 | brickflow.resolver.get_relative_path_to_brickflow_root() 46 | 47 | mocker.patch( 48 | "brickflow.resolver.go_up_till_brickflow_root", 49 | side_effect=make_exception_function(PermissionError), 50 | ) 51 | 52 | brickflow.resolver.get_relative_path_to_brickflow_root() 53 | -------------------------------------------------------------------------------- /examples/brickflow_task_injection_examples/templates/custom_logger.py.j2: -------------------------------------------------------------------------------- 1 | {# Custom Logger Template - Example #} 2 | {# This demonstrates how to create a custom template for injected tasks #} 3 | 4 | import logging 5 | import datetime 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | # Task execution header 12 | logger.info("=" * 80) 13 | logger.info("CUSTOM INJECTED TASK: {{ task_name | default('custom_task') }}") 14 | logger.info("=" * 80) 15 | 16 | {% if custom_message %} 17 | # Custom message 18 | logger.info("") 19 | logger.info("{{ custom_message }}") 20 | logger.info("") 21 | {% endif %} 22 | 23 | # Execution timestamp 24 | execution_time = datetime.datetime.now().isoformat() 25 | logger.info(f"Execution Time: {execution_time}") 26 | 27 | {% if steps %} 28 | # Execute steps 29 | logger.info("") 30 | logger.info("Executing steps:") 31 | {% for step in steps %} 32 | logger.info(f" [{{ loop.index }}] {{ step }}") 33 | {% endfor %} 34 | logger.info("") 35 | {% endif %} 36 | 37 | # Simulated work 38 | import time 39 | logger.info("Processing...") 40 | time.sleep(0.5) 41 | logger.info("Processing complete!") 42 | 43 | # Task completion footer 44 | logger.info("") 45 | logger.info("=" * 80) 46 | logger.info("TASK COMPLETED: {{ task_name | default('custom_task') }}") 47 | logger.info("=" * 80) 48 | 49 | # Return result 50 | result = { 51 | "task": "{{ task_name | default('custom_task') }}", 52 | "status": "success", 53 | "execution_time": execution_time, 54 | {% if steps %} 55 | "steps_completed": {{ steps | length }}, 56 | {% endif %} 57 | } 58 | 59 | 60 | 
-------------------------------------------------------------------------------- /docs/highlevel.md: -------------------------------------------------------------------------------- 1 | ## Brickflow Overview 2 | 3 | The objective of Brickflow is to provide a thin layer on top of databricks workflows to help deploy 4 | and manage workflows in Databricks. It also provides plugins/extras to be able to run airflow 5 | operators directly in the workflows. 6 | 7 | ## Brickflow to Airflow Term Mapping 8 | 9 | | Object | Airflow | Brickflow | 10 | |-------------------------------------------|-----------------------------------|---------------------------------------------------| 11 | | Collection of Workflows | Airflow Cluster (Airflow Dag Bag) | Project/Entrypoint | 12 | | Workflow | Airflow Dag | Workflow | 13 | | Task | Airflow Operator | Task | 14 | | Schedule | Unix Cron | Quartz Cron | 15 | | Inter Task Communication | XComs | Task Values | 16 | | Managing Connections to External Services | Airflow Connections | Mocked Airflow connections or Databricks Secrets | 17 | | Variables to Tasks | Variables | Task Parameters [ctx.get_parameter(key, default)] | 18 | | Context values (execution_date, etc.) | Airflow Macros, context["ti"] | ctx. | 19 | -------------------------------------------------------------------------------- /tests/test_brickflow.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=unused-import 2 | import pytest 3 | from brickflow import get_config_file_type, ConfigFileType 4 | 5 | 6 | def test_imports(): 7 | try: 8 | from brickflow import ( 9 | log, 10 | _ilog, 11 | BrickflowEnvVars, 12 | BrickflowDefaultEnvs, 13 | ctx, 14 | Workflow, 15 | WorkflowPermissions, 16 | User, 17 | Group, 18 | ServicePrincipal, 19 | Task, 20 | TaskType, 21 | TaskResponse, 22 | BrickflowTriggerRule, 23 | BrickflowTaskEnvVars, 24 | StorageBasedTaskLibrary, 25 | JarTaskLibrary, 26 | EggTaskLibrary, 27 | WheelTaskLibrary, 28 | PypiTaskLibrary, 29 | MavenTaskLibrary, 30 | CranTaskLibrary, 31 | EmailNotifications, 32 | DLTPipeline, 33 | DLTEdition, 34 | DLTChannels, 35 | Cluster, 36 | Runtimes, 37 | Project, 38 | ) 39 | 40 | print("All imports Succeeded") 41 | except ImportError as e: 42 | print(f"Import failed: {e}") 43 | 44 | 45 | @pytest.mark.parametrize( 46 | "config_file_name,expected_extension", 47 | [ 48 | (".brickflow-project-root.yaml", ConfigFileType.YAML), 49 | (".brickflow-project-root.yml", ConfigFileType.YML), 50 | (".brickflow-project-root.json", ConfigFileType.YAML), 51 | ], 52 | ) 53 | def test_get_config_type(config_file_name, expected_extension): 54 | actual = get_config_file_type(f"some/brickflow/root/{config_file_name}") 55 | assert actual == expected_extension 56 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Description 4 | 5 | 6 | ## Related Issue 7 | 8 | 9 | 10 | 11 | 12 | ## Motivation and Context 13 | 14 | 15 | ## How Has This Been Tested? 16 | 17 | 18 | 19 | 20 | ## Screenshots (if appropriate): 21 | 22 | ## Types of changes 23 | 24 | - [ ] Bug fix (non-breaking change which fixes an issue) 25 | - [ ] New feature (non-breaking change which adds functionality) 26 | - [ ] Breaking change (fix or feature that would cause existing functionality to change) 27 | 28 | ## Checklist: 29 | 30 | 31 | - [ ] My code follows the code style of this project. 
32 | - [ ] My change requires a change to the documentation. 33 | - [ ] I have updated the documentation accordingly. 34 | - [ ] I have read the **CONTRIBUTING** document. 35 | - [ ] I have added tests to cover my changes. 36 | - [ ] All new and existing tests passed. 37 | -------------------------------------------------------------------------------- /prospector.yaml: -------------------------------------------------------------------------------- 1 | strictness: high 2 | test-warnings: True 3 | doc-warnings: false 4 | 5 | ignore-paths: 6 | - build 7 | - venv 8 | - venv3 9 | - venv2 10 | - site 11 | - docs 12 | - tests/engine/sample_workflows.py 13 | - tools 14 | - .databricks 15 | - .mypy_cache 16 | - brickflow/bundles 17 | - brickflow/sample_dags 18 | - main.py 19 | - main2.py 20 | - .eggs 21 | - htmlcov 22 | - sample_workflows 23 | - integration_workflows 24 | - scripts 25 | - tests/test_brickflow.py 26 | - examples 27 | - __research__ 28 | - brickflow_plugins # will eventually need to remove once there are tests and linting logic is applied 29 | 30 | max-line-length: 120 31 | 32 | pylint: 33 | disable: 34 | - too-many-branches 35 | - too-many-statements 36 | - too-many-instance-attributes 37 | - cyclic-import 38 | - len-as-condition 39 | - invalid-name 40 | - no-else-return 41 | - no-self-use 42 | - protected-access 43 | - too-many-arguments 44 | - too-many-locals # TBD: this rule is actually a good one, we need to enable it and refactor code 45 | - inconsistent-return-statements 46 | - import-outside-toplevel 47 | - consider-using-set-comprehension 48 | - useless-object-inheritance 49 | - unnecessary-pass 50 | - raise-missing-from # pretty strange requirement with acquaint logic 51 | - broad-except 52 | - arguments-differ 53 | 54 | pycodestyle: 55 | # W293: disabled because we have newlines in docstrings 56 | # E203: disabled because pep8 and black disagree on whitespace before colon in some cases 57 | disable: W293,E203,E203 # conflicts with black formatting 58 | 59 | pyflakes: 60 | disable: 61 | - F821 # ignore undefined name errors 62 | 63 | mccabe: 64 | disable: 65 | - MC0001 66 | -------------------------------------------------------------------------------- /brickflow/engine/hooks.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | import pluggy 4 | 5 | if TYPE_CHECKING: 6 | from brickflow.engine.task import Task, TaskResponse # pragma: no cover 7 | from brickflow.engine.workflow import Workflow # pragma: no cover 8 | 9 | BRICKFLOW_TASK_PLUGINS = "brickflow_task_plugins" 10 | 11 | brickflow_plugin_spec = pluggy.HookspecMarker(BRICKFLOW_TASK_PLUGINS) 12 | 13 | 14 | class BrickflowTaskPluginSpec: 15 | @staticmethod 16 | def handle_user_result_errors(resp: "TaskResponse") -> None: 17 | """Custom execute method that is able to be plugged in.""" 18 | if resp.user_code_error is not None: 19 | original_message = str(resp.user_code_error) 20 | additional_info = ( 21 | "BRICKFLOW_USER_OR_DBR_ERROR: This is an error thrown in user code. 
\n" 22 | f"BRICKFLOW_INPUT_ARGS: {resp.input_kwargs}\n" 23 | "Original Exception Message: " 24 | ) 25 | new_message = additional_info + original_message 26 | resp.user_code_error.args = (new_message,) 27 | raise resp.user_code_error 28 | 29 | @staticmethod 30 | @brickflow_plugin_spec(firstresult=True) 31 | def task_execute(task: "Task", workflow: "Workflow") -> "TaskResponse": 32 | """Custom execute method that is able to be plugged in.""" 33 | raise NotImplementedError("task_execute must be implemented by a plugin") 34 | 35 | @staticmethod 36 | @brickflow_plugin_spec(firstresult=True) 37 | def handle_results( 38 | resp: "TaskResponse", task: "Task", workflow: "Workflow" 39 | ) -> "TaskResponse": 40 | """Custom execute method that is able to be plugged in.""" 41 | raise NotImplementedError("handle_results must be implemented by a plugin") 42 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | # Authors 2 | * [Ashok Singamaneni](https://www.linkedin.com/in/ashok-singamaneni-193b1a32/) 3 | * [Sriharsha Tikkireddy](https://www.linkedin.com/in/sriharsha-tikkireddy/) 4 | 5 | # Contributors 6 | Thanks to the contributors who helped on this project apart from the authors 7 | * [Danny Meijer](https://www.linkedin.com/in/dannydatascientist/) 8 | * [Pariksheet Marotrao Barapatre](https://www.linkedin.com/in/pari-data-products/) 9 | * [Bhargav Sangars](https://www.linkedin.com/in/bhargav-sangars-a4b61037/) 10 | * [Brend Braeckmans](https://www.linkedin.com/in/brendbraeckmans/) 11 | * [Rebecca Raj Shree](https://www.linkedin.com/in/rebecca-raj-shree/) 12 | * [Brent (Johnson) Spetner](https://www.linkedin.com/in/brentjohnsoneng/) 13 | * [Dmitrii Grigorev](https://www.linkedin.com/in/dmitrii-grigorev-074739135/) 14 | * [Chanukya Konuganti](https://www.linkedin.com/in/chanukyakonuganti/) 15 | * [Maxim Mityutko](https://www.linkedin.com/in/mityutko/) 16 | * [Raju Gujjalapati](https://in.linkedin.com/in/raju-gujjalapati-470a88171) 17 | * [Madhusudan Koukutla](https://www.linkedin.com/in/madhusudan-reddy/) 18 | * [Surya Teja Jagatha](https://www.linkedin.com/in/surya-teja-jagatha/) 19 | * [Iris Meerman](https://www.linkedin.com/in/iris-meerman-92694675/) 20 | * [Michael Espiritu](https://www.linkedin.com/in/michaelespiritu92/) 21 | * [Riccardo Iacomini](https://www.linkedin.com/in/riccardo-iacomini-b757b6118/) 22 | 23 | # Honorary Mentions 24 | Thanks to the team below for invaluable insights and support throughout the initial release of this project 25 | 26 | * [Joe Hollow](https://www.linkedin.com/in/joe-hollow-23088b1/) 27 | * [Aditya Chaturvedi](https://www.linkedin.com/in/chaturvediaditya/) 28 | * [Scott Haines](https://www.linkedin.com/in/scotthaines/) 29 | * [Arijit Banerjee](https://www.linkedin.com/in/massborn/) 30 | -------------------------------------------------------------------------------- /examples/brickflow_task_injection_examples/templates/data_validator.py.j2: -------------------------------------------------------------------------------- 1 | {# Data Validator Template - Example #} 2 | {# This template demonstrates data validation in injected tasks #} 3 | 4 | import logging 5 | from pyspark.sql import SparkSession 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | # Initialize Spark 11 | spark = SparkSession.builder.getOrCreate() 12 | 13 | logger.info("=" * 80) 14 | logger.info("DATA VALIDATION TASK: {{ 
task_name | default('validator') }}") 15 | logger.info("=" * 80) 16 | 17 | # Configuration 18 | catalog = "{{ catalog | default('hive_metastore') }}" 19 | schema = "{{ schema | default('default') }}" 20 | table = "{{ table | default('sample_table') }}" 21 | table_name = f"{catalog}.{schema}.{table}" 22 | 23 | logger.info(f"Validating table: {table_name}") 24 | 25 | {% if validation_checks %} 26 | # Run validation checks 27 | validation_results = {} 28 | 29 | {% for check in validation_checks %} 30 | logger.info("Running check: {{ check }}") 31 | # Add your validation logic here 32 | validation_results["{{ check }}"] = "passed" 33 | {% endfor %} 34 | 35 | logger.info(f"Validation results: {validation_results}") 36 | {% else %} 37 | # Default validation: Check table exists 38 | if spark.catalog.tableExists(table_name): 39 | logger.info(f"✓ Table {table_name} exists") 40 | 41 | # Get row count 42 | df = spark.table(table_name) 43 | row_count = df.count() 44 | logger.info(f"✓ Table has {row_count} rows") 45 | 46 | result = { 47 | "status": "passed", 48 | "table": table_name, 49 | "row_count": row_count, 50 | } 51 | else: 52 | logger.error(f"✗ Table {table_name} does not exist!") 53 | result = { 54 | "status": "failed", 55 | "table": table_name, 56 | "error": "Table not found", 57 | } 58 | {% endif %} 59 | 60 | logger.info("=" * 80) 61 | logger.info("VALIDATION COMPLETED") 62 | logger.info("=" * 80) 63 | 64 | 65 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | black-check: 2 | @poetry run black --check . 3 | 4 | fmt: 5 | @poetry run black . 6 | 7 | check: black-check mypy 8 | @poetry run prospector --profile prospector.yaml --no-autodetect 9 | 10 | mypy: 11 | @poetry run mypy 12 | 13 | cov: check 14 | @poetry run coverage run --source=brickflow --omit "brickflow/sample_dags/*,sample_workflows/*,brickflow/tf/*" -m pytest && \ 15 | poetry run coverage report -m && \ 16 | poetry run coverage xml 17 | 18 | gen-bundle-sdk: 19 | @pip install . --force-reinstall 20 | @./tools/gen-bundle.sh 21 | 22 | dev: 23 | @poetry install --all-extras --with dev 24 | @poetry run pre-commit install 25 | @poetry run pre-commit install --hook-type pre-push 26 | 27 | deploy_env_setup: 28 | @poetry install --all-extras --with dev 29 | 30 | test: 31 | @poetry run coverage run --source=brickflow --omit "brickflow/bundles/*,brickflow/sample_dags/*,sample_workflows/*,brickflow/tf/*" -m pytest && \ 32 | poetry run coverage report -m && \ 33 | poetry run coverage html 34 | 35 | clean: 36 | @rm -rf dist 37 | 38 | build: clean 39 | @poetry build 40 | 41 | poetry: 42 | @poetry install --all-extras --with dev 43 | 44 | coverage: check test 45 | 46 | docs: 47 | @poetry run mike deploy -u dev latest 48 | @poetry run mike set-default latest 49 | @poetry run mike serve 50 | 51 | deploy-docs: 52 | @poetry run mike deploy --push --update-aliases $(version) latest 53 | 54 | docker-local: 55 | docker build -t brickflow:latest --build-arg CACHEBUST="$(shell date +%s)" . 56 | 57 | poetry-install: 58 | @pip install --upgrade setuptools && pip install poetry && poetry self add "poetry-dynamic-versioning[plugin]" 59 | 60 | get-version: 61 | @poetry version 62 | 63 | requirements: 64 | @poetry export -f requirements.txt --output requirements.txt --with dev --without-hashes 65 | 66 | docker-build: 67 | @docker build -t brickflow-local . 
68 | 69 | docker: docker-build 70 | @docker run -it -v "$(shell pwd)":/brickflow brickflow-local /bin/bash 71 | 72 | .PHONY: docs -------------------------------------------------------------------------------- /brickflow/codegen/__init__.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from enum import Enum 3 | from pathlib import Path 4 | 5 | from typing import TYPE_CHECKING, Optional, Dict, Any 6 | 7 | from decouple import config 8 | 9 | from brickflow import get_brickflow_version, BrickflowEnvVars, BrickflowDefaultEnvs 10 | 11 | if TYPE_CHECKING: 12 | from brickflow.engine.project import _Project 13 | 14 | 15 | class CodegenInterface(abc.ABC): 16 | def __init__( 17 | self, project: "_Project", id_: str, env: str, **_: Any 18 | ) -> None: # noqa 19 | self.env: str = env 20 | self.project: "_Project" = project 21 | self.id_ = id_ 22 | 23 | @abc.abstractmethod 24 | def synth(self) -> None: 25 | pass 26 | 27 | 28 | class DatabricksDefaultClusterTagKeys(Enum): 29 | ENVIRONMENT = "environment" 30 | DEPLOYED_BY = "deployed_by" 31 | DEPLOYED_AT = "deployed_at" 32 | BRICKFLOW_PROJECT_NAME = "brickflow_project_name" 33 | BRICKFLOW_DEPLOYMENT_MODE = "brickflow_deployment_mode" 34 | DATABRICKS_TF_PROVIDER_VERSION = "databricks_tf_provider_version" 35 | BRICKFLOW_VERSION = "brickflow_version" 36 | 37 | 38 | BRICKFLOW_BUILTIN_DEPLOY_TAGS = { 39 | "brickflow_version": get_brickflow_version() 40 | or "undefined", # certain scenarios get_brickflow_version maybe None 41 | } 42 | 43 | 44 | def get_brickflow_tags( 45 | user_defined_tags: Optional[Dict[str, str]], other_tags: Dict[str, str] 46 | ) -> Dict[str, str]: 47 | return {**(user_defined_tags or {}), **other_tags, **BRICKFLOW_BUILTIN_DEPLOY_TAGS} 48 | 49 | 50 | def handle_mono_repo_path(project: "_Project", env: str) -> str: 51 | base_path = config( 52 | BrickflowEnvVars.BRICKFLOW_MONOREPO_PATH_TO_BUNDLE_ROOT.value, None 53 | ) 54 | 55 | if project.entry_point_path is None: 56 | raise ValueError("project.entry_point_path is None") 57 | 58 | if base_path is None or env == BrickflowDefaultEnvs.LOCAL.value: 59 | return project.entry_point_path 60 | else: 61 | return str(Path(base_path) / project.entry_point_path) 62 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | USER root 3 | 4 | # DO NOT ADD AS ENV: 5 | # debconf noninteractive 6 | # This is the anti-frontend. It never interacts with you at all, 7 | # and makes the default answers be used for all questions. It 8 | # might mail error messages to root, but that's it; otherwise it 9 | # is completely silent and unobtrusive, a perfect frontend for 10 | # automatic installs. If you are using this front-end, and require 11 | # non-default answers to questions, you will need to preseed the 12 | # debconf database; see the section below on Unattended Package 13 | # Installation for more details. 14 | 15 | RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections && \ 16 | apt-get update -y && apt-get install -y git curl wget unzip software-properties-common 17 | SHELL ["/bin/bash", "-c"] 18 | 19 | ENV NODE_VERSION 18.14.0 20 | 21 | RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections \ 22 | && curl https://raw.githubusercontent.com/creationix/nvm/master/install.sh | bash \ 23 | && . 
$HOME/.nvm/nvm.sh \ 24 | && nvm install $NODE_VERSION \ 25 | && nvm use $NODE_VERSION \ 26 | && npm install --global cdktf-cli@latest 27 | 28 | ENV NODE_PATH /root/.nvm/versions/node/v$NODE_VERSION/lib/node_modules 29 | ENV PATH /root/.nvm/versions/node/v$NODE_VERSION/bin:$PATH 30 | ENV NVM_DIR /root/.nvm 31 | 32 | RUN add-apt-repository ppa:deadsnakes/ppa 33 | RUN apt-get install -y python3.9 python3-pip python3.9-distutils && ln -s /usr/bin/python3.9 /usr/bin/python 34 | 35 | ARG CACHEBUST=1 36 | 37 | RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ 38 | python3.9 get-pip.py && \ 39 | ln -s /usr/local/bin/pip3.9 /usr/bin/pip3 && \ 40 | ln -s /usr/local/bin/pip3.9 /usr/bin/pip 41 | 42 | RUN python -m pip install -U pip && pip install -U setuptools poetry 43 | 44 | WORKDIR /brickflow 45 | 46 | COPY . . 47 | 48 | VOLUME ["/brickflow", "$(pwd)"] 49 | 50 | RUN poetry install 51 | 52 | CMD ["/bin/bash"] 53 | -------------------------------------------------------------------------------- /docs/how-imports-work.md: -------------------------------------------------------------------------------- 1 | ### How do imports work? 2 | 3 | !!! warning 4 | 5 | **It is very important to understand how imports work for mono repos. Please read this carefully; otherwise you might run into issues during deployments.** 6 | 7 | When using brickflow projects, every project will have a `.brickflow-project-root.yml` file. When you import brickflow, 8 | which you will 9 | in your entrypoint or workflows, brickflow inspects the file paths of all stack frames during the import and recursively goes 10 | up each path until it finds the `.brickflow-project-root.yml` file. 11 | The directory containing the first instance of `.brickflow-project-root.yml` is added to `sys.path` to help with module imports. 12 | 13 | Let us take a quick example of how to get imports to work properly. 14 | 15 | Let us say you have a project structure like this: 16 | 17 | ``` 18 | repo-root/ 19 | ├── .git 20 | ├── projects/ 21 | │ ├── project_abc/ 22 | │ │ ├── lib/ 23 | │ │ │ ├── __init__.py 24 | │ │ │ └── shared_functions.py 25 | │ │ ├── workflows/ 26 | │ │ │ ├── __init__.py 27 | │ │ │ ├── entrypoint.py 28 | │ │ │ └── workflow_abc.py 29 | │ │ ├── setup.py 30 | │ │ └── .brickflow-project-root.yml 31 | │ └── project_xyz/ 32 | │ ├── workflows_geo_b/ 33 | │ │ ├── entrypoint.py 34 | │ │ └── workflow_xyz.py 35 | │ ├── workflows_geo_a/ 36 | │ │ ├── entrypoint.py 37 | │ │ └── workflow_xyz.py 38 | │ └── .brickflow-project-root.yml 39 | ├── .gitignore 40 | ├── brickflow-multi-project.yml 41 | └── README.md 42 | ``` 43 | 44 | Say you want to import from `lib` inside `workflow_abc.py`; you would write: 45 | 46 | ```python 47 | from lib import shared_functions 48 | 49 | shared_functions.some_function(...) 50 | ``` 51 | 52 | Since the `.brickflow-project-root.yml` in this project structure is at `repo-root/projects/project_abc`, everything in that `project_abc` folder is 53 | added to `sys.path` in Python, so you can import from any of the folders under it.
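A quick way to see this in practice: inside `workflow_abc.py` you import brickflow first, then import from `lib` as if `project_abc` were the top level. This is a minimal sketch; `some_function` is illustrative (as above) and the final assert is only a debugging aid for deployments where imports fail:

```python
# repo-root/projects/project_abc/workflows/workflow_abc.py
import sys

import brickflow  # noqa: F401  importing brickflow first puts the project root on sys.path

from lib import shared_functions  # resolves because repo-root/projects/project_abc is on sys.path

shared_functions.some_function(...)

# optional debugging aid: confirm the project root was actually added
assert any(path.endswith("project_abc") for path in sys.path)
```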
-------------------------------------------------------------------------------- /config/injected_tasks.yaml: -------------------------------------------------------------------------------- 1 | # Brickflow Task Injection Configuration 2 | # Simple configuration for automatic task injection into all workflows 3 | 4 | global: 5 | # Global enable/disable for task injection 6 | enabled: true 7 | 8 | # Default libraries to attach to all injected tasks 9 | default_libraries: 10 | - "requests>=2.28.0" 11 | 12 | # Artifactory credentials (can use environment variables) 13 | artifactory_username: ${ARTIFACTORY_USERNAME} 14 | artifactory_api_key: ${ARTIFACTORY_API_KEY} 15 | 16 | # List of tasks to inject 17 | tasks: 18 | # Example: Simple logging task 19 | - task_name: "logging_task" 20 | enabled: true 21 | 22 | # Task parameters 23 | template_context: 24 | imports: 25 | - "import datetime" 26 | task_name: "logging_task" 27 | message: "This task was automatically injected into the workflow" 28 | params: 29 | execution_time: "{{datetime.datetime.now()}}" 30 | workflow: "automated_workflow" 31 | 32 | # Run after all leaf nodes complete 33 | depends_on_strategy: "leaf_nodes" 34 | 35 | task_type: "BRICKFLOW_TASK" 36 | 37 | # ================================================================= 38 | # USAGE 39 | # ================================================================= 40 | # 41 | # The default template prints simple log messages. 42 | # You can specify: 43 | # - imports: Python import statements 44 | # - task_name: Name of the task 45 | # - message: Custom message to print 46 | # - params: Key-value pairs to display 47 | # 48 | # Example: 49 | # template_context: 50 | # imports: 51 | # - "import sys" 52 | # task_name: "my_task" 53 | # message: "Custom message here" 54 | # params: 55 | # environment: "production" 56 | # version: "1.0.0" 57 | # 58 | # To use different values for different environments, create 59 | # separate YAML files: 60 | # - config/injected_tasks_dev.yaml 61 | # - config/injected_tasks_prod.yaml 62 | # 63 | # Then set the environment variable accordingly: 64 | # export BRICKFLOW_INJECT_TASKS_CONFIG="config/injected_tasks_dev.yaml" 65 | # 66 | -------------------------------------------------------------------------------- /tools/modify_model.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | import re 3 | 4 | # string = "class Artifacts1(BaseModel)" 5 | regex_pattern = r"(?<=class\s)[A-Za-z]\w+" 6 | file_path = "brickflow/bundles/model.py" 7 | 8 | bad_class_names = {} 9 | 10 | def remove_number_from_end(string): 11 | match = re.search(r"\d+$", string) 12 | if match: 13 | number = match.group(0) 14 | string_without_number = string[: -len(number)] 15 | return string_without_number 16 | else: 17 | return None 18 | 19 | def remove_timestamp_line(input_code: str) -> str: 20 | return "\n".join( 21 | [ 22 | _line 23 | for _line in input_code.split("\n") 24 | if not _line.startswith("# timestamp: ") 25 | ] 26 | ) 27 | 28 | def replace_class_config_extras(input_code: str) -> str: 29 | pattern = r"extra\s*=\s*Extra\.forbid" 30 | return re.sub( 31 | pattern, 'extra = "forbid"\n protected_namespaces = ()', input_code 32 | ) 33 | 34 | def replace_regex_with_pattern(input_code: str) -> str: 35 | pattern = r"regex=" 36 | return re.sub(pattern, "pattern=", input_code) 37 | 38 | with open(file_path, "r") as f: 39 | lines = f.readlines() 40 | for line in lines: 41 | match = re.search(regex_pattern, line) 42 | if match: 43 | 
dynamic_value = match.group(0) 44 | if remove_number_from_end(dynamic_value): 45 | bad_class_names[dynamic_value] = remove_number_from_end( 46 | dynamic_value 47 | ) 48 | 49 | with open(file_path, "r") as r: 50 | data = r.read() 51 | 52 | with open(file_path, "w") as w: 53 | for key, value in bad_class_names.items(): 54 | data = data.replace(key, value) 55 | data = remove_timestamp_line(data) 56 | # remove extra config to remove deprecation warning 57 | data = replace_class_config_extras(data) 58 | # replace regex with pattern 59 | data = replace_regex_with_pattern(data) 60 | w.write(data) 61 | -------------------------------------------------------------------------------- /tests/engine/test_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pathlib 3 | import pytest 4 | from requests_mock.mocker import Mocker as RequestsMocker 5 | 6 | from pydantic import SecretStr 7 | 8 | from brickflow.engine.utils import get_job_id, ctx, get_bf_project_root 9 | 10 | 11 | class TestUtils: 12 | workspace_url = "https://42.cloud.databricks.com" 13 | endpoint_url = f"{workspace_url}/api/.*/jobs/list" 14 | 15 | ctx.log.propagate = True 16 | 17 | @pytest.fixture(autouse=True, name="api", scope="class") 18 | def mock_api(self): 19 | rm = RequestsMocker() 20 | rm.register_uri( 21 | method="GET", 22 | url=re.compile(self.endpoint_url), 23 | response_list=[ 24 | { 25 | "json": {"jobs": [{"job_id": 1234, "settings": {"name": "foo"}}]}, 26 | "status_code": int(200), 27 | }, 28 | { 29 | "json": {"has_more": False}, 30 | "status_code": int(200), 31 | }, 32 | { 33 | "json": {}, 34 | "status_code": int(404), 35 | }, 36 | ], 37 | ) 38 | yield rm 39 | 40 | def test_get_job_id_success(self, api): 41 | with api: 42 | job_id = get_job_id( 43 | job_name="foo", 44 | host=self.workspace_url, 45 | token=SecretStr("token"), 46 | ) 47 | assert job_id == 1234 48 | 49 | def test_get_job_id_failure(self, api): 50 | with pytest.raises(ValueError): 51 | with api: 52 | get_job_id(job_name="bar", host=self.workspace_url, token="token") 53 | 54 | def test_get_job_id_non_200(self, caplog, api): 55 | with api: 56 | get_job_id(job_name="buz", host=self.workspace_url, token="token") 57 | assert "An error occurred: request failed" in caplog.text 58 | 59 | def test_get_bf_project_root(self): 60 | # Set up expected path which is the root of the repo 61 | expected_root = pathlib.Path.cwd().parents[0] 62 | # Execute the function 63 | actual_root = get_bf_project_root() 64 | # Assert the result 65 | assert actual_root == expected_root 66 | -------------------------------------------------------------------------------- /brickflow_plugins/databricks/run_job.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from pydantic import SecretStr 3 | 4 | from databricks.sdk import WorkspaceClient 5 | from brickflow.context import ctx 6 | from brickflow.engine.utils import get_job_id 7 | 8 | 9 | class RunJobInRemoteWorkspace: 10 | """ 11 | Currently Databricks does not natively support running a job in a remote workspace via the RunJobTask. 12 | This plugin adds this functionality. However, it aims to be a temporary solution until Databricks adds this 13 | functionality natively. 14 | The plugin does not support neither passing the parameters to the remote job, nor waiting for the job to finish. 
15 | 16 | Examples 17 | -------- 18 | service_principle_pat = ctx.dbutils.secrets.get("scope", "service_principle_id") 19 | WorkflowDependencySensor( 20 | databricks_host=https://your_workspace_url.cloud.databricks.com, 21 | databricks_token=service_principle_pat, 22 | job_name="foo", 23 | ) 24 | In the above snippet Databricks secrets are used as a secure service to store the databricks token. 25 | If you get your token from another secret management service, like AWS Secrets Manager, GCP Secret Manager 26 | or Azure Key Vault, just pass it in the databricks_token argument. 27 | """ 28 | 29 | def __init__( 30 | self, 31 | databricks_host: str, 32 | databricks_token: Union[str, SecretStr], 33 | job_name: str, 34 | ): 35 | self.databricks_host = databricks_host 36 | self.databricks_token = ( 37 | databricks_token 38 | if isinstance(databricks_token, SecretStr) 39 | else SecretStr(databricks_token) 40 | ) 41 | self.job_name = job_name 42 | self._workspace_obj = WorkspaceClient( 43 | host=self.databricks_host, token=self.databricks_token.get_secret_value() 44 | ) 45 | 46 | def execute(self): 47 | job_id = get_job_id( 48 | host=self.databricks_host, 49 | token=self.databricks_token, 50 | job_name=self.job_name, 51 | ) 52 | # TODO: add support for passing parameters to the remote job 53 | # TODO: wait for the job to finish 54 | run = self._workspace_obj.jobs.run_now(job_id) 55 | ctx.log.info("Job run status: %s", run.response) 56 | -------------------------------------------------------------------------------- /tests/cli/test_projects.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import shutil 3 | import os 4 | import pytest 5 | from brickflow import ConfigFileType 6 | from brickflow.cli.projects import MultiProjectManager, get_brickflow_root 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "project_folder,extension", 11 | [("sample_yml_project", "yml"), ("sample_yaml_project", "yaml")], 12 | ) 13 | def test_get_brickflow_root(project_folder, extension): 14 | cwd = Path.cwd() 15 | test_folder = Path(__file__).parent.resolve() 16 | 17 | # Creating empty test directories 18 | test_dir = test_folder / project_folder / "some" / "dummy" / "dir" 19 | test_dir.mkdir(parents=True, exist_ok=True) 20 | 21 | try: 22 | os.chdir(test_dir) 23 | 24 | actual = get_brickflow_root() 25 | expected = test_folder / project_folder / f"brickflow-multi-project.{extension}" 26 | assert actual.resolve().as_posix() == expected.resolve().as_posix() 27 | finally: 28 | # Cleanup 29 | os.chdir(cwd) 30 | shutil.rmtree(test_folder / project_folder / "some") 31 | 32 | 33 | @pytest.mark.parametrize( 34 | "project_folder, config_type", 35 | [ 36 | ("sample_yml_project", ConfigFileType.YML), 37 | ("sample_yaml_project", ConfigFileType.YAML), 38 | ], 39 | ) 40 | def test_multi_project_manager_yaml(project_folder, config_type): 41 | cwd = Path.cwd() 42 | test_folder = Path(__file__).parent 43 | 44 | try: 45 | os.chdir(test_folder) 46 | 47 | config_file_name = ( 48 | test_folder 49 | / project_folder 50 | / f"brickflow-multi-project.{config_type.value}" 51 | ) 52 | manager = MultiProjectManager( 53 | config_file_name=str(config_file_name), file_type=config_type 54 | ) 55 | assert manager._brickflow_multi_project_config.version == "v1" 56 | expected_project_config = { 57 | "version": "v1", 58 | "projects": { 59 | "test_cli_project": { 60 | "name": "test_cli_project", 61 | "path_from_repo_root_to_project_root": "some/test/path", 62 | 
"path_project_root_to_workflows_dir": "path/to/workflows", 63 | "deployment_mode": "bundle", 64 | "brickflow_version": "1.2.1", 65 | "enable_plugins": False, 66 | } 67 | }, 68 | } 69 | assert manager._project_config_dict["."].model_dump() == expected_project_config 70 | finally: 71 | os.chdir(cwd) 72 | -------------------------------------------------------------------------------- /docs/upgrades/upgrade-pre-0-10-0-to-0-10-0.md: -------------------------------------------------------------------------------- 1 | --- 2 | search: 3 | boost: 2 4 | --- 5 | 6 | ## Upgrade checklist 7 | 8 | * [x] The package has been renamed from `brickflow` to `brickflows`. Please run: 9 | 10 | ``` 11 | pip uninstall brickflow 12 | ``` 13 | 14 | and then 15 | 16 | ``` 17 | pip install brickflows>=0.10.0 18 | bf --version 19 | ``` 20 | 21 | * [x] If you are upgrading from a CDKTF version of brickflow then do not worry, the existing workflows as long as you do 22 | not change their names will be imported. 23 | 24 | * [x] Start using project configurations following the [quickstart guide](../../bundles-quickstart/#brickflow-projects-setup). 25 | 26 | * [x] Confirm the existence of the following files: 27 | 28 | * brickflow-multi-project.yml 29 | * brickflow-project-root.yml 30 | * Please reference [concepts](../../bundles-quickstart/#concepts) 31 | and [initialize project](../../bundles-quickstart/#initialize-project) for more details. 32 | 33 | * [x] RelativePathPackageResolver has been removed from the project to offer a seamless import 34 | as long as you import brickflow at the top. 35 | 36 | * [x] Ensure import for brickflow is at the top of your entrypoint.py 37 | 38 | * [x] Ensure import for brickflow is at the top of your entrypoint.py 39 | 40 | 41 | * [x] Ensure your entrypoint looks like this. **Make sure to click the plus buttons and read the highlighted sections**: 42 | 43 | ```python linenums="1" hl_lines="5 7 15 18" 44 | # Databricks notebook source 45 | 46 | # COMMAND ---------- 47 | 48 | from brickflow import Project # (1)! 49 | 50 | import workflows # (2)! 51 | 52 | def main() -> None: 53 | """Project entrypoint""" 54 | with Project( 55 | "product_abc_workflows_2", 56 | git_repo="https://github.com/stikkireddy/mono-repo-test", 57 | provider="github", 58 | libraries=[ # (3)! 59 | # PypiTaskLibrary(package="spark-expectations==0.5.0"), # Uncomment if spark-expectations is needed 60 | ], 61 | enable_plugins=True, # (4)! 62 | ) as f: 63 | f.add_pkg(workflows) 64 | 65 | 66 | if __name__ == "__main__": 67 | main() 68 | ``` 69 | 70 | 1. Make sure brickflow is at the top of your imports! This will help resolve paths and allow other libraries to be 71 | imported correctly. 72 | 2. Import your modules after brickflow has been imported! Make sure your optimize imports doesnt reorder your imports! 73 | 3. Make sure you remove brickflow and brickflow plugins and cron utils from this list. 74 | 4. Make sure you have enable_plugins=True. This will enable the plugins to be loaded to support airflow operators, etc. 75 | Disable this if you dont want to install airflow. 
76 | 77 | 78 | -------------------------------------------------------------------------------- /tests/airflow_plugins/test_autosys.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from requests.exceptions import HTTPError 3 | from requests_mock.mocker import Mocker as RequestsMocker 4 | 5 | from brickflow_plugins.airflow.operators.external_tasks import AutosysSensor 6 | 7 | 8 | class TestAutosysSensor: 9 | @pytest.fixture(autouse=True, name="api", scope="class") 10 | def mock_api(self): 11 | rm = RequestsMocker() 12 | rm.register_uri( 13 | method="GET", 14 | url="https://42.autosys.my-org.com/foo", 15 | response_list=[ 16 | # Test 1: Success 17 | { 18 | "json": {"status": "SU", "lastEndUTC": "2024-01-01T00:55:00Z"}, 19 | "status_code": int(200), 20 | }, 21 | # Test 2: Raise Error 22 | { 23 | "json": {}, 24 | "status_code": int(404), 25 | }, 26 | # Test 3: Poke 4 times until success 27 | { 28 | "json": {"status": "FA", "lastEndUTC": "2024-01-01T00:55:00Z"}, 29 | "status_code": int(200), 30 | }, 31 | { 32 | "json": {"status": "UNK", "lastEndUTC": None}, 33 | "status_code": int(200), 34 | }, 35 | { 36 | "json": {"status": "UNK", "lastEndUTC": ""}, 37 | "status_code": int(200), 38 | }, 39 | { 40 | "json": {"status": "SU", "lastEndUTC": "2024-01-01T01:55:00Z"}, 41 | "status_code": int(200), 42 | }, 43 | ], 44 | ) 45 | yield rm 46 | 47 | @pytest.fixture() 48 | def sensor(self): 49 | yield AutosysSensor( 50 | task_id="test", 51 | url="https://42.autosys.my-org.com/", 52 | job_name="foo", 53 | poke_interval=1, 54 | time_delta={"hours": 1}, 55 | ) 56 | 57 | def test_success(self, api, caplog, sensor): 58 | with api: 59 | sensor.poke(context={"execution_date": "2024-01-01T01:00:00Z"}) 60 | assert caplog.text.count("Poking again") == 0 61 | assert "Success criteria met. Exiting" in caplog.text 62 | 63 | def test_non_200(self, api, sensor): 64 | with pytest.raises(HTTPError): 65 | with api: 66 | sensor.poke(context={"execution_date": "2024-01-01T01:00:00Z"}) 67 | 68 | def test_poking(self, api, caplog, sensor): 69 | with api: 70 | sensor.poke(context={"execution_date": "2024-01-01T02:00:00Z"}) 71 | assert caplog.text.count("Poking again") == 3 72 | assert "Success criteria met. 
Exiting" in caplog.text 73 | -------------------------------------------------------------------------------- /brickflow_plugins/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Optional 3 | 4 | import pluggy 5 | 6 | from brickflow import get_default_log_handler 7 | 8 | 9 | def setup_logger(): 10 | _log = logging.getLogger(__name__) # Logger 11 | _log.setLevel(logging.INFO) 12 | logger_handler = get_default_log_handler("brickflow-plugins") 13 | _log.addHandler(logger_handler) 14 | _log.propagate = False 15 | return _log 16 | 17 | 18 | log = setup_logger() 19 | 20 | from brickflow_plugins.airflow.operators.external_tasks import ( 21 | TaskDependencySensor, 22 | AutosysSensor, 23 | AirflowProxyOktaClusterAuth, 24 | ) 25 | from brickflow_plugins.airflow.operators.external_tasks_tableau import ( 26 | TableauRefreshDataSourceOperator, 27 | TableauRefreshWorkBookOperator, 28 | ) 29 | from brickflow_plugins.airflow.operators.native_operators import ( 30 | BashOperator, 31 | BranchPythonOperator, 32 | ShortCircuitOperator, 33 | ) 34 | from brickflow_plugins.databricks.workflow_dependency_sensor import ( 35 | WorkflowDependencySensor, 36 | WorkflowTaskDependencySensor, 37 | ) 38 | from brickflow_plugins.databricks.uc_to_snowflake_operator import ( 39 | SnowflakeOperator, 40 | UcToSnowflakeOperator, 41 | ) 42 | from brickflow_plugins.databricks.box_operator import ( 43 | BoxToVolumesOperator, 44 | VolumesToBoxOperator, 45 | BoxOperator, 46 | ) 47 | from brickflow_plugins.databricks.sla_sensor import SLASensor 48 | 49 | 50 | def load_plugins(cache_bust: Optional[pluggy.PluginManager] = None) -> None: 51 | from brickflow.engine.task import get_plugin_manager 52 | from brickflow_plugins.airflow.brickflow_task_plugin import ( 53 | AirflowOperatorBrickflowTaskPluginImpl, 54 | ) 55 | 56 | if cache_bust is not None: 57 | cache_bust.register( 58 | AirflowOperatorBrickflowTaskPluginImpl(), name="airflow-plugin" 59 | ) 60 | return 61 | 62 | get_plugin_manager().register(AirflowOperatorBrickflowTaskPluginImpl()) 63 | 64 | 65 | def ensure_installation(): 66 | """Ensures that the brickflow_plugins package is installed in the current environment.""" 67 | from brickflow_plugins.airflow.cronhelper import cron_helper # noqa 68 | import airflow # noqa 69 | 70 | 71 | __all__: List[str] = [ 72 | "TaskDependencySensor", 73 | "AutosysSensor", 74 | "AirflowProxyOktaClusterAuth", 75 | "BashOperator", 76 | "BranchPythonOperator", 77 | "ShortCircuitOperator", 78 | "WorkflowDependencySensor", 79 | "WorkflowTaskDependencySensor", 80 | "SnowflakeOperator", 81 | "UcToSnowflakeOperator", 82 | "TableauRefreshDataSourceOperator", 83 | "TableauRefreshWorkBookOperator", 84 | "BoxToVolumesOperator", 85 | "VolumesToBoxOperator", 86 | "BoxOperator", 87 | "SLASensor", 88 | "load_plugins", 89 | "ensure_installation", 90 | ] 91 | -------------------------------------------------------------------------------- /brickflow_plugins/airflow/brickflow_task_plugin.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import datetime 3 | import pendulum 4 | 5 | try: 6 | from airflow import macros 7 | from airflow.models import BaseOperator 8 | from airflow.utils.context import Context 9 | except ImportError: 10 | raise ImportError( 11 | "You must install airflow to use airflow plugins, " 12 | "please try pip install brickflow[apache-airflow]" 13 | ) 14 | 15 | from jinja2 import Environment 
16 | from brickflow.context import ctx 17 | from brickflow.engine.hooks import BrickflowTaskPluginSpec 18 | from brickflow.engine.task import brickflow_task_plugin_impl, Task, TaskResponse 19 | from brickflow.engine.workflow import Workflow 20 | 21 | from brickflow_plugins import log 22 | from brickflow_plugins.airflow.context import get_task_context 23 | from brickflow_plugins.airflow.operators import get_modifier_chain 24 | from brickflow_plugins.secrets import BrickflowSecretsBackend 25 | 26 | 27 | def epoch_to_pendulum_datetime(epoch_str: Optional[str]): 28 | if epoch_str is None: 29 | return None 30 | return pendulum.instance(datetime.datetime.fromtimestamp(int(epoch_str) / 1000)) 31 | 32 | 33 | class AirflowOperatorBrickflowTaskPluginImpl(BrickflowTaskPluginSpec): 34 | @staticmethod 35 | @brickflow_task_plugin_impl(tryfirst=True) 36 | def handle_results( 37 | resp: "TaskResponse", task: "Task", workflow: "Workflow" 38 | ) -> "TaskResponse": 39 | log.info( 40 | "using AirflowOperatorBrickflowTaskPlugin for handling results for task: %s", 41 | task.task_id, 42 | ) 43 | 44 | BrickflowTaskPluginSpec.handle_user_result_errors(resp) 45 | 46 | _operator = resp.response 47 | 48 | if not isinstance(_operator, BaseOperator): 49 | return resp 50 | 51 | operator_modifier_chain = get_modifier_chain() 52 | # modify any functionality of operators and then 53 | _operator = operator_modifier_chain.modify(_operator, task, workflow) 54 | 55 | if hasattr(_operator, "log"): 56 | # overwrite the operator logger if it has one to the brickflow logger 57 | setattr(_operator, "_log", ctx.log) 58 | 59 | context: Context = get_task_context( 60 | task.task_id, 61 | _operator, 62 | workflow.schedule_quartz_expression, 63 | epoch_to_pendulum_datetime(ctx.start_time(debug=None)), 64 | tz=workflow.timezone, 65 | ) 66 | 67 | env: Optional[Environment] = Environment() 68 | env.globals.update({"macros": macros, "ti": context}) 69 | with BrickflowSecretsBackend(): 70 | _operator.render_template_fields(context, jinja_env=env) 71 | op_resp = _operator.execute(context) 72 | return TaskResponse( 73 | response=op_resp, 74 | push_return_value=_operator.do_xcom_push, 75 | ) 76 | -------------------------------------------------------------------------------- /examples/brickflow_task_injection_examples/src/python/helper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper utilities for the demo workflow. 3 | 4 | These utilities can be used by both the main workflow tasks 5 | and the injected tasks. 6 | """ 7 | 8 | import datetime 9 | import logging 10 | from typing import Dict, Any 11 | 12 | 13 | def get_timestamp() -> str: 14 | """Get current timestamp in ISO format.""" 15 | return datetime.datetime.now().isoformat() 16 | 17 | 18 | def format_task_result(task_name: str, status: str, **kwargs: Any) -> Dict[str, Any]: 19 | """ 20 | Format a task result dictionary. 21 | 22 | Args: 23 | task_name: Name of the task 24 | status: Status of the task (e.g., 'success', 'failed') 25 | **kwargs: Additional key-value pairs to include 26 | 27 | Returns: 28 | Formatted result dictionary 29 | """ 30 | result = { 31 | "task": task_name, 32 | "status": status, 33 | "timestamp": get_timestamp(), 34 | } 35 | result.update(kwargs) 36 | return result 37 | 38 | 39 | def log_task_start(task_name: str, logger: logging.Logger = None) -> None: 40 | """ 41 | Log the start of a task with formatted output. 
42 | 43 | Args: 44 | task_name: Name of the task 45 | logger: Logger instance (creates new one if not provided) 46 | """ 47 | if logger is None: 48 | logger = logging.getLogger(__name__) 49 | 50 | logger.info("=" * 70) 51 | logger.info(f"Starting Task: {task_name}") 52 | logger.info(f"Timestamp: {get_timestamp()}") 53 | logger.info("=" * 70) 54 | 55 | 56 | def log_task_end( 57 | task_name: str, status: str = "success", logger: logging.Logger = None 58 | ) -> None: 59 | """ 60 | Log the end of a task with formatted output. 61 | 62 | Args: 63 | task_name: Name of the task 64 | status: Status of the task 65 | logger: Logger instance (creates new one if not provided) 66 | """ 67 | if logger is None: 68 | logger = logging.getLogger(__name__) 69 | 70 | logger.info("=" * 70) 71 | logger.info(f"Task Completed: {task_name}") 72 | logger.info(f"Status: {status}") 73 | logger.info(f"Timestamp: {get_timestamp()}") 74 | logger.info("=" * 70) 75 | 76 | 77 | def validate_config(config: Dict[str, Any], required_keys: list) -> bool: 78 | """ 79 | Validate that a configuration dictionary has all required keys. 80 | 81 | Args: 82 | config: Configuration dictionary to validate 83 | required_keys: List of required keys 84 | 85 | Returns: 86 | True if all keys present, False otherwise 87 | """ 88 | missing_keys = [key for key in required_keys if key not in config] 89 | 90 | if missing_keys: 91 | logging.error(f"Missing required configuration keys: {missing_keys}") 92 | return False 93 | 94 | return True 95 | 96 | 97 | # Example usage in injected tasks: 98 | # from src.python.helper import get_timestamp, format_task_result, log_task_start 99 | -------------------------------------------------------------------------------- /tests/engine/sample_workflow.py: -------------------------------------------------------------------------------- 1 | from brickflow.engine.compute import Cluster 2 | from brickflow.engine.task import ( 3 | BrickflowTriggerRule, 4 | TaskType, 5 | TaskResponse, 6 | DLTPipeline, 7 | RunJobTask, 8 | ) 9 | from brickflow.engine.workflow import Workflow, WorkflowPermissions, User 10 | 11 | wf = Workflow( 12 | "test", 13 | default_cluster=Cluster.from_existing_cluster("existing_cluster_id"), 14 | schedule_quartz_expression="* * * * *", 15 | permissions=WorkflowPermissions( 16 | owner=User("abc@abc.com"), 17 | can_manage_run=[User("abc@abc.com")], 18 | can_view=[User("abc@abc.com")], 19 | can_manage=[User("abc@abc.com")], 20 | ), 21 | tags={"test": "test2"}, 22 | common_task_parameters={"all_tasks1": "test", "all_tasks3": "123"}, # type: ignore 23 | health={ 24 | "rules": [ 25 | {"metric": "RUN_DURATION_SECONDS", "op": "GREATER_THAN", "value": 7200} 26 | ] 27 | }, 28 | timeout_seconds=42, 29 | ) 30 | 31 | 32 | @wf.task() 33 | def task_function(*, test="var"): 34 | return test 35 | 36 | 37 | @wf.task() 38 | def task_function_with_error(*, test="var"): 39 | raise ValueError("throwing random error") 40 | 41 | 42 | @wf.task 43 | def task_function_no_deco_args(): 44 | return "hello world" 45 | 46 | 47 | @wf.dlt_task 48 | def dlt_pipeline(): 49 | # pass 50 | return DLTPipeline( 51 | name="hello world", 52 | storage="123", 53 | language="PYTHON", 54 | configuration={}, 55 | cluster=Cluster( 56 | "test", 57 | "someversion", 58 | "vm-node", 59 | custom_tags={"name": "test"}, 60 | min_workers=2, 61 | max_workers=10, 62 | ), 63 | notebook_path="scripts/spark_script_1.py", 64 | ) 65 | 66 | 67 | @wf.dlt_task 68 | def dlt_pipeline_2(): 69 | # pass 70 | return DLTPipeline( 71 | name="hello world", 72 | 
storage="123", 73 | language="PYTHON", 74 | configuration={}, 75 | notebook_path="scripts/spark_script_2.py", 76 | ) 77 | 78 | 79 | @wf.task() 80 | def task_function_nokwargs(): 81 | return "hello world" 82 | 83 | 84 | @wf.task(depends_on=task_function) 85 | def task_function_2(): 86 | return "hello world" 87 | 88 | 89 | @wf.task(depends_on="task_function_2") 90 | def task_function_3(): 91 | return "hello world" 92 | 93 | 94 | @wf.task(depends_on="task_function_3", trigger_rule=BrickflowTriggerRule.NONE_FAILED) 95 | def task_function_4(): 96 | return "hello world" 97 | 98 | 99 | @wf.task( 100 | task_type=TaskType.CUSTOM_PYTHON_TASK, 101 | trigger_rule=BrickflowTriggerRule.NONE_FAILED, 102 | custom_execute_callback=lambda x: TaskResponse(x.name, push_return_value=True), 103 | ) 104 | def custom_python_task_push(): 105 | pass 106 | 107 | 108 | @wf.run_job_task() 109 | def run_job_task(): 110 | return RunJobTask(job_name="foo", host="https://foo.cloud.databricks.com") 111 | -------------------------------------------------------------------------------- /examples/brickflow_task_injection_examples/config/injected_tasks.yaml: -------------------------------------------------------------------------------- 1 | # Task Injection Configuration for Demo Workflow 2 | # This file defines tasks that will be automatically injected into all workflows 3 | 4 | global: 5 | enabled: true 6 | default_libraries: 7 | - "requests>=2.28.0" 8 | 9 | tasks: 10 | # Task 1: Initialization (runs FIRST) 11 | - task_name: "initialization_task" 12 | enabled: true 13 | depends_on_strategy: "all_tasks" 14 | template_context: 15 | imports: 16 | - "import datetime" 17 | - "import socket" 18 | task_name: "initialization_task" 19 | message: "Initializing workflow environment before execution" 20 | params: 21 | timestamp: "{{datetime.datetime.now().isoformat()}}" 22 | hostname: "{{socket.gethostname()}}" 23 | action: "setup" 24 | task_type: "BRICKFLOW_TASK" 25 | 26 | # Task 2: Monitoring (runs AFTER task_1 and task_2) 27 | - task_name: "monitoring_task" 28 | enabled: true 29 | depends_on_strategy: "specific_tasks:task_1,task_2" 30 | template_file: "templates/custom_logger.py.j2" 31 | template_context: 32 | task_name: "monitoring_task" 33 | custom_message: "Monitoring workflow progress after initial tasks" 34 | steps: 35 | - "Check task_1 completion" 36 | - "Check task_2 completion" 37 | - "Validate data processing" 38 | - "Log metrics" 39 | task_type: "BRICKFLOW_TASK" 40 | 41 | # Task 3: Completion Logger (runs LAST) 42 | - task_name: "completion_logger" 43 | enabled: true 44 | depends_on_strategy: "leaf_nodes" # Default - runs after all leaf nodes 45 | template_context: 46 | imports: 47 | - "import datetime" 48 | task_name: "completion_logger" 49 | message: "Workflow execution completed successfully!" 
50 | params: 51 | completion_time: "{{datetime.datetime.now().isoformat()}}" 52 | status: "success" 53 | final: true 54 | task_type: "BRICKFLOW_TASK" 55 | 56 | # ================================================================= 57 | # DEPLOYMENT INSTRUCTIONS 58 | # ================================================================= 59 | # 60 | # To deploy with task injection: 61 | # export BRICKFLOW_INJECT_TASKS_CONFIG="config/injected_tasks.yaml" 62 | # brickflow projects deploy --project brickflow-task-injection-demo -e local 63 | # 64 | # To deploy without task injection: 65 | # brickflow projects deploy --project brickflow-task-injection-demo -e local 66 | # 67 | # ================================================================= 68 | # DEPENDENCY STRATEGIES 69 | # ================================================================= 70 | # 71 | # 1. all_tasks: Task runs first, all workflow tasks wait for it 72 | # Use for: Initialization, setup, prerequisite checks 73 | # 74 | # 2. leaf_nodes: Task runs after all leaf nodes (tasks with no dependents) 75 | # Use for: Cleanup, notifications, final logging 76 | # 77 | # 3. specific_tasks:task1,task2: Task runs after specified tasks 78 | # Use for: Monitoring, validation, checkpoints 79 | # 80 | 81 | 82 | -------------------------------------------------------------------------------- /brickflow/engine/utils.py: -------------------------------------------------------------------------------- 1 | import functools 2 | from typing import Callable, Type, List, Iterator, Union 3 | import pathlib 4 | import os 5 | 6 | from pydantic import SecretStr 7 | from databricks.sdk import WorkspaceClient 8 | 9 | from brickflow.context import ctx 10 | from brickflow.hints import propagate_hint 11 | 12 | 13 | @propagate_hint 14 | def wraps_keyerror(error_class: Type[Exception], msg: str) -> Callable: 15 | def wrapper(f: Callable) -> Callable: 16 | @functools.wraps(f) 17 | def func(*args, **kwargs): # type: ignore 18 | try: 19 | return f(*args, **kwargs) 20 | except KeyError as e: 21 | raise error_class( 22 | f"{msg}; err: {str(e)}; args: {args}; kwargs: {kwargs}" 23 | ) 24 | 25 | return func 26 | 27 | return wrapper 28 | 29 | 30 | def get_properties(some_obj: Type) -> List[str]: 31 | def _property_iter() -> Iterator[str]: 32 | for k, v in some_obj.__dict__.items(): 33 | if isinstance(v, property): 34 | yield k 35 | 36 | return list(_property_iter()) 37 | 38 | 39 | def get_job_id( 40 | job_name: str, host: Union[str, None] = None, token: Union[str, SecretStr] = None 41 | ) -> Union[float, None]: 42 | """ 43 | Get the job id from the specified Databricks workspace for a given job name. 
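    Raises a ValueError when no job with the given name is found; unexpected
    errors while iterating the job list are logged and None is returned instead.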
44 | 45 | Parameters 46 | ---------- 47 | job_name: str 48 | Job name (case-insensitive) 49 | host: str 50 | Databricks workspace URL 51 | token: str 52 | Databricks API token 53 | 54 | Returns 55 | ------- 56 | str 57 | Databricks job id 58 | """ 59 | ctx.log.info("Searching job id for job name: %s", job_name) 60 | 61 | if host: 62 | host = host.rstrip("/") 63 | token = token.get_secret_value() if isinstance(token, SecretStr) else token 64 | 65 | workspace_obj = WorkspaceClient(host=host, token=token) 66 | jobs_list = workspace_obj.jobs.list(name=job_name) 67 | 68 | try: 69 | for job in jobs_list: 70 | ctx.log.info("Job id for job '%s' is %s", job_name, job.job_id) 71 | return job.job_id 72 | else: # pylint: disable=useless-else-on-loop 73 | raise ValueError 74 | except ValueError: 75 | raise ValueError(f"No job found with name {job_name}") 76 | except Exception as e: 77 | ctx.log.info("An error occurred: %s", e) 78 | 79 | return None 80 | 81 | 82 | def get_bf_project_root() -> pathlib.Path: 83 | """Returns the root directory of the current Brickflow project 84 | 85 | Parameters: 86 | _file (str): file path where the function is called 87 | 88 | Returns: 89 | pathlib.Path: Brickflow project root directory 90 | """ 91 | try: 92 | _file_name = os.getcwd() 93 | _project_root = pathlib.Path(_file_name).resolve().parents[0] 94 | ctx.log.info("Setting Brickflow project root as %s", _project_root) 95 | return _project_root 96 | except Exception as e: 97 | ctx.log.info("An error occurred: %s", e) 98 | raise e 99 | -------------------------------------------------------------------------------- /brickflow_plugins/airflow/context/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | try: 4 | from airflow.models import BaseOperator 5 | from airflow.utils.context import Context 6 | except ImportError: 7 | raise ImportError( 8 | "You must install airflow to use airflow plugins, " 9 | "please try pip install brickflow[apache-airflow]" 10 | ) 11 | 12 | from pendulum import DateTime 13 | from brickflow.context import ctx, RETURN_VALUE_KEY 14 | from brickflow_plugins.airflow.cronhelper import cron_helper 15 | from brickflow_plugins.airflow.vendor.timetable import create_timetable 16 | from brickflow_plugins.airflow.vendor.timezone import TIMEZONE 17 | 18 | 19 | class CrossDagXComsNotSupportedError(Exception): 20 | pass 21 | 22 | 23 | class XComsPullMultipleTaskIdsError(Exception): 24 | pass 25 | 26 | 27 | class FakeTaskInstance(object): 28 | def __init__( 29 | self, 30 | task_id: str, 31 | operator: BaseOperator, 32 | execution_date: str, 33 | ): 34 | self._operator = operator 35 | self._execution_date = execution_date 36 | self._task_id = task_id 37 | 38 | def xcom_push(self, key, value): 39 | ctx.task_coms.put(task_id=self._task_id, key=key, value=value) 40 | 41 | def xcom_pull(self, task_ids, key=RETURN_VALUE_KEY, dag_id=None): 42 | if dag_id is not None: 43 | raise CrossDagXComsNotSupportedError( 44 | "Cross dag xcoms not supported in framework raise feature request." 45 | ) 46 | if isinstance(task_ids, list) and len(task_ids) > 1: 47 | raise XComsPullMultipleTaskIdsError( 48 | "Currently xcoms pull only supports one task_id please raise feature " 49 | "request." 
50 | ) 51 | task_id = task_ids[0] if isinstance(task_ids, list) else task_ids 52 | return ctx.task_coms.get(task_id, key) 53 | 54 | @property 55 | def execution_date(self): 56 | return self._execution_date 57 | 58 | @property 59 | def operator(self): 60 | return self._operator 61 | 62 | 63 | def execution_timestamp( 64 | quartz_cron_statement: Optional[str] = None, 65 | ts: Optional[DateTime] = None, 66 | tz=TIMEZONE, 67 | ) -> DateTime: 68 | if quartz_cron_statement is None: 69 | return DateTime.utcnow() 70 | if ts is None: 71 | ts = DateTime.utcnow() 72 | cron = cron_helper.quartz_to_unix(quartz_cron_statement) 73 | tt = create_timetable(cron, tz) 74 | return tt.align_to_prev(ts) 75 | 76 | 77 | def get_task_context( 78 | task_id, operator: BaseOperator, quartz_cron_statement, ts, tz=TIMEZONE 79 | ) -> Context: 80 | execution_ts = execution_timestamp(quartz_cron_statement, ts, tz) 81 | return Context( 82 | **{ 83 | "execution_date": str(execution_ts), 84 | "ds": execution_ts.strftime("%Y-%m-%d"), 85 | "ds_nodash": execution_ts.strftime("%Y%m%d"), 86 | "ts": str(execution_ts), 87 | "ts_nodash": execution_ts.strftime("%Y%m%d%H%M%S"), 88 | "ti": FakeTaskInstance(task_id, operator, str(execution_ts)), 89 | } 90 | ) 91 | -------------------------------------------------------------------------------- /examples/brickflow_examples/src/python/setup_data.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %fs ls dbfs:/databricks-datasets/samples/lending_club/parquet/ 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %sql 7 | # MAGIC SELECT 8 | # MAGIC addr_state, * 9 | # MAGIC FROM 10 | # MAGIC parquet.`dbfs:/databricks-datasets/samples/lending_club/parquet/` 11 | 12 | # COMMAND ---------- 13 | 14 | 15 | # -- ingest step 16 | catalog = "development" 17 | database = "team_databricks_sme" 18 | spark.sql( 19 | f""" 20 | CREATE TABLE IF NOT EXISTS {catalog}.{database}.lending_data 21 | USING DELTA -- this is default just for explicit purpose 22 | SELECT * FROM parquet.`dbfs:/databricks-datasets/samples/lending_club/parquet/` 23 | """ 24 | ) 25 | 26 | # COMMAND ---------- 27 | 28 | # Step 2 29 | catalog = "development" 30 | database = "team_databricks_sme" 31 | spark.sql( 32 | f""" 33 | OPTIMIZE {catalog}.{database}.lending_data; 34 | """ 35 | ) 36 | 37 | # COMMAND ---------- 38 | 39 | # MAGIC %sql 40 | # MAGIC SELECT distinct addr_state FROM development.team_databricks_sme.lending_data 41 | 42 | # COMMAND ---------- 43 | 44 | 45 | # -- T&S 1 process AZ data 46 | catalog = "development" 47 | database = "team_databricks_sme" 48 | spark.sql( 49 | f""" 50 | CREATE OR REPLACE TABLE {catalog}.{database}.lending_data_az_geo 51 | USING DELTA -- this is default just for explicit purpose 52 | SELECT * FROM {catalog}.{database}.lending_data where addr_state = 'AZ' 53 | """ 54 | ) 55 | 56 | # COMMAND ---------- 57 | 58 | # -- T&S 2 process CA data 59 | catalog = "development" 60 | database = "team_databricks_sme" 61 | spark.sql( 62 | f""" 63 | CREATE OR REPLACE TABLE {catalog}.{database}.lending_data_ca_geo 64 | USING DELTA -- this is default just for explicit purpose 65 | SELECT * FROM {catalog}.{database}.lending_data where addr_state = 'CA' 66 | """ 67 | ) 68 | 69 | # COMMAND ---------- 70 | 71 | # -- T&S 3 process IL data 72 | catalog = "development" 73 | database = "team_databricks_sme" 74 | spark.sql( 75 | f""" 76 | CREATE OR REPLACE TABLE {catalog}.{database}.lending_data_il_geo 77 | USING DELTA -- this is default just for explicit 
purpose 78 | SELECT * FROM {catalog}.{database}.lending_data where addr_state = 'IL' 79 | """ 80 | ) 81 | 82 | # COMMAND ---------- 83 | 84 | # -- Union Data Together 85 | catalog = "development" 86 | database = "team_databricks_sme" 87 | spark.sql( 88 | f""" 89 | CREATE OR REPLACE TABLE {catalog}.{database}.lending_data_az_ca_il_geo 90 | USING DELTA -- this is default just for explicit purpose 91 | SELECT * FROM {catalog}.{database}.lending_data_az_geo 92 | UNION ALL 93 | SELECT * FROM {catalog}.{database}.lending_data_ca_geo 94 | UNION ALL 95 | SELECT * FROM {catalog}.{database}.lending_data_il_geo 96 | """ 97 | ) 98 | 99 | # COMMAND ---------- 100 | 101 | # -- Union Data Together 102 | catalog = "development" 103 | database = "team_databricks_sme" 104 | spark.sql( 105 | f""" 106 | SELECT * FROM {catalog}.{database}.lending_data_az_ca_il_geo 107 | """ 108 | ).limit(10).toPandas().to_csv("data.csv") 109 | with open("data.csv", "r") as f: 110 | print(f.read()) 111 | 112 | # COMMAND ---------- 113 | -------------------------------------------------------------------------------- /brickflow/resolver/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import inspect 4 | import os 5 | import sys 6 | from pathlib import Path 7 | from typing import Union, Any, List, Optional 8 | import pathlib 9 | 10 | from brickflow import BrickflowProjectConstants, _ilog, ctx 11 | 12 | 13 | def add_to_sys_path(directory: Union[str, pathlib.Path]) -> None: 14 | dir_str = str(directory) 15 | if dir_str not in sys.path and os.path.isdir(dir_str): 16 | sys.path.append(dir_str) 17 | 18 | 19 | def get_caller_file_paths() -> List[str]: 20 | caller_file_paths = [] 21 | frames = inspect.stack()[1:] # Exclude the current frame 22 | 23 | for frame in frames: 24 | caller_file_paths.append(frame.filename) 25 | 26 | return list(set(caller_file_paths)) 27 | 28 | 29 | class BrickflowRootNotFound(Exception): 30 | pass 31 | 32 | 33 | def go_up_till_brickflow_root(cur_path: str) -> str: 34 | if cur_path.startswith("<"): 35 | raise BrickflowRootNotFound("Invalid brickflow root.") 36 | 37 | path = pathlib.Path(cur_path).resolve() 38 | 39 | valid_roots = [ 40 | f"{BrickflowProjectConstants.DEFAULT_MULTI_PROJECT_ROOT_FILE_NAME.value}." 41 | f"{BrickflowProjectConstants.DEFAULT_CONFIG_FILE_TYPE.value}", 42 | f"{BrickflowProjectConstants.DEFAULT_MULTI_PROJECT_CONFIG_FILE_NAME.value}." 43 | f"{BrickflowProjectConstants.DEFAULT_CONFIG_FILE_TYPE.value}", 44 | ] 45 | 46 | # recurse to see if there is a brickflow root and return the path 47 | while not path.is_dir() or not any( 48 | file.name in valid_roots for file in path.iterdir() 49 | ): 50 | path = path.parent 51 | 52 | if path == path.parent: 53 | raise BrickflowRootNotFound( 54 | "Brickflow root directory not found in path hierarchy."
55 | ) 56 | 57 | return str(path.resolve()) 58 | 59 | 60 | def get_relative_path_to_brickflow_root() -> None: 61 | paths = get_caller_file_paths() 62 | _ilog.info("Brickflow setting up python path resolution...") 63 | # if inside notebook also get that path 64 | notebook_path = get_notebook_ws_path(ctx.dbutils) 65 | if notebook_path is not None: 66 | paths.append(notebook_path) 67 | 68 | for path in paths: 69 | try: 70 | resolved_path = go_up_till_brickflow_root(path) 71 | _ilog.info("Brickflow root input path - %s", path) 72 | _ilog.info("Brickflow root found - %s", resolved_path) 73 | add_to_sys_path(resolved_path) 74 | _ilog.info("Sys path set to: %s", str(sys.path)) 75 | except BrickflowRootNotFound: 76 | _ilog.info("Unable to find for path: %s", path) 77 | except PermissionError: 78 | _ilog.info("Most likely not accessible due to shared cluster: %s", path) 79 | 80 | 81 | def get_notebook_ws_path(dbutils: Optional[Any]) -> Optional[str]: 82 | if dbutils is not None: 83 | return str( 84 | "/Workspace" 85 | / Path( 86 | dbutils.notebook.entry_point.getDbutils() 87 | .notebook() 88 | .getContext() 89 | .notebookPath() 90 | .get() 91 | .lstrip("/") 92 | ) 93 | ) 94 | return None 95 | -------------------------------------------------------------------------------- /.github/workflows/onpush.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | pull_request: 5 | types: [ opened, synchronize ] 6 | push: 7 | branches: [ main ] 8 | release: 9 | types: [ created ] 10 | 11 | jobs: 12 | test-pipeline: 13 | runs-on: ${{ matrix.os }} 14 | container: 15 | image: python:${{ matrix.python-version }} 16 | options: --user 1001 # run as the runner user instead of root 17 | strategy: 18 | max-parallel: 2 19 | matrix: 20 | python-version: [ '3.9', '3.10', '3.11', '3.12' ] 21 | os: [ ubuntu-latest ] 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | 26 | - name: Set up JDK # used for py4j for cronutils 27 | uses: actions/setup-java@v3 28 | with: 29 | java-version: '8' 30 | distribution: 'adopt' 31 | 32 | - name: Install pip 33 | run: python -m pip install --upgrade pip 34 | 35 | - name: Install and configure Poetry 36 | uses: snok/install-poetry@v1 37 | 38 | - name: Install poetry and build tools 39 | run: | 40 | export PATH=$PATH:$HOME/.local/bin 41 | poetry self add "poetry-dynamic-versioning[plugin]" 42 | 43 | - name: Install dependencies 44 | run: | 45 | export PATH=$PATH:$HOME/.local/bin 46 | make poetry 47 | 48 | - name: Install, lint and test 49 | run: | 50 | export PATH=$PATH:$HOME/.local/bin 51 | export GITHUB_ACTIONS=true 52 | make cov 53 | 54 | - name: Publish test coverage 55 | uses: codecov/codecov-action@v3 56 | with: 57 | token: ${{ secrets.CODECOV_TOKEN }} 58 | files: coverage.xml 59 | 60 | deploy: 61 | name: Deploy to PyPi 62 | runs-on: ${{ matrix.os }} 63 | container: 64 | image: python:${{ matrix.python-version }} 65 | options: --user 1001 # run as the runner user instead of root 66 | strategy: 67 | max-parallel: 2 68 | matrix: 69 | python-version: [ '3.9' ] 70 | os: [ ubuntu-latest ] 71 | needs: 72 | - test-pipeline 73 | if: github.event_name == 'release' 74 | steps: 75 | - uses: actions/checkout@v3 # use latest version of the checkout action 76 | 77 | - name: Set up JDK # used for py4j for cronutils 78 | uses: actions/setup-java@v3 79 | with: 80 | java-version: '8' 81 | distribution: 'adopt' 82 | 83 | - name: Install pip 84 | run: python -m pip install --upgrade pip 85 | 86 | - name: Install and configure Poetry 87 | 
uses: snok/install-poetry@v1 88 | 89 | - name: Install build tools 90 | run: | 91 | export PATH=$PATH:$HOME/.local/bin 92 | poetry self add "poetry-dynamic-versioning[plugin]" 93 | 94 | - name: Install dependencies 95 | run: | 96 | export PATH=$PATH:$HOME/.local/bin 97 | make poetry 98 | 99 | - name: Install wheel and twine 100 | run: python -m pip install wheel twine 101 | 102 | - name: Build and publish 103 | env: 104 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 105 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 106 | run: | 107 | export PATH=$PATH:$HOME/.local/bin 108 | make build 109 | twine upload dist/* 110 | -------------------------------------------------------------------------------- /tests/cli/test_cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import traceback 4 | from unittest.mock import patch, Mock 5 | 6 | import click 7 | from click.testing import CliRunner 8 | 9 | from brickflow import BrickflowProjectDeploymentSettings, BrickflowEnvVars 10 | from brickflow.cli import ( 11 | cli, 12 | exec_command, 13 | ) 14 | from brickflow.cli.bundles import ( 15 | bundle_download_path, 16 | download_and_unzip_databricks_cli, 17 | get_force_lock_flag, 18 | ) 19 | from brickflow.cli.projects import handle_libraries 20 | 21 | 22 | def fake_run(*_, **__): 23 | click.echo("hello world") 24 | 25 | 26 | # TODO: Add more tests to the cli 27 | class TestCli: 28 | def test_no_command_error(self): 29 | runner = CliRunner() 30 | non_existent_command = "non_existent_command" 31 | result = runner.invoke(cli, ["non_existent_command"]) # noqa 32 | assert result.exit_code == 2 33 | assert result.output.strip().endswith( 34 | f"Error: No such command '{non_existent_command}'." 35 | ) 36 | 37 | @patch("webbrowser.open") 38 | def test_docs(self, browser: Mock): 39 | runner = CliRunner() 40 | browser.return_value = None 41 | result = runner.invoke(cli, ["docs"]) # noqa 42 | assert result.exit_code == 0, traceback.print_exception(*result.exc_info) 43 | assert result.output.strip().startswith("Opening browser for docs...") 44 | browser.assert_called_once_with( 45 | "https://engineering.nike.com/brickflow/", new=2 46 | ) 47 | 48 | def test_force_arg(self): 49 | with patch.dict( 50 | os.environ, {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "0.203.0"} 51 | ): 52 | assert get_force_lock_flag() == "--force-lock" 53 | with patch.dict( 54 | os.environ, {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "auto"} 55 | ): 56 | assert get_force_lock_flag() == "--force-lock" 57 | with patch.dict( 58 | os.environ, 59 | {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "something else"}, 60 | ): 61 | assert get_force_lock_flag() == "--force-lock" 62 | with patch.dict( 63 | os.environ, {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "0.202.0"} 64 | ): 65 | assert get_force_lock_flag() == "--force" 66 | 67 | def test_install_cli(self): 68 | expected_version = "0.200.0" 69 | url = bundle_download_path(expected_version) 70 | file_path = download_and_unzip_databricks_cli(url, expected_version) 71 | assert url is not None 72 | version_value = exec_command(file_path, "--version", [], capture_output=True) 73 | assert ( 74 | version_value.strip() == f"Databricks CLI v{expected_version}" 75 | ), version_value 76 | directory_path = ".databricks" 77 | if os.path.exists(directory_path): 78 | shutil.rmtree(directory_path) 79 | 80 | def test_projects_handle_libraries(self): 81 | bpd = BrickflowProjectDeploymentSettings() 82 | 
bpd.brickflow_auto_add_libraries = None 83 | handle_libraries(skip_libraries=True) 84 | assert bpd.brickflow_auto_add_libraries is False 85 | handle_libraries(skip_libraries=False) 86 | assert bpd.brickflow_auto_add_libraries is True 87 | bpd.brickflow_auto_add_libraries = None 88 | -------------------------------------------------------------------------------- /brickflow_plugins/airflow/vendor/context.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import contextlib 4 | import copy 5 | from typing import MutableMapping, Any, Iterator, KeysView, ItemsView, ValuesView 6 | 7 | 8 | class Context(MutableMapping[str, Any]): 9 | """Jinja2 template context for task rendering. 10 | 11 | This is a mapping (dict-like) class that can lazily emit warnings when 12 | (and only when) deprecated context keys are accessed. 13 | """ 14 | 15 | _DEPRECATION_REPLACEMENTS: dict[str, list[str]] = { 16 | "execution_date": ["data_interval_start", "logical_date"], 17 | "next_ds": ["{{ data_interval_end | ds }}"], 18 | "next_ds_nodash": ["{{ data_interval_end | ds_nodash }}"], 19 | "next_execution_date": ["data_interval_end"], 20 | "prev_ds": [], 21 | "prev_ds_nodash": [], 22 | "prev_execution_date": [], 23 | "prev_execution_date_success": ["prev_data_interval_start_success"], 24 | "tomorrow_ds": [], 25 | "tomorrow_ds_nodash": [], 26 | "yesterday_ds": [], 27 | "yesterday_ds_nodash": [], 28 | } 29 | 30 | def __init__( 31 | self, context: MutableMapping[str, Any] | None = None, **kwargs: Any 32 | ) -> None: 33 | self._context: MutableMapping[str, Any] = context or {} 34 | if kwargs: 35 | self._context.update(kwargs) 36 | self._deprecation_replacements = self._DEPRECATION_REPLACEMENTS.copy() 37 | 38 | def __repr__(self) -> str: 39 | return repr(self._context) 40 | 41 | def __reduce_ex__(self, protocol: int) -> tuple[Any, ...]: 42 | """Pickle the context as a dict. 43 | 44 | We are intentionally going through ``__getitem__`` in this function, 45 | instead of using ``items()``, to trigger deprecation warnings. 
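        (Note: in this vendored copy the deprecation warning inside ``__getitem__``
        is commented out, so pickling simply round-trips the stored values without
        emitting any warnings.)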
46 | """ 47 | items = [(key, self[key]) for key in self._context] 48 | return dict, (items,) 49 | 50 | def __copy__(self) -> Context: 51 | new = type(self)(copy.copy(self._context)) 52 | new._deprecation_replacements = self._deprecation_replacements.copy() 53 | return new 54 | 55 | def __getitem__(self, key: str) -> Any: 56 | # with contextlib.suppress(KeyError): 57 | # warnings.warn(_create_deprecation_warning(key, self._deprecation_replacements[key])) 58 | with contextlib.suppress(KeyError): 59 | return self._context[key] 60 | raise KeyError(key) 61 | 62 | def __setitem__(self, key: str, value: Any) -> None: 63 | self._deprecation_replacements.pop(key, None) 64 | self._context[key] = value 65 | 66 | def __delitem__(self, key: str) -> None: 67 | self._deprecation_replacements.pop(key, None) 68 | del self._context[key] 69 | 70 | def __contains__(self, key: object) -> bool: 71 | return key in self._context 72 | 73 | def __iter__(self) -> Iterator[str]: 74 | return iter(self._context) 75 | 76 | def __len__(self) -> int: 77 | return len(self._context) 78 | 79 | def __eq__(self, other: Any) -> bool: 80 | if not isinstance(other, Context): 81 | return NotImplemented 82 | return self._context == other._context 83 | 84 | def __ne__(self, other: Any) -> bool: 85 | if not isinstance(other, Context): 86 | return NotImplemented 87 | return self._context != other._context 88 | 89 | def keys(self) -> KeysView[str]: 90 | return self._context.keys() 91 | 92 | def items(self): 93 | return ItemsView(self._context) 94 | 95 | def values(self): 96 | return ValuesView(self._context) 97 | -------------------------------------------------------------------------------- /examples/brickflow_for_each_task_examples/workflows/for_each_task_wf.py: -------------------------------------------------------------------------------- 1 | from brickflow import ( 2 | Workflow, 3 | WorkflowPermissions, 4 | User, 5 | NotebookTask, 6 | Cluster, 7 | JarTaskLibrary, 8 | SparkJarTask, 9 | SparkPythonTask, 10 | SqlTask, 11 | ) 12 | 13 | from brickflow.context import ctx 14 | from brickflow.engine.task import JobsTasksForEachTaskConfigs 15 | 16 | cluster = Cluster( 17 | name=f"job_cluster_for_each_task_examples", 18 | driver_node_type_id="r7g.large", 19 | node_type_id="r7g.large", 20 | spark_version="13.3.x-scala2.12", 21 | min_workers=1, 22 | max_workers=1, 23 | policy_id="", # replace with an existing policy id 24 | ) 25 | 26 | wf = Workflow( 27 | "for_each_task_examples_wf", 28 | default_cluster=cluster, 29 | permissions=WorkflowPermissions( 30 | can_manage=[ 31 | User( 32 | "" # replace email with existing users' email on databricks 33 | ) 34 | ], 35 | ), 36 | ) 37 | 38 | 39 | @wf.task 40 | def example_task(): 41 | print("This is a dependant task!") 42 | 43 | 44 | @wf.for_each_task( 45 | depends_on=example_task, 46 | for_each_task_conf=JobsTasksForEachTaskConfigs( 47 | # Inputs can be provided by either a python iterable or a json-string 48 | inputs=[ 49 | "AZ", 50 | "CA", 51 | "IL", 52 | ], 53 | concurrency=3, 54 | ), 55 | ) 56 | def example_notebook(): 57 | return NotebookTask( 58 | notebook_path="notebooks/example_notebook.py", 59 | base_parameters={"looped_parameter": "{{input}}"}, 60 | ) 61 | 62 | 63 | @wf.for_each_task( 64 | depends_on=example_task, 65 | for_each_task_conf=JobsTasksForEachTaskConfigs( 66 | inputs='["1", "2", "3"]', concurrency=3 67 | ), 68 | ) 69 | def example_brickflow_task(*, test_param="{{input}}"): 70 | print(f"Test param: {test_param}") 71 | param = ctx.get_parameter("looped_parameter") 72 | 
print(f"Nested brickflow task running with input: {param}") 73 | 74 | 75 | @wf.for_each_task( 76 | depends_on=example_task, 77 | libraries=[ 78 | JarTaskLibrary( 79 | jar="" 80 | ) # Replace with actual jar path 81 | ], 82 | for_each_task_conf=JobsTasksForEachTaskConfigs( 83 | inputs="[1,2,3]", 84 | concurrency=1, 85 | ), 86 | ) 87 | def for_each_spark_jar(): 88 | return SparkJarTask( 89 | main_class_name="com.example.MainClass", # Replace with actual main class name 90 | parameters=["{{input}}"], 91 | ) 92 | 93 | 94 | @wf.for_each_task( 95 | depends_on=example_task, 96 | for_each_task_conf=JobsTasksForEachTaskConfigs( 97 | inputs="[1,2,3]", 98 | concurrency=1, 99 | ), 100 | ) 101 | def for_each_spark_python(): 102 | return SparkPythonTask( 103 | python_file="examples/brickflow_for_each_task_examples/src/python/print_args.py", 104 | source="WORKSPACE", 105 | parameters=["{{input}}"], 106 | ) 107 | 108 | 109 | @wf.for_each_task( 110 | depends_on=example_notebook, 111 | for_each_task_conf=JobsTasksForEachTaskConfigs( 112 | inputs="[1,2,3]", 113 | concurrency=1, 114 | ), 115 | ) 116 | def for_each_sql_task() -> any: 117 | return SqlTask( 118 | query_id="", # Replace with actual query id 119 | warehouse_id="", # Replace with actual warehouse id 120 | parameters={"looped_parameter": "{{input}}"}, 121 | ) 122 | -------------------------------------------------------------------------------- /examples/brickflow_examples/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | 132 | # GENERATED BY BRICKFLOW CLI --START-- 133 | 134 | ### Terraform ### 135 | # Local .terraform directories 136 | **/.terraform/* 137 | 138 | # .tfstate files 139 | *.tfstate 140 | *.tfstate.* 141 | 142 | # Crash log files 143 | crash.log 144 | crash.*.log 145 | 146 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as 147 | # password, private keys, and other secrets. These should not be part of version 148 | # control as they are data points which are potentially sensitive and subject 149 | # to change depending on the environment. 150 | *.tfvars 151 | *.tfvars.json 152 | 153 | # Ignore override files as they are usually used to override resources locally and so 154 | # are not checked in 155 | override.tf 156 | override.tf.json 157 | *_override.tf 158 | *_override.tf.json 159 | 160 | # Include override files you do wish to add to version control using negated pattern 161 | # !example_override.tf 162 | 163 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 164 | # example: *tfplan* 165 | 166 | # Ignore CLI configuration files 167 | .terraformrc 168 | terraform.rc 169 | 170 | # GENERATED BY BRICKFLOW CLI --END-- 171 | 172 | .idea 173 | bundle.yml -------------------------------------------------------------------------------- /examples/brickflow_serverless_examples/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | 132 | # GENERATED BY BRICKFLOW CLI --START-- 133 | 134 | ### Terraform ### 135 | # Local .terraform directories 136 | **/.terraform/* 137 | 138 | # .tfstate files 139 | *.tfstate 140 | *.tfstate.* 141 | 142 | # Crash log files 143 | crash.log 144 | crash.*.log 145 | 146 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as 147 | # password, private keys, and other secrets. These should not be part of version 148 | # control as they are data points which are potentially sensitive and subject 149 | # to change depending on the environment. 
150 | *.tfvars 151 | *.tfvars.json 152 | 153 | # Ignore override files as they are usually used to override resources locally and so 154 | # are not checked in 155 | override.tf 156 | override.tf.json 157 | *_override.tf 158 | *_override.tf.json 159 | 160 | # Include override files you do wish to add to version control using negated pattern 161 | # !example_override.tf 162 | 163 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 164 | # example: *tfplan* 165 | 166 | # Ignore CLI configuration files 167 | .terraformrc 168 | terraform.rc 169 | 170 | # GENERATED BY BRICKFLOW CLI --END-- 171 | 172 | .idea 173 | bundle.yml -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "brickflows" 3 | version = "0.11.0a0" 4 | description = "Deploy scalable workflows to databricks using python" 5 | authors = ["Ashok Singamaneni, Sriharsha Tikkireddy"] 6 | readme = "README.md" 7 | license = "Apache License 2.0" 8 | homepage = "https://github.com/Nike-Inc/brickflow" 9 | repository = "https://github.com/Nike-Inc/brickflow" 10 | packages = [{ include = "brickflow" }, { include = "brickflow_plugins" }] 11 | include = ["LICENSE", "entrypoint.template", "gitignore_template.txt"] 12 | exclude = ["sample_workflows", "tests"] 13 | 14 | [tool.black] 15 | line-length = 88 16 | target-version = ['py39', 'py310'] 17 | include = '\.pyi?$' 18 | extend-exclude = ''' 19 | /( 20 | # The following are specific to Black, you probably don't want those. 21 | | brickflow/tf 22 | | venv 23 | | brickflow.egg-info 24 | | dist 25 | | brickflow/bundles 26 | )/ 27 | ''' 28 | 29 | [tool.poetry.dependencies] 30 | python = ">=3.9,<3.13" # pyspark <3.5 does not play happy with python 3.11. The latest DBRs Runtime (17.3) ships with Python 3.12. 
31 | Jinja2 = ">=3.1.5" 32 | click = "^8.1.3" 33 | databricks-sdk = ">=0.1.8 <1.0.0" 34 | networkx = "3.1" 35 | pendulum = "3.1.0" 36 | pluggy = "^1.0.0" 37 | pydantic = ">=2.0.0 <3.0.0" 38 | python-decouple = "3.8" 39 | pyyaml = "^6.0" 40 | requests = ">=2.28.2 <3.0.0" 41 | # cerberus-python-client = {version = "~2.5.4", optional = true } # Users might have to manually install cerberus-python-client if required 42 | # tableauserverclient = {version = "~0.25", optional = true } # Users might have to manually install tableauserverclient if required 43 | 44 | 45 | [tool.poetry.scripts] 46 | bf = "brickflow.cli:cli" 47 | brickflow = "brickflow.cli:cli" 48 | 49 | [tool.poetry.group.dev.dependencies] 50 | black = "^24.3.0" 51 | coverage = "^7.2.5" 52 | datamodel-code-generator = "^0.25.2" 53 | deepdiff = "^6.3.0" 54 | mypy = "^1.3.0" 55 | pre-commit = "^3.3.1" 56 | prospector = "^1.10.3" 57 | py4j = "^0.10.9.7" 58 | pytest = ">=7.3.1 <8.0.0" 59 | pytest-mock = "^3.10.0" 60 | types-PyYAML = "*" # only for development purposes no need to make installation req 61 | types-requests = ">=2.28.11.16 <3.0.0.0" # only for development purposes no need to make installation req 62 | apache-airflow = "^2.7.3" 63 | snowflake = "^1.5.0" 64 | tableauserverclient = "^0.25" 65 | boxsdk = "^3.9.2" 66 | cerberus-python-client = "^2.5.4" 67 | watchdog = "<4.0.0" 68 | requests-mock = "1.12.1" 69 | pyspark = "^3.0.0" 70 | apache-airflow-providers-fab = ">=1.5.2" 71 | 72 | [tool.poetry.group.docs.dependencies] 73 | mdx-include = "^1.4.2" 74 | mike = "^2.1.3" 75 | mkdocs-click = "^0.8.1" 76 | mkdocs-material = "^9.5.49" 77 | mkdocstrings = { extras = ["python"], version = "^0.27.0" } 78 | 79 | [build-system] 80 | requires = ["poetry-core", "poetry-dynamic-versioning"] 81 | build-backend = "poetry_dynamic_versioning.backend" 82 | 83 | [tool.poetry-dynamic-versioning] 84 | enable = true 85 | vcs = "git" 86 | bump = true 87 | style = "semver" 88 | 89 | [tool.coverage] 90 | [tool.coverage.run] 91 | omit = [ 92 | # omit anything in a .local directory anywhere 93 | '*/.local/*', 94 | '**', 95 | 'tests/*', 96 | '*/tests/*', 97 | # omit anything in a .venv directory anywhere 98 | '.venv/*', 99 | "*/site-packages/*", 100 | ] 101 | 102 | [tool.coverage.report] 103 | skip_empty = true 104 | 105 | [tool.mypy] 106 | disallow_untyped_defs = true 107 | ignore_missing_imports = true 108 | files = [ 109 | "brickflow/context/*.py", 110 | "brickflow/cli/*.py", 111 | "brickflow/hints/*.py", 112 | "brickflow/engine/*.py", 113 | "brickflow/resolver/*.py", 114 | "brickflow/codegen/*.py", 115 | ] 116 | follow_imports = "skip" 117 | 118 | [tool.pylint.main] 119 | fail-under = 9.0 120 | 121 | 122 | [tool.pylint."messages control"] 123 | disable = ["too-many-lines", "too-many-positional-arguments"] 124 | -------------------------------------------------------------------------------- /examples/brickflow_examples/README.md: -------------------------------------------------------------------------------- 1 | # brickflow-examples 2 | This repository consists of examples for brickflow 3 | 4 | ## Getting Started 5 | 6 | ### Prerequisites 7 | 1.Install brickflows 8 | 9 | ```shell 10 | pip install brickflows 11 | ``` 12 | 13 | 2.Install [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/databricks-cli.html) 14 | 15 | ```shell 16 | curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sudo sh 17 | ``` 18 | 19 | 3.Configure Databricks cli with workspace token. This configures your `~/.databrickscfg` file. 
20 | 21 | ```shell 22 | databricks configure --token 23 | ``` 24 | 25 | ### Clone the repository 26 | 27 | ```shell 28 | git clone https://github.com/Nike-Inc/brickflow.git 29 | cd brickflow/examples/brickflow_examples 30 | ``` 31 | 32 | ### Hello World workflow 33 | - Create your first workflow using brickflow 34 | - Create a new file hello_world_workflow.py in the workflows directory 35 | - Add the following code to the file 36 | ```python 37 | from brickflow import ( 38 | Cluster, 39 | Workflow, 40 | NotebookTask, 41 | ) 42 | from brickflow.context import ctx 43 | from airflow.operators.bash import BashOperator 44 | 45 | 46 | cluster = Cluster( 47 | name="job_cluster", 48 | node_type_id="m6gd.xlarge", 49 | spark_version="13.3.x-scala2.12", 50 | min_workers=1, 51 | max_workers=2, 52 | ) 53 | 54 | wf = Workflow( 55 | "hello_world_workflow", 56 | default_cluster=cluster, 57 | tags={ 58 | "product_id": "brickflow_demo", 59 | }, 60 | common_task_parameters={ 61 | "catalog": "", 62 | "database": "", 63 | }, 64 | ) 65 | 66 | @wf.task 67 | # this task does nothing but explains the use of context object 68 | def start(): 69 | print(f"Environment: {ctx.env}") 70 | 71 | @wf.notebook_task 72 | # this task runs a databricks notebook 73 | def example_notebook(): 74 | return NotebookTask( 75 | notebook_path="notebooks/example_notebook.py", 76 | base_parameters={ 77 | "some_parameter": "some_value", # in the notebook access these via dbutils.widgets.get("some_parameter") 78 | }, 79 | ) 80 | 81 | 82 | @wf.task(depends_on=[start, example_notebook]) 83 | # this task runs a bash command 84 | def list_lending_club_data_files(): 85 | return BashOperator( 86 | task_id=list_lending_club_data_files.__name__, 87 | bash_command="ls -lrt /dbfs/databricks-datasets/samples/lending_club/parquet/", 88 | ) 89 | 90 | @wf.task(depends_on=list_lending_club_data_files) 91 | # this task runs the pyspark code 92 | def lending_data_ingest(): 93 | ctx.spark.sql( 94 | f""" 95 | CREATE TABLE IF NOT EXISTS 96 | {ctx.dbutils_widget_get_or_else(key="catalog", debug="development")}.\ 97 | {ctx.dbutils_widget_get_or_else(key="database", debug="dummy_database")}.\ 98 | {ctx.dbutils_widget_get_or_else(key="brickflow_env", debug="local")}_lending_data_ingest 99 | USING DELTA -- this is default just for explicit purpose 100 | SELECT * FROM parquet.`dbfs:/databricks-datasets/samples/lending_club/parquet/` 101 | """ 102 | ) 103 | ``` 104 | _Note: Modify the values of catalog/database for common_task_parameters._ 105 | 106 | ### Update demo_wf.py 107 | - demo_wf.py explains the various tasks and options available for the tasks 108 | - You can remove the demo_wf.py in case you just to run the hello_world_workflow.py 109 | - In case you want to run the demo_wf.py, update the below params with your values 110 | - default_cluster 111 | - common_task_parameters 112 | - permissions 113 | - default_task_settings 114 | 115 | ### Deploy the workflow to databricks 116 | ```shell 117 | brickflow projects deploy --project brickflow-demo -e local 118 | ``` 119 | 120 | ### Run the demo workflow 121 | - login to databricks workspace 122 | - go to the workflows and select the workflow 123 | ![img.png](../../docs/img/workflow.png) 124 | - click on the run button 125 | -------------------------------------------------------------------------------- /examples/brickflow_examples/workflows/pattern_matching_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example workflow demonstrating pattern matching in 
task dependencies. 3 | 4 | Pattern matching allows you to specify dependencies using glob patterns 5 | instead of explicitly listing each task. This is particularly useful for 6 | workflows with many similar tasks. 7 | """ 8 | 9 | from brickflow import ( 10 | Cluster, 11 | TaskSettings, 12 | TaskRunCondition, 13 | Workflow, 14 | ) 15 | 16 | wf = Workflow( 17 | "pattern_matching_demo", 18 | default_cluster=Cluster.from_existing_cluster(""), 19 | ) 20 | 21 | 22 | # Example 1: Basic Pattern Matching 23 | # Define multiple extract tasks 24 | @wf.task() 25 | def extract_customers(): 26 | print("Extracting customer data...") 27 | 28 | 29 | @wf.task() 30 | def extract_orders(): 31 | print("Extracting order data...") 32 | 33 | 34 | @wf.task() 35 | def extract_products(): 36 | print("Extracting product data...") 37 | 38 | 39 | @wf.task(depends_on="extract_*") # Matches all tasks starting with "extract_" 40 | def validate_extracts(): 41 | """This task depends on ALL extract_* tasks automatically.""" 42 | print("Validating all extracted data...") 43 | 44 | 45 | # Example 2: Multiple Stages with Patterns 46 | @wf.task(depends_on=validate_extracts) 47 | def transform_customer_data(): 48 | print("Transforming customer data...") 49 | 50 | 51 | @wf.task(depends_on=validate_extracts) 52 | def transform_order_data(): 53 | print("Transforming order data...") 54 | 55 | 56 | @wf.task(depends_on=validate_extracts) 57 | def transform_product_data(): 58 | print("Transforming product data...") 59 | 60 | 61 | @wf.task(depends_on="transform_*") # Matches all transform tasks 62 | def aggregate_data(): 63 | """Aggregates data from all transform tasks.""" 64 | print("Aggregating all transformed data...") 65 | 66 | 67 | # Example 3: Advanced Pattern - Character Sets 68 | @wf.task(depends_on=aggregate_data) 69 | def load_to_region_a(): 70 | print("Loading to region A...") 71 | 72 | 73 | @wf.task(depends_on=aggregate_data) 74 | def load_to_region_b(): 75 | print("Loading to region B...") 76 | 77 | 78 | @wf.task(depends_on=aggregate_data) 79 | def load_to_region_c(): 80 | print("Loading to region C...") 81 | 82 | 83 | @wf.task(depends_on=aggregate_data) 84 | def load_to_region_x(): 85 | print("Loading to region X...") 86 | 87 | 88 | # Only depends on regions a, b, c (not x) 89 | @wf.task(depends_on="load_to_region_[abc]") 90 | def verify_abc_regions(): 91 | """Pattern matches only regions a, b, and c.""" 92 | print("Verifying regions A, B, and C...") 93 | 94 | 95 | # Example 4: Multiple Patterns in a List 96 | @wf.task(depends_on=["extract_*", "transform_*"]) 97 | def comprehensive_validation(): 98 | """Depends on all extract AND transform tasks.""" 99 | print("Running comprehensive validation...") 100 | 101 | 102 | # Example 5: Mixing Patterns with Explicit Dependencies 103 | @wf.task() 104 | def setup_config(): 105 | print("Setting up configuration...") 106 | 107 | 108 | @wf.task(depends_on=[setup_config, "transform_*"]) 109 | def export_results(): 110 | """Depends on setup_config AND all transform tasks.""" 111 | print("Exporting results...") 112 | 113 | 114 | # Example 6: Pattern with Conditional Run 115 | @wf.task( 116 | depends_on="load_*", 117 | task_settings=TaskSettings(run_if=TaskRunCondition.AT_LEAST_ONE_SUCCESS), 118 | ) 119 | def send_notification(): 120 | """Runs if at least one load task succeeds.""" 121 | print("Sending notification...") 122 | 123 | 124 | # Example 7: Complex Pattern 125 | @wf.task(depends_on=aggregate_data) 126 | def analyze_customer_segment(): 127 | print("Analyzing customer segments...") 
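# Note on the glob syntax used below: it is the same shell-style matching shown
# earlier in this file ("*" wildcards and "[abc]" character sets). The "*customer*"
# pattern used by customer_report should therefore match every task whose name
# contains "customer", e.g. extract_customers, transform_customer_data and
# analyze_customer_segment.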
128 | 129 | 130 | @wf.task(depends_on=aggregate_data) 131 | def analyze_order_trends(): 132 | print("Analyzing order trends...") 133 | 134 | 135 | # Matches any task containing "customer" or "order" 136 | @wf.task(depends_on="*customer*") 137 | def customer_report(): 138 | """Pattern matches tasks containing 'customer' in the name.""" 139 | print("Generating customer report...") 140 | -------------------------------------------------------------------------------- /brickflow_plugins/airflow/operators/native_operators.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | import tempfile 5 | import time 6 | import types 7 | from typing import Optional 8 | 9 | from airflow.operators.bash import BashOperator 10 | from airflow.operators.python import BranchPythonOperator, ShortCircuitOperator 11 | 12 | from brickflow.context import BRANCH_SKIP_EXCEPT, SKIP_EXCEPT_HACK 13 | from brickflow.engine.task import Task 14 | from brickflow.engine.workflow import Workflow 15 | from brickflow_plugins import log 16 | from brickflow_plugins.airflow.operators import OperatorModifier, check_if 17 | 18 | 19 | def _bash_execute(self, context): # pylint:disable=unused-argument 20 | p = None 21 | returncode = None 22 | start = time.time() 23 | env = self.env 24 | if env is None: 25 | env = os.environ.copy() 26 | 27 | # log.info("Command: %s", self.bash_command) 28 | 29 | with tempfile.TemporaryDirectory(prefix="airflowtmp") as tmp_dir: 30 | try: 31 | p = subprocess.Popen( # pylint:disable=consider-using-with 32 | self.bash_command, 33 | shell=True, 34 | cwd=tmp_dir, 35 | executable="/bin/bash", 36 | stderr=subprocess.STDOUT, 37 | stdout=subprocess.PIPE, 38 | universal_newlines=True, 39 | env=env, 40 | ) 41 | for line in iter(p.stdout.readline, ""): 42 | resp = line 43 | log.info("[STDOUT]: %s", line.rstrip()) 44 | returncode = p.wait() 45 | p = None 46 | sys.stdout.flush() 47 | if returncode != 0: 48 | raise subprocess.CalledProcessError(returncode, self.bash_command) 49 | finally: 50 | end = time.time() 51 | if p is not None: 52 | p.terminate() 53 | p.wait() 54 | log.info("Command: exited with return code %s", returncode) 55 | log.info("Command took %s seconds", end - start) 56 | 57 | if self.do_xcom_push is True: 58 | return resp[:-1] # skip newline char at end 59 | return 60 | 61 | 62 | def _bash_empty_on_kill(self): # pylint:disable=unused-argument 63 | pass 64 | 65 | 66 | def _skip_all_except( 67 | self, ti: "FakeTaskInstance", branch_task_ids 68 | ): # pylint:disable=unused-argument 69 | log.info("Skipping all tasks except: %s", branch_task_ids) 70 | ti.xcom_push(BRANCH_SKIP_EXCEPT, branch_task_ids) 71 | 72 | 73 | def _short_circuit_execute(self, context): 74 | condition = super(ShortCircuitOperator, self).execute(context) 75 | log.info("Condition result is %s", condition) 76 | 77 | if condition: 78 | log.info("Proceeding with downstream tasks...") 79 | return 80 | 81 | # log 82 | log.info("Skipping downstream tasks...") 83 | ti = context["ti"] 84 | ti.xcom_push(BRANCH_SKIP_EXCEPT, SKIP_EXCEPT_HACK) 85 | 86 | 87 | class BashOperatorModifier(OperatorModifier): 88 | @check_if(BashOperator) 89 | def modify( 90 | self, operator: BashOperator, task: Task, workflow: Workflow 91 | ) -> Optional["BashOperator"]: 92 | f = types.MethodType(_bash_execute, operator) 93 | operator.execute = f 94 | operator.on_kill = _bash_empty_on_kill 95 | return operator 96 | 97 | 98 | class BranchPythonOperatorModifier(OperatorModifier): 99 | 
@check_if(BranchPythonOperator) 100 | def modify( 101 | self, operator: BranchPythonOperator, task: Task, workflow: Workflow 102 | ) -> Optional["BranchPythonOperator"]: 103 | f = types.MethodType(_skip_all_except, operator) 104 | operator.skip_all_except = f 105 | return operator 106 | 107 | 108 | class ShortCircuitOperatorModifier(OperatorModifier): 109 | @check_if(ShortCircuitOperator) 110 | def modify( 111 | self, operator: ShortCircuitOperator, task: Task, workflow: Workflow 112 | ) -> Optional["ShortCircuitOperator"]: 113 | f = types.MethodType(_short_circuit_execute, operator) 114 | operator.execute = f 115 | return operator 116 | -------------------------------------------------------------------------------- /tests/codegen/expected_bundles/local_serverless_bundle.yml: -------------------------------------------------------------------------------- 1 | "bundle": 2 | "name": "test-project" 3 | "targets": 4 | "test-project-local": 5 | "resources": 6 | "jobs": 7 | "brickflow-serverless-demo": 8 | "continuous": null 9 | "email_notifications": null 10 | "environments": 11 | - "environment_key": "Default" 12 | "spec": 13 | "client": "1" 14 | "dependencies": 15 | - "pytz==2024.2" 16 | "job_clusters": [] 17 | "max_concurrent_runs": 1.0 18 | "name": "test_user_brickflow-serverless-demo" 19 | "notification_settings": null 20 | "parameters": null 21 | "permissions": null 22 | "schedule": 23 | "pause_status": "PAUSED" 24 | "quartz_cron_expression": "0 0/20 0 ? * * *" 25 | "timezone_id": "UTC" 26 | "tags": 27 | "brickflow_deployment_mode": "Databricks Asset Bundles" 28 | "brickflow_project_name": "test-project" 29 | "brickflow_version": "1.0.0" 30 | "deployed_at": "1704067200000" 31 | "deployed_by": "test_user" 32 | "environment": "local" 33 | "tasks": 34 | - "depends_on": [] 35 | "email_notifications": {} 36 | "webhook_notifications": {} 37 | "max_retries": null 38 | "min_retry_interval_millis": null 39 | "notebook_task": 40 | "base_parameters": 41 | "brickflow_env": "local" 42 | "brickflow_internal_only_run_tasks": "" 43 | "brickflow_internal_task_name": "{{task_key}}" 44 | "brickflow_internal_workflow_name": "brickflow-serverless-demo" 45 | "brickflow_internal_workflow_prefix": "" 46 | "brickflow_internal_workflow_suffix": "" 47 | "brickflow_job_id": "{{job_id}}" 48 | "brickflow_parent_run_id": "{{parent_run_id}}" 49 | "brickflow_run_id": "{{run_id}}" 50 | "brickflow_start_date": "{{start_date}}" 51 | "brickflow_start_time": "{{start_time}}" 52 | "brickflow_task_key": "{{task_key}}" 53 | "brickflow_task_retry_count": "{{task_retry_count}}" 54 | "notebook_path": "test_databricks_bundle.py" 55 | "source": "WORKSPACE" 56 | "retry_on_timeout": null 57 | "task_key": "entrypoint_task" 58 | "timeout_seconds": null 59 | - "depends_on": [] 60 | "email_notifications": {} 61 | "webhook_notifications": {} 62 | "max_retries": null 63 | "min_retry_interval_millis": null 64 | "notebook_task": 65 | "base_parameters": 66 | "some_parameter": "some_value" 67 | "notebook_path": "notebooks/example_notebook.py" 68 | "retry_on_timeout": null 69 | "task_key": "notebook_task" 70 | "timeout_seconds": null 71 | - "depends_on": [] 72 | "email_notifications": {} 73 | "webhook_notifications": {} 74 | "environment_key": "Default" 75 | "max_retries": null 76 | "min_retry_interval_millis": null 77 | "retry_on_timeout": null 78 | "spark_python_task": 79 | "parameters": 80 | - "--timezone" 81 | - "UTC" 82 | "python_file": 
"/Workspace/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local/files/spark/python/src/run_task.py" 83 | "source": "WORKSPACE" 84 | "task_key": "spark_python_task" 85 | "timeout_seconds": null 86 | "timeout_seconds": null 87 | "trigger": null 88 | "webhook_notifications": null 89 | "pipelines": {} 90 | "workspace": 91 | "file_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local/files" 92 | "root_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local" 93 | "state_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local/state" 94 | "workspace": {} 95 | -------------------------------------------------------------------------------- /examples/brickflow_task_injection_examples/workflows/demo_workflow.py: -------------------------------------------------------------------------------- 1 | """ 2 | Demo Workflow - Task Injection Example 3 | 4 | This workflow demonstrates automatic task injection. 5 | When deployed with BRICKFLOW_INJECT_TASKS_CONFIG set, additional tasks 6 | will be automatically added based on the configuration. 7 | 8 | Original workflow structure: 9 | task_1 -> task_2 -> task_3 10 | 11 | After injection (see config/injected_tasks.yaml): 12 | initialization_task (runs first) 13 | ↓ 14 | task_1 15 | ↓ 16 | task_2 17 | ↓ 18 | monitoring_task (runs after task_1 and task_2) 19 | ↓ 20 | task_3 21 | ↓ 22 | completion_logger (runs last) 23 | """ 24 | 25 | from brickflow import Workflow, ctx 26 | from brickflow.engine.task import PypiTaskLibrary 27 | 28 | 29 | # Create the workflow 30 | wf = Workflow( 31 | "brickflow-task-injection-demo", 32 | schedule_quartz_expression="0 0/30 * ? * * *", # Every 30 minutes 33 | libraries=[ 34 | PypiTaskLibrary(package="pytz==2024.2"), 35 | ], 36 | tags={ 37 | "example": "task_injection", 38 | "feature": "auto_injection", 39 | }, 40 | ) 41 | 42 | 43 | @wf.task 44 | def task_1(): 45 | """ 46 | First task in the workflow. 47 | 48 | After injection, this will run after the initialization_task. 49 | """ 50 | print("=" * 70) 51 | print("Executing task_1") 52 | print("=" * 70) 53 | 54 | # Simulate some work 55 | import time 56 | 57 | print("Processing data in task_1...") 58 | time.sleep(1) 59 | 60 | # Get workflow parameters 61 | env = ctx.get_parameter("brickflow_env", "local") 62 | print(f"Running in environment: {env}") 63 | 64 | result = { 65 | "task": "task_1", 66 | "status": "completed", 67 | "records_processed": 1000, 68 | } 69 | 70 | print(f"Task 1 result: {result}") 71 | print("=" * 70) 72 | return result 73 | 74 | 75 | @wf.task(depends_on=task_1) 76 | def task_2(): 77 | """ 78 | Second task in the workflow. 79 | 80 | Depends on task_1. After injection, monitoring_task will run after this. 
81 | """ 82 | print("=" * 70) 83 | print("Executing task_2") 84 | print("=" * 70) 85 | 86 | # Simulate some work 87 | import time 88 | 89 | print("Processing data in task_2...") 90 | time.sleep(1) 91 | 92 | # Get result from previous task 93 | task_1_result = ctx.task_coms.get("task_1", "result") 94 | print(f"Received from task_1: {task_1_result}") 95 | 96 | result = { 97 | "task": "task_2", 98 | "status": "completed", 99 | "records_processed": 2000, 100 | } 101 | 102 | print(f"Task 2 result: {result}") 103 | print("=" * 70) 104 | 105 | # Store result for next task 106 | ctx.task_coms.put("task_2", "result", result) 107 | 108 | return result 109 | 110 | 111 | @wf.task(depends_on=task_2) 112 | def task_3(): 113 | """ 114 | Third task in the workflow. 115 | 116 | This is a leaf node, so completion_logger will run after this. 117 | """ 118 | print("=" * 70) 119 | print("Executing task_3") 120 | print("=" * 70) 121 | 122 | # Simulate some work 123 | import time 124 | 125 | print("Processing data in task_3...") 126 | time.sleep(1) 127 | 128 | # Get result from previous task 129 | task_2_result = ctx.task_coms.get("task_2", "result") 130 | print(f"Received from task_2: {task_2_result}") 131 | 132 | result = { 133 | "task": "task_3", 134 | "status": "completed", 135 | "records_processed": 3000, 136 | "final": True, 137 | } 138 | 139 | print(f"Task 3 result: {result}") 140 | print("=" * 70) 141 | 142 | return result 143 | 144 | 145 | # Note: When deployed with task injection enabled, additional tasks will be 146 | # automatically added to this workflow based on config/injected_tasks.yaml 147 | # 148 | # To deploy with task injection: 149 | # export BRICKFLOW_INJECT_TASKS_CONFIG="config/injected_tasks.yaml" 150 | # brickflow projects deploy --project brickflow-task-injection-demo -e local 151 | # 152 | # To deploy without task injection: 153 | # brickflow projects deploy --project brickflow-task-injection-demo -e local 154 | -------------------------------------------------------------------------------- /tests/codegen/expected_bundles/local_bundle_continuous_schedule.yml: -------------------------------------------------------------------------------- 1 | "bundle": 2 | "name": "test-project" 3 | "targets": 4 | "test-project-local": 5 | "resources": 6 | "jobs": 7 | "wf-test-2": 8 | "continuous": 9 | "pause_status": "PAUSED" 10 | "email_notifications": null 11 | "git_source": null 12 | "health": 13 | "rules": 14 | - "metric": "RUN_DURATION_SECONDS" 15 | "op": "GREATER_THAN" 16 | "value": 7200.0 17 | "job_clusters": 18 | - "job_cluster_key": "sample_job_cluster" 19 | "new_cluster": 20 | "aws_attributes": null 21 | "custom_tags": 22 | "brickflow_deployment_mode": "Databricks Asset Bundles" 23 | "brickflow_project_name": "test-project" 24 | "brickflow_version": "1.0.0" 25 | "deployed_at": "1704067200000" 26 | "deployed_by": "test_user" 27 | "environment": "local" 28 | "data_security_mode": "SINGLE_USER" 29 | "driver_instance_pool_id": null 30 | "driver_node_type_id": null 31 | "enable_elastic_disk": null 32 | "init_scripts": null 33 | "instance_pool_id": null 34 | "node_type_id": "m6gd.xlarge" 35 | "num_workers": 1.0 36 | "policy_id": null 37 | "runtime_engine": null 38 | "spark_conf": null 39 | "spark_env_vars": null 40 | "spark_version": "13.3.x-scala2.12" 41 | "max_concurrent_runs": 1.0 42 | "name": "test_user_wf-test-2" 43 | "notification_settings": null 44 | "permissions": 45 | - "level": "IS_OWNER" 46 | "user_name": "abc@abc.com" 47 | - "level": "CAN_MANAGE" 48 | "user_name": "abc@abc.com" 49 | - 
"level": "CAN_MANAGE_RUN" 50 | "user_name": "abc@abc.com" 51 | - "level": "CAN_VIEW" 52 | "user_name": "abc@abc.com" 53 | "run_as": 54 | "user_name": "abc@abc.com" 55 | "schedule": null 56 | "tags": 57 | "brickflow_deployment_mode": "Databricks Asset Bundles" 58 | "brickflow_project_name": "test-project" 59 | "brickflow_version": "1.0.0" 60 | "deployed_at": "1704067200000" 61 | "deployed_by": "test_user" 62 | "environment": "local" 63 | "test": "test2" 64 | "tasks": 65 | - "depends_on": [] 66 | "email_notifications": {} 67 | "webhook_notifications": {} 68 | "job_cluster_key": "sample_job_cluster" 69 | "libraries": [] 70 | "max_retries": null 71 | "min_retry_interval_millis": null 72 | "notebook_task": 73 | "base_parameters": 74 | "all_tasks1": "test" 75 | "all_tasks3": "123" 76 | "brickflow_env": "local" 77 | "brickflow_internal_only_run_tasks": "" 78 | "brickflow_internal_task_name": "{{task_key}}" 79 | "brickflow_internal_workflow_name": "wf-test-2" 80 | "brickflow_internal_workflow_prefix": "" 81 | "brickflow_internal_workflow_suffix": "" 82 | "brickflow_job_id": "{{job_id}}" 83 | "brickflow_parent_run_id": "{{parent_run_id}}" 84 | "brickflow_run_id": "{{run_id}}" 85 | "brickflow_start_date": "{{start_date}}" 86 | "brickflow_start_time": "{{start_time}}" 87 | "brickflow_task_key": "{{task_key}}" 88 | "brickflow_task_retry_count": "{{task_retry_count}}" 89 | "test": "var" 90 | "notebook_path": "test_databricks_bundle.py" 91 | "source": "WORKSPACE" 92 | "retry_on_timeout": null 93 | "task_key": "task_function2" 94 | "timeout_seconds": null 95 | "timeout_seconds": null 96 | "trigger": null 97 | "webhook_notifications": null 98 | "pipelines": {} 99 | "workspace": 100 | "file_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local/files" 101 | "root_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local" 102 | "state_path": "/Users/${workspace.current_user.userName}/.brickflow_bundles/test-project/local/state" 103 | "workspace": {} -------------------------------------------------------------------------------- /tests/engine/test_compute.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from brickflow.engine.compute import Cluster 4 | 5 | 6 | class TestCompute: 7 | def test_autoscale(self): 8 | workers = 1234 9 | cluster = Cluster( 10 | "name", "spark_version", "vm-node", min_workers=workers, max_workers=workers 11 | ) 12 | assert cluster.autoscale() == { 13 | "autoscale": { 14 | "min_workers": workers, 15 | "max_workers": workers, 16 | } 17 | } 18 | 19 | cluster = Cluster("name", "spark_version", "vm-node") 20 | assert not cluster.autoscale() 21 | 22 | def test_job_task_field(self): 23 | cluster = Cluster.from_existing_cluster("existing_cluster_id") 24 | assert cluster.job_task_field_dict == { 25 | "existing_cluster_id": "existing_cluster_id" 26 | } 27 | cluster = Cluster("name", "spark_version", "vm-node") 28 | assert cluster.job_task_field_dict == {"job_cluster_key": "name"} 29 | 30 | def test_dict(self): 31 | cluster = Cluster.from_existing_cluster("existing_cluster_id") 32 | assert "existing_cluster_id" not in cluster.as_dict() 33 | 34 | def test_valid_cluster(self): 35 | with pytest.raises(AssertionError): 36 | Cluster( 37 | "some_name", "some_version", "some_vm", min_workers=8, max_workers=4 38 | ) 39 | 40 | with pytest.raises(AssertionError): 41 | Cluster( 42 | "some_name", 43 | "some_version", 44 | "some_vm", 45 | num_workers=3, 46 | min_workers=2, 47 | 
max_workers=4, 48 | ) 49 | 50 | with pytest.raises(AssertionError): 51 | Cluster("some_name", "some_version", "some_vm", max_workers=4) 52 | 53 | def test_node_type_or_instance_pool(self): 54 | assert ( 55 | Cluster( 56 | "some_name", 57 | "some_version", 58 | node_type_id="some_vm", 59 | driver_node_type_id="other_vm", 60 | ).node_type_id 61 | == "some_vm" 62 | ) 63 | assert ( 64 | Cluster( 65 | "some_name", "some_version", instance_pool_id="some_instance_pool_id" 66 | ).instance_pool_id 67 | == "some_instance_pool_id" 68 | ) 69 | with pytest.raises( 70 | AssertionError, match="Must specify either instance_pool_id or node_type_id" 71 | ): 72 | Cluster( 73 | "some_name", 74 | "some_version", 75 | ) 76 | 77 | with pytest.raises( 78 | AssertionError, 79 | match="Cannot specify instance_pool_id if node_type_id has been specified", 80 | ): 81 | Cluster( 82 | "some_name", 83 | "some_version", 84 | node_type_id="some_vm", 85 | instance_pool_id="1234", 86 | ) 87 | with pytest.raises( 88 | AssertionError, 89 | match=( 90 | "Cannot specify driver_node_type_id if instance_pool_id" 91 | " or driver_instance_pool_id has been specified" 92 | ), 93 | ): 94 | Cluster( 95 | "some_name", 96 | "some_version", 97 | driver_node_type_id="other_vm", 98 | instance_pool_id="1234", 99 | ) 100 | with pytest.raises( 101 | AssertionError, 102 | match=( 103 | "Cannot specify driver_node_type_id if instance_pool_id" 104 | " or driver_instance_pool_id has been specified" 105 | ), 106 | ): 107 | Cluster( 108 | "some_name", 109 | "some_version", 110 | node_type_id="some_vm", 111 | driver_node_type_id="other_vm", 112 | driver_instance_pool_id="1234", 113 | ) 114 | with pytest.raises( 115 | AssertionError, 116 | match=( 117 | "Cannot specify driver_node_type_id if instance_pool_id" 118 | " or driver_instance_pool_id has been specified" 119 | ), 120 | ): 121 | Cluster( 122 | "some_name", 123 | "some_version", 124 | driver_node_type_id="other_vm", 125 | instance_pool_id="1234", 126 | driver_instance_pool_id="12345", 127 | ) 128 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | There are a few guidelines that we need contributors to follow so that we are able to process requests as efficiently as possible. If you have any questions or concerns please feel free to contact us at [opensource@nike.com](mailto:opensource@nike.com). 4 | 5 | ## Getting Started 6 | 7 | * Review our [Code of Conduct](https://github.com/Nike-Inc/nike-inc.github.io/blob/master/CONDUCT.md) 8 | * Submit the [Individual Contributor License Agreement](https://www.clahub.com/agreements/Nike-Inc/fastbreak) 9 | * Make sure you have a [GitHub account](https://github.com/signup/free) 10 | * Submit a ticket for your issue, assuming one does not already exist. 11 | * Clearly describe the issue including steps to reproduce when it is a bug. 12 | * Make sure you fill in the earliest version that you know has the issue. 13 | * Fork the repository on GitHub 14 | 15 | ## Making Changes 16 | 17 | * Create a feature branch off of `main` before you start your work. 18 | * Please avoid working directly on the `main` branch. 19 | * Setup the required package manager [poetry](#-package-manager) 20 | * Setup the dev environment [see below](#-dev-environment-setup) 21 | * Make commits of logical units. 22 | * You may be asked to squash unnecessary commits down to logical units. 
23 | * Check for unnecessary whitespace with `git diff --check` before committing.
24 | * Write meaningful, descriptive commit messages.
25 | * Please follow the existing code conventions when working on a file.
26 | * Make sure to check the code against our standards [see below](#-linting-and-standards).
27 | * Install Java 11, since it is required by the unit tests run via `make test`.
28 | * Make sure to test the code before you push changes [see below](#-testing).
29 | 
30 | ## 🤝 Submitting Changes
31 | 
32 | * Push your changes to a topic branch in your fork of the repository.
33 | * Submit a pull request to the repository in the Nike-Inc organization.
34 | * After feedback has been given we expect responses within two weeks. After two weeks we may close the pull request
35 |   if it isn't showing any activity.
36 | * Bug fixes or features that lack appropriate tests may not be considered for merge.
37 | * Changes that lower test coverage may not be considered for merge.
38 | 
39 | ### 📦 Package manager
40 | 
41 | We use `make` for managing different steps of setup and maintenance in the project. You can install make by following
42 | the instructions [here](https://formulae.brew.sh/formula/make).
43 | 
44 | We use `poetry` as our package manager.
45 | 
46 | Please DO NOT use pip or conda to install the dependencies. Instead, use poetry:
47 | 
48 | ```bash
49 | make poetry-install
50 | ```
51 | 
52 | ### 📌 Dev Environment Setup
53 | 
54 | To make sure your environment meets our standards, install the required packages:
55 | 
56 | ```bash
57 | make dev
58 | ```
59 | 
60 | ### 🧹 Linting and Standards
61 | 
62 | We use `pylint`, `black` and `mypy` to maintain standards in the codebase.
63 | 
64 | ```bash
65 | make check
66 | ```
67 | 
68 | Make sure that the linter does not report any errors or warnings before submitting a pull request.
69 | 
70 | ### 🧪 Testing
71 | 
72 | We use `pytest` to test our code. You can run the tests with the following command:
73 | 
74 | ```bash
75 | make test
76 | ```
77 | 
78 | #### 🧪 Integration Testing
79 | * Once you add a feature or a bug fix in brickflow, build a wheel (whl) file from your feature branch:
80 |     * Run `poetry build` to generate the whl under the `dist` folder.
81 | * Install brickflow from the whl file:
82 |     * `pip install <path to the whl file>`
83 | * Upload the whl file to your Databricks workspace:
84 |     * Databricks Workspace --> Add --> Library
85 | * Copy the path of the uploaded whl file and reference it in `entrypoint.py` as a wheel library,
86 |   for example:
87 |     * `libraries=[WheelTaskLibrary("dbfs:/FileStore/jars/dummy.whl")]`
88 | 
89 | * Create a workflow and deploy it to make sure the feature or bug fix works as expected.
90 | 
91 | Make sure that all tests pass before submitting a pull request.
92 | 
93 | ## 🚀 Release Process
94 | 
95 | At the moment, the release process is manual. We try to make frequent releases. Usually, we release a new version when we have a new feature or bugfix. A developer with admin rights to the repository will create a new release on GitHub, and then publish the new version to PyPI.
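For maintainers, the publish step is roughly as follows (a sketch only; the exact versioning and credential handling may vary per release):

```bash
# Build the source distribution and wheel into dist/
poetry build

# Upload the built artifacts to PyPI (requires maintainer credentials)
poetry publish
```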
96 | 97 | # Additional Resources 98 | 99 | * [General GitHub documentation](https://help.github.com/) 100 | * [GitHub pull request documentation](https://help.github.com/send-pull-requests/) 101 | * [Nike's Code of Conduct](https://github.com/Nike-Inc/nike-inc.github.io/blob/master/CONDUCT.md) 102 | * [Nike's Individual Contributor License Agreement](https://www.clahub.com/agreements/Nike-Inc/fastbreak) 103 | * [Nike OSS](https://nike-inc.github.io/) -------------------------------------------------------------------------------- /tests/databricks_plugins/test_workflow_task_dependency_sensor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datetime import timedelta 3 | 4 | import pytest 5 | from requests_mock.mocker import Mocker as RequestsMocker 6 | 7 | from brickflow_plugins.databricks.workflow_dependency_sensor import ( 8 | WorkflowDependencySensorTimeOutException, 9 | WorkflowTaskDependencySensor, 10 | ) 11 | 12 | 13 | class TestWorkflowTaskDependencySensor: 14 | workspace_url = "https://42.cloud.databricks.com" 15 | endpoint_url = f"{workspace_url}/api/.*/jobs/runs/list" 16 | response = { 17 | "runs": [ 18 | { 19 | "job_id": 1, 20 | "run_id": 1, 21 | "start_time": 1704063600000, 22 | "state": { 23 | "result_state": "SUCCESS", 24 | }, 25 | "tasks": [ 26 | { 27 | "run_id": 100, 28 | "task_key": "foo", 29 | "state": { 30 | "result_state": "SUCCESS", 31 | }, 32 | }, 33 | { 34 | "run_id": 200, 35 | "task_key": "bar", 36 | "state": { 37 | "result_state": "FAILED", 38 | }, 39 | }, 40 | { 41 | "run_id": 300, 42 | "task_key": "baz", 43 | "state": {}, 44 | }, 45 | ], 46 | } 47 | ] 48 | } 49 | 50 | @pytest.fixture(autouse=True) 51 | def mock_get_execution_start_time_unix_milliseconds(self, mocker): 52 | mocker.patch.object( 53 | WorkflowTaskDependencySensor, 54 | "get_execution_start_time_unix_milliseconds", 55 | return_value=1704063600000, 56 | ) 57 | 58 | @pytest.fixture(autouse=True) 59 | def mock_get_job_id(self, mocker): 60 | mocker.patch( 61 | "brickflow_plugins.databricks.workflow_dependency_sensor.get_job_id", 62 | return_value=1, 63 | ) 64 | 65 | @pytest.fixture(autouse=True, name="api") 66 | def mock_api(self): 67 | rm = RequestsMocker() 68 | rm.get(re.compile(self.endpoint_url), json=self.response, status_code=int(200)) 69 | yield rm 70 | 71 | def test_sensor_success(self, caplog, api): 72 | with api: 73 | sensor = WorkflowTaskDependencySensor( 74 | databricks_host=self.workspace_url, 75 | databricks_token="token", 76 | dependency_job_name="job", 77 | dependency_task_name="foo", 78 | delta=timedelta(seconds=1), 79 | timeout_seconds=1, 80 | poke_interval_seconds=1, 81 | ) 82 | 83 | sensor.execute() 84 | 85 | assert ( 86 | "Found the run_id '1' and 'foo' task with state: SUCCESS" in caplog.text 87 | ) 88 | assert "Found a successful run: 1" in caplog.text 89 | 90 | def test_sensor_failure(self, caplog, api): 91 | with api: 92 | sensor = WorkflowTaskDependencySensor( 93 | databricks_host=self.workspace_url, 94 | databricks_token="token", 95 | dependency_job_name="job", 96 | dependency_task_name="bar", 97 | delta=timedelta(seconds=1), 98 | timeout_seconds=1, 99 | poke_interval_seconds=1, 100 | ) 101 | 102 | with pytest.raises(WorkflowDependencySensorTimeOutException): 103 | sensor.execute() 104 | 105 | assert ( 106 | "Found the run_id '1' and 'bar' task with state: FAILED" 107 | in caplog.messages 108 | ) 109 | assert "Didn't find a successful task run yet..." 
in caplog.messages 110 | 111 | def test_sensor_no_state(self, caplog, api): 112 | with api: 113 | sensor = WorkflowTaskDependencySensor( 114 | databricks_host=self.workspace_url, 115 | databricks_token="token", 116 | dependency_job_name="job", 117 | dependency_task_name="baz", 118 | delta=timedelta(seconds=1), 119 | timeout_seconds=1, 120 | poke_interval_seconds=1, 121 | ) 122 | 123 | with pytest.raises(WorkflowDependencySensorTimeOutException): 124 | sensor.execute() 125 | 126 | assert ( 127 | "Found the run_id '1' and 'baz' but the task has not started yet..." 128 | in caplog.messages 129 | ) 130 | assert "Didn't find a successful task run yet..." in caplog.messages 131 | -------------------------------------------------------------------------------- /docs/bundles-quickstart.md: -------------------------------------------------------------------------------- 1 | # BrickFlow v1.3.1 Quickstart Guide 2 | 3 | This guide will help you get started with BrickFlow v1.3.1, walking you through project setup and deployment. 4 | 5 | ## Prerequisites 6 | 7 | 1. Local environment setup: 8 | - Python >= 3.8 9 | - Databricks CLI configured with access token 10 | - BrickFlow CLI 11 | 12 | ### Installation Steps 13 | 14 | 1. Install Databricks CLI and configure it: 15 | ```bash 16 | pip install databricks-cli 17 | databricks configure -t 18 | ``` 19 | 20 | 2. Install BrickFlow CLI: 21 | ```bash 22 | pip install brickflows 23 | ``` 24 | 25 | 3. Verify your installation: 26 | ```bash 27 | bf --help 28 | databricks workspace list / # Add --profile if using specific profile 29 | ``` 30 | 31 | ## Creating Your First Project 32 | 33 | 1. Navigate to your repository root (where `.git` folder is located) 34 | 35 | 2. Initialize a new BrickFlow project: 36 | ```bash 37 | bf projects add 38 | ``` 39 | 40 | 3. Follow the prompts: 41 | - Project Name: Enter your desired project name 42 | - Path from repo root to project root: Press Enter for default (`.`) or specify path 43 | - Path from project root to workflows dir: Enter the directory for your workflows 44 | - Git https url: Enter your repository URL 45 | - Brickflow version: Enter `1.3.1` (or press Enter for `auto`) 46 | - Spark expectations version: Press Enter for default (`0.8.0`) 47 | - Skip entrypoint: Choose `N` unless you have a specific reason to skip 48 | 49 | 4. Update your `.gitignore` file: 50 | ``` 51 | **/bundle.yml 52 | .databricks/ 53 | ``` 54 | 55 | ## Project Structure 56 | 57 | Your project will follow either a monorepo or polyrepo style: 58 | 59 | ### Monorepo Structure Example: 60 | ``` 61 | repo-root/ 62 | ├── .git 63 | ├── projects/ 64 | │ ├── project_abc/ 65 | │ │ ├── lib/ 66 | │ │ │ ├── __init__.py 67 | │ │ │ └── shared_functions.py 68 | │ │ ├── workflows/ 69 | │ │ │ ├── __init__.py 70 | │ │ │ ├── entrypoint.py 71 | │ │ │ └── workflow_abc.py 72 | │ │ └── .brickflow-project-root.yml 73 | ``` 74 | 75 | ### Polyrepo Structure Example: 76 | ``` 77 | repo-root/ 78 | ├── .git 79 | ├── src/ 80 | │ ├── lib/ 81 | │ │ ├── __init__.py 82 | │ │ └── shared_functions.py 83 | │ ├── workflows/ 84 | │ │ ├── __init__.py 85 | │ │ ├── entrypoint.py 86 | │ │ └── workflow.py 87 | ├── .brickflow-project-root.yml 88 | ``` 89 | 90 | ## Validating Your Project 91 | 92 | 1. Synthesize your project configuration: 93 | ```bash 94 | bf projects synth --project --profile 95 | ``` 96 | 97 | 2. 
Verify the output shows: 98 | ``` 99 | SUCCESSFULLY SYNTHESIZED BUNDLE.YML FOR PROJECT: 100 | ``` 101 | 102 | ## Deploying Your Project 103 | 104 | ### Development Deployment 105 | ```bash 106 | bf projects deploy --project -p --force-acquire-lock 107 | ``` 108 | 109 | ### Environment-Specific Deployments 110 | ```bash 111 | # Dev environment 112 | bf projects deploy --project -p -e dev --force-acquire-lock 113 | 114 | # Test environment 115 | bf projects deploy --project -p -e test --force-acquire-lock 116 | 117 | # Production environment 118 | bf projects deploy --project -p -e prod --force-acquire-lock 119 | ``` 120 | 121 | ### Release Candidate Deployments 122 | For testing specific versions or pull requests: 123 | 124 | ```bash 125 | # Deploy RC version 126 | BRICKFLOW_WORKFLOW_SUFFIX="1.3.1-rc1" bf projects deploy --project -p -e test --force-acquire-lock 127 | 128 | # Deploy PR version 129 | BRICKFLOW_WORKFLOW_SUFFIX="1.3.1-pr34" bf projects deploy --project -p -e test --force-acquire-lock 130 | ``` 131 | 132 | ## Cleaning Up 133 | 134 | ### Destroying Deployments 135 | ```bash 136 | # Destroy main deployment 137 | bf projects destroy --project -p --force-acquire-lock 138 | 139 | # Destroy RC deployment 140 | BRICKFLOW_WORKFLOW_SUFFIX="1.3.1-rc1" bf projects destroy --project -p -e test --force-acquire-lock 141 | 142 | # Destroy PR deployment 143 | BRICKFLOW_WORKFLOW_SUFFIX="1.3.1-pr34" bf projects destroy --project -p -e test --force-acquire-lock 144 | ``` 145 | 146 | ## Troubleshooting 147 | 148 | 1. If synthesis fails: 149 | - Verify you're in the repository root directory 150 | - Check that all paths in configuration files are correct 151 | - Ensure all required __init__.py files exist 152 | 153 | 2. If deployment fails: 154 | - Verify Databricks CLI configuration 155 | - Check permissions in your Databricks workspace 156 | - Verify environment variables are set correctly 157 | 158 | ## Next Steps 159 | 160 | After successful deployment: 161 | 1. Monitor your workflows in the Databricks workspace 162 | 2. Set up CI/CD pipelines for automated deployments 163 | 3. Configure environment-specific variables 164 | 4. Set up monitoring and alerting -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: BrickFlow 2 | site_description: Brickflow is a tool for managing and deploying scalable workflows on Databricks. 
3 | site_url: https://brickflow.readthedocs.io/en/latest/ 4 | 5 | theme: 6 | name: material 7 | palette: 8 | - scheme: default 9 | primary: indigo 10 | accent: indigo 11 | toggle: 12 | icon: material/brightness-7 13 | name: Switch to dark mode 14 | - scheme: slate 15 | primary: indigo 16 | accent: indigo 17 | toggle: 18 | icon: material/brightness-4 19 | name: Switch to light mode 20 | features: 21 | # - announce.dismiss 22 | - content.code.annotate 23 | # - content.tabs.link 24 | - content.tooltips 25 | - content.code.copy 26 | # - header.autohide 27 | # - navigation.expand 28 | - navigation.indexes 29 | - navigation.instant 30 | # - navigation.prune 31 | # - navigation.sections 32 | - navigation.tabs 33 | - navigation.tabs.sticky 34 | - navigation.top 35 | - navigation.tracking 36 | - navigation.expand 37 | - search.highlight 38 | - search.share 39 | - search.suggest 40 | - toc.follow 41 | font: 42 | text: Roboto 43 | code: Roboto Mono 44 | logo: img/bf_logo.png 45 | favicon: img/bf_logo.png 46 | language: en 47 | 48 | repo_name: nike/brickflow 49 | repo_url: https://github.com/Nike-Inc/brickflow 50 | 51 | plugins: 52 | - search: 53 | lang: en 54 | - mkdocstrings: 55 | handlers: 56 | python: 57 | paths: [ "brickflow" ] # search packages in the src folder 58 | options: 59 | show_source: true 60 | show_root_heading: false 61 | heading_level: 1 62 | merge_init_into_class: true 63 | show_if_no_docstring: true 64 | show_root_full_path: true 65 | show_root_members_full_path: true 66 | show_root_toc_entry: false 67 | show_category_heading: true 68 | show_signature_annotations: true 69 | separate_signature: false 70 | 71 | markdown_extensions: 72 | - abbr 73 | - admonition 74 | - mkdocs-click 75 | - attr_list 76 | - def_list 77 | - footnotes 78 | - md_in_html 79 | - toc: 80 | permalink: true 81 | - pymdownx.arithmatex: 82 | generic: true 83 | - pymdownx.betterem: 84 | smart_enable: all 85 | - pymdownx.caret 86 | - pymdownx.details 87 | - pymdownx.emoji: 88 | emoji_generator: !!python/name:materialx.emoji.to_svg 89 | emoji_index: !!python/name:materialx.emoji.twemoji 90 | - pymdownx.highlight: 91 | anchor_linenums: true 92 | - pymdownx.inlinehilite 93 | - pymdownx.keys 94 | - pymdownx.magiclink: 95 | repo_url_shorthand: true 96 | user: squidfunk 97 | repo: mkdocs-material 98 | - pymdownx.mark 99 | - pymdownx.smartsymbols 100 | - pymdownx.superfences: 101 | custom_fences: 102 | - name: mermaid 103 | class: mermaid 104 | format: !!python/name:pymdownx.superfences.fence_code_format 105 | - pymdownx.tabbed: 106 | alternate_style: true 107 | - pymdownx.tasklist: 108 | custom_checkbox: true 109 | - pymdownx.tilde 110 | 111 | watch: 112 | - brickflow 113 | - brickflow_plugins 114 | 115 | extra_css: 116 | - css/custom.css 117 | 118 | nav: 119 | - Home: index.md 120 | - Quickstart: 121 | - Brickflow Projects: bundles-quickstart.md 122 | - Upgrading Versions: 123 | - Upgrading to v0.10.x: upgrades/upgrade-pre-0-10-0-to-0-10-0.md 124 | - Concepts: 125 | - HighLevel: highlevel.md 126 | - Workflows: workflows.md 127 | - Tasks: tasks.md 128 | - Projects: projects.md 129 | - ENV Variables: environment-variables.md 130 | - Importing Modules: how-imports-work.md 131 | - FAQ: faq/faq.md 132 | - CLI: 133 | - Commands: cli/reference.md 134 | - Python API: 135 | - Engine: 136 | - Project: api/project.md 137 | - Workflow: api/workflow.md 138 | - Compute: api/compute.md 139 | - Task: api/task.md 140 | - Context: api/context.md 141 | - CLI: api/cli.md 142 | - Brickflow Plugins: 143 | - AirflowTaskDependencySensor: 
api/airflow_external_task_dependency.md 144 | - AirflowNativeOperators: api/airflow_native_operators.md 145 | - WorkflowDependencySensor: api/workflow_dependency_sensor.md 146 | - SnowflakeOperator: api/uc_to_snowflake_operator.md 147 | - UcToSnowflakeOperator: api/uc_to_snowflake_operator.md 148 | - Secrets: api/secrets.md 149 | - TableauRefreshDataSourceOperator: api/airflow_tableau_operators.md 150 | - TableauRefreshWorkbookOperator: api/airflow_tableau_operators.md 151 | - BoxToVolumeOperator: api/box_operator.md 152 | - VolumeToBoxOperator: api/box_operator.md 153 | - BoxOperator: api/box_operator.md 154 | 155 | 156 | extra: 157 | generator: false 158 | version: 159 | provider: mike 160 | default: latest -------------------------------------------------------------------------------- /tests/test_plugins.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from pathlib import Path 3 | from typing import List 4 | from unittest import mock 5 | 6 | import pluggy 7 | import pytest 8 | import toml 9 | 10 | from brickflow.engine.task import get_brickflow_tasks_hook, get_plugin_manager 11 | 12 | 13 | def assert_plugin_manager( 14 | pm: pluggy.PluginManager, expected_plugins: List[str] 15 | ) -> None: 16 | num_expected_plugins = len(expected_plugins) 17 | assert ( 18 | len(pm.get_plugins()) == num_expected_plugins 19 | ), f"import error should only {num_expected_plugins} plugins" 20 | for plugin in expected_plugins: 21 | assert pm.has_plugin(plugin), f"plugin manager should have {plugin} plugin" 22 | 23 | all_plugins = set([pm.get_name(plugin_impl) for plugin_impl in pm.get_plugins()]) 24 | assert all_plugins == set(expected_plugins), ( 25 | f"plugin manager should have {expected_plugins} " f"plugins and nothing more" 26 | ) 27 | 28 | 29 | class TestBrickflowPlugins: 30 | def test_plugins_installed(self): 31 | pm = copy.deepcopy(get_plugin_manager()) 32 | get_brickflow_tasks_hook(pm) 33 | assert_plugin_manager(pm, ["airflow-plugin", "default"]) 34 | 35 | def test_plugins_load_plugins_import_error(self): 36 | with mock.patch("brickflow_plugins.load_plugins") as load_plugins_mock: 37 | load_plugins_mock.side_effect = ImportError 38 | pm = copy.deepcopy(get_plugin_manager()) 39 | get_brickflow_tasks_hook(pm) 40 | assert_plugin_manager(pm, ["default"]) 41 | 42 | def test_plugins_ensure_installation_import_error(self): 43 | with mock.patch("brickflow_plugins.ensure_installation") as load_plugins_mock: 44 | load_plugins_mock.side_effect = ImportError 45 | pm = copy.deepcopy(get_plugin_manager()) 46 | get_brickflow_tasks_hook(pm) 47 | assert_plugin_manager(pm, ["default"]) 48 | 49 | @pytest.mark.parametrize( 50 | "quartz_cron, expected_unix_cron", 51 | [ 52 | ("0 * * ? * * *", "* * * * *"), 53 | ("0 */5 * ? * * *", "*/5 * * * *"), 54 | ("0 30 * ? * * *", "30 * * * *"), 55 | ("0 0 12 ? * * *", "0 12 * * *"), 56 | ("0 0 12 ? * 2 *", "0 12 * * 1"), 57 | ("0 0 0 10 * ? *", "0 0 10 * *"), 58 | ("0 0 0 1 1 ? *", "0 0 1 1 *"), 59 | ("0 0/5 14,18 * * ?", "0/5 14,18 * * *"), 60 | ("0 0 12 ? * 1,2,5-7 *", "0 12 * * 0,1,4-6"), 61 | ("0 0 12 ? 
* SUN,MON,THU-SAT *", "0 12 * * SUN,MON,THU-SAT"), 62 | ], 63 | ) 64 | def test_cron_conversion(self, quartz_cron, expected_unix_cron): 65 | import brickflow_plugins.airflow.cronhelper as cronhelper # noqa 66 | 67 | converted_unix_cron = cronhelper.cron_helper.quartz_to_unix(quartz_cron) 68 | converted_quartz_cron = cronhelper.cron_helper.unix_to_quartz( 69 | converted_unix_cron 70 | ) 71 | converted_unix_cron_second = cronhelper.cron_helper.quartz_to_unix( 72 | converted_quartz_cron 73 | ) 74 | 75 | assert ( 76 | converted_unix_cron == converted_unix_cron_second 77 | ), "cron conversion should be idempotent" 78 | assert converted_unix_cron == expected_unix_cron 79 | 80 | @pytest.mark.parametrize( 81 | "quartz_cron", 82 | [ 83 | "0 0 12 ? * L *", 84 | "0 0 12 ? * 1L *", 85 | "0 0 12 ? * 1W *", 86 | "0 0 12 ? * 1#5 *", 87 | ], 88 | ) 89 | def test_unsupported_cron_expressions(self, quartz_cron): 90 | import brickflow_plugins.airflow.cronhelper as cronhelper # noqa 91 | 92 | with pytest.raises(ValueError): 93 | cronhelper.cron_helper.quartz_to_unix(quartz_cron) 94 | 95 | def test_plugins_dependency_versions(self): 96 | from brickflow import BrickflowProjectDeploymentSettings 97 | from brickflow.engine.task import get_brickflow_libraries 98 | 99 | settings = BrickflowProjectDeploymentSettings() 100 | settings.brickflow_project_runtime_version = "1.0.0" 101 | 102 | # List of libraries resolved from (dev)dependencies 103 | expected_libs = {} 104 | with open( 105 | str(Path(__file__).parent.parent / "poetry.lock"), "r", encoding="utf-8" 106 | ) as f: 107 | data = toml.load(f) 108 | for lib in data.get("package", []): 109 | expected_libs[lib.get("name")] = lib.get("version") 110 | 111 | # Libraries used for plugins expected to be available in the dev environment 112 | # and should match the versions in poetry.lock to ensure consistency 113 | for lib in get_brickflow_libraries(enable_plugins=True): 114 | name, version = lib.package.split("==") 115 | if name != "brickflows": 116 | assert name in expected_libs 117 | assert ( 118 | version == expected_libs[name] 119 | ), f"Version mismatch for {name}: expected {expected_libs[name]}, got {version}" 120 | -------------------------------------------------------------------------------- /brickflow_plugins/airflow/cronhelper.py: -------------------------------------------------------------------------------- 1 | import re 2 | import functools 3 | 4 | from brickflow_plugins import log 5 | 6 | 7 | class CronHelper: 8 | EVERY_X_UNITS_REPLACE_PLACEHOLDER = "%s" 9 | QUARTZ_EVERY_X_UNITS_REGEX = re.compile(r"^0/(\d+)$") # For handling 0/5 units 10 | UNIX_EVERY_X_UNITS_REGEX = re.compile(r"^\*/(\d+)$") # For handling */5 units 11 | QUARTZ_EVERY_X_UNITS_REPLACE_PATTERN = f"0/{EVERY_X_UNITS_REPLACE_PLACEHOLDER}" 12 | UNIX_EVERY_X_UNITS_REPLACE_PATTERN = f"*/{EVERY_X_UNITS_REPLACE_PLACEHOLDER}" 13 | 14 | @staticmethod 15 | def __get_expression_parts(expression: str) -> list: 16 | parts = [part.strip() for part in expression.split(" ")] 17 | 18 | # Unix cron expression have 5 parts, Quartz cron expression have 6 or 7 parts 19 | if len(parts) in [5, 7]: 20 | return parts 21 | # Year is an optional part in Quartz cron expression, adding the extra element to mimic 7 part Quartz expression 22 | if len(parts) == 6: 23 | parts.append("*") 24 | return parts 25 | 26 | raise ValueError("Invalid cron expression!") 27 | 28 | @staticmethod 29 | def convert_interval_parts(part: str, is_quartz: bool = False) -> str: 30 | every_x_units_pattern = ( 31 | 
CronHelper.QUARTZ_EVERY_X_UNITS_REGEX 32 | if is_quartz 33 | else CronHelper.UNIX_EVERY_X_UNITS_REGEX 34 | ) 35 | matches = every_x_units_pattern.match(part) 36 | every_x_units_replace_pattern = ( 37 | CronHelper.QUARTZ_EVERY_X_UNITS_REPLACE_PATTERN 38 | if is_quartz 39 | else CronHelper.UNIX_EVERY_X_UNITS_REPLACE_PATTERN 40 | ) 41 | 42 | if matches: 43 | return every_x_units_replace_pattern.replace( 44 | CronHelper.EVERY_X_UNITS_REPLACE_PLACEHOLDER, matches.group(1) 45 | ) 46 | 47 | return part 48 | 49 | @functools.lru_cache(maxsize=128) # cron expression conversion will not change 50 | def unix_to_quartz(self, unix_cron: str) -> str: 51 | parts = self.__get_expression_parts(expression=unix_cron) 52 | 53 | if len(parts) != 5: 54 | raise ValueError("Invalid Unix cron expression") 55 | 56 | minute, hour, dom, month, dow = map(self.convert_interval_parts, parts) 57 | 58 | # Converting Unix DOW to Quartz DOW 59 | def shift_days(day: str) -> str: 60 | """ 61 | Quartz DOW starts from 1 (Sunday) while Unix DOW starts from 0 (Sunday) 62 | """ 63 | if "-" in day: 64 | return "-".join([shift_days(day=d) for d in day.split("-")]) 65 | 66 | # Unix cron Sunday can be represented as 0 or 7, but only as 1 in Quartz cron 67 | if day in ["0", "7"]: 68 | return "1" 69 | if day in ["SUN", "MON", "TUE", "WED", "THU", "FRI", "SAT"]: 70 | return day 71 | return str(int(day) + 1) 72 | 73 | if "," in dow: 74 | quartz_dow = ",".join([shift_days(day=day) for day in dow.split(",")]) 75 | elif dow == "*": 76 | quartz_dow = dow 77 | else: 78 | quartz_dow = shift_days(day=dow) 79 | 80 | quartz_dom = dom 81 | 82 | if dom != "*" and dow == "*": 83 | quartz_dow = "?" 84 | elif dom == "*": 85 | quartz_dom = "?" 86 | 87 | quartz_cron = f"0 {minute} {hour} {quartz_dom} {month} {quartz_dow} *" 88 | log.info("Converted unix cron %s to quartz cron %s", unix_cron, quartz_cron) 89 | return quartz_cron 90 | 91 | @functools.lru_cache(maxsize=128) # cron expression conversion will not change 92 | def quartz_to_unix(self, quartz_cron: str) -> str: 93 | parts = self.__get_expression_parts(expression=quartz_cron) 94 | 95 | if len(parts) != 7: 96 | raise ValueError("Invalid Quartz cron expression") 97 | 98 | if "L" in quartz_cron or "W" in quartz_cron or "#" in quartz_cron: 99 | raise ValueError("Support for 'L, W, #' in Quartz cron is not implemented") 100 | 101 | # Unix cron expression does not support '?' 
102 | parts = [part.replace("?", "*") for part in parts] 103 | 104 | _, minute, hour, dom, month, dow, _ = map( 105 | lambda part: self.convert_interval_parts(part, True), parts 106 | ) 107 | 108 | # Converting Quartz DOW to Unix DOW 109 | def shift_days(day: str) -> str: 110 | """ 111 | Quartz DOW starts from 1 (Sunday) while Unix DOW starts from 0 (Sunday) 112 | """ 113 | if "-" in day: 114 | return "-".join([shift_days(day=d) for d in day.split("-")]) 115 | if day in ["SUN", "MON", "TUE", "WED", "THU", "FRI", "SAT"]: 116 | return day 117 | 118 | return str(int(day) - 1) 119 | 120 | if "," in dow: 121 | unix_dow = ",".join([shift_days(day=day) for day in dow.split(",")]) 122 | elif dow == "*": 123 | unix_dow = "*" 124 | else: 125 | unix_dow = shift_days(day=dow) 126 | 127 | unix_dom = dom 128 | 129 | unix_cron = f"{minute} {hour} {unix_dom} {month} {unix_dow}" 130 | log.info("Converted quartz cron %s to unix cron %s", quartz_cron, unix_cron) 131 | return unix_cron 132 | 133 | 134 | cron_helper = CronHelper() 135 | -------------------------------------------------------------------------------- /tests/cli/test_bundles.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Optional 4 | from unittest.mock import patch, Mock 5 | from pytest import LogCaptureFixture 6 | import pytest 7 | 8 | from brickflow import BrickflowEnvVars, _ilog 9 | from brickflow.cli.bundles import bundle_deploy, bundle_destroy 10 | 11 | 12 | class TestBundles: 13 | @patch("brickflow.cli.bundles.should_deploy", return_value=True) 14 | @patch("brickflow.cli.bundles.exec_command") 15 | @patch.dict( 16 | os.environ, {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "0.203.0"} 17 | ) 18 | def test_bundle_deploy_new_cli(self, mock_exec_command: Mock, _: Mock): 19 | mock_exec_command.side_effect = lambda *args, **kwargs: None 20 | mock_exec_command.return_value = None 21 | # workflows_dir needed to make the function work due to bundle sync 22 | bundle_deploy( 23 | force_acquire_lock=True, 24 | workflows_dir="somedir", 25 | debug=True, 26 | fail_on_active_runs=True, 27 | ) 28 | bundle_cli = os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value] 29 | mock_exec_command.assert_called_with( 30 | bundle_cli, 31 | "bundle", 32 | [ 33 | "deploy", 34 | "-t", 35 | "local", 36 | "--fail-on-active-runs", 37 | "--force-lock", 38 | "--debug", 39 | ], 40 | ) 41 | bundle_destroy(force_acquire_lock=True, workflows_dir="somedir", debug=True) 42 | bundle_cli = os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value] 43 | mock_exec_command.assert_called_with( 44 | bundle_cli, 45 | "bundle", 46 | ["destroy", "-t", "local", "--force-lock", "--debug"], 47 | ) 48 | 49 | @patch("brickflow.cli.bundles.should_deploy", return_value=True) 50 | @patch("brickflow.cli.bundles.exec_command") 51 | @patch.dict( 52 | os.environ, 53 | { 54 | BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "0.201.0", 55 | BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value: "databricks", 56 | }, 57 | ) 58 | def test_bundle_deploy_old_cli(self, mock_exec_command: Mock, _: Mock): 59 | mock_exec_command.side_effect = lambda *args, **kwargs: None 60 | mock_exec_command.return_value = None 61 | # workflows_dir needed to make the function work due to bundle sync 62 | bundle_deploy(force_acquire_lock=True, workflows_dir="somedir") 63 | bundle_cli = os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value] 64 | mock_exec_command.assert_called_with( 65 | bundle_cli, 66 | "bundle", 
67 | ["deploy", "-t", "local", "--force"], 68 | ) 69 | bundle_destroy(force_acquire_lock=True, workflows_dir="somedir") 70 | bundle_cli = os.environ[BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_EXEC.value] 71 | mock_exec_command.assert_called_with( 72 | bundle_cli, 73 | "bundle", 74 | ["destroy", "-t", "local", "--force"], 75 | ) 76 | 77 | @patch("brickflow.cli.bundles.exec_command") 78 | @patch.dict( 79 | os.environ, {BrickflowEnvVars.BRICKFLOW_BUNDLE_CLI_VERSION.value: "0.203.0"} 80 | ) 81 | def test_deploy_no_workflows( 82 | self, mock_exec_command: Mock, caplog: LogCaptureFixture 83 | ): 84 | mock_exec_command.side_effect = lambda *args, **kwargs: None 85 | mock_exec_command.return_value = None 86 | 87 | # Adjusting the log level and propagating it to the root logger to make sure it's captured by caplog 88 | _ilog.propagate = True 89 | _ilog.level = logging.WARN 90 | 91 | with caplog.at_level(logging.WARN): 92 | # running this should not fail but log a warning stating that no bundle has been found 93 | bundle_deploy(force_acquire_lock=True, workflows_dir="somedir") 94 | 95 | assert "No bundle.yml found, skipping deployment." in [ 96 | rec.message for rec in caplog.records 97 | ] 98 | 99 | @pytest.mark.parametrize( 100 | "input_arch,expected_arch", 101 | [ 102 | ("x86_64", "amd64"), # Test one x86_64 variant 103 | ("amd64", "amd64"), # Test alternative x86_64 name 104 | ("i386", "386"), # Test one 32-bit variant 105 | ("i686", "386"), # Test alternative 32-bit name 106 | ("arm64", "arm64"), # Test one ARM variant 107 | ("aarch64", "arm64"), # Test alternative ARM name 108 | ("X86_64", "amd64"), # Test case insensitivity 109 | ("unsupported_arch", None), # Test unsupported architecture 110 | ], 111 | ) 112 | def test_get_arch_mappings( 113 | self, input_arch: str, expected_arch: Optional[str] 114 | ) -> None: 115 | from brickflow.cli.bundles import get_arch 116 | 117 | with patch("platform.machine") as mock_machine: 118 | mock_machine.return_value = input_arch 119 | 120 | if expected_arch is None: 121 | with pytest.raises(RuntimeError) as exc_info: 122 | get_arch() 123 | assert f"Unsupported architecture: {input_arch}" in str(exc_info.value) 124 | else: 125 | assert get_arch() == expected_arch 126 | -------------------------------------------------------------------------------- /brickflow/cli/configure.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import importlib 4 | import os 5 | import re 6 | import sys 7 | from pathlib import Path 8 | from typing import Callable, Any, Optional 9 | 10 | import click 11 | from jinja2 import Environment, BaseLoader 12 | 13 | from brickflow import _ilog, BrickflowProjectConstants, get_entrypoint_python 14 | from brickflow.cli.commands import exec_command 15 | 16 | PWD = Path(__file__).parent.absolute() 17 | GITIGNORE_TEMPLATE = PWD / "gitignore_template.txt" 18 | GIT_PATH = Path(".git") 19 | 20 | 21 | class GitNotFoundError(Exception): 22 | pass 23 | 24 | 25 | class GitIgnoreNotFoundError(Exception): 26 | pass 27 | 28 | 29 | def _gitignore_exists() -> bool: 30 | return os.path.exists(".gitignore") and os.path.isfile(".gitignore") 31 | 32 | 33 | def _create_gitignore_if_not_exists() -> None: 34 | if _gitignore_exists() is False: 35 | Path(".gitignore").touch(mode=0o755) 36 | 37 | 38 | def _get_gitignore() -> str: 39 | return Path(".gitignore").read_text(encoding="utf-8") 40 | 41 | 42 | def _get_gitignore_template() -> str: 43 | return GITIGNORE_TEMPLATE.read_text() 44 | 45 | 
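# NOTE: illustrative sketch of the managed block that _update_gitignore() below
# maintains in .gitignore; the entries between the markers are assumed examples,
# the real content comes from gitignore_template.txt next to this module:
#
#   # GENERATED BY BRICKFLOW CLI --START--
#   **/bundle.yml
#   .databricks/
#   # GENERATED BY BRICKFLOW CLI --END--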
46 | def _write_gitignore(data: str) -> None: 47 | Path(".gitignore").write_text(encoding="utf-8", data=data) 48 | 49 | 50 | def _update_gitignore() -> None: 51 | search_regex = re.compile( 52 | r"(# GENERATED BY BRICKFLOW CLI --START--(.|\n)*# GENERATED BY BRICKFLOW CLI --END--)" 53 | ) 54 | 55 | git_ignore_data = _get_gitignore() 56 | git_ignore_template = _get_gitignore_template() 57 | search = search_regex.findall(git_ignore_data) 58 | if len(search) > 0: 59 | search_match = search[0][0] 60 | gitignore_file_data = git_ignore_data.replace(search_match, git_ignore_template) 61 | else: 62 | gitignore_file_data = "\n\n".join([git_ignore_data, git_ignore_template]) 63 | _write_gitignore(gitignore_file_data) 64 | 65 | 66 | def _validate_package(path_str: str) -> str: 67 | folder_path: Path = Path(path_str) 68 | 69 | if not folder_path.exists(): 70 | raise ImportError(f"Invalid pkg error: {folder_path.as_posix()}") 71 | 72 | sys.path.append(os.getcwd()) 73 | folder_pkg_path: str = folder_path.as_posix().replace("/", ".") 74 | 75 | for module in folder_path.glob("**/*.py"): # only find python files 76 | # ignore __init__.py 77 | if module.name == "__init__.py": 78 | continue 79 | module_name = module.as_posix().replace(".py", "").replace("/", ".") 80 | # import all the modules into the mod object and not actually import them using __import__ 81 | mod = importlib.import_module(module_name) 82 | click.echo(f"Scanned module: {mod.__name__}") 83 | 84 | return folder_pkg_path 85 | 86 | 87 | def render_template(**kwargs) -> str: # type: ignore 88 | template = Path(__file__).parent.absolute() / "entrypoint.template" 89 | with template.open("r") as f: 90 | data = f.read() 91 | return Environment(loader=BaseLoader()).from_string(data).render(**kwargs) 92 | 93 | 94 | def create_entry_point(working_dir: str, data: str) -> None: 95 | path = Path(working_dir) / "entrypoint.py" 96 | if path.exists(): 97 | click.echo(f"Path: {str(path.absolute())} already exists...") 98 | # path = Path(working_dir) / "entrypoint.py.new" 99 | else: 100 | click.echo(f"Creating file in path: {str(path.absolute())}...") 101 | path.write_text(data) 102 | 103 | 104 | def create_brickflow_project_root_marker() -> None: 105 | path = Path( 106 | f"{BrickflowProjectConstants.DEFAULT_MULTI_PROJECT_ROOT_FILE_NAME.value}." 
107 |             f"{BrickflowProjectConstants.DEFAULT_CONFIG_FILE_TYPE.value}"
108 |     )
109 |     if path.exists():
110 |         click.echo(f"Path: {str(path.absolute())} already exists...")
111 |         # path = Path(working_dir) / "entrypoint.py.new"
112 |     else:
113 |         click.echo(f"Creating file in path: {str(path.absolute())}...")
114 |         path.write_text(
115 |             "# DO NOT MODIFY THIS FILE - IT IS AUTO GENERATED BY BRICKFLOW AND RESERVED FOR FUTURE USAGE",
116 |             encoding="utf-8",
117 |         )
118 | 
119 | 
120 | def bind_env_var(env_var: str) -> Callable:
121 |     def callback(
122 |         ctx: click.Context,  # noqa
123 |         param: str,  # noqa
124 |         value: Any,
125 |     ) -> None:
126 |         # pylint: disable=unused-argument
127 |         if value is not None and len(value) > 0:
128 |             _ilog.info("Setting env var: %s to %s...", env_var, value)
129 |             if isinstance(value, list):  # join multi-value options with commas
130 |                 os.environ[env_var] = ",".join(value)
131 |             elif isinstance(value, tuple):
132 |                 os.environ[env_var] = ",".join(value)
133 |             elif isinstance(value, bool):
134 |                 os.environ[env_var] = str(value).lower()
135 |             else:
136 |                 os.environ[env_var] = value
137 | 
138 |     return callback
139 | 
140 | 
141 | def get_entrypoint(**kwargs: Any) -> str:
142 |     wd: Optional[str] = kwargs.get("workflows_dir")
143 |     if wd is None:
144 |         raise ValueError(
145 |             "workflows_dir not set, please set it using --workflows-dir or -wd"
146 |         )
147 |     return str(Path(wd) / "entrypoint.py")
148 | 
149 | 
150 | def log_important_versions(bundle_cli: str) -> None:
151 |     version = exec_command(bundle_cli, "--version", [], capture_output=True)
152 |     _ilog.info("Using bundle version: %s", version)
153 |     log_python_version()
154 | 
155 | 
156 | def log_python_version() -> None:
157 |     version = exec_command(
158 |         get_entrypoint_python(), "--version", [], capture_output=True
159 |     )
160 |     _ilog.info("Using python version: %s", version)
161 | 
--------------------------------------------------------------------------------
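A minimal usage sketch for `bind_env_var` from `brickflow/cli/configure.py` (an illustration, not a file from the repository; the option name and environment variable below are hypothetical). It shows how the callback mirrors a `click` option value into an environment variable before the command body runs:

```python
import click

from brickflow.cli.configure import bind_env_var


@click.command()
@click.option(
    "--workflow",
    "-w",
    multiple=True,  # click passes a tuple; the callback joins it with commas
    callback=bind_env_var("BRICKFLOW_SOME_ENV_VAR"),  # hypothetical variable name
    expose_value=False,  # consumed by the callback, not passed to the function
)
def some_command() -> None:
    """The command body can read the value back from os.environ if needed."""


if __name__ == "__main__":
    some_command()
```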