├── .codecov.yml ├── .coveragerc ├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── 01-bug.yml │ └── 02-feature.yml ├── dependabot.yml └── workflows │ └── cicd.yaml ├── .gitignore ├── .markdownlint.json ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── CODEOWNERS ├── LICENSE ├── Makefile ├── PRIVACY_NOTICE.md ├── README.md ├── SECURITY.md ├── dagfactory ├── __init__.py ├── constants.py ├── dagbuilder.py ├── dagfactory.py ├── exceptions.py ├── listeners │ ├── __init__.py │ └── runtime_event.py ├── parsers.py ├── plugin │ └── __init__.py ├── settings.py ├── telemetry.py └── utils.py ├── dev ├── .astro │ ├── config.yaml │ ├── dag_integrity_exceptions.txt │ └── test_dag_integrity_default.py ├── .dockerignore ├── .gitignore ├── Dockerfile ├── README.md ├── dags │ ├── comparison │ │ ├── example_hackernews_dagfactory.yml │ │ ├── example_hackernews_plain_airflow.py │ │ ├── example_pypi_stats_dagfactory.yml │ │ └── example_pypi_stats_plain_airflow.py │ ├── customized │ │ ├── __init__.py │ │ ├── callbacks │ │ │ ├── __init__.py │ │ │ └── custom_callbacks.py │ │ ├── helpers │ │ │ ├── __init__.py │ │ │ └── etl.py │ │ └── operators │ │ │ ├── __init__.py │ │ │ └── breakfast_operators.py │ ├── datasets │ │ ├── example_config_datasets.yml │ │ ├── example_dag_datasets.py │ │ ├── example_dag_datasets.yml │ │ ├── example_dag_datasets_outlet_inlet.yml │ │ ├── example_dataset_condition_string.yml │ │ └── example_dataset_yaml_syntax.yml │ ├── defaults.yml │ ├── example_callbacks.py │ ├── example_callbacks.yml │ ├── example_customize_operator.py │ ├── example_customize_operator.yml │ ├── example_dag_factory.py │ ├── example_dag_factory.yml │ ├── example_dag_factory_default_args.yml │ ├── example_dag_factory_default_config.py │ ├── example_dag_factory_default_config.yml │ ├── example_dag_factory_multiple.py │ ├── example_dag_factory_multiple_config.yml │ ├── example_dynamic_task_mapping.py │ ├── example_dynamic_task_mapping.yml │ ├── example_http_operator_task.py │ ├── example_http_operator_task.yml │ ├── example_load_yaml_dags.py │ ├── example_map_index_template.py │ ├── example_map_index_template.yml │ ├── example_simple_http_operator_task.yml │ ├── example_task_group.py │ ├── example_task_group.yml │ ├── example_taskflow.py │ ├── example_taskflow.yml │ ├── expand_tasks.py │ ├── external_task_sensor.yml │ ├── hacker_news.py │ ├── invalid.yaml │ ├── print_hello.py │ ├── pypi_stats.py │ └── sample.py ├── packages.txt ├── requirements.txt └── tests │ └── dags │ └── test_dag_example.py ├── docs ├── comparison │ ├── index.md │ ├── taskflow_api.md │ └── traditional_operators.md ├── configuration │ ├── configuring_workflows.md │ ├── defaults.md │ └── environment_variables.md ├── contributing │ ├── code_of_conduct.md │ ├── contributors.md │ ├── howto.md │ └── roles.md ├── features │ ├── callbacks.md │ ├── custom_operators.md │ ├── datasets.md │ ├── dynamic_tasks.md │ ├── http_task.md │ └── multiple_configuration_files.md ├── getting-started │ ├── quick-start-airflow-standalone.md │ └── quick-start-astro-cli.md ├── index.md └── static │ ├── example_dynamic_task_mapping.png │ ├── example_hackernews_dagfactory_code.png │ ├── example_hackernews_dagfactory_docs.png │ ├── example_hackernews_dagfactory_graph.png │ ├── example_hackernews_plain_airflow_code.png │ ├── example_hackernews_plain_airflow_graph.png │ ├── example_map_index_template.png │ ├── example_pypi_stats_dagfactory_code.png │ ├── example_pypi_stats_dagfactory_docs.png │ ├── example_pypi_stats_dagfactory_graph.png │ ├── 
example_pypi_stats_dagfactory_mapped_tasks.png │ ├── example_pypi_stats_plain_airflow_code.png │ ├── example_pypi_stats_plain_airflow_graph.png │ ├── example_pypi_stats_plain_airflow_mapped_tasks.png │ └── images │ ├── airflow-dag.png │ ├── airflow-home.png │ ├── custom_operators.png │ └── datasets │ ├── conditions │ ├── graph_conditional_dataset.png │ └── graph_conditional_dataset_2.png │ └── outlets │ └── datasets_example.png ├── examples └── dags ├── img ├── mapped_tasks_example.png ├── quickstart_dag.png └── quickstart_gantt.png ├── mkdocs.yml ├── pyproject.toml ├── scripts ├── airflow3 │ ├── .gitignore │ ├── README.md │ ├── dags │ │ ├── example_dag_factory.py │ │ └── example_dag_factory.yml │ ├── env.sh │ ├── install_from_main.sh │ ├── requirements.txt │ ├── setup.sh │ └── tests.sh ├── docs_deploy.py ├── test │ ├── integration-setup.sh │ ├── integration.sh │ ├── pre-install-airflow.sh │ ├── unit-cov.sh │ └── unit.sh └── verify_tag_and_version.py └── tests ├── __init__.py ├── fixtures ├── dag_factory.yml ├── dag_factory_http_operator_task.yml ├── dag_factory_kubernetes_pod_operator.yml ├── dag_factory_simple_http_operator_task.yml ├── dag_factory_task_group.yml ├── dag_factory_variables_as_arguments.yml ├── dag_md_docs.yml ├── defaults.yml ├── doc_md_builder.py ├── invalid_dag_factory.yml ├── invalid_yaml.yml └── mydocfile.md ├── test_dagbuilder.py ├── test_dagbuilder_httpoperator.py ├── test_dagfactory.py ├── test_example_dags.py ├── test_parsers.py ├── test_settings.py ├── test_telemetry.py ├── test_utils.py └── utils.py /.codecov.yml: -------------------------------------------------------------------------------- 1 | --- 2 | coverage: 3 | status: 4 | project: 5 | default: 6 | target: auto 7 | threshold: 2% 8 | only_pulls: true 9 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | tests/* 4 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/01-bug.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | description: File a bug report. 4 | title: "[Bug] " 5 | labels: ["bug", "triage-needed"] 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: | 10 | Thanks for taking the time to fill out this bug report! 11 | - type: input 12 | id: dag-factory-version 13 | attributes: 14 | label: DAG Factory version 15 | # yamllint disable rule:line-length 16 | description: > 17 | On what version of DAG Factory are you currently experiencing the issue? Remember, you are encouraged to 18 | test with the latest release or on the main branch to verify your issue still exists. 19 | placeholder: e.g. 0.19.0 20 | validations: 21 | required: true 22 | - type: input 23 | id: airflow-version 24 | attributes: 25 | label: airflow version 26 | description: What version of Apache Airflow are you running? 27 | placeholder: ex. 2.9.0 28 | validations: 29 | required: true 30 | - type: input 31 | id: python-version 32 | attributes: 33 | label: Python version 34 | description: What version of Python are you running? 35 | placeholder: e.g. 
3.10 36 | validations: 37 | required: true 38 | - type: dropdown 39 | attributes: 40 | label: Deployment 41 | description: > 42 | What kind of deployment do you have? 43 | multiple: false 44 | options: 45 | - "Official Apache Airflow Helm Chart" 46 | - "Other 3rd-party Helm chart" 47 | - "Docker-Compose" 48 | - "Other Docker-based deployment" 49 | - "Virtualenv installation" 50 | - "Astronomer" 51 | - "Google Cloud Composer" 52 | - "Amazon (AWS) MWAA" 53 | - "Microsoft ADF Managed Airflow" 54 | - "Other" 55 | validations: 56 | required: true 57 | - type: textarea 58 | attributes: 59 | label: Deployment details 60 | description: Additional description of your deployment. 61 | placeholder: > 62 | Enter any relevant details of your deployment. Especially version of your tools, 63 | software (docker-compose, helm, k8s, etc.), any customisation and configuration you added. 64 | - type: textarea 65 | id: what-happened 66 | attributes: 67 | label: What happened? 68 | description: Also tell us, what did you expect to happen? 69 | placeholder: Tell us what you see! 70 | value: "A bug happened!" 71 | validations: 72 | required: true 73 | - type: textarea 74 | id: logs 75 | attributes: 76 | label: Relevant log output 77 | description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 78 | render: shell 79 | - type: textarea 80 | attributes: 81 | label: How to reproduce 82 | description: What should we do to reproduce the problem? 83 | placeholder: > 84 | Please make sure you provide a reproducible step-by-step case of how to reproduce the problem 85 | as minimally and precisely as possible. Keep in mind we do not have access to your cluster or DAGs. 86 | Remember that non-reproducible issues make it hard for us to help you or resolve the issue! 87 | validations: 88 | required: true 89 | - type: textarea 90 | attributes: 91 | label: Anything else :)? 92 | description: Anything else we need to know? 93 | placeholder: > 94 | How often does this problem occur? (Once? Every time? Only when certain conditions are met?) 95 | - type: checkboxes 96 | attributes: 97 | label: Are you willing to submit PR? 98 | description: > 99 | This is absolutely not required, but we are happy to guide you in the contribution process 100 | especially if you already have a good understanding of how to implement the fix. We love to bring new 101 | contributors in. 102 | options: 103 | - label: Yes I am willing to submit a PR! 104 | - type: input 105 | id: contact 106 | attributes: 107 | label: Contact Details 108 | description: (Optional) How can we get in touch with you if we need more info? 109 | placeholder: ex. email@example.com 110 | validations: 111 | required: false 112 | - type: markdown 113 | attributes: 114 | value: "Thanks for completing our form!" 115 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/02-feature.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | description: Suggest an idea for this project 4 | title: "[Feature] " 5 | labels: ["enhancement", "triage-needed"] 6 | body: 7 | - type: markdown 8 | attributes: 9 | # yamllint disable rule:line-length 10 | value: " 11 | Thank you for finding the time to propose new feature! 12 | 13 | We really appreciate the community efforts to improve DAG Factory." 
14 | # yamllint enable rule:line-length 15 | - type: textarea 16 | attributes: 17 | label: Description 18 | description: A short description of your feature 19 | - type: textarea 20 | attributes: 21 | label: Use case/motivation 22 | description: What would you like to happen? 23 | - type: textarea 24 | attributes: 25 | label: Related issues 26 | description: Is there currently another issue associated with this? 27 | - type: checkboxes 28 | attributes: 29 | label: Are you willing to submit a PR? 30 | options: 31 | - label: Yes, I am willing to submit a PR! 32 | - type: markdown 33 | attributes: 34 | value: "Thanks for completing our form!" 35 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | labels: 8 | - "dependencies" 9 | reviewers: 10 | - "@astronomer/oss-integrations" 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | bin/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # Environments 83 | .env 84 | .venv 85 | env/ 86 | venv/ 87 | ENV/ 88 | .installed 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # venv 104 | pip-selfcheck.json 105 | 106 | # vscode 107 | .vscode 108 | 109 | # IntelliJ 110 | .idea/* 111 | 112 | # sqllite db 113 | *.db 114 | 115 | 116 | # Airflow logs 117 | logs/ 118 | 119 | # MacOS DS_Store 120 | *.DS_Store 121 | 122 | # VIM 123 | *.sw[a-z] 124 | 125 | # Airflow 126 | examples/.airflowignore 127 | airflow.cfg 128 | webserver_config.py 129 | 130 | # Astro 131 | dev/include/dag_factory-* 132 | -------------------------------------------------------------------------------- /.markdownlint.json: -------------------------------------------------------------------------------- 1 | { 2 | "MD007": { 3 | "indent": 4 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/astral-sh/ruff-pre-commit 5 | rev: v0.6.9 6 | hooks: 7 | - id: ruff 8 | args: 9 | - --fix 10 | 11 | - repo: https://github.com/pre-commit/pre-commit-hooks 12 | rev: v5.0.0 13 | hooks: 14 | - id: check-added-large-files 15 | - id: check-merge-conflict 16 | - id: check-toml 17 | - id: check-yaml 18 | args: 19 | - --unsafe 20 | exclude: 'tests/fixtures/dag_factory.yml|dev/dags/invalid.yaml|tests/fixtures/invalid_yaml.yml' 21 | - id: debug-statements 22 | - id: end-of-file-fixer 23 | - id: mixed-line-ending 24 | - id: pretty-format-json 25 | args: [ "--autofix" ] 26 | - id: trailing-whitespace 27 | - id: detect-private-key 28 | - id: detect-aws-credentials 29 | args: [ "--allow-missing-credentials" ] 30 | 31 | - repo: https://github.com/psf/black 32 | rev: 24.10.0 33 | hooks: 34 | - id: black 35 | args: [ "--config", "./pyproject.toml" ] 36 | 37 | - repo: https://github.com/codespell-project/codespell 38 | rev: v2.2.4 39 | hooks: 40 | - id: codespell 41 | exclude: tests/fixtures/mydocfile.md 42 | 43 | - repo: https://github.com/igorshubovych/markdownlint-cli 44 | rev: v0.41.0 45 | hooks: 46 | - id: markdownlint 47 | args: 48 | - "--disable=MD013" # disable line length 49 | - "--disable=MD024" # disable multiple headings with the same content (CHANGELOG) 50 | - "--disable=MD033" # disable no inline html (needed for analytics dead pixel) 51 | 52 | - repo: https://github.com/tcort/markdown-link-check 53 | rev: v3.13.6 54 | hooks: 55 | - id: 
markdown-link-check 56 | args: [-q] 57 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @astronomer/oss-integrations 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help 2 | help: 3 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 4 | 5 | .PHONY: setup 6 | setup: ## Setup development environment 7 | python -m venv venv 8 | . venv/bin/activate && pip --no-cache-dir install ".[tests]" 9 | @echo "To activate the virtual environment, run:" 10 | @echo "source venv/bin/activate" 11 | 12 | .PHONY: clean 13 | clean: ## Removes build and test artifacts 14 | @echo "==> Removing build and test artifacts" 15 | @rm -rf *.egg *egg-info .cache .coverage .tox build bin include dist htmlcov lib .pytest_cache .venv 16 | @find . -name '*.pyc' -exec rm -f {} + 17 | @find . -name '*.pyo' -exec rm -f {} + 18 | @find . -name '*~' -exec rm -f {} + 19 | @find . -name '__pycache__' -exec rm -rf {} + 20 | 21 | 22 | .PHONY: build-whl 23 | build-whl: ## Build installable whl file 24 | rm -rf dev/include/* 25 | rm -rf dist/* 26 | mkdir -p dev/include 27 | hatch build 28 | cp dist/* dev/include/ 29 | 30 | .PHONY: docker-run 31 | docker-run: build-whl ## Runs local Airflow for testing 32 | @if ! lsof -i :8080 | grep LISTEN > /dev/null; then \ 33 | cd dev && astro dev start --verbosity debug; \ 34 | else \ 35 | cd dev && astro dev restart --verbosity debug; \ 36 | fi 37 | 38 | .PHONY: docker-stop 39 | docker-stop: ## Stop Docker container 40 | cd dev && astro dev stop 41 | -------------------------------------------------------------------------------- /PRIVACY_NOTICE.md: -------------------------------------------------------------------------------- 1 | # Privacy Notice 2 | 3 | This project follows the [Privacy Policy of Astronomer](https://www.astronomer.io/privacy/). 4 | 5 | ## Collection of Data 6 | 7 | DAG Factory integrates [Scarf](https://about.scarf.sh/) to collect basic telemetry data during operation. 8 | This data assists the project maintainers in better understanding how DAG Factory is used. 9 | Insights gained from this telemetry are critical for prioritizing patches, minor releases, and 10 | security fixes. Additionally, this information supports key decisions related to the development road map. 11 | 12 | Deployments and individual users can opt-out of analytics by setting the configuration: 13 | 14 | ```ini 15 | [dag_factory] 16 | enable_telemetry False 17 | ``` 18 | 19 | As described in the [official documentation](https://docs.scarf.sh/gateway/#do-not-track), it is also possible to opt out by setting one of the following environment variables: 20 | 21 | ```commandline 22 | DO_NOT_TRACK=True 23 | SCARF_NO_ANALYTICS=True 24 | ``` 25 | 26 | In addition to Scarf's default data collection, DAG Factory collects the following information: 27 | 28 | - DAG Factory version 29 | - Airflow version 30 | - Python version 31 | - Operating system & machine architecture 32 | - Event type 33 | - Number of failed DagRuns 34 | - Number of successful DagRuns 35 | - Total tasks associated to each DagRun 36 | - Dag hash 37 | 38 | No user-identifiable information (IP included) is stored in Scarf. 
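The same switch can also be flipped through Airflow's standard environment-variable override of `[dag_factory] enable_telemetry`, which is the form referenced by the log message in `dagfactory/telemetry.py` further down in this dump. A minimal sketch; how the variable reaches the scheduler and workers depends on your deployment:

```commandline
# Same effect as setting enable_telemetry to False under [dag_factory] in airflow.cfg
export AIRFLOW__DAG_FACTORY__ENABLE_TELEMETRY=False
```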
39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dag-factory 2 | 3 | [![Github Actions](https://github.com/astronomer/dag-factory/actions/workflows/cicd.yaml/badge.svg?branch=main&event=push)](https://github.com/astronomer/dag-factory/actions?workflow=build) 4 | [![Coverage](https://codecov.io/github/astronomer/dag-factory/coverage.svg?branch=master)](https://codecov.io/github/astronomer/dag-factory?branch=master) 5 | [![PyPi](https://img.shields.io/pypi/v/dag-factory.svg)](https://pypi.org/project/dag-factory/) 6 | [![Code Style](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) 7 | [![Downloads](https://img.shields.io/pypi/dm/dag-factory.svg)](https://img.shields.io/pypi/dm/dag-factory) 8 | 9 | analytics 10 | 11 | Welcome to *dag-factory*! *dag-factory* is a library for [Apache Airflow®](https://airflow.apache.org) to construct DAGs 12 | declaratively via configuration files. 13 | 14 | The minimum requirements for **dag-factory** are: 15 | 16 | - Python 3.8.0+ 17 | - [Apache Airflow®](https://airflow.apache.org) 2.3+ 18 | 19 | For a gentle introduction, please take a look at our [Quickstart Guide](https://astronomer.github.io/dag-factory/latest/getting-started/quick-start-airflow-standalone/). For more examples, please see the 20 | [examples](/examples) folder. 21 | 22 | - [Quickstart](https://astronomer.github.io/dag-factory/latest/getting-started/quick-start-astro-cli/) 23 | - [Benefits](#benefits) 24 | - [Features](https://astronomer.github.io/dag-factory/latest/features/dynamic_tasks/) 25 | - [Dynamically Mapped Tasks](https://astronomer.github.io/dag-factory/latest/features/dynamic_tasks/) 26 | - [Multiple Configuration Files](https://astronomer.github.io/dag-factory/latest/features/multiple_configuration_files/) 27 | - [Callbacks](https://astronomer.github.io/dag-factory/latest/features/callbacks/) 28 | - [Custom Operators](https://astronomer.github.io/dag-factory/latest/features/custom_operators/) 29 | - [HttpSensor](https://astronomer.github.io/dag-factory/latest/features/http_task/) 30 | - [Contributing](https://astronomer.github.io/dag-factory/latest/contributing/howto/) 31 | 32 | ## Benefits 33 | 34 | - Construct DAGs without knowing Python 35 | - Construct DAGs without learning Airflow primitives 36 | - Avoid duplicative code 37 | - Everyone loves YAML! ;) 38 | 39 | ## License 40 | 41 | To learn more about the terms and conditions for use, reproduction and distribution, read the [Apache License 2.0](https://github.com/astronomer/dag-factory/blob/main/LICENSE). 42 | 43 | ## Privacy Notice 44 | 45 | This project follows [Astronomer's Privacy Policy](https://www.astronomer.io/privacy/). 46 | 47 | For further information, [read this](https://github.com/astronomer/dag-factory/blob/main/PRIVACY_NOTICE.md) 48 | 49 | ## Security Policy 50 | 51 | Check the project's [Security Policy](https://github.com/astronomer/dag-factory/blob/main/SECURITY.md) to learn 52 | how to report security vulnerabilities in DAG Factory and how security issues reported to the DAG Factory 53 | security team are handled. 
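As a concrete companion to the README's pitch of building DAGs declaratively, a minimal configuration in the style of the `dev/dags` examples that appear later in this dump could look like the following; the DAG name, task names, and shell commands are illustrative rather than repository code:

```yaml
example_minimal_dag:
  default_args:
    owner: "example_owner"
    start_date: "2024-01-01"
  schedule_interval: "@daily"
  tasks:
    say_hello:
      operator: airflow.operators.bash_operator.BashOperator
      bash_command: "echo hello"
    say_goodbye:
      operator: airflow.operators.bash_operator.BashOperator
      bash_command: "echo goodbye"
      dependencies: [say_hello]
```

A small Python loader placed in the DAGs folder then materialises the DAG: the `dev/dags/example_*.py` scripts below instantiate `dagfactory.DagFactory(<config path>)` and call `clean_dags(globals())` followed by `generate_dags(globals())`, and `dagfactory/__init__.py` also exports a `load_yaml_dags` helper as an alternative entry point.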
54 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security 2 | 3 | [//]: # (This document is reused across Astronomer OSS Integrations projects, any changes should also be applied to security docs in the other repositories.) 4 | 5 | This document contains information on how to report security vulnerabilities in DAG Factory and 6 | how security issues reported to the Astronomer security team are handled. 7 | If you would like to learn more, refer to [https://www.astronomer.io/security/](https://www.astronomer.io/security). 8 | 9 | At Astronomer, we recognize the critical nature of security and view it as a transparent and collaborative effort. 10 | If you have any concern about the security of any Astronomer public repository, or believe you have uncovered a vulnerability, 11 | please email [oss_security@astronomer.io](mailto:oss_security@astronomer.io). 12 | 13 | > **warning**: Due to the nature of some security vulnerabilities, do not create a GitHub issue to report a vulnerability. 14 | 15 | ## Use of Email for Vulnerability Disclosure 16 | 17 | Only use the OSS security email to disclose security vulnerabilities. 18 | Astronomer does not accept bug reports, security implementation questions, or other security-related issues at this email address. 19 | If you are a customer of Astronomer, please reach out to your account team if you have any security-related questions or 20 | issues other than vulnerabilities, and they can assist you. Otherwise, this codebase is provided ‘as-is’ in accordance 21 | with its licensing structure. 22 | 23 | ## Scope 24 | 25 | When submitting vulnerabilities, please ensure that it is within the scope of the project, based on the following descriptions. Out-of-scope vulnerability reports will be ignored. 26 | 27 | ### In-scope 28 | 29 | * Code base with tagged releases 30 | * When integrated as specified in the [official DAG Factory documentation](https://github.com/astronomer/dag-factory/). 31 | 32 | ### Out-of-scope 33 | 34 | * Any other codebase, including other Astronomer products 35 | * Astronomer.io website 36 | * Dependencies used in DAG Factory 37 | * DAG Factory when modified or run using an unintended configuration 38 | * Other systems integrated with or CSP systems hosting the deployment 39 | * Cookie transfers between browsers 40 | 41 | For other products and repositories owned by Astronomer, please refer to their specific security policy or to 42 | [https://www.astronomer.io/vulnerability-disclosure/](https://www.astronomer.io/vulnerability-disclosure/) for 43 | vulnerabilities associated with Astronomer products. 44 | 45 | ## Required information and how to disclose 46 | 47 | Please send a single, plain-text (not HTML) email for each vulnerability you report. 48 | 49 | Your written description of the vulnerability is a critical part of the initial report. You can optionally include images and videos in your initial report, but the written description of the vulnerability must includes the following information, at a minimum: 50 | 51 | * Brief description/title of the vulnerability 52 | * Steps to recreate the issue 53 | * Contact information 54 | 55 | Upon review, we may request additional information including, but not limited to, images or a proof-of-concept video. 
56 | 57 | ## Severity 58 | 59 | The vulnerability severity rating system used internally by Astronomer is not the same as the one used by the Apache Software Foundation. 60 | Please do not provide a severity for the vulnerability when disclosing, however, providing a CWE (Common Weakness Enumeration) is recommended. 61 | 62 | ## Response Timeframe 63 | 64 | Astronomer aims to acknowledge and validate disclosures within 5 business days. Resolutions will be provided in a timely manner. 65 | 66 | ## Follow-up Communication 67 | 68 | Astronomer handles follow-up communications to disclosures sent to [oss_security@astronomer.io](mailto:oss_security@astronomer.io) on a best-case effort, often within 3-5 business days. If the disclosure involves a product or repository that is covered by Astronomer's use of the Bugcrowd Vulnerability Disclosure Platform, please see Bugcrowd's terms of service for follow-up communication timelines. Disclosures to the Bugcrowd Vulnerability Disclosure Platform will result in communications through that platform. 69 | 70 | ## Partial Safe Harbor 71 | 72 | Astronomer will not threaten or bring any legal action against anyone who makes a good faith effort to comply with this 73 | vulnerability disclosure policy. This includes any claim under the DMCA for circumventing technological measures to 74 | protect the services and applications eligible under this policy. 75 | 76 | **As long as you comply with this policy:** 77 | 78 | * We consider your security research to be "authorized" under the Computer Fraud and Abuse Act (and/or similar state laws), and 79 | * We waive any restrictions in our application Terms of Use and Usage Policies that would prohibit your participation in this policy, but only for the limited purpose of your security research under this policy. 80 | 81 | ## Notification Requirement 82 | 83 | * Safe harbor under this policy is only extended if the discoverer of the vulnerability notifies Astronomer as outlined elsewhere in this policy, prior to notifying any other third-party entities, and does not notify any other third-party entities for 90 days after notifying Astronomer, without Astronomer’s prior written approval. 84 | * After notification of Astronomer and the lapse of the 90 day period, it is requested that any publications, third-party releases, or other disseminations of information related to or derived from the vulnerability discovery be coordinated with Astronomer prior. 85 | 86 | ## Right to rescind safe harbor protections 87 | 88 | Astronomer reserves the right to rescind any and all safe harbor protections originally extended to the vulnerability 89 | discoverer in the event that the discoverer, at any point prior to or after notification to Astronomer, 90 | has knowingly and willfully released, published, or otherwise used information related to the discovered vulnerability in a manner that: 91 | 92 | 1. Maligns or damages the reputation of Astronomer, its customers, or its employees; 93 | 2. Is used to conduct malicious attacks against Astronomer systems, regardless of whether material damages occur; or 94 | 3. Exacerbates existing vulnerabilities or threats, thereby increasing the risk to Astronomer or its stakeholders. 95 | 96 | ## Extension of safe harbor to third-party systems and services 97 | 98 | Astronomer systems and services can interconnect with third-party systems and services. 
99 | If you submit a report that affects a third-party service through the [vulnerability disclosure program](https://www.astronomer.io/vulnerability-disclosure/), 100 | Astronomer will limit what we share with the affected third party. 101 | Please understand that, while we can authorize your research on Astronomer’s systems and services, 102 | we cannot authorize your efforts on third-party products or guarantee they won’t pursue legal action against you. 103 | If legal action is initiated by a third party against you because of your participation in this vulnerability 104 | disclosure program, and you have complied with our vulnerability disclosure policy, we will take steps to make it known 105 | that your actions were conducted in compliance with this policy. 106 | This is not, and should not be understood as, any agreement on Astronomer's part to defend, indemnify, or otherwise protect you 107 | from any third-party action based on your actions. 108 | 109 | You are expected, as always, to comply with all applicable laws. 110 | -------------------------------------------------------------------------------- /dagfactory/__init__.py: -------------------------------------------------------------------------------- 1 | """Modules and methods to export for easier access""" 2 | 3 | from .dagfactory import DagFactory, load_yaml_dags 4 | 5 | __version__ = "0.23.0a3" 6 | __all__ = [ 7 | "DagFactory", 8 | "load_yaml_dags", 9 | ] 10 | -------------------------------------------------------------------------------- /dagfactory/constants.py: -------------------------------------------------------------------------------- 1 | TELEMETRY_URL = "https://astronomer.gateway.scarf.sh/dag-factory/{telemetry_version}/{dagfactory_version}/{airflow_version}/{python_version}/{platform_system}/{platform_machine}/{event_type}/{status}/{dag_hash}/{task_count}" 2 | TELEMETRY_VERSION = "v2" 3 | TELEMETRY_TIMEOUT = 1.0 4 | 5 | AIRFLOW3_MAJOR_VERSION = 3 6 | -------------------------------------------------------------------------------- /dagfactory/exceptions.py: -------------------------------------------------------------------------------- 1 | """Module contains exceptions for dag-factory""" 2 | 3 | 4 | class DagFactoryException(Exception): 5 | """ 6 | Base class for all dag-factory errors. 7 | """ 8 | 9 | 10 | class DagFactoryConfigException(Exception): 11 | """ 12 | Raise for dag-factory config errors. 
13 | """ 14 | -------------------------------------------------------------------------------- /dagfactory/listeners/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/dagfactory/listeners/__init__.py -------------------------------------------------------------------------------- /dagfactory/listeners/runtime_event.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from airflow.listeners import hookimpl 4 | from airflow.models.dag import DAG 5 | from airflow.models.dagrun import DagRun 6 | 7 | from dagfactory import telemetry 8 | 9 | 10 | class EventStatus: 11 | SUCCESS = "success" 12 | FAILED = "failed" 13 | 14 | 15 | DAG_RUN = "dag_run" 16 | 17 | 18 | def is_dagfactory_dag(dag: DAG | None = None): 19 | if "dagfactory" in dag.tags: 20 | return True 21 | return False 22 | 23 | 24 | @hookimpl 25 | def on_dag_run_success(dag_run: DagRun, msg: str): 26 | dag = dag_run.get_dag() 27 | if not is_dagfactory_dag(dag): 28 | return 29 | additional_telemetry_metrics = { 30 | "dag_hash": dag_run.dag_hash, 31 | "status": EventStatus.SUCCESS, 32 | "task_count": len(dag.task_ids), 33 | } 34 | 35 | telemetry.emit_usage_metrics_if_enabled(DAG_RUN, additional_telemetry_metrics) 36 | 37 | 38 | @hookimpl 39 | def on_dag_run_failed(dag_run: DagRun, msg: str): 40 | dag = dag_run.get_dag() 41 | if not is_dagfactory_dag(dag): 42 | return 43 | additional_telemetry_metrics = { 44 | "dag_hash": dag_run.dag_hash, 45 | "status": EventStatus.FAILED, 46 | "task_count": len(dag.task_ids), 47 | } 48 | 49 | telemetry.emit_usage_metrics_if_enabled(DAG_RUN, additional_telemetry_metrics) 50 | -------------------------------------------------------------------------------- /dagfactory/parsers.py: -------------------------------------------------------------------------------- 1 | import ast 2 | 3 | 4 | class SafeEvalVisitor(ast.NodeVisitor): 5 | def __init__(self, dataset_map): 6 | self.dataset_map = dataset_map 7 | 8 | def evaluate(self, tree): 9 | return self.visit(tree) 10 | 11 | def visit_Expression(self, node): 12 | return self.visit(node.body) 13 | 14 | def visit_BinOp(self, node): 15 | left = self.visit(node.left) 16 | right = self.visit(node.right) 17 | 18 | if isinstance(node.op, ast.BitAnd): 19 | return left & right 20 | elif isinstance(node.op, ast.BitOr): 21 | return left | right 22 | else: 23 | raise ValueError(f"Unsupported binary operation: {type(node.op).__name__}") 24 | 25 | def visit_Name(self, node): 26 | if node.id in self.dataset_map: 27 | return self.dataset_map[node.id] 28 | raise NameError(f"Undefined variable: {node.id}") 29 | 30 | def visit_Constant(self, node): 31 | return node.value 32 | 33 | def generic_visit(self, node): 34 | raise ValueError(f"Unsupported syntax: {type(node).__name__}") 35 | -------------------------------------------------------------------------------- /dagfactory/plugin/__init__.py: -------------------------------------------------------------------------------- 1 | from airflow.plugins_manager import AirflowPlugin 2 | 3 | from dagfactory.listeners import runtime_event 4 | 5 | 6 | class DagFactoryPlugin(AirflowPlugin): 7 | name = "Dag Factory Plugin" 8 | listeners = [runtime_event] 9 | 10 | 11 | dagfactory_plugin = DagFactoryPlugin() 12 | -------------------------------------------------------------------------------- /dagfactory/settings.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | from airflow.configuration import conf 6 | 7 | 8 | def convert_to_boolean(value: str | None) -> bool: 9 | """ 10 | Convert a string that represents a boolean to a Python boolean. 11 | """ 12 | value = str(value).lower().strip() 13 | if value in ("f", "false", "0", "", "none"): 14 | return False 15 | return True 16 | 17 | 18 | enable_telemetry = conf.getboolean("dag_factory", "enable_telemetry", fallback=True) 19 | do_not_track = convert_to_boolean(os.getenv("DO_NOT_TRACK")) 20 | no_analytics = convert_to_boolean(os.getenv("SCARF_NO_ANALYTICS")) 21 | -------------------------------------------------------------------------------- /dagfactory/telemetry.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import platform 5 | from urllib import parse 6 | from urllib.parse import urlencode 7 | 8 | import httpx 9 | from airflow import __version__ as airflow_version 10 | 11 | import dagfactory 12 | from dagfactory import constants, settings 13 | 14 | 15 | def should_emit() -> bool: 16 | """ 17 | Identify if telemetry metrics should be emitted or not. 18 | """ 19 | return settings.enable_telemetry and not settings.do_not_track and not settings.no_analytics 20 | 21 | 22 | def collect_standard_usage_metrics() -> dict[str, object]: 23 | """ 24 | Return standard telemetry metrics. 25 | """ 26 | metrics = { 27 | "dagfactory_version": dagfactory.__version__, 28 | "airflow_version": parse.quote(airflow_version), 29 | "python_version": platform.python_version(), 30 | "platform_system": platform.system(), 31 | "platform_machine": platform.machine(), 32 | "variables": {}, 33 | } 34 | return metrics 35 | 36 | 37 | def emit_usage_metrics(metrics: dict[str, object]) -> bool: 38 | """ 39 | Emit desired telemetry metrics to remote telemetry endpoint. 40 | 41 | The metrics must contain the necessary fields to build the TELEMETRY_URL. 42 | """ 43 | query_string = urlencode(metrics) 44 | telemetry_url = constants.TELEMETRY_URL.format( 45 | **metrics, telemetry_version=constants.TELEMETRY_VERSION, query_string=query_string 46 | ) 47 | logging.debug("Telemetry is enabled. Emitting the following usage metrics to %s: %s", telemetry_url, metrics) 48 | try: 49 | response = httpx.get(telemetry_url, timeout=constants.TELEMETRY_TIMEOUT, follow_redirects=True) 50 | except httpx.HTTPError as e: 51 | logging.warning( 52 | "Unable to emit usage metrics to %s. An HTTPX connection error occurred: %s.", telemetry_url, str(e) 53 | ) 54 | is_success = False 55 | else: 56 | is_success = response.is_success 57 | if not is_success: 58 | logging.warning( 59 | "Unable to emit usage metrics to %s. Status code: %s. Message: %s", 60 | telemetry_url, 61 | response.status_code, 62 | response.text, 63 | ) 64 | return is_success 65 | 66 | 67 | def emit_usage_metrics_if_enabled(event_type: str, additional_metrics: dict[str, object]) -> bool: 68 | """ 69 | Checks if telemetry should be emitted, fetch standard metrics, complement with custom metrics 70 | and emit them to remote telemetry endpoint. 71 | 72 | :returns: If the event was successfully sent to the telemetry backend or not. 
73 | """ 74 | if should_emit(): 75 | metrics = collect_standard_usage_metrics() 76 | metrics["event_type"] = event_type 77 | metrics["variables"].update(additional_metrics) 78 | metrics.update(additional_metrics) 79 | is_success = emit_usage_metrics(metrics) 80 | return is_success 81 | else: 82 | logging.debug("Telemetry is disabled. To enable it, export AIRFLOW__DAG_FACTORY__ENABLE_TELEMETRY=True.") 83 | return False 84 | -------------------------------------------------------------------------------- /dev/.astro/config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | name: dev 3 | -------------------------------------------------------------------------------- /dev/.astro/dag_integrity_exceptions.txt: -------------------------------------------------------------------------------- 1 | # Add dag files to exempt from parse test below. ex: dags/ 2 | -------------------------------------------------------------------------------- /dev/.astro/test_dag_integrity_default.py: -------------------------------------------------------------------------------- 1 | """Test the validity of all DAGs. **USED BY DEV PARSE COMMAND DO NOT EDIT**""" 2 | 3 | import logging 4 | import os 5 | from contextlib import contextmanager 6 | 7 | import pytest 8 | from airflow.hooks.base import BaseHook 9 | from airflow.models import Connection, DagBag, Variable 10 | from airflow.utils.db import initdb 11 | 12 | # init airflow database 13 | initdb() 14 | 15 | # The following code patches errors caused by missing OS Variables, Airflow Connections, and Airflow Variables 16 | 17 | 18 | # =========== MONKEYPATCH BaseHook.get_connection() =========== 19 | def basehook_get_connection_monkeypatch(key: str, *args, **kwargs): 20 | print(f"Attempted to fetch connection during parse returning an empty Connection object for {key}") 21 | return Connection(key) 22 | 23 | 24 | BaseHook.get_connection = basehook_get_connection_monkeypatch 25 | # # =========== /MONKEYPATCH BASEHOOK.GET_CONNECTION() =========== 26 | 27 | 28 | # =========== MONKEYPATCH OS.GETENV() =========== 29 | def os_getenv_monkeypatch(key: str, *args, **kwargs): 30 | default = None 31 | if args: 32 | default = args[0] # os.getenv should get at most 1 arg after the key 33 | if kwargs: 34 | default = kwargs.get("default", None) # and sometimes kwarg if people are using the sig 35 | 36 | env_value = os.environ.get(key, None) 37 | 38 | if env_value: 39 | return env_value # if the env_value is set, return it 40 | if key == "JENKINS_HOME" and default is None: # fix https://github.com/astronomer/astro-cli/issues/601 41 | return None 42 | if default: 43 | return default # otherwise return whatever default has been passed 44 | return f"MOCKED_{key.upper()}_VALUE" # if absolutely nothing has been passed - return the mocked value 45 | 46 | 47 | os.getenv = os_getenv_monkeypatch 48 | # # =========== /MONKEYPATCH OS.GETENV() =========== 49 | 50 | # =========== MONKEYPATCH VARIABLE.GET() =========== 51 | 52 | 53 | class magic_dict(dict): 54 | def __init__(self, *args, **kwargs): 55 | self.update(*args, **kwargs) 56 | 57 | def __getitem__(self, key): 58 | return {}.get(key, "MOCKED_KEY_VALUE") 59 | 60 | 61 | _no_default = object() # allow falsey defaults 62 | 63 | 64 | def variable_get_monkeypatch(key: str, default_var=_no_default, deserialize_json=False): 65 | print(f"Attempted to get Variable value during parse, returning a mocked value for {key}") 66 | 67 | if default_var is not _no_default: 68 | return default_var 69 | if 
deserialize_json: 70 | return magic_dict() 71 | return "NON_DEFAULT_MOCKED_VARIABLE_VALUE" 72 | 73 | 74 | Variable.get = variable_get_monkeypatch 75 | # # =========== /MONKEYPATCH VARIABLE.GET() =========== 76 | 77 | 78 | @contextmanager 79 | def suppress_logging(namespace): 80 | """ 81 | Suppress logging within a specific namespace to keep tests "clean" during build 82 | """ 83 | logger = logging.getLogger(namespace) 84 | old_value = logger.disabled 85 | logger.disabled = True 86 | try: 87 | yield 88 | finally: 89 | logger.disabled = old_value 90 | 91 | 92 | def get_import_errors(): 93 | """ 94 | Generate a tuple for import errors in the dag bag, and include DAGs without errors. 95 | """ 96 | with suppress_logging("airflow"): 97 | dag_bag = DagBag(include_examples=False) 98 | 99 | def strip_path_prefix(path): 100 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 101 | 102 | # Initialize an empty list to store the tuples 103 | result = [] 104 | 105 | # Iterate over the items in import_errors 106 | for k, v in dag_bag.import_errors.items(): 107 | result.append((strip_path_prefix(k), v.strip())) 108 | 109 | # Check if there are DAGs without errors 110 | for file_path in dag_bag.dags: 111 | # Check if the file_path is not in import_errors, meaning no errors 112 | if file_path not in dag_bag.import_errors: 113 | result.append((strip_path_prefix(file_path), "No import errors")) 114 | 115 | return result 116 | 117 | 118 | @pytest.mark.parametrize("rel_path, rv", get_import_errors(), ids=[x[0] for x in get_import_errors()]) 119 | def test_file_imports(rel_path, rv): 120 | """Test for import errors on a file""" 121 | if os.path.exists(".astro/dag_integrity_exceptions.txt"): 122 | with open(".astro/dag_integrity_exceptions.txt", "r") as f: 123 | exceptions = f.readlines() 124 | print(f"Exceptions: {exceptions}") 125 | if (rv != "No import errors") and rel_path not in exceptions: 126 | # If rv is not "No import errors," consider it a failed test 127 | raise Exception(f"{rel_path} failed to import with message \n {rv}") 128 | else: 129 | # If rv is "No import errors," consider it a passed test 130 | print(f"{rel_path} passed the import test") 131 | -------------------------------------------------------------------------------- /dev/.dockerignore: -------------------------------------------------------------------------------- 1 | astro 2 | .git 3 | .env 4 | airflow_settings.yaml 5 | logs/ 6 | .venv 7 | airflow.db 8 | airflow.cfg 9 | -------------------------------------------------------------------------------- /dev/.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | .env 3 | .DS_Store 4 | airflow_settings.yaml 5 | __pycache__/ 6 | astro 7 | .venv 8 | airflow-webserver.pid 9 | webserver_config.py 10 | airflow.cfg 11 | airflow.db 12 | -------------------------------------------------------------------------------- /dev/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/astronomer/astro-runtime:12.7.0 2 | 3 | ENV CONFIG_ROOT_DIR=/usr/local/airflow/dags/ 4 | 5 | USER root 6 | 7 | RUN apt-get update && apt-get install -y jq 8 | 9 | USER astro 10 | 11 | RUN pip install /usr/local/airflow/include/*.whl 12 | -------------------------------------------------------------------------------- /dev/README.md: -------------------------------------------------------------------------------- 1 | # Sample Airflow setup and DAG Factory examples 2 | 3 | ## Overview 4 | 5 | Welcome to 
Astronomer! This project was generated after you ran 'astro dev init' using the Astronomer CLI. This readme describes the contents of the project, as well as how to run Apache Airflow on your local machine. 6 | 7 | ## Project Contents 8 | 9 | Your Astro project contains the following files and folders: 10 | 11 | - dags: This folder contains the Python files for your Airflow DAGs. By default, this directory includes one example DAG: 12 | - `example_astronauts`: This DAG shows a simple ETL pipeline example that queries the list of astronauts currently in space from the Open Notify API and prints a statement for each astronaut. The DAG uses the TaskFlow API to define tasks in Python, and dynamic task mapping to dynamically print a statement for each astronaut. For more on how this DAG works, see our [Getting started tutorial](https://www.astronomer.io/docs/learn/get-started-with-airflow). 13 | - Dockerfile: This file contains a versioned Astro Runtime Docker image that provides a differentiated Airflow experience. If you want to execute other commands or overrides at runtime, specify them here. 14 | - include: This folder contains any additional files that you want to include as part of your project. It is empty by default. 15 | - packages.txt: Install OS-level packages needed for your project by adding them to this file. It is empty by default. 16 | - requirements.txt: Install Python packages needed for your project by adding them to this file. It is empty by default. 17 | - plugins: Add custom or community plugins for your project to this file. It is empty by default. 18 | - airflow_settings.yaml: Use this local-only file to specify Airflow Connections, Variables, and Pools instead of entering them in the Airflow UI as you develop DAGs in this project. 19 | 20 | ## Deploy Your Project Locally 21 | 22 | (1) Start Airflow on your local machine by running 'astro dev start'. 23 | 24 | This command will spin up 4 Docker containers on your machine, each for a different Airflow component: 25 | 26 | - Postgres: Airflow's Metadata Database 27 | - Webserver: The Airflow component responsible for rendering the Airflow UI 28 | - Scheduler: The Airflow component responsible for monitoring and triggering tasks 29 | - Triggerer: The Airflow component responsible for triggering deferred tasks 30 | 31 | (2) Verify that all 4 Docker containers were created by running 'docker ps'. 32 | 33 | Note: Running 'astro dev start' will start your project with the Airflow Webserver exposed at port 8080 and Postgres exposed at port 5432. If you already have either of those ports allocated, you can either [stop your existing Docker containers or change the port](https://www.astronomer.io/docs/astro/cli/troubleshoot-locally#ports-are-not-available-for-my-local-airflow-webserver). 34 | 35 | (3) Access the Airflow UI for your local Airflow project. To do so, go to and log in with 'admin' for both your Username and Password. 36 | 37 | You should also be able to access your Postgres Database at 'localhost:5432/postgres'. 38 | 39 | ## Deploy Your Project to Astronomer 40 | 41 | If you have an Astronomer account, pushing code to a Deployment on Astronomer is simple. For deploying instructions, refer to Astronomer documentation: 42 | 43 | ## Contact 44 | 45 | The Astronomer CLI is maintained with love by the Astronomer team. To report a bug or suggest a change, reach out to our support. 
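Read together with the repository Makefile shown earlier, the usual loop for trying the bundled DAG Factory examples locally is roughly the following; this assumes Docker and the Astro CLI are installed, and that `make docker-run` first builds the wheel into `dev/include/` so the Dockerfile can install it:

```commandline
make setup        # create a virtualenv and install dag-factory with the test extras
make docker-run   # build the wheel, then start (or restart) local Airflow via the Astro CLI
# browse to http://localhost:8080 and log in with 'admin' / 'admin'
make docker-stop  # stop the local Airflow containers when finished
```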
46 | -------------------------------------------------------------------------------- /dev/dags/comparison/example_hackernews_dagfactory.yml: -------------------------------------------------------------------------------- 1 | example_hackernews_dagfactory: 2 | default_args: 3 | start_date: 2022-03-04 4 | tasks: 5 | fetch_top_ten_news: 6 | operator: airflow.operators.bash_operator.BashOperator 7 | bash_command: "curl -s https://hacker-news.firebaseio.com/v0/topstories.json | jq -c -r '.[0:10]'" 8 | fetch_first_top_news: 9 | operator: airflow.operators.bash_operator.BashOperator 10 | bash_command: "echo {{ task_instance.xcom_pull(task_ids='fetch_top_ten_news') }} | jq -c -r '.[0]' | xargs -I {} curl -s 'https://hacker-news.firebaseio.com/v0/item/{}.json'" 11 | dependencies: [fetch_top_ten_news] 12 | fetch_second_top_news: 13 | operator: airflow.operators.bash_operator.BashOperator 14 | bash_command: "echo {{ task_instance.xcom_pull(task_ids='fetch_top_ten_news') }} | jq -c -r '.[1]' | xargs -I {} curl -s 'https://hacker-news.firebaseio.com/v0/item/{}.json'" 15 | dependencies: [fetch_top_ten_news] 16 | summarize: 17 | operator: airflow.operators.python.PythonOperator 18 | python_callable: hacker_news.summarize 19 | dependencies: [fetch_first_top_news, fetch_second_top_news] 20 | -------------------------------------------------------------------------------- /dev/dags/comparison/example_hackernews_plain_airflow.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow.models.dag import DAG 4 | from airflow.operators.bash_operator import BashOperator 5 | from airflow.operators.python import PythonOperator 6 | from hacker_news import summarize 7 | 8 | with DAG(dag_id="example_hackernews_plain_airflow", schedule=None, start_date=datetime(2022, 3, 4)) as dag: 9 | 10 | fetch_top_ten_news = BashOperator( 11 | task_id="fetch_top_ten_news", 12 | bash_command="curl -s https://hacker-news.firebaseio.com/v0/topstories.json | jq -c -r '.[0:10]'", 13 | ) 14 | 15 | fetch_first_top_news = BashOperator( 16 | task_id="fetch_first_top_news", 17 | bash_command=""" 18 | echo {{ task_instance.xcom_pull(task_ids='fetch_top_ten_news') }} | jq -c -r '.[0]' | xargs -I {} curl -s 'https://hacker-news.firebaseio.com/v0/item/{}.json' 19 | """, 20 | ) 21 | 22 | fetch_second_top_news = BashOperator( 23 | task_id="fetch_second_news", 24 | bash_command=""" 25 | echo {{ task_instance.xcom_pull(task_ids='fetch_top_ten_news') }} | jq -c -r '.[1]' | xargs -I {} curl -s 'https://hacker-news.firebaseio.com/v0/item/{}.json' 26 | """, 27 | ) 28 | 29 | summarize = PythonOperator(task_id="summarize", python_callable=summarize) 30 | 31 | fetch_top_ten_news >> [fetch_first_top_news, fetch_second_top_news] >> summarize 32 | -------------------------------------------------------------------------------- /dev/dags/comparison/example_pypi_stats_dagfactory.yml: -------------------------------------------------------------------------------- 1 | example_pypi_stats_dagfactory: 2 | default_args: 3 | start_date: 2022-03-04 4 | tasks: 5 | get_pypi_projects_list: 6 | decorator: airflow.decorators.task 7 | python_callable: pypi_stats.get_pypi_projects_list 8 | fetch_pypi_stats_data: 9 | decorator: airflow.decorators.task 10 | python_callable: pypi_stats.fetch_pypi_stats_data 11 | expand: 12 | package_name: +get_pypi_projects_list 13 | summarize: 14 | decorator: airflow.decorators.task 15 | python_callable: pypi_stats.summarize 16 | values: +fetch_pypi_stats_data 17 
| -------------------------------------------------------------------------------- /dev/dags/comparison/example_pypi_stats_plain_airflow.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from datetime import datetime 4 | from typing import Any 5 | 6 | from airflow.decorators import task 7 | from airflow.models.dag import DAG 8 | from pypi_stats import fetch_pypi_stats_data, get_pypi_projects_list, summarize 9 | 10 | with DAG(dag_id="example_pypi_stats_plain_airflow", schedule=None, start_date=datetime(2022, 3, 4)) as dag: 11 | 12 | @task 13 | def get_pypi_projects_list_(): 14 | return get_pypi_projects_list() 15 | 16 | @task 17 | def fetch_pypi_stats_data_(project_name: str): 18 | return fetch_pypi_stats_data(project_name) 19 | 20 | @task 21 | def summarize_(values: list[dict[str, Any]]): 22 | return summarize(values) 23 | 24 | pypi_stats_data = fetch_pypi_stats_data_.expand(project_name=get_pypi_projects_list_()) 25 | summarize_(pypi_stats_data) 26 | -------------------------------------------------------------------------------- /dev/dags/customized/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/dev/dags/customized/__init__.py -------------------------------------------------------------------------------- /dev/dags/customized/callbacks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/dev/dags/customized/callbacks/__init__.py -------------------------------------------------------------------------------- /dev/dags/customized/callbacks/custom_callbacks.py: -------------------------------------------------------------------------------- 1 | """ 2 | example_callbacks.py 3 | 4 | Author: Jake Roach 5 | Date: 2024-10-22 6 | """ 7 | 8 | 9 | def output_message(context, param1, param2): 10 | print("A callback has been raised!") 11 | print(f"{param1} ---------- {param2}") 12 | -------------------------------------------------------------------------------- /dev/dags/customized/helpers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/dev/dags/customized/helpers/__init__.py -------------------------------------------------------------------------------- /dev/dags/customized/helpers/etl.py: -------------------------------------------------------------------------------- 1 | def extract(): 2 | print("extract() function has been called") 3 | 4 | 5 | def transform(ds_nodash): 6 | print("transform() function has been called") 7 | 8 | 9 | def load(database_name, table_name): 10 | print("load() function has been called") 11 | -------------------------------------------------------------------------------- /dev/dags/customized/operators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/dev/dags/customized/operators/__init__.py -------------------------------------------------------------------------------- /dev/dags/customized/operators/breakfast_operators.py: -------------------------------------------------------------------------------- 1 | from airflow.models 
import BaseOperator 2 | 3 | 4 | class MakeBreadOperator(BaseOperator): 5 | template_fields = ("bread_type",) 6 | 7 | def __init__(self, bread_type, *args, **kwargs): 8 | super(MakeBreadOperator, self).__init__(*args, **kwargs) 9 | self.bread_type = bread_type 10 | 11 | def execute(self, context): 12 | print("Make {} bread".format(self.bread_type)) 13 | 14 | 15 | class MakeCoffeeOperator(BaseOperator): 16 | template_fields = ("coffee_type",) 17 | 18 | def __init__(self, coffee_type, *args, **kwargs): 19 | super(MakeCoffeeOperator, self).__init__(*args, **kwargs) 20 | self.coffee_type = coffee_type 21 | 22 | def execute(self, context): 23 | print("Make {} coffee".format(self.coffee_type)) 24 | -------------------------------------------------------------------------------- /dev/dags/datasets/example_config_datasets.yml: -------------------------------------------------------------------------------- 1 | datasets: 2 | - name: dataset_custom_1 3 | uri: s3://bucket-cjmm/raw/dataset_custom_1 4 | - name: dataset_custom_2 5 | uri: s3://bucket-cjmm/raw/dataset_custom_2 6 | - name: dataset_custom_3 7 | uri: s3://bucket-cjmm/raw/dataset_custom_3 8 | -------------------------------------------------------------------------------- /dev/dags/datasets/example_dag_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | # The following import is here so Airflow parses this file 5 | # from airflow import DAG 6 | import dagfactory 7 | 8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/" 9 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR)) 10 | 11 | config_file = str(CONFIG_ROOT_DIR / "datasets/example_dag_datasets.yml") 12 | 13 | example_dag_factory = dagfactory.DagFactory(config_file) 14 | 15 | # Creating task dependencies 16 | example_dag_factory.clean_dags(globals()) 17 | example_dag_factory.generate_dags(globals()) 18 | -------------------------------------------------------------------------------- /dev/dags/datasets/example_dag_datasets.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | owner: "default_owner" 4 | start_date: '2023-07-14' 5 | retries: 1 6 | retry_delay_sec: 300 7 | concurrency: 1 8 | max_active_runs: 1 9 | dagrun_timeout_sec: 600 10 | default_view: "tree" 11 | orientation: "LR" 12 | 13 | example_simple_dataset_producer_dag: 14 | description: "Example DAG producer simple datasets" 15 | schedule_interval: "0 5 * * *" 16 | tasks: 17 | task_1: 18 | operator: airflow.operators.bash_operator.BashOperator 19 | bash_command: "echo 1" 20 | inlets: [ 's3://bucket_example/raw/dataset1_source.json' ] 21 | outlets: ['s3://bucket_example/raw/dataset1.json'] 22 | task_2: 23 | operator: airflow.operators.bash_operator.BashOperator 24 | bash_command: "echo 2" 25 | dependencies: [task_1] 26 | inlets: [ 's3://bucket_example/raw/dataset2_source.json' ] 27 | outlets: ['s3://bucket_example/raw/dataset2.json'] 28 | 29 | example_simple_dataset_consumer_dag: 30 | description: "Example DAG consumer simple datasets" 31 | schedule: ['s3://bucket_example/raw/dataset1.json', 's3://bucket_example/raw/dataset2.json'] 32 | tasks: 33 | task_1: 34 | operator: airflow.operators.bash_operator.BashOperator 35 | bash_command: "echo 'consumer datasets'" 36 | 37 | example_custom_config_dataset_producer_dag: 38 | description: "Example DAG producer custom config datasets" 39 | schedule_interval: "0 5 * * *" 40 | tasks: 41 | task_1: 42 |
operator: airflow.operators.bash_operator.BashOperator 43 | bash_command: "echo 1" 44 | outlets: 45 | file: $CONFIG_ROOT_DIR/datasets/example_config_datasets.yml 46 | datasets: ['dataset_custom_1', 'dataset_custom_2'] 47 | 48 | example_custom_config_dataset_consumer_dag: 49 | description: "Example DAG consumer custom config datasets" 50 | schedule: 51 | file: $CONFIG_ROOT_DIR/datasets/example_config_datasets.yml 52 | datasets: ['dataset_custom_1', 'dataset_custom_2'] 53 | tasks: 54 | task_1: 55 | operator: airflow.operators.bash_operator.BashOperator 56 | bash_command: "echo 'consumer datasets'" 57 | 58 | example_custom_config_condition_dataset_consumer_dag: 59 | description: "Example DAG consumer custom config condition datasets" 60 | schedule: 61 | file: $CONFIG_ROOT_DIR/datasets/example_config_datasets.yml 62 | datasets: "((dataset_custom_1 & dataset_custom_2) | dataset_custom_3)" 63 | tasks: 64 | task_1: 65 | operator: airflow.operators.bash_operator.BashOperator 66 | bash_command: "echo 'consumer datasets'" 67 | 68 | example_without_custom_config_condition_dataset_consumer_dag: 69 | description: "Example DAG consumer custom config condition datasets" 70 | schedule: 71 | datasets: 72 | !or 73 | - !and 74 | - "s3://bucket-cjmm/raw/dataset_custom_1" 75 | - "s3://bucket-cjmm/raw/dataset_custom_2" 76 | - "s3://bucket-cjmm/raw/dataset_custom_3" 77 | tasks: 78 | task_1: 79 | operator: airflow.operators.bash_operator.BashOperator 80 | bash_command: "echo 'consumer datasets'" 81 | -------------------------------------------------------------------------------- /dev/dags/datasets/example_dag_datasets_outlet_inlet.yml: -------------------------------------------------------------------------------- 1 | producer_dag: 2 | default_args: 3 | owner: "example_owner" 4 | retries: 1 5 | start_date: '2024-01-01' 6 | description: "Example DAG producer simple datasets" 7 | schedule_interval: "0 5 * * *" 8 | tasks: 9 | task_1: 10 | operator: airflow.operators.bash_operator.BashOperator 11 | bash_command: "echo 1" 12 | inlets: [ 's3://bucket_example/raw/dataset1_source.json' ] 13 | outlets: [ 's3://bucket_example/raw/dataset1.json' ] 14 | task_2: 15 | bash_command: "echo 2" 16 | dependencies: [ task_1 ] 17 | inlets: [ 's3://bucket_example/raw/dataset2_source.json' ] 18 | outlets: [ 's3://bucket_example/raw/dataset2.json' ] 19 | consumer_dag: 20 | default_args: 21 | owner: "example_owner" 22 | retries: 1 23 | start_date: '2024-01-01' 24 | description: "Example DAG consumer simple datasets" 25 | schedule: [ 's3://bucket_example/raw/dataset1.json', 's3://bucket_example/raw/dataset2.json' ] 26 | tasks: 27 | task_1: 28 | operator: airflow.operators.bash_operator.BashOperator 29 | bash_command: "echo 'consumer datasets'" 30 | -------------------------------------------------------------------------------- /dev/dags/datasets/example_dataset_condition_string.yml: -------------------------------------------------------------------------------- 1 | consumer_dag: 2 | default_args: 3 | owner: "example_owner" 4 | retries: 1 5 | start_date: '2024-01-01' 6 | description: "Example DAG consumer simple datasets" 7 | schedule: 8 | datasets: "((s3://bucket-cjmm/raw/dataset_custom_1 & s3://bucket-cjmm/raw/dataset_custom_2) | s3://bucket-cjmm/raw/dataset_custom_3)" 9 | tasks: 10 | task_1: 11 | operator: airflow.operators.bash_operator.BashOperator 12 | bash_command: "echo 'consumer datasets'" 13 | -------------------------------------------------------------------------------- /dev/dags/datasets/example_dataset_yaml_syntax.yml: 
-------------------------------------------------------------------------------- 1 | consumer_dag: 2 | default_args: 3 | owner: "example_owner" 4 | retries: 1 5 | start_date: '2024-01-01' 6 | description: "Example DAG consumer simple datasets" 7 | schedule: 8 | datasets: 9 | !or 10 | - !and 11 | - "s3://bucket-cjmm/raw/dataset_custom_1" 12 | - "s3://bucket-cjmm/raw/dataset_custom_2" 13 | - "s3://bucket-cjmm/raw/dataset_custom_3" 14 | tasks: 15 | task_1: 16 | operator: airflow.operators.bash_operator.BashOperator 17 | bash_command: "echo 'consumer datasets'" 18 | -------------------------------------------------------------------------------- /dev/dags/defaults.yml: -------------------------------------------------------------------------------- 1 | default_args: 2 | start_date: "2025-01-01" 3 | owner: "global_owner" 4 | -------------------------------------------------------------------------------- /dev/dags/example_callbacks.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | # The following import is here so Airflow parses this file 5 | # from airflow import DAG 6 | import dagfactory 7 | 8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/" 9 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR)) 10 | 11 | config_file = str(CONFIG_ROOT_DIR / "example_callbacks.yml") 12 | 13 | example_dag_factory = dagfactory.DagFactory(config_file) 14 | 15 | # Creating task dependencies 16 | example_dag_factory.clean_dags(globals()) 17 | example_dag_factory.generate_dags(globals()) 18 | -------------------------------------------------------------------------------- /dev/dags/example_callbacks.yml: -------------------------------------------------------------------------------- 1 | example_callbacks: 2 | default_args: 3 | start_date: "2024-01-01" 4 | # Callbacks at be set at the default_args level. These callbacks are then passed to each Task. Fun fact; 5 | # default_args can be overridden within a Task 6 | on_retry_callback: print_hello.print_hello_from_callback 7 | on_failure_callback: 8 | callback: airflow.providers.slack.notifications.slack.send_slack_notification 9 | slack_conn_id: slack_conn_id 10 | text: | 11 | :red_circle: Task Failed. 12 | This task has failed and needs to be addressed. 13 | Please remediate this issue ASAP. 14 | channel: "#channel" 15 | schedule_interval: "@daily" 16 | catchup: False 17 | # These callbacks are set at the DAG-level, vs. the callbacks set above in default_args that are passed onto each 18 | # Task. 
Previously, the same "on_success_callback" configuration was set as part of task_2 19 | on_execute_callback_name: print_hello_from_callback 20 | on_execute_callback_file: $CONFIG_ROOT_DIR/print_hello.py 21 | on_success_callback: 22 | callback: customized.callbacks.custom_callbacks.output_message 23 | param1: param1 24 | param2: param2 25 | task_groups: 26 | task_group_1: 27 | default_args: 28 | on_success_callback: print_hello.print_hello_from_callback 29 | dependencies: [task_1, task_2] 30 | tasks: 31 | start: 32 | operator: airflow.operators.empty.EmptyOperator 33 | on_success_callback_name: print_hello_from_callback 34 | on_success_callback_file: $CONFIG_ROOT_DIR/print_hello.py 35 | task_1: 36 | operator: airflow.operators.bash_operator.BashOperator 37 | bash_command: "echo 1" 38 | on_success_callback: 39 | callback: customized.callbacks.custom_callbacks.output_message 40 | param1: param1 41 | param2: param2 42 | dependencies: [start] 43 | task_2: 44 | operator: airflow.operators.bash_operator.BashOperator 45 | bash_command: "echo 2" 46 | on_success_callback_name: print_hello_from_callback 47 | on_success_callback_file: $CONFIG_ROOT_DIR/print_hello.py 48 | dependencies: [start] 49 | task_3: 50 | operator: airflow.operators.bash_operator.BashOperator 51 | bash_command: "echo 3" 52 | task_group_name: task_group_1 53 | end: 54 | operator: airflow.operators.bash_operator.BashOperator 55 | bash_command: "echo -1" 56 | dependencies: 57 | - task_group_1 58 | -------------------------------------------------------------------------------- /dev/dags/example_customize_operator.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | # The following import is here so Airflow parses this file 5 | # from airflow import DAG 6 | import dagfactory 7 | 8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/" 9 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR)) 10 | 11 | config_file = str(CONFIG_ROOT_DIR / "example_customize_operator.yml") 12 | 13 | example_dag_factory = dagfactory.DagFactory(config_file) 14 | 15 | # Creating task dependencies 16 | example_dag_factory.clean_dags(globals()) 17 | example_dag_factory.generate_dags(globals()) 18 | -------------------------------------------------------------------------------- /dev/dags/example_customize_operator.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | owner: "default_owner" 4 | start_date: 2020-01-01 5 | retries: 1 6 | retry_delay_sec: 300 7 | concurrency: 1 8 | max_active_runs: 1 9 | dagrun_timeout_sec: 600 10 | default_view: "tree" 11 | orientation: "LR" 12 | schedule_interval: "0 1 * * *" 13 | 14 | example_breadfast: 15 | default_args: 16 | owner: "custom_owner" 17 | start_date: 2 days 18 | description: "this is an customized operator dag" 19 | schedule_interval: "0 3 * * *" 20 | tasks: 21 | begin: 22 | operator: airflow.operators.dummy_operator.DummyOperator 23 | make_bread_1: 24 | operator: customized.operators.breakfast_operators.MakeBreadOperator 25 | bread_type: 'Sourdough' 26 | dependencies: 27 | - begin 28 | make_bread_2: 29 | operator: customized.operators.breakfast_operators.MakeBreadOperator 30 | bread_type: 'Multigrain' 31 | dependencies: 32 | - begin 33 | make_coffee_1: 34 | operator: customized.operators.breakfast_operators.MakeCoffeeOperator 35 | coffee_type: 'Black' 36 | dependencies: 37 | - begin 38 | - make_bread_1 39 | - make_bread_2 40 | end: 41 | 
operator: airflow.operators.dummy_operator.DummyOperator 42 | dependencies: 43 | - begin 44 | - make_bread_1 45 | - make_bread_2 46 | - make_coffee_1 47 | -------------------------------------------------------------------------------- /dev/dags/example_dag_factory.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | # The following import is here so Airflow parses this file 5 | # from airflow import DAG 6 | import dagfactory 7 | 8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/" 9 | 10 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR)) 11 | 12 | config_file = str(CONFIG_ROOT_DIR / "example_dag_factory.yml") 13 | 14 | example_dag_factory = dagfactory.DagFactory(config_file) 15 | 16 | # Creating task dependencies 17 | example_dag_factory.clean_dags(globals()) 18 | example_dag_factory.generate_dags(globals()) 19 | -------------------------------------------------------------------------------- /dev/dags/example_dag_factory.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | catchup: false, 4 | start_date: 2024-11-11 5 | 6 | # ----8<--- [ start: example_dag_yaml_configuration ] 7 | basic_example_dag: 8 | default_args: 9 | owner: "custom_owner" 10 | description: "this is an example dag" 11 | schedule_interval: "0 3 * * *" 12 | render_template_as_native_obj: True 13 | tasks: 14 | task_1: 15 | operator: airflow.operators.bash_operator.BashOperator 16 | bash_command: "echo 1" 17 | task_2: 18 | operator: airflow.operators.bash_operator.BashOperator 19 | bash_command: "echo 2" 20 | dependencies: [task_1] 21 | task_3: 22 | operator: airflow.operators.bash_operator.BashOperator 23 | bash_command: "echo 2" 24 | dependencies: [task_1] 25 | # ----8<--- [ end: example_dag_yaml_configuration ] 26 | -------------------------------------------------------------------------------- /dev/dags/example_dag_factory_default_args.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | start_date: '2024-01-01' 4 | schedule_interval: 0 0 * * * 5 | catchup: false 6 | tags: 7 | - "data engineering" 8 | 9 | etl: 10 | tasks: 11 | extract: 12 | operator: airflow.operators.bash_operator.BashOperator 13 | bash_command: "echo extract" 14 | transform: 15 | operator: airflow.operators.bash_operator.BashOperator 16 | bash_command: "echo transform" 17 | dependencies: 18 | - extract 19 | load: 20 | operator: airflow.operators.bash_operator.BashOperator 21 | bash_command: "echo load" 22 | dependencies: 23 | - transform 24 | -------------------------------------------------------------------------------- /dev/dags/example_dag_factory_default_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | # The following import is here so Airflow parses this file 5 | # from airflow import DAG 6 | import dagfactory 7 | 8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/" 9 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR)) 10 | 11 | config_file = str(CONFIG_ROOT_DIR / "example_dag_factory_default_config.yml") 12 | 13 | example_dag_factory = dagfactory.DagFactory(config_file) 14 | 15 | # Creating task dependencies 16 | example_dag_factory.clean_dags(globals()) 17 | example_dag_factory.generate_dags(globals()) 18 | 
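The loader scripts above all follow the same pattern: build a `dagfactory.DagFactory` from a single YAML path, then call `clean_dags()` and `generate_dags()`. When a folder holds several YAML workflow files, the `load_yaml_dags` helper (used later in `dev/dags/example_load_yaml_dags.py`) can register all of them from one loader module. The sketch below illustrates that alternative; the folder path and environment-variable fallback are assumptions borrowed from the surrounding examples, not a prescribed layout.

```python
import os
from pathlib import Path

from dagfactory import load_yaml_dags

# Assumed location of the YAML workflow files; adjust to your own DAGs folder.
DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/"
CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR))

# Scan the folder for YAML configs and register every generated DAG in this
# module's globals so the Airflow scheduler can discover them.
load_yaml_dags(
    globals_dict=globals(),
    dags_folder=str(CONFIG_ROOT_DIR),
)
```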
-------------------------------------------------------------------------------- /dev/dags/example_dag_factory_default_config.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | start_date: '2024-01-01' 4 | schedule_interval: 0 0 * * * 5 | catchup: false 6 | tags: 7 | - dynamic 8 | tasks: 9 | extract: 10 | operator: airflow.operators.bash_operator.BashOperator 11 | bash_command: "echo extract" 12 | transform: 13 | operator: airflow.operators.bash_operator.BashOperator 14 | bash_command: "echo transform" 15 | dependencies: 16 | - extract 17 | load: 18 | operator: airflow.operators.bash_operator.BashOperator 19 | dependencies: 20 | - transform 21 | 22 | 23 | machine_learning: 24 | tasks: 25 | load: 26 | bash_command: "echo machine_larning" 27 | 28 | data_science: 29 | tasks: 30 | load: 31 | bash_command: "echo data_science" 32 | 33 | artificial_intelligence: 34 | tasks: 35 | load: 36 | bash_command: "echo artificial_intelligence" 37 | -------------------------------------------------------------------------------- /dev/dags/example_dag_factory_multiple.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | # The following import is here so Airflow parses this file 5 | # from airflow import DAG 6 | import dagfactory 7 | 8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/" 9 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR)) 10 | 11 | config_file = str(CONFIG_ROOT_DIR / "example_dag_factory_multiple_config.yml") 12 | 13 | example_dag_factory = dagfactory.DagFactory(config_file) 14 | 15 | # Creating task dependencies 16 | example_dag_factory.clean_dags(globals()) 17 | example_dag_factory.generate_dags(globals()) 18 | -------------------------------------------------------------------------------- /dev/dags/example_dag_factory_multiple_config.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | catchup: false, 4 | owner: "default_owner" 5 | start_date: 2024-11-11 6 | retries: 1 7 | retry_delay_sec: 30 8 | on_success_callback_name: print_hello_from_callback 9 | on_success_callback_file: $CONFIG_ROOT_DIR/print_hello.py 10 | concurrency: 1 11 | max_active_runs: 1 12 | dagrun_timeout_sec: 600 13 | default_view: "tree" 14 | orientation: "LR" 15 | schedule_interval: "0 1 * * *" 16 | on_failure_callback_name: print_hello_from_callback 17 | on_failure_callback_file: $CONFIG_ROOT_DIR/print_hello.py 18 | 19 | 20 | # ----8<--- [ start: environment_variable_example ] 21 | example_dag: 22 | default_args: 23 | owner: "custom_owner" 24 | description: "this is an example dag" 25 | schedule_interval: "0 3 * * *" 26 | render_template_as_native_obj: True 27 | dag_display_name: "Pretty Example DAG" 28 | tasks: 29 | task_1: 30 | operator: airflow.operators.bash_operator.BashOperator 31 | bash_command: "echo 1" 32 | task_2: 33 | operator: airflow.operators.bash_operator.BashOperator 34 | bash_command: "echo 2" 35 | dependencies: [task_1] 36 | task_3: 37 | operator: airflow.operators.python_operator.PythonOperator 38 | python_callable_name: print_hello 39 | python_callable_file: $CONFIG_ROOT_DIR/print_hello.py 40 | dependencies: [task_1] 41 | # ----8<--- [ end: environment_variable_example ] 42 | 43 | example_dag2: 44 | default_args: 45 | timezone: Europe/Amsterdam 46 | tasks: 47 | task_1: 48 | operator: airflow.operators.bash_operator.BashOperator 49 | bash_command: 
"echo 1" 50 | task_2: 51 | operator: airflow.operators.bash_operator.BashOperator 52 | bash_command: "echo 2" 53 | dependencies: [task_1] 54 | task_3: 55 | operator: airflow.operators.bash_operator.BashOperator 56 | bash_command: "echo 3" 57 | dependencies: [task_1] 58 | 59 | example_dag3: 60 | tasks: 61 | task_1: 62 | operator: airflow.operators.bash_operator.BashOperator 63 | bash_command: "echo 1" 64 | task_2: 65 | operator: airflow.operators.bash_operator.BashOperator 66 | bash_command: "echo 2" 67 | dependencies: [task_1] 68 | task_3: 69 | operator: airflow.operators.bash_operator.BashOperator 70 | bash_command: "echo 3" 71 | dependencies: [task_1] 72 | 73 | example_dag4: 74 | description: "this dag uses task groups" 75 | task_groups: 76 | task_group_1: 77 | tooltip: "this is a task group" 78 | dependencies: [task_1] 79 | tasks: 80 | task_1: 81 | operator: airflow.operators.bash_operator.BashOperator 82 | bash_command: "echo 1" 83 | task_2: 84 | operator: airflow.operators.bash_operator.BashOperator 85 | bash_command: "echo 2" 86 | task_group_name: task_group_1 87 | task_3: 88 | operator: airflow.operators.python_operator.PythonOperator 89 | python_callable_name: print_hello 90 | python_callable_file: $CONFIG_ROOT_DIR/print_hello.py 91 | task_group_name: task_group_1 92 | dependencies: [task_2] 93 | task_4: 94 | operator: airflow.operators.bash_operator.BashOperator 95 | bash_command: "echo 1" 96 | dependencies: [task_group_1] 97 | -------------------------------------------------------------------------------- /dev/dags/example_dynamic_task_mapping.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | # The following import is here so Airflow parses this file 5 | # from airflow import DAG 6 | import dagfactory 7 | 8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/" 9 | 10 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR)) 11 | 12 | config_file = str(CONFIG_ROOT_DIR / "example_dynamic_task_mapping.yml") 13 | 14 | example_dag_factory = dagfactory.DagFactory(config_file) 15 | 16 | # Creating task dependencies 17 | example_dag_factory.clean_dags(globals()) 18 | example_dag_factory.generate_dags(globals()) 19 | -------------------------------------------------------------------------------- /dev/dags/example_dynamic_task_mapping.yml: -------------------------------------------------------------------------------- 1 | test_expand: 2 | default_args: 3 | owner: "custom_owner" 4 | start_date: 2 days 5 | description: "test expand" 6 | schedule_interval: "0 3 * * *" 7 | default_view: "graph" 8 | tasks: 9 | process: 10 | operator: airflow.operators.python_operator.PythonOperator 11 | python_callable_name: consume_value 12 | python_callable_file: $CONFIG_ROOT_DIR/expand_tasks.py 13 | partial: 14 | op_kwargs: 15 | fixed_param: "test" 16 | expand: 17 | op_args: 18 | request.output 19 | dependencies: [request] 20 | # This task is intentionally placed after the "process" task to demonstrate that DAG Factory does not require tasks 21 | # to be topologically ordered in the YAML file according to their dependencies. 
22 | request: 23 | operator: airflow.operators.python.PythonOperator 24 | python_callable_name: make_list 25 | python_callable_file: $CONFIG_ROOT_DIR/expand_tasks.py 26 | -------------------------------------------------------------------------------- /dev/dags/example_http_operator_task.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | try: 5 | from airflow.providers.http.operators.http import HttpOperator 6 | HTTP_OPERATOR_AVAILABLE = True 7 | except ImportError: 8 | HTTP_OPERATOR_AVAILABLE = False 9 | 10 | # The following import is here so Airflow parses this file 11 | # from airflow import DAG 12 | import dagfactory 13 | 14 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/" 15 | 16 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR)) 17 | if HTTP_OPERATOR_AVAILABLE: 18 | config_file = str(CONFIG_ROOT_DIR / "example_http_operator_task.yml") 19 | else: 20 | config_file = str(CONFIG_ROOT_DIR / "example_simple_http_operator_task.yml") 21 | 22 | example_dag_factory = dagfactory.DagFactory(config_file) 23 | 24 | # Creating task dependencies 25 | example_dag_factory.clean_dags(globals()) 26 | example_dag_factory.generate_dags(globals()) 27 | -------------------------------------------------------------------------------- /dev/dags/example_http_operator_task.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | catchup: false, 4 | start_date: 2025-03-20 5 | 6 | http_operator_example_dag: 7 | default_args: 8 | owner: "@owner" 9 | description: "this is an HttpOperator dag" 10 | schedule_interval: "0 3 * * *" 11 | tags: ['http'] 12 | render_template_as_native_obj: True 13 | tasks: 14 | send_request_json: 15 | operator: airflow.providers.http.operators.http.HttpOperator 16 | http_conn_id: "example_host" 17 | method: "POST" 18 | endpoint: "/run_test" 19 | data: 20 | data: "fake_data" 21 | format: "json" 22 | headers: 23 | Content-Type: application/json 24 | log_response: True 25 | send_request_plain_text: 26 | operator: airflow.providers.http.operators.http.HttpOperator 27 | http_conn_id: "example_host" 28 | method: "POST" 29 | endpoint: "/run_test" 30 | data: 31 | data: "fake_data" 32 | test: "plain_text" 33 | headers: 34 | Content-Type: text/plain 35 | log_response: True 36 | -------------------------------------------------------------------------------- /dev/dags/example_load_yaml_dags.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | from dagfactory import load_yaml_dags 5 | 6 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/" 7 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR)) 8 | config_dir = str(CONFIG_ROOT_DIR / "comparison") 9 | 10 | load_yaml_dags( 11 | globals_dict=globals(), 12 | dags_folder=config_dir, 13 | ) 14 | -------------------------------------------------------------------------------- /dev/dags/example_map_index_template.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | # The following import is here so Airflow parses this file 5 | # from airflow import DAG 6 | import dagfactory 7 | 8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/" 9 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR)) 10 | 11 | config_file = str(CONFIG_ROOT_DIR / 
"example_map_index_template.yml") 12 | example_dag_factory = dagfactory.DagFactory(config_file) 13 | 14 | # Creating task dependencies 15 | example_dag_factory.clean_dags(globals()) 16 | example_dag_factory.generate_dags(globals()) 17 | -------------------------------------------------------------------------------- /dev/dags/example_map_index_template.yml: -------------------------------------------------------------------------------- 1 | # Requires Airflow 2.9 or higher 2 | example_map_index_template: 3 | default_args: 4 | owner: "custom_owner" 5 | start_date: 2 days 6 | description: "Example of TaskFlow powered DAG that includes dynamic task mapping" 7 | schedule_interval: "0 3 * * *" 8 | default_view: "graph" 9 | tasks: 10 | dynamic_task_with_named_mapping: 11 | decorator: airflow.decorators.task 12 | python_callable: sample.extract_last_name 13 | map_index_template: "{{ custom_mapping_key }}" 14 | expand: 15 | full_name: 16 | - Lucy Black 17 | - Vera Santos 18 | - Marks Spencer 19 | -------------------------------------------------------------------------------- /dev/dags/example_simple_http_operator_task.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | catchup: false, 4 | start_date: 2025-03-20 5 | 6 | http_operator_example_dag: 7 | default_args: 8 | owner: "@owner" 9 | description: "this is an HttpOperator dag" 10 | schedule_interval: "0 3 * * *" 11 | tags: ['http'] 12 | render_template_as_native_obj: True 13 | tasks: 14 | send_request_json: 15 | operator: airflow.providers.http.operators.http.SimpleHttpOperator 16 | http_conn_id: "example_host" 17 | method: "POST" 18 | endpoint: "/run_test" 19 | data: 20 | data: "fake_data" 21 | format: "json" 22 | headers: 23 | Content-Type: application/json 24 | log_response: True 25 | send_request_plain_text: 26 | operator: airflow.providers.http.operators.http.SimpleHttpOperator 27 | http_conn_id: "example_host" 28 | method: "POST" 29 | endpoint: "/run_test" 30 | data: 31 | data: "fake_data" 32 | test: "plain_text" 33 | headers: 34 | Content-Type: text/plain 35 | log_response: True 36 | -------------------------------------------------------------------------------- /dev/dags/example_task_group.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | # The following import is here so Airflow parses this file 5 | # from airflow import DAG 6 | import dagfactory 7 | 8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/" 9 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR)) 10 | 11 | config_file = str(CONFIG_ROOT_DIR / "example_task_group.yml") 12 | 13 | example_dag_factory = dagfactory.DagFactory(config_file) 14 | 15 | # Creating task dependencies 16 | example_dag_factory.clean_dags(globals()) 17 | example_dag_factory.generate_dags(globals()) 18 | -------------------------------------------------------------------------------- /dev/dags/example_task_group.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | owner: default_owner 4 | retries: 1 5 | retry_delay_sec: 300 6 | start_date: 2024-01-01 7 | default_view: tree 8 | max_active_runs: 1 9 | schedule_interval: 0 1 * * * 10 | 11 | example_task_group: 12 | description: "this dag uses task groups" 13 | task_groups: 14 | task_group_1: 15 | tooltip: "this is a task group" 16 | dependencies: [task_1] 17 | task_group_2: 18 | tooltip: "this is a task group" 
19 | parent_group_name: task_group_1 20 | tasks: 21 | task_1: 22 | operator: airflow.operators.bash_operator.BashOperator 23 | bash_command: "echo 1" 24 | task_2: 25 | operator: airflow.operators.bash_operator.BashOperator 26 | bash_command: "echo 2" 27 | task_group_name: task_group_1 28 | task_4: 29 | operator: airflow.operators.bash_operator.BashOperator 30 | bash_command: "echo 4" 31 | task_group_name: task_group_2 32 | -------------------------------------------------------------------------------- /dev/dags/example_taskflow.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | # The following import is here so Airflow parses this file 5 | # from airflow import DAG 6 | import dagfactory 7 | 8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/" 9 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR)) 10 | 11 | config_file = str(CONFIG_ROOT_DIR / "example_taskflow.yml") 12 | example_dag_factory = dagfactory.DagFactory(config_file) 13 | 14 | # Creating task dependencies 15 | example_dag_factory.clean_dags(globals()) 16 | example_dag_factory.generate_dags(globals()) 17 | -------------------------------------------------------------------------------- /dev/dags/example_taskflow.yml: -------------------------------------------------------------------------------- 1 | example_taskflow: 2 | default_args: 3 | owner: "custom_owner" 4 | start_date: 2 days 5 | description: "Example of TaskFlow powered DAG that includes dynamic task mapping" 6 | schedule_interval: "0 3 * * *" 7 | default_view: "graph" 8 | tasks: 9 | some_number: 10 | decorator: airflow.decorators.task 11 | python_callable: sample.some_number 12 | numbers_list: 13 | decorator: airflow.decorators.task 14 | python_callable_name: build_numbers_list 15 | python_callable_file: $CONFIG_ROOT_DIR/sample.py 16 | another_numbers_list: 17 | decorator: airflow.decorators.task 18 | python_callable: sample.build_numbers_list 19 | double_number_from_arg: 20 | decorator: airflow.decorators.task 21 | python_callable: sample.double 22 | number: 2 23 | double_number_from_task: 24 | decorator: airflow.decorators.task 25 | python_callable: sample.double 26 | number: +some_number # the prefix + leads to resolving this value as the task `some_number`, previously defined 27 | double_number_with_dynamic_task_mapping_static: 28 | decorator: airflow.decorators.task 29 | python_callable: sample.double 30 | expand: 31 | number: 32 | - 1 33 | - 3 34 | - 5 35 | double_number_with_dynamic_task_mapping_taskflow: 36 | decorator: airflow.decorators.task 37 | python_callable: sample.double 38 | expand: 39 | number: +numbers_list # the prefix + tells DagFactory to resolve this value as the task `numbers_list`, previously defined 40 | multiply_with_multiple_parameters: 41 | decorator: airflow.decorators.task 42 | python_callable: sample.multiply 43 | expand: 44 | a: +numbers_list # the prefix + tells DagFactory to resolve this value as the task `numbers_list`, previously defined 45 | b: +another_numbers_list # the prefix + tells DagFactory to resolve this value as the task `another_numbers_list`, previously defined 46 | double_number_with_dynamic_task_and_partial: 47 | decorator: airflow.decorators.task 48 | python_callable: sample.double_with_label 49 | expand: 50 | number: +numbers_list # the prefix + tells DagFactory to resolve this value as the task `numbers_list`, previously defined 51 | partial: 52 | label: True 53 | 
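In `example_taskflow.yml` above, a value prefixed with `+` resolves to the output of a previously defined task. A rough plain-TaskFlow equivalent of the `double_number_from_task` and `double_number_with_dynamic_task_mapping_taskflow` entries is sketched below; the DAG id, schedule, and start date are illustrative, and it assumes `sample.py` (shown further down) is importable from the DAGs folder.

```python
from datetime import datetime

from airflow.decorators import task
from airflow.models.dag import DAG

from sample import build_numbers_list, double, some_number

with DAG(dag_id="example_taskflow_sketch", schedule=None, start_date=datetime(2024, 1, 1)):

    @task
    def some_number_():
        return some_number()

    @task
    def numbers_list_():
        return build_numbers_list()

    @task
    def double_(number: int):
        return double(number)

    # `number: +some_number` passes the upstream task's return value as the argument.
    double_number_from_task = double_(number=some_number_())

    # `expand: number: +numbers_list` maps the task over the upstream list (dynamic task mapping).
    double_number_with_dynamic_task_mapping = double_.expand(number=numbers_list_())
```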
-------------------------------------------------------------------------------- /dev/dags/expand_tasks.py: -------------------------------------------------------------------------------- 1 | def make_list(): 2 | return [[1], [2], [3], [4]] 3 | 4 | 5 | def consume_value(expanded_param, fixed_param): 6 | print(fixed_param) 7 | print(expanded_param) 8 | return [expanded_param] 9 | -------------------------------------------------------------------------------- /dev/dags/external_task_sensor.yml: -------------------------------------------------------------------------------- 1 | example_external_task_sensor_dag_factory_consumer: 2 | default_args: 3 | start_date: 2025-01-01 4 | schedule_interval: "@daily" 5 | tasks: 6 | wait_for_producer_task: 7 | operator: airflow.sensors.external_task_sensor.ExternalTaskSensor 8 | external_dag_id: example_external_task_sensor_dag_factory_producer 9 | external_task_id: producer_task 10 | mode: poke 11 | timeout: 600 12 | poke_interval: 30 13 | retries: 2 14 | consumer_task: 15 | operator: airflow.operators.empty.EmptyOperator 16 | 17 | example_external_task_sensor_dag_factory_consumer2: 18 | default_args: 19 | start_date: 2025-01-02 20 | schedule_interval: "@daily" 21 | tasks: 22 | wait_for_producer_task: 23 | operator: airflow.sensors.external_task_sensor.ExternalTaskSensor 24 | external_dag_id: example_external_task_sensor_dag_factory_producer 25 | external_task_id: producer_task 26 | execution_date_fn: sample.one_day_ago 27 | consumer_task: 28 | operator: airflow.operators.empty.EmptyOperator 29 | 30 | example_external_task_sensor_dag_factory_consumer3: 31 | default_args: 32 | start_date: 2025-01-03 33 | schedule_interval: "@daily" 34 | tasks: 35 | wait_for_producer_task: 36 | operator: airflow.sensors.external_task_sensor.ExternalTaskSensor 37 | external_dag_id: example_external_task_sensor_dag_factory_producer 38 | external_task_id: producer_task 39 | execution_delta: 1 days, 40 | consumer_task: 41 | operator: airflow.operators.empty.EmptyOperator 42 | -------------------------------------------------------------------------------- /dev/dags/hacker_news.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | 5 | import pandas as pd 6 | 7 | # ----8<--- [ start: hacker_news ] 8 | 9 | 10 | def summarize(**kwargs): 11 | """ 12 | Given the Airflow context is provided to this function, it will extract the XCom hackernews records from its 13 | upstream tasks and summarise in Markdown. 
14 | """ 15 | ti = kwargs["ti"] 16 | upstream_task_ids = ti.task.upstream_task_ids # Get upstream task IDs dynamically 17 | values = [json.loads(ti.xcom_pull(task_ids=task_id)) for task_id in upstream_task_ids] 18 | 19 | df = pd.DataFrame(values) 20 | selected_columns = ["title", "url"] 21 | df = df[selected_columns] 22 | markdown_output = df.to_markdown(index=False) 23 | print(markdown_output) 24 | return markdown_output 25 | 26 | 27 | # ----8<--- [ end: hacker_news ] 28 | -------------------------------------------------------------------------------- /dev/dags/invalid.yaml: -------------------------------------------------------------------------------- 1 | name: John Doe 2 | age: 30 3 | is_student: yes 4 | address: 5 | street: 123 Main St 6 | city: New York 7 | postal_code 10001 8 | - phone: 555-1234 9 | email: johndoe@example.com 10 | -------------------------------------------------------------------------------- /dev/dags/print_hello.py: -------------------------------------------------------------------------------- 1 | def print_hello(): 2 | print("hello") 3 | 4 | 5 | def print_hello_from_callback(context): 6 | print("hello from callback") 7 | -------------------------------------------------------------------------------- /dev/dags/pypi_stats.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyPI stats utility functions. 3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | from typing import Any 8 | 9 | import httpx 10 | import pandas as pd 11 | 12 | DEFAULT_PYPI_PROJECTS = [ 13 | "apache-airflow", 14 | "dag-factory", 15 | "astronomer-cosmos", 16 | ] 17 | 18 | 19 | # ----8<--- [ start: pypi_stats ] 20 | 21 | 22 | def get_pypi_projects_list(**kwargs: dict[str, Any]) -> list[str]: 23 | """ 24 | Return a list of PyPI project names to be analysed. 25 | """ 26 | projects_from_ui = kwargs.get("dag_run").conf.get("pypi_projects") if kwargs.get("dag_run") else None 27 | if projects_from_ui is None: 28 | pypi_projects = DEFAULT_PYPI_PROJECTS 29 | else: 30 | pypi_projects = projects_from_ui 31 | return pypi_projects 32 | 33 | 34 | def fetch_pypi_stats_data(package_name: str) -> dict[str, Any]: 35 | """ 36 | Given a PyPI project name, return the PyPI stats data associated to it. 37 | """ 38 | url = f"https://pypistats.org/api/packages/{package_name}/recent" 39 | package_json = httpx.get(url).json() 40 | package_data = package_json["data"] 41 | package_data["package_name"] = package_name 42 | return package_data 43 | 44 | 45 | def summarize(values: list[dict[str, Any]]): 46 | """ 47 | Given a list with PyPI stats data, create a table summarizing it, sorting by the last day total downloads. 
48 | """ 49 | df = pd.DataFrame(values) 50 | first_column = "package_name" 51 | sorted_columns = [first_column] + [col for col in df.columns if col != first_column] 52 | df = df[sorted_columns].sort_values(by="last_day", ascending=False) 53 | markdown_output = df.to_markdown(index=False) 54 | print(markdown_output) 55 | return markdown_output 56 | 57 | 58 | # ----8<--- [ end: pypi_stats ] 59 | 60 | if __name__ == "__main__": 61 | pypi_projects_list = get_pypi_projects_list() 62 | all_data = [] 63 | for pypi_project_name in pypi_projects_list: 64 | project_data = fetch_pypi_stats_data(pypi_project_name) 65 | all_data.append(project_data) 66 | summarize(data=all_data) 67 | -------------------------------------------------------------------------------- /dev/dags/sample.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from random import randint 3 | 4 | from airflow.operators.python import get_current_context 5 | 6 | 7 | def build_numbers_list(): 8 | return [2, 4, 6] 9 | 10 | 11 | def some_number(): 12 | return randint(0, 100) 13 | 14 | 15 | def double(number: int): 16 | result = 2 * number 17 | print(result) 18 | return result 19 | 20 | 21 | def multiply(a: int, b: int) -> int: 22 | result = a * b 23 | print(result) 24 | return result 25 | 26 | 27 | # added_values = add.expand(x=first_list(), y=second_list()) 28 | 29 | 30 | def double_with_label(number: int, label: bool = False): 31 | result = 2 * number 32 | if not label: 33 | print(result) 34 | return result 35 | else: 36 | label_info = "even" if number % 2 else "odd" 37 | print(f"{result} is {label_info}") 38 | return result, label_info 39 | 40 | 41 | def extract_last_name(full_name: str): 42 | name, last_name = full_name.split(" ") 43 | print(f"{name} {last_name}") 44 | context = get_current_context() 45 | context["custom_mapping_key"] = name 46 | return last_name 47 | 48 | 49 | def one_day_ago(execution_date: datetime): 50 | return execution_date - timedelta(days=1) 51 | -------------------------------------------------------------------------------- /dev/packages.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/dev/packages.txt -------------------------------------------------------------------------------- /dev/requirements.txt: -------------------------------------------------------------------------------- 1 | # Astro Runtime includes the following pre-installed providers packages: https://www.astronomer.io/docs/astro/runtime-image-architecture#provider-packages 2 | apache-airflow-providers-slack 3 | -------------------------------------------------------------------------------- /dev/tests/dags/test_dag_example.py: -------------------------------------------------------------------------------- 1 | """Example DAGs test. This test ensures that all Dags have tags, retries set to two, and no import errors. This is an example pytest and may not be fit the context of your DAGs. 
Feel free to add and remove tests.""" 2 | 3 | import logging 4 | import os 5 | from contextlib import contextmanager 6 | 7 | import pytest 8 | from airflow.models import DagBag 9 | 10 | 11 | @contextmanager 12 | def suppress_logging(namespace): 13 | logger = logging.getLogger(namespace) 14 | old_value = logger.disabled 15 | logger.disabled = True 16 | try: 17 | yield 18 | finally: 19 | logger.disabled = old_value 20 | 21 | 22 | def get_import_errors(): 23 | """ 24 | Generate a tuple for import errors in the dag bag 25 | """ 26 | with suppress_logging("airflow"): 27 | dag_bag = DagBag(include_examples=False) 28 | 29 | def strip_path_prefix(path): 30 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 31 | 32 | # prepend "(None,None)" to ensure that a test object is always created even if it's a no op. 33 | return [(None, None)] + [(strip_path_prefix(k), v.strip()) for k, v in dag_bag.import_errors.items()] 34 | 35 | 36 | def get_dags(): 37 | """ 38 | Generate a tuple of dag_id, in the DagBag 39 | """ 40 | with suppress_logging("airflow"): 41 | dag_bag = DagBag(include_examples=False) 42 | 43 | def strip_path_prefix(path): 44 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 45 | 46 | return [(k, v, strip_path_prefix(v.fileloc)) for k, v in dag_bag.dags.items()] 47 | 48 | 49 | @pytest.mark.parametrize("rel_path,rv", get_import_errors(), ids=[x[0] for x in get_import_errors()]) 50 | def test_file_imports(rel_path, rv): 51 | """Test for import errors on a file""" 52 | if rel_path and rv: 53 | raise Exception(f"{rel_path} failed to import with message \n {rv}") 54 | 55 | 56 | APPROVED_TAGS = {} 57 | 58 | 59 | @pytest.mark.parametrize("dag_id,dag,fileloc", get_dags(), ids=[x[2] for x in get_dags()]) 60 | def test_dag_tags(dag_id, dag, fileloc): 61 | """ 62 | test if a DAG is tagged and if those TAGs are in the approved list 63 | """ 64 | assert dag.tags, f"{dag_id} in {fileloc} has no tags" 65 | if APPROVED_TAGS: 66 | assert not set(dag.tags) - APPROVED_TAGS 67 | -------------------------------------------------------------------------------- /docs/comparison/index.md: -------------------------------------------------------------------------------- 1 | # Using YAML instead of Python 2 | 3 | By default, Apache Airflow® users write their workflows, or sequences of tasks, in Python. 4 | 5 | DAG Factory offers an alternative interface, allowing users to represent Airflow workflows via YAML files, often using less code. 6 | 7 | This section illustrates a few examples of how to represent the same workflow using plain Airflow Python DAGs in comparison 8 | to their representation using DAG Factory YAML files. 9 | 10 | * [Traditional Airflow Operators](traditional_operators.md) 11 | * [TaskFlow API](taskflow_api.md) 12 | -------------------------------------------------------------------------------- /docs/comparison/taskflow_api.md: -------------------------------------------------------------------------------- 1 | # TaskFlow API: Using YAML instead of Python 2 | 3 | For users who employ lots of Python functions in their DAGs, the [TaskFlow API](https://www.astronomer.io/docs/learn/airflow-decorators/) represents a simpler way to transform functions into tasks, with a more intuitive way of passing data between them. 4 | It was introduced in Airflow 2 as an alternative to Airflow [traditional operators](traditional_operators.md). 5 | 6 | The following section shows how to represent an Airflow DAG using the TaskFlow API and how to define the same DAG using 7 | DAG Factory.
Ultimately, both implementations use the same Airflow operators. The main difference is the language used 8 | to declare the workflow: one uses Python and the other uses YAML. 9 | 10 | ## Goal 11 | 12 | Let's say we'd like to create a workflow that performs the following: 13 | 14 | 1. Create a list of [PyPI](https://pypi.org/) projects to be analysed. 15 | 2. Fetch the [statistics](https://pypistats.org/) for each of these projects. 16 | 3. Summarize the selected statistics as Markdown, using Python. 17 | 18 | We will implement all these steps using the Airflow `task` decorator, and the last task will generate a Markdown table similar to: 19 | 20 | ```text 21 | | package_name | last_day | last_month | last_week | 22 | |:------------------|-----------:|-------------:|------------:| 23 | | apache-airflow | 852242 | 28194255 | 6253861 | 24 | | astronomer-cosmos | 442531 | 13354870 | 3127750 | 25 | | dag-factory | 10078 | 354085 | 77752 | 26 | ``` 27 | 28 | The main logic is implemented as plain Python functions in [pypi_stats.py](https://github.com/astronomer/dag-factory/blob/main/dev/dags/pypi_stats.py): 29 | 30 | ```title="pypi_stats.py" 31 | --8<-- "dev/dags/pypi_stats.py:pypi_stats" 32 | ``` 33 | 34 | ## Implementation 35 | 36 | As a reference, the following workflows run using Airflow 2.10.2 and DAG Factory 0.21.0. 37 | 38 | ### Plain Airflow Python DAG 39 | 40 | ```title="example_pypi_stats_plain_airflow.py" 41 | --8<-- "dev/dags/comparison/example_pypi_stats_plain_airflow.py" 42 | ``` 43 | 44 | ### Alternative DAG Factory YAML 45 | 46 | ```title="example_pypi_stats_dagfactory.yml" 47 | --8<-- "dev/dags/comparison/example_pypi_stats_dagfactory.yml" 48 | ``` 49 | 50 | ## Comparison 51 | 52 | ### Goal 53 | 54 | Both implementations accomplish the same goal and result in the expected Markdown table. 55 | 56 | ### Airflow Graph view 57 | 58 | As shown in the screenshots below, both the DAG created using Python with standard Airflow and the 59 | DAG created using YAML and DAG Factory look identical, from a graph topology perspective, and also from the underlining operators being used. 60 | 61 | #### Graph view: Plain Airflow Python DAG 62 | 63 | ![alt text](../static/example_pypi_stats_plain_airflow_graph.png "Python DAG Graph visualisation") 64 | 65 | #### Graph view: Alternative DAG Factory YAML 66 | 67 | ![alt text](../static/example_pypi_stats_dagfactory_graph.png "YAML DAG Graph visualization") 68 | 69 | ### Airflow Dynamic Task Mapping 70 | 71 | In both workflows, we are dynamically generating a task for each PyPI repo. 72 | 73 | #### Mapped Tasks: Plain Airflow Python DAG 74 | 75 | ![alt text](../static/example_pypi_stats_plain_airflow_mapped_tasks.png "Python DAG mapped tasks") 76 | 77 | #### Mapped Tasks: Alternative DAG Factory YAML 78 | 79 | ![alt text](../static/example_pypi_stats_dagfactory_mapped_tasks.png "YAML DAG mapped tasks") 80 | 81 | ### Airflow Code view 82 | 83 | From an Airflow UI perspective, the content displayed in the "Code" view is the main difference between the two implementations. 
While Airflow renders the original Python DAG, as expected, in the case of the YAML DAGs, Airflow displays the Python file that references the DAG Factory YAML files: 84 | 85 | ```title="example_load_yaml_dags.py" 86 | --8<-- "dev/dags/example_load_yaml_dags.py" 87 | ``` 88 | 89 | #### Code view: Plain Airflow Python DAG 90 | 91 | ![alt text](../static/example_pypi_stats_plain_airflow_code.png "Python DAG code visualization") 92 | 93 | #### Code view: Alternative DAG Factory YAML 94 | 95 | ![alt text](../static/example_pypi_stats_dagfactory_code.png "YAML DAG code visualization") 96 | 97 | To overcome this limitation, DAG Factory appends the YAML content to the DAG Documentation so users can better troubleshoot the DAG: 98 | 99 | ![alt text](../static/example_pypi_stats_dagfactory_docs.png "YAML DAG docs visualization") 100 | -------------------------------------------------------------------------------- /docs/comparison/traditional_operators.md: -------------------------------------------------------------------------------- 1 | # Traditional Operators: Using YAML instead of Python 2 | 3 | Traditionally, operators are Airflow's building blocks, and while they are robust and diverse, 4 | they can sometimes lead to boilerplate-heavy DAGs compared to the newer [TaskFlow API](./taskflow_api.md). 5 | 6 | Most of the Airflow providers come with built-in traditional operators. Some examples include `BashOperator`, `PythonOperator`, `KubernetesPodOperator`, and `PostgresOperator`. 7 | 8 | Below, we illustrate how to represent an Airflow DAG using traditional operators and how to define the same DAG using 9 | DAG Factory. Ultimately, both implementations use the same Airflow operators. The main difference is the language used 10 | to declare the workflow: one uses Python and the other uses YAML. 11 | 12 | ## Goal 13 | 14 | Let's say we'd like to create a workflow that performs the following: 15 | 16 | 1. Retrieve the top ten stories from Hacker News using the [Hacker News API](https://github.com/HackerNews/API). 17 | 2. Fetch the details for the two top stories using the Hacker News API. 18 | 3. Summarize the selected stories as Markdown, using Python. 19 | 20 | We will implement the first two steps using `BashOperator` and the third step using `PythonOperator`. 21 | The last task will generate a `Markdown` snippet similar to: 22 | 23 | ```text 24 | | title | url | 25 | |:----------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------| 26 | | I keep turning my Google Sheets into phone-friendly webapps | https://arstechnica.com/gadgets/2024/12/making-tiny-no-code-webapps-out-of-spreadsheets-is-a-weirdly-fulfilling-hobby/ | 27 | | Coconut by Meta AI – Better LLM Reasoning with Chain of Continuous Thought? | https://aipapersacademy.com/chain-of-continuous-thought/ | 28 | ``` 29 | 30 | The main logic is implemented as plain Python functions in [hacker_news.py](https://github.com/astronomer/dag-factory/blob/main/dev/dags/hacker_news.py): 31 | 32 | ```title="pypi_stats.py" 33 | --8<-- "dev/dags/hacker_news.py:hacker_news" 34 | ``` 35 | 36 | ## Implementation 37 | 38 | As a reference, the following workflows run using Airflow 2.10.2 and DAG Factory 0.21.0. 
39 | 40 | ### Plain Airflow Python DAG 41 | 42 | ```title="example_hackernews_plain_airflow.py" 43 | --8<-- "dev/dags/comparison/example_hackernews_plain_airflow.py" 44 | ``` 45 | 46 | ### Alternative DAG Factory YAML 47 | 48 | ```title="example_hackernews_dagfactory.yml" 49 | --8<-- "dev/dags/comparison/example_hackernews_dagfactory.yml" 50 | ``` 51 | 52 | ## Comparison 53 | 54 | ### Goal 55 | 56 | Both implementations accomplish the same goal and result in the expected Markdown table. 57 | 58 | ### Airflow Graph view 59 | 60 | As shown in the screenshots below, both the DAG created using Python with standard Airflow and the 61 | DAG created using YAML and DAG Factory look identical, from a graph topology perspective, and also from the underlying 62 | operators being used. 63 | 64 | #### Graph view: Plain Airflow Python DAG 65 | 66 | ![alt text](../static/example_hackernews_plain_airflow_graph.png "Python DAG Graph visualisation") 67 | 68 | #### Graph view: Alternative DAG Factory YAML 69 | 70 | ![alt text](../static/example_hackernews_dagfactory_graph.png "YAML DAG Graph visualization") 71 | 72 | ### Airflow Code view 73 | 74 | From an Airflow UI perspective, the content displayed in the "Code" view is the main difference between the two implementations. While Airflow renders the original Python DAG, as expected, in the case of the YAML DAGs, Airflow displays the Python file that references the DAG Factory YAML files: 75 | 76 | ```title="example_load_yaml_dags.py" 77 | --8<-- "dev/dags/example_load_yaml_dags.py" 78 | ``` 79 | 80 | #### Code view: Plain Airflow Python DAG 81 | 82 | ![alt text](../static/example_hackernews_plain_airflow_code.png "Python DAG code visualization") 83 | 84 | #### Code view: Alternative DAG Factory YAML 85 | 86 | ![alt text](../static/example_hackernews_dagfactory_code.png "YAML DAG code visualization") 87 | 88 | To overcome this limitation, DAG Factory appends the YAML content to the DAG Documentation so users can better troubleshoot 89 | the DAG: 90 | 91 | ![alt text](../static/example_hackernews_dagfactory_docs.png "YAML DAG docs visualization") 92 | -------------------------------------------------------------------------------- /docs/configuration/configuring_workflows.md: -------------------------------------------------------------------------------- 1 | # Configuring Your Workflows 2 | 3 | DAG Factory allows you to define workflows in a structured, configuration-driven way using YAML files. 4 | You can define multiple workflows within a single YAML file based on your requirements. 5 | 6 | ## Key Elements of Workflow Configuration 7 | 8 | - **dag_id**: Unique identifier for your DAG. 9 | - **default_args**: Common arguments for all tasks. 10 | - **schedule**/**schedule_interval**: Specifies the execution schedule. 11 | - **tasks**: Defines the [Airflow tasks](https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/tasks.html) in your workflow.
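A YAML configuration on its own does not create any DAGs: a small Python file in the Airflow DAGs folder has to hand the configuration to DAG Factory. A minimal sketch of that loader, mirroring the pattern used by `dev/dags/example_dag_factory.py`, is shown below; the file name and config path are assumptions and should point at your own YAML file.

```python title="example_loader.py (illustrative sketch)"
import dagfactory

# Assumed location of the YAML workflow configuration; adjust to your environment.
config_file = "/usr/local/airflow/dags/example_dag_factory.yml"

example_dag_factory = dagfactory.DagFactory(config_file)

# Remove any DAGs previously generated from this config, then register the freshly
# built DAGs in this module's globals so Airflow can discover them.
example_dag_factory.clean_dags(globals())
example_dag_factory.generate_dags(globals())
```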
12 | 13 | ### Example DAG Configuration 14 | 15 | ```title="example_dag_factory.yml" 16 | --8<-- "dev/dags/example_dag_factory.yml:example_dag_yaml_configuration" 17 | ``` 18 | 19 | ### Check out more configuration params 20 | 21 | - [Environment variables](environment_variables.md) 22 | - [Defaults](defaults.md) 23 | -------------------------------------------------------------------------------- /docs/configuration/defaults.md: -------------------------------------------------------------------------------- 1 | # Defaults 2 | 3 | DAG Factory allows you to define Airflow 4 | [default_args](https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/dags.html#default-arguments) and 5 | additional DAG-level arguments in a `default` block. This block enables you to share common settings and configurations 6 | across all DAGs in your YAML configuration, with the arguments automatically applied to each DAG defined in the file. 7 | This is one of DAG Factory's most powerful features; using defaults allows for the dynamic generation of more than a 8 | single DAG. 9 | 10 | ## Benefits of using the default block 11 | 12 | - Consistency: Ensures uniform configurations across all tasks and DAGs. 13 | - Maintainability: Reduces duplication by centralizing common properties. 14 | - Simplicity: Makes configurations easier to read and manage. 15 | - Dynamic Generation: Use a single default block to easily generate more than a single DAG. 16 | 17 | ### Example usage of a default block for `default_args` 18 | 19 | #### Specifying `default_args` in the `default` block 20 | 21 | Using a `default` block in a YAML file allows for those key-value pairs to be applied to each DAG that is defined in 22 | that same file. One of the most common examples is using a `default` block to specify `default_args` for each DAG 23 | defined in that file. These arguments are automatically inherited by every DAG defined in the file. Below is an example of this. 24 | 25 | ```yaml title="Usage of default block for default_args in YAML" 26 | --8<-- "dev/dags/example_dag_factory_default_args.yml" 27 | ``` 28 | 29 | #### Specifying `default_args` directly in a DAG configuration 30 | 31 | You can override or define specific `default_args` at the individual DAG level. This allows you to customize 32 | arguments for each DAG without affecting others. Not only can existing `default_args` be overridden directly in a DAG 33 | configuration, but new arguments can be added. 34 | 35 | ```yaml 36 | etl: 37 | default_args: 38 | start_date: '2024-12-31' 39 | retries: 1 # A new default_arg was added 40 | ... 41 | ``` 42 | 43 | #### Specifying `default_args` in a shared `defaults.yml` 44 | 45 | Starting DAG Factory 0.22.0, you can also keep the `default_args` in the `defaults.yml` file. The configuration 46 | from `defaults.yml` will be applied to all DAG Factory generated DAGs. **Be careful, these will be applied to all 47 | generated DAGs.** 48 | 49 | ```yaml title="defaults.yml" 50 | --8<-- "dev/dags/defaults.yml" 51 | ``` 52 | 53 | Given the various ways to specify `default_args`, the following precedence order is applied when arguments are 54 | duplicated: 55 | 56 | 1. In the DAG configuration 57 | 2. In the `default` block within the workflow's YAML file 58 | 3. 
In the `defaults.yml` 59 | 60 | ### Example using of default block for dynamic DAG generation 61 | 62 | Not only can the `default` block in a YAML file be used to define `default_args` for one or more DAGs; it can also be 63 | used to create the skeleton of "templated" DAGs. In the example below, the `default` block is used to define not only 64 | the `default_args` of a DAG, but also default Tasks. These Tasks provide a "template" for the DAGs defined in this file. 65 | Each DAG (`machine_learning`, `data_science`, `artificial_intelligence`) will be defined using the values from the 66 | `default` block, and like with `default_args`, can override these values. **This is a powerful way to use DAG Factory 67 | to dynamically create DAGs using a single configuration.** 68 | 69 | 70 | ```yaml title="Usage of default block in YAML" 71 | --8<-- "dev/dags/example_dag_factory_default_config.yml" 72 | ``` 73 | 74 | Currently, only `default_args` can be specified using the `defaults.yml` file. 75 | -------------------------------------------------------------------------------- /docs/configuration/environment_variables.md: -------------------------------------------------------------------------------- 1 | # Environment variables 2 | 3 | Starting release `0.20.0`, DAG Factory introduces support for referencing environment variables directly within YAML 4 | configuration files. This enhancement enables dynamic configuration paths and enhances workflow portability by 5 | resolving environment variables during DAG parsing. 6 | 7 | With this feature, DAG Factory removes the reliance on hard-coded paths, allowing for more flexible and adaptable 8 | configurations that work seamlessly across various environments. 9 | 10 | ## Example YAML Configuration with Environment Variables 11 | 12 | ```title="Reference environment variable in YAML" 13 | --8<-- "dev/dags/example_dag_factory_multiple_config.yml:environment_variable_example" 14 | ``` 15 | 16 | In the above example, `$CONFIG_ROOT_DIR` is used to reference an environment variable that points to the root 17 | directory of your DAG configurations. During DAG parsing, it will be resolved to the value specified for the 18 | `CONFIG_ROOT_DIR` environment variable. 19 | -------------------------------------------------------------------------------- /docs/contributing/code_of_conduct.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socioeconomic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned with this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | [humans@astronomer.io](mailto:humans@astronomer.io). 64 | 65 | All complaints will be reviewed and investigated promptly and fairly. 66 | 67 | All community leaders are obligated to respect the privacy and security of the 68 | reporter of any incident. 69 | 70 | ## Enforcement Guidelines 71 | 72 | Community leaders will follow these Community Impact Guidelines in determining 73 | the consequences for any action they deem in violation of this Code of Conduct: 74 | 75 | ### 1. Correction 76 | 77 | **Community Impact**: Use of inappropriate language or other behavior deemed 78 | unprofessional or unwelcome in the community. 79 | 80 | **Consequence**: A private, written warning from community leaders, providing 81 | clarity around the nature of the violation and an explanation of why the 82 | behavior was inappropriate. A public apology may be requested. 83 | 84 | ### 2. Warning 85 | 86 | **Community Impact**: A violation through a single incident or series 87 | of actions. 88 | 89 | **Consequence**: A warning with consequences for continued behavior. No 90 | interaction with the people involved, including unsolicited interaction with 91 | those enforcing the Code of Conduct, for a specified period of time. 
This 92 | includes avoiding interactions in community spaces as well as external channels 93 | like social media. Violating these terms may lead to a temporary or 94 | permanent ban. 95 | 96 | ### 3. Temporary Ban 97 | 98 | **Community Impact**: A serious violation of community standards, including 99 | sustained inappropriate behavior. 100 | 101 | **Consequence**: A temporary ban from any sort of interaction or public 102 | communication with the community for a specified period of time. No public or 103 | private interaction with the people involved, including unsolicited interaction 104 | with those enforcing the Code of Conduct, is allowed during this period. 105 | Violating these terms may lead to a permanent ban. 106 | 107 | ### 4. Permanent Ban 108 | 109 | **Community Impact**: Demonstrating a pattern of violation of community 110 | standards, including sustained inappropriate behavior, harassment of an 111 | individual, or aggression toward or disparagement of classes of individuals. 112 | 113 | **Consequence**: A permanent ban from any sort of public interaction within 114 | the community. 115 | 116 | ## Attribution 117 | 118 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/version/2/0/code_of_conduct/), version 2.0. 119 | 120 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 121 | enforcement ladder](https://github.com/mozilla/inclusion). 122 | 123 | For answers to common questions about this code of conduct, see the [FAQ](https://www.contributor-covenant.org/faq/). 124 | Translations are available at [this page](https://www.contributor-covenant.org/translations/). 125 | -------------------------------------------------------------------------------- /docs/contributing/contributors.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | There are different ways people can contribute to DAG Factory. 4 | Learn more about the project [contributors roles](roles.md). 5 | 6 | ## Committers 7 | 8 | * Pankaj Koti ([@pankajkoti](https://github.com/pankajkoti)) 9 | * Pankaj Singh ([@pankajastro](https://github.com/pankajastro)) 10 | * Tatiana Al-Chueyr ([@tatiana](https://github.com/tatiana)) 11 | 12 | ## Emeritus Committers 13 | 14 | * Adam Boscarino ([@ajbosco](https://github.com/ajbosco)) 15 | 16 | ## Contributors 17 | 18 | Many people are improving DAG Factory each day. 19 | Find more contributors [in our GitHub page](https://github.com/astronomer/dag-factory/graphs/contributors). 20 | -------------------------------------------------------------------------------- /docs/contributing/howto.md: -------------------------------------------------------------------------------- 1 | # Contributing Guide 2 | 3 | All contributions, bug reports, bug fixes, documentation improvements, and enhancements are welcome. 4 | 5 | All contributors and maintainers to this project should abide by the [Contributor Code of Conduct](code_of_conduct.md). 6 | 7 | Learn more about the contributors' roles in [the Roles page](roles.md). 8 | 9 | This document describes how to contribute to DAG Factory, covering: 10 | 11 | - Overview of how to contribute 12 | - How to set up the local development environment 13 | - Running tests 14 | - Pre-commit and linting 15 | - Authoring the documentation 16 | - Releasing 17 | 18 | ## Overview of how to contribute 19 | 20 | To contribute to the DAG Factory project: 21 | 22 | 1. 
Please create a [GitHub Issue](https://github.com/astronomer/dag-factory/issues) describing a bug, enhancement, or feature request. 23 | 2. Open a branch off of the `main` branch and create a Pull Request into the `main` branch from your feature branch. 24 | 3. Link your issue to the pull request. 25 | 4. After you complete development on your feature branch, request a review. A maintainer will merge your PR after all reviewers approve it. 26 | 27 | ## Set up a local development environment 28 | 29 | ### Requirements 30 | 31 | - [Git](https://git-scm.com/) 32 | - [Python](https://www.python.org/) <= 3.12 (due to dependencies, such as ``google-re2`` not supporting Python 3.13 yet) 33 | - [Hatch](https://hatch.pypa.io/latest/) 34 | 35 | Clone the **DAG Factory** repository and change the current working directory to the repo's root directory: 36 | 37 | ```bash 38 | git clone https://github.com/astronomer/dag-factory.git 39 | cd dag-factory/ 40 | ``` 41 | 42 | After cloning the project, there are two options for setting up the local development environment: 43 | 44 | - Use a Python virtual environment, or 45 | - Use Docker 46 | 47 | ### Using a Python virtual environment for local development 48 | 49 | 1. Install the project dependencies: 50 | 51 | ```bash 52 | make setup 53 | ``` 54 | 55 | 2. Activate the local Python environment: 56 | 57 | ```bash 58 | source venv/bin/activate 59 | ``` 60 | 61 | 3. Set the [Apache Airflow®](https://airflow.apache.org/) home to ``dev/``, so you can see the DAG Factory example DAGs. 62 | Disable loading the Airflow standard example DAGs: 63 | 64 | ```bash 65 | export AIRFLOW_HOME=$(pwd)/dev/ 66 | export AIRFLOW__CORE__LOAD_EXAMPLES=false 67 | ``` 68 | 69 | Then, run Airflow in standalone mode; the command below will create a new user (if it does not exist) and run the necessary Airflow components (webserver, scheduler, and triggerer): 70 | 71 | > Note: By default, Airflow will use SQLite as its database; you can override this by setting the variable ``AIRFLOW__DATABASE__SQL_ALCHEMY_CONN`` to the SQL connection string. 72 | 73 | ```bash 74 | airflow standalone 75 | ``` 76 | 77 | After Airflow is running, you can access the Airflow UI at ``http://localhost:8080``. 78 | 79 | > Note: whenever you want to start the development server, you need to activate the ``virtualenv`` and set the ``environment variables``. 80 | 81 | ### Use Docker for local development 82 | 83 | It is also possible to build the development environment using [Docker](https://www.docker.com/products/docker-desktop/): 84 | 85 | ```bash 86 | make docker-run 87 | ``` 88 | 89 | After the sandbox is running, you can access the Airflow UI at ``http://localhost:8080``. 90 | 91 | This approach builds a DAG Factory wheel, so if there are code changes, you must stop and restart the containers: 92 | 93 | ```bash 94 | make docker-stop 95 | ``` 96 | 97 | ## Testing the application with hatch 98 | 99 | The tests are written using pytest and run using hatch. 100 | 101 | The [pyproject.toml](https://github.com/astronomer/dag-factory/blob/main/pyproject.toml) file currently defines a matrix of supported versions of Python and Airflow against which a user can run the tests.
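To see which Python and Airflow combinations this matrix currently generates, you can ask hatch to list the project's environments before running anything (a quick sanity check; the exact output layout depends on your hatch version):

```bash
hatch env show
```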
102 | 103 | ### Run unit tests 104 | 105 | To run unit tests using Python 3.10 and Airflow 2.5, use the following: 106 | 107 | ```bash 108 | hatch run tests.py3.10-2.5:test-cov 109 | ``` 110 | 111 | It is also possible to run the tests using all the matrix combinations, by using: 112 | 113 | ```bash 114 | hatch run tests:test-cov 115 | ``` 116 | 117 | ### Run integration tests 118 | 119 | > Note: these tests create local Python virtual environments in a hatch-managed directory. 120 | > They also use the user-defined `AIRFLOW_HOME`, overriding any pre-existing `airflow.cfg` and `airflow.db` files. 121 | 122 | First, set the following environment variables: 123 | 124 | ```bash 125 | export AIRFLOW_HOME=$(pwd)/dev/ 126 | export CONFIG_ROOT_DIR=`pwd`"/dev/dags" 127 | export PYTHONPATH=dev/dags:$PYTHONPATH 128 | ``` 129 | 130 | To run the integration tests using Python 3.9 and Airflow 2.9, use 131 | 132 | ```bash 133 | hatch run tests.py3.9-2.9:test-integration-setup 134 | hatch run tests.py3.9-2.9:test-integration 135 | ``` 136 | 137 | ## Pre-Commit and linting 138 | 139 | We use pre-commit to run several checks on the code before committing. To install pre-commit hooks, run: 140 | 141 | ```bash 142 | pre-commit install 143 | ``` 144 | 145 | To run the checks manually, run the following: 146 | 147 | ```bash 148 | pre-commit run --all-files 149 | ``` 150 | 151 | Pre-commit runs several static checks, including Black and Ruff. It is also possible to run them using ``hatch``: 152 | 153 | ```bash 154 | hatch run tests.py3.9-2.9:static-check 155 | ``` 156 | 157 | ## Write docs 158 | 159 | We use Markdown to author DAG Factory documentation. 160 | 161 | Similar to running tests, we also use hatch to manage the documentation. 162 | 163 | To build and serve the documentation locally: 164 | 165 | ```bash 166 | hatch run docs:dev 167 | ``` 168 | 169 | To release the documentation with the current project version and set it to the latest: 170 | 171 | ```bash 172 | hatch run docs:gh-release 173 | ``` 174 | 175 | ## Releasing 176 | 177 | We currently use [hatch](https://github.com/pypa/hatch) for building and distributing ``dag-factory``. 178 | 179 | We use GitHub actions to create and deploy new releases. To create a new release, update the latest release version. 180 | 181 | It is possible to update the version either by using hatch: 182 | 183 | > Note: You can update the version in several different ways. To learn more, check out the [hatch docs](https://hatch.pypa.io/latest/version/#updating). 184 | 185 | ```bash 186 | hatch version minor 187 | ``` 188 | 189 | Or by manually updating the value of `__version__` in `dagfactory/__init__.py`. 190 | 191 | Make sure the [CHANGELOG file](https://github.com/astronomer/dag-factory/blob/main/CHANGELOG.md) is up-to-date. 192 | 193 | Create a release using the [GitHub UI](https://github.com/astronomer/dag-factory/releases/new). GitHub will update the package directly to [PyPI](https://pypi.org/project/dag-factory/). 194 | 195 | If you're a [project maintainer in PyPI](https://pypi.org/project/dag-factory/), it is also possible to create a release manually, 196 | by authenticating to PyPI and running the commands: 197 | 198 | ```bash 199 | hatch build 200 | hatch publish 201 | ``` 202 | -------------------------------------------------------------------------------- /docs/contributing/roles.md: -------------------------------------------------------------------------------- 1 | # Contributor roles 2 | 3 | Contributors are welcome and are greatly appreciated! 
Every little bit helps, and we give credit to them. 4 | 5 | This document aims to explain the current roles in the DAG Factory project. 6 | For more information, check the [contributing docs](howto.md). 7 | 8 | ## Contributors 9 | 10 | A contributor is anyone who wants to contribute code, documentation, tests, ideas, or anything to the DAG Factory project. 11 | 12 | DAG Factory contributors are listed in the GitHub [insights page](https://github.com/astronomer/dag-factory/graphs/contributors). 13 | 14 | Contributors are responsible for: 15 | 16 | * Fixing bugs 17 | * Refactoring code 18 | * Improving processes and tooling 19 | * Adding features 20 | * Improving the documentation 21 | 22 | ## Committers 23 | 24 | Committers are community members with write access to the [DAG Factory GitHub repository](https://github.com/astronomer/dag-factory). 25 | They can modify the code and the documentation and accept others' contributions to the repo. 26 | 27 | Check [contributors](contributors.md) for the official list of DAG Factory committers. 28 | 29 | Committers have the same responsibilities as standard contributors and also perform the following actions: 30 | 31 | * Reviewing & merging pull requests 32 | * Scanning and responding to GitHub issues, and helping to triage them 33 | 34 | If you know you are not going to be able to contribute for a long time (for instance, due to a change of job or circumstances), you should inform other maintainers, and we will mark you as "emeritus". 35 | Emeritus committers will no longer have write access to the repo. 36 | As merit earned never expires, once an emeritus committer becomes active again, they can simply email another maintainer from Astronomer and ask to be reinstated. 37 | 38 | ### Prerequisites to becoming a committer 39 | 40 | General prerequisites that we look for in all candidates: 41 | 42 | 1. Consistent contribution over the last few months 43 | 2. Visibility on discussions on the Slack channel or GitHub issues/discussions 44 | 3. Contributes to community health and the project's long-term sustainability 45 | 4. Understands the project's [contributors' guidelines](howto.md). 46 | Astronomer is responsible and accountable for releasing new versions of DAG Factory in [PyPI](https://pypi.org/project/dag-factory/), following the [milestones](https://github.com/astronomer/dag-factory/milestones). 47 | Astronomer has the right to grant and revoke write access permissions to the project's official repository for any reason it sees fit. 48 | -------------------------------------------------------------------------------- /docs/features/callbacks.md: -------------------------------------------------------------------------------- 1 | # Callbacks 2 | DAG Factory supports the use of callbacks. These callbacks can be set at the DAG, TaskGroup, or Task level. The ways 3 | that callbacks can be configured for DAGs, TaskGroups, and Tasks differ slightly, and details around this can be 4 | found in the [Apache Airflow documentation](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/callbacks.html#). 5 | 6 | Within DAG Factory itself, there are three approaches to defining callbacks. The goal is to make this process 7 | intuitive and provide parity with the traditional DAG authoring experience. These approaches to configuring callbacks 8 | are outlined below, each with an example implementation.
While the following examples are all defined for individual 9 | Tasks, callbacks can also be defined using `default_args`, or at the DAG and TaskGroup level. 10 | 11 | * [Passing a string that points to a callable](#passing-a-string-that-points-to-a-callable) 12 | * [Specifying a user-defined `.py` file and the function within that file to be executed](#specifying-a-user-defined-py-file-and-function) 13 | * [Configuring callbacks from providers](#provider-callbacks) 14 | 15 | 16 | ## Passing a string that points to a callable 17 | 18 | The most traditional way of configuring callbacks is by defining a custom function within the Airflow project and 19 | assigning that callback to the desired Task. Using the syntax below, this can be implemented using DAG Factory. In this 20 | case, the `output_standard_message` function is a user-defined function stored in the `include/custom_callbacks.py` 21 | file. This function requires no parameters, and the YAML would take the form below. 22 | 23 | For this example to be implemented in DAG Factory, the `include/custom_callbacks.py` file must be on the Python 24 | `sys.path`. If this is not the case, the full path to a `.py` function can be specified, as shown below. 25 | 26 | ```yaml 27 | ... 28 | 29 | task_1: 30 | operator: airflow.operators.bash_operator.BashOperator 31 | bash_command: "echo task_1" 32 | on_failure_callback: include.custom_callbacks.output_standard_message 33 | ... 34 | ``` 35 | 36 | Sometimes, a function may have parameters that need to be defined within the Task itself. Here, the 37 | `output_custom_message` callback takes two keyword arguments: `param1` and `param2`. These values are defined in the 38 | YAML itself, offering DAG Factory authors an additional degree of flexibility. 39 | 40 | ```yaml 41 | ... 42 | 43 | task_2: 44 | operator: airflow.operators.bash_operator.BashOperator 45 | bash_command: "echo task_2" 46 | on_success_callback: 47 | callback: include.custom_callbacks.output_custom_message 48 | param1: "Task status" 49 | param2: "Successful!" 50 | ... 51 | ``` 52 | 53 | 54 | ## Specifying a user-defined `.py` file and function 55 | 56 | In addition to passing a string that points to a callback, the full path to the file and the name of the callback can be 57 | specified for a DAG, TaskGroup, or Task. This provides a viable option for defining a callback when the directory the 58 | `.py` file is stored in is not on the Python path. 59 | 60 | ```yaml 61 | ... 62 | 63 | task_3: 64 | operator: airflow.operators.bash_operator.BashOperator 65 | bash_command: "echo task_3" 66 | on_retry_callback_name: output_standard_message 67 | on_retry_callback_file: /usr/local/airflow/include/custom_callbacks.py 68 | ... 69 | ``` 70 | 71 | Note that this method for defining callbacks in DAG Factory does not allow for parameters to be passed to the callable 72 | within the YAML itself. 73 | 74 | 75 | ## Provider callbacks 76 | 77 | In addition to custom-built callbacks, there are a number of provider-built callbacks that can be used when defining a 78 | DAG. With DAG Factory, these callbacks can be configured similarly to how they would be when authoring a traditional DAG. 79 | First, the type of callback is specified (`on_success_callback`, `on_failure_callback`, etc.). The `callback` key-value 80 | pair specifies the provider-built function to be executed. Then, the specific keyword arguments the callback takes can 81 | be specified, as shown below.
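For instance, the Slack notifier used in the example below ships in the Slack provider package; if it is not already present in your environment, it could be installed with pip (a sketch, assuming a pip-managed Airflow environment):

```bash
pip install apache-airflow-providers-slack
```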
82 | 83 | Note that the provider package being used must be available on the Python `sys.path`, meaning it may need to be 84 | installed with `pip`. 85 | 86 | ```yaml 87 | ... 88 | task_4: 89 | operator: airflow.operators.bash_operator.BashOperator 90 | bash_command: "echo task_4" 91 | on_failure_callback: 92 | callback: airflow.providers.slack.notifications.slack.send_slack_notification 93 | slack_conn_id: slack_conn_id 94 | text: | 95 | :red_circle: Task Failed. 96 | This task has failed and needs to be addressed. 97 | Please remediate this issue ASAP. 98 | channel: "#channel" 99 | ... 100 | ``` 101 | -------------------------------------------------------------------------------- /docs/features/custom_operators.md: -------------------------------------------------------------------------------- 1 | # Custom Operators 2 | 3 | **DAG-Factory** supports [custom operators](https://airflow.apache.org/docs/apache-airflow/stable/howto/custom-operator.html). To leverage this feature, set the path to the custom operator within the `operator` key in the configuration file. You can add any additional parameters that the custom operator requires. 4 | 5 | ```yaml 6 | ... 7 | tasks: 8 | begin: 9 | operator: airflow.operators.empty.EmptyOperator 10 | make_bread_1: 11 | operator: customized.operators.breakfast_operators.MakeBreadOperator 12 | bread_type: 'Sourdough' 13 | ``` 14 | 15 | ![custom_operators.png](../static/images/custom_operators.png) 16 | -------------------------------------------------------------------------------- /docs/features/datasets.md: -------------------------------------------------------------------------------- 1 | # Datasets 2 | DAG Factory supports Airflow’s [Datasets](https://airflow.apache.org/docs/apache-airflow/stable/authoring-and-scheduling/datasets.html). 3 | 4 | ## Datasets Outlets and Inlets 5 | 6 | To leverage datasets, you need to specify the `Dataset` in the `outlets` and `inlets` keys in the configuration file. 7 | The `outlets` and `inlets` keys should contain a list of strings representing dataset locations. 8 | In the `schedule` key of the consumer DAG, you can set the `Dataset` that the DAG should be scheduled against. The key 9 | should contain a list of dataset locations. 10 | The consumer DAG will run when all the specified datasets become available. 11 | 12 | #### Example: Outlet and Inlet 13 | 14 | ```title="example_dag_datasets_outlet_inlet.yml" 15 | --8<-- "dev/dags/datasets/example_dag_datasets_outlet_inlet.yml" 16 | ``` 17 | 18 | ![datasets_example.png](../static/images/datasets/outlets/datasets_example.png "Simple Dataset Producer") 19 | 20 | ## Conditional Dataset Scheduling 21 | 22 | #### Minimum Requirements: 23 | * dag-factory 0.22.0+ 24 | * [Apache Airflow® 2.9+](https://www.astronomer.io/docs/learn/airflow-datasets/#conditional-dataset-scheduling) 25 | 26 | 27 | #### Logical operators for datasets 28 | Airflow supports two logical operators for combining dataset conditions: 29 | 30 | * AND (``&``): Specifies that the DAG should be triggered only after all of the specified datasets have been updated. 31 | * OR (``|``): Specifies that the DAG should be triggered when any of the specified datasets is updated. 32 | 33 | These operators enable you to configure your Airflow workflows to use more complex dataset update conditions, making them more dynamic and flexible. 34 | 35 | #### Examples of Conditional Dataset Scheduling 36 | 37 | Below are examples demonstrating how to configure a consumer DAG using conditional dataset scheduling.
38 | 39 | ##### Example 1: String Condition 40 | 41 | ```title="example_dataset_condition_string.yml" 42 | --8<-- "dev/dags/datasets/example_dataset_condition_string.yml" 43 | ``` 44 | 45 | ##### Example 2: YAML Syntax 46 | 47 | ```title="example_dataset_yaml_syntax.yml" 48 | --8<-- "dev/dags/datasets/example_dataset_yaml_syntax.yml" 49 | ``` 50 | 51 | --- 52 | 53 | #### Visualization 54 | 55 | The following diagrams illustrate the dataset conditions described in the example configurations: 56 | 57 | 1. **`s3://bucket-cjmm/raw/dataset_custom_1`** and **`s3://bucket-cjmm/raw/dataset_custom_2`** must both be updated for the first condition to be satisfied. 58 | 2. Alternatively, **`s3://bucket-cjmm/raw/dataset_custom_3`** alone can satisfy the condition. 59 | 60 | ![Graph Conditional Dataset 1](../static/images/datasets/conditions/graph_conditional_dataset.png) 61 | ![Graph Conditional Dataset 2](../static/images/datasets/conditions/graph_conditional_dataset_2.png) 62 | -------------------------------------------------------------------------------- /docs/features/dynamic_tasks.md: -------------------------------------------------------------------------------- 1 | # Dynamic tasks 2 | 3 | DAG Factory supports Airflow’s 4 | [Dynamic Task Mapping](https://airflow.apache.org/docs/apache-airflow/stable/authoring-and-scheduling/dynamic-task-mapping.html), 5 | enabling workflows to dynamically create tasks at runtime. This approach allows the number of tasks to be determined 6 | during execution, usually based on the outcome of a preceding task, rather than being predefined during DAG authoring. 7 | 8 | ## Example: Defining Dynamic Tasks 9 | 10 | Below is an example configuration for implementing dynamic tasks using DAG Factory: 11 | 12 | ```title="example_dynamic_task_mapping.yml" 13 | --8<-- "dev/dags/example_dynamic_task_mapping.yml" 14 | ``` 15 | 16 | ### Explanation of the Configuration 17 | 18 | 1. `request` Task: 19 | - Generates a list of items using the `make_list` function from the [expand_tasks.py](https://github.com/astronomer/dag-factory/blob/main/dev/dags/expand_tasks.py) module. 20 | - This task serves as the input provider for the dynamically mapped tasks. 21 | 22 | 2. `process` Task: 23 | - Dynamically generates one task for each item in the list produced by the `request` task. 24 | - The expand argument is used to create these tasks at runtime, with `request.output` supplying the input list. 25 | - Additionally, the `partial` argument is used to specify fixed parameters (`op_kwargs`) that are applied to all dynamically generated tasks. 26 | 27 | ### How It Works 28 | 29 | - Dynamic Task Creation: 30 | The `expand` keyword allows the process task to spawn multiple tasks at runtime, each processing a single item from 31 | the list output of the `request` task. 32 | 33 | - Fixed Parameters: 34 | The partial keyword ensures that common parameters, such as `fixed_param`, are passed to every dynamically created 35 | task instance. 36 | 37 | ### Benefits of Dynamic Task Mapping with DAG Factory 38 | 39 | - Flexibility: Handle varying input sizes and conditions dynamically without modifying the DAG definition. 40 | - Scalability: Efficiently process large datasets by leveraging Airflow’s parallel execution capabilities. 41 | - Simplicity: Define dynamic workflows declaratively using YAML, minimizing boilerplate code. 42 | 43 | ### Airflow mapped tasks view 44 | 45 | Below, you can see a list of mapped tasks generated dynamically as part of the `process` task. 
46 | 47 | ![example_dynamic_task_mapping.png](../static/example_dynamic_task_mapping.png "Dynamic Task Mapping visualization") 48 | 49 | ## Advanced Dynamic Task Mapping with DAG Factory 50 | 51 | Below, we explain the different methods for defining dynamic task mapping, illustrated by the provided example configuration. 52 | 53 | ```title="Dynamic Task Mapping advanced usage" 54 | --8<-- "dev/dags/example_taskflow.yml" 55 | ``` 56 | 57 | The example above illustrates advanced usage of Dynamic Task Mapping using Dag Factory (the callable functions 58 | used in the example are kept in [sample.py](https://github.com/astronomer/dag-factory/blob/main/dev/dags/sample.py)): 59 | 60 | 1. **Static Input Mapping** 61 | 62 | The task `double_number_with_dynamic_task_mapping_static` shows how dynamic tasks can be created using static lists 63 | as input. Three tasks are created, each processing one number. 64 | 65 | 2. **Task-Generated Input Mapping** 66 | 67 | The task `double_number_with_dynamic_task_mapping_taskflow` shows how tasks can use outputs from other tasks as 68 | input for dynamic task mapping. The prefix `+` tells DAG Factory to resolve this value as the task `numbers_list`, 69 | previously defined. 70 | 71 | 3. **Mapping with Multiple Inputs** 72 | 73 | The task `multiply_with_multiple_parameters` shows how dynamic task mapping can combine outputs from multiple tasks 74 | as input parameters. 75 | 76 | ## Named Mapping in Dynamic Tasks with DAG Factory 77 | 78 | Starting with Airflow 2.9, the `map_index_template` feature allows for custom mapping name for dynamic tasks based on a 79 | user-defined key. DAG Factory fully supports this feature, enabling users to name tasks dynamically in a meaningful way 80 | during runtime. This can be useful for tracing and debugging tasks. 81 | 82 | Below is an example of how to configure and use custom names for mapped tasks 83 | 84 | ```title="example_map_index_template.yml" 85 | --8<-- "dev/dags/example_map_index_template.yml" 86 | ``` 87 | 88 | ### How it works 89 | 90 | 1. map_index_template: 91 | Customizes the naming of dynamically mapped tasks using a Jinja2 expression. In this example, it uses 92 | `custom_mapping_key` from the task context to define task names. 93 | 2. expand: 94 | Dynamically generates tasks for each entry in the `full_name` list 95 | - Lucy Black 96 | - Vera Santos 97 | - Marks Spencer 98 | 3. Dynamic Task Naming: 99 | The `custom_mapping_key` is set to the first name of each person, e.g., Lucy, Vera, and Marks using the callable 100 | function `extract_last_name`. This callable function is kept in [sample.py](https://github.com/astronomer/dag-factory/blob/main/dev/dags/sample.py) 101 | 102 | ### Airflow named mapped tasks view 103 | 104 | The image below shows that the `map_index` gets the first name of the person in the mapped tasks with the above configuration. 105 | 106 | ![example_map_index_template.png](../static/example_map_index_template.png "Dynamic Task Mapping named mapped index visualization") 107 | 108 | ## Scope and limitations 109 | 110 | The Airflow documentation on [dynamic task mapping](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html) 111 | provides various examples of this feature. While the previous sections have discussed the forms supported by DAG 112 | Factory, it’s important to note the scenarios that have not been tested or are known to be unsupported. 
113 | 114 | The following cases are tested and expected to work (you can refer to previous sections on how to use them with DAG Factory): 115 | 116 | - [Simple mapping](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#simple-mapping) 117 | - [Task-generated mapping](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#task-generated-mapping) 118 | - [Repeated mapping](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#repeated-mapping) 119 | - [Adding parameters that do not expand (partial)](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#adding-parameters-that-do-not-expand) 120 | - [Mapping over multiple parameters](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#mapping-over-multiple-parameters) 121 | - [Named mapping (map_index_template)](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#named-mapping) 122 | 123 | The following cases are untested but are expected to work: 124 | 125 | - [Mapping with non-TaskFlow operators](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#mapping-with-non-taskflow-operators) 126 | - [Mapping over the result of classic operators](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#mapping-over-result-of-classic-operators) 127 | - [Filtering items from a mapped task](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#filtering-items-from-a-mapped-task) 128 | 129 | The following cases are untested and may not work: 130 | 131 | - [Assigning multiple parameters to a non-TaskFlow operator](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#assigning-multiple-parameters-to-a-non-taskflow-operator) 132 | - [Mapping over a task group](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#mapping-over-a-task-group) 133 | - [Transforming expanding data](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#transforming-expanding-data) 134 | - [Combining upstream data (aka “zipping”)](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#combining-upstream-data-aka-zipping) 135 | -------------------------------------------------------------------------------- /docs/features/http_task.md: -------------------------------------------------------------------------------- 1 | # HttpSensor 2 | 3 | **DAG-Factory** supports the HttpSensor from the `airflow.providers.http.sensors.http` package. 
4 | 5 | The example below demonstrates referencing the `response_check` logic from a Python file: 6 | 7 | ```yaml 8 | task_2: 9 | operator: airflow.providers.http.sensors.http.HttpSensor 10 | http_conn_id: 'test-http' 11 | method: 'GET' 12 | response_check_name: check_sensor 13 | response_check_file: /path/to/example1/http_conn.py 14 | dependencies: [task_1] 15 | ``` 16 | 17 | The `response_check` logic can also be provided as a lambda: 18 | 19 | ```yaml 20 | task_2: 21 | operator: airflow.providers.http.sensors.http.HttpSensor 22 | http_conn_id: 'test-http' 23 | method: 'GET' 24 | response_check_lambda: 'lambda response: "ok" in response.text' 25 | dependencies: [task_1] 26 | ``` 27 | -------------------------------------------------------------------------------- /docs/features/multiple_configuration_files.md: -------------------------------------------------------------------------------- 1 | # Multiple Configuration Files 2 | 3 | If you want to split your DAG configuration into multiple files when using **DAG-Factory**, you can do so by leveraging a suffix in the configuration file name. 4 | 5 | ```python 6 | from dagfactory import load_yaml_dags  # load relevant YAML files as Airflow DAGs 7 | 8 | load_yaml_dags(globals_dict=globals(), suffix=['dag.yaml']) 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/getting-started/quick-start-airflow-standalone.md: -------------------------------------------------------------------------------- 1 | # DAG Factory: Quick Start Guide With Airflow 2 | 3 | **DAG Factory** is a Python library for [Apache Airflow®](https://airflow.apache.org) that simplifies DAG creation using declarative YAML configuration files instead of Python. 4 | 5 | ## Prerequisites 6 | 7 | The minimum requirements for **dag-factory** are: 8 | 9 | - Python 3.8.0+ 10 | - [Apache Airflow®](https://airflow.apache.org) 2.3+ 11 | 12 | ## Step 1: Create a Python Virtual Environment 13 | 14 | Create and activate a virtual environment: 15 | 16 | ```commandline 17 | python3 -m venv dagfactory_env 18 | source dagfactory_env/bin/activate 19 | ``` 20 | 21 | ## Step 2: Install Apache Airflow 22 | 23 | Install [Apache Airflow®](https://airflow.apache.org): 24 | 25 | 1. Create a directory for your project and navigate to it: 26 | 27 | ```commandline 28 | mkdir dag-factory-quick-start && cd dag-factory-quick-start 29 | ``` 30 | 31 | 2. Set the `AIRFLOW_HOME` environment variable: 32 | 33 | ```commandline 34 | export AIRFLOW_HOME=$(pwd) 35 | export AIRFLOW__CORE__LOAD_EXAMPLES=False 36 | ``` 37 | 38 | 3. Install Apache Airflow: 39 | 40 | ```commandline 41 | pip install apache-airflow 42 | ``` 43 | 44 | ## Step 3: Install DAG Factory 45 | 46 | Install the DAG Factory library in your virtual environment: 47 | 48 | ```commandline 49 | pip install dag-factory 50 | ``` 51 | 52 | ## Step 4: Set Up the DAGs Folder 53 | 54 | Create a DAGs folder inside the $AIRFLOW_HOME directory, which is where your DAGs will be stored: 55 | 56 | ```commandline 57 | mkdir dags 58 | ``` 59 | 60 | ## Step 5: Define a DAG in YAML 61 | 62 | **DAG Factory** uses YAML files to define DAG configurations. Create a file named `example_dag_factory.yml` in the `$AIRFLOW_HOME/dags` folder with the following content: 63 | 64 | ```title="example_dag_factory.yml" 65 | --8<-- "dev/dags/example_dag_factory.yml" 66 | ``` 67 | 68 | ## Step 6: Generate the DAG from YAML 69 | 70 | Create a Python script named `example_dag_factory.py` in the `$AIRFLOW_HOME/dags` folder.
This script will generate the DAG from the YAML configuration: 71 | 72 | ```title="example_dag_factory.py" 73 | --8<-- "dev/dags/example_dag_factory.py" 74 | ``` 75 | 76 | ## Step 7: Start Airflow 77 | 78 | To start the Airflow environment with your DAG Factory setup, run the following command: 79 | 80 | ```commandline 81 | airflow standalone 82 | ``` 83 | 84 | This will take a few minutes to set up. Once completed, you can access the Airflow UI and the generated DAG at `http://localhost:8080` 🚀. 85 | 86 | ## View Your Generated DAG 87 | 88 | Once Airflow is up and running, you can log in with the username `admin` and the password in `$AIRFLOW_HOME/standalone_admin_password.txt`. You should be able to see your generated DAG in the Airflow UI. 89 | 90 | ## Generated DAG 91 | 92 | ![Airflow DAG](../static/images/airflow-home.png) 93 | 94 | ## Graph View 95 | 96 | ![Airflow Home](../static/images/airflow-dag.png) 97 | 98 | Check out the [examples](https://github.com/astronomer/dag-factory/tree/main/dev/dags) for generating more advanced DAGs. 99 | -------------------------------------------------------------------------------- /docs/getting-started/quick-start-astro-cli.md: -------------------------------------------------------------------------------- 1 | # DAG Factory: Quick Start Guide With Astro CLI 2 | 3 | **DAG Factory** is a Python library for [Apache Airflow®](https://airflow.apache.org) that simplifies DAG creation using declarative YAML configuration files instead of Python. 4 | 5 | ## Prerequisites 6 | 7 | The minimum requirements for **dag-factory** are: 8 | 9 | - Python 3.8.0+ 10 | - [Astro CLI](https://www.astronomer.io/docs/astro/cli/overview/) 11 | 12 | ## Step 1: Initialize Airflow Project 13 | 14 | Create a new directory and initialize your Astro CLI project: 15 | 16 | ```commandline 17 | mkdir dag-factory-quick-start && cd dag-factory-quick-start 18 | 19 | astro dev init 20 | ``` 21 | 22 | This will set up the necessary Airflow files and directories. 23 | 24 | ## Step 2: Install DAG Factory 25 | 26 | Install DAG Factory in your Airflow environment: 27 | 28 | 1. Add dag-factory as a dependency to the `requirements.txt` file created during the project initialization. 29 | 30 | ## Step 3: Define a DAG in YAML 31 | 32 | **DAG Factory** uses YAML files to define DAG configurations. Create a file named `example_dag_factory.yml` in the `$AIRFLOW_HOME/dags` folder with the following content: 33 | 34 | ```title="example_dag_factory.yml" 35 | --8<-- "dev/dags/example_dag_factory.yml" 36 | ``` 37 | 38 | ## Step 4: Generate the DAG from YAML 39 | 40 | Create a Python script named `example_dag_factory.py` in the `$AIRFLOW_HOME/dags` folder. This script will generate the DAG from the YAML configuration: 41 | 42 | ```title="example_dag_factory.py" 43 | --8<-- "dev/dags/example_dag_factory.py" 44 | ``` 45 | 46 | ## Step 5: Start Airflow Project 47 | 48 | Once you've set up your YAML configuration and Python script, start the Airflow environment with the following command: 49 | 50 | ```commandline 51 | astro dev start 52 | ``` 53 | 54 | This will take a few minutes to set up. Once completed, you can access the Airflow UI and the generated DAG at `http://localhost:8080` 🚀. 55 | 56 | ## View Your Generated DAG 57 | 58 | Once Airflow is up and running, you can log in with the username `admin` and the password `admin`. You should be able to see your generated DAG in the Airflow UI.
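If you are curious what the loader script referenced in Step 4 typically contains, it is usually just a call to DAG Factory's `load_yaml_dags` helper. The sketch below is illustrative rather than a copy of the shipped `example_dag_factory.py`; in particular, the `suffix` value is an assumption used to match the file created earlier:

```python
# Minimal loader sketch (hypothetical) -- place it alongside your YAML file in the dags folder.
from dagfactory import load_yaml_dags  # loads matching YAML files as Airflow DAGs

# Register every DAG defined in YAML files whose names end with the given suffix
# into this module's global namespace, where the Airflow scheduler can discover them.
load_yaml_dags(globals_dict=globals(), suffix=["example_dag_factory.yml"])
```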
59 | 60 | ## Generated DAG 61 | 62 | ![Airflow DAG](../static/images/airflow-home.png) 63 | 64 | ## Graph View 65 | 66 | ![Airflow Home](../static/images/airflow-dag.png) 67 | 68 | Checkout [examples](https://github.com/astronomer/dag-factory/tree/main/dev/dags) for generating more advanced DAGs. 69 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # DAG Factory documentation 2 | 3 | Everything you need to know about how to build Apache Airflow® workflows using YAML files. 4 | 5 | ## Getting started 6 | 7 | Are you new to DAG Factory? This is the place to start! 8 | 9 | * DAG Factory at a glance 10 | * [Quickstart with Airflow standalone](getting-started/quick-start-airflow-standalone.md) 11 | * [Quickstart with Astro CLI](getting-started/quick-start-astro-cli.md) 12 | * [Using YAML instead of Python](./comparison/index.md) 13 | * [Traditional Airflow Operators](./comparison/traditional_operators.md) 14 | * [TaskFlow API](./comparison/taskflow_api.md) 15 | 16 | ## Configuration 17 | 18 | * [Configuring your workflows](configuration/configuring_workflows.md) 19 | * [Environment variables](configuration/environment_variables.md) 20 | * [Defaults](configuration/defaults.md) 21 | 22 | ## Features 23 | 24 | * [Dynamic tasks](features/dynamic_tasks.md) 25 | * [Datasets scheduling](features/datasets.md) 26 | * [Callbacks](features/callbacks.md) 27 | * [Custom operators](features/custom_operators.md) 28 | * [Multiple configuration files](features/multiple_configuration_files.md) 29 | * [HttpSensor](features/http_task.md) 30 | 31 | ## Getting help 32 | 33 | Having trouble? We'd like to help! 34 | 35 | * Report bugs, questions and feature requests in our [ticket tracker](https://github.com/astronomer/dag-factory/issues). 36 | 37 | ## Contributing 38 | 39 | DAG Factory is an Open-Source project. Learn about its development process and about how you can contribute: 40 | 41 | * [Contributing to DAG Factory](contributing/howto.md) 42 | * [Github repository](https://github.com/astronomer/dag-factory/) 43 | 44 | ## License 45 | 46 | To learn more about the terms and conditions for use, reproduction and distribution, read the [Apache License 2.0](https://github.com/astronomer/dag-factory/blob/main/LICENSE). 47 | 48 | ## Privacy Notice 49 | 50 | This project follows [Astronomer's Privacy Policy](https://www.astronomer.io/privacy/). 51 | 52 | For further information, [read this](https://github.com/astronomer/dag-factory/blob/main/PRIVACY_NOTICE.md) 53 | 54 | ## Security Policy 55 | 56 | Check the project's [Security Policy](https://github.com/astronomer/dag-factory/blob/main/SECURITY.md) to learn 57 | how to report security vulnerabilities in DAG Factory and how security issues reported to the DAG Factory 58 | security team are handled. 
59 | 60 | analytics 61 | -------------------------------------------------------------------------------- /docs/static/example_dynamic_task_mapping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_dynamic_task_mapping.png -------------------------------------------------------------------------------- /docs/static/example_hackernews_dagfactory_code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_hackernews_dagfactory_code.png -------------------------------------------------------------------------------- /docs/static/example_hackernews_dagfactory_docs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_hackernews_dagfactory_docs.png -------------------------------------------------------------------------------- /docs/static/example_hackernews_dagfactory_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_hackernews_dagfactory_graph.png -------------------------------------------------------------------------------- /docs/static/example_hackernews_plain_airflow_code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_hackernews_plain_airflow_code.png -------------------------------------------------------------------------------- /docs/static/example_hackernews_plain_airflow_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_hackernews_plain_airflow_graph.png -------------------------------------------------------------------------------- /docs/static/example_map_index_template.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_map_index_template.png -------------------------------------------------------------------------------- /docs/static/example_pypi_stats_dagfactory_code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_pypi_stats_dagfactory_code.png -------------------------------------------------------------------------------- /docs/static/example_pypi_stats_dagfactory_docs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_pypi_stats_dagfactory_docs.png -------------------------------------------------------------------------------- /docs/static/example_pypi_stats_dagfactory_graph.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_pypi_stats_dagfactory_graph.png -------------------------------------------------------------------------------- /docs/static/example_pypi_stats_dagfactory_mapped_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_pypi_stats_dagfactory_mapped_tasks.png -------------------------------------------------------------------------------- /docs/static/example_pypi_stats_plain_airflow_code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_pypi_stats_plain_airflow_code.png -------------------------------------------------------------------------------- /docs/static/example_pypi_stats_plain_airflow_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_pypi_stats_plain_airflow_graph.png -------------------------------------------------------------------------------- /docs/static/example_pypi_stats_plain_airflow_mapped_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_pypi_stats_plain_airflow_mapped_tasks.png -------------------------------------------------------------------------------- /docs/static/images/airflow-dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/images/airflow-dag.png -------------------------------------------------------------------------------- /docs/static/images/airflow-home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/images/airflow-home.png -------------------------------------------------------------------------------- /docs/static/images/custom_operators.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/images/custom_operators.png -------------------------------------------------------------------------------- /docs/static/images/datasets/conditions/graph_conditional_dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/images/datasets/conditions/graph_conditional_dataset.png -------------------------------------------------------------------------------- /docs/static/images/datasets/conditions/graph_conditional_dataset_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/images/datasets/conditions/graph_conditional_dataset_2.png -------------------------------------------------------------------------------- 
/docs/static/images/datasets/outlets/datasets_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/images/datasets/outlets/datasets_example.png -------------------------------------------------------------------------------- /examples/dags: -------------------------------------------------------------------------------- 1 | ../dev/dags -------------------------------------------------------------------------------- /img/mapped_tasks_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/img/mapped_tasks_example.png -------------------------------------------------------------------------------- /img/quickstart_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/img/quickstart_dag.png -------------------------------------------------------------------------------- /img/quickstart_gantt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/img/quickstart_gantt.png -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: DAG Factory 2 | site_url: https://astronomer.github.io/dag-factory 3 | copyright: © Copyright 2025, Astronomer. 4 | 5 | repo_url: https://github.com/astronomer/dag-factory 6 | repo_name: astronomer/dag-factory 7 | 8 | edit_uri: "blob/main/docs" 9 | 10 | theme: 11 | name: material 12 | features: 13 | - announce.dismiss 14 | - content.action.edit 15 | - content.action.view 16 | - content.code.annotate 17 | - content.code.copy 18 | - content.tooltips 19 | - navigation.sections 20 | - navigation.tabs 21 | - navigation.footer 22 | - navigation.indexes 23 | - navigation.top 24 | - navigation.tracking 25 | - search.highlight 26 | - search.share 27 | - search.suggest 28 | - toc.follow 29 | 30 | extra: 31 | version: 32 | provider: mike 33 | alias: true 34 | 35 | markdown_extensions: 36 | - pymdownx.highlight: 37 | anchor_linenums: true 38 | line_spans: __span 39 | pygments_lang_class: true 40 | - pymdownx.inlinehilite 41 | - pymdownx.snippets: 42 | check_paths: true 43 | base_path: [ "." 
] 44 | - pymdownx.superfences 45 | 46 | nav: 47 | - Home: index.md 48 | - Getting Started: 49 | - Airflow Standalone: getting-started/quick-start-airflow-standalone.md 50 | - Astro CLI: getting-started/quick-start-astro-cli.md 51 | - Configuration: 52 | - configuration/configuring_workflows.md 53 | - configuration/environment_variables.md 54 | - configuration/defaults.md 55 | - Features: 56 | - features/dynamic_tasks.md 57 | - features/datasets.md 58 | - features/callbacks.md 59 | - features/custom_operators.md 60 | - features/http_task.md 61 | - features/multiple_configuration_files.md 62 | 63 | - Comparison: 64 | - comparison/index.md 65 | - Traditional Airflow Operators: comparison/traditional_operators.md 66 | - TaskFlow API: comparison/taskflow_api.md 67 | - Contributing: 68 | - Code of Conduct: contributing/code_of_conduct.md 69 | - contributing/contributors.md 70 | - contributing/howto.md 71 | - contributing/roles.md 72 | 73 | plugins: 74 | - mike: 75 | alias_type: symlink 76 | - search 77 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "dag-factory" 7 | dynamic = ["version"] 8 | description = "Dynamically build Apache Airflow DAGs from YAML files" 9 | authors = [{ name = "Astronomer", email = "humans@astronomer.io" }] 10 | readme = "README.md" 11 | license = "Apache-2.0" 12 | license-files = { paths = ["LICENSE"] } 13 | requires-python = ">=3.8" 14 | keywords = ["airflow", "apache-airflow", "provider", "astronomer", "dag"] 15 | classifiers = [ 16 | "Development Status :: 5 - Production/Stable", 17 | "License :: OSI Approved :: Apache Software License", 18 | "Topic :: Database", 19 | "Framework :: Apache Airflow", 20 | "Intended Audience :: Developers", 21 | "Programming Language :: Python :: Implementation :: CPython", 22 | "Programming Language :: Python :: Implementation :: PyPy", 23 | "Programming Language :: Python :: 3", 24 | "Programming Language :: Python :: 3 :: Only", 25 | "Programming Language :: Python :: 3.8", 26 | "Programming Language :: Python :: 3.9", 27 | "Programming Language :: Python :: 3.10", 28 | "Programming Language :: Python :: 3.11", 29 | "Programming Language :: Python :: 3.12", 30 | ] 31 | dependencies = [ 32 | "apache-airflow>=2.3", 33 | "apache-airflow-providers-http>=2.0.0", 34 | "apache-airflow-providers-cncf-kubernetes<10.4.2", # https://github.com/astronomer/dag-factory/issues/397 35 | "pyyaml", 36 | "packaging", 37 | ] 38 | 39 | 40 | [project.optional-dependencies] 41 | tests = [ 42 | "apache-airflow-providers-slack", 43 | "pytest>=6.0", 44 | "pytest-cov", 45 | "pre-commit", 46 | ] 47 | 48 | ###################################### 49 | # TESTING 50 | ###################################### 51 | 52 | [tool.hatch.envs.tests] 53 | dependencies = [ 54 | "dag-factory[tests]", 55 | "apache-airflow~={matrix:airflow}.0,!=2.9.0,!=2.9.1", # https://github.com/apache/airflow/pull/39670 56 | "httpx>=0.25.0", 57 | "pandas", 58 | ] 59 | pre-install-commands = ["sh scripts/test/pre-install-airflow.sh {matrix:airflow} {matrix:python}"] 60 | 61 | [[tool.hatch.envs.tests.matrix]] 62 | python = ["3.8", "3.9", "3.10", "3.11", "3.12"] 63 | airflow = ["2.3", "2.4", "2.5", "2.6", "2.7", "2.8", "2.9", "2.10"] 64 | 65 | 66 | [tool.hatch.envs.tests.scripts] 67 | freeze = "pip freeze" 68 | static-check = " pre-commit run --files 
dagfactory/*" 69 | test = 'sh scripts/test/unit.sh' 70 | test-cov = 'sh scripts/test/unit-cov.sh' 71 | test-integration = 'sh scripts/test/integration.sh' 72 | test-integration-setup = 'sh scripts/test/integration-setup.sh' 73 | 74 | [project.urls] 75 | Source = "https://github.com/astronomer/dag-factory" 76 | 77 | [tool.hatch.version] 78 | path = "dagfactory/__init__.py" 79 | 80 | [project.entry-points."airflow.plugins"] 81 | dagfactory = "dagfactory.plugin:DagFactoryPlugin" 82 | 83 | [tool.hatch.build] 84 | sources = ["."] 85 | 86 | [tool.hatch.build.targets.sdist] 87 | include = ["dagfactory"] 88 | 89 | [tool.hatch.build.targets.wheel] 90 | packages = ["dagfactory"] 91 | 92 | [tool.distutils.bdist_wheel] 93 | universal = true 94 | 95 | [tool.pytest.ini_options] 96 | filterwarnings = ["ignore::DeprecationWarning"] 97 | minversion = "6.0" 98 | markers = ["integration", "callbacks"] 99 | 100 | ###################################### 101 | # DOCS 102 | ###################################### 103 | 104 | [tool.hatch.envs.docs] 105 | dependencies = [ 106 | "mkdocs", 107 | "mike", 108 | "pymdown-extensions", 109 | "mkdocs-material", 110 | ] 111 | 112 | [tool.hatch.envs.docs.scripts] 113 | dev = "mkdocs build && mkdocs serve" # For local development and preventing publishing 114 | gh-deploy = "python scripts/docs_deploy.py dev" 115 | gh-release = "python scripts/docs_deploy.py release" 116 | 117 | ###################################### 118 | # THIRD PARTY TOOLS 119 | ###################################### 120 | 121 | [tool.black] 122 | line-length = 120 123 | target-version = ['py39', 'py310', 'py311', 'py312'] 124 | 125 | [tool.ruff] 126 | line-length = 120 127 | 128 | [tool.ruff.lint] 129 | select = ["C901", "D300", "I", "F"] 130 | ignore = ["F541", "C901"] 131 | 132 | [tool.ruff.lint.isort] 133 | combine-as-imports = true 134 | known-first-party = ["dagfactory", "tests"] 135 | 136 | [tool.ruff.lint.mccabe] 137 | max-complexity = 10 138 | -------------------------------------------------------------------------------- /scripts/airflow3/.gitignore: -------------------------------------------------------------------------------- 1 | airflow.db-shm 2 | airflow.db-wal 3 | simple_auth_manager_passwords.json.generated 4 | venv-af3 5 | -------------------------------------------------------------------------------- /scripts/airflow3/README.md: -------------------------------------------------------------------------------- 1 | # Run Airflow3 Locally 2 | 3 | This guide will walk you through the process of setting up Apache Airflow 3 locally using pip. You can choose either SQLite or Postgres as the database backend for Airflow. 4 | 5 | ## 1. Setup Postgres Container (Optional) 6 | 7 | By default, SQLite will be used as Airflow metadata database unless you update the AIRFLOW__DATABASE__SQL_ALCHEMY_CONN environment variable to point to PostgreSQL. The following command will pull the official Postgres image , create a container named postgres, and expose the required ports. 
8 | 9 | ### 1.1 Pull Postgres Image 10 | 11 | ```commandline 12 | docker run --name postgres -p 5432:5432 -p 5433:5433 -e POSTGRES_PASSWORD=postgres postgres 13 | ``` 14 | 15 | ### 1.2 Access the PostgreSQL Console and Create the Database 16 | 17 | Now that the PostgreSQL container is running, you can connect to it from the command line using psql: 18 | 19 | ```commandline 20 | psql -U postgres 21 | ``` 22 | 23 | ### 1.3 Create the Database for Airflow 24 | 25 | Once you're inside the psql interactive terminal, you can create a new database that Airflow will use. 26 | 27 | ```commandline 28 | CREATE DATABASE airflow_db; 29 | ``` 30 | 31 | ## 2. Setup Virtual Environment for Airflow3 32 | 33 | You need to configure the virtual environment for Airflow3. 34 | 35 | ### 2.1 Export ENV 36 | 37 | This will export the Airflow-related environment variables, such as AIRFLOW_HOME. 38 | 39 | ```commandline 40 | source scripts/airflow3/env.sh 41 | ``` 42 | 43 | ## 3. Install Dependencies 44 | 45 | ```commandline 46 | sh scripts/airflow3/setup.sh 47 | ``` 48 | 49 | ## 4. Run Airflow in Standalone Mode 50 | 51 | Activate the virtual environment created in the previous step and run Airflow: 52 | 53 | ```commandline 54 | source "$(pwd)/scripts/airflow3/venv-af3/bin/activate" 55 | 56 | airflow standalone 57 | ``` 58 | 59 | This command will: 60 | 61 | - Set the necessary environment variables (like AIRFLOW_HOME). 62 | - Initialize the Airflow database. 63 | - Start the Airflow webserver, scheduler, and triggerer. 64 | 65 | ## 5. Run Airflow Tests 66 | 67 | Once Airflow is running, you can also run the tests. 68 | 69 | ```commandline 70 | source scripts/airflow3/env.sh 71 | 72 | source "$(pwd)/scripts/airflow3/venv-af3/bin/activate" 73 | 74 | sh scripts/airflow3/tests.sh 75 | ``` 76 | 77 | ## 6. Access the Airflow Web Interface 78 | 79 | After running the standalone command, you can access the Airflow web interface to monitor the status of your DAGs, tasks, and more. 80 | 81 | - The web interface should be available at [Localhost Server](http://localhost:8080) 82 | 83 | ## 7. Install Airflow from the Main Branch 84 | 85 | If you want to install Airflow from the main branch, follow the steps from sections 1, 2, and 3 above. Then, proceed with the following steps: 86 | 87 | ### 7.1 Set ENV AIRFLOW_REPO_DIR 88 | 89 | Set the `AIRFLOW_REPO_DIR` environment variable in scripts/airflow3/env.sh so that it points to the path where your Airflow repository is cloned, for example as shown below.
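The path below is only an illustration (by default, env.sh points at `$PWD/../airflow`); adjust it to wherever your Airflow clone lives:

```commandline
# Illustrative value only; point this at your local Airflow checkout
export AIRFLOW_REPO_DIR="$HOME/airflow"
```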
90 | 91 | ### 7.2 Activate the Virtual Environment 92 | 93 | ```commandline 94 | source scripts/airflow3/env.sh 95 | 96 | source "$(pwd)/scripts/airflow3/venv-af3/bin/activate" 97 | ``` 98 | 99 | ### 7.3 Install Airflow from the Main Branch 100 | 101 | ```commandline 102 | sh scripts/airflow3/install_from_main.sh 103 | ``` 104 | 105 | ### 7.4 Run Airflow standalone 106 | 107 | Finally, run Airflow in standalone mode again: 108 | 109 | ```commandline 110 | airflow standalone 111 | ``` 112 | -------------------------------------------------------------------------------- /scripts/airflow3/dags/example_dag_factory.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | # The following import is here so Airflow parses this file 5 | # from airflow import DAG 6 | import dagfactory 7 | 8 | DEFAULT_CONFIG_ROOT_DIR = os.getenv("DEFAULT_CONFIG_ROOT_DIR", "/usr/local/airflow/dags/") 9 | 10 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR)) 11 | 12 | config_file = str(CONFIG_ROOT_DIR / "example_dag_factory.yml") 13 | 14 | example_dag_factory = dagfactory.DagFactory(config_file) 15 | 16 | # Creating task dependencies 17 | example_dag_factory.clean_dags(globals()) 18 | example_dag_factory.generate_dags(globals()) 19 | -------------------------------------------------------------------------------- /scripts/airflow3/dags/example_dag_factory.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | catchup: false, 4 | start_date: 2024-11-11 5 | 6 | # ----8<--- [ start: example_dag_yaml_configuration ] 7 | basic_example_dag: 8 | default_args: 9 | owner: "custom_owner" 10 | description: "this is an example dag" 11 | schedule: "0 3 * * *" 12 | render_template_as_native_obj: True 13 | tasks: 14 | task_1: 15 | operator: airflow.providers.standard.operators.bash.BashOperator 16 | bash_command: "echo 1" 17 | task_2: 18 | operator: airflow.providers.standard.operators.bash.BashOperator 19 | bash_command: "echo 2" 20 | dependencies: [task_1] 21 | task_3: 22 | operator: airflow.providers.standard.operators.bash.BashOperator 23 | bash_command: "echo 2" 24 | dependencies: [task_1] 25 | # ----8<--- [ end: example_dag_yaml_configuration ] 26 | -------------------------------------------------------------------------------- /scripts/airflow3/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | PYTHONPATH="$PWD" 6 | export PYTHONPATH 7 | AIRFLOW_HOME="$PWD/scripts/airflow3" 8 | export AIRFLOW_HOME 9 | export AIRFLOW__LOGGING__BASE_LOG_FOLDER="$AIRFLOW_HOME/logs" 10 | export AIRFLOW__WEBSERVER__CONFIG_FILE="$AIRFLOW_HOME/webserver_config.py" 11 | export AIRFLOW__SCHEDULER__CHILD_PROCESS_LOG_DIRECTORY="$AIRFLOW_HOME/logs/scheduler" 12 | # Comment below line to use the Postgres database backend. 13 | export AIRFLOW__DATABASE__SQL_ALCHEMY_CONN="sqlite:///$AIRFLOW_HOME/airflow.db" 14 | # Uncomment below line to use the Postgres database backend. 
15 | # export AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://postgres:postgres@localhost:5432/airflow_db 16 | export AIRFLOW__CORE__LOAD_EXAMPLES=false 17 | export AIRFLOW__CORE__DAGBAG_IMPORT_ERROR_TRACEBACK_DEPTH=10 18 | export AIRFLOW__CORE__DAGBAG_IMPORT_TIMEOUT=300 19 | # export AIRFLOW__LOGGING__LOGGING_LEVEL=DEBUG 20 | export AIRFLOW_REPO_DIR="$PWD/../airflow" 21 | export DEFAULT_CONFIG_ROOT_DIR="$AIRFLOW_HOME/dags" 22 | -------------------------------------------------------------------------------- /scripts/airflow3/install_from_main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -v 4 | set -x 5 | set -e 6 | 7 | : "${AIRFLOW_REPO_DIR:?Environment variable AIRFLOW_REPO_DIR is not set}" 8 | echo "AIRFLOW_REPO_DIR is set to '$AIRFLOW_REPO_DIR'" 9 | 10 | DAG_FACTORY_ROOT="$PWD" 11 | 12 | cd "$AIRFLOW_REPO_DIR" 13 | git checkout main && git pull 14 | 15 | pip uninstall -y apache-airflow-core 16 | pip uninstall -y apache-airflow-task-sdk 17 | pip uninstall -y apache-airflow-providers-fab 18 | pip uninstall -y apache-airflow 19 | pip uninstall -y apache-airflow-providers-git 20 | 21 | rm -rf dist 22 | 23 | pip install uv 24 | 25 | pip install -e "$AIRFLOW_REPO_DIR/dev/breeze" --force 26 | 27 | breeze release-management prepare-provider-distributions \ 28 | --distributions-list celery,common.io,common.compat,fab,standard,openlineage,git \ 29 | --distribution-format wheel 30 | 31 | breeze release-management prepare-airflow-distributions --distribution-format wheel 32 | 33 | cd task-sdk 34 | uv build --package apache-airflow-task-sdk --wheel 35 | 36 | cd .. 37 | 38 | pip install dist/* 39 | 40 | cd "$DAG_FACTORY_ROOT" 41 | -------------------------------------------------------------------------------- /scripts/airflow3/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow 2 | apache-airflow-task-sdk 3 | apache-airflow-providers-standard 4 | apache-airflow-providers-fab 5 | psycopg2 6 | asyncpg 7 | -------------------------------------------------------------------------------- /scripts/airflow3/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Exit on error 4 | set -e 5 | 6 | # Create a virtual environment for Airflow 3 (you can change the location as needed) 7 | echo "Creating virtual environment at $(pwd)/scripts/airflow3/venv-af3" 8 | python3 -m venv "$(pwd)/scripts/airflow3/venv-af3" 9 | 10 | # Activate the virtual environment 11 | echo "Activating virtual environment..." 12 | source "$(pwd)/scripts/airflow3/venv-af3/bin/activate" 13 | 14 | # Install dependencies in the virtual environment 15 | echo "Installing dependencies..." 16 | pip3 install --pre -r "$(pwd)/scripts/airflow3/requirements.txt" 17 | 18 | pip3 install ".[tests]" 19 | 20 | echo "Virtual environment setup and dependencies installed successfully!"
21 | -------------------------------------------------------------------------------- /scripts/airflow3/tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | set -e 6 | 7 | airflow dags list-import-errors 8 | -------------------------------------------------------------------------------- /scripts/docs_deploy.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | 4 | from packaging import version 5 | 6 | import dagfactory 7 | 8 | 9 | def deploy_docs(deploy_type: str): 10 | _version = version.parse(dagfactory.__version__) 11 | 12 | set_default = False 13 | 14 | if deploy_type == "release": 15 | if _version.pre is not None: 16 | command = ["mike", "deploy", "--push", "dev"] 17 | else: 18 | command = ["mike", "deploy", "--push", "--update-aliases", str(_version), "latest"] 19 | set_default = True 20 | else: 21 | command = ["mike", "deploy", "--push", "dev"] 22 | 23 | try: 24 | subprocess.run(command, capture_output=True, text=True, check=True) 25 | if set_default: 26 | default_command = ["mike", "set-default", "latest"] 27 | subprocess.run(default_command, capture_output=True, text=True, check=True) 28 | except subprocess.CalledProcessError as e: 29 | raise Exception(f"Error deploying: {e.stderr}") 30 | 31 | 32 | if __name__ == "__main__": 33 | if len(sys.argv) < 2: 34 | raise Exception("Argument deploy type is required: 'dev' or 'release'") 35 | 36 | deploy_type = sys.argv[1] 37 | 38 | if deploy_type not in ["dev", "release"]: 39 | raise Exception("Invalid argument provided. Valid deploy types are 'dev' or 'release'.") 40 | 41 | deploy_docs(deploy_type) 42 | -------------------------------------------------------------------------------- /scripts/test/integration-setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -v 4 | set -x 5 | set -e 6 | 7 | if [ -L "dags" ]; then 8 | echo "Symbolic link 'dags' already exists." 9 | elif [ -e "dags" ]; then 10 | echo "'dags' exists but is not a symbolic link. Please resolve this manually." 11 | else 12 | ln -s dev/dags dags 13 | echo "Symbolic link 'dags' created successfully." 14 | fi 15 | 16 | rm -rf airflow.* 17 | pip freeze | grep airflow 18 | airflow db reset -y 19 | airflow db init 20 | -------------------------------------------------------------------------------- /scripts/test/integration.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | 7 | pip freeze | grep airflow 8 | echo $AIRFLOW_HOME 9 | ls $AIRFLOW_HOME 10 | 11 | airflow db check 12 | 13 | # Necessary for overcoming the following issue with Airflow 2.3 and 2.4: 14 | # ImportError: Pandas requires version '0.9.0' or newer of 'tabulate' (version '0.8.9' currently installed) 15 | pip install "tabulate>=0.9.0" 16 | 17 | pytest -vv \ 18 | --cov=dagfactory \ 19 | --cov-report=term-missing \ 20 | --cov-report=xml \ 21 | -m integration 22 | -------------------------------------------------------------------------------- /scripts/test/pre-install-airflow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | AIRFLOW_VERSION="$1" 4 | PYTHON_VERSION="$2" 5 | 6 | # Use this to set the appropriate Python environment in Github Actions, 7 | # while also not assuming --system when running locally. 
8 | if [ "$GITHUB_ACTIONS" = "true" ] && [ -z "${VIRTUAL_ENV}" ]; then 9 | py_path=$(which python) 10 | virtual_env_dir=$(dirname "$(dirname "$py_path")") 11 | export VIRTUAL_ENV="$virtual_env_dir" 12 | fi 13 | 14 | echo "${VIRTUAL_ENV}" 15 | 16 | CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-$AIRFLOW_VERSION.0/constraints-$PYTHON_VERSION.txt" 17 | curl -sSL $CONSTRAINT_URL -o /tmp/constraint.txt 18 | # Workaround to remove PyYAML constraint that will work on both Linux and MacOS 19 | sed '/PyYAML==/d' /tmp/constraint.txt > /tmp/constraint.txt.tmp 20 | mv /tmp/constraint.txt.tmp /tmp/constraint.txt 21 | # Install Airflow with constraints 22 | pip install uv 23 | uv pip install "apache-airflow==$AIRFLOW_VERSION" --constraint /tmp/constraint.txt 24 | 25 | pip install apache-airflow-providers-cncf-kubernetes --constraint /tmp/constraint.txt 26 | rm /tmp/constraint.txt 27 | -------------------------------------------------------------------------------- /scripts/test/unit-cov.sh: -------------------------------------------------------------------------------- 1 | pytest \ 2 | -vv \ 3 | --cov=dagfactory \ 4 | --cov-report=term-missing \ 5 | --cov-report=xml \ 6 | --ignore=tests/test_example_dags.py 7 | -------------------------------------------------------------------------------- /scripts/test/unit.sh: -------------------------------------------------------------------------------- 1 | pytest \ 2 | -vv \ 3 | --ignore=tests/test_example_dags.py 4 | -------------------------------------------------------------------------------- /scripts/verify_tag_and_version.py: -------------------------------------------------------------------------------- 1 | """Verify the version of the Package with the version in Git tag.""" 2 | 3 | import os 4 | import re 5 | from pathlib import Path 6 | 7 | repo_dir = Path(__file__).parent.parent 8 | 9 | path_of_init_file = Path(repo_dir / "dagfactory" / "__init__.py") 10 | version_file = path_of_init_file.read_text() 11 | git_ref = os.getenv("GITHUB_REF", "") 12 | git_tag = git_ref.replace("refs/tags/", "") 13 | git_tag = git_tag[1:] if git_tag.startswith("v") else git_tag 14 | version = re.findall('__version__ = "(.*)"', version_file)[0] 15 | 16 | if git_tag is not None: 17 | if version != git_tag: 18 | raise SystemExit(f"The version in {path_of_init_file} ({version}) does not match the Git Tag ({git_tag}).") 19 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/tests/__init__.py -------------------------------------------------------------------------------- /tests/fixtures/dag_factory.yml: -------------------------------------------------------------------------------- 1 | default: 2 | concurrency: 1 3 | dagrun_timeout_sec: 600 4 | default_args: 5 | end_date: 2018-03-05 6 | owner: default_owner 7 | retries: 1 8 | retry_delay_sec: 300 9 | start_date: 2018-03-01 10 | default_view: tree 11 | max_active_runs: 1 12 | orientation: LR 13 | schedule_interval: 0 1 * * * 14 | example_dag: 15 | default_args: 16 | owner: custom_owner 17 | start_date: 2 days 18 | description: this is an example dag 19 | doc_md: '##here is a doc md string' 20 | schedule_interval: 0 3 * * * 21 | tasks: 22 | task_1: 23 | bash_command: echo 1 24 | operator: airflow.operators.bash_operator.BashOperator 25 | task_2: 26 | bash_command: echo 2 
27 | dependencies: 28 | - task_1 29 | operator: airflow.operators.bash_operator.BashOperator 30 | task_3: 31 | bash_command: echo 3 32 | dependencies: 33 | - task_1 34 | operator: airflow.operators.bash_operator.BashOperator 35 | example_dag2: 36 | doc_md_file_path: $PWD/tests/fixtures/mydocfile.md 37 | schedule_interval: None 38 | tasks: 39 | task_1: 40 | bash_command: echo 1 41 | operator: airflow.operators.bash_operator.BashOperator 42 | task_2: 43 | bash_command: echo 2 44 | dependencies: 45 | - task_1 46 | operator: airflow.operators.bash_operator.BashOperator 47 | task_3: 48 | bash_command: echo 3 49 | dependencies: 50 | - task_1 51 | operator: airflow.operators.bash_operator.BashOperator 52 | example_dag3: 53 | doc_md_python_arguments: 54 | arg1: arg1 55 | arg2: arg2 56 | doc_md_python_callable_file: $PWD/tests/fixtures/doc_md_builder.py 57 | doc_md_python_callable_name: mydocmdbuilder 58 | tasks: 59 | task_1: 60 | bash_command: echo 1 61 | operator: airflow.operators.bash_operator.BashOperator 62 | example_dag4: 63 | vars: 64 | arg1: &arg1 'hello' 65 | arg2: &arg2 !join [*arg1, ' world'] 66 | tasks: 67 | task_1: 68 | bash_command: !join ['echo ', *arg2] 69 | operator: airflow.operators.bash_operator.BashOperator 70 | -------------------------------------------------------------------------------- /tests/fixtures/dag_factory_http_operator_task.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | catchup: false, 4 | start_date: 2025-03-20 5 | 6 | http_operator_example_dag: 7 | default_args: 8 | owner: "@owner" 9 | description: "this is an HttpOperator dag" 10 | schedule_interval: "0 3 * * *" 11 | tags: ['http'] 12 | render_template_as_native_obj: True 13 | tasks: 14 | send_request_json: 15 | operator: airflow.providers.http.operators.http.HttpOperator 16 | http_conn_id: "example_host" 17 | method: "POST" 18 | endpoint: "/run_test" 19 | data: 20 | data: "fake_data" 21 | format: "json" 22 | headers: 23 | Content-Type: application/json 24 | log_response: True 25 | send_request_plain_text: 26 | operator: airflow.providers.http.operators.http.HttpOperator 27 | http_conn_id: "example_host" 28 | method: "POST" 29 | endpoint: "/run_test" 30 | data: 31 | data: "fake_data" 32 | test: "plain_text" 33 | headers: 34 | Content-Type: text/plain 35 | log_response: True 36 | -------------------------------------------------------------------------------- /tests/fixtures/dag_factory_kubernetes_pod_operator.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | owner: 'default_owner' 4 | start_date: 2018-03-01 5 | end_date: 2018-03-05 6 | retries: 1 7 | retry_delay_sec: 300 8 | concurrency: 1 9 | max_active_runs: 1 10 | dagrun_timeout_sec: 600 11 | default_view: 'tree' 12 | orientation: 'LR' 13 | schedule_interval: '0 1 * * *' 14 | example_dag: 15 | tasks: 16 | task_1: 17 | operator: airflow.contrib.operators.kubernetes_pod_operator.KubernetesPodOperator 18 | namespace: 'default' 19 | config_file : 'path_to_config_file' 20 | image : 'image' 21 | image_pull_policy : 'Always' 22 | arguments : [ 23 | 'arg1', 24 | 'arg2', 25 | 'arg3', 26 | ] 27 | secrets : [{"secret":"secret","deploy_type":"env","deploy_target":"ENV_VAR"}] 28 | ports : [{"name" : "name","container_port":"container_port"},{"name" : "name","container_port":"container_port"}] 29 | volume_mounts : [ 30 | {"name":"name","mount_path":"mount_path","sub_path":"sub_path","read_only":"read_only"}, 31 | 
{"name":"name","mount_path":"mount_path","sub_path":"sub_path","read_only":"read_only"}, 32 | ] 33 | volumes : [ 34 | {"name":"name","configs":{'persistentVolumeClaim': {'claimName': 'test-volume'}}}, 35 | {"name":"name","configs":{'persistentVolumeClaim': {'claimName': 'test-volume'}}}, 36 | ] 37 | pod_runtime_info_envs : [ 38 | {"name":"name","field_path":"field_path"}, 39 | {"name":"name","field_path":"field_path"}, 40 | ] 41 | full_pod_spec : { 42 | "api_version": "api_version", 43 | "kind": "kind", 44 | "metadata": "metadata", 45 | "spec": "spec", 46 | "status": "status", 47 | } 48 | init_containers : [ 49 | {"name": "name","args":"args","command":"command"}, 50 | ] 51 | labels: {'foo': 'bar'} 52 | name: 'passing-test' 53 | task_id: 'passing-task' 54 | get_logs: True 55 | in_cluster: False 56 | dependencies: [] 57 | task_2: 58 | operator: airflow.contrib.operators.kubernetes_pod_operator.KubernetesPodOperator 59 | namespace: 'default' 60 | config_file : 'path_to_config_file' 61 | image : 'image' 62 | image_pull_policy : 'Always' 63 | arguments : [ 64 | 'arg1', 65 | 'arg2', 66 | 'arg3', 67 | ] 68 | labels: {'foo': 'bar'} 69 | name: 'passing-test' 70 | task_id: 'passing-task' 71 | get_logs: True 72 | in_cluster: False 73 | dependencies: ['task_1'] 74 | -------------------------------------------------------------------------------- /tests/fixtures/dag_factory_simple_http_operator_task.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | catchup: false, 4 | start_date: 2025-03-20 5 | 6 | simple_http_operator_example_dag: 7 | default_args: 8 | owner: "@owner" 9 | description: "this is a SimpleHttpOperator dag" 10 | schedule_interval: "0 3 * * *" 11 | tags: ['http'] 12 | render_template_as_native_obj: True 13 | tasks: 14 | send_request_json: 15 | operator: airflow.operators.http_operator.SimpleHttpOperator 16 | http_conn_id: "example_host" 17 | method: "POST" 18 | endpoint: "/run_test" 19 | data: 20 | data: "fake_data" 21 | format: "json" 22 | headers: 23 | Content-Type: application/json 24 | log_response: True 25 | send_request_plain_text: 26 | operator: airflow.operators.http_operator.SimpleHttpOperator 27 | http_conn_id: "example_host" 28 | method: "POST" 29 | endpoint: "/run_test" 30 | data: 31 | data: "fake_data" 32 | test: "plain_text" 33 | headers: 34 | Content-Type: text/plain 35 | log_response: True 36 | -------------------------------------------------------------------------------- /tests/fixtures/dag_factory_task_group.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | end_date: 2018-03-05 4 | owner: default_owner 5 | retries: 1 6 | retry_delay_sec: 300 7 | start_date: 2018-03-01 8 | default_view: tree 9 | max_active_runs: 1 10 | orientation: LR 11 | schedule_interval: 0 1 * * * 12 | example_dag: 13 | description: "this dag uses task groups" 14 | task_groups: 15 | task_group_1: 16 | tooltip: "this is a task group" 17 | dependencies: [task_1] 18 | tasks: 19 | task_1: 20 | operator: airflow.operators.bash_operator.BashOperator 21 | bash_command: "echo 1" 22 | task_2: 23 | operator: airflow.operators.bash_operator.BashOperator 24 | bash_command: "echo 2" 25 | task_group_name: task_group_1 26 | task_3: 27 | operator: airflow.operators.python_operator.PythonOperator 28 | python_callable_name: print_hello 29 | python_callable_file: examples/print_hello.py 30 | task_group_name: task_group_1 31 | dependencies: [task_2] 32 | task_4: 33 | operator: 
airflow.operators.bash_operator.BashOperator 34 | bash_command: "echo 1" 35 | dependencies: [task_group_1] 36 | -------------------------------------------------------------------------------- /tests/fixtures/dag_factory_variables_as_arguments.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | owner: 'default_owner' 4 | start_date: 2018-03-01 5 | end_date: 2018-03-05 6 | retries: 1 7 | retry_delay_sec: 300 8 | concurrency: 1 9 | max_active_runs: 1 10 | dagrun_timeout_sec: 600 11 | default_view: 'tree' 12 | orientation: 'LR' 13 | schedule_interval: '0 1 * * *' 14 | 15 | example_dag: 16 | default_args: 17 | owner: 'custom_owner' 18 | start_date: 2 days 19 | description: 'this is an example dag' 20 | schedule_interval: '0 3 * * *' 21 | tasks: 22 | task_1: 23 | operator: airflow.operators.bash_operator.BashOperator 24 | bash_command: 'echo 1' 25 | task_2: 26 | operator: airflow.operators.bash_operator.BashOperator 27 | bash_command: 'echo 2' 28 | dependencies: [task_1] 29 | task_3: 30 | operator: airflow.operators.bash_operator.BashOperator 31 | bash_command: 'echo 3' 32 | dependencies: [task_1] 33 | variables_as_arguments : [ 34 | {"variable":"var1","attribute":"bash_command"} 35 | ] 36 | 37 | second_example_dag: 38 | default_args: 39 | owner: 'custom_owner' 40 | start_date: 3 days 41 | description: 'this is a second example dag' 42 | schedule_interval: '0 6 * * *' 43 | tasks: 44 | task_0: 45 | operator: airflow.operators.bash_operator.BashOperator 46 | bash_command: 'echo 1' 47 | -------------------------------------------------------------------------------- /tests/fixtures/dag_md_docs.yml: -------------------------------------------------------------------------------- 1 | default: 2 | concurrency: 1 3 | dagrun_timeout_sec: 600 4 | default_args: 5 | end_date: 2018-03-05 6 | owner: default_owner 7 | retries: 1 8 | retry_delay_sec: 300 9 | start_date: 2018-03-01 10 | default_view: tree 11 | max_active_runs: 1 12 | orientation: LR 13 | schedule_interval: 0 1 * * * 14 | 15 | example_dag2: 16 | schedule_interval: None 17 | tasks: 18 | task_1: 19 | bash_command: echo 1 20 | operator: airflow.operators.bash_operator.BashOperator 21 | task_2: 22 | bash_command: echo 2 23 | dependencies: 24 | - task_1 25 | operator: airflow.operators.bash_operator.BashOperator 26 | task_3: 27 | bash_command: echo 3 28 | dependencies: 29 | - task_1 30 | operator: airflow.operators.bash_operator.BashOperator 31 | -------------------------------------------------------------------------------- /tests/fixtures/defaults.yml: -------------------------------------------------------------------------------- 1 | default_args: 2 | start_date: "2025-01-01" 3 | owner: "global_owner" 4 | depends_on_past: true 5 | -------------------------------------------------------------------------------- /tests/fixtures/doc_md_builder.py: -------------------------------------------------------------------------------- 1 | def mydocmdbuilder(**kwargs): 2 | return f"{kwargs}" 3 | -------------------------------------------------------------------------------- /tests/fixtures/invalid_dag_factory.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | owner: 'default_owner' 4 | max_active_runs: 1 5 | dagrun_timeout_sec: 600 6 | schedule_interval: '0 1 * * *' 7 | 8 | example_dag: 9 | default_args: 10 | owner: 'custom_owner' 11 | description: 'this is an example dag' 12 | schedule_interval: '0 3 * * *' 13 | 
tasks: 14 | task_1: 15 | operator: airflow.operators.bash_operator.BashOperator 16 | bash_command: 'echo 1' 17 | task_2: 18 | operator: airflow.operators.bash_operator.BashOperator 19 | bash_command: 'echo 2' 20 | dependencies: [task_1] 21 | task_3: 22 | operator: airflow.operators.bash_operator.BashOperator 23 | bash_command: 'echo 3' 24 | dependencies: [task_1] 25 | -------------------------------------------------------------------------------- /tests/fixtures/invalid_yaml.yml: -------------------------------------------------------------------------------- 1 | default: 2 | default_args: 3 | owner: 'default_owner' 4 | start_date: 2018-03-01 5 | max_active_runs: 1 6 | schedule_interval: '0 1 * * *' 7 | 8 | example_dag: 9 | default_args: 10 | owner: 'custom_owner' 11 | start_date: 2 days 12 | description: 'this is an example dag' 13 | schedule_interval: '0 3 * * *' 14 | tasks: 15 | task_1 16 | operator: airflow.operators.bash_operator.BashOperator 17 | bash_command: 'echo 1' 18 | task_2: 19 | operator: airflow.operators.bash_operator.BashOperator 20 | bash_command: 'echo 2' 21 | dependencies: [task_1] 22 | task_3: 23 | operator: airflow.operators.bash_operator.BashOperator 24 | bash_command: 'echo 3' 25 | dependencies: [task_1] 26 | -------------------------------------------------------------------------------- /tests/fixtures/mydocfile.md: -------------------------------------------------------------------------------- 1 | # Generantia quoque et umbrae canunt exspectatum medio 2 | 3 | ## In monstra bracchia et terrae a donec 4 | 5 | Lorem markdownum sua tot templisque auras conquerar avertit dant. Quis patris et 6 | Stygios tanta: est neque altera curvamine piasti, tota summa, anne aqua ponto. 7 | **In exanimi** Aegides studiis repetisse tales, promittat futurus! Secundas 8 | anima. 9 | 10 | 1. Ad patulis 11 | 2. Bracchia et auras 12 | 3. Ista inductas pinum 13 | 4. In fuit arcus Achilli 14 | 5. Quoque sumpsisse aurumque abesto fugit 15 | 6. Parum plangebat volvens addidit 16 | 17 | Vasto electarumque adit: ars mactatos potest Apollineos reliquit venis, abesto; 18 | flexile micantes, Hippodamas hunc urit. Iubebit umeris ex Phoebi gelidis in 19 | templorum summo; etiam hic **sumptas nosces** non? Decidit pariter membra nec 20 | deponunt dumque aere placido nec fata navalibus. Harena tempora esset 21 | sacrificos, poenas quam; caelestia superi isdem corpora. *Flexisque fraterna* 22 | removerat, concursibus ripae inferiora cuiquam *nisi plumbea* moriente nunc 23 | noviens meosque talia occiderat fecerat cogamque? 24 | 25 | clip_parameter_joystick -= xslt; 26 | var flashHdtv = nntp; 27 | if (mode_data_webcam) { 28 | hashtagPartyFirmware += root + jsf_rw.serverArray(pramOspfPrinter, 1, 29 | timeIo); 30 | } else { 31 | vci.mediaStandalone += javaIcannThyristor; 32 | richPciFddi(httpsCdSpam, web_bitmap + tutorial_source); 33 | sanWarm(tutorial + backlink_control); 34 | } 35 | if (remoteMetaApi == native + zebibyteExploitWan) { 36 | record_drive(root); 37 | control.fileVariable = pageForumMca; 38 | compressionServiceFlash = ocr; 39 | } else { 40 | metal_menu(5, honeypotIpvFlat.analyst_undo.alignmentChip(programIcmp, 41 | 2), noc_zip); 42 | } 43 | var scan_ics_basic = stack + snmpVirusFpu + -4 * outputWebcam; 44 | 45 | ## Nefasque pone lugubris moveant sceptra 46 | 47 | Reddit erat torus cornua pars, sceleris in ecce, illa esse quicquid adicit, 48 | obstantes. Dum puer egredior, nec telum [veniet](http://www.quo-piget.io/), aura 49 | **hic** ambobus septem Aram poteras annis. 
Traxit pectore: Troiane valebant 50 | increpat. Thoona fit et sibi adopertaque hanc; virgo natasque essent [quas 51 | polypus dicens](http://voce.org/corpore-habet) partem genibus, ex. 52 | 53 | > Provolvi ab summa quae verus illis: pronus est agmina flectat sua digna 54 | > *ille*, longa. [Tantalus](http://tuulmi.org/violentus) Gryneus mihi 55 | > circumfunditur posse stipitis deprensus porrigit in penetrat digiti! Currus 56 | > fere canis pectore, odiis, sororia et annis! Adspicit *tu adest* sua inserui 57 | > Liber! Translucet exigite templis et blanda, orbes gravidus Aeetias qui, et. 58 | 59 | *Non vox*, sum frigus caput dedi, indulsit se plurima tendentes, relictis 60 | damnatque, ante lacessit. Incaluit pallam. Magni toros quiete, timor laeta arida 61 | credat neque loquetur, pariterque mane, gerit ripas crevit ne. Vultum nondum, 62 | exclamant omnibus: per causa! 63 | 64 | Modo sunt legit pascua saepe, numeros ausi; quae Thracum. Est regia parte 65 | decerpsit: sidera! At visa, avi tenebras tibi formosior in causa, Perseu ratem, 66 | utilitas res et tolle. Vixque Minervae, ore libertas domos adspergine si sonat 67 | ut fonte. 68 | 69 | Frugilegas idem progenuit habebat fortissime lateque foci pignora, nec resumit 70 | quam Atrides. Viscera sua Paphon violenta naresque *esse* totas **crimine 71 | resonantia** vulneret ubi lecti omnia. Sua ingens ubi fecit ait est indigestaque 72 | quas haberet da *aerias iaculum nulloque* fluctibus comites cognata, et. Ora 73 | **intrat damna ante** Poemenis annos, et creatis Dianae. Uno lacertis levem? 74 | -------------------------------------------------------------------------------- /tests/test_example_dags.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | try: 6 | from functools import cache 7 | except ImportError: 8 | from functools import lru_cache as cache 9 | 10 | import airflow 11 | import pytest 12 | from airflow.models.dagbag import DagBag 13 | from airflow.utils.db import create_default_connections 14 | from airflow.utils.session import provide_session 15 | from packaging.version import Version 16 | 17 | from . 
import utils as test_utils 18 | 19 | EXAMPLE_DAGS_DIR = Path(__file__).parent.parent / "dev/dags" 20 | AIRFLOW_IGNORE_FILE = EXAMPLE_DAGS_DIR / ".airflowignore" 21 | AIRFLOW_VERSION = Version(airflow.__version__) 22 | IGNORED_DAG_FILES = ["example_callbacks.py"] 23 | 24 | MIN_VER_DAG_FILE_VER: dict[str, list[str]] = { 25 | # TaskFlow examples unrelated to dynamic task mapping work in earlier versions 26 | "2.3": ["example_dynamic_task_mapping.py", "example_taskflow.py"], 27 | "2.5": [ 28 | "example_pypi_stats_dagfactory", 29 | "example_hackernews_dagfactory", 30 | "example_hackernews_plain_airflow", 31 | "example_pypi_stats_plain_airflow", 32 | ], 33 | "2.7": ["example_map_index_template.py"], 34 | "2.4": ["example_external_sensor_dag.py"], 35 | } 36 | 37 | # Add HTTP operator DAG to ignored files for providers-http versions without HttpOperator 38 | try: 39 | from airflow.providers.http.operators.http import HttpOperator 40 | HTTP_OPERATOR_AVAILABLE = True 41 | except ImportError: 42 | HTTP_OPERATOR_AVAILABLE = False 43 | 44 | 45 | @provide_session 46 | def get_session(session=None): 47 | create_default_connections(session) 48 | return session 49 | 50 | 51 | @pytest.fixture() 52 | def session(): 53 | return get_session() 54 | 55 | 56 | @cache 57 | def get_dag_bag() -> DagBag: 58 | """Create a DagBag by adding the files that are not supported to .airflowignore""" 59 | 60 | with open(AIRFLOW_IGNORE_FILE, "w+") as file: 61 | for min_version, files in MIN_VER_DAG_FILE_VER.items(): 62 | if AIRFLOW_VERSION < Version(min_version): 63 | print(f"Adding {files} to .airflowignore") 64 | file.writelines([f"{file}\n" for file in files]) 65 | 66 | for dagfile in IGNORED_DAG_FILES: 67 | print(f"Adding {dagfile} to .airflowignore") 68 | file.writelines([f"{dagfile}\n"]) 69 | 70 | # Print the contents of the .airflowignore file, and build the DagBag 71 | print(".airflowignore contents: ") 72 | print(AIRFLOW_IGNORE_FILE.read_text()) 73 | db = DagBag(EXAMPLE_DAGS_DIR, include_examples=False) 74 | 75 | assert db.dags 76 | assert not db.import_errors 77 | return db 78 | 79 | 80 | def get_dag_ids() -> list[str]: 81 | dag_bag = get_dag_bag() 82 | return dag_bag.dag_ids 83 | 84 | 85 | @pytest.mark.integration 86 | @pytest.mark.parametrize("dag_id", get_dag_ids()) 87 | def test_example_dag(session, dag_id: str): 88 | dag_bag = get_dag_bag() 89 | dag = dag_bag.get_dag(dag_id) 90 | 91 | # Skip http_operator_example_dag in older Airflow versions without HttpOperator 92 | if dag_id == "http_operator_example_dag" and not HTTP_OPERATOR_AVAILABLE: 93 | pytest.skip(f"Skipping {dag_id} because HttpOperator is not available") 94 | 95 | # Skip http_operator_example_dag in older Airflow versions 96 | # since it has compatibility issues with our connection handling 97 | if dag_id == "http_operator_example_dag" and AIRFLOW_VERSION < Version("2.7.0"): 98 | pytest.skip(f"Skipping {dag_id} on Airflow version {AIRFLOW_VERSION}") 99 | 100 | # This feature is available since Airflow 2.5: 101 | # https://airflow.apache.org/docs/apache-airflow/stable/release_notes.html#airflow-2-5-0-2022-12-02 102 | if AIRFLOW_VERSION >= Version("2.5"): 103 | dag.test() 104 | else: 105 | test_utils.run_dag(dag) 106 | -------------------------------------------------------------------------------- /tests/test_parsers.py: -------------------------------------------------------------------------------- 1 | import ast 2 | 3 | import pytest 4 | 5 | from dagfactory.parsers import SafeEvalVisitor 6 | 7 | 8 | @pytest.fixture 9 | def dataset_map(): 10 | return 
{"dataset_custom_1": 1, "dataset_custom_2": 2, "dataset_custom_3": 3} 11 | 12 | 13 | @pytest.fixture 14 | def visitor(dataset_map): 15 | return SafeEvalVisitor(dataset_map) 16 | 17 | 18 | def test_evaluate(visitor): 19 | condition_string = "dataset_custom_1 & dataset_custom_2 | dataset_custom_3" 20 | tree = ast.parse(condition_string, mode="eval") 21 | result = visitor.evaluate(tree) 22 | expected = (1 & 2) | 3 23 | assert result == expected 24 | 25 | 26 | def test_visit_BinOp_and(visitor): 27 | condition_string = "dataset_custom_1 & dataset_custom_2" 28 | tree = ast.parse(condition_string, mode="eval") 29 | result = visitor.evaluate(tree) 30 | expected = 1 & 2 31 | assert result == expected 32 | 33 | 34 | def test_visit_BinOp_or(visitor): 35 | condition_string = "dataset_custom_1 | dataset_custom_3" 36 | tree = ast.parse(condition_string, mode="eval") 37 | result = visitor.evaluate(tree) 38 | expected = 1 | 3 39 | assert result == expected 40 | 41 | 42 | def test_visit_Name(visitor): 43 | condition_string = "dataset_custom_2" 44 | tree = ast.parse(condition_string, mode="eval") 45 | result = visitor.evaluate(tree) 46 | expected = 2 47 | assert result == expected 48 | 49 | 50 | def test_visit_Constant(visitor): 51 | condition_string = "42" 52 | tree = ast.parse(condition_string, mode="eval") 53 | result = visitor.evaluate(tree) 54 | expected = 42 55 | assert result == expected 56 | 57 | 58 | def test_unsupported_binary_operation(visitor): 59 | condition_string = "dataset_custom_1 + dataset_custom_2" 60 | tree = ast.parse(condition_string, mode="eval") 61 | with pytest.raises(ValueError): 62 | visitor.evaluate(tree) 63 | 64 | 65 | def test_unsupported_unary_operation(visitor): 66 | condition_string = "+dataset_custom_1" 67 | tree = ast.parse(condition_string, mode="eval") 68 | with pytest.raises(ValueError): 69 | visitor.evaluate(tree) 70 | 71 | 72 | def test_undefined_variable(visitor): 73 | condition_string = "undefined_dataset" 74 | tree = ast.parse(condition_string, mode="eval") 75 | with pytest.raises(NameError): 76 | visitor.evaluate(tree) 77 | 78 | 79 | def test_unsupported_syntax(visitor): 80 | condition_string = "[1, 2, 3]" 81 | tree = ast.parse(condition_string, mode="eval") 82 | with pytest.raises(ValueError): 83 | visitor.evaluate(tree) 84 | -------------------------------------------------------------------------------- /tests/test_settings.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dagfactory import settings 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "value,expected_response", 8 | [ 9 | ("f", False), 10 | ("false", False), 11 | ("0", False), 12 | ("", False), 13 | ("none", False), 14 | ("True", True), 15 | ("true", True), 16 | ("1", True), 17 | ], 18 | ) 19 | def test_convert_to_boolean(value, expected_response): 20 | assert settings.convert_to_boolean(value) == expected_response 21 | -------------------------------------------------------------------------------- /tests/test_telemetry.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from unittest.mock import patch 3 | 4 | import httpx 5 | import pytest 6 | 7 | from dagfactory import telemetry 8 | 9 | 10 | def test_should_emit_is_true_by_default(): 11 | assert telemetry.should_emit() 12 | 13 | 14 | @patch("dagfactory.settings.enable_telemetry", True) 15 | def test_should_emit_is_true_when_only_enable_telemetry_is_true(): 16 | assert telemetry.should_emit() 17 | 18 | 19 | 
@patch("dagfactory.settings.do_not_track", True) 20 | def test_should_emit_is_false_when_do_not_track(): 21 | assert not telemetry.should_emit() 22 | 23 | 24 | @patch("dagfactory.settings.no_analytics", True) 25 | def test_should_emit_is_false_when_no_analytics(): 26 | assert not telemetry.should_emit() 27 | 28 | 29 | def test_collect_standard_usage_metrics(): 30 | metrics = telemetry.collect_standard_usage_metrics() 31 | expected_keus = [ 32 | "airflow_version", 33 | "dagfactory_version", 34 | "platform_machine", 35 | "platform_system", 36 | "python_version", 37 | "variables", 38 | ] 39 | assert sorted(metrics.keys()) == expected_keus 40 | 41 | 42 | class MockFailedResponse: 43 | is_success = False 44 | status_code = "404" 45 | text = "Non existent URL" 46 | 47 | 48 | @patch("dagfactory.telemetry.httpx.get", return_value=MockFailedResponse()) 49 | def test_emit_usage_metrics_is_unsuccessful(mock_httpx_get, caplog): 50 | sample_metrics = { 51 | "dagfactory_version": "0.2.0a1", 52 | "airflow_version": "2.10.1", 53 | "python_version": "3.11", 54 | "platform_system": "darwin", 55 | "platform_machine": "amd64", 56 | "event_type": "dag_run", 57 | "status": "success", 58 | "dag_hash": "d151d1fa2f03270ea116cc7494f2c591", 59 | "task_count": 3, 60 | } 61 | is_success = telemetry.emit_usage_metrics(sample_metrics) 62 | mock_httpx_get.assert_called_once_with( 63 | f"""https://astronomer.gateway.scarf.sh/dag-factory/v2/0.2.0a1/2.10.1/3.11/darwin/amd64/dag_run/success/d151d1fa2f03270ea116cc7494f2c591/3""", 64 | timeout=1.0, 65 | follow_redirects=True, 66 | ) 67 | assert not is_success 68 | log_msg = f"""Unable to emit usage metrics to https://astronomer.gateway.scarf.sh/dag-factory/v2/0.2.0a1/2.10.1/3.11/darwin/amd64/dag_run/success/d151d1fa2f03270ea116cc7494f2c591/3. Status code: 404. Message: Non existent URL""" 69 | assert caplog.text.startswith("WARNING") 70 | assert log_msg in caplog.text 71 | 72 | 73 | @patch("dagfactory.telemetry.httpx.get", side_effect=httpx.ConnectError(message="Something is not right")) 74 | def test_emit_usage_metrics_fails(mock_httpx_get, caplog): 75 | sample_metrics = { 76 | "dagfactory_version": "0.2.0a1", 77 | "airflow_version": "2.10.1", 78 | "python_version": "3.11", 79 | "platform_system": "darwin", 80 | "platform_machine": "amd64", 81 | "event_type": "dag_run", 82 | "status": "success", 83 | "dag_hash": "d151d1fa2f03270ea116cc7494f2c591", 84 | "task_count": 3, 85 | } 86 | is_success = telemetry.emit_usage_metrics(sample_metrics) 87 | mock_httpx_get.assert_called_once_with( 88 | f"""https://astronomer.gateway.scarf.sh/dag-factory/v2/0.2.0a1/2.10.1/3.11/darwin/amd64/dag_run/success/d151d1fa2f03270ea116cc7494f2c591/3""", 89 | timeout=1.0, 90 | follow_redirects=True, 91 | ) 92 | assert not is_success 93 | log_msg = f"""Unable to emit usage metrics to https://astronomer.gateway.scarf.sh/dag-factory/v2/0.2.0a1/2.10.1/3.11/darwin/amd64/dag_run/success/d151d1fa2f03270ea116cc7494f2c591/3. 
An HTTPX connection error occurred: Something is not right.""" 94 | assert caplog.text.startswith("WARNING") 95 | assert log_msg in caplog.text 96 | 97 | 98 | @pytest.mark.integration 99 | def test_emit_usage_metrics_succeeds(caplog): 100 | caplog.set_level(logging.DEBUG) 101 | sample_metrics = { 102 | "dagfactory_version": "0.2.0a1", 103 | "airflow_version": "2.10.1", 104 | "python_version": "3.11", 105 | "platform_system": "darwin", 106 | "platform_machine": "amd64", 107 | "event_type": "dag_run", 108 | "status": "success", 109 | "dag_hash": "d151d1fa2f03270ea116cc7494f2c591", 110 | "task_count": 3, 111 | } 112 | is_success = telemetry.emit_usage_metrics(sample_metrics) 113 | assert is_success 114 | assert caplog.text.startswith("DEBUG") 115 | assert "Telemetry is enabled. Emitting the following usage metrics to" in caplog.text 116 | 117 | 118 | @patch("dagfactory.telemetry.should_emit", return_value=False) 119 | def test_emit_usage_metrics_if_enabled_fails(mock_should_emit, caplog): 120 | caplog.set_level(logging.DEBUG) 121 | assert not telemetry.emit_usage_metrics_if_enabled("any", {}) 122 | assert caplog.text.startswith("DEBUG") 123 | assert "Telemetry is disabled. To enable it, export AIRFLOW__DAG_FACTORY__ENABLE_TELEMETRY=True." in caplog.text 124 | 125 | 126 | @patch("dagfactory.telemetry.should_emit", return_value=True) 127 | @patch("dagfactory.telemetry.collect_standard_usage_metrics", return_value={"k1": "v1", "k2": "v2", "variables": {}}) 128 | @patch("dagfactory.telemetry.emit_usage_metrics") 129 | def test_emit_usage_metrics_if_enabled_succeeds( 130 | mock_emit_usage_metrics, mock_collect_standard_usage_metrics, mock_should_emit 131 | ): 132 | assert telemetry.emit_usage_metrics_if_enabled("any", {"k2": "v2"}) 133 | mock_emit_usage_metrics.assert_called_once() 134 | assert mock_emit_usage_metrics.call_args.args[0] == { 135 | "k1": "v1", 136 | "k2": "v2", 137 | "event_type": "any", 138 | "variables": {"k2": "v2"}, 139 | } 140 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import sys 5 | from datetime import datetime 6 | from typing import Any 7 | 8 | from airflow.configuration import secrets_backend_list 9 | from airflow.exceptions import AirflowSkipException 10 | from airflow.models.dag import DAG 11 | from airflow.models.dagrun import DagRun 12 | from airflow.models.taskinstance import TaskInstance 13 | from airflow.secrets.local_filesystem import LocalFilesystemBackend 14 | from airflow.utils import timezone 15 | from airflow.utils.session import provide_session 16 | from airflow.utils.state import DagRunState, State 17 | from airflow.utils.types import DagRunType 18 | from sqlalchemy.orm.session import Session 19 | 20 | try: 21 | from airflow.utils.session import NEW_SESSION 22 | except ImportError: 23 | # Airflow < 2.3 did not have NEW_SESSION in airflow.utils.session 24 | from typing import cast 25 | 26 | from airflow import settings 27 | 28 | NEW_SESSION: settings.SASession = cast(settings.SASession, None) 29 | 30 | log = logging.getLogger(__name__) 31 | 32 | 33 | def run_dag(dag: DAG, conn_file_path: str | None = None) -> DagRun: 34 | return test_dag(dag=dag, conn_file_path=conn_file_path) 35 | 36 | 37 | # DAG.test() was added in Airflow version 2.5.0. And to test on older Airflow versions, we need to copy the 38 | # implementation here. 
39 | @provide_session 40 | def test_dag( 41 | dag, 42 | execution_date: datetime | None = None, 43 | run_conf: dict[str, Any] | None = None, 44 | conn_file_path: str | None = None, 45 | variable_file_path: str | None = None, 46 | session: Session = NEW_SESSION, 47 | ) -> DagRun: 48 | """ 49 | Execute one single DagRun for a given DAG and execution date. 50 | 51 | :param execution_date: execution date for the DAG run 52 | :param run_conf: configuration to pass to newly created dagrun 53 | :param conn_file_path: file path to a connection file in either yaml or json 54 | :param variable_file_path: file path to a variable file in either yaml or json 55 | :param session: database connection (optional) 56 | """ 57 | 58 | if conn_file_path or variable_file_path: 59 | local_secrets = LocalFilesystemBackend( 60 | variables_file_path=variable_file_path, connections_file_path=conn_file_path 61 | ) 62 | secrets_backend_list.insert(0, local_secrets) 63 | 64 | execution_date = execution_date or timezone.utcnow() 65 | 66 | dag.log.debug("Clearing existing task instances for execution date %s", execution_date) 67 | dag.clear( 68 | start_date=execution_date, 69 | end_date=execution_date, 70 | dag_run_state=False, 71 | session=session, 72 | ) 73 | dag.log.debug("Getting dagrun for dag %s", dag.dag_id) 74 | dr: DagRun = _get_or_create_dagrun( 75 | dag=dag, 76 | start_date=execution_date, 77 | execution_date=execution_date, 78 | run_id=DagRun.generate_run_id(DagRunType.MANUAL, execution_date), 79 | session=session, 80 | conf=run_conf, 81 | ) 82 | 83 | tasks = dag.task_dict 84 | dag.log.debug("starting dagrun") 85 | # Instead of starting a scheduler, we run the minimal loop possible to check 86 | # for task readiness and dependency management. This is notably faster 87 | # than creating a BackfillJob and allows us to surface logs to the user 88 | while dr.state == State.RUNNING: 89 | schedulable_tis, _ = dr.update_state(session=session) 90 | for ti in schedulable_tis: 91 | add_logger_if_needed(dag, ti) 92 | ti.task = tasks[ti.task_id] 93 | _run_task(ti, session=session) 94 | if conn_file_path or variable_file_path: 95 | # Remove the local variables we have added to the secrets_backend_list 96 | secrets_backend_list.pop(0) 97 | 98 | print("conn_file_path", conn_file_path) 99 | 100 | return dr, session 101 | 102 | 103 | def add_logger_if_needed(dag: DAG, ti: TaskInstance): 104 | """ 105 | Add a formatted logger to the taskinstance so all logs are surfaced to the command line instead 106 | of into a task file. Since this is a local test run, it is much better for the user to see logs 107 | in the command line, rather than needing to search for a log file. 108 | Args: 109 | ti: The taskinstance that will receive a logger 110 | 111 | """ 112 | logging_format = logging.Formatter("[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s") 113 | handler = logging.StreamHandler(sys.stdout) 114 | handler.level = logging.INFO 115 | handler.setFormatter(logging_format) 116 | # only add log handler once 117 | if not any(isinstance(h, logging.StreamHandler) for h in ti.log.handlers): 118 | dag.log.debug("Adding Streamhandler to taskinstance %s", ti.task_id) 119 | ti.log.addHandler(handler) 120 | 121 | 122 | def _run_task(ti: TaskInstance, session): 123 | """ 124 | Run a single task instance, and push result to Xcom for downstream tasks. 
Bypasses a lot of 125 | extra steps used in `task.run` to keep our local running as fast as possible 126 | This function is only meant for the `dag.test` function as a helper function. 127 | 128 | Args: 129 | ti: TaskInstance to run 130 | """ 131 | log.info("*****************************************************") 132 | if hasattr(ti, "map_index") and ti.map_index > 0: 133 | log.info("Running task %s index %d", ti.task_id, ti.map_index) 134 | else: 135 | log.info("Running task %s", ti.task_id) 136 | try: 137 | ti._run_raw_task(session=session) 138 | session.flush() 139 | log.info("%s ran successfully!", ti.task_id) 140 | except AirflowSkipException: 141 | log.info("Task Skipped, continuing") 142 | log.info("*****************************************************") 143 | 144 | 145 | def _get_or_create_dagrun( 146 | dag: DAG, 147 | conf: dict[Any, Any] | None, 148 | start_date: datetime, 149 | execution_date: datetime, 150 | run_id: str, 151 | session: Session, 152 | ) -> DagRun: 153 | """ 154 | Create a DAGRun, but only after clearing the previous instance of said dagrun to prevent collisions. 155 | This function is only meant for the `dag.test` function as a helper function. 156 | :param dag: Dag to be used to find dagrun 157 | :param conf: configuration to pass to newly created dagrun 158 | :param start_date: start date of new dagrun, defaults to execution_date 159 | :param execution_date: execution_date for finding the dagrun 160 | :param run_id: run_id to pass to new dagrun 161 | :param session: sqlalchemy session 162 | :return: 163 | """ 164 | log.info("dagrun id: %s", dag.dag_id) 165 | dr: DagRun = ( 166 | session.query(DagRun).filter(DagRun.dag_id == dag.dag_id, DagRun.execution_date == execution_date).first() 167 | ) 168 | if dr: 169 | session.delete(dr) 170 | session.commit() 171 | dr = dag.create_dagrun( 172 | state=DagRunState.RUNNING, 173 | execution_date=execution_date, 174 | run_id=run_id, 175 | start_date=start_date or execution_date, 176 | session=session, 177 | conf=conf, 178 | ) 179 | log.info("created dagrun %s", str(dr)) 180 | return dr 181 | 182 | 183 | def one_hour_ago(execution_date: datetime): 184 | return execution_date - datetime.timedelta(hours=1) 185 | --------------------------------------------------------------------------------
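For reference, a minimal sketch of how the `run_dag` helper above is exercised, mirroring `tests/test_example_dags.py` (the DAG id below is a placeholder, and an initialised Airflow metadata database is assumed):

```python
import airflow
from airflow.models.dagbag import DagBag
from packaging.version import Version

from tests import utils as test_utils

# Load the DAGs that dag-factory generates from the example folder used by the test suite.
dag_bag = DagBag(dag_folder="dev/dags", include_examples=False)
dag = dag_bag.get_dag("example_dag")  # placeholder: any DAG id present in the bag

if Version(airflow.__version__) >= Version("2.5"):
    dag.test()  # DAG.test() is available from Airflow 2.5 onwards
else:
    test_utils.run_dag(dag)  # fall back to the helper defined in tests/utils.py
```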