├── .codecov.yml
├── .coveragerc
├── .dockerignore
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── 01-bug.yml
│   │   └── 02-feature.yml
│   ├── dependabot.yml
│   └── workflows
│       └── cicd.yaml
├── .gitignore
├── .markdownlint.json
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── CODEOWNERS
├── LICENSE
├── Makefile
├── PRIVACY_NOTICE.md
├── README.md
├── SECURITY.md
├── dagfactory
│   ├── __init__.py
│   ├── constants.py
│   ├── dagbuilder.py
│   ├── dagfactory.py
│   ├── exceptions.py
│   ├── listeners
│   │   ├── __init__.py
│   │   └── runtime_event.py
│   ├── parsers.py
│   ├── plugin
│   │   └── __init__.py
│   ├── settings.py
│   ├── telemetry.py
│   └── utils.py
├── dev
│   ├── .astro
│   │   ├── config.yaml
│   │   ├── dag_integrity_exceptions.txt
│   │   └── test_dag_integrity_default.py
│   ├── .dockerignore
│   ├── .gitignore
│   ├── Dockerfile
│   ├── README.md
│   ├── dags
│   │   ├── comparison
│   │   │   ├── example_hackernews_dagfactory.yml
│   │   │   ├── example_hackernews_plain_airflow.py
│   │   │   ├── example_pypi_stats_dagfactory.yml
│   │   │   └── example_pypi_stats_plain_airflow.py
│   │   ├── customized
│   │   │   ├── __init__.py
│   │   │   ├── callbacks
│   │   │   │   ├── __init__.py
│   │   │   │   └── custom_callbacks.py
│   │   │   ├── helpers
│   │   │   │   ├── __init__.py
│   │   │   │   └── etl.py
│   │   │   └── operators
│   │   │       ├── __init__.py
│   │   │       └── breakfast_operators.py
│   │   ├── datasets
│   │   │   ├── example_config_datasets.yml
│   │   │   ├── example_dag_datasets.py
│   │   │   ├── example_dag_datasets.yml
│   │   │   ├── example_dag_datasets_outlet_inlet.yml
│   │   │   ├── example_dataset_condition_string.yml
│   │   │   └── example_dataset_yaml_syntax.yml
│   │   ├── defaults.yml
│   │   ├── example_callbacks.py
│   │   ├── example_callbacks.yml
│   │   ├── example_customize_operator.py
│   │   ├── example_customize_operator.yml
│   │   ├── example_dag_factory.py
│   │   ├── example_dag_factory.yml
│   │   ├── example_dag_factory_default_args.yml
│   │   ├── example_dag_factory_default_config.py
│   │   ├── example_dag_factory_default_config.yml
│   │   ├── example_dag_factory_multiple.py
│   │   ├── example_dag_factory_multiple_config.yml
│   │   ├── example_dynamic_task_mapping.py
│   │   ├── example_dynamic_task_mapping.yml
│   │   ├── example_http_operator_task.py
│   │   ├── example_http_operator_task.yml
│   │   ├── example_load_yaml_dags.py
│   │   ├── example_map_index_template.py
│   │   ├── example_map_index_template.yml
│   │   ├── example_simple_http_operator_task.yml
│   │   ├── example_task_group.py
│   │   ├── example_task_group.yml
│   │   ├── example_taskflow.py
│   │   ├── example_taskflow.yml
│   │   ├── expand_tasks.py
│   │   ├── external_task_sensor.yml
│   │   ├── hacker_news.py
│   │   ├── invalid.yaml
│   │   ├── print_hello.py
│   │   ├── pypi_stats.py
│   │   └── sample.py
│   ├── packages.txt
│   ├── requirements.txt
│   └── tests
│       └── dags
│           └── test_dag_example.py
├── docs
│   ├── comparison
│   │   ├── index.md
│   │   ├── taskflow_api.md
│   │   └── traditional_operators.md
│   ├── configuration
│   │   ├── configuring_workflows.md
│   │   ├── defaults.md
│   │   └── environment_variables.md
│   ├── contributing
│   │   ├── code_of_conduct.md
│   │   ├── contributors.md
│   │   ├── howto.md
│   │   └── roles.md
│   ├── features
│   │   ├── callbacks.md
│   │   ├── custom_operators.md
│   │   ├── datasets.md
│   │   ├── dynamic_tasks.md
│   │   ├── http_task.md
│   │   └── multiple_configuration_files.md
│   ├── getting-started
│   │   ├── quick-start-airflow-standalone.md
│   │   └── quick-start-astro-cli.md
│   ├── index.md
│   └── static
│       ├── example_dynamic_task_mapping.png
│       ├── example_hackernews_dagfactory_code.png
│       ├── example_hackernews_dagfactory_docs.png
│       ├── example_hackernews_dagfactory_graph.png
│       ├── example_hackernews_plain_airflow_code.png
│       ├── example_hackernews_plain_airflow_graph.png
│       ├── example_map_index_template.png
│       ├── example_pypi_stats_dagfactory_code.png
│       ├── example_pypi_stats_dagfactory_docs.png
│       ├── example_pypi_stats_dagfactory_graph.png
│       ├── example_pypi_stats_dagfactory_mapped_tasks.png
│       ├── example_pypi_stats_plain_airflow_code.png
│       ├── example_pypi_stats_plain_airflow_graph.png
│       ├── example_pypi_stats_plain_airflow_mapped_tasks.png
│       └── images
│           ├── airflow-dag.png
│           ├── airflow-home.png
│           ├── custom_operators.png
│           └── datasets
│               ├── conditions
│               │   ├── graph_conditional_dataset.png
│               │   └── graph_conditional_dataset_2.png
│               └── outlets
│                   └── datasets_example.png
├── examples
│   └── dags
├── img
│   ├── mapped_tasks_example.png
│   ├── quickstart_dag.png
│   └── quickstart_gantt.png
├── mkdocs.yml
├── pyproject.toml
├── scripts
│   ├── airflow3
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── dags
│   │   │   ├── example_dag_factory.py
│   │   │   └── example_dag_factory.yml
│   │   ├── env.sh
│   │   ├── install_from_main.sh
│   │   ├── requirements.txt
│   │   ├── setup.sh
│   │   └── tests.sh
│   ├── docs_deploy.py
│   ├── test
│   │   ├── integration-setup.sh
│   │   ├── integration.sh
│   │   ├── pre-install-airflow.sh
│   │   ├── unit-cov.sh
│   │   └── unit.sh
│   └── verify_tag_and_version.py
└── tests
    ├── __init__.py
    ├── fixtures
    │   ├── dag_factory.yml
    │   ├── dag_factory_http_operator_task.yml
    │   ├── dag_factory_kubernetes_pod_operator.yml
    │   ├── dag_factory_simple_http_operator_task.yml
    │   ├── dag_factory_task_group.yml
    │   ├── dag_factory_variables_as_arguments.yml
    │   ├── dag_md_docs.yml
    │   ├── defaults.yml
    │   ├── doc_md_builder.py
    │   ├── invalid_dag_factory.yml
    │   ├── invalid_yaml.yml
    │   └── mydocfile.md
    ├── test_dagbuilder.py
    ├── test_dagbuilder_httpoperator.py
    ├── test_dagfactory.py
    ├── test_example_dags.py
    ├── test_parsers.py
    ├── test_settings.py
    ├── test_telemetry.py
    ├── test_utils.py
    └── utils.py
/.codecov.yml:
--------------------------------------------------------------------------------
1 | ---
2 | coverage:
3 | status:
4 | project:
5 | default:
6 | target: auto
7 | threshold: 2%
8 | only_pulls: true
9 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit =
3 | tests/*
4 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/01-bug.yml:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug Report
3 | description: File a bug report.
4 | title: "[Bug] "
5 | labels: ["bug", "triage-needed"]
6 | body:
7 | - type: markdown
8 | attributes:
9 | value: |
10 | Thanks for taking the time to fill out this bug report!
11 | - type: input
12 | id: dag-factory-version
13 | attributes:
14 | label: DAG Factory version
15 | # yamllint disable rule:line-length
16 | description: >
17 | On what version of DAG Factory are you currently experiencing the issue? Remember, you are encouraged to
18 | test with the latest release or on the main branch to verify your issue still exists.
19 | placeholder: e.g. 0.19.0
20 | validations:
21 | required: true
22 | - type: input
23 | id: airflow-version
24 | attributes:
25 | label: airflow version
26 | description: What version of Apache Airflow are you running?
27 | placeholder: ex. 2.9.0
28 | validations:
29 | required: true
30 | - type: input
31 | id: python-version
32 | attributes:
33 | label: Python version
34 | description: What version of Python are you running?
35 | placeholder: e.g. 3.10
36 | validations:
37 | required: true
38 | - type: dropdown
39 | attributes:
40 | label: Deployment
41 | description: >
42 | What kind of deployment do you have?
43 | multiple: false
44 | options:
45 | - "Official Apache Airflow Helm Chart"
46 | - "Other 3rd-party Helm chart"
47 | - "Docker-Compose"
48 | - "Other Docker-based deployment"
49 | - "Virtualenv installation"
50 | - "Astronomer"
51 | - "Google Cloud Composer"
52 | - "Amazon (AWS) MWAA"
53 | - "Microsoft ADF Managed Airflow"
54 | - "Other"
55 | validations:
56 | required: true
57 | - type: textarea
58 | attributes:
59 | label: Deployment details
60 | description: Additional description of your deployment.
61 | placeholder: >
62 | Enter any relevant details of your deployment. Especially version of your tools,
63 | software (docker-compose, helm, k8s, etc.), any customisation and configuration you added.
64 | - type: textarea
65 | id: what-happened
66 | attributes:
67 | label: What happened?
68 | description: Also tell us, what did you expect to happen?
69 | placeholder: Tell us what you see!
70 | value: "A bug happened!"
71 | validations:
72 | required: true
73 | - type: textarea
74 | id: logs
75 | attributes:
76 | label: Relevant log output
77 | description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
78 | render: shell
79 | - type: textarea
80 | attributes:
81 | label: How to reproduce
82 | description: What should we do to reproduce the problem?
83 | placeholder: >
84 | Please make sure you provide a reproducible step-by-step case of how to reproduce the problem
85 | as minimally and precisely as possible. Keep in mind we do not have access to your cluster or DAGs.
86 | Remember that non-reproducible issues make it hard for us to help you or resolve the issue!
87 | validations:
88 | required: true
89 | - type: textarea
90 | attributes:
91 | label: Anything else :)?
92 | description: Anything else we need to know?
93 | placeholder: >
94 | How often does this problem occur? (Once? Every time? Only when certain conditions are met?)
95 | - type: checkboxes
96 | attributes:
97 | label: Are you willing to submit PR?
98 | description: >
99 | This is absolutely not required, but we are happy to guide you in the contribution process
100 | especially if you already have a good understanding of how to implement the fix. We love to bring new
101 | contributors in.
102 | options:
103 | - label: Yes I am willing to submit a PR!
104 | - type: input
105 | id: contact
106 | attributes:
107 | label: Contact Details
108 | description: (Optional) How can we get in touch with you if we need more info?
109 | placeholder: ex. email@example.com
110 | validations:
111 | required: false
112 | - type: markdown
113 | attributes:
114 | value: "Thanks for completing our form!"
115 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/02-feature.yml:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | description: Suggest an idea for this project
4 | title: "[Feature] "
5 | labels: ["enhancement", "triage-needed"]
6 | body:
7 | - type: markdown
8 | attributes:
9 | # yamllint disable rule:line-length
10 | value: "
11 | Thank you for finding the time to propose a new feature!
12 |
13 | We really appreciate the community efforts to improve DAG Factory."
14 | # yamllint enable rule:line-length
15 | - type: textarea
16 | attributes:
17 | label: Description
18 | description: A short description of your feature
19 | - type: textarea
20 | attributes:
21 | label: Use case/motivation
22 | description: What would you like to happen?
23 | - type: textarea
24 | attributes:
25 | label: Related issues
26 | description: Is there currently another issue associated with this?
27 | - type: checkboxes
28 | attributes:
29 | label: Are you willing to submit a PR?
30 | options:
31 | - label: Yes, I am willing to submit a PR!
32 | - type: markdown
33 | attributes:
34 | value: "Thanks for completing our form!"
35 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: "github-actions"
4 | directory: "/"
5 | schedule:
6 | interval: "daily"
7 | labels:
8 | - "dependencies"
9 | reviewers:
10 | - "@astronomer/oss-integrations"
11 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | bin/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # Environments
83 | .env
84 | .venv
85 | env/
86 | venv/
87 | ENV/
88 | .installed
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
103 | # venv
104 | pip-selfcheck.json
105 |
106 | # vscode
107 | .vscode
108 |
109 | # IntelliJ
110 | .idea/*
111 |
112 | # sqllite db
113 | *.db
114 |
115 |
116 | # Airflow logs
117 | logs/
118 |
119 | # MacOS DS_Store
120 | *.DS_Store
121 |
122 | # VIM
123 | *.sw[a-z]
124 |
125 | # Airflow
126 | examples/.airflowignore
127 | airflow.cfg
128 | webserver_config.py
129 |
130 | # Astro
131 | dev/include/dag_factory-*
132 |
--------------------------------------------------------------------------------
/.markdownlint.json:
--------------------------------------------------------------------------------
1 | {
2 | "MD007": {
3 | "indent": 4
4 | }
5 | }
6 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 | - repo: https://github.com/astral-sh/ruff-pre-commit
5 | rev: v0.6.9
6 | hooks:
7 | - id: ruff
8 | args:
9 | - --fix
10 |
11 | - repo: https://github.com/pre-commit/pre-commit-hooks
12 | rev: v5.0.0
13 | hooks:
14 | - id: check-added-large-files
15 | - id: check-merge-conflict
16 | - id: check-toml
17 | - id: check-yaml
18 | args:
19 | - --unsafe
20 | exclude: 'tests/fixtures/dag_factory.yml|dev/dags/invalid.yaml|tests/fixtures/invalid_yaml.yml'
21 | - id: debug-statements
22 | - id: end-of-file-fixer
23 | - id: mixed-line-ending
24 | - id: pretty-format-json
25 | args: [ "--autofix" ]
26 | - id: trailing-whitespace
27 | - id: detect-private-key
28 | - id: detect-aws-credentials
29 | args: [ "--allow-missing-credentials" ]
30 |
31 | - repo: https://github.com/psf/black
32 | rev: 24.10.0
33 | hooks:
34 | - id: black
35 | args: [ "--config", "./pyproject.toml" ]
36 |
37 | - repo: https://github.com/codespell-project/codespell
38 | rev: v2.2.4
39 | hooks:
40 | - id: codespell
41 | exclude: tests/fixtures/mydocfile.md
42 |
43 | - repo: https://github.com/igorshubovych/markdownlint-cli
44 | rev: v0.41.0
45 | hooks:
46 | - id: markdownlint
47 | args:
48 | - "--disable=MD013" # disable line length
49 | - "--disable=MD024" # disable multiple headings with the same content (CHANGELOG)
50 | - "--disable=MD033" # disable no inline html (needed for analytics dead pixel)
51 |
52 | - repo: https://github.com/tcort/markdown-link-check
53 | rev: v3.13.6
54 | hooks:
55 | - id: markdown-link-check
56 | args: [-q]
57 |
--------------------------------------------------------------------------------
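
The hooks above form the repository's local linting gate. A typical local workflow, using standard pre-commit CLI commands rather than anything specific to this config, would be:

```commandline
pre-commit install          # run the configured hooks automatically on every git commit
pre-commit run --all-files  # run the full hook suite once across the whole repository
```
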
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @astronomer/oss-integrations
2 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: help
2 | help:
3 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
4 |
5 | .PHONY: setup
6 | setup: ## Setup development environment
7 | python -m venv venv
8 | . venv/bin/activate && pip --no-cache-dir install ".[tests]"
9 | @echo "To activate the virtual environment, run:"
10 | @echo "source venv/bin/activate"
11 |
12 | .PHONY: clean
13 | clean: ## Removes build and test artifacts
14 | @echo "==> Removing build and test artifacts"
15 | @rm -rf *.egg *egg-info .cache .coverage .tox build bin include dist htmlcov lib .pytest_cache .venv
16 | @find . -name '*.pyc' -exec rm -f {} +
17 | @find . -name '*.pyo' -exec rm -f {} +
18 | @find . -name '*~' -exec rm -f {} +
19 | @find . -name '__pycache__' -exec rm -rf {} +
20 |
21 |
22 | .PHONY: build-whl
23 | build-whl: ## Build installable whl file
24 | rm -rf dev/include/*
25 | rm -rf dist/*
26 | mkdir -p dev/include
27 | hatch build
28 | cp dist/* dev/include/
29 |
30 | .PHONY: docker-run
31 | docker-run: build-whl ## Runs local Airflow for testing
32 | @if ! lsof -i :8080 | grep LISTEN > /dev/null; then \
33 | cd dev && astro dev start --verbosity debug; \
34 | else \
35 | cd dev && astro dev restart --verbosity debug; \
36 | fi
37 |
38 | .PHONY: docker-stop
39 | docker-stop: ## Stop Docker container
40 | cd dev && astro dev stop
41 |
--------------------------------------------------------------------------------
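
For reference, the self-documenting targets above translate into a short local loop; the commands below just invoke the targets defined in this Makefile and assume the Astro CLI is installed for the `docker-run` target:

```commandline
make setup        # create a venv and install dag-factory with its test extras
make docker-run   # build the wheel and start (or restart) local Airflow via astro dev
make docker-stop  # stop the local Airflow containers
```
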
/PRIVACY_NOTICE.md:
--------------------------------------------------------------------------------
1 | # Privacy Notice
2 |
3 | This project follows the [Privacy Policy of Astronomer](https://www.astronomer.io/privacy/).
4 |
5 | ## Collection of Data
6 |
7 | DAG Factory integrates [Scarf](https://about.scarf.sh/) to collect basic telemetry data during operation.
8 | This data assists the project maintainers in better understanding how DAG Factory is used.
9 | Insights gained from this telemetry are critical for prioritizing patches, minor releases, and
10 | security fixes. Additionally, this information supports key decisions related to the development road map.
11 |
12 | Deployments and individual users can opt-out of analytics by setting the configuration:
13 |
14 | ```ini
15 | [dag_factory]
16 | enable_telemetry = False
17 | ```
18 |
19 | As described in the [official documentation](https://docs.scarf.sh/gateway/#do-not-track), it is also possible to opt out by setting one of the following environment variables:
20 |
21 | ```commandline
22 | DO_NOT_TRACK=True
23 | SCARF_NO_ANALYTICS=True
24 | ```
25 |
26 | In addition to Scarf's default data collection, DAG Factory collects the following information:
27 |
28 | - DAG Factory version
29 | - Airflow version
30 | - Python version
31 | - Operating system & machine architecture
32 | - Event type
33 | - Number of failed DagRuns
34 | - Number of successful DagRuns
35 | - Total tasks associated to each DagRun
36 | - Dag hash
37 |
38 | No user-identifiable information (IP included) is stored in Scarf.
39 |
--------------------------------------------------------------------------------
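
Because `[dag_factory] enable_telemetry` is a regular Airflow configuration option, it can also be disabled through Airflow's standard environment-variable override, which is the variable referenced in `dagfactory/telemetry.py`:

```commandline
AIRFLOW__DAG_FACTORY__ENABLE_TELEMETRY=False
```
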
/README.md:
--------------------------------------------------------------------------------
1 | # dag-factory
2 |
3 | [](https://github.com/astronomer/dag-factory/actions?workflow=build)
4 | [](https://codecov.io/github/astronomer/dag-factory?branch=master)
5 | [](https://pypi.org/project/dag-factory/)
6 | [](https://github.com/ambv/black)
7 | [](https://img.shields.io/pypi/dm/dag-factory)
8 |
9 |
10 |
11 | Welcome to *dag-factory*! *dag-factory* is a library for [Apache Airflow®](https://airflow.apache.org) to construct DAGs
12 | declaratively via configuration files.
13 |
14 | The minimum requirements for **dag-factory** are:
15 |
16 | - Python 3.8.0+
17 | - [Apache Airflow®](https://airflow.apache.org) 2.3+
18 |
19 | For a gentle introduction, please take a look at our [Quickstart Guide](https://astronomer.github.io/dag-factory/latest/getting-started/quick-start-airflow-standalone/). For more examples, please see the
20 | [examples](/examples) folder.
21 |
22 | - [Quickstart](https://astronomer.github.io/dag-factory/latest/getting-started/quick-start-astro-cli/)
23 | - [Benefits](#benefits)
24 | - [Features](https://astronomer.github.io/dag-factory/latest/features/dynamic_tasks/)
25 | - [Dynamically Mapped Tasks](https://astronomer.github.io/dag-factory/latest/features/dynamic_tasks/)
26 | - [Multiple Configuration Files](https://astronomer.github.io/dag-factory/latest/features/multiple_configuration_files/)
27 | - [Callbacks](https://astronomer.github.io/dag-factory/latest/features/callbacks/)
28 | - [Custom Operators](https://astronomer.github.io/dag-factory/latest/features/custom_operators/)
29 | - [HttpSensor](https://astronomer.github.io/dag-factory/latest/features/http_task/)
30 | - [Contributing](https://astronomer.github.io/dag-factory/latest/contributing/howto/)
31 |
32 | ## Benefits
33 |
34 | - Construct DAGs without knowing Python
35 | - Construct DAGs without learning Airflow primitives
36 | - Avoid duplicative code
37 | - Everyone loves YAML! ;)
38 |
39 | ## License
40 |
41 | To learn more about the terms and conditions for use, reproduction and distribution, read the [Apache License 2.0](https://github.com/astronomer/dag-factory/blob/main/LICENSE).
42 |
43 | ## Privacy Notice
44 |
45 | This project follows [Astronomer's Privacy Policy](https://www.astronomer.io/privacy/).
46 |
47 | For further information, [read this](https://github.com/astronomer/dag-factory/blob/main/PRIVACY_NOTICE.md)
48 |
49 | ## Security Policy
50 |
51 | Check the project's [Security Policy](https://github.com/astronomer/dag-factory/blob/main/SECURITY.md) to learn
52 | how to report security vulnerabilities in DAG Factory and how security issues reported to the DAG Factory
53 | security team are handled.
54 |
--------------------------------------------------------------------------------
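
As a minimal sketch of the declarative workflow the README describes, a YAML file defines the DAG and a small Python file in the DAGs folder loads it with `load_yaml_dags` (exported from `dagfactory/__init__.py`); the DAG id, task name, and file names below are illustrative only, and the YAML shape mirrors the examples under `dev/dags/`:

```yaml
# dags/example_hello.yml (hypothetical file name)
example_hello:
  default_args:
    owner: "airflow"
    start_date: 2024-01-01
  schedule_interval: "@daily"
  tasks:
    say_hello:
      operator: airflow.operators.bash_operator.BashOperator
      bash_command: "echo hello"
```

```python
# dags/load_dagfactory_dags.py (hypothetical file name)
from dagfactory import load_yaml_dags

# Scan the DAGs folder for YAML configs and register the generated DAGs in this
# module's globals so Airflow can discover them (see load_yaml_dags in
# dagfactory/dagfactory.py for the exact defaults).
load_yaml_dags(globals_dict=globals())
```

Loading a single config explicitly via `dagfactory.DagFactory(config_file)` followed by `clean_dags()`/`generate_dags()` is the other pattern used in this repo, for example in `dev/dags/datasets/example_dag_datasets.py`.
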
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security
2 |
3 | [//]: # (This document is reused across Astronomer OSS Integrations projects, any changes should also be applied to security docs in the other repositories.)
4 |
5 | This document contains information on how to report security vulnerabilities in DAG Factory and
6 | how security issues reported to the Astronomer security team are handled.
7 | If you would like to learn more, refer to [https://www.astronomer.io/security/](https://www.astronomer.io/security).
8 |
9 | At Astronomer, we recognize the critical nature of security and view it as a transparent and collaborative effort.
10 | If you have any concern about the security of any Astronomer public repository, or believe you have uncovered a vulnerability,
11 | please email [oss_security@astronomer.io](mailto:oss_security@astronomer.io).
12 |
13 | > **warning**: Due to the nature of some security vulnerabilities, do not create a GitHub issue to report a vulnerability.
14 |
15 | ## Use of Email for Vulnerability Disclosure
16 |
17 | Only use the OSS security email to disclose security vulnerabilities.
18 | Astronomer does not accept bug reports, security implementation questions, or other security-related issues at this email address.
19 | If you are a customer of Astronomer, please reach out to your account team if you have any security-related questions or
20 | issues other than vulnerabilities, and they can assist you. Otherwise, this codebase is provided ‘as-is’ in accordance
21 | with its licensing structure.
22 |
23 | ## Scope
24 |
25 | When submitting vulnerabilities, please ensure that it is within the scope of the project, based on the following descriptions. Out-of-scope vulnerability reports will be ignored.
26 |
27 | ### In-scope
28 |
29 | * Code base with tagged releases
30 | * When integrated as specified in the [official DAG Factory documentation](https://github.com/astronomer/dag-factory/).
31 |
32 | ### Out-of-scope
33 |
34 | * Any other codebase, including other Astronomer products
35 | * Astronomer.io website
36 | * Dependencies used in DAG Factory
37 | * DAG Factory when modified or run using an unintended configuration
38 | * Other systems integrated with or CSP systems hosting the deployment
39 | * Cookie transfers between browsers
40 |
41 | For other products and repositories owned by Astronomer, please refer to their specific security policy or to
42 | [https://www.astronomer.io/vulnerability-disclosure/](https://www.astronomer.io/vulnerability-disclosure/) for
43 | vulnerabilities associated with Astronomer products.
44 |
45 | ## Required information and how to disclose
46 |
47 | Please send a single, plain-text (not HTML) email for each vulnerability you report.
48 |
49 | Your written description of the vulnerability is a critical part of the initial report. You can optionally include images and videos in your initial report, but the written description of the vulnerability must include the following information, at a minimum:
50 |
51 | * Brief description/title of the vulnerability
52 | * Steps to recreate the issue
53 | * Contact information
54 |
55 | Upon review, we may request additional information including, but not limited to, images or a proof-of-concept video.
56 |
57 | ## Severity
58 |
59 | The vulnerability severity rating system used internally by Astronomer is not the same as the one used by the Apache Software Foundation.
60 | Please do not provide a severity for the vulnerability when disclosing, however, providing a CWE (Common Weakness Enumeration) is recommended.
61 |
62 | ## Response Timeframe
63 |
64 | Astronomer aims to acknowledge and validate disclosures within 5 business days. Resolutions will be provided in a timely manner.
65 |
66 | ## Follow-up Communication
67 |
68 | Astronomer handles follow-up communications to disclosures sent to [oss_security@astronomer.io](mailto:oss_security@astronomer.io) on a best-effort basis, often within 3-5 business days. If the disclosure involves a product or repository that is covered by Astronomer's use of the Bugcrowd Vulnerability Disclosure Platform, please see Bugcrowd's terms of service for follow-up communication timelines. Disclosures to the Bugcrowd Vulnerability Disclosure Platform will result in communications through that platform.
69 |
70 | ## Partial Safe Harbor
71 |
72 | Astronomer will not threaten or bring any legal action against anyone who makes a good faith effort to comply with this
73 | vulnerability disclosure policy. This includes any claim under the DMCA for circumventing technological measures to
74 | protect the services and applications eligible under this policy.
75 |
76 | **As long as you comply with this policy:**
77 |
78 | * We consider your security research to be "authorized" under the Computer Fraud and Abuse Act (and/or similar state laws), and
79 | * We waive any restrictions in our application Terms of Use and Usage Policies that would prohibit your participation in this policy, but only for the limited purpose of your security research under this policy.
80 |
81 | ## Notification Requirement
82 |
83 | * Safe harbor under this policy is only extended if the discoverer of the vulnerability notifies Astronomer as outlined elsewhere in this policy, prior to notifying any other third-party entities, and does not notify any other third-party entities for 90 days after notifying Astronomer, without Astronomer’s prior written approval.
84 | * After notification of Astronomer and the lapse of the 90 day period, it is requested that any publications, third-party releases, or other disseminations of information related to or derived from the vulnerability discovery be coordinated with Astronomer prior.
85 |
86 | ## Right to rescind safe harbor protections
87 |
88 | Astronomer reserves the right to rescind any and all safe harbor protections originally extended to the vulnerability
89 | discoverer in the event that the discoverer, at any point prior to or after notification to Astronomer,
90 | has knowingly and willfully released, published, or otherwise used information related to the discovered vulnerability in a manner that:
91 |
92 | 1. Maligns or damages the reputation of Astronomer, its customers, or its employees;
93 | 2. Is used to conduct malicious attacks against Astronomer systems, regardless of whether material damages occur; or
94 | 3. Exacerbates existing vulnerabilities or threats, thereby increasing the risk to Astronomer or its stakeholders.
95 |
96 | ## Extension of safe harbor to third-party systems and services
97 |
98 | Astronomer systems and services can interconnect with third-party systems and services.
99 | If you submit a report that affects a third-party service through the [vulnerability disclosure program](https://www.astronomer.io/vulnerability-disclosure/),
100 | Astronomer will limit what we share with the affected third party.
101 | Please understand that, while we can authorize your research on Astronomer’s systems and services,
102 | we cannot authorize your efforts on third-party products or guarantee they won’t pursue legal action against you.
103 | If legal action is initiated by a third party against you because of your participation in this vulnerability
104 | disclosure program, and you have complied with our vulnerability disclosure policy, we will take steps to make it known
105 | that your actions were conducted in compliance with this policy.
106 | This is not, and should not be understood as, any agreement on Astronomer's part to defend, indemnify, or otherwise protect you
107 | from any third-party action based on your actions.
108 |
109 | You are expected, as always, to comply with all applicable laws.
110 |
--------------------------------------------------------------------------------
/dagfactory/__init__.py:
--------------------------------------------------------------------------------
1 | """Modules and methods to export for easier access"""
2 |
3 | from .dagfactory import DagFactory, load_yaml_dags
4 |
5 | __version__ = "0.23.0a3"
6 | __all__ = [
7 | "DagFactory",
8 | "load_yaml_dags",
9 | ]
10 |
--------------------------------------------------------------------------------
/dagfactory/constants.py:
--------------------------------------------------------------------------------
1 | TELEMETRY_URL = "https://astronomer.gateway.scarf.sh/dag-factory/{telemetry_version}/{dagfactory_version}/{airflow_version}/{python_version}/{platform_system}/{platform_machine}/{event_type}/{status}/{dag_hash}/{task_count}"
2 | TELEMETRY_VERSION = "v2"
3 | TELEMETRY_TIMEOUT = 1.0
4 |
5 | AIRFLOW3_MAJOR_VERSION = 3
6 |
--------------------------------------------------------------------------------
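
To make the placeholder structure of `TELEMETRY_URL` concrete, the sketch below formats it the same way `emit_usage_metrics()` in `dagfactory/telemetry.py` does; all metric values are illustrative stand-ins:

```python
from dagfactory import constants

# Illustrative values only; the real values are collected in dagfactory/telemetry.py.
url = constants.TELEMETRY_URL.format(
    telemetry_version=constants.TELEMETRY_VERSION,
    dagfactory_version="0.23.0a3",
    airflow_version="2.10.2",
    python_version="3.11",
    platform_system="Linux",
    platform_machine="x86_64",
    event_type="dag_run",
    status="success",
    dag_hash="a1b2c3",
    task_count=3,
)
print(url)
# https://astronomer.gateway.scarf.sh/dag-factory/v2/0.23.0a3/2.10.2/3.11/Linux/x86_64/dag_run/success/a1b2c3/3
```
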
/dagfactory/exceptions.py:
--------------------------------------------------------------------------------
1 | """Module contains exceptions for dag-factory"""
2 |
3 |
4 | class DagFactoryException(Exception):
5 | """
6 | Base class for all dag-factory errors.
7 | """
8 |
9 |
10 | class DagFactoryConfigException(Exception):
11 | """
12 | Raise for dag-factory config errors.
13 | """
14 |
--------------------------------------------------------------------------------
/dagfactory/listeners/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/dagfactory/listeners/__init__.py
--------------------------------------------------------------------------------
/dagfactory/listeners/runtime_event.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from airflow.listeners import hookimpl
4 | from airflow.models.dag import DAG
5 | from airflow.models.dagrun import DagRun
6 |
7 | from dagfactory import telemetry
8 |
9 |
10 | class EventStatus:
11 | SUCCESS = "success"
12 | FAILED = "failed"
13 |
14 |
15 | DAG_RUN = "dag_run"
16 |
17 |
18 | def is_dagfactory_dag(dag: DAG | None = None):
19 | if "dagfactory" in dag.tags:
20 | return True
21 | return False
22 |
23 |
24 | @hookimpl
25 | def on_dag_run_success(dag_run: DagRun, msg: str):
26 | dag = dag_run.get_dag()
27 | if not is_dagfactory_dag(dag):
28 | return
29 | additional_telemetry_metrics = {
30 | "dag_hash": dag_run.dag_hash,
31 | "status": EventStatus.SUCCESS,
32 | "task_count": len(dag.task_ids),
33 | }
34 |
35 | telemetry.emit_usage_metrics_if_enabled(DAG_RUN, additional_telemetry_metrics)
36 |
37 |
38 | @hookimpl
39 | def on_dag_run_failed(dag_run: DagRun, msg: str):
40 | dag = dag_run.get_dag()
41 | if not is_dagfactory_dag(dag):
42 | return
43 | additional_telemetry_metrics = {
44 | "dag_hash": dag_run.dag_hash,
45 | "status": EventStatus.FAILED,
46 | "task_count": len(dag.task_ids),
47 | }
48 |
49 | telemetry.emit_usage_metrics_if_enabled(DAG_RUN, additional_telemetry_metrics)
50 |
--------------------------------------------------------------------------------
/dagfactory/parsers.py:
--------------------------------------------------------------------------------
1 | import ast
2 |
3 |
4 | class SafeEvalVisitor(ast.NodeVisitor):
5 | def __init__(self, dataset_map):
6 | self.dataset_map = dataset_map
7 |
8 | def evaluate(self, tree):
9 | return self.visit(tree)
10 |
11 | def visit_Expression(self, node):
12 | return self.visit(node.body)
13 |
14 | def visit_BinOp(self, node):
15 | left = self.visit(node.left)
16 | right = self.visit(node.right)
17 |
18 | if isinstance(node.op, ast.BitAnd):
19 | return left & right
20 | elif isinstance(node.op, ast.BitOr):
21 | return left | right
22 | else:
23 | raise ValueError(f"Unsupported binary operation: {type(node.op).__name__}")
24 |
25 | def visit_Name(self, node):
26 | if node.id in self.dataset_map:
27 | return self.dataset_map[node.id]
28 | raise NameError(f"Undefined variable: {node.id}")
29 |
30 | def visit_Constant(self, node):
31 | return node.value
32 |
33 | def generic_visit(self, node):
34 | raise ValueError(f"Unsupported syntax: {type(node).__name__}")
35 |
--------------------------------------------------------------------------------
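
`SafeEvalVisitor` walks an `ast.Expression` (as produced by `ast.parse(..., mode="eval")`) and only allows name lookups plus `&`/`|`, which is how dataset condition strings such as `"((dataset_custom_1 & dataset_custom_2) | dataset_custom_3)"` in `dev/dags/datasets/example_dag_datasets.yml` get resolved. A minimal usage sketch, where plain Python sets stand in for Airflow Dataset objects (both support the `&` and `|` operators the visitor understands):

```python
import ast

from dagfactory.parsers import SafeEvalVisitor

# Hypothetical dataset_map; in dag-factory the values would be Airflow Dataset objects.
dataset_map = {
    "dataset_custom_1": {"a"},
    "dataset_custom_2": {"b"},
    "dataset_custom_3": {"c"},
}

tree = ast.parse("((dataset_custom_1 & dataset_custom_2) | dataset_custom_3)", mode="eval")
result = SafeEvalVisitor(dataset_map).evaluate(tree)
print(result)  # {'c'}: the & of two disjoint sets is empty, then | adds dataset_custom_3
```

Any other syntax (function calls, attribute access, arithmetic) falls through to `generic_visit` and raises `ValueError`, which is what keeps the evaluation safe.
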
/dagfactory/plugin/__init__.py:
--------------------------------------------------------------------------------
1 | from airflow.plugins_manager import AirflowPlugin
2 |
3 | from dagfactory.listeners import runtime_event
4 |
5 |
6 | class DagFactoryPlugin(AirflowPlugin):
7 | name = "Dag Factory Plugin"
8 | listeners = [runtime_event]
9 |
10 |
11 | dagfactory_plugin = DagFactoryPlugin()
12 |
--------------------------------------------------------------------------------
/dagfactory/settings.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 |
5 | from airflow.configuration import conf
6 |
7 |
8 | def convert_to_boolean(value: str | None) -> bool:
9 | """
10 | Convert a string that represents a boolean to a Python boolean.
11 | """
12 | value = str(value).lower().strip()
13 | if value in ("f", "false", "0", "", "none"):
14 | return False
15 | return True
16 |
17 |
18 | enable_telemetry = conf.getboolean("dag_factory", "enable_telemetry", fallback=True)
19 | do_not_track = convert_to_boolean(os.getenv("DO_NOT_TRACK"))
20 | no_analytics = convert_to_boolean(os.getenv("SCARF_NO_ANALYTICS"))
21 |
--------------------------------------------------------------------------------
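
A quick illustration of the permissive parsing above, assuming an environment where Airflow and dag-factory are installed (importing `dagfactory.settings` reads the Airflow config): only a handful of falsy spellings map to `False`, and every other value, including arbitrary strings, is treated as `True`.

```python
from dagfactory.settings import convert_to_boolean

assert convert_to_boolean(None) is False    # unset env vars become "none" and fall through to False
assert convert_to_boolean("false") is False
assert convert_to_boolean("0") is False
assert convert_to_boolean("True") is True
assert convert_to_boolean("yes") is True    # any unrecognised string counts as True
```
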
/dagfactory/telemetry.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import logging
4 | import platform
5 | from urllib import parse
6 | from urllib.parse import urlencode
7 |
8 | import httpx
9 | from airflow import __version__ as airflow_version
10 |
11 | import dagfactory
12 | from dagfactory import constants, settings
13 |
14 |
15 | def should_emit() -> bool:
16 | """
17 | Identify if telemetry metrics should be emitted or not.
18 | """
19 | return settings.enable_telemetry and not settings.do_not_track and not settings.no_analytics
20 |
21 |
22 | def collect_standard_usage_metrics() -> dict[str, object]:
23 | """
24 | Return standard telemetry metrics.
25 | """
26 | metrics = {
27 | "dagfactory_version": dagfactory.__version__,
28 | "airflow_version": parse.quote(airflow_version),
29 | "python_version": platform.python_version(),
30 | "platform_system": platform.system(),
31 | "platform_machine": platform.machine(),
32 | "variables": {},
33 | }
34 | return metrics
35 |
36 |
37 | def emit_usage_metrics(metrics: dict[str, object]) -> bool:
38 | """
39 | Emit desired telemetry metrics to remote telemetry endpoint.
40 |
41 | The metrics must contain the necessary fields to build the TELEMETRY_URL.
42 | """
43 | query_string = urlencode(metrics)
44 | telemetry_url = constants.TELEMETRY_URL.format(
45 | **metrics, telemetry_version=constants.TELEMETRY_VERSION, query_string=query_string
46 | )
47 | logging.debug("Telemetry is enabled. Emitting the following usage metrics to %s: %s", telemetry_url, metrics)
48 | try:
49 | response = httpx.get(telemetry_url, timeout=constants.TELEMETRY_TIMEOUT, follow_redirects=True)
50 | except httpx.HTTPError as e:
51 | logging.warning(
52 | "Unable to emit usage metrics to %s. An HTTPX connection error occurred: %s.", telemetry_url, str(e)
53 | )
54 | is_success = False
55 | else:
56 | is_success = response.is_success
57 | if not is_success:
58 | logging.warning(
59 | "Unable to emit usage metrics to %s. Status code: %s. Message: %s",
60 | telemetry_url,
61 | response.status_code,
62 | response.text,
63 | )
64 | return is_success
65 |
66 |
67 | def emit_usage_metrics_if_enabled(event_type: str, additional_metrics: dict[str, object]) -> bool:
68 | """
69 | Checks if telemetry should be emitted, fetch standard metrics, complement with custom metrics
70 | and emit them to remote telemetry endpoint.
71 |
72 | :returns: If the event was successfully sent to the telemetry backend or not.
73 | """
74 | if should_emit():
75 | metrics = collect_standard_usage_metrics()
76 | metrics["event_type"] = event_type
77 | metrics["variables"].update(additional_metrics)
78 | metrics.update(additional_metrics)
79 | is_success = emit_usage_metrics(metrics)
80 | return is_success
81 | else:
82 | logging.debug("Telemetry is disabled. To enable it, export AIRFLOW__DAG_FACTORY__ENABLE_TELEMETRY=True.")
83 | return False
84 |
--------------------------------------------------------------------------------
/dev/.astro/config.yaml:
--------------------------------------------------------------------------------
1 | project:
2 | name: dev
3 |
--------------------------------------------------------------------------------
/dev/.astro/dag_integrity_exceptions.txt:
--------------------------------------------------------------------------------
1 | # Add dag files to exempt from parse test below. ex: dags/
2 |
--------------------------------------------------------------------------------
/dev/.astro/test_dag_integrity_default.py:
--------------------------------------------------------------------------------
1 | """Test the validity of all DAGs. **USED BY DEV PARSE COMMAND DO NOT EDIT**"""
2 |
3 | import logging
4 | import os
5 | from contextlib import contextmanager
6 |
7 | import pytest
8 | from airflow.hooks.base import BaseHook
9 | from airflow.models import Connection, DagBag, Variable
10 | from airflow.utils.db import initdb
11 |
12 | # init airflow database
13 | initdb()
14 |
15 | # The following code patches errors caused by missing OS Variables, Airflow Connections, and Airflow Variables
16 |
17 |
18 | # =========== MONKEYPATCH BaseHook.get_connection() ===========
19 | def basehook_get_connection_monkeypatch(key: str, *args, **kwargs):
20 | print(f"Attempted to fetch connection during parse returning an empty Connection object for {key}")
21 | return Connection(key)
22 |
23 |
24 | BaseHook.get_connection = basehook_get_connection_monkeypatch
25 | # # =========== /MONKEYPATCH BASEHOOK.GET_CONNECTION() ===========
26 |
27 |
28 | # =========== MONKEYPATCH OS.GETENV() ===========
29 | def os_getenv_monkeypatch(key: str, *args, **kwargs):
30 | default = None
31 | if args:
32 | default = args[0] # os.getenv should get at most 1 arg after the key
33 | if kwargs:
34 | default = kwargs.get("default", None) # and sometimes kwarg if people are using the sig
35 |
36 | env_value = os.environ.get(key, None)
37 |
38 | if env_value:
39 | return env_value # if the env_value is set, return it
40 | if key == "JENKINS_HOME" and default is None: # fix https://github.com/astronomer/astro-cli/issues/601
41 | return None
42 | if default:
43 | return default # otherwise return whatever default has been passed
44 | return f"MOCKED_{key.upper()}_VALUE" # if absolutely nothing has been passed - return the mocked value
45 |
46 |
47 | os.getenv = os_getenv_monkeypatch
48 | # # =========== /MONKEYPATCH OS.GETENV() ===========
49 |
50 | # =========== MONKEYPATCH VARIABLE.GET() ===========
51 |
52 |
53 | class magic_dict(dict):
54 | def __init__(self, *args, **kwargs):
55 | self.update(*args, **kwargs)
56 |
57 | def __getitem__(self, key):
58 | return {}.get(key, "MOCKED_KEY_VALUE")
59 |
60 |
61 | _no_default = object() # allow falsey defaults
62 |
63 |
64 | def variable_get_monkeypatch(key: str, default_var=_no_default, deserialize_json=False):
65 | print(f"Attempted to get Variable value during parse, returning a mocked value for {key}")
66 |
67 | if default_var is not _no_default:
68 | return default_var
69 | if deserialize_json:
70 | return magic_dict()
71 | return "NON_DEFAULT_MOCKED_VARIABLE_VALUE"
72 |
73 |
74 | Variable.get = variable_get_monkeypatch
75 | # # =========== /MONKEYPATCH VARIABLE.GET() ===========
76 |
77 |
78 | @contextmanager
79 | def suppress_logging(namespace):
80 | """
81 | Suppress logging within a specific namespace to keep tests "clean" during build
82 | """
83 | logger = logging.getLogger(namespace)
84 | old_value = logger.disabled
85 | logger.disabled = True
86 | try:
87 | yield
88 | finally:
89 | logger.disabled = old_value
90 |
91 |
92 | def get_import_errors():
93 | """
94 | Generate a tuple for import errors in the dag bag, and include DAGs without errors.
95 | """
96 | with suppress_logging("airflow"):
97 | dag_bag = DagBag(include_examples=False)
98 |
99 | def strip_path_prefix(path):
100 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME"))
101 |
102 | # Initialize an empty list to store the tuples
103 | result = []
104 |
105 | # Iterate over the items in import_errors
106 | for k, v in dag_bag.import_errors.items():
107 | result.append((strip_path_prefix(k), v.strip()))
108 |
109 | # Check if there are DAGs without errors
110 | for file_path in dag_bag.dags:
111 | # Check if the file_path is not in import_errors, meaning no errors
112 | if file_path not in dag_bag.import_errors:
113 | result.append((strip_path_prefix(file_path), "No import errors"))
114 |
115 | return result
116 |
117 |
118 | @pytest.mark.parametrize("rel_path, rv", get_import_errors(), ids=[x[0] for x in get_import_errors()])
119 | def test_file_imports(rel_path, rv):
120 | """Test for import errors on a file"""
121 | if os.path.exists(".astro/dag_integrity_exceptions.txt"):
122 | with open(".astro/dag_integrity_exceptions.txt", "r") as f:
123 | exceptions = f.readlines()
124 | print(f"Exceptions: {exceptions}")
125 | if (rv != "No import errors") and rel_path not in exceptions:
126 | # If rv is not "No import errors," consider it a failed test
127 | raise Exception(f"{rel_path} failed to import with message \n {rv}")
128 | else:
129 | # If rv is "No import errors," consider it a passed test
130 | print(f"{rel_path} passed the import test")
131 |
--------------------------------------------------------------------------------
/dev/.dockerignore:
--------------------------------------------------------------------------------
1 | astro
2 | .git
3 | .env
4 | airflow_settings.yaml
5 | logs/
6 | .venv
7 | airflow.db
8 | airflow.cfg
9 |
--------------------------------------------------------------------------------
/dev/.gitignore:
--------------------------------------------------------------------------------
1 | .git
2 | .env
3 | .DS_Store
4 | airflow_settings.yaml
5 | __pycache__/
6 | astro
7 | .venv
8 | airflow-webserver.pid
9 | webserver_config.py
10 | airflow.cfg
11 | airflow.db
12 |
--------------------------------------------------------------------------------
/dev/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM quay.io/astronomer/astro-runtime:12.7.0
2 |
3 | ENV CONFIG_ROOT_DIR=/usr/local/airflow/dags/
4 |
5 | USER root
6 |
7 | RUN apt-get update && apt-get install -y jq
8 |
9 | USER astro
10 |
11 | RUN pip install /usr/local/airflow/include/*.whl
12 |
--------------------------------------------------------------------------------
/dev/README.md:
--------------------------------------------------------------------------------
1 | # Sample Airflow setup and DAG Factory examples
2 |
3 | ## Overview
4 |
5 | Welcome to Astronomer! This project was generated after you ran 'astro dev init' using the Astronomer CLI. This readme describes the contents of the project, as well as how to run Apache Airflow on your local machine.
6 |
7 | ## Project Contents
8 |
9 | Your Astro project contains the following files and folders:
10 |
11 | - dags: This folder contains the Python files for your Airflow DAGs. By default, this directory includes one example DAG:
12 | - `example_astronauts`: This DAG shows a simple ETL pipeline example that queries the list of astronauts currently in space from the Open Notify API and prints a statement for each astronaut. The DAG uses the TaskFlow API to define tasks in Python, and dynamic task mapping to dynamically print a statement for each astronaut. For more on how this DAG works, see our [Getting started tutorial](https://www.astronomer.io/docs/learn/get-started-with-airflow).
13 | - Dockerfile: This file contains a versioned Astro Runtime Docker image that provides a differentiated Airflow experience. If you want to execute other commands or overrides at runtime, specify them here.
14 | - include: This folder contains any additional files that you want to include as part of your project. It is empty by default.
15 | - packages.txt: Install OS-level packages needed for your project by adding them to this file. It is empty by default.
16 | - requirements.txt: Install Python packages needed for your project by adding them to this file. It is empty by default.
17 | - plugins: Add custom or community plugins for your project to this file. It is empty by default.
18 | - airflow_settings.yaml: Use this local-only file to specify Airflow Connections, Variables, and Pools instead of entering them in the Airflow UI as you develop DAGs in this project.
19 |
20 | ## Deploy Your Project Locally
21 |
22 | (1) Start Airflow on your local machine by running 'astro dev start'.
23 |
24 | This command will spin up 4 Docker containers on your machine, each for a different Airflow component:
25 |
26 | - Postgres: Airflow's Metadata Database
27 | - Webserver: The Airflow component responsible for rendering the Airflow UI
28 | - Scheduler: The Airflow component responsible for monitoring and triggering tasks
29 | - Triggerer: The Airflow component responsible for triggering deferred tasks
30 |
31 | (2) Verify that all 4 Docker containers were created by running 'docker ps'.
32 |
33 | Note: Running 'astro dev start' will start your project with the Airflow Webserver exposed at port 8080 and Postgres exposed at port 5432. If you already have either of those ports allocated, you can either [stop your existing Docker containers or change the port](https://www.astronomer.io/docs/astro/cli/troubleshoot-locally#ports-are-not-available-for-my-local-airflow-webserver).
34 |
35 | (3) Access the Airflow UI for your local Airflow project. To do so, go to http://localhost:8080/ and log in with 'admin' for both your Username and Password.
36 |
37 | You should also be able to access your Postgres Database at 'localhost:5432/postgres'.
38 |
39 | ## Deploy Your Project to Astronomer
40 |
41 | If you have an Astronomer account, pushing code to a Deployment on Astronomer is simple. For deploying instructions, refer to the Astronomer documentation.
42 |
43 | ## Contact
44 |
45 | The Astronomer CLI is maintained with love by the Astronomer team. To report a bug or suggest a change, reach out to our support.
46 |
--------------------------------------------------------------------------------
/dev/dags/comparison/example_hackernews_dagfactory.yml:
--------------------------------------------------------------------------------
1 | example_hackernews_dagfactory:
2 | default_args:
3 | start_date: 2022-03-04
4 | tasks:
5 | fetch_top_ten_news:
6 | operator: airflow.operators.bash_operator.BashOperator
7 | bash_command: "curl -s https://hacker-news.firebaseio.com/v0/topstories.json | jq -c -r '.[0:10]'"
8 | fetch_first_top_news:
9 | operator: airflow.operators.bash_operator.BashOperator
10 | bash_command: "echo {{ task_instance.xcom_pull(task_ids='fetch_top_ten_news') }} | jq -c -r '.[0]' | xargs -I {} curl -s 'https://hacker-news.firebaseio.com/v0/item/{}.json'"
11 | dependencies: [fetch_top_ten_news]
12 | fetch_second_top_news:
13 | operator: airflow.operators.bash_operator.BashOperator
14 | bash_command: "echo {{ task_instance.xcom_pull(task_ids='fetch_top_ten_news') }} | jq -c -r '.[1]' | xargs -I {} curl -s 'https://hacker-news.firebaseio.com/v0/item/{}.json'"
15 | dependencies: [fetch_top_ten_news]
16 | summarize:
17 | operator: airflow.operators.python.PythonOperator
18 | python_callable: hacker_news.summarize
19 | dependencies: [fetch_first_top_news, fetch_second_top_news]
20 |
--------------------------------------------------------------------------------
/dev/dags/comparison/example_hackernews_plain_airflow.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from airflow.models.dag import DAG
4 | from airflow.operators.bash_operator import BashOperator
5 | from airflow.operators.python import PythonOperator
6 | from hacker_news import summarize
7 |
8 | with DAG(dag_id="example_hackernews_plain_airflow", schedule=None, start_date=datetime(2022, 3, 4)) as dag:
9 |
10 | fetch_top_ten_news = BashOperator(
11 | task_id="fetch_top_ten_news",
12 | bash_command="curl -s https://hacker-news.firebaseio.com/v0/topstories.json | jq -c -r '.[0:10]'",
13 | )
14 |
15 | fetch_first_top_news = BashOperator(
16 | task_id="fetch_first_top_news",
17 | bash_command="""
18 | echo {{ task_instance.xcom_pull(task_ids='fetch_top_ten_news') }} | jq -c -r '.[0]' | xargs -I {} curl -s 'https://hacker-news.firebaseio.com/v0/item/{}.json'
19 | """,
20 | )
21 |
22 | fetch_second_top_news = BashOperator(
23 | task_id="fetch_second_news",
24 | bash_command="""
25 | echo {{ task_instance.xcom_pull(task_ids='fetch_top_ten_news') }} | jq -c -r '.[1]' | xargs -I {} curl -s 'https://hacker-news.firebaseio.com/v0/item/{}.json'
26 | """,
27 | )
28 |
29 | summarize = PythonOperator(task_id="summarize", python_callable=summarize)
30 |
31 | fetch_top_ten_news >> [fetch_first_top_news, fetch_second_top_news] >> summarize
32 |
--------------------------------------------------------------------------------
/dev/dags/comparison/example_pypi_stats_dagfactory.yml:
--------------------------------------------------------------------------------
1 | example_pypi_stats_dagfactory:
2 | default_args:
3 | start_date: 2022-03-04
4 | tasks:
5 | get_pypi_projects_list:
6 | decorator: airflow.decorators.task
7 | python_callable: pypi_stats.get_pypi_projects_list
8 | fetch_pypi_stats_data:
9 | decorator: airflow.decorators.task
10 | python_callable: pypi_stats.fetch_pypi_stats_data
11 | expand:
12 | package_name: +get_pypi_projects_list
13 | summarize:
14 | decorator: airflow.decorators.task
15 | python_callable: pypi_stats.summarize
16 | values: +fetch_pypi_stats_data
17 |
--------------------------------------------------------------------------------
/dev/dags/comparison/example_pypi_stats_plain_airflow.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from datetime import datetime
4 | from typing import Any
5 |
6 | from airflow.decorators import task
7 | from airflow.models.dag import DAG
8 | from pypi_stats import fetch_pypi_stats_data, get_pypi_projects_list, summarize
9 |
10 | with DAG(dag_id="example_pypi_stats_plain_airflow", schedule=None, start_date=datetime(2022, 3, 4)) as dag:
11 |
12 | @task
13 | def get_pypi_projects_list_():
14 | return get_pypi_projects_list()
15 |
16 | @task
17 | def fetch_pypi_stats_data_(project_name: str):
18 | return fetch_pypi_stats_data(project_name)
19 |
20 | @task
21 | def summarize_(values: list[dict[str, Any]]):
22 | return summarize(values)
23 |
24 | pypi_stats_data = fetch_pypi_stats_data_.expand(project_name=get_pypi_projects_list_())
25 | summarize_(pypi_stats_data)
26 |
--------------------------------------------------------------------------------
/dev/dags/customized/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/dev/dags/customized/__init__.py
--------------------------------------------------------------------------------
/dev/dags/customized/callbacks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/dev/dags/customized/callbacks/__init__.py
--------------------------------------------------------------------------------
/dev/dags/customized/callbacks/custom_callbacks.py:
--------------------------------------------------------------------------------
1 | """
2 | example_callbacks.py
3 |
4 | Author: Jake Roach
5 | Date: 2024-10-22
6 | """
7 |
8 |
9 | def output_message(context, param1, param2):
10 | print("A callback has been raised!")
11 | print(f"{param1} ---------- {param2}")
12 |
--------------------------------------------------------------------------------
/dev/dags/customized/helpers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/dev/dags/customized/helpers/__init__.py
--------------------------------------------------------------------------------
/dev/dags/customized/helpers/etl.py:
--------------------------------------------------------------------------------
1 | def extract():
2 | print("extract() function has been called")
3 |
4 |
5 | def transform(ds_nodash):
6 | print("transform() function has been called")
7 |
8 |
9 | def load(database_name, table_name):
10 | print("load() function has been called")
11 |
--------------------------------------------------------------------------------
/dev/dags/customized/operators/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/dev/dags/customized/operators/__init__.py
--------------------------------------------------------------------------------
/dev/dags/customized/operators/breakfast_operators.py:
--------------------------------------------------------------------------------
1 | from airflow.models import BaseOperator
2 |
3 |
4 | class MakeBreadOperator(BaseOperator):
5 | template_fields = ("bread_type",)
6 |
7 | def __init__(self, bread_type, *args, **kwargs):
8 | super(MakeBreadOperator, self).__init__(*args, **kwargs)
9 | self.bread_type = bread_type
10 |
11 | def execute(self, context):
12 | print("Make {} bread".format(self.bread_type))
13 |
14 |
15 | class MakeCoffeeOperator(BaseOperator):
16 | template_fields = ("coffee_type",)
17 |
18 | def __init__(self, coffee_type, *args, **kwargs):
19 | super(MakeCoffeeOperator, self).__init__(*args, **kwargs)
20 | self.coffee_type = coffee_type
21 |
22 | def execute(self, context):
23 | print("Make {} coffee".format(self.coffee_type))
24 |
--------------------------------------------------------------------------------
/dev/dags/datasets/example_config_datasets.yml:
--------------------------------------------------------------------------------
1 | datasets:
2 | - name: dataset_custom_1
3 | uri: s3://bucket-cjmm/raw/dataset_custom_1
4 | - name: dataset_custom_2
5 | uri: s3://bucket-cjmm/raw/dataset_custom_2
6 | - name: dataset_custom_3
7 | uri: s3://bucket-cjmm/raw/dataset_custom_3
8 |
--------------------------------------------------------------------------------
/dev/dags/datasets/example_dag_datasets.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | # The following import is here so Airflow parses this file
5 | # from airflow import DAG
6 | import dagfactory
7 |
8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/"
9 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR))
10 |
11 | config_file = str(CONFIG_ROOT_DIR / "datasets/example_dag_datasets.yml")
12 |
13 | example_dag_factory = dagfactory.DagFactory(config_file)
14 |
15 | # Creating task dependencies
16 | example_dag_factory.clean_dags(globals())
17 | example_dag_factory.generate_dags(globals())
18 |
--------------------------------------------------------------------------------
/dev/dags/datasets/example_dag_datasets.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 | owner: "default_owner"
4 | start_date: '2023-07-14'
5 | retries: 1
6 | retry_delay_sec: 300
7 | concurrency: 1
8 | max_active_runs: 1
9 | dagrun_timeout_sec: 600
10 | default_view: "tree"
11 | orientation: "LR"
12 |
13 | example_simple_dataset_producer_dag:
14 | description: "Example DAG producer simple datasets"
15 | schedule_interval: "0 5 * * *"
16 | tasks:
17 | task_1:
18 | operator: airflow.operators.bash_operator.BashOperator
19 | bash_command: "echo 1"
20 | inlets: [ 's3://bucket_example/raw/dataset1_source.json' ]
21 | outlets: ['s3://bucket_example/raw/dataset1.json']
22 | task_2:
23 | operator: airflow.operators.bash_operator.BashOperator
24 | bash_command: "echo 2"
25 | dependencies: [task_1]
26 | inlets: [ 's3://bucket_example/raw/dataset2_source.json' ]
27 | outlets: ['s3://bucket_example/raw/dataset2.json']
28 |
29 | example_simple_dataset_consumer_dag:
30 | description: "Example DAG consumer simple datasets"
31 | schedule: ['s3://bucket_example/raw/dataset1.json', 's3://bucket_example/raw/dataset2.json']
32 | tasks:
33 | task_1:
34 | operator: airflow.operators.bash_operator.BashOperator
35 | bash_command: "echo 'consumer datasets'"
36 |
37 | example_custom_config_dataset_producer_dag:
38 | description: "Example DAG producer custom config datasets"
39 | schedule_interval: "0 5 * * *"
40 | tasks:
41 | task_1:
42 | operator: airflow.operators.bash_operator.BashOperator
43 | bash_command: "echo 1"
44 | outlets:
45 | file: $CONFIG_ROOT_DIR/datasets/example_config_datasets.yml
46 | datasets: ['dataset_custom_1', 'dataset_custom_2']
47 |
48 | example_custom_config_dataset_consumer_dag:
49 | description: "Example DAG consumer custom config datasets"
50 | schedule:
51 | file: $CONFIG_ROOT_DIR/datasets/example_config_datasets.yml
52 | datasets: ['dataset_custom_1', 'dataset_custom_2']
53 | tasks:
54 | task_1:
55 | operator: airflow.operators.bash_operator.BashOperator
56 | bash_command: "echo 'consumer datasets'"
57 |
58 | example_custom_config_condition_dataset_consumer_dag:
59 | description: "Example DAG consumer custom config condition datasets"
60 | schedule:
61 | file: $CONFIG_ROOT_DIR/datasets/example_config_datasets.yml
62 | datasets: "((dataset_custom_1 & dataset_custom_2) | dataset_custom_3)"
63 | tasks:
64 | task_1:
65 | operator: airflow.operators.bash_operator.BashOperator
66 | bash_command: "echo 'consumer datasets'"
67 |
68 | example_without_custom_config_condition_dataset_consumer_dag:
69 |   description: "Example DAG consuming condition datasets without a custom config file"
70 | schedule:
71 | datasets:
72 | !or
73 | - !and
74 | - "s3://bucket-cjmm/raw/dataset_custom_1"
75 | - "s3://bucket-cjmm/raw/dataset_custom_2"
76 | - "s3://bucket-cjmm/raw/dataset_custom_3"
77 | tasks:
78 | task_1:
79 | operator: airflow.operators.bash_operator.BashOperator
80 | bash_command: "echo 'consumer datasets'"
81 |
--------------------------------------------------------------------------------
/dev/dags/datasets/example_dag_datasets_outlet_inlet.yml:
--------------------------------------------------------------------------------
1 | producer_dag:
2 | default_args:
3 | owner: "example_owner"
4 | retries: 1
5 | start_date: '2024-01-01'
6 | description: "Example DAG producer simple datasets"
7 | schedule_interval: "0 5 * * *"
8 | tasks:
9 | task_1:
10 | operator: airflow.operators.bash_operator.BashOperator
11 | bash_command: "echo 1"
12 | inlets: [ 's3://bucket_example/raw/dataset1_source.json' ]
13 | outlets: [ 's3://bucket_example/raw/dataset1.json' ]
14 |     task_2:
15 |       operator: airflow.operators.bash_operator.BashOperator
16 |       bash_command: "echo 2"
17 |       dependencies: [ task_1 ]
18 |       inlets: [ 's3://bucket_example/raw/dataset2_source.json' ]
19 |       outlets: [ 's3://bucket_example/raw/dataset2.json' ]
20 | consumer_dag:
21 |   default_args:
22 |     owner: "example_owner"
23 |     retries: 1
24 |     start_date: '2024-01-01'
25 |   description: "Example DAG consumer simple datasets"
26 |   schedule: [ 's3://bucket_example/raw/dataset1.json', 's3://bucket_example/raw/dataset2.json' ]
27 |   tasks:
28 |     task_1:
29 |       operator: airflow.operators.bash_operator.BashOperator
30 |       bash_command: "echo 'consumer datasets'"
31 |
--------------------------------------------------------------------------------
/dev/dags/datasets/example_dataset_condition_string.yml:
--------------------------------------------------------------------------------
1 | consumer_dag:
2 | default_args:
3 | owner: "example_owner"
4 | retries: 1
5 | start_date: '2024-01-01'
6 | description: "Example DAG consumer simple datasets"
7 | schedule:
8 | datasets: "((s3://bucket-cjmm/raw/dataset_custom_1 & s3://bucket-cjmm/raw/dataset_custom_2) | s3://bucket-cjmm/raw/dataset_custom_3)"
9 | tasks:
10 | task_1:
11 | operator: airflow.operators.bash_operator.BashOperator
12 | bash_command: "echo 'consumer datasets'"
13 |
--------------------------------------------------------------------------------
/dev/dags/datasets/example_dataset_yaml_syntax.yml:
--------------------------------------------------------------------------------
1 | consumer_dag:
2 | default_args:
3 | owner: "example_owner"
4 | retries: 1
5 | start_date: '2024-01-01'
6 | description: "Example DAG consumer simple datasets"
7 | schedule:
8 | datasets:
9 | !or
10 | - !and
11 | - "s3://bucket-cjmm/raw/dataset_custom_1"
12 | - "s3://bucket-cjmm/raw/dataset_custom_2"
13 | - "s3://bucket-cjmm/raw/dataset_custom_3"
14 | tasks:
15 | task_1:
16 | operator: airflow.operators.bash_operator.BashOperator
17 | bash_command: "echo 'consumer datasets'"
18 |
--------------------------------------------------------------------------------
/dev/dags/defaults.yml:
--------------------------------------------------------------------------------
1 | default_args:
2 | start_date: "2025-01-01"
3 | owner: "global_owner"
4 |
--------------------------------------------------------------------------------
/dev/dags/example_callbacks.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | # The following import is here so Airflow parses this file
5 | # from airflow import DAG
6 | import dagfactory
7 |
8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/"
9 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR))
10 |
11 | config_file = str(CONFIG_ROOT_DIR / "example_callbacks.yml")
12 |
13 | example_dag_factory = dagfactory.DagFactory(config_file)
14 |
15 | # Creating task dependencies
16 | example_dag_factory.clean_dags(globals())
17 | example_dag_factory.generate_dags(globals())
18 |
--------------------------------------------------------------------------------
/dev/dags/example_callbacks.yml:
--------------------------------------------------------------------------------
1 | example_callbacks:
2 | default_args:
3 | start_date: "2024-01-01"
4 |     # Callbacks can be set at the default_args level. These callbacks are then passed to each Task. Fun fact:
5 |     # default_args can be overridden within a Task.
6 | on_retry_callback: print_hello.print_hello_from_callback
7 | on_failure_callback:
8 | callback: airflow.providers.slack.notifications.slack.send_slack_notification
9 | slack_conn_id: slack_conn_id
10 | text: |
11 | :red_circle: Task Failed.
12 | This task has failed and needs to be addressed.
13 | Please remediate this issue ASAP.
14 | channel: "#channel"
15 | schedule_interval: "@daily"
16 | catchup: False
17 | # These callbacks are set at the DAG-level, vs. the callbacks set above in default_args that are passed onto each
18 | # Task. Previously, the same "on_success_callback" configuration was set as part of task_2
19 | on_execute_callback_name: print_hello_from_callback
20 | on_execute_callback_file: $CONFIG_ROOT_DIR/print_hello.py
21 | on_success_callback:
22 | callback: customized.callbacks.custom_callbacks.output_message
23 | param1: param1
24 | param2: param2
25 | task_groups:
26 | task_group_1:
27 | default_args:
28 | on_success_callback: print_hello.print_hello_from_callback
29 | dependencies: [task_1, task_2]
30 | tasks:
31 | start:
32 | operator: airflow.operators.empty.EmptyOperator
33 | on_success_callback_name: print_hello_from_callback
34 | on_success_callback_file: $CONFIG_ROOT_DIR/print_hello.py
35 | task_1:
36 | operator: airflow.operators.bash_operator.BashOperator
37 | bash_command: "echo 1"
38 | on_success_callback:
39 | callback: customized.callbacks.custom_callbacks.output_message
40 | param1: param1
41 | param2: param2
42 | dependencies: [start]
43 | task_2:
44 | operator: airflow.operators.bash_operator.BashOperator
45 | bash_command: "echo 2"
46 | on_success_callback_name: print_hello_from_callback
47 | on_success_callback_file: $CONFIG_ROOT_DIR/print_hello.py
48 | dependencies: [start]
49 | task_3:
50 | operator: airflow.operators.bash_operator.BashOperator
51 | bash_command: "echo 3"
52 | task_group_name: task_group_1
53 | end:
54 | operator: airflow.operators.bash_operator.BashOperator
55 | bash_command: "echo -1"
56 | dependencies:
57 | - task_group_1
58 |
--------------------------------------------------------------------------------
/dev/dags/example_customize_operator.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | # The following import is here so Airflow parses this file
5 | # from airflow import DAG
6 | import dagfactory
7 |
8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/"
9 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR))
10 |
11 | config_file = str(CONFIG_ROOT_DIR / "example_customize_operator.yml")
12 |
13 | example_dag_factory = dagfactory.DagFactory(config_file)
14 |
15 | # Creating task dependencies
16 | example_dag_factory.clean_dags(globals())
17 | example_dag_factory.generate_dags(globals())
18 |
--------------------------------------------------------------------------------
/dev/dags/example_customize_operator.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 | owner: "default_owner"
4 | start_date: 2020-01-01
5 | retries: 1
6 | retry_delay_sec: 300
7 | concurrency: 1
8 | max_active_runs: 1
9 | dagrun_timeout_sec: 600
10 | default_view: "tree"
11 | orientation: "LR"
12 | schedule_interval: "0 1 * * *"
13 |
14 | example_breakfast:
15 | default_args:
16 | owner: "custom_owner"
17 | start_date: 2 days
18 |   description: "this is a customized operator dag"
19 | schedule_interval: "0 3 * * *"
20 | tasks:
21 | begin:
22 | operator: airflow.operators.dummy_operator.DummyOperator
23 | make_bread_1:
24 | operator: customized.operators.breakfast_operators.MakeBreadOperator
25 | bread_type: 'Sourdough'
26 | dependencies:
27 | - begin
28 | make_bread_2:
29 | operator: customized.operators.breakfast_operators.MakeBreadOperator
30 | bread_type: 'Multigrain'
31 | dependencies:
32 | - begin
33 | make_coffee_1:
34 | operator: customized.operators.breakfast_operators.MakeCoffeeOperator
35 | coffee_type: 'Black'
36 | dependencies:
37 | - begin
38 | - make_bread_1
39 | - make_bread_2
40 | end:
41 | operator: airflow.operators.dummy_operator.DummyOperator
42 | dependencies:
43 | - begin
44 | - make_bread_1
45 | - make_bread_2
46 | - make_coffee_1
47 |
--------------------------------------------------------------------------------
/dev/dags/example_dag_factory.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | # The following import is here so Airflow parses this file
5 | # from airflow import DAG
6 | import dagfactory
7 |
8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/"
9 |
10 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR))
11 |
12 | config_file = str(CONFIG_ROOT_DIR / "example_dag_factory.yml")
13 |
14 | example_dag_factory = dagfactory.DagFactory(config_file)
15 |
16 | # Creating task dependencies
17 | example_dag_factory.clean_dags(globals())
18 | example_dag_factory.generate_dags(globals())
19 |
--------------------------------------------------------------------------------
/dev/dags/example_dag_factory.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 |     catchup: false
4 | start_date: 2024-11-11
5 |
6 | # ----8<--- [ start: example_dag_yaml_configuration ]
7 | basic_example_dag:
8 | default_args:
9 | owner: "custom_owner"
10 | description: "this is an example dag"
11 | schedule_interval: "0 3 * * *"
12 | render_template_as_native_obj: True
13 | tasks:
14 | task_1:
15 | operator: airflow.operators.bash_operator.BashOperator
16 | bash_command: "echo 1"
17 | task_2:
18 | operator: airflow.operators.bash_operator.BashOperator
19 | bash_command: "echo 2"
20 | dependencies: [task_1]
21 | task_3:
22 | operator: airflow.operators.bash_operator.BashOperator
23 | bash_command: "echo 2"
24 | dependencies: [task_1]
25 | # ----8<--- [ end: example_dag_yaml_configuration ]
26 |
--------------------------------------------------------------------------------
/dev/dags/example_dag_factory_default_args.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 | start_date: '2024-01-01'
4 | schedule_interval: 0 0 * * *
5 | catchup: false
6 | tags:
7 | - "data engineering"
8 |
9 | etl:
10 | tasks:
11 | extract:
12 | operator: airflow.operators.bash_operator.BashOperator
13 | bash_command: "echo extract"
14 | transform:
15 | operator: airflow.operators.bash_operator.BashOperator
16 | bash_command: "echo transform"
17 | dependencies:
18 | - extract
19 | load:
20 | operator: airflow.operators.bash_operator.BashOperator
21 | bash_command: "echo load"
22 | dependencies:
23 | - transform
24 |
--------------------------------------------------------------------------------
/dev/dags/example_dag_factory_default_config.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | # The following import is here so Airflow parses this file
5 | # from airflow import DAG
6 | import dagfactory
7 |
8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/"
9 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR))
10 |
11 | config_file = str(CONFIG_ROOT_DIR / "example_dag_factory_default_config.yml")
12 |
13 | example_dag_factory = dagfactory.DagFactory(config_file)
14 |
15 | # Creating task dependencies
16 | example_dag_factory.clean_dags(globals())
17 | example_dag_factory.generate_dags(globals())
18 |
--------------------------------------------------------------------------------
/dev/dags/example_dag_factory_default_config.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 | start_date: '2024-01-01'
4 | schedule_interval: 0 0 * * *
5 | catchup: false
6 | tags:
7 | - dynamic
8 | tasks:
9 | extract:
10 | operator: airflow.operators.bash_operator.BashOperator
11 | bash_command: "echo extract"
12 | transform:
13 | operator: airflow.operators.bash_operator.BashOperator
14 | bash_command: "echo transform"
15 | dependencies:
16 | - extract
17 | load:
18 | operator: airflow.operators.bash_operator.BashOperator
19 | dependencies:
20 | - transform
21 |
22 |
23 | machine_learning:
24 | tasks:
25 | load:
26 |       bash_command: "echo machine_learning"
27 |
28 | data_science:
29 | tasks:
30 | load:
31 | bash_command: "echo data_science"
32 |
33 | artificial_intelligence:
34 | tasks:
35 | load:
36 | bash_command: "echo artificial_intelligence"
37 |
--------------------------------------------------------------------------------
/dev/dags/example_dag_factory_multiple.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | # The following import is here so Airflow parses this file
5 | # from airflow import DAG
6 | import dagfactory
7 |
8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/"
9 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR))
10 |
11 | config_file = str(CONFIG_ROOT_DIR / "example_dag_factory_multiple_config.yml")
12 |
13 | example_dag_factory = dagfactory.DagFactory(config_file)
14 |
15 | # Creating task dependencies
16 | example_dag_factory.clean_dags(globals())
17 | example_dag_factory.generate_dags(globals())
18 |
--------------------------------------------------------------------------------
/dev/dags/example_dag_factory_multiple_config.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 |     catchup: false
4 | owner: "default_owner"
5 | start_date: 2024-11-11
6 | retries: 1
7 | retry_delay_sec: 30
8 | on_success_callback_name: print_hello_from_callback
9 | on_success_callback_file: $CONFIG_ROOT_DIR/print_hello.py
10 | concurrency: 1
11 | max_active_runs: 1
12 | dagrun_timeout_sec: 600
13 | default_view: "tree"
14 | orientation: "LR"
15 | schedule_interval: "0 1 * * *"
16 | on_failure_callback_name: print_hello_from_callback
17 | on_failure_callback_file: $CONFIG_ROOT_DIR/print_hello.py
18 |
19 |
20 | # ----8<--- [ start: environment_variable_example ]
21 | example_dag:
22 | default_args:
23 | owner: "custom_owner"
24 | description: "this is an example dag"
25 | schedule_interval: "0 3 * * *"
26 | render_template_as_native_obj: True
27 | dag_display_name: "Pretty Example DAG"
28 | tasks:
29 | task_1:
30 | operator: airflow.operators.bash_operator.BashOperator
31 | bash_command: "echo 1"
32 | task_2:
33 | operator: airflow.operators.bash_operator.BashOperator
34 | bash_command: "echo 2"
35 | dependencies: [task_1]
36 | task_3:
37 | operator: airflow.operators.python_operator.PythonOperator
38 | python_callable_name: print_hello
39 | python_callable_file: $CONFIG_ROOT_DIR/print_hello.py
40 | dependencies: [task_1]
41 | # ----8<--- [ end: environment_variable_example ]
42 |
43 | example_dag2:
44 | default_args:
45 | timezone: Europe/Amsterdam
46 | tasks:
47 | task_1:
48 | operator: airflow.operators.bash_operator.BashOperator
49 | bash_command: "echo 1"
50 | task_2:
51 | operator: airflow.operators.bash_operator.BashOperator
52 | bash_command: "echo 2"
53 | dependencies: [task_1]
54 | task_3:
55 | operator: airflow.operators.bash_operator.BashOperator
56 | bash_command: "echo 3"
57 | dependencies: [task_1]
58 |
59 | example_dag3:
60 | tasks:
61 | task_1:
62 | operator: airflow.operators.bash_operator.BashOperator
63 | bash_command: "echo 1"
64 | task_2:
65 | operator: airflow.operators.bash_operator.BashOperator
66 | bash_command: "echo 2"
67 | dependencies: [task_1]
68 | task_3:
69 | operator: airflow.operators.bash_operator.BashOperator
70 | bash_command: "echo 3"
71 | dependencies: [task_1]
72 |
73 | example_dag4:
74 | description: "this dag uses task groups"
75 | task_groups:
76 | task_group_1:
77 | tooltip: "this is a task group"
78 | dependencies: [task_1]
79 | tasks:
80 | task_1:
81 | operator: airflow.operators.bash_operator.BashOperator
82 | bash_command: "echo 1"
83 | task_2:
84 | operator: airflow.operators.bash_operator.BashOperator
85 | bash_command: "echo 2"
86 | task_group_name: task_group_1
87 | task_3:
88 | operator: airflow.operators.python_operator.PythonOperator
89 | python_callable_name: print_hello
90 | python_callable_file: $CONFIG_ROOT_DIR/print_hello.py
91 | task_group_name: task_group_1
92 | dependencies: [task_2]
93 | task_4:
94 | operator: airflow.operators.bash_operator.BashOperator
95 | bash_command: "echo 1"
96 | dependencies: [task_group_1]
97 |
--------------------------------------------------------------------------------
/dev/dags/example_dynamic_task_mapping.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | # The following import is here so Airflow parses this file
5 | # from airflow import DAG
6 | import dagfactory
7 |
8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/"
9 |
10 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR))
11 |
12 | config_file = str(CONFIG_ROOT_DIR / "example_dynamic_task_mapping.yml")
13 |
14 | example_dag_factory = dagfactory.DagFactory(config_file)
15 |
16 | # Creating task dependencies
17 | example_dag_factory.clean_dags(globals())
18 | example_dag_factory.generate_dags(globals())
19 |
--------------------------------------------------------------------------------
/dev/dags/example_dynamic_task_mapping.yml:
--------------------------------------------------------------------------------
1 | test_expand:
2 | default_args:
3 | owner: "custom_owner"
4 | start_date: 2 days
5 | description: "test expand"
6 | schedule_interval: "0 3 * * *"
7 | default_view: "graph"
8 | tasks:
9 | process:
10 | operator: airflow.operators.python_operator.PythonOperator
11 | python_callable_name: consume_value
12 | python_callable_file: $CONFIG_ROOT_DIR/expand_tasks.py
13 | partial:
14 | op_kwargs:
15 | fixed_param: "test"
16 | expand:
17 | op_args:
18 | request.output
19 | dependencies: [request]
20 | # This task is intentionally placed after the "process" task to demonstrate that DAG Factory does not require tasks
21 | # to be topologically ordered in the YAML file according to their dependencies.
22 | request:
23 | operator: airflow.operators.python.PythonOperator
24 | python_callable_name: make_list
25 | python_callable_file: $CONFIG_ROOT_DIR/expand_tasks.py
26 |
--------------------------------------------------------------------------------
/dev/dags/example_http_operator_task.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | try:
5 | from airflow.providers.http.operators.http import HttpOperator
6 | HTTP_OPERATOR_AVAILABLE = True
7 | except ImportError:
8 | HTTP_OPERATOR_AVAILABLE = False
9 |
10 | # The following import is here so Airflow parses this file
11 | # from airflow import DAG
12 | import dagfactory
13 |
14 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/"
15 |
16 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR))
17 | if HTTP_OPERATOR_AVAILABLE:
18 | config_file = str(CONFIG_ROOT_DIR / "example_http_operator_task.yml")
19 | else:
20 | config_file = str(CONFIG_ROOT_DIR / "example_simple_http_operator_task.yml")
21 |
22 | example_dag_factory = dagfactory.DagFactory(config_file)
23 |
24 | # Creating task dependencies
25 | example_dag_factory.clean_dags(globals())
26 | example_dag_factory.generate_dags(globals())
27 |
--------------------------------------------------------------------------------
/dev/dags/example_http_operator_task.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 |     catchup: false
4 | start_date: 2025-03-20
5 |
6 | http_operator_example_dag:
7 | default_args:
8 | owner: "@owner"
9 | description: "this is an HttpOperator dag"
10 | schedule_interval: "0 3 * * *"
11 | tags: ['http']
12 | render_template_as_native_obj: True
13 | tasks:
14 | send_request_json:
15 | operator: airflow.providers.http.operators.http.HttpOperator
16 | http_conn_id: "example_host"
17 | method: "POST"
18 | endpoint: "/run_test"
19 | data:
20 | data: "fake_data"
21 | format: "json"
22 | headers:
23 | Content-Type: application/json
24 | log_response: True
25 | send_request_plain_text:
26 | operator: airflow.providers.http.operators.http.HttpOperator
27 | http_conn_id: "example_host"
28 | method: "POST"
29 | endpoint: "/run_test"
30 | data:
31 | data: "fake_data"
32 | test: "plain_text"
33 | headers:
34 | Content-Type: text/plain
35 | log_response: True
36 |
--------------------------------------------------------------------------------
/dev/dags/example_load_yaml_dags.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | from dagfactory import load_yaml_dags
5 |
6 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/"
7 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR))
8 | config_dir = str(CONFIG_ROOT_DIR / "comparison")
9 |
10 | load_yaml_dags(
11 | globals_dict=globals(),
12 | dags_folder=config_dir,
13 | )
14 |
--------------------------------------------------------------------------------
/dev/dags/example_map_index_template.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | # The following import is here so Airflow parses this file
5 | # from airflow import DAG
6 | import dagfactory
7 |
8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/"
9 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR))
10 |
11 | config_file = str(CONFIG_ROOT_DIR / "example_map_index_template.yml")
12 | example_dag_factory = dagfactory.DagFactory(config_file)
13 |
14 | # Creating task dependencies
15 | example_dag_factory.clean_dags(globals())
16 | example_dag_factory.generate_dags(globals())
17 |
--------------------------------------------------------------------------------
/dev/dags/example_map_index_template.yml:
--------------------------------------------------------------------------------
1 | # Requires Airflow 2.9 or higher
2 | example_map_index_template:
3 | default_args:
4 | owner: "custom_owner"
5 | start_date: 2 days
6 | description: "Example of TaskFlow powered DAG that includes dynamic task mapping"
7 | schedule_interval: "0 3 * * *"
8 | default_view: "graph"
9 | tasks:
10 | dynamic_task_with_named_mapping:
11 | decorator: airflow.decorators.task
12 | python_callable: sample.extract_last_name
13 | map_index_template: "{{ custom_mapping_key }}"
14 | expand:
15 | full_name:
16 | - Lucy Black
17 | - Vera Santos
18 | - Marks Spencer
19 |
--------------------------------------------------------------------------------
/dev/dags/example_simple_http_operator_task.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 |     catchup: false
4 | start_date: 2025-03-20
5 |
6 | http_operator_example_dag:
7 | default_args:
8 | owner: "@owner"
9 | description: "this is an HttpOperator dag"
10 | schedule_interval: "0 3 * * *"
11 | tags: ['http']
12 | render_template_as_native_obj: True
13 | tasks:
14 | send_request_json:
15 | operator: airflow.providers.http.operators.http.SimpleHttpOperator
16 | http_conn_id: "example_host"
17 | method: "POST"
18 | endpoint: "/run_test"
19 | data:
20 | data: "fake_data"
21 | format: "json"
22 | headers:
23 | Content-Type: application/json
24 | log_response: True
25 | send_request_plain_text:
26 | operator: airflow.providers.http.operators.http.SimpleHttpOperator
27 | http_conn_id: "example_host"
28 | method: "POST"
29 | endpoint: "/run_test"
30 | data:
31 | data: "fake_data"
32 | test: "plain_text"
33 | headers:
34 | Content-Type: text/plain
35 | log_response: True
36 |
--------------------------------------------------------------------------------
/dev/dags/example_task_group.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | # The following import is here so Airflow parses this file
5 | # from airflow import DAG
6 | import dagfactory
7 |
8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/"
9 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR))
10 |
11 | config_file = str(CONFIG_ROOT_DIR / "example_task_group.yml")
12 |
13 | example_dag_factory = dagfactory.DagFactory(config_file)
14 |
15 | # Creating task dependencies
16 | example_dag_factory.clean_dags(globals())
17 | example_dag_factory.generate_dags(globals())
18 |
--------------------------------------------------------------------------------
/dev/dags/example_task_group.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 | owner: default_owner
4 | retries: 1
5 | retry_delay_sec: 300
6 | start_date: 2024-01-01
7 | default_view: tree
8 | max_active_runs: 1
9 | schedule_interval: 0 1 * * *
10 |
11 | example_task_group:
12 | description: "this dag uses task groups"
13 | task_groups:
14 | task_group_1:
15 | tooltip: "this is a task group"
16 | dependencies: [task_1]
17 | task_group_2:
18 | tooltip: "this is a task group"
19 | parent_group_name: task_group_1
20 | tasks:
21 | task_1:
22 | operator: airflow.operators.bash_operator.BashOperator
23 | bash_command: "echo 1"
24 | task_2:
25 | operator: airflow.operators.bash_operator.BashOperator
26 | bash_command: "echo 2"
27 | task_group_name: task_group_1
28 | task_4:
29 | operator: airflow.operators.bash_operator.BashOperator
30 | bash_command: "echo 4"
31 | task_group_name: task_group_2
32 |
--------------------------------------------------------------------------------
/dev/dags/example_taskflow.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | # The following import is here so Airflow parses this file
5 | # from airflow import DAG
6 | import dagfactory
7 |
8 | DEFAULT_CONFIG_ROOT_DIR = "/usr/local/airflow/dags/"
9 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR))
10 |
11 | config_file = str(CONFIG_ROOT_DIR / "example_taskflow.yml")
12 | example_dag_factory = dagfactory.DagFactory(config_file)
13 |
14 | # Creating task dependencies
15 | example_dag_factory.clean_dags(globals())
16 | example_dag_factory.generate_dags(globals())
17 |
--------------------------------------------------------------------------------
/dev/dags/example_taskflow.yml:
--------------------------------------------------------------------------------
1 | example_taskflow:
2 | default_args:
3 | owner: "custom_owner"
4 | start_date: 2 days
5 | description: "Example of TaskFlow powered DAG that includes dynamic task mapping"
6 | schedule_interval: "0 3 * * *"
7 | default_view: "graph"
8 | tasks:
9 | some_number:
10 | decorator: airflow.decorators.task
11 | python_callable: sample.some_number
12 | numbers_list:
13 | decorator: airflow.decorators.task
14 | python_callable_name: build_numbers_list
15 | python_callable_file: $CONFIG_ROOT_DIR/sample.py
16 | another_numbers_list:
17 | decorator: airflow.decorators.task
18 | python_callable: sample.build_numbers_list
19 | double_number_from_arg:
20 | decorator: airflow.decorators.task
21 | python_callable: sample.double
22 | number: 2
23 | double_number_from_task:
24 | decorator: airflow.decorators.task
25 | python_callable: sample.double
26 | number: +some_number # the prefix + leads to resolving this value as the task `some_number`, previously defined
27 | double_number_with_dynamic_task_mapping_static:
28 | decorator: airflow.decorators.task
29 | python_callable: sample.double
30 | expand:
31 | number:
32 | - 1
33 | - 3
34 | - 5
35 | double_number_with_dynamic_task_mapping_taskflow:
36 | decorator: airflow.decorators.task
37 | python_callable: sample.double
38 | expand:
39 | number: +numbers_list # the prefix + tells DagFactory to resolve this value as the task `numbers_list`, previously defined
40 | multiply_with_multiple_parameters:
41 | decorator: airflow.decorators.task
42 | python_callable: sample.multiply
43 | expand:
44 | a: +numbers_list # the prefix + tells DagFactory to resolve this value as the task `numbers_list`, previously defined
45 | b: +another_numbers_list # the prefix + tells DagFactory to resolve this value as the task `another_numbers_list`, previously defined
46 | double_number_with_dynamic_task_and_partial:
47 | decorator: airflow.decorators.task
48 | python_callable: sample.double_with_label
49 | expand:
50 | number: +numbers_list # the prefix + tells DagFactory to resolve this value as the task `numbers_list`, previously defined
51 | partial:
52 | label: True
53 |
--------------------------------------------------------------------------------
/dev/dags/expand_tasks.py:
--------------------------------------------------------------------------------
1 | def make_list():
2 | return [[1], [2], [3], [4]]
3 |
4 |
5 | def consume_value(expanded_param, fixed_param):
6 | print(fixed_param)
7 | print(expanded_param)
8 | return [expanded_param]
9 |
--------------------------------------------------------------------------------
/dev/dags/external_task_sensor.yml:
--------------------------------------------------------------------------------
1 | example_external_task_sensor_dag_factory_consumer:
2 | default_args:
3 | start_date: 2025-01-01
4 | schedule_interval: "@daily"
5 | tasks:
6 | wait_for_producer_task:
7 | operator: airflow.sensors.external_task_sensor.ExternalTaskSensor
8 | external_dag_id: example_external_task_sensor_dag_factory_producer
9 | external_task_id: producer_task
10 | mode: poke
11 | timeout: 600
12 | poke_interval: 30
13 | retries: 2
14 | consumer_task:
15 | operator: airflow.operators.empty.EmptyOperator
16 |
17 | example_external_task_sensor_dag_factory_consumer2:
18 | default_args:
19 | start_date: 2025-01-02
20 | schedule_interval: "@daily"
21 | tasks:
22 | wait_for_producer_task:
23 | operator: airflow.sensors.external_task_sensor.ExternalTaskSensor
24 | external_dag_id: example_external_task_sensor_dag_factory_producer
25 | external_task_id: producer_task
26 | execution_date_fn: sample.one_day_ago
27 | consumer_task:
28 | operator: airflow.operators.empty.EmptyOperator
29 |
30 | example_external_task_sensor_dag_factory_consumer3:
31 | default_args:
32 | start_date: 2025-01-03
33 | schedule_interval: "@daily"
34 | tasks:
35 | wait_for_producer_task:
36 | operator: airflow.sensors.external_task_sensor.ExternalTaskSensor
37 | external_dag_id: example_external_task_sensor_dag_factory_producer
38 | external_task_id: producer_task
39 |       execution_delta: 1 days
40 | consumer_task:
41 | operator: airflow.operators.empty.EmptyOperator
42 |
--------------------------------------------------------------------------------
/dev/dags/hacker_news.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import json
4 |
5 | import pandas as pd
6 |
7 | # ----8<--- [ start: hacker_news ]
8 |
9 |
10 | def summarize(**kwargs):
11 | """
12 |     Given that the Airflow context is provided to this function, extract the XCom HackerNews records from its
13 |     upstream tasks and summarize them in Markdown.
14 | """
15 | ti = kwargs["ti"]
16 | upstream_task_ids = ti.task.upstream_task_ids # Get upstream task IDs dynamically
17 | values = [json.loads(ti.xcom_pull(task_ids=task_id)) for task_id in upstream_task_ids]
18 |
19 | df = pd.DataFrame(values)
20 | selected_columns = ["title", "url"]
21 | df = df[selected_columns]
22 | markdown_output = df.to_markdown(index=False)
23 | print(markdown_output)
24 | return markdown_output
25 |
26 |
27 | # ----8<--- [ end: hacker_news ]
28 |
--------------------------------------------------------------------------------
/dev/dags/invalid.yaml:
--------------------------------------------------------------------------------
1 | name: John Doe
2 | age: 30
3 | is_student: yes
4 | address:
5 | street: 123 Main St
6 | city: New York
7 | postal_code 10001
8 | - phone: 555-1234
9 | email: johndoe@example.com
10 |
--------------------------------------------------------------------------------
/dev/dags/print_hello.py:
--------------------------------------------------------------------------------
1 | def print_hello():
2 | print("hello")
3 |
4 |
5 | def print_hello_from_callback(context):
6 | print("hello from callback")
7 |
--------------------------------------------------------------------------------
/dev/dags/pypi_stats.py:
--------------------------------------------------------------------------------
1 | """
2 | PyPI stats utility functions.
3 | """
4 |
5 | from __future__ import annotations
6 |
7 | from typing import Any
8 |
9 | import httpx
10 | import pandas as pd
11 |
12 | DEFAULT_PYPI_PROJECTS = [
13 | "apache-airflow",
14 | "dag-factory",
15 | "astronomer-cosmos",
16 | ]
17 |
18 |
19 | # ----8<--- [ start: pypi_stats ]
20 |
21 |
22 | def get_pypi_projects_list(**kwargs: dict[str, Any]) -> list[str]:
23 | """
24 | Return a list of PyPI project names to be analysed.
25 | """
26 | projects_from_ui = kwargs.get("dag_run").conf.get("pypi_projects") if kwargs.get("dag_run") else None
27 | if projects_from_ui is None:
28 | pypi_projects = DEFAULT_PYPI_PROJECTS
29 | else:
30 | pypi_projects = projects_from_ui
31 | return pypi_projects
32 |
33 |
34 | def fetch_pypi_stats_data(package_name: str) -> dict[str, Any]:
35 | """
36 |     Given a PyPI project name, return the PyPI stats data associated with it.
37 | """
38 | url = f"https://pypistats.org/api/packages/{package_name}/recent"
39 | package_json = httpx.get(url).json()
40 | package_data = package_json["data"]
41 | package_data["package_name"] = package_name
42 | return package_data
43 |
44 |
45 | def summarize(values: list[dict[str, Any]]):
46 | """
47 |     Given a list with PyPI stats data, create a table summarizing it, sorted by the last day's total downloads.
48 | """
49 | df = pd.DataFrame(values)
50 | first_column = "package_name"
51 | sorted_columns = [first_column] + [col for col in df.columns if col != first_column]
52 | df = df[sorted_columns].sort_values(by="last_day", ascending=False)
53 | markdown_output = df.to_markdown(index=False)
54 | print(markdown_output)
55 | return markdown_output
56 |
57 |
58 | # ----8<--- [ end: pypi_stats ]
59 |
60 | if __name__ == "__main__":
61 | pypi_projects_list = get_pypi_projects_list()
62 | all_data = []
63 | for pypi_project_name in pypi_projects_list:
64 | project_data = fetch_pypi_stats_data(pypi_project_name)
65 | all_data.append(project_data)
66 |     summarize(values=all_data)
67 |
--------------------------------------------------------------------------------
/dev/dags/sample.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | from random import randint
3 |
4 | from airflow.operators.python import get_current_context
5 |
6 |
7 | def build_numbers_list():
8 | return [2, 4, 6]
9 |
10 |
11 | def some_number():
12 | return randint(0, 100)
13 |
14 |
15 | def double(number: int):
16 | result = 2 * number
17 | print(result)
18 | return result
19 |
20 |
21 | def multiply(a: int, b: int) -> int:
22 | result = a * b
23 | print(result)
24 | return result
25 |
26 |
27 | # added_values = add.expand(x=first_list(), y=second_list())
28 |
29 |
30 | def double_with_label(number: int, label: bool = False):
31 | result = 2 * number
32 | if not label:
33 | print(result)
34 | return result
35 | else:
36 |         label_info = "even" if number % 2 == 0 else "odd"
37 | print(f"{result} is {label_info}")
38 | return result, label_info
39 |
40 |
41 | def extract_last_name(full_name: str):
42 | name, last_name = full_name.split(" ")
43 | print(f"{name} {last_name}")
44 | context = get_current_context()
45 | context["custom_mapping_key"] = name
46 | return last_name
47 |
48 |
49 | def one_day_ago(execution_date: datetime):
50 | return execution_date - timedelta(days=1)
51 |
--------------------------------------------------------------------------------
/dev/packages.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/dev/packages.txt
--------------------------------------------------------------------------------
/dev/requirements.txt:
--------------------------------------------------------------------------------
1 | # Astro Runtime includes the following pre-installed providers packages: https://www.astronomer.io/docs/astro/runtime-image-architecture#provider-packages
2 | apache-airflow-providers-slack
3 |
--------------------------------------------------------------------------------
/dev/tests/dags/test_dag_example.py:
--------------------------------------------------------------------------------
1 | """Example DAGs test. This test ensures that all DAGs have tags and no import errors. This is an example pytest and may not fit the context of your DAGs. Feel free to add and remove tests."""
2 |
3 | import logging
4 | import os
5 | from contextlib import contextmanager
6 |
7 | import pytest
8 | from airflow.models import DagBag
9 |
10 |
11 | @contextmanager
12 | def suppress_logging(namespace):
13 | logger = logging.getLogger(namespace)
14 | old_value = logger.disabled
15 | logger.disabled = True
16 | try:
17 | yield
18 | finally:
19 | logger.disabled = old_value
20 |
21 |
22 | def get_import_errors():
23 | """
24 | Generate a tuple for import errors in the dag bag
25 | """
26 | with suppress_logging("airflow"):
27 | dag_bag = DagBag(include_examples=False)
28 |
29 | def strip_path_prefix(path):
30 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME"))
31 |
32 | # prepend "(None,None)" to ensure that a test object is always created even if it's a no op.
33 | return [(None, None)] + [(strip_path_prefix(k), v.strip()) for k, v in dag_bag.import_errors.items()]
34 |
35 |
36 | def get_dags():
37 | """
38 | Generate a tuple of dag_id, in the DagBag
39 | """
40 | with suppress_logging("airflow"):
41 | dag_bag = DagBag(include_examples=False)
42 |
43 | def strip_path_prefix(path):
44 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME"))
45 |
46 | return [(k, v, strip_path_prefix(v.fileloc)) for k, v in dag_bag.dags.items()]
47 |
48 |
49 | @pytest.mark.parametrize("rel_path,rv", get_import_errors(), ids=[x[0] for x in get_import_errors()])
50 | def test_file_imports(rel_path, rv):
51 | """Test for import errors on a file"""
52 | if rel_path and rv:
53 | raise Exception(f"{rel_path} failed to import with message \n {rv}")
54 |
55 |
56 | APPROVED_TAGS = {}
57 |
58 |
59 | @pytest.mark.parametrize("dag_id,dag,fileloc", get_dags(), ids=[x[2] for x in get_dags()])
60 | def test_dag_tags(dag_id, dag, fileloc):
61 | """
62 |     test if a DAG is tagged and if those tags are in the approved list
63 | """
64 | assert dag.tags, f"{dag_id} in {fileloc} has no tags"
65 | if APPROVED_TAGS:
66 | assert not set(dag.tags) - APPROVED_TAGS
67 |
--------------------------------------------------------------------------------
/docs/comparison/index.md:
--------------------------------------------------------------------------------
1 | # Using YAML instead of Python
2 |
3 | By default, Apache Airflow® users write their workflows, or sequences of tasks, in Python.
4 |
5 | DAG Factory offers an alternative interface, allowing users to represent Airflow workflows via YAML files, often using less code.
6 |
7 | This section illustrates a few examples of how to represent the same workflow using plain Airflow Python DAGs in comparison
8 | to their representation using DAG Factory YAML files.
9 |
10 | * [Traditional Airflow Operators](traditional_operators.md)
11 | * [TaskFlow API](taskflow_api.md)
12 |
--------------------------------------------------------------------------------
/docs/comparison/taskflow_api.md:
--------------------------------------------------------------------------------
1 | # TaskFlow API: Using YAML instead of Python
2 |
3 | For users who employ lots of Python functions in their DAGs, the [TaskFlow API](https://www.astronomer.io/docs/learn/airflow-decorators/) represents a simpler way to transform functions into tasks, with a more intuitive way of passing data between them.
4 | It was introduced in Airflow 2 as an alternative to Airflow [traditional operators](traditional_operators.md).
5 |
6 | The following section shows how to represent an Airflow DAG using TaskFlow API and how to define the same DAG using
7 | DAG Factory. Ultimately, both implementations use the same Airflow operators. The main difference is the language used
8 | to declare the workflow: one uses Python and the other uses YAML.
9 |
10 | ## Goal
11 |
12 | Let's say we'd like to create a workflow that performs the following:
13 |
14 | 1. Create a list of [PyPI](https://pypi.org/) projects to be analysed.
15 | 2. Fetch the [statistics](https://pypistats.org/) for each of these projects.
16 | 3. Summarize the selected statistics as Markdown, using Python.
17 |
18 | We will implement all these steps using the Airflow `task` decorator, and the last task will generate a Markdown table similar to:
19 |
20 | ```text
21 | | package_name | last_day | last_month | last_week |
22 | |:------------------|-----------:|-------------:|------------:|
23 | | apache-airflow | 852242 | 28194255 | 6253861 |
24 | | astronomer-cosmos | 442531 | 13354870 | 3127750 |
25 | | dag-factory | 10078 | 354085 | 77752 |
26 | ```
27 |
28 | The main logic is implemented as plain Python functions in [pypi_stats.py](https://github.com/astronomer/dag-factory/blob/main/dev/dags/pypi_stats.py):
29 |
30 | ```title="pypi_stats.py"
31 | --8<-- "dev/dags/pypi_stats.py:pypi_stats"
32 | ```
33 |
34 | ## Implementation
35 |
36 | As a reference, the following workflows run using Airflow 2.10.2 and DAG Factory 0.21.0.
37 |
38 | ### Plain Airflow Python DAG
39 |
40 | ```title="example_pypi_stats_plain_airflow.py"
41 | --8<-- "dev/dags/comparison/example_pypi_stats_plain_airflow.py"
42 | ```
43 |
44 | ### Alternative DAG Factory YAML
45 |
46 | ```title="example_pypi_stats_dagfactory.yml"
47 | --8<-- "dev/dags/comparison/example_pypi_stats_dagfactory.yml"
48 | ```
49 |
50 | ## Comparison
51 |
52 | ### Goal
53 |
54 | Both implementations accomplish the same goal and result in the expected Markdown table.
55 |
56 | ### Airflow Graph view
57 |
58 | As shown in the screenshots below, both the DAG created using Python with standard Airflow and the
59 | DAG created using YAML and DAG Factory look identical, from a graph topology perspective, and also from the underlying operators being used.
60 |
61 | #### Graph view: Plain Airflow Python DAG
62 |
63 | 
64 |
65 | #### Graph view: Alternative DAG Factory YAML
66 |
67 | 
68 |
69 | ### Airflow Dynamic Task Mapping
70 |
71 | In both workflows, we are dynamically generating a task for each PyPI project.
72 |
73 | #### Mapped Tasks: Plain Airflow Python DAG
74 |
75 | 
76 |
77 | #### Mapped Tasks: Alternative DAG Factory YAML
78 |
79 | 
80 |
81 | ### Airflow Code view
82 |
83 | From an Airflow UI perspective, the content displayed in the "Code" view is the main difference between the two implementations. While Airflow renders the original Python DAG, as expected, in the case of the YAML DAGs, Airflow displays the Python file that references the DAG Factory YAML files:
84 |
85 | ```title="example_load_yaml_dags.py"
86 | --8<-- "dev/dags/example_load_yaml_dags.py"
87 | ```
88 |
89 | #### Code view: Plain Airflow Python DAG
90 |
91 | 
92 |
93 | #### Code view: Alternative DAG Factory YAML
94 |
95 | 
96 |
97 | To overcome this limitation, DAG Factory appends the YAML content to the DAG Documentation so users can better troubleshoot the DAG:
98 |
99 | 
100 |
--------------------------------------------------------------------------------
/docs/comparison/traditional_operators.md:
--------------------------------------------------------------------------------
1 | # Traditional Operators: Using YAML instead of Python
2 |
3 | Traditionally, operators are Airflow's building blocks, and while they are robust and diverse,
4 | they can sometimes lead to boilerplate-heavy DAGs compared to the newer [TaskFlow API](./taskflow_api.md).
5 |
6 | Most of the Airflow providers come with built-in traditional operators. Some examples include `BashOperator`, `PythonOperator`, `KubernetesPodOperator`, and `PostgresOperator`.
7 |
8 | Below, we illustrate how to represent an Airflow DAG using traditional operators and how to define the same DAG using
9 | DAG Factory. Ultimately, both implementations use the same Airflow operators. The main difference is the language used
10 | to declare the workflow: one uses Python and the other uses YAML.
11 |
12 | ## Goal
13 |
14 | Let's say we'd like to create a workflow that performs the following:
15 |
16 | 1. Retrieve the top ten stories from Hacker News using the [Hacker News API](https://github.com/HackerNews/API).
17 | 2. Fetch the details of the top two stories using the Hacker News API.
18 | 3. Summarize the selected stories as Markdown, using Python.
19 |
20 | We will implement the first two steps using `BashOperator` and the third step using `PythonOperator`.
21 | The last task will generate a `Markdown` snippet similar to:
22 |
23 | ```text
24 | | title | url |
25 | |:----------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------|
26 | | I keep turning my Google Sheets into phone-friendly webapps | https://arstechnica.com/gadgets/2024/12/making-tiny-no-code-webapps-out-of-spreadsheets-is-a-weirdly-fulfilling-hobby/ |
27 | | Coconut by Meta AI – Better LLM Reasoning with Chain of Continuous Thought? | https://aipapersacademy.com/chain-of-continuous-thought/ |
28 | ```
29 |
30 | The main logic is implemented as plain Python functions in [hacker_news.py](https://github.com/astronomer/dag-factory/blob/main/dev/dags/hacker_news.py):
31 |
32 | ```title="hacker_news.py"
33 | --8<-- "dev/dags/hacker_news.py:hacker_news"
34 | ```
35 |
36 | ## Implementation
37 |
38 | As a reference, the following workflows run using Airflow 2.10.2 and DAG Factory 0.21.0.
39 |
40 | ### Plain Airflow Python DAG
41 |
42 | ```title="example_hackernews_plain_airflow.py"
43 | --8<-- "dev/dags/comparison/example_hackernews_plain_airflow.py"
44 | ```
45 |
46 | ### Alternative DAG Factory YAML
47 |
48 | ```title="example_hackernews_dagfactory.yml"
49 | --8<-- "dev/dags/comparison/example_hackernews_dagfactory.yml"
50 | ```
51 |
52 | ## Comparison
53 |
54 | ### Goal
55 |
56 | Both implementations accomplish the same goal and result in the expected Markdown table.
57 |
58 | ### Airflow Graph view
59 |
60 | As shown in the screenshots below, both the DAG created using Python with standard Airflow and the
61 | DAG created using YAML and DAG Factory look identical, from a graph topology perspective, and also from the underlying
62 | operators being used.
63 |
64 | #### Graph view: Plain Airflow Python DAG
65 |
66 | 
67 |
68 | #### Graph view: Alternative DAG Factory YAML
69 |
70 | 
71 |
72 | ### Airflow Code view
73 |
74 | From an Airflow UI perspective, the content displayed in the "Code" view is the main difference between the two implementations. While Airflow renders the original Python DAG, as expected, in the case of the YAML DAGs, Airflow displays the Python file that references the DAG Factory YAML files:
75 |
76 | ```title="example_load_yaml_dags.py"
77 | --8<-- "dev/dags/example_load_yaml_dags.py"
78 | ```
79 |
80 | #### Code view: Plain Airflow Python DAG
81 |
82 | 
83 |
84 | #### Code view: Alternative DAG Factory YAML
85 |
86 | 
87 |
88 | To overcome this limitation, DAG Factory appends the YAML content to the DAG Documentation so users can better troubleshoot
89 | the DAG:
90 |
91 | 
92 |
--------------------------------------------------------------------------------
/docs/configuration/configuring_workflows.md:
--------------------------------------------------------------------------------
1 | # Configuring Your Workflows
2 |
3 | DAG Factory allows you to define workflows in a structured, configuration-driven way using YAML files.
4 | You can define multiple workflows within a single YAML file based on your requirements.
5 |
6 | ## Key Elements of Workflow Configuration
7 |
8 | - **dag_id**: Unique identifier for your DAG.
9 | - **default_args**: Common arguments for all tasks.
10 | - **schedule**/**schedule_interval**: Specifies the execution schedule.
11 | - **tasks**: Defines the [Airflow tasks](https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/tasks.html) in your workflow.
12 |
13 | ### Example DAG Configuration
14 |
15 | ```title="example_dag_factory.yml"
16 | --8<-- "dev/dags/example_dag_factory.yml:example_dag_yaml_configuration"
17 | ```
18 |
19 | ### Check out more configuration params
20 |
21 | - [Environment variables](environment_variables.md)
22 | - [Defaults](defaults.md)
23 |
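24 | ### Loading the YAML configuration into Airflow
25 |
26 | A YAML file on its own is not parsed by Airflow; a small Python loader placed in your DAGs folder generates and
27 | registers the DAGs. The sketch below follows the loader scripts shipped in `dev/dags`; the configuration file path is
28 | an assumption and should point at your own YAML file.
29 |
30 | ```python
31 | # Minimal loader sketch for a single YAML configuration file (the path is an example).
32 | import dagfactory
33 |
34 | config_file = "/usr/local/airflow/dags/example_dag_factory.yml"
35 |
36 | example_dag_factory = dagfactory.DagFactory(config_file)
37 |
38 | # Clean up any previously generated DAGs and register the current ones in globals()
39 | # so the Airflow scheduler can discover them.
40 | example_dag_factory.clean_dags(globals())
41 | example_dag_factory.generate_dags(globals())
42 | ```
43 |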
--------------------------------------------------------------------------------
/docs/configuration/defaults.md:
--------------------------------------------------------------------------------
1 | # Defaults
2 |
3 | DAG Factory allows you to define Airflow
4 | [default_args](https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/dags.html#default-arguments) and
5 | additional DAG-level arguments in a `default` block. This block enables you to share common settings and configurations
6 | across all DAGs in your YAML configuration, with the arguments automatically applied to each DAG defined in the file.
7 | This is one of DAG Factory's most powerful features; using defaults allows for the dynamic generation of more than a
8 | single DAG.
9 |
10 | ## Benefits of using the default block
11 |
12 | - Consistency: Ensures uniform configurations across all tasks and DAGs.
13 | - Maintainability: Reduces duplication by centralizing common properties.
14 | - Simplicity: Makes configurations easier to read and manage.
15 | - Dynamic Generation: Use a single default block to easily generate more than a single DAG.
16 |
17 | ### Example usage of a default block for `default_args`
18 |
19 | #### Specifying `default_args` in the `default` block
20 |
21 | Using a `default` block in a YAML file allows for those key-value pairs to be applied to each DAG that is defined in
22 | that same file. One of the most common examples is using a `default` block to specify `default_args` for each DAG
23 | defined in that file. These arguments are automatically inherited by every DAG defined in the file. Below is an example of this.
24 |
25 | ```yaml title="Usage of default block for default_args in YAML"
26 | --8<-- "dev/dags/example_dag_factory_default_args.yml"
27 | ```
28 |
29 | #### Specifying `default_args` directly in a DAG configuration
30 |
31 | You can override or define specific `default_args` at the individual DAG level. This allows you to customize
32 | arguments for each DAG without affecting others. Not only can existing `default_args` be overridden directly in a DAG
33 | configuration, but new arguments can be added.
34 |
35 | ```yaml
36 | etl:
37 | default_args:
38 | start_date: '2024-12-31'
39 | retries: 1 # A new default_arg was added
40 | ...
41 | ```
42 |
43 | #### Specifying `default_args` in a shared `defaults.yml`
44 |
45 | Starting with DAG Factory 0.22.0, you can also keep the `default_args` in the `defaults.yml` file. The configuration
46 | from `defaults.yml` will be applied to all DAG Factory generated DAGs. **Be careful, these will be applied to all
47 | generated DAGs.**
48 |
49 | ```yaml title="defaults.yml"
50 | --8<-- "dev/dags/defaults.yml"
51 | ```
52 |
53 | Given the various ways to specify `default_args`, the following precedence order is applied when arguments are
54 | duplicated:
55 |
56 | 1. In the DAG configuration
57 | 2. In the `default` block within the workflow's YAML file
58 | 3. In the `defaults.yml`
59 |
60 | ### Example usage of the default block for dynamic DAG generation
61 |
62 | Not only can the `default` block in a YAML file be used to define `default_args` for one or more DAGs; it can also be
63 | used to create the skeleton of "templated" DAGs. In the example below, the `default` block is used to define not only
64 | the `default_args` of a DAG, but also default Tasks. These Tasks provide a "template" for the DAGs defined in this file.
65 | Each DAG (`machine_learning`, `data_science`, `artificial_intelligence`) will be defined using the values from the
66 | `default` block, and like with `default_args`, can override these values. **This is a powerful way to use DAG Factory
67 | to dynamically create DAGs using a single configuration.**
68 |
70 | ```yaml title="Usage of default block in YAML"
71 | --8<-- "dev/dags/example_dag_factory_default_config.yml"
72 | ```
73 |
74 | Currently, only `default_args` can be specified using the `defaults.yml` file.
75 |
--------------------------------------------------------------------------------
/docs/configuration/environment_variables.md:
--------------------------------------------------------------------------------
1 | # Environment variables
2 |
3 | Starting with release `0.20.0`, DAG Factory supports referencing environment variables directly within YAML
4 | configuration files. This enables dynamic configuration paths and improves workflow portability, since environment
5 | variables are resolved during DAG parsing.
6 |
7 | With this feature, DAG Factory removes the reliance on hard-coded paths, allowing for more flexible and adaptable
8 | configurations that work seamlessly across various environments.
9 |
10 | ## Example YAML Configuration with Environment Variables
11 |
12 | ```title="Reference environment variable in YAML"
13 | --8<-- "dev/dags/example_dag_factory_multiple_config.yml:environment_variable_example"
14 | ```
15 |
16 | In the above example, `$CONFIG_ROOT_DIR` is used to reference an environment variable that points to the root
17 | directory of your DAG configurations. During DAG parsing, it will be resolved to the value specified for the
18 | `CONFIG_ROOT_DIR` environment variable.
19 |
--------------------------------------------------------------------------------
/docs/contributing/code_of_conduct.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free experience for everyone, regardless of age, body
7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
8 | identity and expression, level of experience, education, socioeconomic status,
9 | nationality, personal appearance, race, religion, or sexual identity
10 | and orientation.
11 |
12 | We pledge to act and interact in ways that contribute to an open, welcoming,
13 | diverse, inclusive, and healthy community.
14 |
15 | ## Our Standards
16 |
17 | Examples of behavior that contributes to a positive environment for our
18 | community include:
19 |
20 | * Demonstrating empathy and kindness toward other people
21 | * Being respectful of differing opinions, viewpoints, and experiences
22 | * Giving and gracefully accepting constructive feedback
23 | * Accepting responsibility and apologizing to those affected by our mistakes,
24 | and learning from the experience
25 | * Focusing on what is best not just for us as individuals, but for the
26 | overall community
27 |
28 | Examples of unacceptable behavior include:
29 |
30 | * The use of sexualized language or imagery, and sexual attention or
31 | advances of any kind
32 | * Trolling, insulting or derogatory comments, and personal or political attacks
33 | * Public or private harassment
34 | * Publishing others' private information, such as a physical or email
35 | address, without their explicit permission
36 | * Other conduct which could reasonably be considered inappropriate in a
37 | professional setting
38 |
39 | ## Enforcement Responsibilities
40 |
41 | Community leaders are responsible for clarifying and enforcing our standards of
42 | acceptable behavior and will take appropriate and fair corrective action in
43 | response to any behavior that they deem inappropriate, threatening, offensive,
44 | or harmful.
45 |
46 | Community leaders have the right and responsibility to remove, edit, or reject
47 | comments, commits, code, wiki edits, issues, and other contributions that are
48 | not aligned with this Code of Conduct, and will communicate reasons for moderation
49 | decisions when appropriate.
50 |
51 | ## Scope
52 |
53 | This Code of Conduct applies within all community spaces, and also applies when
54 | an individual is officially representing the community in public spaces.
55 | Examples of representing our community include using an official e-mail address,
56 | posting via an official social media account, or acting as an appointed
57 | representative at an online or offline event.
58 |
59 | ## Enforcement
60 |
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported to the community leaders responsible for enforcement at
63 | [humans@astronomer.io](mailto:humans@astronomer.io).
64 |
65 | All complaints will be reviewed and investigated promptly and fairly.
66 |
67 | All community leaders are obligated to respect the privacy and security of the
68 | reporter of any incident.
69 |
70 | ## Enforcement Guidelines
71 |
72 | Community leaders will follow these Community Impact Guidelines in determining
73 | the consequences for any action they deem in violation of this Code of Conduct:
74 |
75 | ### 1. Correction
76 |
77 | **Community Impact**: Use of inappropriate language or other behavior deemed
78 | unprofessional or unwelcome in the community.
79 |
80 | **Consequence**: A private, written warning from community leaders, providing
81 | clarity around the nature of the violation and an explanation of why the
82 | behavior was inappropriate. A public apology may be requested.
83 |
84 | ### 2. Warning
85 |
86 | **Community Impact**: A violation through a single incident or series
87 | of actions.
88 |
89 | **Consequence**: A warning with consequences for continued behavior. No
90 | interaction with the people involved, including unsolicited interaction with
91 | those enforcing the Code of Conduct, for a specified period of time. This
92 | includes avoiding interactions in community spaces as well as external channels
93 | like social media. Violating these terms may lead to a temporary or
94 | permanent ban.
95 |
96 | ### 3. Temporary Ban
97 |
98 | **Community Impact**: A serious violation of community standards, including
99 | sustained inappropriate behavior.
100 |
101 | **Consequence**: A temporary ban from any sort of interaction or public
102 | communication with the community for a specified period of time. No public or
103 | private interaction with the people involved, including unsolicited interaction
104 | with those enforcing the Code of Conduct, is allowed during this period.
105 | Violating these terms may lead to a permanent ban.
106 |
107 | ### 4. Permanent Ban
108 |
109 | **Community Impact**: Demonstrating a pattern of violation of community
110 | standards, including sustained inappropriate behavior, harassment of an
111 | individual, or aggression toward or disparagement of classes of individuals.
112 |
113 | **Consequence**: A permanent ban from any sort of public interaction within
114 | the community.
115 |
116 | ## Attribution
117 |
118 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/version/2/0/code_of_conduct/), version 2.0.
119 |
120 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
121 | enforcement ladder](https://github.com/mozilla/inclusion).
122 |
123 | For answers to common questions about this code of conduct, see the [FAQ](https://www.contributor-covenant.org/faq/).
124 | Translations are available at [this page](https://www.contributor-covenant.org/translations/).
125 |
--------------------------------------------------------------------------------
/docs/contributing/contributors.md:
--------------------------------------------------------------------------------
1 | # Contributors
2 |
3 | There are different ways people can contribute to DAG Factory.
4 | Learn more about the project [contributors roles](roles.md).
5 |
6 | ## Committers
7 |
8 | * Pankaj Koti ([@pankajkoti](https://github.com/pankajkoti))
9 | * Pankaj Singh ([@pankajastro](https://github.com/pankajastro))
10 | * Tatiana Al-Chueyr ([@tatiana](https://github.com/tatiana))
11 |
12 | ## Emeritus Committers
13 |
14 | * Adam Boscarino ([@ajbosco](https://github.com/ajbosco))
15 |
16 | ## Contributors
17 |
18 | Many people are improving DAG Factory each day.
19 | Find more contributors on [our GitHub page](https://github.com/astronomer/dag-factory/graphs/contributors).
20 |
--------------------------------------------------------------------------------
/docs/contributing/howto.md:
--------------------------------------------------------------------------------
1 | # Contributing Guide
2 |
3 | All contributions, bug reports, bug fixes, documentation improvements, and enhancements are welcome.
4 |
5 | All contributors and maintainers to this project should abide by the [Contributor Code of Conduct](code_of_conduct.md).
6 |
7 | Learn more about the contributors' roles in [the Roles page](roles.md).
8 |
9 | This document describes how to contribute to DAG Factory, covering:
10 |
11 | - Overview of how to contribute
12 | - How to set up the local development environment
13 | - Running tests
14 | - Pre-commit and linting
15 | - Authoring the documentation
16 | - Releasing
17 |
18 | ## Overview of how to contribute
19 |
20 | To contribute to the DAG Factory project:
21 |
22 | 1. Please create a [GitHub Issue](https://github.com/astronomer/dag-factory/issues) describing a bug, enhancement, or feature request.
23 | 2. Open a branch off of the `main` branch and create a Pull Request into the `main` branch from your feature branch.
24 | 3. Link your issue to the pull request.
25 | 4. After you complete development on your feature branch, request a review. A maintainer will merge your PR after all reviewers approve it.
26 |
27 | ## Set up a local development environment
28 |
29 | ### Requirements
30 |
31 | - [Git](https://git-scm.com/)
32 | - [Python](https://www.python.org/) <= 3.12 (due to dependencies, such as ``google-re2`` not supporting Python 3.13 yet)
33 | - [Hatch](https://hatch.pypa.io/latest/)
34 |
35 | Clone the **DAG Factory** repository and change the current working directory to the repo's root directory:
36 |
37 | ```bash
38 | git clone https://github.com/astronomer/dag-factory.git
39 | cd dag-factory/
40 | ```
41 |
42 | After cloning the project, there are two options for setting up the local development environment:
43 |
44 | - Use a Python virtual environment, or
45 | - Use Docker
46 |
47 | ### Using a Python virtual environment for local development
48 |
49 | 1. Install the project dependencies:
50 |
51 | ```bash
52 | make setup
53 | ```
54 |
55 | 2. Activate the local python environment:
56 |
57 | ```bash
58 | source venv/bin/activate
59 | ```
60 |
61 | 3. Set the [Apache Airflow®](https://airflow.apache.org/) home directory to ``dev/``, so you can see the DAG Factory example DAGs,
62 | and disable loading Airflow's standard example DAGs:
63 |
64 | ```bash
65 | export AIRFLOW_HOME=$(pwd)/dev/
66 | export AIRFLOW__CORE__LOAD_EXAMPLES=false
67 | ```
68 |
69 | Then, run Airflow in standalone mode; the command below will create a new user (if it does not exist) and run the necessary Airflow components (webserver, scheduler, and triggerer):
70 |
71 | > Note: By default, Airflow will use SQLite as its database; you can override this by setting the variable ``AIRFLOW__DATABASE__SQL_ALCHEMY_CONN`` to a SQL connection string.
72 |
73 | ```bash
74 | airflow standalone
75 | ```
76 |
77 | After Airflow is running, you can access the Airflow UI at ``http://localhost:8080``.
78 |
79 | > Note: whenever you want to start the development server, you need to activate the ``virtualenv`` and set the environment variables.
80 |
81 | ### Use Docker for local development
82 |
83 | It is also possible to build the development environment using [Docker](https://www.docker.com/products/docker-desktop/):
84 |
85 | ```bash
86 | make docker-run
87 | ```
88 |
89 | After the sandbox is running, you can access the Airflow UI at ``http://localhost:8080``.
90 |
91 | This approach builds a DAG Factory wheel, so if there are code changes, you must stop and restart the containers:
92 |
93 | ```bash
94 | make docker-stop
95 | ```
96 |
97 | ## Testing application with hatch
98 |
99 | The tests are developed using PyTest and run using hatch.
100 |
101 | The [pyproject.toml](https://github.com/astronomer/dag-factory/blob/main/pyproject.toml) file currently defines a matrix of supported versions of Python and Airflow against which a user can run the tests.
102 |
103 | ### Run unit tests
104 |
105 | To run unit tests using Python 3.10 and Airflow 2.5, use the following:
106 |
107 | ```bash
108 | hatch run tests.py3.10-2.5:test-cov
109 | ```
110 |
111 | It is also possible to run the tests using all the matrix combinations, by using:
112 |
113 | ```bash
114 | hatch run tests:test-cov
115 | ```
116 |
117 | ### Run integration tests
118 |
119 | > Note: these tests create local Python virtual environments in a hatch-managed directory.
120 | > They also use the user-defined `AIRFLOW_HOME`, overriding any pre-existing `airflow.cfg` and `airflow.db` files.
121 |
122 | First, set the following environment variables:
123 |
124 | ```bash
125 | export AIRFLOW_HOME=$(pwd)/dev/
126 | export CONFIG_ROOT_DIR=`pwd`"/dev/dags"
127 | export PYTHONPATH=dev/dags:$PYTHONPATH
128 | ```
129 |
130 | To run the integration tests using Python 3.9 and Airflow 2.9, use
131 |
132 | ```bash
133 | hatch run tests.py3.9-2.9:test-integration-setup
134 | hatch run tests.py3.9-2.9:test-integration
135 | ```
136 |
137 | ## Pre-Commit and linting
138 |
139 | We use pre-commit to run several checks on the code before committing. To install pre-commit hooks, run:
140 |
141 | ```bash
142 | pre-commit install
143 | ```
144 |
145 | To run the checks manually, run the following:
146 |
147 | ```bash
148 | pre-commit run --all-files
149 | ```
150 |
151 | Pre-commit runs several static checks, including Black and Ruff. It is also possible to run them using ``hatch``:
152 |
153 | ```bash
154 | hatch run tests.py3.9-2.9:static-check
155 | ```
156 |
157 | ## Write docs
158 |
159 | We use Markdown to author DAG Factory documentation.
160 |
161 | Similar to running tests, we also use hatch to manage the documentation.
162 |
163 | To build and serve the documentation locally:
164 |
165 | ```bash
166 | hatch run docs:dev
167 | ```
168 |
169 | To release the documentation with the current project version and set it to the latest:
170 |
171 | ```bash
172 | hatch run docs:gh-release
173 | ```
174 |
175 | ## Releasing
176 |
177 | We currently use [hatch](https://github.com/pypa/hatch) for building and distributing ``dag-factory``.
178 |
179 | We use GitHub Actions to create and deploy new releases. To create a new release, first update the package version.
180 |
181 | It is possible to update the version either by using hatch:
182 |
183 | > Note: You can update the version in several different ways. To learn more, check out the [hatch docs](https://hatch.pypa.io/latest/version/#updating).
184 |
185 | ```bash
186 | hatch version minor
187 | ```
188 |
189 | Or by manually updating the value of `__version__` in `dagfactory/__init__.py`.
190 |
191 | Make sure the [CHANGELOG file](https://github.com/astronomer/dag-factory/blob/main/CHANGELOG.md) is up-to-date.
192 |
193 | Create a release using the [GitHub UI](https://github.com/astronomer/dag-factory/releases/new). The release workflow will publish the package directly to [PyPI](https://pypi.org/project/dag-factory/).
194 |
195 | If you're a [project maintainer in PyPI](https://pypi.org/project/dag-factory/), it is also possible to create a release manually,
196 | by authenticating to PyPI and running the commands:
197 |
198 | ```bash
199 | hatch build
200 | hatch publish
201 | ```
202 |
--------------------------------------------------------------------------------
/docs/contributing/roles.md:
--------------------------------------------------------------------------------
1 | # Contributor roles
2 |
3 | Contributors are welcome and are greatly appreciated! Every little bit helps, and we give credit to them.
4 |
5 | This document aims to explain the current roles in the DAG Factory project.
6 | For more information, check the [contributing docs](howto.md).
7 |
8 | ## Contributors
9 |
10 | A contributor is anyone who wants to contribute code, documentation, tests, ideas, or anything to the DAG Factory project.
11 |
12 | DAG Factory contributors are listed in the Github [insights page](https://github.com/astronomer/dag-factory/graphs/contributors).
13 |
14 | Contributors are responsible for:
15 |
16 | * Fixing bugs
17 | * Refactoring code
18 | * Improving processes and tooling
19 | * Adding features
20 | * Improving the documentation
21 |
22 | ## Committers
23 |
24 | Committers are community members with write access to the [DAG Factory GitHub repository](https://github.com/astronomer/dag-factory).
25 | They can modify the code and the documentation and accept others' contributions to the repo.
26 |
27 | Check [contributors](contributors.md) for the official list of DAG Factory committers.
28 |
29 | Committers have the same responsibilities as standard contributors and also perform the following actions:
30 |
31 | * Reviewing & merging pull-requests
32 | * Scanning and responding to GitHub issues, helping triaging them
33 |
34 | If you know you are not going to be able to contribute for a long time (for instance, due to a change of job or circumstances), you should inform other maintainers, and we will mark you as "emeritus".
35 | Emeritus committers will no longer have write access to the repo.
36 | As merit earned never expires, once an emeritus committer becomes active again, they can simply email another maintainer from Astronomer and ask to be reinstated.
37 |
38 | ### Pre-requisites to becoming a committer
39 |
40 | General prerequisites that we look for in all candidates:
41 |
42 | 1. Consistent contributions over the last few months
43 | 2. Visibility in discussions on the Slack channel or GitHub issues/discussions
44 | 3. Contributions to community health and the project's long-term sustainability
45 | 4. Understanding of the project's [contributors' guidelines](howto.md)

46 | Astronomer is responsible and accountable for releasing new versions of DAG Factory to [PyPI](https://pypi.org/project/dag-factory/), following the [milestones](https://github.com/astronomer/dag-factory/milestones).
47 | Astronomer has the right to grant and revoke write access permissions to the project's official repository for any reason it sees fit.
48 |
--------------------------------------------------------------------------------
/docs/features/callbacks.md:
--------------------------------------------------------------------------------
1 | # Callbacks
2 | DAG Factory supports the use of callbacks. These callbacks can be set at the DAG, TaskGroup, or Task level. The way
3 | callbacks can be configured for DAGs, TaskGroups, and Tasks differs slightly, and details can be
4 | found in the [Apache Airflow documentation](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/callbacks.html#).
5 |
6 | Within DAG Factory itself, there are three approaches to defining callbacks. The goal is to make this process
7 | intuitive and provide parity with the traditional DAG authoring experience. These approaches to configuring callbacks
8 | are outlined below, each with an example implementation. While the following examples are all defined for individual
9 | Tasks, callbacks can also be defined using `default_args`, or at the DAG and TaskGroup level.
10 |
11 | * [Passing a string that points to a callable](#passing-a-string-that-points-to-a-callable)
12 | * [Specifying a user-defined `.py` and the function within that file to be executed](#specifying-a-user-defined-py-file-and-function)
13 | * [Configuring callbacks from providers](#provider-callbacks)
14 |
15 |
16 | ## Passing a string that points to a callable
17 |
18 | The most traditional way of configuring callbacks is by defining a custom function within the Airflow project and
19 | assigning that callback to the desired Task. Using the syntax below, this can be implemented using DAG Factory. In this
20 | case, the `output_standard_message` function is a user-defined function stored in the `include/custom_callbacks.py`
21 | file. This function requires no additional parameters, and the YAML would take the form below.
22 |
23 | For this example to be implemented in DAG Factory, the `include/custom_callbacks.py` file must be on the Python
24 | `sys.path`. If this is not the case, the full path to the `.py` file and the callback name can be specified instead, as described in [Specifying a user-defined `.py` file and function](#specifying-a-user-defined-py-file-and-function).
25 |
26 | ```yaml
27 | ...
28 |
29 | task_1:
30 | operator: airflow.operators.bash_operator.BashOperator
31 | bash_command: "echo task_1"
32 | on_failure_callback: include.custom_callbacks.output_standard_message
33 | ...
34 | ```
35 |
36 | Sometimes, a function may have parameters that need to be defined within the Task itself. Here, the
37 | `output_custom_message` callback takes two keyword arguments: `param1` and `param2`. These values are defined in the
38 | YAML itself, offering DAG Factory authors an additional degree of flexibility.
39 |
40 | ```yaml
41 | ...
42 |
43 | task_2:
44 | operator: airflow.operators.bash_operator.BashOperator
45 | bash_command: "echo task_2"
46 | on_success_callback:
47 | callback: include.custom_callbacks.output_custom_message
48 | param1: "Task status"
49 | param2: "Successful!"
50 | ...
51 | ```
52 |
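For reference, a minimal `include/custom_callbacks.py` backing the two examples above might look like the sketch below. The function bodies are illustrative assumptions, not the project's actual file; the only requirement is that Airflow can import the callables and hand them the task context.

```python
# include/custom_callbacks.py -- illustrative sketch, not an actual project file
import logging

logger = logging.getLogger(__name__)


def output_standard_message(context=None, **kwargs):
    # Airflow passes the task context to callbacks; this example simply logs a fixed message.
    logger.info("Callback fired for task instance: %s", (context or {}).get("task_instance"))


def output_custom_message(context=None, param1=None, param2=None, **kwargs):
    # param1 and param2 are the keyword arguments supplied under `callback:` in the YAML above.
    logger.info("%s: %s", param1, param2)
```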
53 |
54 | ## Specifying a user-defined `.py` file and function
55 |
56 | In addition to passing a string that points to a callback, the full path to the file and name of the callback can be
57 | specified for a DAG, TaskGroup, or Task. This provides a viable option for defining a callback when the directory the
58 | `.py` file is stored in is not on the Python path.
59 |
60 | ```yaml
61 | ...
62 |
63 | task_3:
64 | operator: airflow.operators.bash_operator.BashOperator
65 | bash_command: "echo task_3"
66 | on_retry_callback_name: output_standard_message
67 | on_retry_callback_file: /usr/local/airflow/include/custom_callbacks.py
68 | ...
69 | ```
70 |
71 | Note that this method for defining callbacks in DAG Factory does not allow for parameters to be passed to the callable
72 | within the YAML itself.
73 |
74 |
75 | ## Provider callbacks
76 |
77 | In addition to custom-built callbacks, there are a number of provider-built callbacks that can be used when defining a
78 | DAG. With DAG Factory, these callbacks can be configured similarly to how they would be when authoring a traditional DAG.
79 | First, the type of callback is specified (`on_success_callback`, `on_failure_callback`, etc.). The `callback` key-value
80 | pair specifies the provider-built function to be executed. Then, the specific keyword arguments the callback takes can
81 | be specified, as shown below.
82 |
83 | Note that the provider package being used must be available on the Python `sys.path`, meaning it may need to be
84 | installed with `pip`.
85 |
86 | ```yaml
87 | ...
88 | task_4:
89 | operator: airflow.operators.bash_operator.BashOperator
90 | bash_command: "echo task_4"
91 | on_failure_callback:
92 | callback: airflow.providers.slack.notifications.slack.send_slack_notification
93 | slack_conn_id: slack_conn_id
94 | text: |
95 | :red_circle: Task Failed.
96 | This task has failed and needs to be addressed.
97 | Please remediate this issue ASAP.
98 | channel: "#channel"
99 | ...
100 | ```
101 |
--------------------------------------------------------------------------------
/docs/features/custom_operators.md:
--------------------------------------------------------------------------------
1 | # Custom Operators
2 |
3 | **DAG-Factory** supports [custom operators](https://airflow.apache.org/docs/apache-airflow/stable/howto/custom-operator.html). To leverage this feature, set the path to the custom operator in the `operator` key in the configuration file. You can add any additional parameters that the custom operator requires.
4 |
5 | ```yaml
6 | ...
7 | tasks:
8 | begin:
9 | operator: airflow.operators.empty.EmptyOperator
10 | make_bread_1:
11 | operator: customized.operators.breakfast_operators.MakeBreadOperator
12 | bread_type: 'Sourdough'
13 | ```
14 |
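For context, a custom operator like `MakeBreadOperator` could be implemented roughly as follows. This is a hedged sketch of the general pattern, not the repository's actual `breakfast_operators.py`.

```python
# Illustrative sketch of a custom operator such as MakeBreadOperator (assumed implementation).
from airflow.models.baseoperator import BaseOperator


class MakeBreadOperator(BaseOperator):
    def __init__(self, bread_type: str, **kwargs):
        # Extra YAML keys (e.g. `bread_type`) are passed straight into the operator's constructor.
        super().__init__(**kwargs)
        self.bread_type = bread_type

    def execute(self, context):
        self.log.info("Making %s bread", self.bread_type)
```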
15 | 
16 |
--------------------------------------------------------------------------------
/docs/features/datasets.md:
--------------------------------------------------------------------------------
1 | # Datasets
2 | DAG Factory supports Airflow’s [Datasets](https://airflow.apache.org/docs/apache-airflow/stable/authoring-and-scheduling/datasets.html).
3 |
4 | ## Datasets Outlets and Inlets
5 |
6 | To leverage datasets, you need to specify the `Dataset` in the `outlets` and `inlets` keys in the configuration file.
7 | The `outlets` and `inlets` keys should contain a list of strings representing dataset locations.
8 | In the `schedule` key of the consumer DAG, you can set the `Dataset` that the DAG should be scheduled against. The key
9 | should contain a list of dataset locations.
10 | The consumer DAG will run when all the specified datasets become available.
11 |
12 | #### Example: Outlet and Inlet
13 |
14 | ```title="example_dag_datasets_outlet_inlet.yml"
15 | --8<-- "dev/dags/datasets/example_dag_datasets_outlet_inlet.yml"
16 | ```
17 |
18 | 
19 |
20 | ## Conditional Dataset Scheduling
21 |
22 | #### Minimum Requirements:
23 | * dag-factory 0.22.0+
24 | * [Apache Airflow® 2.9+](https://www.astronomer.io/docs/learn/airflow-datasets/#conditional-dataset-scheduling)
25 |
26 |
27 | #### Logical operators for datasets
28 | Airflow supports two logical operators for combining dataset conditions:
29 |
30 | * AND (``&``): Specifies that the DAG should be triggered only after all of the specified datasets have been updated.
31 | * OR (``|``): Specifies that the DAG should be triggered when any of the specified datasets is updated.
32 |
33 | These operators enable you to configure your Airflow workflows to use more complex dataset update conditions, making them more dynamic and flexible.
34 |
35 | #### Examples of Conditional Dataset Scheduling
36 |
37 | Below are examples demonstrating how to configure a consumer DAG using conditional dataset scheduling.
38 |
39 | ##### Example 1: String Condition
40 |
41 | ```title="example_dataset_condition_string.yml"
42 | --8<-- "dev/dags/datasets/example_dataset_condition_string.yml"
43 | ```
44 |
45 | ##### Example 2: YAML Syntax
46 |
47 | ```title="example_dataset_yaml_syntax.yml"
48 | --8<-- "dev/dags/datasets/example_dataset_yaml_syntax.yml"
49 | ```
50 |
51 | ---
52 |
53 | #### Visualization
54 |
55 | The following diagrams illustrate the dataset conditions described in the example configurations:
56 |
57 | 1. **`s3://bucket-cjmm/raw/dataset_custom_1`** and **`s3://bucket-cjmm/raw/dataset_custom_2`** must both be updated for the first condition to be satisfied.
58 | 2. Alternatively, **`s3://bucket-cjmm/raw/dataset_custom_3`** alone can satisfy the condition.
59 |
60 | 
61 | 
62 |
--------------------------------------------------------------------------------
/docs/features/dynamic_tasks.md:
--------------------------------------------------------------------------------
1 | # Dynamic tasks
2 |
3 | DAG Factory supports Airflow’s
4 | [Dynamic Task Mapping](https://airflow.apache.org/docs/apache-airflow/stable/authoring-and-scheduling/dynamic-task-mapping.html),
5 | enabling workflows to dynamically create tasks at runtime. This approach allows the number of tasks to be determined
6 | during execution, usually based on the outcome of a preceding task, rather than being predefined during DAG authoring.
7 |
8 | ## Example: Defining Dynamic Tasks
9 |
10 | Below is an example configuration for implementing dynamic tasks using DAG Factory:
11 |
12 | ```title="example_dynamic_task_mapping.yml"
13 | --8<-- "dev/dags/example_dynamic_task_mapping.yml"
14 | ```
15 |
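The configuration above references a `make_list` callable from `expand_tasks.py`. As a rough sketch (not the repository's exact code), such a callable only needs to return the list that drives the mapping:

```python
# Illustrative sketch of the kind of callable the `request` task could point at.
def make_list():
    # Each element of the returned list becomes one mapped `process` task instance.
    return [1, 2, 3, 4]
```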
16 | ### Explanation of the Configuration
17 |
18 | 1. `request` Task:
19 | - Generates a list of items using the `make_list` function from the [expand_tasks.py](https://github.com/astronomer/dag-factory/blob/main/dev/dags/expand_tasks.py) module.
20 | - This task serves as the input provider for the dynamically mapped tasks.
21 |
22 | 2. `process` Task:
23 | - Dynamically generates one task for each item in the list produced by the `request` task.
24 | - The `expand` argument is used to create these tasks at runtime, with `request.output` supplying the input list.
25 | - Additionally, the `partial` argument is used to specify fixed parameters (`op_kwargs`) that are applied to all dynamically generated tasks.
26 |
27 | ### How It Works
28 |
29 | - Dynamic Task Creation:
30 | The `expand` keyword allows the process task to spawn multiple tasks at runtime, each processing a single item from
31 | the list output of the `request` task.
32 |
33 | - Fixed Parameters:
34 | The `partial` keyword ensures that common parameters, such as `fixed_param`, are passed to every dynamically created
35 | task instance.
36 |
37 | ### Benefits of Dynamic Task Mapping with DAG Factory
38 |
39 | - Flexibility: Handle varying input sizes and conditions dynamically without modifying the DAG definition.
40 | - Scalability: Efficiently process large datasets by leveraging Airflow’s parallel execution capabilities.
41 | - Simplicity: Define dynamic workflows declaratively using YAML, minimizing boilerplate code.
42 |
43 | ### Airflow mapped tasks view
44 |
45 | Below, you can see a list of mapped tasks generated dynamically as part of the `process` task.
46 |
47 | 
48 |
49 | ## Advanced Dynamic Task Mapping with DAG Factory
50 |
51 | Below, we explain the different methods for defining dynamic task mapping, illustrated by the provided example configuration.
52 |
53 | ```title="Dynamic Task Mapping advanced usage"
54 | --8<-- "dev/dags/example_taskflow.yml"
55 | ```
56 |
57 | The example above illustrates advanced usage of Dynamic Task Mapping using DAG Factory (the callable functions
58 | used in the example are kept in [sample.py](https://github.com/astronomer/dag-factory/blob/main/dev/dags/sample.py); a rough sketch of these callables follows the list below):
59 |
60 | 1. **Static Input Mapping**
61 |
62 | The task `double_number_with_dynamic_task_mapping_static` shows how dynamic tasks can be created using static lists
63 | as input. Three tasks are created, each processing one number.
64 |
65 | 2. **Task-Generated Input Mapping**
66 |
67 | The task `double_number_with_dynamic_task_mapping_taskflow` shows how tasks can use outputs from other tasks as
68 | input for dynamic task mapping. The prefix `+` tells DAG Factory to resolve this value as the task `numbers_list`,
69 | previously defined.
70 |
71 | 3. **Mapping with Multiple Inputs**
72 |
73 | The task `multiply_with_multiple_parameters` shows how dynamic task mapping can combine outputs from multiple tasks
74 | as input parameters.
75 |
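As a rough sketch only, the callables behind these three patterns could have signatures like the ones below. The names and bodies are assumptions based on the task names above, not the actual contents of `sample.py`.

```python
# Hypothetical callables matching the mapping patterns above (assumed, not the actual sample.py).
def numbers_list():
    # Output referenced via the `+` prefix for task-generated input mapping.
    return [2, 4, 6]


def double_number(number):
    # Called once per mapped element, whether the input list is static or task-generated.
    return number * 2


def multiply(a, b):
    # Combines outputs from multiple upstream tasks when mapping over several parameters.
    return a * b
```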
76 | ## Named Mapping in Dynamic Tasks with DAG Factory
77 |
78 | Starting with Airflow 2.9, the `map_index_template` feature allows custom map index names for dynamic tasks based on a
79 | user-defined key. DAG Factory fully supports this feature, enabling users to name tasks dynamically in a meaningful way
80 | during runtime. This can be useful for tracing and debugging tasks.
81 |
82 | Below is an example of how to configure and use custom names for mapped tasks:
83 |
84 | ```title="example_map_index_template.yml"
85 | --8<-- "dev/dags/example_map_index_template.yml"
86 | ```
87 |
88 | ### How it works
89 |
90 | 1. map_index_template:
91 | Customizes the naming of dynamically mapped tasks using a Jinja2 expression. In this example, it uses
92 | `custom_mapping_key` from the task context to define task names.
93 | 2. expand:
94 | Dynamically generates tasks for each entry in the `full_name` list
95 | - Lucy Black
96 | - Vera Santos
97 | - Marks Spencer
98 | 3. Dynamic Task Naming:
99 | The `custom_mapping_key` is set to the first name of each person, e.g., Lucy, Vera, and Marks, using the callable
100 | function `extract_last_name`. This callable function is kept in [sample.py](https://github.com/astronomer/dag-factory/blob/main/dev/dags/sample.py).
101 |
102 | ### Airflow named mapped tasks view
103 |
104 | The image below shows that, with the above configuration, the `map_index` of each mapped task is set to the person's first name.
105 |
106 | 
107 |
108 | ## Scope and limitations
109 |
110 | The Airflow documentation on [dynamic task mapping](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html)
111 | provides various examples of this feature. While the previous sections have discussed the forms supported by DAG
112 | Factory, it’s important to note the scenarios that have not been tested or are known to be unsupported.
113 |
114 | The following cases are tested and expected to work (you can refer to previous sections on how to use them with DAG Factory):
115 |
116 | - [Simple mapping](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#simple-mapping)
117 | - [Task-generated mapping](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#task-generated-mapping)
118 | - [Repeated mapping](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#repeated-mapping)
119 | - [Adding parameters that do not expand (partial)](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#adding-parameters-that-do-not-expand)
120 | - [Mapping over multiple parameters](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#mapping-over-multiple-parameters)
121 | - [Named mapping (map_index_template)](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#named-mapping)
122 |
123 | The following cases are untested but are expected to work:
124 |
125 | - [Mapping with non-TaskFlow operators](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#mapping-with-non-taskflow-operators)
126 | - [Mapping over the result of classic operators](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#mapping-over-result-of-classic-operators)
127 | - [Filtering items from a mapped task](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#filtering-items-from-a-mapped-task)
128 |
129 | The following cases are untested and may not work:
130 |
131 | - [Assigning multiple parameters to a non-TaskFlow operator](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#assigning-multiple-parameters-to-a-non-taskflow-operator)
132 | - [Mapping over a task group](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#mapping-over-a-task-group)
133 | - [Transforming expanding data](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#transforming-expanding-data)
134 | - [Combining upstream data (aka “zipping”)](https://airflow.apache.org/docs/apache-airflow/2.10.3/authoring-and-scheduling/dynamic-task-mapping.html#combining-upstream-data-aka-zipping)
135 |
--------------------------------------------------------------------------------
/docs/features/http_task.md:
--------------------------------------------------------------------------------
1 | # HttpSensor
2 |
3 | **DAG-Factory** supports the HttpSensor from the `airflow.providers.http.sensors.http` package.
4 |
5 | The example below demonstrates providing the `response_check` logic in a Python file:
6 |
7 | ```yaml
8 | task_2:
9 | operator: airflow.providers.http.sensors.http.HttpSensor
10 | http_conn_id: 'test-http'
11 | method: 'GET'
12 | response_check_name: check_sensor
13 | response_check_file: /path/to/example1/http_conn.py
14 | dependencies: [task_1]
15 | ```
16 |
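For illustration, the `check_sensor` callable referenced above might be as simple as the sketch below; the exact logic and file location are user-defined.

```python
# Illustrative sketch of a response_check callable for HttpSensor (user-defined logic).
def check_sensor(response):
    # Return True to stop poking; returning False makes the sensor keep retrying until it times out.
    return "ok" in response.text
```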
17 | The `response_check` logic can also be provided as a lambda:
18 |
19 | ```yaml
20 | task_2:
21 | operator: airflow.providers.http.sensors.http.HttpSensor
22 | http_conn_id: 'test-http'
23 | method: 'GET'
24 | response_check_lambda: 'lambda response: "ok" in response.text'
25 | dependencies: [task_1]
26 | ```
27 |
--------------------------------------------------------------------------------
/docs/features/multiple_configuration_files.md:
--------------------------------------------------------------------------------
1 | # Multiple Configuration Files
2 |
3 | If you want to split your DAG configuration into multiple files, **DAG-Factory** lets you do so by leveraging a suffix in the configuration file names.
4 |
5 | ```python
6 | from dagfactory import load_yaml_dags # load relevant YAML files as airflow DAGs
7 |
8 | load_yaml_dags(globals_dict=globals(), suffix=['dag.yaml'])
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/getting-started/quick-start-airflow-standalone.md:
--------------------------------------------------------------------------------
1 | # DAG Factory: Quick Start Guide With Airflow
2 |
3 | **DAG Factory** is a Python library for [Apache Airflow®](https://airflow.apache.org) that simplifies DAG creation using declarative YAML configuration files instead of Python.
4 |
5 | ## Prerequisites
6 |
7 | The minimum requirements for **dag-factory** are:
8 |
9 | - Python 3.8.0+
10 | - [Apache Airflow®](https://airflow.apache.org) 2.3+
11 |
12 | ## Step 1: Create a Python Virtual Environment
13 |
14 | Create and activate a virtual environment:
15 |
16 | ```commandline
17 | python3 -m venv dagfactory_env
18 | source dagfactory_env/bin/activate
19 | ```
20 |
21 | ## Step 2: Install Apache Airflow
22 |
23 | Install [Apache Airflow®](https://airflow.apache.org):
24 |
25 | 1. Create a directory for your project and navigate to it:
26 |
27 | ```commandline
28 | mkdir dag-factory-quick-start && cd dag-factory-quick-start
29 | ```
30 |
31 | 2. Set the `AIRFLOW_HOME` environment variable:
32 |
33 | ```commandline
34 | export AIRFLOW_HOME=$(pwd)
35 | export AIRFLOW__CORE__LOAD_EXAMPLES=False
36 | ```
37 |
38 | 3. Install Apache Airflow:
39 |
40 | ```commandline
41 | pip install apache-airflow
42 | ```
43 |
44 | ## Step 3: Install DAG Factory
45 |
46 | Install the DAG Factory library in your virtual environment:
47 |
48 | ```commandline
49 | pip install dag-factory
50 | ```
51 |
52 | ## Step 4: Set Up the DAGS Folder
53 |
54 | Create a DAGs folder inside the `$AIRFLOW_HOME` directory, which is where your DAGs will be stored:
55 |
56 | ```commandline
57 | mkdir dags
58 | ```
59 |
60 | ## Step 5: Define a DAG in YAML
61 |
62 | **DAG Factory** uses YAML files to define DAG configurations. Create a file named `example_dag_factory.yml` in the `$AIRFLOW_HOME/dags` folder with the following content:
63 |
64 | ```title="example_dag_factory.yml"
65 | --8<-- "dev/dags/example_dag_factory.yml"
66 | ```
67 |
68 | ## Step 6: Generate the DAG from YAML
69 |
70 | Create a Python script named `example_dag_factory.py` in the `$AIRFLOW_HOME/dags` folder. This script will generate the DAG from the YAML configuration:
71 |
72 | ```title="example_dag_factory.py"
73 | --8<-- "dev/dags/example_dag_factory.py"
74 | ```
75 |
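If the snippet above is not rendered where you are reading this, the loader script is typically only a few lines. A minimal sketch, assuming you want to pick up every `.yml` file in the DAGs folder (the suffix value is an assumption), is:

```python
# example_dag_factory.py -- minimal loader sketch
from dagfactory import load_yaml_dags

# Register each matching YAML workflow file from the DAGs folder as an Airflow DAG.
load_yaml_dags(globals_dict=globals(), suffix=[".yml"])
```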
76 | ## Step 7: Start Airflow
77 |
78 | To start the Airflow environment with your DAG Factory setup, run the following command:
79 |
80 | ```commandline
81 | airflow standalone
82 | ```
83 |
84 | This will take a few minutes to set up. Once completed, you can access the Airflow UI and the generated DAG at `http://localhost:8080` 🚀.
85 |
86 | ## View Your Generated DAG
87 |
88 | Once Airflow is up and running, you can log in with the username `admin` and the password in `$AIRFLOW_HOME/standalone_admin_password.txt`. You should be able to see your generated DAG in the Airflow UI.
89 |
90 | ## Generated DAG
91 |
92 | 
93 |
94 | ## Graph View
95 |
96 | 
97 |
98 | Check out the [examples](https://github.com/astronomer/dag-factory/tree/main/dev/dags) for generating more advanced DAGs.
99 |
--------------------------------------------------------------------------------
/docs/getting-started/quick-start-astro-cli.md:
--------------------------------------------------------------------------------
1 | # DAG Factory: Quick Start Guide With Astro CLI
2 |
3 | **DAG Factory** is a Python library for [Apache Airflow®](https://airflow.apache.org) that simplifies DAG creation using declarative YAML configuration files instead of Python.
4 |
5 | ## Prerequisites
6 |
7 | The minimum requirements for **dag-factory** are:
8 |
9 | - Python 3.8.0+
10 | - [Astro CLI](https://www.astronomer.io/docs/astro/cli/overview/)
11 |
12 | ## Step 1: Initialize Airflow Project
13 |
14 | Create a new directory and initialize your Astro CLI project:
15 |
16 | ```commandline
17 | mkdir dag-factory-quick-start && cd dag-factory-quick-start
18 |
19 | astro dev init
20 | ```
21 |
22 | This will set up the necessary Airflow files and directories.
23 |
24 | ## Step 2: Install DAG Factory
25 |
26 | Install DAG Factory in your Airflow environment:
27 |
28 | 1. Add dag-factory as a dependency to the `requirements.txt` file created during the project initialization.
29 |
30 | ## Step 3: Define a DAG in YAML
31 |
32 | **DAG Factory** uses YAML files to define DAG configurations. Create a file named `example_dag_factory.yml` in the `$AIRFLOW_HOME/dags` folder with the following content:
33 |
34 | ```title="example_dag_factory.yml"
35 | --8<-- "dev/dags/example_dag_factory.yml"
36 | ```
37 |
38 | ## Step 4: Generate the DAG from YAML
39 |
40 | Create a Python script named `example_dag_factory.py` in the `$AIRFLOW_HOME/dags` folder. This script will generate the DAG from the YAML configuration:
41 |
42 | ```title="example_dag_factory.py"
43 | --8<-- "dev/dags/example_dag_factory.py"
44 | ```
45 |
46 | ## Step 5: Start Airflow Project
47 |
48 | Once you've set up your YAML configuration and Python script, start the Airflow environment with the following command:
49 |
50 | ```commandline
51 | astro dev start
52 | ```
53 |
54 | This will take a few minutes to set up. Once completed, you can access the Airflow UI and the generated DAG at `http://localhost:8080` 🚀.
55 |
56 | ## View Your Generated DAG
57 |
58 | Once Airflow is up and running, you can log in with the username `admin` and the password `admin`. You should be able to see your generated DAG in the Airflow UI.
59 |
60 | ## Generated DAG
61 |
62 | 
63 |
64 | ## Graph View
65 |
66 | 
67 |
68 | Check out the [examples](https://github.com/astronomer/dag-factory/tree/main/dev/dags) for generating more advanced DAGs.
69 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # DAG Factory documentation
2 |
3 | Everything you need to know about how to build Apache Airflow® workflows using YAML files.
4 |
5 | ## Getting started
6 |
7 | Are you new to DAG Factory? This is the place to start!
8 |
9 | * DAG Factory at a glance
10 | * [Quickstart with Airflow standalone](getting-started/quick-start-airflow-standalone.md)
11 | * [Quickstart with Astro CLI](getting-started/quick-start-astro-cli.md)
12 | * [Using YAML instead of Python](./comparison/index.md)
13 | * [Traditional Airflow Operators](./comparison/traditional_operators.md)
14 | * [TaskFlow API](./comparison/taskflow_api.md)
15 |
16 | ## Configuration
17 |
18 | * [Configuring your workflows](configuration/configuring_workflows.md)
19 | * [Environment variables](configuration/environment_variables.md)
20 | * [Defaults](configuration/defaults.md)
21 |
22 | ## Features
23 |
24 | * [Dynamic tasks](features/dynamic_tasks.md)
25 | * [Datasets scheduling](features/datasets.md)
26 | * [Callbacks](features/callbacks.md)
27 | * [Custom operators](features/custom_operators.md)
28 | * [Multiple configuration files](features/multiple_configuration_files.md)
29 | * [HttpSensor](features/http_task.md)
30 |
31 | ## Getting help
32 |
33 | Having trouble? We'd like to help!
34 |
35 | * Report bugs, questions and feature requests in our [ticket tracker](https://github.com/astronomer/dag-factory/issues).
36 |
37 | ## Contributing
38 |
39 | DAG Factory is an Open-Source project. Learn about its development process and about how you can contribute:
40 |
41 | * [Contributing to DAG Factory](contributing/howto.md)
42 | * [Github repository](https://github.com/astronomer/dag-factory/)
43 |
44 | ## License
45 |
46 | To learn more about the terms and conditions for use, reproduction and distribution, read the [Apache License 2.0](https://github.com/astronomer/dag-factory/blob/main/LICENSE).
47 |
48 | ## Privacy Notice
49 |
50 | This project follows [Astronomer's Privacy Policy](https://www.astronomer.io/privacy/).
51 |
52 | For further information, [read this](https://github.com/astronomer/dag-factory/blob/main/PRIVACY_NOTICE.md).
53 |
54 | ## Security Policy
55 |
56 | Check the project's [Security Policy](https://github.com/astronomer/dag-factory/blob/main/SECURITY.md) to learn
57 | how to report security vulnerabilities in DAG Factory and how security issues reported to the DAG Factory
58 | security team are handled.
59 |
60 |
61 |
--------------------------------------------------------------------------------
/docs/static/example_dynamic_task_mapping.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_dynamic_task_mapping.png
--------------------------------------------------------------------------------
/docs/static/example_hackernews_dagfactory_code.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_hackernews_dagfactory_code.png
--------------------------------------------------------------------------------
/docs/static/example_hackernews_dagfactory_docs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_hackernews_dagfactory_docs.png
--------------------------------------------------------------------------------
/docs/static/example_hackernews_dagfactory_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_hackernews_dagfactory_graph.png
--------------------------------------------------------------------------------
/docs/static/example_hackernews_plain_airflow_code.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_hackernews_plain_airflow_code.png
--------------------------------------------------------------------------------
/docs/static/example_hackernews_plain_airflow_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_hackernews_plain_airflow_graph.png
--------------------------------------------------------------------------------
/docs/static/example_map_index_template.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_map_index_template.png
--------------------------------------------------------------------------------
/docs/static/example_pypi_stats_dagfactory_code.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_pypi_stats_dagfactory_code.png
--------------------------------------------------------------------------------
/docs/static/example_pypi_stats_dagfactory_docs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_pypi_stats_dagfactory_docs.png
--------------------------------------------------------------------------------
/docs/static/example_pypi_stats_dagfactory_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_pypi_stats_dagfactory_graph.png
--------------------------------------------------------------------------------
/docs/static/example_pypi_stats_dagfactory_mapped_tasks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_pypi_stats_dagfactory_mapped_tasks.png
--------------------------------------------------------------------------------
/docs/static/example_pypi_stats_plain_airflow_code.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_pypi_stats_plain_airflow_code.png
--------------------------------------------------------------------------------
/docs/static/example_pypi_stats_plain_airflow_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_pypi_stats_plain_airflow_graph.png
--------------------------------------------------------------------------------
/docs/static/example_pypi_stats_plain_airflow_mapped_tasks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/example_pypi_stats_plain_airflow_mapped_tasks.png
--------------------------------------------------------------------------------
/docs/static/images/airflow-dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/images/airflow-dag.png
--------------------------------------------------------------------------------
/docs/static/images/airflow-home.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/images/airflow-home.png
--------------------------------------------------------------------------------
/docs/static/images/custom_operators.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/images/custom_operators.png
--------------------------------------------------------------------------------
/docs/static/images/datasets/conditions/graph_conditional_dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/images/datasets/conditions/graph_conditional_dataset.png
--------------------------------------------------------------------------------
/docs/static/images/datasets/conditions/graph_conditional_dataset_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/images/datasets/conditions/graph_conditional_dataset_2.png
--------------------------------------------------------------------------------
/docs/static/images/datasets/outlets/datasets_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/docs/static/images/datasets/outlets/datasets_example.png
--------------------------------------------------------------------------------
/examples/dags:
--------------------------------------------------------------------------------
1 | ../dev/dags
--------------------------------------------------------------------------------
/img/mapped_tasks_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/img/mapped_tasks_example.png
--------------------------------------------------------------------------------
/img/quickstart_dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/img/quickstart_dag.png
--------------------------------------------------------------------------------
/img/quickstart_gantt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/img/quickstart_gantt.png
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: DAG Factory
2 | site_url: https://astronomer.github.io/dag-factory
3 | copyright: © Copyright 2025, Astronomer.
4 |
5 | repo_url: https://github.com/astronomer/dag-factory
6 | repo_name: astronomer/dag-factory
7 |
8 | edit_uri: "blob/main/docs"
9 |
10 | theme:
11 | name: material
12 | features:
13 | - announce.dismiss
14 | - content.action.edit
15 | - content.action.view
16 | - content.code.annotate
17 | - content.code.copy
18 | - content.tooltips
19 | - navigation.sections
20 | - navigation.tabs
21 | - navigation.footer
22 | - navigation.indexes
23 | - navigation.top
24 | - navigation.tracking
25 | - search.highlight
26 | - search.share
27 | - search.suggest
28 | - toc.follow
29 |
30 | extra:
31 | version:
32 | provider: mike
33 | alias: true
34 |
35 | markdown_extensions:
36 | - pymdownx.highlight:
37 | anchor_linenums: true
38 | line_spans: __span
39 | pygments_lang_class: true
40 | - pymdownx.inlinehilite
41 | - pymdownx.snippets:
42 | check_paths: true
43 | base_path: [ "." ]
44 | - pymdownx.superfences
45 |
46 | nav:
47 | - Home: index.md
48 | - Getting Started:
49 | - Airflow Standalone: getting-started/quick-start-airflow-standalone.md
50 | - Astro CLI: getting-started/quick-start-astro-cli.md
51 | - Configuration:
52 | - configuration/configuring_workflows.md
53 | - configuration/environment_variables.md
54 | - configuration/defaults.md
55 | - Features:
56 | - features/dynamic_tasks.md
57 | - features/datasets.md
58 | - features/callbacks.md
59 | - features/custom_operators.md
60 | - features/http_task.md
61 | - features/multiple_configuration_files.md
62 |
63 | - Comparison:
64 | - comparison/index.md
65 | - Traditional Airflow Operators: comparison/traditional_operators.md
66 | - TaskFlow API: comparison/taskflow_api.md
67 | - Contributing:
68 | - Code of Conduct: contributing/code_of_conduct.md
69 | - contributing/contributors.md
70 | - contributing/howto.md
71 | - contributing/roles.md
72 |
73 | plugins:
74 | - mike:
75 | alias_type: symlink
76 | - search
77 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling"]
3 | build-backend = "hatchling.build"
4 |
5 | [project]
6 | name = "dag-factory"
7 | dynamic = ["version"]
8 | description = "Dynamically build Apache Airflow DAGs from YAML files"
9 | authors = [{ name = "Astronomer", email = "humans@astronomer.io" }]
10 | readme = "README.md"
11 | license = "Apache-2.0"
12 | license-files = { paths = ["LICENSE"] }
13 | requires-python = ">=3.8"
14 | keywords = ["airflow", "apache-airflow", "provider", "astronomer", "dag"]
15 | classifiers = [
16 | "Development Status :: 5 - Production/Stable",
17 | "License :: OSI Approved :: Apache Software License",
18 | "Topic :: Database",
19 | "Framework :: Apache Airflow",
20 | "Intended Audience :: Developers",
21 | "Programming Language :: Python :: Implementation :: CPython",
22 | "Programming Language :: Python :: Implementation :: PyPy",
23 | "Programming Language :: Python :: 3",
24 | "Programming Language :: Python :: 3 :: Only",
25 | "Programming Language :: Python :: 3.8",
26 | "Programming Language :: Python :: 3.9",
27 | "Programming Language :: Python :: 3.10",
28 | "Programming Language :: Python :: 3.11",
29 | "Programming Language :: Python :: 3.12",
30 | ]
31 | dependencies = [
32 | "apache-airflow>=2.3",
33 | "apache-airflow-providers-http>=2.0.0",
34 | "apache-airflow-providers-cncf-kubernetes<10.4.2", # https://github.com/astronomer/dag-factory/issues/397
35 | "pyyaml",
36 | "packaging",
37 | ]
38 |
39 |
40 | [project.optional-dependencies]
41 | tests = [
42 | "apache-airflow-providers-slack",
43 | "pytest>=6.0",
44 | "pytest-cov",
45 | "pre-commit",
46 | ]
47 |
48 | ######################################
49 | # TESTING
50 | ######################################
51 |
52 | [tool.hatch.envs.tests]
53 | dependencies = [
54 | "dag-factory[tests]",
55 | "apache-airflow~={matrix:airflow}.0,!=2.9.0,!=2.9.1", # https://github.com/apache/airflow/pull/39670
56 | "httpx>=0.25.0",
57 | "pandas",
58 | ]
59 | pre-install-commands = ["sh scripts/test/pre-install-airflow.sh {matrix:airflow} {matrix:python}"]
60 |
61 | [[tool.hatch.envs.tests.matrix]]
62 | python = ["3.8", "3.9", "3.10", "3.11", "3.12"]
63 | airflow = ["2.3", "2.4", "2.5", "2.6", "2.7", "2.8", "2.9", "2.10"]
64 |
65 |
66 | [tool.hatch.envs.tests.scripts]
67 | freeze = "pip freeze"
68 | static-check = "pre-commit run --files dagfactory/*"
69 | test = 'sh scripts/test/unit.sh'
70 | test-cov = 'sh scripts/test/unit-cov.sh'
71 | test-integration = 'sh scripts/test/integration.sh'
72 | test-integration-setup = 'sh scripts/test/integration-setup.sh'
73 |
74 | [project.urls]
75 | Source = "https://github.com/astronomer/dag-factory"
76 |
77 | [tool.hatch.version]
78 | path = "dagfactory/__init__.py"
79 |
80 | [project.entry-points."airflow.plugins"]
81 | dagfactory = "dagfactory.plugin:DagFactoryPlugin"
82 |
83 | [tool.hatch.build]
84 | sources = ["."]
85 |
86 | [tool.hatch.build.targets.sdist]
87 | include = ["dagfactory"]
88 |
89 | [tool.hatch.build.targets.wheel]
90 | packages = ["dagfactory"]
91 |
92 | [tool.distutils.bdist_wheel]
93 | universal = true
94 |
95 | [tool.pytest.ini_options]
96 | filterwarnings = ["ignore::DeprecationWarning"]
97 | minversion = "6.0"
98 | markers = ["integration", "callbacks"]
99 |
100 | ######################################
101 | # DOCS
102 | ######################################
103 |
104 | [tool.hatch.envs.docs]
105 | dependencies = [
106 | "mkdocs",
107 | "mike",
108 | "pymdown-extensions",
109 | "mkdocs-material",
110 | ]
111 |
112 | [tool.hatch.envs.docs.scripts]
113 | dev = "mkdocs build && mkdocs serve" # For local development and preventing publishing
114 | gh-deploy = "python scripts/docs_deploy.py dev"
115 | gh-release = "python scripts/docs_deploy.py release"
116 |
117 | ######################################
118 | # THIRD PARTY TOOLS
119 | ######################################
120 |
121 | [tool.black]
122 | line-length = 120
123 | target-version = ['py39', 'py310', 'py311', 'py312']
124 |
125 | [tool.ruff]
126 | line-length = 120
127 |
128 | [tool.ruff.lint]
129 | select = ["C901", "D300", "I", "F"]
130 | ignore = ["F541", "C901"]
131 |
132 | [tool.ruff.lint.isort]
133 | combine-as-imports = true
134 | known-first-party = ["dagfactory", "tests"]
135 |
136 | [tool.ruff.lint.mccabe]
137 | max-complexity = 10
138 |
--------------------------------------------------------------------------------
/scripts/airflow3/.gitignore:
--------------------------------------------------------------------------------
1 | airflow.db-shm
2 | airflow.db-wal
3 | simple_auth_manager_passwords.json.generated
4 | venv-af3
5 |
--------------------------------------------------------------------------------
/scripts/airflow3/README.md:
--------------------------------------------------------------------------------
1 | # Run Airflow 3 Locally
2 |
3 | This guide walks you through setting up Apache Airflow 3 locally using pip. You can use either SQLite or Postgres as the Airflow metadata database backend.
4 |
5 | ## 1. Setup Postgres Container (Optional)
6 |
7 | By default, SQLite is used as the Airflow metadata database unless you update the AIRFLOW__DATABASE__SQL_ALCHEMY_CONN environment variable to point to PostgreSQL. The following command pulls the official Postgres image, creates a container named postgres, and exposes the required ports.
8 |
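If you choose PostgreSQL, override the connection string before starting Airflow. A minimal sketch, assuming the container, credentials, and `airflow_db` database created in the steps below (it mirrors the commented-out line in `scripts/airflow3/env.sh`):

```commandline
export AIRFLOW__DATABASE__SQL_ALCHEMY_CONN="postgresql+psycopg2://postgres:postgres@localhost:5432/airflow_db"
```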
9 | ### 1.1 Pull Postgres Image
10 |
11 | ```commandline
12 | docker run --name postgres -p 5432:5432 -p 5433:5433 -e POSTGRES_PASSWORD=postgres postgres
13 | ```
14 |
15 | ### 1.2 Access the PostgreSQL Console and Create the Database
16 |
17 | Now that the PostgreSQL container is running, you can connect to it from the command line using psql.
18 |
19 | ```commandline
20 | psql -U postgres -h localhost
21 | ```
22 |
23 | ### 1.3 Create the Database for Airflow
24 |
25 | Once you're inside the psql interactive terminal, you can create a new database that Airflow will use.
26 |
27 | ```commandline
28 | CREATE DATABASE airflow_db;
29 | ```
30 |
31 | ## 2. Setup Virtual Environment for Airflow 3
32 |
33 | You need to configure a virtual environment for Airflow 3.
34 |
35 | ### 2.1 Export ENV
36 |
37 | This exports the Airflow-related environment variables, such as AIRFLOW_HOME.
38 |
39 | ```commandline
40 | source scripts/airflow3/env.sh
41 | ```
42 |
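To confirm the variables were exported in your current shell, a quick check (both variables are set by `scripts/airflow3/env.sh`):

```commandline
echo "$AIRFLOW_HOME"
echo "$AIRFLOW__DATABASE__SQL_ALCHEMY_CONN"
```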
43 | ## 3. Install Dependencies
44 |
45 | ```commandline
46 | sh scripts/airflow3/setup.sh
47 | ```
48 |
49 | ## 4. Run Airflow in Standalone Mode
50 |
51 | Activate the virtual environment created in the previous step and run Airflow:
52 |
53 | ```commandline
54 | source "$(pwd)/scripts/airflow3/venv-af3/bin/activate"
55 |
56 | airflow standalone
57 | ```
58 |
59 | This command will:
60 |
61 | - Set the necessary environment variables (like AIRFLOW_HOME).
62 | - Initialize the Airflow database.
63 | - Start the Airflow webserver, scheduler, and triggerer.
64 |
65 | ## 5. Run Airflow Tests
66 |
67 | Once Airflow is running, you can also run tests.
68 |
69 | ```commandline
70 | source scripts/airflow3/env.sh
71 |
72 | source "$(pwd)/scripts/airflow3/venv-af3/bin/activate"
73 |
74 | sh scripts/airflow3/tests.sh
75 | ```
76 |
77 | ## 6. Access the Airflow Web Interface
78 |
79 | After running the standalone command, you can access the Airflow web interface to monitor the status of your DAGs, tasks, and more.
80 |
81 | - The web interface should be available at [Localhost Server](http://localhost:8080)
82 |
83 | ## 7. Install Airflow from the Main Branch
84 |
85 | If you want to install Airflow from the main branch, follow the steps from sections 1, 2, and 3 above. Then, proceed with the following steps:
86 |
87 | ### 7.1 Set ENV AIRFLOW_REPO_DIR
88 |
89 | Set the `AIRFLOW_REPO_DIR` environment variable in scripts/airflow3/env.sh to the path where your Airflow repository is cloned.
90 |
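For example, the default in `scripts/airflow3/env.sh` assumes the Airflow repository is cloned next to this project; adjust the path to your own clone:

```commandline
export AIRFLOW_REPO_DIR="$PWD/../airflow"
```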
91 | ### 7.2 Activate the Virtual Environment
92 |
93 | ```commandline
94 | source scripts/airflow3/env.sh
95 |
96 | source "$(pwd)/scripts/airflow3/venv-af3/bin/activate"
97 | ```
98 |
99 | ### 7.3 Install Airflow from the Main Branch
100 |
101 | ```commandline
102 | sh scripts/airflow3/install_from_main.sh
103 | ```
104 |
105 | ### 7.4 Run Airflow standalone
106 |
107 | Finally, run Airflow in standalone mode again:
108 |
109 | ```commandline
110 | airflow standalone
111 | ```
112 |
--------------------------------------------------------------------------------
/scripts/airflow3/dags/example_dag_factory.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | # The following import is here so Airflow parses this file
5 | # from airflow import DAG
6 | import dagfactory
7 |
8 | DEFAULT_CONFIG_ROOT_DIR = os.getenv("DEFAULT_CONFIG_ROOT_DIR", "/usr/local/airflow/dags/")
9 |
10 | CONFIG_ROOT_DIR = Path(os.getenv("CONFIG_ROOT_DIR", DEFAULT_CONFIG_ROOT_DIR))
11 |
12 | config_file = str(CONFIG_ROOT_DIR / "example_dag_factory.yml")
13 |
14 | example_dag_factory = dagfactory.DagFactory(config_file)
15 |
16 | # Remove any stale DAGs and build the DAGs defined in the YAML config
17 | example_dag_factory.clean_dags(globals())
18 | example_dag_factory.generate_dags(globals())
19 |
--------------------------------------------------------------------------------
/scripts/airflow3/dags/example_dag_factory.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 | catchup: false
4 | start_date: 2024-11-11
5 |
6 | # ----8<--- [ start: example_dag_yaml_configuration ]
7 | basic_example_dag:
8 | default_args:
9 | owner: "custom_owner"
10 | description: "this is an example dag"
11 | schedule: "0 3 * * *"
12 | render_template_as_native_obj: True
13 | tasks:
14 | task_1:
15 | operator: airflow.providers.standard.operators.bash.BashOperator
16 | bash_command: "echo 1"
17 | task_2:
18 | operator: airflow.providers.standard.operators.bash.BashOperator
19 | bash_command: "echo 2"
20 | dependencies: [task_1]
21 | task_3:
22 | operator: airflow.providers.standard.operators.bash.BashOperator
23 | bash_command: "echo 3"
24 | dependencies: [task_1]
25 | # ----8<--- [ end: example_dag_yaml_configuration ]
26 |
--------------------------------------------------------------------------------
/scripts/airflow3/env.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | PYTHONPATH="$PWD"
6 | export PYTHONPATH
7 | AIRFLOW_HOME="$PWD/scripts/airflow3"
8 | export AIRFLOW_HOME
9 | export AIRFLOW__LOGGING__BASE_LOG_FOLDER="$AIRFLOW_HOME/logs"
10 | export AIRFLOW__WEBSERVER__CONFIG_FILE="$AIRFLOW_HOME/webserver_config.py"
11 | export AIRFLOW__SCHEDULER__CHILD_PROCESS_LOG_DIRECTORY="$AIRFLOW_HOME/logs/scheduler"
12 | # Comment below line to use the Postgres database backend.
13 | export AIRFLOW__DATABASE__SQL_ALCHEMY_CONN="sqlite:///$AIRFLOW_HOME/airflow.db"
14 | # Uncomment below line to use the Postgres database backend.
15 | # export AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://postgres:postgres@localhost:5432/airflow_db
16 | export AIRFLOW__CORE__LOAD_EXAMPLES=false
17 | export AIRFLOW__CORE__DAGBAG_IMPORT_ERROR_TRACEBACK_DEPTH=10
18 | export AIRFLOW__CORE__DAGBAG_IMPORT_TIMEOUT=300
19 | # export AIRFLOW__LOGGING__LOGGING_LEVEL=DEBUG
20 | export AIRFLOW_REPO_DIR="$PWD/../airflow"
21 | export DEFAULT_CONFIG_ROOT_DIR="$AIRFLOW_HOME/dags"
22 |
--------------------------------------------------------------------------------
/scripts/airflow3/install_from_main.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -v
4 | set -x
5 | set -e
6 |
7 | : "${AIRFLOW_REPO_DIR:?Environment variable AIRFLOW_REPO_DIR is not set}"
8 | echo "AIRFLOW_REPO_DIR is set to '$AIRFLOW_REPO_DIR'"
9 |
10 | DAG_FACTORY_ROOT="$PWD"
11 |
12 | cd "$AIRFLOW_REPO_DIR"
13 | git checkout main && git pull
14 |
15 | pip uninstall -y apache-airflow-core
16 | pip uninstall -y apache-airflow-task-sdk
17 | pip uninstall -y apache-airflow-providers-fab
18 | pip uninstall -y apache-airflow
19 | pip uninstall -y apache-airflow-providers-git
20 |
21 | rm -rf dist
22 |
23 | pip install uv
24 |
25 | pip install -e "$AIRFLOW_REPO_DIR/dev/breeze" --force
26 |
27 | breeze release-management prepare-provider-distributions \
28 | --distributions-list celery,common.io,common.compat,fab,standard,openlineage,git \
29 | --distribution-format wheel
30 |
31 | breeze release-management prepare-airflow-distributions --distribution-format wheel
32 |
33 | cd task-sdk
34 | uv build --package apache-airflow-task-sdk --wheel
35 |
36 | cd ..
37 |
38 | pip install dist/*
39 |
40 | cd "$DAG_FACTORY_ROOT"
41 |
--------------------------------------------------------------------------------
/scripts/airflow3/requirements.txt:
--------------------------------------------------------------------------------
1 | apache-airflow
2 | apache-airflow-task-sdk
3 | apache-airflow-providers-standard
4 | apache-airflow-providers-fab
5 | psycopg2
6 | asyncpg
7 |
--------------------------------------------------------------------------------
/scripts/airflow3/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit on error
4 | set -e
5 |
6 | # Create a Python virtual environment for Airflow 3 (you can change the location as needed)
7 | echo "Creating virtual environment at $(pwd)/scripts/airflow3/venv-af3"
8 | python3 -m venv "$(pwd)/scripts/airflow3/venv-af3"
9 |
10 | # Activate the virtual environment
11 | echo "Activating virtual environment..."
12 | source "$(pwd)/scripts/airflow3/venv-af3/bin/activate"
13 |
14 | # Install dependencies in the virtual environment
15 | echo "Installing dependencies..."
16 | pip3 install --pre -r "$(pwd)/scripts/airflow3/requirements.txt"
17 |
18 | pip3 install ".[tests]"
19 |
20 | echo "Virtual environment setup and dependencies installed successfully!"
21 |
--------------------------------------------------------------------------------
/scripts/airflow3/tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -x
4 |
5 | set -e
6 |
7 | airflow dags list-import-errors
8 |
--------------------------------------------------------------------------------
/scripts/docs_deploy.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import sys
3 |
4 | from packaging import version
5 |
6 | import dagfactory
7 |
8 |
9 | def deploy_docs(deploy_type: str):
10 | _version = version.parse(dagfactory.__version__)
11 |
12 | set_default = False
13 |
14 | if deploy_type == "release":
15 | if _version.pre is not None:
16 | command = ["mike", "deploy", "--push", "dev"]
17 | else:
18 | command = ["mike", "deploy", "--push", "--update-aliases", str(_version), "latest"]
19 | set_default = True
20 | else:
21 | command = ["mike", "deploy", "--push", "dev"]
22 |
23 | try:
24 | subprocess.run(command, capture_output=True, text=True, check=True)
25 | if set_default:
26 | default_command = ["mike", "set-default", "latest"]
27 | subprocess.run(default_command, capture_output=True, text=True, check=True)
28 | except subprocess.CalledProcessError as e:
29 | raise Exception(f"Error deploying: {e.stderr}")
30 |
31 |
32 | if __name__ == "__main__":
33 | if len(sys.argv) < 2:
34 | raise Exception("Argument deploy type is required: 'dev' or 'release'")
35 |
36 | deploy_type = sys.argv[1]
37 |
38 | if deploy_type not in ["dev", "release"]:
39 | raise Exception("Invalid argument provided. Valid deploy types are 'dev' or 'release'.")
40 |
41 | deploy_docs(deploy_type)
42 |
--------------------------------------------------------------------------------
/scripts/test/integration-setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -v
4 | set -x
5 | set -e
6 |
7 | if [ -L "dags" ]; then
8 | echo "Symbolic link 'dags' already exists."
9 | elif [ -e "dags" ]; then
10 | echo "'dags' exists but is not a symbolic link. Please resolve this manually."
11 | else
12 | ln -s dev/dags dags
13 | echo "Symbolic link 'dags' created successfully."
14 | fi
15 |
16 | rm -rf airflow.*
17 | pip freeze | grep airflow
18 | airflow db reset -y
19 | airflow db init
20 |
--------------------------------------------------------------------------------
/scripts/test/integration.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -x
4 | set -e
5 |
6 |
7 | pip freeze | grep airflow
8 | echo $AIRFLOW_HOME
9 | ls $AIRFLOW_HOME
10 |
11 | airflow db check
12 |
13 | # Necessary for overcoming the following issue with Airflow 2.3 and 2.4:
14 | # ImportError: Pandas requires version '0.9.0' or newer of 'tabulate' (version '0.8.9' currently installed)
15 | pip install "tabulate>=0.9.0"
16 |
17 | pytest -vv \
18 | --cov=dagfactory \
19 | --cov-report=term-missing \
20 | --cov-report=xml \
21 | -m integration
22 |
--------------------------------------------------------------------------------
/scripts/test/pre-install-airflow.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | AIRFLOW_VERSION="$1"
4 | PYTHON_VERSION="$2"
5 |
6 | # Use this to set the appropriate Python environment in GitHub Actions,
7 | # while also not assuming --system when running locally.
8 | if [ "$GITHUB_ACTIONS" = "true" ] && [ -z "${VIRTUAL_ENV}" ]; then
9 | py_path=$(which python)
10 | virtual_env_dir=$(dirname "$(dirname "$py_path")")
11 | export VIRTUAL_ENV="$virtual_env_dir"
12 | fi
13 |
14 | echo "${VIRTUAL_ENV}"
15 |
16 | CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-$AIRFLOW_VERSION.0/constraints-$PYTHON_VERSION.txt"
17 | curl -sSL $CONSTRAINT_URL -o /tmp/constraint.txt
18 | # Workaround to remove PyYAML constraint that will work on both Linux and MacOS
19 | sed '/PyYAML==/d' /tmp/constraint.txt > /tmp/constraint.txt.tmp
20 | mv /tmp/constraint.txt.tmp /tmp/constraint.txt
21 | # Install Airflow with constraints
22 | pip install uv
23 | uv pip install "apache-airflow==$AIRFLOW_VERSION" --constraint /tmp/constraint.txt
24 |
25 | pip install apache-airflow-providers-cncf-kubernetes --constraint /tmp/constraint.txt
26 | rm /tmp/constraint.txt
27 |
--------------------------------------------------------------------------------
/scripts/test/unit-cov.sh:
--------------------------------------------------------------------------------
1 | pytest \
2 | -vv \
3 | --cov=dagfactory \
4 | --cov-report=term-missing \
5 | --cov-report=xml \
6 | --ignore=tests/test_example_dags.py
7 |
--------------------------------------------------------------------------------
/scripts/test/unit.sh:
--------------------------------------------------------------------------------
1 | pytest \
2 | -vv \
3 | --ignore=tests/test_example_dags.py
4 |
--------------------------------------------------------------------------------
/scripts/verify_tag_and_version.py:
--------------------------------------------------------------------------------
1 | """Verify the version of the Package with the version in Git tag."""
2 |
3 | import os
4 | import re
5 | from pathlib import Path
6 |
7 | repo_dir = Path(__file__).parent.parent
8 |
9 | path_of_init_file = Path(repo_dir / "dagfactory" / "__init__.py")
10 | version_file = path_of_init_file.read_text()
11 | git_ref = os.getenv("GITHUB_REF", "")
12 | git_tag = git_ref.replace("refs/tags/", "")
13 | git_tag = git_tag[1:] if git_tag.startswith("v") else git_tag
14 | version = re.findall('__version__ = "(.*)"', version_file)[0]
15 |
16 | if git_tag:  # skip the check when GITHUB_REF is unset (e.g. running locally)
17 | if version != git_tag:
18 | raise SystemExit(f"The version in {path_of_init_file} ({version}) does not match the Git Tag ({git_tag}).")
19 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astronomer/dag-factory/346fcf1027262fa2631a9d95b29f94eb23184c83/tests/__init__.py
--------------------------------------------------------------------------------
/tests/fixtures/dag_factory.yml:
--------------------------------------------------------------------------------
1 | default:
2 | concurrency: 1
3 | dagrun_timeout_sec: 600
4 | default_args:
5 | end_date: 2018-03-05
6 | owner: default_owner
7 | retries: 1
8 | retry_delay_sec: 300
9 | start_date: 2018-03-01
10 | default_view: tree
11 | max_active_runs: 1
12 | orientation: LR
13 | schedule_interval: 0 1 * * *
14 | example_dag:
15 | default_args:
16 | owner: custom_owner
17 | start_date: 2 days
18 | description: this is an example dag
19 | doc_md: '##here is a doc md string'
20 | schedule_interval: 0 3 * * *
21 | tasks:
22 | task_1:
23 | bash_command: echo 1
24 | operator: airflow.operators.bash_operator.BashOperator
25 | task_2:
26 | bash_command: echo 2
27 | dependencies:
28 | - task_1
29 | operator: airflow.operators.bash_operator.BashOperator
30 | task_3:
31 | bash_command: echo 3
32 | dependencies:
33 | - task_1
34 | operator: airflow.operators.bash_operator.BashOperator
35 | example_dag2:
36 | doc_md_file_path: $PWD/tests/fixtures/mydocfile.md
37 | schedule_interval: None
38 | tasks:
39 | task_1:
40 | bash_command: echo 1
41 | operator: airflow.operators.bash_operator.BashOperator
42 | task_2:
43 | bash_command: echo 2
44 | dependencies:
45 | - task_1
46 | operator: airflow.operators.bash_operator.BashOperator
47 | task_3:
48 | bash_command: echo 3
49 | dependencies:
50 | - task_1
51 | operator: airflow.operators.bash_operator.BashOperator
52 | example_dag3:
53 | doc_md_python_arguments:
54 | arg1: arg1
55 | arg2: arg2
56 | doc_md_python_callable_file: $PWD/tests/fixtures/doc_md_builder.py
57 | doc_md_python_callable_name: mydocmdbuilder
58 | tasks:
59 | task_1:
60 | bash_command: echo 1
61 | operator: airflow.operators.bash_operator.BashOperator
62 | example_dag4:
63 | vars:
64 | arg1: &arg1 'hello'
65 | arg2: &arg2 !join [*arg1, ' world']
66 | tasks:
67 | task_1:
68 | bash_command: !join ['echo ', *arg2]
69 | operator: airflow.operators.bash_operator.BashOperator
70 |
--------------------------------------------------------------------------------
/tests/fixtures/dag_factory_http_operator_task.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 | catchup: false
4 | start_date: 2025-03-20
5 |
6 | http_operator_example_dag:
7 | default_args:
8 | owner: "@owner"
9 | description: "this is an HttpOperator dag"
10 | schedule_interval: "0 3 * * *"
11 | tags: ['http']
12 | render_template_as_native_obj: True
13 | tasks:
14 | send_request_json:
15 | operator: airflow.providers.http.operators.http.HttpOperator
16 | http_conn_id: "example_host"
17 | method: "POST"
18 | endpoint: "/run_test"
19 | data:
20 | data: "fake_data"
21 | format: "json"
22 | headers:
23 | Content-Type: application/json
24 | log_response: True
25 | send_request_plain_text:
26 | operator: airflow.providers.http.operators.http.HttpOperator
27 | http_conn_id: "example_host"
28 | method: "POST"
29 | endpoint: "/run_test"
30 | data:
31 | data: "fake_data"
32 | test: "plain_text"
33 | headers:
34 | Content-Type: text/plain
35 | log_response: True
36 |
--------------------------------------------------------------------------------
/tests/fixtures/dag_factory_kubernetes_pod_operator.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 | owner: 'default_owner'
4 | start_date: 2018-03-01
5 | end_date: 2018-03-05
6 | retries: 1
7 | retry_delay_sec: 300
8 | concurrency: 1
9 | max_active_runs: 1
10 | dagrun_timeout_sec: 600
11 | default_view: 'tree'
12 | orientation: 'LR'
13 | schedule_interval: '0 1 * * *'
14 | example_dag:
15 | tasks:
16 | task_1:
17 | operator: airflow.contrib.operators.kubernetes_pod_operator.KubernetesPodOperator
18 | namespace: 'default'
19 | config_file : 'path_to_config_file'
20 | image : 'image'
21 | image_pull_policy : 'Always'
22 | arguments : [
23 | 'arg1',
24 | 'arg2',
25 | 'arg3',
26 | ]
27 | secrets : [{"secret":"secret","deploy_type":"env","deploy_target":"ENV_VAR"}]
28 | ports : [{"name" : "name","container_port":"container_port"},{"name" : "name","container_port":"container_port"}]
29 | volume_mounts : [
30 | {"name":"name","mount_path":"mount_path","sub_path":"sub_path","read_only":"read_only"},
31 | {"name":"name","mount_path":"mount_path","sub_path":"sub_path","read_only":"read_only"},
32 | ]
33 | volumes : [
34 | {"name":"name","configs":{'persistentVolumeClaim': {'claimName': 'test-volume'}}},
35 | {"name":"name","configs":{'persistentVolumeClaim': {'claimName': 'test-volume'}}},
36 | ]
37 | pod_runtime_info_envs : [
38 | {"name":"name","field_path":"field_path"},
39 | {"name":"name","field_path":"field_path"},
40 | ]
41 | full_pod_spec : {
42 | "api_version": "api_version",
43 | "kind": "kind",
44 | "metadata": "metadata",
45 | "spec": "spec",
46 | "status": "status",
47 | }
48 | init_containers : [
49 | {"name": "name","args":"args","command":"command"},
50 | ]
51 | labels: {'foo': 'bar'}
52 | name: 'passing-test'
53 | task_id: 'passing-task'
54 | get_logs: True
55 | in_cluster: False
56 | dependencies: []
57 | task_2:
58 | operator: airflow.contrib.operators.kubernetes_pod_operator.KubernetesPodOperator
59 | namespace: 'default'
60 | config_file : 'path_to_config_file'
61 | image : 'image'
62 | image_pull_policy : 'Always'
63 | arguments : [
64 | 'arg1',
65 | 'arg2',
66 | 'arg3',
67 | ]
68 | labels: {'foo': 'bar'}
69 | name: 'passing-test'
70 | task_id: 'passing-task'
71 | get_logs: True
72 | in_cluster: False
73 | dependencies: ['task_1']
74 |
--------------------------------------------------------------------------------
/tests/fixtures/dag_factory_simple_http_operator_task.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 | catchup: false
4 | start_date: 2025-03-20
5 |
6 | simple_http_operator_example_dag:
7 | default_args:
8 | owner: "@owner"
9 | description: "this is a SimpleHttpOperator dag"
10 | schedule_interval: "0 3 * * *"
11 | tags: ['http']
12 | render_template_as_native_obj: True
13 | tasks:
14 | send_request_json:
15 | operator: airflow.operators.http_operator.SimpleHttpOperator
16 | http_conn_id: "example_host"
17 | method: "POST"
18 | endpoint: "/run_test"
19 | data:
20 | data: "fake_data"
21 | format: "json"
22 | headers:
23 | Content-Type: application/json
24 | log_response: True
25 | send_request_plain_text:
26 | operator: airflow.operators.http_operator.SimpleHttpOperator
27 | http_conn_id: "example_host"
28 | method: "POST"
29 | endpoint: "/run_test"
30 | data:
31 | data: "fake_data"
32 | test: "plain_text"
33 | headers:
34 | Content-Type: text/plain
35 | log_response: True
36 |
--------------------------------------------------------------------------------
/tests/fixtures/dag_factory_task_group.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 | end_date: 2018-03-05
4 | owner: default_owner
5 | retries: 1
6 | retry_delay_sec: 300
7 | start_date: 2018-03-01
8 | default_view: tree
9 | max_active_runs: 1
10 | orientation: LR
11 | schedule_interval: 0 1 * * *
12 | example_dag:
13 | description: "this dag uses task groups"
14 | task_groups:
15 | task_group_1:
16 | tooltip: "this is a task group"
17 | dependencies: [task_1]
18 | tasks:
19 | task_1:
20 | operator: airflow.operators.bash_operator.BashOperator
21 | bash_command: "echo 1"
22 | task_2:
23 | operator: airflow.operators.bash_operator.BashOperator
24 | bash_command: "echo 2"
25 | task_group_name: task_group_1
26 | task_3:
27 | operator: airflow.operators.python_operator.PythonOperator
28 | python_callable_name: print_hello
29 | python_callable_file: examples/print_hello.py
30 | task_group_name: task_group_1
31 | dependencies: [task_2]
32 | task_4:
33 | operator: airflow.operators.bash_operator.BashOperator
34 | bash_command: "echo 1"
35 | dependencies: [task_group_1]
36 |
--------------------------------------------------------------------------------
/tests/fixtures/dag_factory_variables_as_arguments.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 | owner: 'default_owner'
4 | start_date: 2018-03-01
5 | end_date: 2018-03-05
6 | retries: 1
7 | retry_delay_sec: 300
8 | concurrency: 1
9 | max_active_runs: 1
10 | dagrun_timeout_sec: 600
11 | default_view: 'tree'
12 | orientation: 'LR'
13 | schedule_interval: '0 1 * * *'
14 |
15 | example_dag:
16 | default_args:
17 | owner: 'custom_owner'
18 | start_date: 2 days
19 | description: 'this is an example dag'
20 | schedule_interval: '0 3 * * *'
21 | tasks:
22 | task_1:
23 | operator: airflow.operators.bash_operator.BashOperator
24 | bash_command: 'echo 1'
25 | task_2:
26 | operator: airflow.operators.bash_operator.BashOperator
27 | bash_command: 'echo 2'
28 | dependencies: [task_1]
29 | task_3:
30 | operator: airflow.operators.bash_operator.BashOperator
31 | bash_command: 'echo 3'
32 | dependencies: [task_1]
33 | variables_as_arguments : [
34 | {"variable":"var1","attribute":"bash_command"}
35 | ]
36 |
37 | second_example_dag:
38 | default_args:
39 | owner: 'custom_owner'
40 | start_date: 3 days
41 | description: 'this is a second example dag'
42 | schedule_interval: '0 6 * * *'
43 | tasks:
44 | task_0:
45 | operator: airflow.operators.bash_operator.BashOperator
46 | bash_command: 'echo 1'
47 |
--------------------------------------------------------------------------------
/tests/fixtures/dag_md_docs.yml:
--------------------------------------------------------------------------------
1 | default:
2 | concurrency: 1
3 | dagrun_timeout_sec: 600
4 | default_args:
5 | end_date: 2018-03-05
6 | owner: default_owner
7 | retries: 1
8 | retry_delay_sec: 300
9 | start_date: 2018-03-01
10 | default_view: tree
11 | max_active_runs: 1
12 | orientation: LR
13 | schedule_interval: 0 1 * * *
14 |
15 | example_dag2:
16 | schedule_interval: None
17 | tasks:
18 | task_1:
19 | bash_command: echo 1
20 | operator: airflow.operators.bash_operator.BashOperator
21 | task_2:
22 | bash_command: echo 2
23 | dependencies:
24 | - task_1
25 | operator: airflow.operators.bash_operator.BashOperator
26 | task_3:
27 | bash_command: echo 3
28 | dependencies:
29 | - task_1
30 | operator: airflow.operators.bash_operator.BashOperator
31 |
--------------------------------------------------------------------------------
/tests/fixtures/defaults.yml:
--------------------------------------------------------------------------------
1 | default_args:
2 | start_date: "2025-01-01"
3 | owner: "global_owner"
4 | depends_on_past: true
5 |
--------------------------------------------------------------------------------
/tests/fixtures/doc_md_builder.py:
--------------------------------------------------------------------------------
1 | def mydocmdbuilder(**kwargs):
2 | return f"{kwargs}"
3 |
--------------------------------------------------------------------------------
/tests/fixtures/invalid_dag_factory.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 | owner: 'default_owner'
4 | max_active_runs: 1
5 | dagrun_timeout_sec: 600
6 | schedule_interval: '0 1 * * *'
7 |
8 | example_dag:
9 | default_args:
10 | owner: 'custom_owner'
11 | description: 'this is an example dag'
12 | schedule_interval: '0 3 * * *'
13 | tasks:
14 | task_1:
15 | operator: airflow.operators.bash_operator.BashOperator
16 | bash_command: 'echo 1'
17 | task_2:
18 | operator: airflow.operators.bash_operator.BashOperator
19 | bash_command: 'echo 2'
20 | dependencies: [task_1]
21 | task_3:
22 | operator: airflow.operators.bash_operator.BashOperator
23 | bash_command: 'echo 3'
24 | dependencies: [task_1]
25 |
--------------------------------------------------------------------------------
/tests/fixtures/invalid_yaml.yml:
--------------------------------------------------------------------------------
1 | default:
2 | default_args:
3 | owner: 'default_owner'
4 | start_date: 2018-03-01
5 | max_active_runs: 1
6 | schedule_interval: '0 1 * * *'
7 |
8 | example_dag:
9 | default_args:
10 | owner: 'custom_owner'
11 | start_date: 2 days
12 | description: 'this is an example dag'
13 | schedule_interval: '0 3 * * *'
14 | tasks:
15 | task_1
16 | operator: airflow.operators.bash_operator.BashOperator
17 | bash_command: 'echo 1'
18 | task_2:
19 | operator: airflow.operators.bash_operator.BashOperator
20 | bash_command: 'echo 2'
21 | dependencies: [task_1]
22 | task_3:
23 | operator: airflow.operators.bash_operator.BashOperator
24 | bash_command: 'echo 3'
25 | dependencies: [task_1]
26 |
--------------------------------------------------------------------------------
/tests/fixtures/mydocfile.md:
--------------------------------------------------------------------------------
1 | # Generantia quoque et umbrae canunt exspectatum medio
2 |
3 | ## In monstra bracchia et terrae a donec
4 |
5 | Lorem markdownum sua tot templisque auras conquerar avertit dant. Quis patris et
6 | Stygios tanta: est neque altera curvamine piasti, tota summa, anne aqua ponto.
7 | **In exanimi** Aegides studiis repetisse tales, promittat futurus! Secundas
8 | anima.
9 |
10 | 1. Ad patulis
11 | 2. Bracchia et auras
12 | 3. Ista inductas pinum
13 | 4. In fuit arcus Achilli
14 | 5. Quoque sumpsisse aurumque abesto fugit
15 | 6. Parum plangebat volvens addidit
16 |
17 | Vasto electarumque adit: ars mactatos potest Apollineos reliquit venis, abesto;
18 | flexile micantes, Hippodamas hunc urit. Iubebit umeris ex Phoebi gelidis in
19 | templorum summo; etiam hic **sumptas nosces** non? Decidit pariter membra nec
20 | deponunt dumque aere placido nec fata navalibus. Harena tempora esset
21 | sacrificos, poenas quam; caelestia superi isdem corpora. *Flexisque fraterna*
22 | removerat, concursibus ripae inferiora cuiquam *nisi plumbea* moriente nunc
23 | noviens meosque talia occiderat fecerat cogamque?
24 |
25 | clip_parameter_joystick -= xslt;
26 | var flashHdtv = nntp;
27 | if (mode_data_webcam) {
28 | hashtagPartyFirmware += root + jsf_rw.serverArray(pramOspfPrinter, 1,
29 | timeIo);
30 | } else {
31 | vci.mediaStandalone += javaIcannThyristor;
32 | richPciFddi(httpsCdSpam, web_bitmap + tutorial_source);
33 | sanWarm(tutorial + backlink_control);
34 | }
35 | if (remoteMetaApi == native + zebibyteExploitWan) {
36 | record_drive(root);
37 | control.fileVariable = pageForumMca;
38 | compressionServiceFlash = ocr;
39 | } else {
40 | metal_menu(5, honeypotIpvFlat.analyst_undo.alignmentChip(programIcmp,
41 | 2), noc_zip);
42 | }
43 | var scan_ics_basic = stack + snmpVirusFpu + -4 * outputWebcam;
44 |
45 | ## Nefasque pone lugubris moveant sceptra
46 |
47 | Reddit erat torus cornua pars, sceleris in ecce, illa esse quicquid adicit,
48 | obstantes. Dum puer egredior, nec telum [veniet](http://www.quo-piget.io/), aura
49 | **hic** ambobus septem Aram poteras annis. Traxit pectore: Troiane valebant
50 | increpat. Thoona fit et sibi adopertaque hanc; virgo natasque essent [quas
51 | polypus dicens](http://voce.org/corpore-habet) partem genibus, ex.
52 |
53 | > Provolvi ab summa quae verus illis: pronus est agmina flectat sua digna
54 | > *ille*, longa. [Tantalus](http://tuulmi.org/violentus) Gryneus mihi
55 | > circumfunditur posse stipitis deprensus porrigit in penetrat digiti! Currus
56 | > fere canis pectore, odiis, sororia et annis! Adspicit *tu adest* sua inserui
57 | > Liber! Translucet exigite templis et blanda, orbes gravidus Aeetias qui, et.
58 |
59 | *Non vox*, sum frigus caput dedi, indulsit se plurima tendentes, relictis
60 | damnatque, ante lacessit. Incaluit pallam. Magni toros quiete, timor laeta arida
61 | credat neque loquetur, pariterque mane, gerit ripas crevit ne. Vultum nondum,
62 | exclamant omnibus: per causa!
63 |
64 | Modo sunt legit pascua saepe, numeros ausi; quae Thracum. Est regia parte
65 | decerpsit: sidera! At visa, avi tenebras tibi formosior in causa, Perseu ratem,
66 | utilitas res et tolle. Vixque Minervae, ore libertas domos adspergine si sonat
67 | ut fonte.
68 |
69 | Frugilegas idem progenuit habebat fortissime lateque foci pignora, nec resumit
70 | quam Atrides. Viscera sua Paphon violenta naresque *esse* totas **crimine
71 | resonantia** vulneret ubi lecti omnia. Sua ingens ubi fecit ait est indigestaque
72 | quas haberet da *aerias iaculum nulloque* fluctibus comites cognata, et. Ora
73 | **intrat damna ante** Poemenis annos, et creatis Dianae. Uno lacertis levem?
74 |
--------------------------------------------------------------------------------
/tests/test_example_dags.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 |
5 | try:
6 | from functools import cache
7 | except ImportError:
8 | from functools import lru_cache as cache
9 |
10 | import airflow
11 | import pytest
12 | from airflow.models.dagbag import DagBag
13 | from airflow.utils.db import create_default_connections
14 | from airflow.utils.session import provide_session
15 | from packaging.version import Version
16 |
17 | from . import utils as test_utils
18 |
19 | EXAMPLE_DAGS_DIR = Path(__file__).parent.parent / "dev/dags"
20 | AIRFLOW_IGNORE_FILE = EXAMPLE_DAGS_DIR / ".airflowignore"
21 | AIRFLOW_VERSION = Version(airflow.__version__)
22 | IGNORED_DAG_FILES = ["example_callbacks.py"]
23 |
24 | MIN_VER_DAG_FILE_VER: dict[str, list[str]] = {
25 | # TaskFlow examples unrelated to dynamic task mapping work in earlier versions
26 | "2.3": ["example_dynamic_task_mapping.py", "example_taskflow.py"],
27 | "2.5": [
28 | "example_pypi_stats_dagfactory",
29 | "example_hackernews_dagfactory",
30 | "example_hackernews_plain_airflow",
31 | "example_pypi_stats_plain_airflow",
32 | ],
33 | "2.7": ["example_map_index_template.py"],
34 | "2.4": ["example_external_sensor_dag.py"],
35 | }
36 |
37 | # Detect whether HttpOperator is available in the installed apache-airflow-providers-http version
38 | try:
39 | from airflow.providers.http.operators.http import HttpOperator
40 | HTTP_OPERATOR_AVAILABLE = True
41 | except ImportError:
42 | HTTP_OPERATOR_AVAILABLE = False
43 |
44 |
45 | @provide_session
46 | def get_session(session=None):
47 | create_default_connections(session)
48 | return session
49 |
50 |
51 | @pytest.fixture()
52 | def session():
53 | return get_session()
54 |
55 |
56 | @cache
57 | def get_dag_bag() -> DagBag:
58 | """Create a DagBag by adding the files that are not supported to .airflowignore"""
59 |
60 | with open(AIRFLOW_IGNORE_FILE, "w+") as file:
61 | for min_version, files in MIN_VER_DAG_FILE_VER.items():
62 | if AIRFLOW_VERSION < Version(min_version):
63 | print(f"Adding {files} to .airflowignore")
64 | file.writelines([f"{dag_file}\n" for dag_file in files])
65 |
66 | for dagfile in IGNORED_DAG_FILES:
67 | print(f"Adding {dagfile} to .airflowignore")
68 | file.writelines([f"{dagfile}\n"])
69 |
70 | # Print the contents of the .airflowignore file, and build the DagBag
71 | print(".airflowignore contents: ")
72 | print(AIRFLOW_IGNORE_FILE.read_text())
73 | db = DagBag(EXAMPLE_DAGS_DIR, include_examples=False)
74 |
75 | assert db.dags
76 | assert not db.import_errors
77 | return db
78 |
79 |
80 | def get_dag_ids() -> list[str]:
81 | dag_bag = get_dag_bag()
82 | return dag_bag.dag_ids
83 |
84 |
85 | @pytest.mark.integration
86 | @pytest.mark.parametrize("dag_id", get_dag_ids())
87 | def test_example_dag(session, dag_id: str):
88 | dag_bag = get_dag_bag()
89 | dag = dag_bag.get_dag(dag_id)
90 |
91 | # Skip http_operator_example_dag in older Airflow versions without HttpOperator
92 | if dag_id == "http_operator_example_dag" and not HTTP_OPERATOR_AVAILABLE:
93 | pytest.skip(f"Skipping {dag_id} because HttpOperator is not available")
94 |
95 | # Skip http_operator_example_dag in older Airflow versions
96 | # since it has compatibility issues with our connection handling
97 | if dag_id == "http_operator_example_dag" and AIRFLOW_VERSION < Version("2.7.0"):
98 | pytest.skip(f"Skipping {dag_id} on Airflow version {AIRFLOW_VERSION}")
99 |
100 | # This feature is available since Airflow 2.5:
101 | # https://airflow.apache.org/docs/apache-airflow/stable/release_notes.html#airflow-2-5-0-2022-12-02
102 | if AIRFLOW_VERSION >= Version("2.5"):
103 | dag.test()
104 | else:
105 | test_utils.run_dag(dag)
106 |
--------------------------------------------------------------------------------
/tests/test_parsers.py:
--------------------------------------------------------------------------------
1 | import ast
2 |
3 | import pytest
4 |
5 | from dagfactory.parsers import SafeEvalVisitor
6 |
7 |
8 | @pytest.fixture
9 | def dataset_map():
10 | return {"dataset_custom_1": 1, "dataset_custom_2": 2, "dataset_custom_3": 3}
11 |
12 |
13 | @pytest.fixture
14 | def visitor(dataset_map):
15 | return SafeEvalVisitor(dataset_map)
16 |
17 |
18 | def test_evaluate(visitor):
19 | condition_string = "dataset_custom_1 & dataset_custom_2 | dataset_custom_3"
20 | tree = ast.parse(condition_string, mode="eval")
21 | result = visitor.evaluate(tree)
22 | expected = (1 & 2) | 3
23 | assert result == expected
24 |
25 |
26 | def test_visit_BinOp_and(visitor):
27 | condition_string = "dataset_custom_1 & dataset_custom_2"
28 | tree = ast.parse(condition_string, mode="eval")
29 | result = visitor.evaluate(tree)
30 | expected = 1 & 2
31 | assert result == expected
32 |
33 |
34 | def test_visit_BinOp_or(visitor):
35 | condition_string = "dataset_custom_1 | dataset_custom_3"
36 | tree = ast.parse(condition_string, mode="eval")
37 | result = visitor.evaluate(tree)
38 | expected = 1 | 3
39 | assert result == expected
40 |
41 |
42 | def test_visit_Name(visitor):
43 | condition_string = "dataset_custom_2"
44 | tree = ast.parse(condition_string, mode="eval")
45 | result = visitor.evaluate(tree)
46 | expected = 2
47 | assert result == expected
48 |
49 |
50 | def test_visit_Constant(visitor):
51 | condition_string = "42"
52 | tree = ast.parse(condition_string, mode="eval")
53 | result = visitor.evaluate(tree)
54 | expected = 42
55 | assert result == expected
56 |
57 |
58 | def test_unsupported_binary_operation(visitor):
59 | condition_string = "dataset_custom_1 + dataset_custom_2"
60 | tree = ast.parse(condition_string, mode="eval")
61 | with pytest.raises(ValueError):
62 | visitor.evaluate(tree)
63 |
64 |
65 | def test_unsupported_unary_operation(visitor):
66 | condition_string = "+dataset_custom_1"
67 | tree = ast.parse(condition_string, mode="eval")
68 | with pytest.raises(ValueError):
69 | visitor.evaluate(tree)
70 |
71 |
72 | def test_undefined_variable(visitor):
73 | condition_string = "undefined_dataset"
74 | tree = ast.parse(condition_string, mode="eval")
75 | with pytest.raises(NameError):
76 | visitor.evaluate(tree)
77 |
78 |
79 | def test_unsupported_syntax(visitor):
80 | condition_string = "[1, 2, 3]"
81 | tree = ast.parse(condition_string, mode="eval")
82 | with pytest.raises(ValueError):
83 | visitor.evaluate(tree)
84 |
--------------------------------------------------------------------------------
/tests/test_settings.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from dagfactory import settings
4 |
5 |
6 | @pytest.mark.parametrize(
7 | "value,expected_response",
8 | [
9 | ("f", False),
10 | ("false", False),
11 | ("0", False),
12 | ("", False),
13 | ("none", False),
14 | ("True", True),
15 | ("true", True),
16 | ("1", True),
17 | ],
18 | )
19 | def test_convert_to_boolean(value, expected_response):
20 | assert settings.convert_to_boolean(value) == expected_response
21 |
--------------------------------------------------------------------------------
/tests/test_telemetry.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from unittest.mock import patch
3 |
4 | import httpx
5 | import pytest
6 |
7 | from dagfactory import telemetry
8 |
9 |
10 | def test_should_emit_is_true_by_default():
11 | assert telemetry.should_emit()
12 |
13 |
14 | @patch("dagfactory.settings.enable_telemetry", True)
15 | def test_should_emit_is_true_when_only_enable_telemetry_is_true():
16 | assert telemetry.should_emit()
17 |
18 |
19 | @patch("dagfactory.settings.do_not_track", True)
20 | def test_should_emit_is_false_when_do_not_track():
21 | assert not telemetry.should_emit()
22 |
23 |
24 | @patch("dagfactory.settings.no_analytics", True)
25 | def test_should_emit_is_false_when_no_analytics():
26 | assert not telemetry.should_emit()
27 |
28 |
29 | def test_collect_standard_usage_metrics():
30 | metrics = telemetry.collect_standard_usage_metrics()
31 | expected_keys = [
32 | "airflow_version",
33 | "dagfactory_version",
34 | "platform_machine",
35 | "platform_system",
36 | "python_version",
37 | "variables",
38 | ]
39 | assert sorted(metrics.keys()) == expected_keys
40 |
41 |
42 | class MockFailedResponse:
43 | is_success = False
44 | status_code = "404"
45 | text = "Non existent URL"
46 |
47 |
48 | @patch("dagfactory.telemetry.httpx.get", return_value=MockFailedResponse())
49 | def test_emit_usage_metrics_is_unsuccessful(mock_httpx_get, caplog):
50 | sample_metrics = {
51 | "dagfactory_version": "0.2.0a1",
52 | "airflow_version": "2.10.1",
53 | "python_version": "3.11",
54 | "platform_system": "darwin",
55 | "platform_machine": "amd64",
56 | "event_type": "dag_run",
57 | "status": "success",
58 | "dag_hash": "d151d1fa2f03270ea116cc7494f2c591",
59 | "task_count": 3,
60 | }
61 | is_success = telemetry.emit_usage_metrics(sample_metrics)
62 | mock_httpx_get.assert_called_once_with(
63 | f"""https://astronomer.gateway.scarf.sh/dag-factory/v2/0.2.0a1/2.10.1/3.11/darwin/amd64/dag_run/success/d151d1fa2f03270ea116cc7494f2c591/3""",
64 | timeout=1.0,
65 | follow_redirects=True,
66 | )
67 | assert not is_success
68 | log_msg = f"""Unable to emit usage metrics to https://astronomer.gateway.scarf.sh/dag-factory/v2/0.2.0a1/2.10.1/3.11/darwin/amd64/dag_run/success/d151d1fa2f03270ea116cc7494f2c591/3. Status code: 404. Message: Non existent URL"""
69 | assert caplog.text.startswith("WARNING")
70 | assert log_msg in caplog.text
71 |
72 |
73 | @patch("dagfactory.telemetry.httpx.get", side_effect=httpx.ConnectError(message="Something is not right"))
74 | def test_emit_usage_metrics_fails(mock_httpx_get, caplog):
75 | sample_metrics = {
76 | "dagfactory_version": "0.2.0a1",
77 | "airflow_version": "2.10.1",
78 | "python_version": "3.11",
79 | "platform_system": "darwin",
80 | "platform_machine": "amd64",
81 | "event_type": "dag_run",
82 | "status": "success",
83 | "dag_hash": "d151d1fa2f03270ea116cc7494f2c591",
84 | "task_count": 3,
85 | }
86 | is_success = telemetry.emit_usage_metrics(sample_metrics)
87 | mock_httpx_get.assert_called_once_with(
88 | f"""https://astronomer.gateway.scarf.sh/dag-factory/v2/0.2.0a1/2.10.1/3.11/darwin/amd64/dag_run/success/d151d1fa2f03270ea116cc7494f2c591/3""",
89 | timeout=1.0,
90 | follow_redirects=True,
91 | )
92 | assert not is_success
93 | log_msg = f"""Unable to emit usage metrics to https://astronomer.gateway.scarf.sh/dag-factory/v2/0.2.0a1/2.10.1/3.11/darwin/amd64/dag_run/success/d151d1fa2f03270ea116cc7494f2c591/3. An HTTPX connection error occurred: Something is not right."""
94 | assert caplog.text.startswith("WARNING")
95 | assert log_msg in caplog.text
96 |
97 |
98 | @pytest.mark.integration
99 | def test_emit_usage_metrics_succeeds(caplog):
100 | caplog.set_level(logging.DEBUG)
101 | sample_metrics = {
102 | "dagfactory_version": "0.2.0a1",
103 | "airflow_version": "2.10.1",
104 | "python_version": "3.11",
105 | "platform_system": "darwin",
106 | "platform_machine": "amd64",
107 | "event_type": "dag_run",
108 | "status": "success",
109 | "dag_hash": "d151d1fa2f03270ea116cc7494f2c591",
110 | "task_count": 3,
111 | }
112 | is_success = telemetry.emit_usage_metrics(sample_metrics)
113 | assert is_success
114 | assert caplog.text.startswith("DEBUG")
115 | assert "Telemetry is enabled. Emitting the following usage metrics to" in caplog.text
116 |
117 |
118 | @patch("dagfactory.telemetry.should_emit", return_value=False)
119 | def test_emit_usage_metrics_if_enabled_fails(mock_should_emit, caplog):
120 | caplog.set_level(logging.DEBUG)
121 | assert not telemetry.emit_usage_metrics_if_enabled("any", {})
122 | assert caplog.text.startswith("DEBUG")
123 | assert "Telemetry is disabled. To enable it, export AIRFLOW__DAG_FACTORY__ENABLE_TELEMETRY=True." in caplog.text
124 |
125 |
126 | @patch("dagfactory.telemetry.should_emit", return_value=True)
127 | @patch("dagfactory.telemetry.collect_standard_usage_metrics", return_value={"k1": "v1", "k2": "v2", "variables": {}})
128 | @patch("dagfactory.telemetry.emit_usage_metrics")
129 | def test_emit_usage_metrics_if_enabled_succeeds(
130 | mock_emit_usage_metrics, mock_collect_standard_usage_metrics, mock_should_emit
131 | ):
132 | assert telemetry.emit_usage_metrics_if_enabled("any", {"k2": "v2"})
133 | mock_emit_usage_metrics.assert_called_once()
134 | assert mock_emit_usage_metrics.call_args.args[0] == {
135 | "k1": "v1",
136 | "k2": "v2",
137 | "event_type": "any",
138 | "variables": {"k2": "v2"},
139 | }
140 |
--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import logging
4 | import sys
5 | from datetime import datetime, timedelta
6 | from typing import Any
7 |
8 | from airflow.configuration import secrets_backend_list
9 | from airflow.exceptions import AirflowSkipException
10 | from airflow.models.dag import DAG
11 | from airflow.models.dagrun import DagRun
12 | from airflow.models.taskinstance import TaskInstance
13 | from airflow.secrets.local_filesystem import LocalFilesystemBackend
14 | from airflow.utils import timezone
15 | from airflow.utils.session import provide_session
16 | from airflow.utils.state import DagRunState, State
17 | from airflow.utils.types import DagRunType
18 | from sqlalchemy.orm.session import Session
19 |
20 | try:
21 | from airflow.utils.session import NEW_SESSION
22 | except ImportError:
23 | # Airflow < 2.3 did not have NEW_SESSION in airflow.utils.session
24 | from typing import cast
25 |
26 | from airflow import settings
27 |
28 | NEW_SESSION: settings.SASession = cast(settings.SASession, None)
29 |
30 | log = logging.getLogger(__name__)
31 |
32 |
33 | def run_dag(dag: DAG, conn_file_path: str | None = None) -> DagRun:
34 | return test_dag(dag=dag, conn_file_path=conn_file_path)
35 |
36 |
37 | # DAG.test() was added in Airflow 2.5.0. To support older Airflow versions, we copy its
38 | # implementation here.
39 | @provide_session
40 | def test_dag(
41 | dag,
42 | execution_date: datetime | None = None,
43 | run_conf: dict[str, Any] | None = None,
44 | conn_file_path: str | None = None,
45 | variable_file_path: str | None = None,
46 | session: Session = NEW_SESSION,
47 | ) -> DagRun:
48 | """
49 | Execute one single DagRun for a given DAG and execution date.
50 |
51 | :param execution_date: execution date for the DAG run
52 | :param run_conf: configuration to pass to newly created dagrun
53 | :param conn_file_path: file path to a connection file in either yaml or json
54 | :param variable_file_path: file path to a variable file in either yaml or json
55 | :param session: database connection (optional)
56 | """
57 |
58 | if conn_file_path or variable_file_path:
59 | local_secrets = LocalFilesystemBackend(
60 | variables_file_path=variable_file_path, connections_file_path=conn_file_path
61 | )
62 | secrets_backend_list.insert(0, local_secrets)
63 |
64 | execution_date = execution_date or timezone.utcnow()
65 |
66 | dag.log.debug("Clearing existing task instances for execution date %s", execution_date)
67 | dag.clear(
68 | start_date=execution_date,
69 | end_date=execution_date,
70 | dag_run_state=False,
71 | session=session,
72 | )
73 | dag.log.debug("Getting dagrun for dag %s", dag.dag_id)
74 | dr: DagRun = _get_or_create_dagrun(
75 | dag=dag,
76 | start_date=execution_date,
77 | execution_date=execution_date,
78 | run_id=DagRun.generate_run_id(DagRunType.MANUAL, execution_date),
79 | session=session,
80 | conf=run_conf,
81 | )
82 |
83 | tasks = dag.task_dict
84 | dag.log.debug("starting dagrun")
85 | # Instead of starting a scheduler, we run the minimal loop possible to check
86 | # for task readiness and dependency management. This is notably faster
87 | # than creating a BackfillJob and allows us to surface logs to the user
88 | while dr.state == State.RUNNING:
89 | schedulable_tis, _ = dr.update_state(session=session)
90 | for ti in schedulable_tis:
91 | add_logger_if_needed(dag, ti)
92 | ti.task = tasks[ti.task_id]
93 | _run_task(ti, session=session)
94 | if conn_file_path or variable_file_path:
95 | # Remove the local variables we have added to the secrets_backend_list
96 | secrets_backend_list.pop(0)
97 |
98 |
99 | # Return the DagRun so callers can inspect its final state
100 | return dr
101 |
102 |
103 | def add_logger_if_needed(dag: DAG, ti: TaskInstance):
104 | """
105 | Add a formatted logger to the taskinstance so all logs are surfaced to the command line instead
106 | of into a task file. Since this is a local test run, it is much better for the user to see logs
107 | in the command line, rather than needing to search for a log file.
108 | Args:
109 | ti: The taskinstance that will receive a logger
110 |
111 | """
112 | logging_format = logging.Formatter("[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s")
113 | handler = logging.StreamHandler(sys.stdout)
114 | handler.level = logging.INFO
115 | handler.setFormatter(logging_format)
116 | # only add log handler once
117 | if not any(isinstance(h, logging.StreamHandler) for h in ti.log.handlers):
118 | dag.log.debug("Adding Streamhandler to taskinstance %s", ti.task_id)
119 | ti.log.addHandler(handler)
120 |
121 |
122 | def _run_task(ti: TaskInstance, session):
123 | """
124 | Run a single task instance, and push result to Xcom for downstream tasks. Bypasses a lot of
125 | extra steps used in `task.run` to keep our local running as fast as possible
126 | This function is only meant for the `dag.test` function as a helper function.
127 |
128 | Args:
129 | ti: TaskInstance to run
130 | """
131 | log.info("*****************************************************")
132 | if hasattr(ti, "map_index") and ti.map_index > 0:
133 | log.info("Running task %s index %d", ti.task_id, ti.map_index)
134 | else:
135 | log.info("Running task %s", ti.task_id)
136 | try:
137 | ti._run_raw_task(session=session)
138 | session.flush()
139 | log.info("%s ran successfully!", ti.task_id)
140 | except AirflowSkipException:
141 | log.info("Task Skipped, continuing")
142 | log.info("*****************************************************")
143 |
144 |
145 | def _get_or_create_dagrun(
146 | dag: DAG,
147 | conf: dict[Any, Any] | None,
148 | start_date: datetime,
149 | execution_date: datetime,
150 | run_id: str,
151 | session: Session,
152 | ) -> DagRun:
153 | """
154 | Create a DAGRun, but only after clearing the previous instance of said dagrun to prevent collisions.
155 | This function is only meant for the `dag.test` function as a helper function.
156 | :param dag: Dag to be used to find dagrun
157 | :param conf: configuration to pass to newly created dagrun
158 | :param start_date: start date of new dagrun, defaults to execution_date
159 | :param execution_date: execution_date for finding the dagrun
160 | :param run_id: run_id to pass to new dagrun
161 | :param session: sqlalchemy session
162 | :return:
163 | """
164 | log.info("dagrun id: %s", dag.dag_id)
165 | dr: DagRun = (
166 | session.query(DagRun).filter(DagRun.dag_id == dag.dag_id, DagRun.execution_date == execution_date).first()
167 | )
168 | if dr:
169 | session.delete(dr)
170 | session.commit()
171 | dr = dag.create_dagrun(
172 | state=DagRunState.RUNNING,
173 | execution_date=execution_date,
174 | run_id=run_id,
175 | start_date=start_date or execution_date,
176 | session=session,
177 | conf=conf,
178 | )
179 | log.info("created dagrun %s", str(dr))
180 | return dr
181 |
182 |
183 | def one_hour_ago(execution_date: datetime):
184 | return execution_date - timedelta(hours=1)
185 |
--------------------------------------------------------------------------------