├── .bumpversion.cfg ├── .github ├── FUNDING.yml └── workflows │ ├── ci.yml │ └── lint.yml ├── .gitignore ├── MANIFEST.in ├── Makefile ├── README.md ├── VERSION ├── airflow_dvc ├── __init__.py ├── cli │ ├── __init__.py │ └── entrypoint.py ├── custom_downloads.py ├── custom_uploads.py ├── dvc_download_operator.py ├── dvc_existence_sensor.py ├── dvc_hook.py ├── dvc_update_operator.py ├── dvc_update_sensor.py ├── exceptions.py ├── logs.py ├── plugin │ ├── __init__.py │ ├── git_url_parser.py │ ├── platforms │ │ ├── __init__.py │ │ ├── assembla.py │ │ ├── base.py │ │ ├── bitbucket.py │ │ ├── friendcode.py │ │ ├── github.py │ │ └── gitlab.py │ ├── plugin.py │ └── templates │ │ └── dvc │ │ ├── list.html │ │ └── pushes.html ├── stats.py ├── test_utils.py └── tests │ └── helpers.py ├── example ├── dags │ ├── dvc_download_example.py │ ├── dvc_existence_sensor_example.py │ ├── dvc_update_sensor_example.py │ ├── dvc_upload_example.py │ └── dvc_upload_with_template_example.py └── plugins │ └── dvc.py ├── install_deps.sh ├── package.json ├── poetry.lock ├── publish.py ├── pyproject.toml ├── run_airflow.sh ├── setup.cfg ├── static ├── cg_logo.png ├── screen1.png ├── screen2.png └── screen3.png └── tests └── test_dag_loading.py /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.9.9 3 | commit = True 4 | tag = False 5 | parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+) 6 | serialize = 7 | {major}.{minor}.{patch} 8 | {major}.{minor}.{patch} 9 | 10 | [bumpversion:part:release] 11 | optional_value = prod 12 | first_value = dev 13 | values = 14 | dev 15 | prod 16 | 17 | [bumpversion:part:build] 18 | 19 | [bumpversion:file:VERSION] 20 | 21 | [bumpversion:file:pyproject.toml] 22 | search = version = "{current_version}" 23 | replace = version = "{new_version}" 24 | 25 | [bumpversion:file:README.md] 26 | 27 | [bumpversion:file:./airflow_dvc/__init__.py] 28 | search = __version__ = "{current_version}" 29 | replace = __version__ = "{new_version}" 30 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [styczynski] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Build and test 2 | 3 | on: 4 | push: 5 | branches: [ master, feature/ci-workflow ] 6 | pull_request: 7 | branches: 8 | - "**" 9 | 10 | jobs: 11 | build_and_test: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Install Orca 16 | run: npm install -g electron@6.1.4 orca 17 | - name: Set up Python 18 | uses: 
actions/setup-python@v2 19 | with: 20 | python-version: '3.9' 21 | - name: Install Poetry 22 | uses: snok/install-poetry@v1 23 | with: 24 | virtualenvs-create: true 25 | virtualenvs-in-project: true 26 | - name: Load cached venv 27 | id: cached-poetry-dependencies 28 | uses: actions/cache@v2 29 | with: 30 | path: .venv 31 | key: venv-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }} 32 | - name: Install dependencies 33 | if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' 34 | run: poetry install --no-interaction --no-root 35 | - name: Install project 36 | run: poetry install --no-interaction 37 | - name: Install DVC 38 | run: pip install 'dvc[s3]' 39 | - name: Run tests 40 | env: 41 | DVC_GITHUB_REPO_TOKEN: ${{ secrets.DVC_GITHUB_REPO_TOKEN }} 42 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 43 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 44 | AWS_DEFAULT_REGION: "eu-central-1" 45 | run: | 46 | export AIRFLOW_HOME=$(pwd)/airflow 47 | export AIRFLOW_CONFIG=$AIRFLOW_HOME/airflow.cfg 48 | mkdir -p $AIRFLOW_HOME > /dev/null 2> /dev/null 49 | poetry run airflow db init 50 | poetry run airflow users create \ 51 | --username admin \ 52 | --firstname Peter \ 53 | --lastname Parker \ 54 | --role Admin \ 55 | --email spiderman@superhero.org \ 56 | --password admin 57 | source .venv/bin/activate 58 | coverage run --omit 'venv/*' -m pytest tests/ 59 | coverage report 60 | - name: Build package 61 | run: poetry build 62 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint code 2 | 3 | on: 4 | push: 5 | branches: 6 | - "**" 7 | 8 | jobs: 9 | lint: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up Python 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: '3.8' 17 | - name: Apply netrc creds with direct input again 18 | uses: little-core-labs/netrc-creds@master 19 | with: 20 | machine: "pypi.covidgenomics.com" 21 | login: "pstyczynski" 22 | password: "y\\)Yup~f1&s-B9G6kmIR.hOAZ1,!6.wa" 23 | - name: Install Poetry 24 | uses: snok/install-poetry@v1 25 | with: 26 | virtualenvs-create: false 27 | virtualenvs-in-project: false 28 | - name: Install dependencies 29 | run: poetry install --no-interaction --no-root 30 | - name: Install project 31 | run: poetry install --no-interaction 32 | - name: Lint project 33 | run: | 34 | make lint -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | env.sh 2 | airflow/ 3 | .idea/ 4 | **/__pycache__/ 5 | dist/ 6 | publish.log 7 | poetry.toml -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include example 3 | include example/dags 4 | include example/dags/dvc_sensor_example.py 5 | include example/dags/dvc_upload_example.py 6 | include example/plugins 7 | include example/plugins/dvc.py 8 | include pyproject.toml 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | lint: 2 | poetry run black . 3 | poetry run isort . 
4 | poetry run flakehell lint 5 | 6 | install: 7 | poetry install 8 | 9 | update: 10 | poetry update 11 | 12 | test: 13 | poetry run pytest -n 4 14 | 15 | publish: 16 | poetry run publish 17 | 18 | documentation: 19 | rm -rf pydoc-markdown.yml > /dev/null 2> /dev/null 20 | rm -rf build/docs > /dev/null 2> /dev/null 21 | poetry run pydoc-markdown --bootstrap hugo 22 | poetry run pydoc-markdown 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Airflow DVC (1.9.9) [![Sponsor project](https://user-images.githubusercontent.com/4967343/89290185-5e893700-d650-11ea-8942-4579b2c96c2c.png)](https://github.com/sponsors/styczynski) 2 | 3 | [![Build and test](https://github.com/styczynski/airflow-dvc/actions/workflows/ci.yml/badge.svg)](https://github.com/styczynski/airflow-dvc/actions/workflows/ci.yml) 4 | [![PyPI](https://img.shields.io/pypi/v/airflow-dvc?style=flat-square)](https://pypi.org/project/airflow-dvc/) 5 | [![GitHub commit activity](https://img.shields.io/github/commit-activity/m/styczynski/airflow-dvc?style=flat-square)](https://github.com/styczynski/airflow-dvc/commits/master) 6 | 7 | ## What is it? 8 | 9 | This is an [Airflow](https://airflow.apache.org/) extension that adds support for [DVC](https://dvc.org/doc) operations. 10 | 11 | A basic tutorial about DVC and Airflow can be found in the News section of the [Deepflare website](https://deepflare.ai/blog/airflow_dvc/). 12 | The motivation for such a package was to create modern automated data science pipelines that operate on versioned data. 13 | 14 | ## Installation 15 | 16 | To install this package, run: 17 | ```bash 18 | $ python3 -m pip install "airflow-dvc==1.9.9" 19 | ``` 20 | 21 | Or if you are using [Poetry](https://python-poetry.org/) to run Apache Airflow: 22 | ```bash 23 | $ poetry add apache-airflow@latest 24 | $ poetry add "airflow-dvc@1.9.9" 25 | ``` 26 | 27 | ## What does this package provide? 28 | 29 | The package provides the following core features: 30 | * 📊 [DVC Operator view](https://github.com/styczynski/airflow-dvc#-dvc-operator-view) (tab to browse all configured DVC operators) 31 | * 💾 [DVCUpdateOperator](https://github.com/styczynski/airflow-dvc#-dvcupdateoperator-uploading) (for uploading data to DVC) 32 | * ⬇️ [DVCDownloadOperator](https://github.com/styczynski/airflow-dvc#%EF%B8%8F-dvcdownloadoperator-downloading) (for downloading data from DVC) 33 | * 👀 [DVCUpdateSensor](https://github.com/styczynski/airflow-dvc#-dvcupdatesensor) (for waiting for a file modification on DVC) 34 | * 🤖 [DVCHook](https://github.com/styczynski/airflow-dvc#-dvchook) (high-level client for DVC) 35 | 36 | ## Run examples yourself 37 | 38 | The examples are provided in the `example/` directory.
39 | Please do the following to set up a quick Airflow demo: 40 | ```bash 41 | # Your git repo clone URL 42 | # Example: 43 | # REPO="https://GITHUB_PERSONAL_TOKEN@github.com/OWNER/REPO.git" 44 | $ export REPO="" 45 | 46 | # Install Airflow with Poetry 47 | $ mkdir airflow-dvc-test && cd airflow-dvc-test 48 | $ poetry init 49 | $ poetry add apache-airflow "airflow-dvc@1.9.9" 50 | 51 | # Configure Airflow paths 52 | $ export AIRFLOW_HOME=$(pwd)/airflow 53 | $ export AIRFLOW_CONFIG=$AIRFLOW_HOME/airflow.cfg 54 | $ mkdir -p $AIRFLOW_HOME > /dev/null 2> /dev/null 55 | 56 | # Init Airflow 57 | $ poetry run airflow db init 58 | $ poetry run airflow users create \ 59 | --username admin \ 60 | --firstname Peter \ 61 | --lastname Parker \ 62 | --role Admin \ 63 | --email spiderman@superhero.org 64 | 65 | # Create example DVC DAGs 66 | $ poetry run airflow_dvc generate example_dags 67 | 68 | # Run Airflow 69 | $ poetry run airflow webserver --port 8080 & 70 | $ poetry run airflow scheduler & 71 | ``` 72 | 73 | ## Usage 74 | 75 | ### 📊 DVC Operator view 76 | 77 | After installation, you should be able to access the `Browse > DVC Operators` option in the Airflow menu. 78 | 79 | 80 | 81 | The `DVC Operators` view allows you to display all configured DVC operators and the repositories that they push files to or pull from. 82 | 83 | 84 | 85 | The `DVC Pushes` view allows you to display all commits created by the DVC operators across all repositories: 86 | 87 | 88 | 89 | 90 | ### 💾 DVCUpdateOperator (Uploading) 91 | 92 | The upload operator supports various types of data inputs that you can feed into it. 93 | 94 | **Uploading a string as a file:** 95 | ```python 96 | from airflow_dvc import DVCUpdateOperator, DVCStringUpload 97 | from datetime import datetime 98 | 99 | upload_task = DVCUpdateOperator( 100 | dvc_repo="", 101 | files=[ 102 | DVCStringUpload("data/1.txt", f"This will be saved into DVC. Current time: {datetime.now()}"), 103 | ], 104 | task_id='update_dvc', 105 | ) 106 | ``` 107 | 108 | **Uploading a local file using its path:** 109 | ```python 110 | from airflow_dvc import DVCUpdateOperator, DVCPathUpload 111 | 112 | upload_task = DVCUpdateOperator( 113 | dvc_repo="", 114 | files=[ 115 | DVCPathUpload("data/1.txt", "~/local_file_path.txt"), 116 | ], 117 | task_id='update_dvc', 118 | ) 119 | ``` 120 | 121 | **Uploading content generated by a Python function:** 122 | ```python 123 | from airflow_dvc import DVCUpdateOperator, DVCCallbackUpload 124 | 125 | upload_task = DVCUpdateOperator( 126 | dvc_repo="", 127 | files=[ 128 | DVCCallbackUpload("data/1.txt", lambda: "Test data"), 129 | ], 130 | task_id='update_dvc', 131 | ) 132 | ``` 133 | 134 | **Uploading a file from S3:** 135 | This is especially useful when you have a workflow that uses [S3Hook](https://airflow.apache.org/docs/apache-airflow/1.10.14/_modules/airflow/hooks/S3_hook.html) to temporarily save the data between tasks. 
136 | 137 | ```python 138 | from airflow import DAG 139 | from airflow.operators.python_operator import PythonOperator 140 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook 141 | from datetime import datetime, timedelta 142 | 143 | from io import StringIO 144 | import pandas as pd 145 | import requests 146 | 147 | from airflow_dvc import DVCUpdateOperator, DVCS3Upload 148 | 149 | s3_conn_id = 's3-conn' 150 | bucket = 'astro-workshop-bucket' 151 | state = 'wa' 152 | date = '{{ yesterday_ds_nodash }}' 153 | 154 | def upload_to_s3(state, date): 155 | '''Grabs data from Covid tracking endpoint and saves to flat file on S3 156 | ''' 157 | # Connect to S3 158 | s3_hook = S3Hook(aws_conn_id=s3_conn_id) 159 | 160 | # Get data from API 161 | url = 'https://covidtracking.com/api/v1/states/' 162 | res = requests.get(url+'{0}/{1}.csv'.format(state, date)) 163 | 164 | # Save data to CSV on S3 165 | s3_hook.load_string(res.text, '{0}_{1}.csv'.format(state, date), bucket_name=bucket, replace=True) 166 | 167 | def process_data(state, date): 168 | '''Reads data from S3, processes, and saves to new S3 file 169 | ''' 170 | # Connect to S3 171 | s3_hook = S3Hook(aws_conn_id=s3_conn_id) 172 | 173 | # Read data 174 | data = StringIO(s3_hook.read_key(key='{0}_{1}.csv'.format(state, date), bucket_name=bucket)) 175 | df = pd.read_csv(data, sep=',') 176 | 177 | # Process data 178 | processed_data = df[['date', 'state', 'positive', 'negative']] 179 | 180 | # Save processed data to CSV on S3 181 | s3_hook.load_string(processed_data.to_string(), 'dvc_upload.csv', bucket_name=bucket, replace=True) 182 | 183 | # Default settings applied to all tasks 184 | default_args = { 185 | 'owner': 'airflow', 186 | 'depends_on_past': False, 187 | 'email_on_failure': False, 188 | 'email_on_retry': False, 189 | 'retries': 1, 190 | 'retry_delay': timedelta(minutes=1) 191 | } 192 | 193 | with DAG('intermediary_data_storage_dag', 194 | start_date=datetime(2021, 1, 1), 195 | max_active_runs=1, 196 | schedule_interval='@daily', 197 | default_args=default_args, 198 | catchup=False 199 | ) as dag: 200 | 201 | generate_file = PythonOperator( 202 | task_id='generate_file_{0}'.format(state), 203 | python_callable=upload_to_s3, 204 | op_kwargs={'state': state, 'date': date} 205 | ) 206 | 207 | process_data = PythonOperator( 208 | task_id='process_data_{0}'.format(state), 209 | python_callable=process_data, 210 | op_kwargs={'state': state, 'date': date} 211 | ) 212 | 213 | upload_to_dvc = DVCUpdateOperator( 214 | dvc_repo="", 215 | files=[ 216 | DVCS3Upload("dvc_path/data.txt", s3_conn_id, bucket, 'dvc_upload.csv'), 217 | ], 218 | task_id='update_dvc', 219 | ) 220 | 221 | generate_file >> process_data 222 | process_data >> upload_to_dvc 223 | ``` 224 | 225 | **Uploading a file from S3, but using task arguments:** 226 | Instead of passing a list as the files parameter, you can pass a function, just as you would with a PythonOperator: 227 | ```python 228 | from airflow import DAG 229 | from airflow.operators.python_operator import PythonOperator 230 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook 231 | from datetime import datetime, timedelta 232 | 233 | from io import StringIO 234 | import pandas as pd 235 | import requests 236 | 237 | from airflow_dvc import DVCUpdateOperator, DVCS3Upload 238 | 239 | s3_conn_id = 's3-conn' 240 | bucket = 'astro-workshop-bucket' 241 | state = 'wa' 242 | date = '{{ yesterday_ds_nodash }}' 243 | 244 | def upload_to_s3(state, date): 245 | '''Grabs data from Covid endpoint and saves to flat file on S3 246 |
''' 247 | # Connect to S3 248 | s3_hook = S3Hook(aws_conn_id=s3_conn_id) 249 | 250 | # Get data from API 251 | url = 'https://covidtracking.com/api/v1/states/' 252 | res = requests.get(url+'{0}/{1}.csv'.format(state, date)) 253 | 254 | # Save data to CSV on S3 255 | s3_hook.load_string(res.text, '{0}_{1}.csv'.format(state, date), bucket_name=bucket, replace=True) 256 | 257 | def process_data(state, date): 258 | '''Reads data from S3, processes, and saves to new S3 file 259 | ''' 260 | # Connect to S3 261 | s3_hook = S3Hook(aws_conn_id=s3_conn_id) 262 | 263 | # Read data 264 | data = StringIO(s3_hook.read_key(key='{0}_{1}.csv'.format(state, date), bucket_name=bucket)) 265 | df = pd.read_csv(data, sep=',') 266 | 267 | # Process data 268 | processed_data = df[['date', 'state', 'positive', 'negative']] 269 | 270 | # Save processed data to CSV on S3 271 | s3_hook.load_string(processed_data.to_string(), '{0}_{1}_processed.csv'.format(state, date), bucket_name=bucket, replace=True) 272 | 273 | def get_files_for_upload(state, date): 274 | return [ 275 | DVCS3Upload("dvc_path/data.txt", s3_conn_id, bucket, '{0}_{1}_processed.csv'.format(state, date)), 276 | ] 277 | 278 | # Default settings applied to all tasks 279 | default_args = { 280 | 'owner': 'airflow', 281 | 'depends_on_past': False, 282 | 'email_on_failure': False, 283 | 'email_on_retry': False, 284 | 'retries': 1, 285 | 'retry_delay': timedelta(minutes=1) 286 | } 287 | 288 | with DAG('intermediary_data_storage_dag', 289 | start_date=datetime(2021, 1, 1), 290 | max_active_runs=1, 291 | schedule_interval='@daily', 292 | default_args=default_args, 293 | catchup=False 294 | ) as dag: 295 | 296 | generate_file = PythonOperator( 297 | task_id='generate_file_{0}'.format(state), 298 | python_callable=upload_to_s3, 299 | op_kwargs={'state': state, 'date': date} 300 | ) 301 | 302 | process_data = PythonOperator( 303 | task_id='process_data_{0}'.format(state), 304 | python_callable=process_data, 305 | op_kwargs={'state': state, 'date': date} 306 | ) 307 | 308 | # Passing a function as the files parameter (it should return a list of DVCUpload objects) 309 | # We also specify op_kwargs to allow passing parameters, as with a normal PythonOperator 310 | upload_to_dvc = DVCUpdateOperator( 311 | dvc_repo="", 312 | files=get_files_for_upload, 313 | task_id='update_dvc', 314 | op_kwargs={'state': state, 'date': date} 315 | ) 316 | 317 | generate_file >> process_data 318 | process_data >> upload_to_dvc 319 | ``` 320 | 321 | ### ⬇️ DVCDownloadOperator (Downloading) 322 | 323 | We can use `DVCDownloadOperator` similarly to the `DVCUpdateOperator`. The syntax is the same: 324 | ```python 325 | from airflow_dvc import DVCDownloadOperator, DVCCallbackDownload 326 | 327 | # Download DVC file data/1.txt and print it on the screen 328 | download_task = DVCDownloadOperator( 329 | dvc_repo="", 330 | files=[ 331 | DVCCallbackDownload("data/1.txt", lambda content: print(content)), 332 | ], 333 | task_id='update_dvc', 334 | ) 335 | ``` 336 | 337 | The `DVCDownload` implementations are similar to `DVCUpload`. 338 | 339 | ### 👀 DVCUpdateSensor 340 | 341 | `DVCUpdateSensor` allows you to pause the DAG run until the specified file is updated. 342 | The sensor checks the date of the latest DAG run and compares it with the timestamp of the meta DVC file in the repo. 
343 | 344 | ```python 345 | from datetime import datetime 346 | from airflow import DAG 347 | from airflow.operators.dummy_operator import DummyOperator 348 | from airflow.operators.bash_operator import BashOperator 349 | 350 | from airflow_dvc import DVCUpdateSensor 351 | 352 | 353 | with DAG('dvc_sensor_example', description='Another tutorial DAG', 354 | start_date=datetime(2017, 3, 20), 355 | catchup=False, 356 | ) as dag: 357 | 358 | dummy_task = DummyOperator(task_id='dummy_task', dag=dag) 359 | 360 | sensor_task = DVCUpdateSensor( 361 | task_id='dvc_sensor_task', 362 | dag=dag, 363 | dvc_repo="", 364 | files=["data/1.txt"], 365 | ) 366 | 367 | task = BashOperator( 368 | task_id='task_triggered_by_sensor', 369 | bash_command='echo "OK" && ( echo $[ ( $RANDOM % 30 ) + 1 ] > meowu.txt ) && cat meowu.txt') 370 | 371 | dummy_task >> sensor_task >> task 372 | 373 | ``` 374 | 375 | ### 👀 DVCExistenceSensor 376 | 377 | The `DVCExistenceSensor` is similar to the `DVCUpdateSensor`, but it checks whether the file exists in the DVC repo: 378 | ```python 379 | from airflow_dvc import DVCExistenceSensor 380 | 381 | # The sensor will wait until the file is present 382 | sensor_task = DVCExistenceSensor( 383 | task_id='dvc_sensor_task', 384 | dag=dag, 385 | dvc_repo="", 386 | files=["some_path/some_subfolder/some_file.txt"], 387 | ) 388 | ``` 389 | 390 | ### 🤖 DVCHook 391 | 392 | You can perform all the operations manually using DVCHook: 393 | ```python 394 | from airflow_dvc import DVCHook, DVCPathUpload 395 | 396 | hook = DVCHook("") 397 | 398 | # The hook requires a dag_id, so if we run this code inside a class extending any Operator 399 | # we can access the self.dag_id field. 400 | # DAG IDs are used to track all pushes to your DVC repositories 401 | hook.update([ 402 | DVCPathUpload("data/1.txt", "~/local_file_path.txt"), 403 | ], dag_id=self.dag_id) 404 | ``` 405 | ## Development 406 | 407 | Install the project with the following command: 408 | ```bash 409 | $ poetry install 410 | ``` 411 | 412 | * You may want to run `poetry --version` to check if you have [Poetry](https://python-poetry.org/docs/) installed. If the command fails, then proceed to install Poetry. The installer installs the poetry tool to Poetry's bin directory. On Unix it is located at `$HOME/.poetry/bin` and on Windows at `%USERPROFILE%\.poetry\bin`. 413 | This directory will be automatically added to your `$PATH` environment variable, by appending a statement to your `$HOME/.profile` configuration (or equivalent files). If you do not feel comfortable with this, please pass the --no-modify-path flag to the installer and manually add Poetry's bin directory to your path. 414 | * **Linux/Mac:** `curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python -` 415 | * **Windows:** Type in PowerShell: `(Invoke-WebRequest -Uri https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py -UseBasicParsing).Content | python ` 416 | * Install the project with `poetry install` 417 | * You can now use the virtual env created by Poetry. Please type `poetry shell` 418 | 419 | ### Code style 420 | 421 | The project's style rules are configured in `pyproject.toml`. 422 | To format the code, run `make lint`. 
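For reference, `make lint` simply runs the lint commands defined in the project's `Makefile` (shown above): ```bash $ poetry run black . $ poetry run isort . $ poetry run flakehell lint ```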
423 | 424 | ### Versioning 425 | 426 | To bump project version before release please use the following command (for developers): 427 | ```bash 428 | $ poetry run bump2version minor 429 | ``` 430 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 1.9.9 -------------------------------------------------------------------------------- /airflow_dvc/__init__.py: -------------------------------------------------------------------------------- 1 | from dvc_fs.dvc_download import (DVCCallbackDownload, DVCDownload, 2 | DVCPathDownload) 3 | from dvc_fs.dvc_upload import (DVCCallbackUpload, DVCPathUpload, 4 | DVCStringUpload, DVCUpload) 5 | 6 | from . import exceptions, logs, stats 7 | from .cli.entrypoint import run_cli 8 | from .custom_downloads import DVCS3Download 9 | from .custom_uploads import DVCS3Upload 10 | from .dvc_download_operator import DVCDownloadOperator 11 | from .dvc_existence_sensor import DVCExistenceSensor 12 | from .dvc_hook import DVCCommit, DVCHook 13 | from .dvc_update_operator import DVCUpdateOperator 14 | from .dvc_update_sensor import DVCUpdateSensor 15 | from .plugin import DVCPlugin 16 | from .test_utils import execute_test_task 17 | 18 | __all__ = [ 19 | "DVCHook", 20 | "DVCUpdateSensor", 21 | "DVCUpdateOperator", 22 | "DVCExistenceSensor", 23 | "DVCUpload", 24 | "DVCStringUpload", 25 | "DVCS3Upload", 26 | "DVCPathUpload", 27 | "DVCCallbackUpload", 28 | "DVCDownloadOperator", 29 | "DVCDownload", 30 | "DVCPathDownload", 31 | "DVCS3Download", 32 | "DVCCallbackDownload", 33 | "DVCPlugin", 34 | "DVCCommit", 35 | "run_cli", 36 | "exceptions", 37 | "logs", 38 | "stats", 39 | "execute_test_task", 40 | ] 41 | 42 | __version__ = "1.9.9" 43 | -------------------------------------------------------------------------------- /airflow_dvc/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joednkn1/airflow-dvc/8147496723e4490a286407a210bf9c8150468ab4/airflow_dvc/cli/__init__.py -------------------------------------------------------------------------------- /airflow_dvc/cli/entrypoint.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import os 3 | import pathlib 4 | import traceback 5 | from typing import Optional, Tuple 6 | 7 | import typer 8 | 9 | app = typer.Typer() 10 | 11 | generator_app = typer.Typer() 12 | app.add_typer(generator_app, name="generate") 13 | 14 | AIRFLOW_HOME_VAR_NAME = "AIRFLOW_HOME" 15 | AIRFLOW_CFG_VAR_NAME = "AIRFLOW_CONFIG" 16 | AIRFLOW_CFG_FILE_NAME = "airflow.cfg" 17 | AIRFLOW_DEFAULT_DAGS_DIR = "dags" 18 | 19 | 20 | EXAMPLE_DAGS_DIRECTORY = os.path.abspath( 21 | os.path.join( 22 | os.path.dirname(__file__), 23 | "..", 24 | "..", 25 | "example", 26 | "dags", 27 | ) 28 | ) 29 | 30 | 31 | def get_airflow_dirs() -> Tuple[str, str]: 32 | airflow_home: Optional[str] = None 33 | if airflow_home is None and AIRFLOW_HOME_VAR_NAME in os.environ: 34 | airflow_home = os.path.abspath(os.environ[AIRFLOW_HOME_VAR_NAME]) 35 | if airflow_home is None: 36 | airflow_cfg_path: Optional[str] = None 37 | if AIRFLOW_CFG_VAR_NAME in os.environ: 38 | airflow_cfg_path = os.environ[AIRFLOW_CFG_VAR_NAME] 39 | if airflow_cfg_path is None: 40 | search_cfg_path = os.path.abspath(".") 41 | while True: 42 | try: 43 | cfg_path = os.path.join( 44 | search_cfg_path, AIRFLOW_CFG_FILE_NAME 45 | ) 46 | if 
os.path.exists(cfg_path): 47 | airflow_cfg_path = cfg_path 48 | break 49 | except Exception: 50 | traceback.print_exc() 51 | new_search_cfg_path = os.path.dirname(search_cfg_path) 52 | if new_search_cfg_path == search_cfg_path: 53 | break 54 | search_cfg_path = new_search_cfg_path 55 | if airflow_cfg_path is not None: 56 | try: 57 | config = configparser.ConfigParser() 58 | config.read(airflow_cfg_path) 59 | airflow_home = os.path.dirname( 60 | os.path.abspath(config["core"]["dags_folder"]) 61 | ) 62 | except Exception: 63 | traceback.print_exc() 64 | if airflow_home is not None: 65 | if os.path.exists(os.path.join(airflow_home, AIRFLOW_CFG_FILE_NAME)): 66 | config = configparser.ConfigParser() 67 | config.read(os.path.join(airflow_home, AIRFLOW_CFG_FILE_NAME)) 68 | airflow_dag_dir = os.path.abspath(config["core"]["dags_folder"]) 69 | else: 70 | airflow_dag_dir = os.path.join( 71 | airflow_home, AIRFLOW_DEFAULT_DAGS_DIR 72 | ) 73 | else: 74 | # Fallback 75 | # By default use current directory 76 | typer.echo( 77 | "Failed to find Airflow home directory. " 78 | f"Please specify {AIRFLOW_HOME_VAR_NAME} or {AIRFLOW_CFG_VAR_NAME} " 79 | "env variables. Or run the command anywhere near airflow.cfg file." 80 | ) 81 | airflow_home = os.path.abspath(".") 82 | airflow_dag_dir = os.path.abspath(AIRFLOW_DEFAULT_DAGS_DIR) 83 | 84 | # Ensure all directories exist 85 | pathlib.Path(airflow_home).mkdir(parents=True, exist_ok=True) 86 | pathlib.Path(airflow_dag_dir).mkdir(parents=True, exist_ok=True) 87 | 88 | return airflow_home, airflow_dag_dir 89 | 90 | 91 | @generator_app.command("example_dags") 92 | def example_dags(): 93 | _, dags_dir = get_airflow_dirs() 94 | typer.echo(f"Creating example DAGs in {dags_dir} directory:") 95 | for filename in os.listdir(EXAMPLE_DAGS_DIRECTORY): 96 | if os.path.isfile(os.path.join(EXAMPLE_DAGS_DIRECTORY, filename)): 97 | with open(os.path.join(dags_dir, filename), "w") as output: 98 | with open( 99 | os.path.join(EXAMPLE_DAGS_DIRECTORY, filename) 100 | ) as input: 101 | output.write(input.read()) 102 | typer.echo(f"Created example DAG {filename}") 103 | typer.echo("Done") 104 | 105 | 106 | def run_cli(): 107 | app() 108 | 109 | 110 | if __name__ == "__main__": 111 | run_cli() 112 | -------------------------------------------------------------------------------- /airflow_dvc/custom_downloads.py: -------------------------------------------------------------------------------- 1 | """ 2 | Abstraction for DVC download targets. 3 | @Piotr Styczyński 2021 4 | """ 5 | from dvc_fs import DVCDownload 6 | 7 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook 8 | 9 | 10 | class DVCS3Download(DVCDownload): 11 | """ 12 | Download item from DVC and save it to S3 13 | This is useful when you have S3Hook in your workflows used 14 | as a temporary cache for files and you're not using a shared filesystem, 15 | so using DVCPathDownload is not an option.
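Example (hypothetical connection and bucket names): DVCS3Download("data/output.csv", aws_conn_id="aws_default", bucket_name="my-bucket", bucket_path="tmp/output.csv") would write the DVC file data/output.csv to the given S3 location.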
16 | """ 17 | 18 | # Fields to apply Airflow templates 19 | template_fields = ["dvc_path", "bucket_name", "bucket_path"] 20 | 21 | # Connection ID (the same as for Airflow S3Hook) 22 | # For more details please see: 23 | # - https://airflow.apache.org/docs/apache-airflow/1.10.14/_modules/airflow/hooks/S3_hook.html 24 | # - https://www.programcreek.com/python/example/120741/airflow.hooks.S3_hook.S3Hook 25 | aws_conn_id: str 26 | # Bucket name (see above) 27 | bucket_name: str 28 | # Bucket path for the downloaded file (see above) 29 | bucket_path: str 30 | 31 | def __init__( 32 | self, 33 | dvc_path: str, 34 | aws_conn_id: str, 35 | bucket_name: str, 36 | bucket_path: str, 37 | ): 38 | super().__init__(dvc_path=dvc_path) 39 | self.aws_conn_id = aws_conn_id 40 | self.bucket_name = bucket_name 41 | self.bucket_path = bucket_path 42 | 43 | def describe_target(self) -> str: 44 | return f"S3 {self.bucket_name}/{self.bucket_path}" 45 | 46 | def write(self, content: str): 47 | # Open connection to the S3 and download the file 48 | s3_hook = S3Hook(aws_conn_id=self.aws_conn_id) 49 | s3_hook.load_string( 50 | content, 51 | self.bucket_path, 52 | bucket_name=self.bucket_name, 53 | replace=True, 54 | ) 55 | -------------------------------------------------------------------------------- /airflow_dvc/custom_uploads.py: -------------------------------------------------------------------------------- 1 | """ 2 | Abstraction for DVC upload sources. 3 | @Piotr Styczyński 2021 4 | """ 5 | from io import StringIO 6 | 7 | from dvc_fs.dvc_upload import DVCUpload 8 | 9 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook 10 | 11 | 12 | class DVCS3Upload(DVCUpload): 13 | """ 14 | Upload item from S3 to DVC 15 | This is useful when you have S3Hook in your workflows used 16 | as a temporary cache for files and you're not using shared-filesystem, 17 | so using DVCPathUpload is not an option. 18 | """ 19 | 20 | # Fields to apply Airflow templates 21 | template_fields = ["bucket_path", "bucket_name", "dvc_path"] 22 | 23 | # Connection ID (the same as for Airflow S3Hook) 24 | # For more details please see: 25 | # - https://airflow.apache.org/docs/apache-airflow/1.10.14/_modules/airflow/hooks/S3_hook.html 26 | # - https://www.programcreek.com/python/example/120741/airflow.hooks.S3_hook.S3Hook 27 | aws_conn_id: str 28 | # Bucket name (see above) 29 | bucket_name: str 30 | # Bucket path for the downloaded file (see above) 31 | bucket_path: str 32 | 33 | def __init__( 34 | self, 35 | dvc_path: str, 36 | aws_conn_id: str, 37 | bucket_name: str, 38 | bucket_path: str, 39 | ): 40 | super().__init__(dvc_path=dvc_path) 41 | self.aws_conn_id = aws_conn_id 42 | self.bucket_name = bucket_name 43 | self.bucket_path = bucket_path 44 | 45 | def describe_source(self) -> str: 46 | return f"S3 {self.bucket_name}/{self.bucket_path}" 47 | 48 | def open(self): 49 | # Open connection to the S3 and download the file 50 | s3_hook = S3Hook(aws_conn_id=self.aws_conn_id) 51 | return StringIO( 52 | s3_hook.read_key( 53 | key=self.bucket_path, bucket_name=self.bucket_name 54 | ) 55 | ) 56 | 57 | def close(self, resource): 58 | # Closing is not necessary for S3 59 | pass 60 | -------------------------------------------------------------------------------- /airflow_dvc/dvc_download_operator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Airflow operator to upload files to DVC. 
3 | 4 | @Piotr Styczyński 2021 5 | """ 6 | from typing import Callable, List, Union 7 | 8 | from dvc_fs.dvc_download import DVCDownload 9 | 10 | from airflow.operators.python_operator import PythonOperator 11 | from airflow_dvc.dvc_hook import DVCHook 12 | from airflow_dvc.exceptions import add_log_exception_handler 13 | from airflow_dvc.logs import LOGS 14 | from airflow_dvc.stats import DVCDownloadMetadata 15 | 16 | Downloads = Union[List[DVCDownload], Callable[..., List[DVCDownload]]] 17 | 18 | TEMPLATE_FIELDS = ["files", "templates_dict", "op_args", "op_kwargs"] 19 | 20 | 21 | class DVCDownloadOperator(PythonOperator): 22 | """ 23 | Operator that downloads given DVC files. 24 | """ 25 | 26 | # Fields to apply Airflow templates 27 | template_fields = TEMPLATE_FIELDS 28 | 29 | dvc_repo: str # Clone URL for a GIT repo 30 | files: Downloads # List of files to be downloaded or function that returns it 31 | empty_fallback: bool # Create empty file if it does not exist remotely 32 | 33 | @property 34 | def affected_files(self) -> List[DVCDownload]: 35 | if callable(self.files): 36 | return [] 37 | return self.files 38 | 39 | def __init__( 40 | self, 41 | dvc_repo: str, 42 | files: Downloads, 43 | empty_fallback: bool = False, 44 | disable_error_message: bool = False, 45 | ignore_errors: bool = False, 46 | **kwargs, 47 | ) -> None: 48 | """ 49 | Creates Airflow download operator. 50 | 51 | :param dvc_repo: Git clone url for repo with configured DVC 52 | :param files: Files to be downloaded (please see DVCDownload class for more details) 53 | """ 54 | super().__init__( 55 | **kwargs, 56 | python_callable=add_log_exception_handler( 57 | self._execute_operator, 58 | disable_error_message=disable_error_message, 59 | ignore_errors=ignore_errors, 60 | ), 61 | ) 62 | self.dvc_repo = dvc_repo 63 | self.empty_fallback = empty_fallback 64 | self.files = files 65 | if not callable(self.files): 66 | for file in self.files: 67 | file.dvc_repo = dvc_repo 68 | self.template_fields = TEMPLATE_FIELDS 69 | 70 | def _execute_operator(self, *args, **kwargs) -> DVCDownloadMetadata: 71 | """ 72 | Perform the DVC downloads. 73 | """ 74 | files = self.files 75 | if callable(self.files): 76 | files = self.files(*args, **kwargs) 77 | dvc = DVCHook(self.dvc_repo) 78 | LOGS.dvc_download_operator.info( 79 | f"Download operator executed for files: {', '.join([file.dvc_path for file in files])}" 80 | ) 81 | meta = dvc.download( 82 | downloaded_files=files, 83 | empty_fallback=self.empty_fallback, 84 | ) 85 | LOGS.dvc_download_operator.info("Download completed.") 86 | return meta 87 | -------------------------------------------------------------------------------- /airflow_dvc/dvc_existence_sensor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Airflow sensor to wait for DVC files to exist. 
3 | 4 | @Piotr Styczyński 2021 5 | """ 6 | import inspect 7 | from typing import Callable, List, Union 8 | 9 | from airflow.sensors.python import PythonSensor 10 | from airflow_dvc.dvc_hook import DVCHook 11 | from airflow_dvc.exceptions import add_log_exception_handler 12 | from airflow_dvc.logs import LOGS 13 | 14 | FileListLike = Union[List[str], Callable[..., List[str]]] 15 | 16 | TEMPLATE_FIELDS = ["templates_dict", "op_args", "op_kwargs", "files"] 17 | 18 | 19 | class DVCExistenceSensor(PythonSensor): 20 | """ 21 | Sensor that waits for the file/-s to be present in the DVC 22 | """ 23 | 24 | dag_name: str # Name of the running DAG (to compare DAG start and file timestamps) 25 | dvc_repo: str # Git repo clone url 26 | files: FileListLike # Files to watch for 27 | instance_context: str 28 | 29 | # Fields to apply Airflow templates 30 | template_fields = TEMPLATE_FIELDS 31 | 32 | def __init__( 33 | self, 34 | dvc_repo: str, 35 | files: FileListLike, 36 | dag, 37 | disable_error_message: bool = False, 38 | ignore_errors: bool = False, 39 | *args, 40 | **kwargs, 41 | ): 42 | """ 43 | Airflow sensor will run exists(...) and check if the files exist. 44 | 45 | :param dvc_repo: Git clone URL for a repo with DVC configured 46 | :param files: Files to watch for 47 | :param dag: DAG object 48 | """ 49 | super().__init__( 50 | **kwargs, 51 | python_callable=add_log_exception_handler( 52 | self._poke, 53 | disable_error_message=disable_error_message, 54 | ignore_errors=ignore_errors, 55 | ), 56 | ) 57 | self.dag_name = dag.dag_id 58 | self.dvc_repo = dvc_repo 59 | self.files = files 60 | 61 | curframe = inspect.currentframe() 62 | caller = inspect.getouterframes(curframe, 2)[3] 63 | caller_path = caller.filename.split("/")[-1] 64 | self.instance_context = f"({caller_path}:{caller.lineno})" 65 | self.template_fields = TEMPLATE_FIELDS 66 | 67 | def _poke(self, *args, **kwargs): 68 | """ 69 | Implementation of the Airflow interface to check if the DAG should proceed. 70 | """ 71 | dvc = DVCHook(self.dvc_repo) 72 | files = self.files 73 | if callable(self.files): 74 | files = self.files(*args, **kwargs) 75 | # Check if given input files exist 76 | for file in files: 77 | if not dvc.exists(file): 78 | LOGS.dvc_existence_sensor.info( 79 | f"File {file} does not exist (sensor will wait)" 80 | ) 81 | # File do not exist so we do not proceed 82 | return False 83 | LOGS.dvc_existence_sensor.info( 84 | f"All files ({', '.join(files)}) exist so sensor will continue." 85 | ) 86 | return True 87 | -------------------------------------------------------------------------------- /airflow_dvc/dvc_hook.py: -------------------------------------------------------------------------------- 1 | """ 2 | High-level DVC client for building aced workflows. 
3 | 4 | @Piotr Styczyński 2021 5 | """ 6 | import datetime 7 | from dataclasses import dataclass 8 | from typing import Any, List, Optional 9 | 10 | from dvc_fs import Client as DVCClient 11 | 12 | from airflow.hooks.base import BaseHook 13 | from airflow.models.dag import DAG 14 | 15 | 16 | @dataclass 17 | class DVCCommit: 18 | """ 19 | Information about the commit created by the DVC operators 20 | """ 21 | 22 | dvc_repo: str # DVC repo URL 23 | dvc_repo_name: str # Same as above 24 | message: str # Commit message 25 | date: datetime.datetime # Commit time 26 | dag: DAG # DAG that triggered this commit 27 | files: List[str] # List of modified files 28 | sha: str # Commit sha 29 | commit_url: str # Commit URL 30 | 31 | 32 | class DVCHook(DVCClient, BaseHook): 33 | """ 34 | Interface for all high-level DVC operations. 35 | For low-level DVC operations please see DVCLocalCli class. 36 | """ 37 | 38 | def __init__( 39 | self, 40 | dvc_repo: str, 41 | ): 42 | """ 43 | :param dvc_repo: Clone URL for the GIT repo that has DVC configured 44 | """ 45 | super().__init__( 46 | dvc_repo, 47 | ) 48 | 49 | def get_conn(self) -> Any: 50 | return self 51 | 52 | def list_dag_commits( 53 | self, 54 | temp_path: Optional[str] = None, 55 | ) -> List[DVCCommit]: 56 | """ 57 | Returns list of all commits generated for the given DVC repository. 58 | 59 | :param temp_path: Optional temporary clone path 60 | :returns: List with commits generated by the DVC operators 61 | """ 62 | # TODO: Rewrite functionality here using dvc-fs 63 | return [] 64 | -------------------------------------------------------------------------------- /airflow_dvc/dvc_update_operator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Airflow operator to upload files to DVC. 3 | 4 | @Piotr Styczyński 2021 5 | """ 6 | import os 7 | from typing import Callable, List, Optional, Union 8 | 9 | from dvc_fs import DVCUpload 10 | 11 | from airflow.operators.python_operator import PythonOperator 12 | from airflow_dvc.dvc_hook import DVCHook 13 | from airflow_dvc.exceptions import add_log_exception_handler 14 | from airflow_dvc.logs import LOGS 15 | from airflow_dvc.stats import DVCUpdateMetadata 16 | 17 | Uploads = Union[List[DVCUpload], Callable[..., List[DVCUpload]]] 18 | 19 | TEMPLATE_FIELDS = [ 20 | "files", 21 | "commit_message", 22 | "temp_path", 23 | "templates_dict", 24 | "op_args", 25 | "op_kwargs", 26 | ] 27 | 28 | 29 | class DVCUpdateOperator(PythonOperator): 30 | """ 31 | Operator that allows DAGs to update DVC files. 32 | You can use it to upload various types of sources. 33 | For more information please see DVCUpload abstract class. 34 | """ 35 | 36 | # Fields to apply Airflow templates 37 | template_fields = TEMPLATE_FIELDS 38 | 39 | dvc_repo: str # Clone URL for a GIT repo 40 | files: Uploads # List of files to be uploaded or function that returns it 41 | commit_message: Optional[str] # Optional Git custom commit message 42 | temp_path: Optional[str] # Path to a temporary clone directory 43 | 44 | @property 45 | def affected_files(self) -> List[DVCUpload]: 46 | if callable(self.files): 47 | return [] 48 | return self.files 49 | 50 | def __init__( 51 | self, 52 | dvc_repo: str, 53 | files: Uploads, 54 | commit_message: Optional[str] = None, 55 | temp_path: Optional[str] = None, 56 | disable_error_message: bool = False, 57 | ignore_errors: bool = False, 58 | **kwargs, 59 | ) -> None: 60 | """ 61 | Creates Airflow upload operator. 
62 | 63 | :param dvc_repo: Git clone url for repo with configured DVC 64 | :param files: Files to be uploaded (please see DVCUpload class for more details) 65 | """ 66 | super().__init__( 67 | **kwargs, 68 | python_callable=add_log_exception_handler( 69 | self._execute_operator, 70 | disable_error_message=disable_error_message, 71 | ignore_errors=ignore_errors, 72 | ), 73 | ) 74 | self.dvc_repo = dvc_repo 75 | self.files = files 76 | self.commit_message = commit_message 77 | self.temp_path = temp_path 78 | if not callable(self.files): 79 | for file in self.files: 80 | file.dvc_repo = dvc_repo 81 | self.template_fields = TEMPLATE_FIELDS 82 | 83 | def _execute_operator(self, *args, **kwargs) -> DVCUpdateMetadata: 84 | """ 85 | Perform the DVC uploads. 86 | """ 87 | files = self.files 88 | if callable(self.files): 89 | files = self.files(*args, **kwargs) 90 | dvc = DVCHook(self.dvc_repo) 91 | LOGS.dvc_update_operator.info( 92 | f"Update operator executed for files: {', '.join([file.dvc_path for file in files])}" 93 | ) 94 | commit_message = self.commit_message 95 | if commit_message is None: 96 | file_list_str = ", ".join( 97 | [os.path.basename(file.dvc_path) for file in files] 98 | ) 99 | commit_message = ( 100 | f"DVC Automatically updated files: {file_list_str}" 101 | ) 102 | commit_message = f"{commit_message}\ndag: {self.dag_id}" 103 | meta = dvc.update( 104 | updated_files=files, 105 | commit_message=commit_message, 106 | ) 107 | LOGS.dvc_update_operator.info("Update completed.") 108 | return meta 109 | -------------------------------------------------------------------------------- /airflow_dvc/dvc_update_sensor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Airflow sensor to wait for DVC file changes. 3 | 4 | @Piotr Styczyński 2021 5 | """ 6 | import inspect 7 | from typing import List 8 | 9 | from airflow.models.dagrun import DagRun 10 | from airflow.sensors.python import PythonSensor 11 | from airflow_dvc.dvc_hook import DVCHook 12 | from airflow_dvc.exceptions import add_log_exception_handler 13 | from airflow_dvc.logs import LOGS 14 | 15 | TEMPLATE_FIELDS = ["templates_dict", "op_args", "op_kwargs", "files"] 16 | 17 | 18 | class DVCUpdateSensor(PythonSensor): 19 | """ 20 | Sensor that waits until the given path is updated in DVC. 21 | """ 22 | 23 | dag_name: str # Name of the running DAG (to compare DAG start and file timestamps) 24 | dvc_repo: str # Git repo clone url 25 | files: List[str] # Files to watch for 26 | instance_context: str 27 | 28 | # Fields to apply Airflow templates 29 | template_fields = TEMPLATE_FIELDS 30 | 31 | def __init__( 32 | self, 33 | dvc_repo: str, 34 | files: List[str], 35 | dag, 36 | disable_error_message: bool = False, 37 | ignore_errors: bool = False, 38 | *args, 39 | **kwargs, 40 | ): 41 | """ 42 | Airflow sensor will compare the timestamp of the current DAG run with the modification timestamps of the files 43 | tracked in DVC that are given as an input parameter. 
44 | 45 | :param dvc_repo: Git clone URL for a repo with DVC configured 46 | :param files: Files to watch for 47 | :param dag: DAG object 48 | """ 49 | super().__init__( 50 | **kwargs, 51 | python_callable=add_log_exception_handler( 52 | self._poke, 53 | disable_error_message=disable_error_message, 54 | ignore_errors=ignore_errors, 55 | ), 56 | ) 57 | self.dag_name = dag.dag_id 58 | self.dvc_repo = dvc_repo 59 | self.files = files 60 | 61 | curframe = inspect.currentframe() 62 | caller = inspect.getouterframes(curframe, 2)[3] 63 | caller_path = caller.filename.split("/")[-1] 64 | self.instance_context = f"({caller_path}:{caller.lineno})" 65 | self.template_fields = TEMPLATE_FIELDS 66 | 67 | def _poke(self, context): 68 | """ 69 | Implementation of the Airflow interface to check if the DAG should proceed. 70 | """ 71 | dag_runs = DagRun.find(dag_id=self.dag_name) 72 | length = len(dag_runs) 73 | # Query the latest start date of the DAG 74 | last_start_date = dag_runs[length - 1].start_date.replace(tzinfo=None) 75 | 76 | update = False 77 | dvc = DVCHook(self.dvc_repo) 78 | # Check modification dates of the given files 79 | for file in self.files: 80 | LOGS.dvc_update_sensor.info( 81 | f"Current date = {last_start_date} vs. file modified date {dvc.modified_date(file)}" 82 | ) 83 | if dvc.modified_date(file) >= last_start_date: 84 | LOGS.dvc_update_sensor.info("DVC sensor is active.") 85 | update = True 86 | break 87 | return update 88 | -------------------------------------------------------------------------------- /airflow_dvc/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Definitions of possible DVC errors 3 | 4 | @Piotr Styczyński 2021 5 | """ 6 | import traceback 7 | from typing import List, Optional 8 | 9 | from git import exc 10 | from semantic_version import SimpleSpec, Version 11 | 12 | from airflow_dvc.logs import LOGS 13 | 14 | 15 | class DVCFileMissingError(FileNotFoundError): 16 | """ 17 | DVC file is missing (remotely) 18 | """ 19 | 20 | repo: str 21 | file_path: str 22 | 23 | def __init__(self, repo: str, file_path: str): 24 | self.repo = repo 25 | self.file_path = file_path 26 | super().__init__( 27 | f"Missing DVC file {self.file_path} in repo {self.repo}" 28 | ) 29 | 30 | 31 | class DVCCliCommandError(Exception): 32 | """ 33 | DVC command in shell failed 34 | """ 35 | 36 | dvc_command: str 37 | dvc_output: Optional[str] 38 | dvc_exit_code: int 39 | execution_path: str 40 | 41 | def __init__( 42 | self, 43 | dvc_command: str, 44 | dvc_output: Optional[str], 45 | dvc_exit_code: int, 46 | execution_path: str, 47 | ): 48 | self.dvc_command = dvc_command 49 | self.dvc_output = dvc_output 50 | self.dvc_exit_code = dvc_exit_code 51 | self.execution_path = execution_path 52 | super().__init__( 53 | f"DVC Command {self.dvc_command} failed with status code {self.dvc_exit_code} and output: {self.dvc_output}" 54 | ) 55 | 56 | 57 | class DVCMissingExecutableError(Exception): 58 | """ 59 | DVC executable is missing (not callable from shell) 60 | """ 61 | 62 | def __init__(self): 63 | super().__init__( 64 | "DVC Python library is missing and DVC " 65 | "executable cannot be found in PATH. " 66 | "Please configure your system correctly " 67 | "by installing dvc Python package or application " 68 | "package suitable for your operating system." 
69 | ) 70 | 71 | 72 | class DVCGitRepoNotAccessibleError(Exception): 73 | """ 74 | Repository is not cloneable (access was denied or repository was not found) 75 | """ 76 | 77 | git_exception: exc.GitError 78 | repo: str 79 | 80 | def __init__(self, repo: str, git_exception: exc.GitError): 81 | self.git_exception = git_exception 82 | self.repo = repo 83 | super().__init__( 84 | f"Could not clone git repository: {repo}. Error: {git_exception}" 85 | ) 86 | 87 | 88 | class DVCGitUpdateError(Exception): 89 | """ 90 | Problem with committing changes via git 91 | """ 92 | 93 | git_exception: exc.GitError 94 | updated_files: List[str] 95 | repo: str 96 | 97 | def __init__( 98 | self, repo: str, updated_files: List[str], git_exception: exc.GitError 99 | ): 100 | self.git_exception = git_exception 101 | self.updated_files = updated_files 102 | self.repo = repo 103 | super().__init__( 104 | f"Cannot update DVC. Git push failed for repository " 105 | f"{self.repo} (upload files {', '.join(self.updated_files)}). " 106 | f"Error: {self.git_exception}" 107 | ) 108 | 109 | 110 | class DVCInvalidVersion(Exception): 111 | """ 112 | Installed DVC has invalid version 113 | """ 114 | 115 | version: Version 116 | constraint: SimpleSpec 117 | description: str 118 | 119 | def __init__( 120 | self, description: str, version: Version, constraint: SimpleSpec 121 | ): 122 | self.version = version 123 | self.description = description 124 | self.constraint = constraint 125 | super().__init__( 126 | f"{self.description}. Required version: {constraint}. Got: {version}" 127 | ) 128 | 129 | 130 | def add_log_exception_handler( 131 | fn, 132 | disable_error_message: bool = False, 133 | ignore_errors: bool = False, 134 | ): 135 | """ 136 | Utility that wraps function and adds log statement that prints all the error information 137 | and then reraises that error. 138 | """ 139 | inner_fn = fn 140 | 141 | def wrapped_fn(*args, **kwargs): 142 | try: 143 | return inner_fn(*args, **kwargs) 144 | except Exception as e: 145 | if not disable_error_message: 146 | error_trace = traceback.format_exc() 147 | LOGS.exceptions.error( 148 | f"Error was thrown inside the airflow-dvc code. " 149 | f"This is just a useful message to help with Airflow " 150 | f"pipeline debugging. The error will be reraised. Error message: {e}." 151 | f"Error trace: {error_trace}" 152 | ) 153 | if ignore_errors: 154 | return None 155 | raise e 156 | 157 | return wrapped_fn 158 | -------------------------------------------------------------------------------- /airflow_dvc/logs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logging utilities 3 | 4 | @Piotr Styczyński 2021 5 | """ 6 | import inspect 7 | import logging 8 | 9 | BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8) 10 | 11 | RESET_SEQ = "\033[0m" 12 | COLOR_SEQ = "\033[1;%dm" 13 | BOLD_SEQ = "\033[1m" 14 | 15 | COLORS = { 16 | "WARNING": YELLOW, 17 | "INFO": WHITE, 18 | "DEBUG": BLUE, 19 | "CRITICAL": YELLOW, 20 | "ERROR": RED, 21 | } 22 | 23 | 24 | def formatter_message(message: str, use_color: bool = True) -> str: 25 | """ 26 | Format message string. 27 | :param message: Input message 28 | :param use_color: Use colouring for output? 
29 | :return: Formatted message 30 | """ 31 | if use_color: 32 | message = message.replace("$RESET", RESET_SEQ).replace( 33 | "$BOLD", BOLD_SEQ 34 | ) 35 | for color_name in COLORS.keys(): 36 | message = message.replace( 37 | f"${color_name}", COLOR_SEQ % (30 + COLORS[color_name]) 38 | ) 39 | else: 40 | message = message.replace("$RESET", "").replace("$BOLD", "") 41 | return message 42 | 43 | 44 | class ColoredFormatter(logging.Formatter): 45 | """ 46 | Log formatter that supports nice terminal output colouring. 47 | """ 48 | 49 | def __init__(self, msg, use_color=True): 50 | logging.Formatter.__init__(self, msg) 51 | self.use_color = use_color 52 | 53 | def format(self, record) -> str: 54 | """ 55 | Format logging record 56 | :param record: Logging record 57 | :return: Formatted log line 58 | """ 59 | levelname = record.levelname 60 | if self.use_color and levelname in COLORS: 61 | levelname_color = ( 62 | COLOR_SEQ % (30 + COLORS[levelname]) + levelname + RESET_SEQ 63 | ) 64 | record.levelname = levelname_color 65 | return logging.Formatter.format(self, record) 66 | 67 | 68 | class UniversalLoggerSet: 69 | """ 70 | Collection of loggers. 71 | """ 72 | 73 | def __init__(self): 74 | self._loggers = dict() 75 | 76 | def get_logger(self, name): 77 | """ 78 | Get universal logger for a given name. 79 | If the logger does not exist yet, create it and add to the UniversalLoggerSet. 80 | If it exists already, just return it. 81 | :param name: Name of the loggger 82 | :return: Logger instance 83 | """ 84 | if name in self._loggers: 85 | return self._loggers[name] 86 | logger = logging.getLogger(name) 87 | handler = logging.StreamHandler() 88 | logger.addHandler(handler) 89 | logger.setLevel(logging.DEBUG) 90 | 91 | universal_logger = UniversalLogger(logger, name) 92 | self._loggers[name] = universal_logger 93 | return universal_logger 94 | 95 | def __getattr__(self, attr): 96 | return self.get_logger(attr) 97 | 98 | 99 | class UniversalLogger: 100 | """ 101 | Wrapper for Python logging library logger and set of useful logging utilities. 102 | """ 103 | 104 | def __init__(self, logger_instance, name): 105 | self._logger_instance = logger_instance 106 | self._enable_progress = True 107 | self._name = name 108 | 109 | def _prefromat_message(self, message, level): 110 | """ 111 | Format message before passing to the inner logger. 112 | :param message: Input message string 113 | :param level: Level of logging for that message 114 | :return: Preformatted message 115 | """ 116 | frame = inspect.currentframe() 117 | this_frame = frame # Save current frame. 118 | 119 | frame_no = 0 120 | while frame.f_back: 121 | frame = frame.f_back 122 | if frame_no == 1: 123 | this_frame = frame 124 | frame_no = frame_no + 1 125 | 126 | this_frame_info = inspect.getframeinfo(this_frame) 127 | filename = "/".join(this_frame_info.filename.split("/")[-1:]) 128 | lineno = this_frame_info.lineno 129 | 130 | level_block = "[$%s%-4s$RESET]" % (level, level.lower()[:4]) 131 | name_block = "[$BOLD%-15s$RESET]" % self._name 132 | source_block = "%-20s |" % f"{filename}:{lineno}" 133 | return formatter_message( 134 | f"{level_block} {name_block} {source_block} {message}" 135 | ) 136 | 137 | def is_progress_enabled(self): 138 | """ 139 | Check if displaying progress is enabled for this logger instance? 140 | :return: Is progress reporting enabled? 141 | """ 142 | return self._enable_progress 143 | 144 | def error(self, message): 145 | """ 146 | Log error message. 
147 | :param message: Message to be logged 148 | """ 149 | self._logger_instance.error(self._prefromat_message(message, "ERROR")) 150 | 151 | def info(self, message): 152 | """ 153 | Log info message. 154 | :param message: Message to be logged 155 | """ 156 | self._logger_instance.info(self._prefromat_message(message, "INFO")) 157 | 158 | def debug(self, message): 159 | """ 160 | Log debug message. 161 | :param message: Message to be logged 162 | """ 163 | self._logger_instance.debug(self._prefromat_message(message, "DEBUG")) 164 | 165 | 166 | LOGS = UniversalLoggerSet() 167 | -------------------------------------------------------------------------------- /airflow_dvc/plugin/__init__.py: -------------------------------------------------------------------------------- 1 | from .plugin import DVCPlugin 2 | 3 | __all__ = [ 4 | "DVCPlugin", 5 | ] 6 | -------------------------------------------------------------------------------- /airflow_dvc/plugin/git_url_parser.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | from collections import defaultdict 3 | 4 | from airflow_dvc.logs import LOGS 5 | 6 | from .platforms import PLATFORMS, PLATFORMS_MAP 7 | 8 | # Possible values to extract from a Git Url 9 | REQUIRED_ATTRIBUTES = ( 10 | "domain", 11 | "repo", 12 | ) 13 | 14 | 15 | class GitUrlParsed(object): 16 | def __init__(self, parsed_info): 17 | self._parsed = parsed_info 18 | 19 | # Set parsed objects as attributes 20 | for k, v in parsed_info.items(): 21 | setattr(self, k, v) 22 | 23 | def _valid_attrs(self): 24 | return all([getattr(self, attr, None) for attr in REQUIRED_ATTRIBUTES]) 25 | 26 | @property 27 | def valid(self): 28 | return all( 29 | [ 30 | self._valid_attrs(), 31 | ] 32 | ) 33 | 34 | @property 35 | def _platform_obj(self): 36 | return PLATFORMS_MAP[self.platform] 37 | 38 | ## 39 | # Alias properties 40 | ## 41 | @property 42 | def host(self): 43 | return self.domain 44 | 45 | @property 46 | def user(self): 47 | if hasattr(self, "_user"): 48 | return self._user 49 | 50 | return self.owner 51 | 52 | ## 53 | # Format URL to protocol 54 | ## 55 | def format(self, protocol): 56 | return self._platform_obj.FORMATS[protocol] % self._parsed 57 | 58 | ## 59 | # Normalize 60 | ## 61 | @property 62 | def normalized(self): 63 | return self.format(self.protocol) 64 | 65 | ## 66 | # Rewriting 67 | ## 68 | @property 69 | def url2ssh(self): 70 | return self.format("ssh") 71 | 72 | @property 73 | def url2http(self): 74 | return self.format("http") 75 | 76 | @property 77 | def url2https(self): 78 | return self.format("https") 79 | 80 | @property 81 | def url2git(self): 82 | return self.format("git") 83 | 84 | # All supported Urls for a repo 85 | @property 86 | def urls(self): 87 | return dict( 88 | (protocol, self.format(protocol)) 89 | for protocol in self._platform_obj.PROTOCOLS 90 | ) 91 | 92 | ## 93 | # Platforms 94 | ## 95 | @property 96 | def github(self): 97 | return self.platform == "github" 98 | 99 | @property 100 | def bitbucket(self): 101 | return self.platform == "bitbucket" 102 | 103 | @property 104 | def friendcode(self): 105 | return self.platform == "friendcode" 106 | 107 | @property 108 | def assembla(self): 109 | return self.platform == "assembla" 110 | 111 | @property 112 | def gitlab(self): 113 | return self.platform == "gitlab" 114 | 115 | ## 116 | # Get data as dict 117 | ## 118 | @property 119 | def data(self): 120 | return dict(self._parsed) 121 | 122 | 123 | SUPPORTED_ATTRIBUTES = ( 124 | "domain", 125 | "repo", 126 | 
"owner", 127 | "_user", 128 | "port", 129 | "url", 130 | "platform", 131 | "protocol", 132 | ) 133 | 134 | 135 | def _parse(url: str, check_domain: bool = True): 136 | # Values are None by default 137 | parsed_info = defaultdict(lambda: None) 138 | parsed_info["port"] = "" 139 | LOGS.git_url_parser.info(f"Parse GIT url: {url}") 140 | 141 | # Defaults to all attributes 142 | map(parsed_info.setdefault, SUPPORTED_ATTRIBUTES) 143 | 144 | for name, platform in PLATFORMS: 145 | for protocol, regex in platform.COMPILED_PATTERNS.items(): 146 | # Match current regex against URL 147 | match = regex.match(url) 148 | 149 | # Skip if not matched 150 | if not match: 151 | LOGS.git_url_parser.info( 152 | f"GIT url {url} not matched by {regex.pattern} for platform {platform}" 153 | ) 154 | continue 155 | 156 | # Skip if domain is bad 157 | domain = match.group("domain") 158 | if "@" in domain: 159 | domain = domain.split("@")[-1] 160 | LOGS.git_url_parser.info(f"GIT url domain is: {domain}") 161 | if check_domain: 162 | if platform.DOMAINS and not (domain in platform.DOMAINS): 163 | LOGS.git_url_parser.info( 164 | f"GIT url domain {domain} not listed in {' ,'.join(platform.DOMAINS)}" 165 | ) 166 | continue 167 | 168 | # Get matches as dictionary 169 | matches = match.groupdict() 170 | 171 | # Update info with matches 172 | parsed_info.update(matches) 173 | 174 | # add in platform defaults 175 | parsed_info.update(platform.DEFAULTS) 176 | 177 | # Update info with platform info 178 | parsed_info.update( 179 | { 180 | "url": url, 181 | "platform": name, 182 | "protocol": protocol, 183 | } 184 | ) 185 | LOGS.git_url_parser.info(f"Correctly parsed GIT url {url}") 186 | return parsed_info 187 | 188 | # Empty if none matched 189 | LOGS.git_url_parser.info(f"Invalid URL {url}") 190 | return parsed_info 191 | 192 | 193 | def parse(url, check_domain=True): 194 | return GitUrlParsed(_parse(url, check_domain)) 195 | 196 | 197 | def validate(url, check_domain=True): 198 | return parse(url, check_domain).valid 199 | -------------------------------------------------------------------------------- /airflow_dvc/plugin/platforms/__init__.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | from .assembla import AssemblaPlatform 3 | from .base import BasePlatform 4 | from .bitbucket import BitbucketPlatform 5 | from .friendcode import FriendCodePlatform 6 | from .github import GitHubPlatform 7 | from .gitlab import GitLabPlatform 8 | 9 | # Supported platforms 10 | PLATFORMS = ( 11 | # name -> Platform object 12 | ("github", GitHubPlatform()), 13 | ("bitbucket", BitbucketPlatform()), 14 | ("friendcode", FriendCodePlatform()), 15 | ("assembla", AssemblaPlatform()), 16 | ("gitlab", GitLabPlatform()), 17 | # Match url 18 | ("base", BasePlatform()), 19 | ) 20 | 21 | PLATFORMS_MAP = dict(PLATFORMS) 22 | -------------------------------------------------------------------------------- /airflow_dvc/plugin/platforms/assembla.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | from .base import BasePlatform 3 | 4 | 5 | class AssemblaPlatform(BasePlatform): 6 | DOMAINS = ("git.assembla.com",) 7 | PATTERNS = { 8 | "ssh": r"git@(?P.+):(?P.+).git", 9 | "git": r"git://(?P.+)/(?P.+).git", 10 | } 11 | FORMATS = { 12 | "ssh": r"git@%(domain)s:%(repo)s.git", 13 | "git": r"git://%(domain)s/%(repo)s.git", 14 | } 15 | DEFAULTS = {"_user": "git"} 16 | -------------------------------------------------------------------------------- 
/airflow_dvc/plugin/platforms/base.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | import re 3 | 4 | 5 | class BasePlatform(object): 6 | FORMATS = { 7 | "ssh": r"%(_user)s@%(host)s:%(repo)s.git", 8 | "http": r"http://%(host)s/%(repo)s.git", 9 | "https": r"https://%(host)s/%(repo)s.git", 10 | "git": r"git://%(host)s/%(repo)s.git", 11 | } 12 | 13 | PATTERNS = { 14 | "ssh": r"(?P<_user>.+)@(?P<domain>.+):(?P<repo>.+).git", 15 | "http": r"http://(?P<domain>.+)/(?P<repo>.+).git", 16 | "https": r"https://(?P<domain>.+)/(?P<repo>.+).git", 17 | "git": r"git://(?P<domain>.+)/(?P<repo>.+).git", 18 | } 19 | 20 | # None means it matches all domains 21 | DOMAINS = None 22 | DEFAULTS = {} 23 | 24 | def __init__(self): 25 | # Precompile PATTERNS 26 | self.COMPILED_PATTERNS = dict( 27 | (proto, re.compile(regex)) 28 | for proto, regex in self.PATTERNS.items() 29 | ) 30 | 31 | # Supported protocols 32 | self.PROTOCOLS = self.PATTERNS.keys() 33 | -------------------------------------------------------------------------------- /airflow_dvc/plugin/platforms/bitbucket.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | from .base import BasePlatform 3 | 4 | 5 | class BitbucketPlatform(BasePlatform): 6 | PATTERNS = { 7 | "https": r"https://(?P<_user>.+)@(?P<domain>.+)/(?P<owner>.+)/(?P<repo>.+).git", 8 | "ssh": r"git@(?P<domain>.+):(?P<owner>.+)/(?P<repo>.+).git", 9 | } 10 | FORMATS = { 11 | "https": r"https://%(owner)s@%(domain)s/%(owner)s/%(repo)s.git", 12 | "ssh": r"git@%(domain)s:%(owner)s/%(repo)s.git", 13 | } 14 | DOMAINS = ("bitbucket.org",) 15 | DEFAULTS = {"_user": "git"} 16 | -------------------------------------------------------------------------------- /airflow_dvc/plugin/platforms/friendcode.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | from .base import BasePlatform 3 | 4 | 5 | class FriendCodePlatform(BasePlatform): 6 | DOMAINS = ("friendco.de",) 7 | PATTERNS = { 8 | "https": r"https://(?P<domain>.+)/(?P<owner>.+)@user/(?P<repo>.+).git", 9 | } 10 | FORMATS = { 11 | "https": r"https://%(domain)s/%(owner)s@user/%(repo)s.git", 12 | } 13 | -------------------------------------------------------------------------------- /airflow_dvc/plugin/platforms/github.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | from .base import BasePlatform 3 | 4 | 5 | class GitHubPlatform(BasePlatform): 6 | PATTERNS = { 7 | "https": r"https://(?P<domain>.+)/(?P<owner>.+)/(?P<repo>.+)(.git)?", 8 | "ssh": r"git@(?P<domain>.+):(?P<owner>.+)/(?P<repo>.+)(.git)?", 9 | "git": r"git://(?P<domain>.+)/(?P<owner>.+)/(?P<repo>.+)(.git)?", 10 | } 11 | FORMATS = { 12 | "https": r"https://%(domain)s/%(owner)s/%(repo)s(.git)?", 13 | "ssh": r"git@%(domain)s:%(owner)s/%(repo)s(.git)?", 14 | "git": r"git://%(domain)s/%(owner)s/%(repo)s(.git)?", 15 | } 16 | DOMAINS = ( 17 | "github.com", 18 | "gist.github.com", 19 | ) 20 | DEFAULTS = {"_user": "git"} 21 | -------------------------------------------------------------------------------- /airflow_dvc/plugin/platforms/gitlab.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | from .base import BasePlatform 3 | 4 | 5 | class GitLabPlatform(BasePlatform): 6 | PATTERNS = { 7 | "https": r"https://(?P<domain>.+)/(?P<owner>.+)/(?P<repo>.+).git", 8 | "ssh": r"git@(?P<domain>.+):(?P<owner>.+)/(?P<repo>.+).git", 9 | "git": r"git://(?P<domain>.+)/(?P<owner>.+)/(?P<repo>.+).git", 10 | } 11 | FORMATS = { 12 | "https": r"https://%(domain)s/%(owner)s/%(repo)s.git", 13 | "ssh": r"git@%(domain)s:%(owner)s/%(repo)s.git", 14 | "git": r"git://%(domain)s/%(owner)s/%(repo)s.git", 15 |
} 16 | DEFAULTS = {"_user": "git"} 17 | -------------------------------------------------------------------------------- /airflow_dvc/plugin/plugin.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from dataclasses import dataclass 3 | from typing import Dict, List, Set, Union 4 | 5 | from flask import Blueprint, request 6 | from flask_appbuilder import BaseView as AppBuilderBaseView 7 | from flask_appbuilder import expose 8 | 9 | from airflow.models.dagbag import DagBag 10 | from airflow.plugins_manager import AirflowPlugin 11 | from airflow_dvc import (DVCCommit, DVCDownloadOperator, DVCHook, 12 | DVCUpdateOperator, DVCUpdateSensor) 13 | 14 | from .git_url_parser import parse as parse_git_url 15 | 16 | AnyDVCOperator = Union[DVCDownloadOperator, DVCUpdateOperator, DVCUpdateSensor] 17 | 18 | 19 | @dataclass(frozen=True) 20 | class DVCTargetInfo: 21 | dvc_repo: str 22 | dvc_repo_owner: str 23 | dvc_repo_name: str 24 | upload_connected: bool 25 | download_connected: bool 26 | sensors_connected: bool 27 | uploads: List[DVCUpdateOperator] 28 | downloads: List[DVCDownloadOperator] 29 | sensors: List[DVCUpdateSensor] 30 | 31 | 32 | class AppBuilderDVCPushesView(AppBuilderBaseView): 33 | @expose("/list", methods=["GET", "POST"]) 34 | def list(self): 35 | """ 36 | DVC Pushes view displays information about commits generated by the DVC operators. 37 | """ 38 | operators: List[AnyDVCOperator] = [] 39 | for dag in DagBag().dags.values(): 40 | for task in dag.tasks: 41 | if ( 42 | isinstance(task, DVCDownloadOperator) 43 | or isinstance(task, DVCUpdateOperator) 44 | or isinstance(task, DVCUpdateSensor) 45 | ): 46 | setattr(task, "dag", dag) 47 | operators.append(task) 48 | 49 | repos: Dict[str, List[AnyDVCOperator]] = defaultdict(list) 50 | for operator in operators: 51 | repos[operator.dvc_repo].append(operator) 52 | 53 | all_commits: List[DVCCommit] = [] 54 | for repo in repos.keys(): 55 | hook = DVCHook(repo) 56 | all_commits += hook.list_dag_commits() 57 | for commit in all_commits: 58 | repo_url_info = parse_git_url(commit.dvc_repo) 59 | target_name = f"{repo_url_info.owner}/{repo_url_info.repo}" 60 | commit.dvc_repo_name = target_name 61 | 62 | return self.render_template( 63 | "dvc/pushes.html", 64 | all_commits=all_commits, 65 | ) 66 | 67 | 68 | class AppBuilderDVCTargetsView(AppBuilderBaseView): 69 | @expose("/list", methods=["GET", "POST"]) 70 | def list(self): 71 | """ 72 | DVC Targets view displays a listing of all DVC operators used across all DAGs. 73 | """ 74 | operator_type = request.args.get("operator_type") 75 | if operator_type is None: 76 | operator_type = "all" 77 | elif operator_type not in ["downloads", "uploads", "sensors"]: 78 | operator_type = "all" 79 | 80 | operators: List[AnyDVCOperator] = [] 81 | for dag in DagBag().dags.values(): 82 | for task in dag.tasks: 83 | if ( 84 | isinstance(task, DVCDownloadOperator) 85 | or isinstance(task, DVCUpdateOperator) 86 | or isinstance(task, DVCUpdateSensor) 87 | ): 88 | setattr(task, "dag", dag) 89 | operators.append(task) 90 | 91 | repos: Dict[str, List[AnyDVCOperator]] = defaultdict(list) 92 | for operator in operators: 93 | repos[operator.dvc_repo].append(operator) 94 | targets_info: List[DVCTargetInfo] = [] 95 | 96 | uploads_ref_files_count = 0 97 | downloads_ref_files_count = 0 98 | sensors_ref_files_count = 0 99 | 100 | dvc_diagram_free_id = 1 101 | dvc_diagram_nodes: Dict[str, dict] = dict() 102 | dvc_diagram_edges: Dict[int, Set[int]] =
defaultdict(set) 103 | 104 | for target in repos.keys(): 105 | repo_url_info = parse_git_url(target) 106 | target_name = f"{repo_url_info.owner}/{repo_url_info.repo}" 107 | if target_name in dvc_diagram_nodes: 108 | target_node_id = dvc_diagram_nodes[target_name]["id"] 109 | else: 110 | target_node_id = dvc_diagram_free_id 111 | dvc_diagram_free_id += 1 112 | for operator in repos[target]: 113 | dag_id = operator.dag.dag_id 114 | if dag_id in dvc_diagram_nodes: 115 | dag_node_id = dvc_diagram_nodes[dag_id]["id"] 116 | else: 117 | dag_node_id = dvc_diagram_free_id 118 | dvc_diagram_free_id += 1 119 | add = False 120 | if isinstance( 121 | operator, DVCUpdateOperator 122 | ) and operator_type in ["all", "uploads"]: 123 | dvc_diagram_edges[dag_node_id].add(target_node_id) 124 | add = True 125 | elif isinstance( 126 | operator, DVCDownloadOperator 127 | ) and operator_type in ["all", "downloads"]: 128 | dvc_diagram_edges[target_node_id].add(dag_node_id) 129 | add = True 130 | if add: 131 | dvc_diagram_nodes[dag_id] = dict( 132 | id=dag_node_id, label=dag_id, dag=True 133 | ) 134 | dvc_diagram_nodes[target_name] = dict( 135 | id=target_node_id, label=target_name, dag=False 136 | ) 137 | 138 | for target in repos.keys(): 139 | repo_url_info = parse_git_url(target) 140 | uploads = [ 141 | op for op in repos[target] if isinstance(op, DVCUpdateOperator) 142 | ] 143 | downloads = [ 144 | op 145 | for op in repos[target] 146 | if isinstance(op, DVCDownloadOperator) 147 | ] 148 | sensors = [ 149 | op for op in repos[target] if isinstance(op, DVCUpdateSensor) 150 | ] 151 | targets_info.append( 152 | DVCTargetInfo( 153 | dvc_repo=target, 154 | dvc_repo_owner=repo_url_info.owner, 155 | dvc_repo_name=repo_url_info.repo, 156 | upload_connected=any( 157 | [ 158 | isinstance(op, DVCUpdateOperator) 159 | for op in repos[target] 160 | ] 161 | ), 162 | download_connected=any( 163 | [ 164 | isinstance(op, DVCDownloadOperator) 165 | for op in repos[target] 166 | ] 167 | ), 168 | sensors_connected=any( 169 | [ 170 | isinstance(op, DVCUpdateSensor) 171 | for op in repos[target] 172 | ] 173 | ), 174 | uploads=uploads, 175 | downloads=downloads, 176 | sensors=sensors, 177 | ) 178 | ) 179 | 180 | uploads_ref_files_count += sum( 181 | [max(len(operator.affected_files), 1) for operator in uploads] 182 | ) 183 | downloads_ref_files_count += sum( 184 | [ 185 | max(len(operator.affected_files), 1) 186 | for operator in downloads 187 | ] 188 | ) 189 | sensors_ref_files_count += sum( 190 | [max(len(operator.files), 1) for operator in sensors] 191 | ) 192 | 193 | return self.render_template( 194 | "dvc/list.html", 195 | targets_info=targets_info, 196 | operator_type=operator_type, 197 | uploads_ref_files_count=uploads_ref_files_count, 198 | downloads_ref_files_count=downloads_ref_files_count, 199 | sensors_ref_files_count=sensors_ref_files_count, 200 | dvc_diagram_nodes=dvc_diagram_nodes, 201 | dvc_diagram_edges=dvc_diagram_edges, 202 | ) 203 | 204 | 205 | v_appbuilder_view = AppBuilderDVCTargetsView() 206 | v_appbuilder_package = dict( 207 | name="DVC Operators", 208 | category="Browse", 209 | view=v_appbuilder_view, 210 | ) 211 | 212 | v_dvc_pushes = AppBuilderDVCPushesView() 213 | v_dvc_pushes_view = dict( 214 | name="DVC Pushes", 215 | category="Browse", 216 | view=v_dvc_pushes, 217 | ) 218 | 219 | dag_creation_manager_bp = Blueprint( 220 | "dag_creation_manager_bp", 221 | __name__, 222 | template_folder="templates", 223 | static_folder="static", 224 | static_url_path="/static/dvc", 225 | ) 226 | 227 | 228 | # Defining
the plugin class 229 | class DVCPlugin(AirflowPlugin): 230 | """ 231 | DVC Airflow plugin 232 | """ 233 | 234 | name = "dvc_plugin" 235 | flask_blueprints = [dag_creation_manager_bp] 236 | admin_views = ( 237 | [] 238 | ) # if we don't have RBAC we use this view and can comment out the next line 239 | appbuilder_views = [ 240 | v_appbuilder_package, 241 | v_dvc_pushes_view, 242 | ] # if we use RBAC we use this view and can comment out the previous line 243 | hooks = [DVCHook] 244 | -------------------------------------------------------------------------------- /airflow_dvc/plugin/templates/dvc/list.html: -------------------------------------------------------------------------------- 1 | {% extends "airflow/main.html" %} 2 | 3 | {% block content %} 4 | 7 | 11 | 12 | 19 | 29 |
30 | 48 |
49 | {% if operator_type != "sensors" %} 50 |
51 |
52 |
53 | {% endif %} 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | {% if ( operator_type == "all" or operator_type == "uploads" ) %} 66 | {% for target in targets_info %} 67 | {% for operator in target.uploads %} 68 | {% if operator.affected_files|length > 0 %} 69 | {% for file in operator.affected_files %} 70 | 71 | 74 | 75 | 80 | 81 | 91 | 92 | 95 | 96 | 99 | 100 | {% endfor %} 101 | {% else %} 102 | 103 | 106 | 107 | 112 | 113 | 123 | 124 | 127 | 128 | 131 | 132 | {% endif %} 133 | {% endfor %} 134 | {% endfor %} 135 | {% endif %} 136 | 137 | {% if ( operator_type == "all" or operator_type == "downloads" ) %} 138 | {% for target in targets_info %} 139 | {% for operator in target.downloads %} 140 | {% if operator.affected_files|length > 0 %} 141 | {% for file in operator.affected_files %} 142 | 143 | 146 | 147 | 152 | 153 | 163 | 164 | 167 | 168 | 171 | 172 | {% endfor %} 173 | {% else %} 174 | 175 | 178 | 179 | 184 | 185 | 195 | 196 | 199 | 200 | 203 | 204 | {% endif %} 205 | {% endfor %} 206 | {% endfor %} 207 | {% endif %} 208 | 209 | {% if ( operator_type == "all" or operator_type == "sensors" ) %} 210 | {% for target in targets_info %} 211 | {% for operator in target.sensors %} 212 | {% for file in operator.files %} 213 | 214 | 217 | 218 | 223 | 224 | 234 | 235 | 238 | 239 | 242 | 243 | {% endfor %} 244 | {% endfor %} 245 | {% endfor %} 246 | {% endif %} 247 | 248 |
Type DAG Repo Source Target
72 | Upload 73 | 76 | 77 | {{ operator.dag.dag_id }} 78 | 79 | 82 | 83 | 84 | {{ target.dvc_repo_owner }} / {{ target.dvc_repo_name }} 85 | 86 | 87 |
88 | 89 |
90 |
93 | {{ file.describe_source() }} 94 | 97 | {{ file.dvc_path }} 98 |
104 | Upload 105 | 108 | 109 | {{ operator.dag.dag_id }} 110 | 111 | 114 | 115 | 116 | {{ target.dvc_repo_owner }} / {{ target.dvc_repo_name }} 117 | 118 | 119 |
120 | 121 |
122 |
125 | {{ file.describe_source() }} 126 | 129 | Dynamic 130 |
144 | Download 145 | 148 | 149 | {{ operator.dag.dag_id }} 150 | 151 | 154 | 155 | 156 | {{ target.dvc_repo_owner }} / {{ target.dvc_repo_name }} 157 | 158 | 159 |
160 | 161 |
162 |
165 | {{ file.describe_target() }} 166 | 169 | {{ file.dvc_path }} 170 |
176 | Download 177 | 180 | 181 | {{ operator.dag.dag_id }} 182 | 183 | 186 | 187 | 188 | {{ target.dvc_repo_owner }} / {{ target.dvc_repo_name }} 189 | 190 | 191 |
192 | 193 |
194 |
197 | {{ file.describe_target() }} 198 | 201 | Dynamic 202 |
215 | Sensor 216 | 219 | 220 | {{ operator.dag.dag_id }} 221 | 222 | 225 | 226 | 227 | {{ target.dvc_repo_owner }} / {{ target.dvc_repo_name }} 228 | 229 | 230 |
231 | 232 |
233 |
236 | N/A 237 | 240 | {{ file }} 241 |
249 |
250 |
251 | 252 | 314 | {% endblock %} 315 | -------------------------------------------------------------------------------- /airflow_dvc/plugin/templates/dvc/pushes.html: -------------------------------------------------------------------------------- 1 | {% extends "airflow/main.html" %} 2 | 3 | {% block content %} 4 | 7 | 11 | 12 | 19 | 29 |
30 |
31 |
32 |
33 | 34 |
35 |
36 |
37 |
38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | {% for commit in all_commits %} 51 | 52 | 59 | 64 | 67 | 70 | 79 | 80 | 85 | 86 | {% endfor %} 87 | 88 |
Repo DAG Message Date Files Commit
53 | 54 | 55 | {{ commit.dvc_repo_name }} 56 | 57 | 58 | 60 | 61 | {{ commit.dag.dag_id }} 62 | 63 | 65 | {{ commit.message }} 66 | 68 | {{ commit.date }} 69 | 71 |
    72 | {% for file in commit.files %} 73 |
  • 74 | {{ file }} 75 |
  • 76 | {% endfor %} 77 |
78 |
81 | 82 | {{ commit.sha }} 83 | 84 |
89 |
90 |
91 | {% endblock %} 92 | -------------------------------------------------------------------------------- /airflow_dvc/stats.py: -------------------------------------------------------------------------------- 1 | """ 2 | Definitions of metadata containers that represent information about operator execution. 3 | 4 | @Piotr Styczyński 2021 5 | """ 6 | 7 | from dataclasses import dataclass 8 | from typing import List, Optional 9 | 10 | 11 | @dataclass(frozen=True) 12 | class DVCUpdateMetadata: 13 | """ 14 | Additional information about a performed update operation 15 | """ 16 | 17 | dvc_repo: str 18 | dvc_files_updated: List[str] 19 | dvc_files_update_requested: List[str] 20 | dag_id: str 21 | commit_message: Optional[str] 22 | temp_path: Optional[str] 23 | commit_hexsha: Optional[str] 24 | committed_date: Optional[int] 25 | duration: float # seconds, measured with time.time() 26 | 27 | 28 | @dataclass(frozen=True) 29 | class DVCDownloadMetadata: 30 | """ 31 | Additional information about the performed download operation 32 | """ 33 | 34 | dvc_repo: str 35 | downloaded_dvc_files: List[str] 36 | downloaded_dvc_files_sizes: List[int] 37 | duration: float # seconds, measured with time.time() 38 | -------------------------------------------------------------------------------- /airflow_dvc/test_utils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from random import choices 3 | 4 | from airflow import DAG 5 | from airflow.models import BaseOperator, TaskInstance 6 | 7 | 8 | def execute_test_task(operator: BaseOperator, *args, **kwargs): 9 | post_fix = "".join(str(e) for e in choices(population=range(10), k=10)) 10 | dag = DAG(dag_id=f"test_dag{post_fix}", start_date=datetime.now()) 11 | task = operator(dag=dag, task_id=f"test_task{post_fix}") 12 | ti = TaskInstance(task=task, execution_date=datetime.now()) 13 | result = task.prepare_for_execution().execute(ti.get_template_context()) 14 | return result 15 | -------------------------------------------------------------------------------- /airflow_dvc/tests/helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | from random import choices 4 | 5 | from airflow import DAG 6 | from airflow.models import BaseOperator, TaskInstance 7 | 8 | OS_ENV = dict(GIT_TOKEN="", SLACK_TOKEN="", REPO="") 9 | 10 | 11 | def execute_test_task(operator: BaseOperator, *args, **kwargs): 12 | post_fix = "".join(str(e) for e in choices(population=range(10), k=10)) 13 | dag = DAG(dag_id=f"test_dag{post_fix}", start_date=datetime.now()) 14 | task = operator(dag=dag, task_id=f"test_task{post_fix}") 15 | ti = TaskInstance(task=task, execution_date=datetime.now()) 16 | result = task.prepare_for_execution().execute(ti.get_template_context()) 17 | return result 18 | 19 | 20 | def fake_env(): 21 | for key in OS_ENV: 22 | os.environ[key] = OS_ENV[key] 23 | -------------------------------------------------------------------------------- /example/dags/dvc_download_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example usage of the DVC download operator (downloading a file) in an advanced Airflow DAG.
3 | 4 | @Piotr Styczyński 2021 5 | """ 6 | import os 7 | from datetime import datetime, timedelta 8 | 9 | from airflow import DAG 10 | from airflow_dvc import DVCDownloadOperator, DVCPathDownload 11 | 12 | # Default settings applied to all tasks 13 | default_args = { 14 | "owner": "airflow", 15 | "depends_on_past": False, 16 | "email_on_failure": False, 17 | "email_on_retry": False, 18 | "retries": 1, 19 | "retry_delay": timedelta(minutes=5), 20 | } 21 | 22 | # Using a DAG context manager, you don't have to specify the dag property of each task 23 | with DAG( 24 | "dvc_download_example", 25 | start_date=datetime(2019, 1, 1), 26 | max_active_runs=1, 27 | default_args=default_args, 28 | catchup=False, 29 | ) as dag: 30 | 31 | download_task = DVCDownloadOperator( 32 | dvc_repo=os.environ["REPO"], 33 | files=[ 34 | DVCPathDownload( 35 | "non_existing_path/data.txt", 36 | "output_file.txt", 37 | ), 38 | ], 39 | task_id="download_task", 40 | empty_fallback=True, 41 | ) 42 | -------------------------------------------------------------------------------- /example/dags/dvc_existence_sensor_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example usage of the DVC existence sensor in the Airflow DAG. 3 | 4 | @Piotr Styczyński 2021 5 | """ 6 | import os 7 | from datetime import datetime 8 | 9 | from airflow import DAG 10 | from airflow.operators.bash_operator import BashOperator 11 | from airflow.operators.dummy_operator import DummyOperator 12 | from airflow_dvc import DVCExistenceSensor 13 | 14 | with DAG( 15 | "dvc_existence_sensor_example", 16 | description="Existence sensor example", 17 | start_date=datetime(2017, 3, 20), 18 | catchup=False, 19 | ) as dag: 20 | 21 | dummy_task = DummyOperator(task_id="dummy_task", dag=dag) 22 | 23 | sensor_task_missing = DVCExistenceSensor( 24 | task_id="sensor_task_missing", 25 | dag=dag, 26 | dvc_repo=os.environ["REPO"], 27 | files=["gisaid/some_missing_file.txt"], 28 | ) 29 | 30 | sensor_task_exists = DVCExistenceSensor( 31 | task_id="sensor_task_exists", 32 | dag=dag, 33 | dvc_repo=os.environ["REPO"], 34 | files=["gisaid/all.fasta"], 35 | ) 36 | 37 | task_for_existing_file = BashOperator( 38 | task_id="task_for_existing_file", 39 | bash_command='echo "OK" && ( echo $[ ( $RANDOM % 30 ) + 1 ] > meowu.txt ) && cat meowu.txt', 40 | ) 41 | 42 | task_for_missing_file = BashOperator( 43 | task_id="task_for_missing_file", 44 | bash_command='echo "OK" && ( echo $[ ( $RANDOM % 30 ) + 1 ] > meowu.txt ) && cat meowu.txt', 45 | ) 46 | 47 | final_task = DummyOperator(task_id="final_task", dag=dag) 48 | 49 | dummy_task >> sensor_task_exists >> task_for_existing_file 50 | dummy_task >> sensor_task_missing >> task_for_missing_file 51 | [task_for_existing_file, task_for_missing_file] >> final_task # both branches join into final_task 52 | -------------------------------------------------------------------------------- /example/dags/dvc_update_sensor_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example usage of the DVC update sensor in the Airflow DAG.
3 | 4 | @Piotr Styczyński 2021 5 | """ 6 | import os 7 | from datetime import datetime 8 | 9 | from airflow import DAG 10 | from airflow.operators.bash_operator import BashOperator 11 | from airflow.operators.dummy_operator import DummyOperator 12 | from airflow_dvc import DVCUpdateSensor 13 | 14 | with DAG( 15 | "dvc_update_sensor_example", 16 | description="DVC update sensor example", 17 | start_date=datetime(2017, 3, 20), 18 | catchup=False, 19 | ) as dag: 20 | 21 | dummy_task = DummyOperator(task_id="dummy_task", dag=dag) 22 | 23 | sensor_task = DVCUpdateSensor( 24 | task_id="dvc_sensor_task", 25 | dag=dag, 26 | dvc_repo=os.environ["REPO"], 27 | files=["data/1.txt"], 28 | ) 29 | 30 | task = BashOperator( 31 | task_id="task_triggered_by_sensor", 32 | bash_command='echo "OK" && ( echo $[ ( $RANDOM % 30 ) + 1 ] > meowu.txt ) && cat meowu.txt', 33 | ) 34 | 35 | dummy_task >> sensor_task >> task 36 | -------------------------------------------------------------------------------- /example/dags/dvc_upload_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example usage of the DVC upload operator (uploading a string) in an advanced Airflow DAG. 3 | 4 | @Piotr Styczyński 2021 5 | """ 6 | import os 7 | from datetime import datetime, timedelta 8 | 9 | from airflow import DAG 10 | from airflow.operators.bash_operator import BashOperator 11 | from airflow.operators.dummy_operator import DummyOperator 12 | from airflow.operators.python_operator import PythonOperator 13 | from airflow.version import version 14 | from airflow_dvc import DVCStringUpload, DVCUpdateOperator 15 | 16 | 17 | def custom_python_task_handler(ts, **kwargs): 18 | print( 19 | f"I am task number {kwargs['task_number']}. " 20 | f"This DAG Run execution date is {ts} and the " 21 | f"current time is {datetime.now()}" 22 | ) 23 | print( 24 | "Here is the full DAG Run context. It is available " 25 | "because provide_context=True" 26 | ) 27 | print(kwargs) 28 | 29 | 30 | # Default settings applied to all tasks 31 | default_args = { 32 | "owner": "airflow", 33 | "depends_on_past": False, 34 | "email_on_failure": False, 35 | "email_on_retry": False, 36 | "retries": 1, 37 | "retry_delay": timedelta(minutes=5), 38 | } 39 | 40 | # Using a DAG context manager, you don't have to specify the dag property of each task 41 | with DAG( 42 | "dvc_upload_example", 43 | start_date=datetime(2019, 1, 1), 44 | max_active_runs=1, 45 | default_args=default_args, 46 | catchup=False, 47 | ) as dag: 48 | 49 | t0 = DummyOperator(task_id="sample-task") 50 | 51 | t1 = DummyOperator(task_id="group_bash_tasks") 52 | 53 | t2 = BashOperator( 54 | task_id="bash_task1", 55 | bash_command='echo "OK" && ( echo $[ ( $RANDOM % 30 ) + 1 ] > meowu.txt ) && cat meowu.txt', 56 | ) 57 | 58 | t3 = BashOperator( 59 | task_id="bash_task2", 60 | bash_command="sleep $[ ( $RANDOM % 30 ) + 1 ]s && date", 61 | ) 62 | 63 | upload_task = DVCUpdateOperator( 64 | dvc_repo=os.environ["REPO"], 65 | files=[ 66 | DVCStringUpload( 67 | "data/1.txt", 68 | f"This will be saved into DVC. Current time: {datetime.now()}", 69 | ), 70 | ], 71 | task_id="update_dvc", 72 | ) 73 | 74 | # generate tasks with a loop.
task_id must be unique 75 | for task in range(5): 76 | if version.startswith("2"): 77 | tn = PythonOperator( 78 | task_id=f"python_custom_task_{task}", 79 | python_callable=custom_python_task_handler, # make sure you don't include the () of the function 80 | op_kwargs={"task_number": task}, 81 | ) 82 | else: 83 | tn = PythonOperator( 84 | task_id=f"python_custom_task_{task}", 85 | python_callable=custom_python_task_handler, # make sure you don't include the () of the function 86 | op_kwargs={"task_number": task}, 87 | provide_context=True, 88 | ) 89 | 90 | t0 >> tn 91 | 92 | t0 >> t1 93 | t2 >> upload_task 94 | t1 >> [t2, t3] 95 | -------------------------------------------------------------------------------- /example/dags/dvc_upload_with_template_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example usage of the DVC upload operator (uploading a Jinja-templated string) in an advanced Airflow DAG. 3 | 4 | @Piotr Styczyński 2021 5 | """ 6 | import os 7 | from datetime import datetime, timedelta 8 | 9 | from airflow import DAG 10 | from airflow_dvc import DVCStringUpload, DVCUpdateOperator 11 | 12 | # Default settings applied to all tasks 13 | default_args = { 14 | "owner": "airflow", 15 | "depends_on_past": False, 16 | "email_on_failure": False, 17 | "email_on_retry": False, 18 | "retries": 1, 19 | "retry_delay": timedelta(minutes=5), 20 | } 21 | 22 | # Using a DAG context manager, you don't have to specify the dag property of each task 23 | with DAG( 24 | "dvc_upload_with_template_example", 25 | start_date=datetime(2019, 1, 1), 26 | max_active_runs=1, 27 | default_args=default_args, 28 | catchup=False, 29 | ) as dag: 30 | 31 | upload_task = DVCUpdateOperator( 32 | dvc_repo=os.environ["REPO"], 33 | files=[ 34 | DVCStringUpload( 35 | "data/{{ yesterday_ds_nodash }}.txt", 36 | "This is a Jinja Airflow template. " 37 | "DAG date: {{ yesterday_ds_nodash }}", 38 | ), 39 | ], 40 | task_id="update_dvc", 41 | ) 42 | -------------------------------------------------------------------------------- /example/plugins/dvc.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from airflow_dvc import DVCPlugin 3 | -------------------------------------------------------------------------------- /install_deps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python3 -m pip install --requirement <(poetry export --dev --format requirements.txt) 4 | python3 -m pip install --no-deps . 5 | python3 -m pip install apache-airflow-providers-amazon 6 | 7 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "cluster_deployments": [], 3 | "dependencies": {}, 4 | "devDependencies": {}, 5 | "name": "airflow-dvc", 6 | "private": false, 7 | "scripts": { 8 | "build": "poetry build", 9 | "build:package": "yarn run build", 10 | "clean": "rm -rfd dist", 11 | "configure": "poetry config --local virtualenvs.in-project true && poetry config --local virtualenvs.path $(pwd)/.venv", 12 | "dependencies:install": "poetry install", 13 | "dependencies:update": "poetry update", 14 | "deploy": "poetry run publish", 15 | "deploy:package": "yarn run deploy", 16 | "lint": "poetry run black . && poetry run isort .
&& poetry run flakehell lint", 17 | "test": "poetry run pytest -n 4", 18 | "version:bump": "poetry run bump2version --no-tag --no-commit patch" 19 | }, 20 | "version": "1.9.9" 21 | } -------------------------------------------------------------------------------- /publish.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script to publish Python package. 3 | Run via "poetry run publish" 4 | 5 | @Piotr Styczyński 2021 6 | """ 7 | from pathlib import Path 8 | 9 | from poetry_publish.publish import poetry_publish 10 | 11 | import airflow_dvc 12 | 13 | 14 | def publish(): 15 | poetry_publish( 16 | package_root=Path(airflow_dvc.__file__).parent.parent, 17 | version=airflow_dvc.__version__, 18 | ) 19 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "airflow_dvc" 3 | version = "1.9.9" 4 | description = "DVC operator for Airflow" 5 | authors = ["Piotr Styczyński "] 6 | readme = "README.md" 7 | include = [ 8 | { path = 'README.md', format = 'wheel' }, 9 | { path = 'README.md', format = 'sdist' }, 10 | { path = 'example/dags/*', format = 'wheel' }, 11 | { path = 'example/dags/*', format = 'sdist' }, 12 | { path = 'example/plugins/*', format = 'wheel' }, 13 | { path = 'example/plugins/*', format = 'sdist' }, 14 | { path = 'pyproject.toml', format = 'wheel' }, 15 | { path = 'pyproject.toml', format = 'sdist' }, 16 | "README.md", 17 | "example", 18 | "example/dags", 19 | "example/dags/dvc_sensor_example.py", 20 | "example/dags/dvc_upload_example.py", 21 | "example/plugins", 22 | "example/plugins/dvc.py", 23 | "pyproject.toml" 24 | ] 25 | 26 | [tool.poetry.scripts] 27 | publish = 'publish:publish' 28 | airflow_dvc = 'airflow_dvc.cli.entrypoint:run_cli' 29 | 30 | 31 | [tool.poetry.dependencies] 32 | python = "^3.8" 33 | SQLAlchemy = "<1.4.0" 34 | GitPython = "^3.1.14" 35 | apache-airflow = ">=2.0.0" 36 | apache-airflow-providers-amazon = "^1.3.0" 37 | typer = "^0.3.2" 38 | semver = "^2.13.0" 39 | semantic-version = "^2.8.5" 40 | toml = "^0.10.2" 41 | dvc-fs = "^0.7.2" 42 | 43 | [tool.poetry.extras] 44 | dvc = ["s3", "dvc>=2.0.18"] 45 | 46 | [tool.poetry.dev-dependencies] 47 | pytest = "^6.2.1" 48 | pytest-xdist = "^2.2.0" 49 | isort = "^5.7.0" 50 | black = "^20.8b1" 51 | pydoc-markdown = "^3.9.0" 52 | s3pypi = "^0.11.0" 53 | flakehell = "0.9.0" 54 | flake8 = "3.8.3" 55 | poetry-publish = "^0.4.1" 56 | bump2version = "^1.0.1" 57 | gitchangelog = "^3.0.4" 58 | 59 | [tool.poetry.plugins] 60 | 61 | [tool.poetry.plugins."airflow.plugins"] 62 | "airflow_dvc_plugin" = "airflow_dvc:DVCPlugin" 63 | 64 | [tool.black] 65 | line-length = 79 66 | include = '\.pyi?$' 67 | exclude = ''' 68 | /( 69 | \.git 70 | | \.hg 71 | | \.mypy_cache 72 | | \.tox 73 | | \.venv 74 | | _build 75 | | buck-out 76 | | build 77 | | dist 78 | )/ 79 | ''' 80 | 81 | [tool.flakehell] 82 | exclude = ["README.rst", "README.md"] 83 | format = "colored" 84 | max_line_length = 120 85 | show_source = true 86 | whitelist = "../../allowlist.txt" 87 | 88 | [tool.flakehell.plugins] 89 | flake8-bandit = ["+*", "-S322"] 90 | flake8-bugbear = ["+*"] 91 | flake8-builtins = ["+*"] 92 | flake8-comprehensions = ["+*"] 93 | flake8-darglint = ["+*"] 94 | flake8-docstrings = ["+*"] 95 | flake8-eradicate = ["+*"] 96 | flake8-isort = ["+*"] 97 | flake8-mutable = ["+*"] 98 | flake8-pytest-style = ["+*"] 99 | flake8-spellcheck = ["+*"] 100 | mccabe = ["+*"] 101 
| pep8-naming = ["+*"] 102 | pycodestyle = ["+*", "-E203", "-W503"] 103 | pyflakes = ["+*", "-E203"] 104 | pylint = ["+*", "-E203"] 105 | 106 | [build-system] 107 | requires = ["poetry-core>=1.1.2a3"] 108 | build-backend = "poetry.core.masonry.api" 109 | -------------------------------------------------------------------------------- /run_airflow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export AIRFLOW_HOME=$(pwd)/airflow 4 | export AIRFLOW_CONFIG=$AIRFLOW_HOME/airflow.cfg 5 | mkdir -p $AIRFLOW_HOME > /dev/null 2> /dev/null 6 | 7 | poetry run airflow db init 8 | 9 | poetry run airflow users create \ 10 | --username admin \ 11 | --firstname Peter \ 12 | --lastname Parker \ 13 | --role Admin \ 14 | --email spiderman@superhero.org 15 | 16 | poetry run airflow webserver --port 8080 & 17 | poetry run airflow scheduler & -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [options.entry_points] 2 | console_scripts = 3 | airflow_dvc = airflow_dvc.cli.entrypoint:run_cli 4 | 5 | [options] 6 | zip_safe = False 7 | include_package_data = True 8 | 9 | [options.package_data] 10 | * = 11 | example/dags/*.py 12 | example/plugins/*.py 13 | example/* 14 | example 15 | pyproject.toml 16 | -------------------------------------------------------------------------------- /static/cg_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joednkn1/airflow-dvc/8147496723e4490a286407a210bf9c8150468ab4/static/cg_logo.png -------------------------------------------------------------------------------- /static/screen1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joednkn1/airflow-dvc/8147496723e4490a286407a210bf9c8150468ab4/static/screen1.png -------------------------------------------------------------------------------- /static/screen2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joednkn1/airflow-dvc/8147496723e4490a286407a210bf9c8150468ab4/static/screen2.png -------------------------------------------------------------------------------- /static/screen3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joednkn1/airflow-dvc/8147496723e4490a286407a210bf9c8150468ab4/static/screen3.png -------------------------------------------------------------------------------- /tests/test_dag_loading.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import importlib.util 3 | import os 4 | 5 | import pytest 6 | 7 | from airflow_dvc.tests.helpers import fake_env 8 | 9 | fake_env() 10 | 11 | 12 | @pytest.mark.parametrize("dag_path", glob.glob("example/dags/*.py")) 13 | def test_dag_loading(dag_path: str): 14 | file_name = os.path.basename(dag_path).split(".")[0] 15 | spec = importlib.util.spec_from_file_location(file_name, dag_path) 16 | dag_module = importlib.util.module_from_spec(spec) 17 | spec.loader.exec_module(dag_module) 18 | assert hasattr(dag_module, "dag") 19 | 20 | dag = dag_module.dag 21 | assert dag.dag_id == file_name 22 | --------------------------------------------------------------------------------
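To tie the pieces together, here is a hypothetical local smoke test (not part of the repository) that drives `DVCUpdateOperator` through the `execute_test_task` helper from `airflow_dvc/test_utils.py`. The `REPO` value is a placeholder you must point at a real DVC-enabled Git repository, and running it will actually push a commit there.

```python
# Hypothetical smoke test: uploads one string to DVC using the helpers above.
import os
from datetime import datetime

from airflow_dvc import DVCStringUpload, DVCUpdateOperator
from airflow_dvc.test_utils import execute_test_task

os.environ.setdefault("REPO", "https://github.com/<owner>/<repo>.git")  # placeholder URL


def make_operator(**kwargs):
    # execute_test_task passes dag= and task_id= through these kwargs.
    return DVCUpdateOperator(
        dvc_repo=os.environ["REPO"],
        files=[
            DVCStringUpload(
                "data/smoke_test.txt",
                f"Smoke test written at {datetime.now()}",
            ),
        ],
        **kwargs,
    )


if __name__ == "__main__":
    execute_test_task(make_operator)
```

The same pattern works for `DVCDownloadOperator`; only the `files=` entries change, mirroring the example DAGs shipped under `example/dags/`.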