├── .gitignore
├── Dockerfile
├── README.md
├── airflow-data
│   ├── logs
│   │   └── .keep
│   └── plugins
│       └── .keep
├── config
│   ├── logging_conf.py
│   └── logging_handlers.py
├── constraints.txt
├── dags
│   ├── dag_with_two_tasks.py
│   ├── example_api_dag.py
│   ├── exchange_rates
│   │   ├── __init__.py
│   │   ├── hook.py
│   │   ├── operator.py
│   │   └── sql
│   │       ├── create_table.sql
│   │       └── insert_rate.sql
│   ├── exchange_rates_dynamic
│   │   ├── __init__.py
│   │   ├── hook.py
│   │   ├── operator.py
│   │   └── sql
│   │       ├── create_table.sql
│   │       └── insert_rate.sql
│   ├── first_dag.py
│   ├── first_dag_execution_date.py
│   ├── nyc_taxi
│   │   ├── __init__.py
│   │   └── functions.py
│   ├── nyc_taxi_2021_sensor.py
│   ├── nyc_taxi_pre_2
│   │   └── __init__.py
│   ├── sla_example_dag.py
│   └── taskflow_dag_with_two_operators.py
├── docker-compose.yml
├── fabfile.py
├── requirements-ci.txt
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
*.py[cod]
.idea
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM apache/airflow:2.8.1-python3.11
COPY requirements.txt /tmp
RUN pip install --no-cache-dir -r /tmp/requirements.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Apache Airflow 2.0: a hands-on course

Examples from the course on [Apache Airflow 2.0](https://startdatajourney.com/ru/course/apache-airflow-2):
--------------------------------------------------------------------------------
/airflow-data/logs/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adilkhash/apache-airflow-course-materials/a18b720b9e45425f669b4c703b8d1ee6af992536/airflow-data/logs/.keep
--------------------------------------------------------------------------------
/airflow-data/plugins/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adilkhash/apache-airflow-course-materials/a18b720b9e45425f669b4c703b8d1ee6af992536/airflow-data/plugins/.keep
--------------------------------------------------------------------------------
/config/logging_conf.py:
--------------------------------------------------------------------------------
from copy import deepcopy

from airflow.config_templates.airflow_local_settings import DEFAULT_LOGGING_CONFIG

LOGGING_CONFIG = deepcopy(DEFAULT_LOGGING_CONFIG)
LOGGING_CONFIG['handlers'].update(
    {
        'telegram_handler': {
            'class': 'logging_handlers.TelegramBotHandler',
            'chat_id': '',
            'token': '',
            'level': 'ERROR',
        }
    }
)
LOGGING_CONFIG['loggers']['airflow.task']['handlers'].append('telegram_handler')
--------------------------------------------------------------------------------
/config/logging_handlers.py: -------------------------------------------------------------------------------- 1 | from logging import Handler, LogRecord 2 | 3 | import telebot 4 | 5 | 6 | class TelegramBotHandler(Handler): 7 | def __init__(self, token: str, chat_id: str): 8 | super().__init__() 9 | self.token = token 10 | self.chat_id = chat_id 11 | self.context = None 12 | 13 | def emit(self, record: LogRecord): 14 | bot = telebot.TeleBot(self.token) 15 | bot.send_message( 16 | self.chat_id, 17 | f'DAG: {self.context.dag_id}\n' 18 | f'Execution 
date: {self.context.execution_date}\n' 19 | f'Task: {self.context.task_id}\n' 20 | f'Error: {self.format(record)}' 21 | ) 22 | 23 | def set_context(self, context): 24 | self.context = context 25 | -------------------------------------------------------------------------------- /constraints.txt: -------------------------------------------------------------------------------- 1 | Authlib==1.3.0 2 | Babel==2.14.0 3 | ConfigUpdater==3.2 4 | Deprecated==1.2.14 5 | Flask-AppBuilder==4.3.10 6 | Flask-Babel==2.0.0 7 | Flask-Bcrypt==1.0.1 8 | Flask-Caching==2.1.0 9 | Flask-JWT-Extended==4.6.0 10 | Flask-Limiter==3.5.0 11 | Flask-Login==0.6.3 12 | Flask-SQLAlchemy==2.5.1 13 | Flask-Session==0.5.0 14 | Flask-WTF==1.2.1 15 | Flask==2.2.5 16 | GitPython==3.1.41 17 | JPype1==1.5.0 18 | JayDeBeApi==1.2.3 19 | Jinja2==3.1.3 20 | Js2Py==0.74 21 | Mako==1.3.0 22 | Markdown==3.5.2 23 | MarkupSafe==2.1.3 24 | PyGithub==2.1.1 25 | PyHive==0.7.0 26 | PyJWT==2.8.0 27 | PyNaCl==1.5.0 28 | PyYAML==6.0.1 29 | Pygments==2.17.2 30 | SQLAlchemy-JSONField==1.0.2 31 | SQLAlchemy-Utils==0.41.1 32 | SQLAlchemy==1.4.51 33 | SecretStorage==3.3.3 34 | Sphinx==5.3.0 35 | WTForms==3.1.2 36 | Werkzeug==2.2.3 37 | adal==1.2.7 38 | adlfs==2023.12.0 39 | aiobotocore==2.9.0 40 | aiofiles==23.2.1 41 | aiohttp==3.9.1 42 | aioitertools==0.11.0 43 | aioresponses==0.7.6 44 | aiosignal==1.3.1 45 | alabaster==0.7.16 46 | alembic==1.13.1 47 | alibabacloud-adb20211201==1.2.6 48 | alibabacloud-tea==0.3.5 49 | alibabacloud_credentials==0.3.2 50 | alibabacloud_endpoint_util==0.0.3 51 | alibabacloud_gateway_spi==0.0.1 52 | alibabacloud_openapi_util==0.2.2 53 | alibabacloud_tea_openapi==0.3.8 54 | alibabacloud_tea_util==0.3.11 55 | alibabacloud_tea_xml==0.0.2 56 | aliyun-python-sdk-core==2.14.0 57 | aliyun-python-sdk-kms==2.16.2 58 | amqp==5.2.0 59 | analytics-python==1.2.9 60 | annotated-types==0.6.0 61 | anyascii==0.3.2 62 | anyio==4.2.0 63 | apache-airflow-providers-airbyte==3.5.1 64 | apache-airflow-providers-alibaba==2.7.1 65 | apache-airflow-providers-amazon==8.16.0 66 | apache-airflow-providers-apache-beam==5.5.0 67 | apache-airflow-providers-apache-cassandra==3.4.1 68 | apache-airflow-providers-apache-drill==2.6.0 69 | apache-airflow-providers-apache-druid==3.7.0 70 | apache-airflow-providers-apache-flink==1.3.0 71 | apache-airflow-providers-apache-hdfs==4.3.2 72 | apache-airflow-providers-apache-hive==6.4.1 73 | apache-airflow-providers-apache-impala==1.3.0 74 | apache-airflow-providers-apache-kafka==1.3.1 75 | apache-airflow-providers-apache-kylin==3.5.0 76 | apache-airflow-providers-apache-livy==3.7.1 77 | apache-airflow-providers-apache-pig==4.3.0 78 | apache-airflow-providers-apache-pinot==4.3.0 79 | apache-airflow-providers-apache-spark==4.7.0 80 | apache-airflow-providers-apprise==1.2.1 81 | apache-airflow-providers-arangodb==2.4.1 82 | apache-airflow-providers-asana==2.4.1 83 | apache-airflow-providers-atlassian-jira==2.5.0 84 | apache-airflow-providers-celery==3.5.1 85 | apache-airflow-providers-cloudant==3.4.1 86 | apache-airflow-providers-cncf-kubernetes==7.13.0 87 | apache-airflow-providers-cohere==1.1.1 88 | apache-airflow-providers-common-io==1.2.0 89 | apache-airflow-providers-common-sql==1.10.0 90 | apache-airflow-providers-databricks==6.0.0 91 | apache-airflow-providers-datadog==3.5.1 92 | apache-airflow-providers-dbt-cloud==3.5.1 93 | apache-airflow-providers-dingding==3.4.0 94 | apache-airflow-providers-discord==3.5.0 95 | apache-airflow-providers-docker==3.9.1 96 | apache-airflow-providers-elasticsearch==5.3.1 97 | 
apache-airflow-providers-exasol==4.4.1 98 | apache-airflow-providers-facebook==3.4.0 99 | apache-airflow-providers-ftp==3.7.0 100 | apache-airflow-providers-github==2.5.1 101 | apache-airflow-providers-google==10.13.1 102 | apache-airflow-providers-grpc==3.4.1 103 | apache-airflow-providers-hashicorp==3.6.1 104 | apache-airflow-providers-http==4.8.0 105 | apache-airflow-providers-imap==3.5.0 106 | apache-airflow-providers-influxdb==2.4.0 107 | apache-airflow-providers-jdbc==4.2.1 108 | apache-airflow-providers-jenkins==3.5.1 109 | apache-airflow-providers-microsoft-azure==8.5.1 110 | apache-airflow-providers-microsoft-mssql==3.6.0 111 | apache-airflow-providers-microsoft-psrp==2.5.0 112 | apache-airflow-providers-microsoft-winrm==3.4.0 113 | apache-airflow-providers-mongo==3.5.0 114 | apache-airflow-providers-mysql==5.5.1 115 | apache-airflow-providers-neo4j==3.5.0 116 | apache-airflow-providers-odbc==4.4.0 117 | apache-airflow-providers-openai==1.1.0 118 | apache-airflow-providers-openfaas==3.4.0 119 | apache-airflow-providers-openlineage==1.4.0 120 | apache-airflow-providers-opensearch==1.1.1 121 | apache-airflow-providers-opsgenie==5.5.0 122 | apache-airflow-providers-oracle==3.9.1 123 | apache-airflow-providers-pagerduty==3.6.0 124 | apache-airflow-providers-papermill==3.6.0 125 | apache-airflow-providers-pgvector==1.1.0 126 | apache-airflow-providers-pinecone==1.1.1 127 | apache-airflow-providers-postgres==5.10.0 128 | apache-airflow-providers-presto==5.4.0 129 | apache-airflow-providers-redis==3.6.0 130 | apache-airflow-providers-salesforce==5.6.1 131 | apache-airflow-providers-samba==4.5.0 132 | apache-airflow-providers-segment==3.4.0 133 | apache-airflow-providers-sendgrid==3.4.0 134 | apache-airflow-providers-sftp==4.8.1 135 | apache-airflow-providers-singularity==3.4.0 136 | apache-airflow-providers-slack==8.5.1 137 | apache-airflow-providers-smtp==1.5.0 138 | apache-airflow-providers-snowflake==5.2.1 139 | apache-airflow-providers-sqlite==3.7.0 140 | apache-airflow-providers-ssh==3.10.0 141 | apache-airflow-providers-tableau==4.4.0 142 | apache-airflow-providers-tabular==1.4.1 143 | apache-airflow-providers-telegram==4.3.0 144 | apache-airflow-providers-trino==5.6.0 145 | apache-airflow-providers-vertica==3.7.0 146 | apache-airflow-providers-weaviate==1.3.0 147 | apache-airflow-providers-yandex==3.7.1 148 | apache-airflow-providers-zendesk==4.6.0 149 | apache-beam==2.53.0 150 | apispec==6.4.0 151 | apprise==1.7.1 152 | argcomplete==3.2.1 153 | asana==3.2.2 154 | asgiref==3.7.2 155 | asn1crypto==1.5.1 156 | astroid==2.15.8 157 | asttokens==2.4.1 158 | atlasclient==1.0.0 159 | atlassian-python-api==3.41.5 160 | attrs==23.2.0 161 | aws-sam-translator==1.83.0 162 | aws-xray-sdk==2.12.1 163 | azure-batch==14.1.0 164 | azure-common==1.1.28 165 | azure-core==1.29.6 166 | azure-cosmos==4.5.1 167 | azure-datalake-store==0.0.53 168 | azure-identity==1.15.0 169 | azure-keyvault-secrets==4.7.0 170 | azure-kusto-data==4.3.1 171 | azure-mgmt-containerinstance==10.1.0 172 | azure-mgmt-containerregistry==10.3.0 173 | azure-mgmt-core==1.4.0 174 | azure-mgmt-cosmosdb==9.4.0 175 | azure-mgmt-datafactory==4.0.0 176 | azure-mgmt-datalake-nspkg==3.0.1 177 | azure-mgmt-datalake-store==0.5.0 178 | azure-mgmt-nspkg==3.0.2 179 | azure-mgmt-resource==23.0.1 180 | azure-mgmt-storage==21.1.0 181 | azure-nspkg==3.0.2 182 | azure-servicebus==7.11.4 183 | azure-storage-blob==12.19.0 184 | azure-storage-file-datalake==12.14.0 185 | azure-storage-file-share==12.15.0 186 | azure-synapse-artifacts==0.18.0 187 | 
azure-synapse-spark==0.7.0 188 | backoff==2.2.1 189 | bcrypt==4.1.2 190 | beautifulsoup4==4.12.2 191 | billiard==4.2.0 192 | bitarray==2.9.2 193 | black==24.1a1 194 | blinker==1.7.0 195 | boto3==1.33.13 196 | botocore==1.33.13 197 | cachelib==0.9.0 198 | cachetools==5.3.2 199 | cassandra-driver==3.29.0 200 | cattrs==23.2.3 201 | celery==5.3.6 202 | certifi==2023.11.17 203 | cffi==1.16.0 204 | cfgv==3.4.0 205 | cfn-lint==0.83.8 206 | cgroupspy==0.2.2 207 | chardet==5.2.0 208 | charset-normalizer==3.3.2 209 | checksumdir==1.2.0 210 | ciso8601==2.3.1 211 | click-didyoumean==0.3.0 212 | click-plugins==1.1.1 213 | click-repl==0.3.0 214 | click==8.1.7 215 | clickclick==20.10.2 216 | cloudant==2.15.0 217 | cloudpickle==2.2.1 218 | cohere==4.44 219 | colorama==0.4.6 220 | colorlog==4.8.0 221 | comm==0.2.1 222 | confluent-kafka==2.3.0 223 | connexion==2.14.2 224 | coverage==7.4.0 225 | crcmod==1.7 226 | cron-descriptor==1.4.0 227 | croniter==2.0.1 228 | cryptography==41.0.7 229 | curlify==2.2.1 230 | databricks-sql-connector==2.9.3 231 | datadog==0.47.0 232 | db-dtypes==1.2.0 233 | debugpy==1.8.0 234 | decorator==5.1.1 235 | defusedxml==0.7.1 236 | deltalake==0.15.1 237 | dill==0.3.1.1 238 | distlib==0.3.8 239 | distro==1.9.0 240 | dnspython==2.4.2 241 | docker==7.0.0 242 | docopt==0.6.2 243 | docutils==0.20.1 244 | duckdb==0.9.2 245 | ecdsa==0.18.0 246 | editables==0.5 247 | elastic-transport==8.11.0 248 | elasticsearch==8.11.1 249 | email-validator==1.3.1 250 | entrypoints==0.4 251 | eralchemy2==1.3.8 252 | et-xmlfile==1.1.0 253 | eventlet==0.34.3 254 | execnet==2.0.2 255 | executing==2.0.1 256 | facebook-business==19.0.2 257 | fastavro==1.9.3 258 | fasteners==0.19 259 | fastjsonschema==2.19.1 260 | filelock==3.13.1 261 | flower==2.0.1 262 | frozenlist==1.4.1 263 | fsspec==2023.12.2 264 | future==0.18.3 265 | gcloud-aio-auth==4.2.3 266 | gcloud-aio-bigquery==7.0.0 267 | gcloud-aio-storage==9.0.0 268 | gcsfs==2023.12.2.post1 269 | geomet==0.2.1.post1 270 | gevent==23.9.1 271 | gitdb==4.0.11 272 | google-ads==23.1.0 273 | google-analytics-admin==0.22.2 274 | google-api-core==2.15.0 275 | google-api-python-client==2.113.0 276 | google-auth-httplib2==0.2.0 277 | google-auth-oauthlib==1.2.0 278 | google-auth==2.26.2 279 | google-cloud-aiplatform==1.39.0 280 | google-cloud-appengine-logging==1.4.0 281 | google-cloud-audit-log==0.2.5 282 | google-cloud-automl==2.12.0 283 | google-cloud-batch==0.17.7 284 | google-cloud-bigquery-datatransfer==3.13.0 285 | google-cloud-bigquery-storage==2.24.0 286 | google-cloud-bigquery==3.16.0 287 | google-cloud-bigtable==2.22.0 288 | google-cloud-build==3.22.0 289 | google-cloud-compute==1.15.0 290 | google-cloud-container==2.37.0 291 | google-cloud-core==2.4.1 292 | google-cloud-datacatalog==3.17.2 293 | google-cloud-dataflow-client==0.8.6 294 | google-cloud-dataform==0.5.5 295 | google-cloud-dataplex==1.11.0 296 | google-cloud-dataproc-metastore==1.14.0 297 | google-cloud-dataproc==5.8.0 298 | google-cloud-dlp==3.14.0 299 | google-cloud-kms==2.20.0 300 | google-cloud-language==2.12.0 301 | google-cloud-logging==3.9.0 302 | google-cloud-memcache==1.8.0 303 | google-cloud-monitoring==2.18.0 304 | google-cloud-orchestration-airflow==1.10.0 305 | google-cloud-os-login==2.13.0 306 | google-cloud-pubsub==2.19.0 307 | google-cloud-redis==2.14.0 308 | google-cloud-resource-manager==1.11.0 309 | google-cloud-run==0.10.1 310 | google-cloud-secret-manager==2.17.0 311 | google-cloud-spanner==3.41.0 312 | google-cloud-speech==2.23.0 313 | google-cloud-storage-transfer==1.10.0 314 
| google-cloud-storage==2.14.0 315 | google-cloud-tasks==2.15.0 316 | google-cloud-texttospeech==2.15.1 317 | google-cloud-translate==3.14.0 318 | google-cloud-videointelligence==2.12.0 319 | google-cloud-vision==3.5.0 320 | google-cloud-workflows==1.13.0 321 | google-crc32c==1.5.0 322 | google-re2==1.1 323 | google-resumable-media==2.7.0 324 | googleapis-common-protos==1.62.0 325 | graphql-core==3.2.3 326 | graphviz==0.20.1 327 | greenlet==3.0.3 328 | grpc-google-iam-v1==0.13.0 329 | grpcio-gcp==0.2.2 330 | grpcio-status==1.60.0 331 | grpcio==1.60.0 332 | gssapi==1.8.3 333 | gunicorn==21.2.0 334 | h11==0.14.0 335 | hatch==1.9.1 336 | hatchling==1.21.0 337 | hdfs==2.7.3 338 | hmsclient==0.1.1 339 | httpcore==0.16.3 340 | httplib2==0.22.0 341 | httpx==0.23.3 342 | humanize==4.9.0 343 | hvac==2.1.0 344 | hyperlink==21.0.0 345 | icdiff==2.0.7 346 | identify==2.5.33 347 | idna==3.6 348 | ijson==3.2.3 349 | imagesize==1.4.1 350 | importlib-metadata==6.11.0 351 | importlib-resources==6.1.1 352 | impyla==0.19.0 353 | incremental==22.10.0 354 | inflection==0.5.1 355 | influxdb-client==1.39.0 356 | iniconfig==2.0.0 357 | ipdb==0.13.13 358 | ipykernel==6.28.0 359 | ipython==8.20.0 360 | isodate==0.6.1 361 | itsdangerous==2.1.2 362 | jaraco.classes==3.3.0 363 | jedi==0.19.1 364 | jeepney==0.8.0 365 | jmespath==0.10.0 366 | jschema-to-python==1.2.3 367 | json-merge-patch==0.2 368 | jsondiff==2.0.0 369 | jsonpatch==1.33 370 | jsonpath-ng==1.6.1 371 | jsonpickle==3.0.2 372 | jsonpointer==2.4 373 | jsonschema-path==0.3.2 374 | jsonschema-specifications==2023.12.1 375 | jsonschema==4.20.0 376 | junit-xml==1.9 377 | jupyter_client==8.6.0 378 | jupyter_core==5.7.1 379 | keyring==24.3.0 380 | kombu==5.3.5 381 | krb5==0.5.1 382 | kubernetes-asyncio==24.2.3 383 | kubernetes==23.6.0 384 | kylinpy==2.8.4 385 | lazy-object-proxy==1.10.0 386 | ldap3==2.9.1 387 | limits==3.7.0 388 | linkify-it-py==2.0.2 389 | lockfile==0.12.2 390 | loguru==0.7.2 391 | looker-sdk==23.20.1 392 | lxml==5.1.0 393 | lz4==4.3.3 394 | markdown-it-py==3.0.0 395 | marshmallow-oneofschema==3.0.1 396 | marshmallow-sqlalchemy==0.26.1 397 | marshmallow==3.20.2 398 | matplotlib-inline==0.1.6 399 | mdit-py-plugins==0.4.0 400 | mdurl==0.1.2 401 | mmhash3==3.0.1 402 | mongomock==4.1.2 403 | more-itertools==10.2.0 404 | moto==4.2.13 405 | mpmath==1.3.0 406 | msal-extensions==1.1.0 407 | msal==1.26.0 408 | msrest==0.7.1 409 | msrestazure==0.6.4 410 | multi_key_dict==2.0.3 411 | multidict==6.0.4 412 | mypy-boto3-appflow==1.34.0 413 | mypy-boto3-rds==1.34.6 414 | mypy-boto3-redshift-data==1.34.0 415 | mypy-boto3-s3==1.34.14 416 | mypy-extensions==1.0.0 417 | mypy==1.2.0 418 | mysql-connector-python==8.0.29 419 | mysqlclient==2.2.1 420 | nbclient==0.9.0 421 | nbformat==5.9.2 422 | neo4j==5.16.0 423 | nest-asyncio==1.5.9 424 | networkx==3.2.1 425 | nh3==0.2.15 426 | nodeenv==1.8.0 427 | numpy==1.24.4 428 | oauthlib==3.2.2 429 | objsize==0.6.1 430 | openai==1.7.2 431 | openapi-schema-validator==0.6.2 432 | openapi-spec-validator==0.7.1 433 | openlineage-integration-common==1.7.0 434 | openlineage-python==1.7.0 435 | openlineage_sql==1.7.0 436 | openpyxl==3.1.2 437 | opensearch-py==2.4.2 438 | opentelemetry-api==1.22.0 439 | opentelemetry-exporter-otlp-proto-common==1.22.0 440 | opentelemetry-exporter-otlp-proto-grpc==1.22.0 441 | opentelemetry-exporter-otlp-proto-http==1.22.0 442 | opentelemetry-exporter-otlp==1.22.0 443 | opentelemetry-exporter-prometheus==0.43b0 444 | opentelemetry-proto==1.22.0 445 | opentelemetry-sdk==1.22.0 446 | 
opentelemetry-semantic-conventions==0.43b0 447 | opsgenie-sdk==2.1.5 448 | oracledb==2.0.1 449 | ordered-set==4.1.0 450 | orjson==3.9.10 451 | oss2==2.18.4 452 | packaging==23.2 453 | pandas-gbq==0.20.0 454 | pandas-stubs==2.0.2.230605 455 | pandas==2.1.4 456 | papermill==2.5.0 457 | paramiko==3.4.0 458 | parso==0.8.3 459 | pathable==0.4.3 460 | pathspec==0.12.1 461 | pbr==6.0.0 462 | pdpyras==5.2.0 463 | pendulum==3.0.0 464 | pexpect==4.9.0 465 | pgvector==0.2.4 466 | pinecone-client==2.2.4 467 | pinotdb==5.1.2 468 | pipdeptree==2.13.2 469 | pipx==1.4.2 470 | pkginfo==1.9.6 471 | platformdirs==3.11.0 472 | pluggy==1.3.0 473 | ply==3.11 474 | plyvel==1.5.1 475 | portalocker==2.8.2 476 | pprintpp==0.4.0 477 | pre-commit==3.6.0 478 | presto-python-client==0.8.4 479 | prison==0.2.1 480 | prometheus-client==0.19.0 481 | prompt-toolkit==3.0.43 482 | proto-plus==1.23.0 483 | protobuf==4.25.2 484 | psutil==5.9.7 485 | psycopg2-binary==2.9.9 486 | ptyprocess==0.7.0 487 | pure-eval==0.2.2 488 | pure-sasl==0.6.2 489 | py-partiql-parser==0.5.0 490 | py4j==0.10.9.7 491 | pyOpenSSL==23.3.0 492 | pyarrow-hotfix==0.6 493 | pyarrow==14.0.2 494 | pyasn1-modules==0.3.0 495 | pyasn1==0.5.1 496 | pycountry==23.12.11 497 | pycparser==2.21 498 | pycryptodome==3.20.0 499 | pydantic==2.5.3 500 | pydantic_core==2.14.6 501 | pydata-google-auth==1.8.2 502 | pydot==1.4.2 503 | pydruid==0.6.6 504 | pyenchant==3.2.2 505 | pyexasol==0.25.2 506 | pygraphviz==1.12 507 | pyiceberg==0.5.1 508 | pyjsparser==2.7.1 509 | pykerberos==1.2.4 510 | pymongo==4.6.1 511 | pymssql==2.2.11 512 | pyodbc==5.0.1 513 | pyparsing==3.1.1 514 | pypsrp==0.8.1 515 | pyspark==3.5.0 516 | pyspnego==0.10.2 517 | pytest-asyncio==0.23.3 518 | pytest-cov==4.1.0 519 | pytest-httpx==0.21.3 520 | pytest-icdiff==0.9 521 | pytest-instafail==0.5.0 522 | pytest-mock==3.12.0 523 | pytest-rerunfailures==13.0 524 | pytest-timeouts==1.2.1 525 | pytest-xdist==3.5.0 526 | pytest==7.4.4 527 | python-arango==7.9.0 528 | python-daemon==3.0.1 529 | python-dateutil==2.8.2 530 | python-dotenv==1.0.0 531 | python-http-client==3.3.7 532 | python-jenkins==1.8.2 533 | python-jose==3.3.0 534 | python-ldap==3.4.4 535 | python-nvd3==0.15.0 536 | python-slugify==8.0.1 537 | python-telegram-bot==20.2 538 | python3-saml==1.16.0 539 | pytz==2023.3.post1 540 | pywinrm==0.4.3 541 | pyzmq==25.1.2 542 | reactivex==4.0.4 543 | readme-renderer==42.0 544 | redis==4.6.0 545 | redshift-connector==2.0.918 546 | referencing==0.32.1 547 | regex==2023.12.25 548 | requests-file==1.5.1 549 | requests-kerberos==0.14.0 550 | requests-mock==1.11.0 551 | requests-ntlm==1.2.0 552 | requests-oauthlib==1.3.1 553 | requests-toolbelt==1.0.0 554 | requests==2.31.0 555 | responses==0.24.1 556 | restructuredtext_lint==1.4.0 557 | rfc3339-validator==0.1.4 558 | rfc3986==1.5.0 559 | rich-argparse==1.4.0 560 | rich-click==1.7.3 561 | rich==13.7.0 562 | rpds-py==0.17.1 563 | rsa==4.9 564 | ruff==0.1.11 565 | s3fs==2023.12.2 566 | s3transfer==0.8.2 567 | sarif-om==1.0.4 568 | scramp==1.4.4 569 | scrapbook==0.5.0 570 | semver==3.0.2 571 | sendgrid==6.11.0 572 | sentinels==1.0.0 573 | sentry-sdk==1.39.2 574 | setproctitle==1.3.3 575 | shapely==2.0.2 576 | shellingham==1.5.4 577 | simple-salesforce==1.12.5 578 | six==1.16.0 579 | slack_sdk==3.26.2 580 | smbprotocol==1.12.0 581 | smmap==5.0.1 582 | sniffio==1.3.0 583 | snowballstemmer==2.2.0 584 | snowflake-connector-python==3.6.0 585 | snowflake-sqlalchemy==1.5.1 586 | sortedcontainers==2.4.0 587 | soupsieve==2.5 588 | sphinx-airflow-theme==0.0.12 589 | 
sphinx-argparse==0.4.0 590 | sphinx-autoapi==2.1.1 591 | sphinx-copybutton==0.5.2 592 | sphinx-jinja==2.0.2 593 | sphinx-rtd-theme==2.0.0 594 | sphinx_design==0.5.0 595 | sphinxcontrib-applehelp==1.0.8 596 | sphinxcontrib-devhelp==1.0.6 597 | sphinxcontrib-htmlhelp==2.0.5 598 | sphinxcontrib-httpdomain==1.8.1 599 | sphinxcontrib-jquery==4.1 600 | sphinxcontrib-jsmath==1.0.1 601 | sphinxcontrib-qthelp==1.0.7 602 | sphinxcontrib-redoc==1.6.0 603 | sphinxcontrib-serializinghtml==1.1.5 604 | sphinxcontrib-spelling==8.0.0 605 | spython==0.3.13 606 | sqlalchemy-bigquery==1.9.0 607 | sqlalchemy-redshift==0.8.14 608 | sqlalchemy-spanner==1.6.2 609 | sqlalchemy_drill==1.1.4 610 | sqlparse==0.4.4 611 | sshpubkeys==3.3.1 612 | sshtunnel==0.4.0 613 | stack-data==0.6.3 614 | starkbank-ecdsa==2.2.0 615 | statsd==4.0.1 616 | strictyaml==1.7.3 617 | sympy==1.12 618 | tableauserverclient==0.29 619 | tabulate==0.9.0 620 | tenacity==8.2.3 621 | termcolor==2.4.0 622 | text-unidecode==1.3 623 | thrift-sasl==0.4.3 624 | thrift==0.16.0 625 | time-machine==2.13.0 626 | tomli_w==1.0.0 627 | tomlkit==0.12.3 628 | tornado==6.4 629 | towncrier==23.11.0 630 | tqdm==4.66.1 631 | traitlets==5.14.1 632 | trino==0.327.0 633 | trove-classifiers==2024.1.8 634 | twine==4.0.2 635 | types-Deprecated==1.2.9.20240106 636 | types-Markdown==3.5.0.20240106 637 | types-PyMySQL==1.1.0.1 638 | types-PyYAML==6.0.12.12 639 | types-aiofiles==23.2.0.20240106 640 | types-certifi==2021.10.8.3 641 | types-croniter==2.0.0.20240106 642 | types-docutils==0.20.0.20240106 643 | types-paramiko==3.4.0.20240106 644 | types-protobuf==4.24.0.20240106 645 | types-pyOpenSSL==23.3.0.20240106 646 | types-python-dateutil==2.8.19.20240106 647 | types-python-slugify==8.0.0.3 648 | types-pytz==2023.3.1.1 649 | types-redis==4.6.0.20240106 650 | types-requests==2.31.0.20240106 651 | types-setuptools==69.0.0.20240115 652 | types-tabulate==0.9.0.20240106 653 | types-termcolor==1.1.6.2 654 | types-toml==0.10.8.7 655 | typing_extensions==4.9.0 656 | tzdata==2023.4 657 | tzlocal==5.2 658 | uc-micro-py==1.0.2 659 | unicodecsv==0.14.1 660 | universal-pathlib==0.1.4 661 | uritemplate==4.1.1 662 | urllib3==2.0.7 663 | userpath==1.9.1 664 | validators==0.22.0 665 | vertica-python==1.3.8 666 | vine==5.1.0 667 | virtualenv==20.25.0 668 | watchtower==3.0.1 669 | wcwidth==0.2.13 670 | weaviate-client==3.26.1 671 | websocket-client==1.7.0 672 | wrapt==1.16.0 673 | xmlsec==1.3.13 674 | xmltodict==0.13.0 675 | yamllint==1.33.0 676 | yandexcloud==0.253.0 677 | yarl==1.9.4 678 | zeep==4.2.1 679 | zenpy==2.0.45 680 | zipp==3.17.0 681 | zope.event==5.0 682 | zope.interface==6.1 683 | zstandard==0.22.0 684 | -------------------------------------------------------------------------------- /dags/dag_with_two_tasks.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | 3 | from airflow.models import DAG 4 | from airflow.operators.python import PythonOperator, get_current_context 5 | from airflow.operators.dummy import DummyOperator 6 | 7 | default_args = { 8 | 'owner': 'airflow', 9 | 'start_date': dt.datetime(2021, 1, 20), 10 | } 11 | 12 | 13 | def even_only(): 14 | context = get_current_context() 15 | execution_date = context['execution_date'] 16 | 17 | if execution_date.day % 2 != 0: 18 | raise ValueError(f'Odd day: {execution_date}') 19 | 20 | 21 | with DAG(dag_id='dag_with_two_tasks', 22 | schedule_interval='@daily', 23 | default_args=default_args) as dag: 24 | 25 | even_only = PythonOperator( 26 | task_id='even_only', 27 
| python_callable=even_only, 28 | dag=dag, 29 | ) 30 | 31 | dummy = DummyOperator( 32 | task_id='dummy_task', 33 | dag=dag 34 | ) 35 | 36 | even_only >> dummy 37 | 38 | # другие варианты написания кода 39 | 40 | # dummy.set_upstream(even_only) 41 | # even_only.set_downstream(dummy) 42 | # dummy << even_only 43 | -------------------------------------------------------------------------------- /dags/example_api_dag.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | 3 | from airflow.models import DAG 4 | from airflow.operators.dummy import DummyOperator 5 | 6 | default_args = { 7 | 'owner': 'airflow', 8 | 'start_date': dt.datetime(2021, 1, 20), 9 | } 10 | 11 | with DAG( 12 | dag_id='example_api_dag', 13 | schedule_interval='@daily', 14 | default_args=default_args, 15 | catchup=False, 16 | ) as dag: 17 | dice = DummyOperator( 18 | task_id='dummy_task', 19 | dag=dag, 20 | ) 21 | -------------------------------------------------------------------------------- /dags/exchange_rates/__init__.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow import DAG 4 | from airflow.providers.postgres.operators.postgres import PostgresOperator 5 | 6 | from .operator import CurrencyScoopOperator 7 | 8 | 9 | with DAG( 10 | dag_id='exchange_rate_usd_kzt_dag', 11 | start_date=datetime(2021, 3, 1), 12 | schedule_interval='@daily', 13 | ) as dag: 14 | 15 | create_table = PostgresOperator( 16 | task_id='create_table_task', 17 | sql='sql/create_table.sql', 18 | postgres_conn_id='postgres_default', 19 | ) 20 | 21 | get_rate = CurrencyScoopOperator( 22 | task_id='get_rate', 23 | base_currency='USD', 24 | currency='KZT', 25 | conn_id='cur_scoop_conn_id', 26 | dag=dag, 27 | do_xcom_push=True, 28 | ) 29 | 30 | insert_rate = PostgresOperator( 31 | task_id='insert_rate', 32 | postgres_conn_id='postgres_default', 33 | sql='sql/insert_rate.sql', 34 | params={ 35 | 'base_currency': 'USD', 36 | 'currency': 'KZT', 37 | } 38 | ) 39 | 40 | create_table >> get_rate >> insert_rate 41 | -------------------------------------------------------------------------------- /dags/exchange_rates/hook.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from airflow.exceptions import AirflowException 3 | from airflow.hooks.base import BaseHook 4 | 5 | 6 | class CurrencyScoopHook(BaseHook): 7 | 8 | def __init__(self, currency_conn_id: str): 9 | super().__init__() 10 | self.conn_id = currency_conn_id 11 | 12 | def get_rate(self, date, base_currency: str, currency: str): 13 | url = 'https://api.currencyscoop.com/v1/historical' 14 | params = { 15 | 'base': base_currency.upper(), 16 | 'symbols': currency.upper(), 17 | 'api_key': self._get_api_key(), 18 | 'date': str(date), 19 | } 20 | response = requests.get(url, params=params) 21 | response.raise_for_status() 22 | return response.json()['response']['rates'][currency] 23 | 24 | def _get_api_key(self): 25 | conn = self.get_connection(self.conn_id) 26 | if not conn.password: 27 | raise AirflowException('Missing API key (password) in connection settings') 28 | return conn.password 29 | -------------------------------------------------------------------------------- /dags/exchange_rates/operator.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from airflow.models.baseoperator import BaseOperator 4 | from airflow.utils.decorators 
import apply_defaults 5 | 6 | from .hook import CurrencyScoopHook 7 | 8 | 9 | class CurrencyScoopOperator(BaseOperator): 10 | 11 | @apply_defaults 12 | def __init__( 13 | self, 14 | base_currency: str, 15 | currency: str, 16 | conn_id: str = 'currency_scoop_conn_id', 17 | **kwargs) -> None: 18 | super().__init__(**kwargs) 19 | self.conn_id = conn_id 20 | self.base_currency = base_currency 21 | self.currency = currency 22 | 23 | def execute(self, context: Any): 24 | api = CurrencyScoopHook(self.conn_id) 25 | return api.get_rate(context['execution_date'].date(), self.base_currency, self.currency) 26 | -------------------------------------------------------------------------------- /dags/exchange_rates/sql/create_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS currency_exchange_rates ( 2 | base VARCHAR(3) NOT NULL, 3 | currency VARCHAR(3) NOT NULL, 4 | rate NUMERIC(12, 3) NOT NULL, 5 | date DATE NOT NULL, 6 | UNIQUE (base, currency, date) 7 | ); 8 | -------------------------------------------------------------------------------- /dags/exchange_rates/sql/insert_rate.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO 2 | currency_exchange_rates 3 | VALUES ('{{ params.base_currency }}', '{{ params.currency }}', {{ ti.xcom_pull(task_ids="get_rate") }}, '{{ execution_date.strftime("%Y-%m-%d") }}') 4 | ON CONFLICT (base, currency, date) DO 5 | UPDATE 6 | SET rate = excluded.rate; 7 | -------------------------------------------------------------------------------- /dags/exchange_rates_dynamic/__init__.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow import DAG 4 | from airflow.providers.postgres.operators.postgres import PostgresOperator 5 | 6 | from .operator import CurrencyScoopOperator 7 | 8 | 9 | with DAG( 10 | dag_id='exchange_rate_dynamic', 11 | start_date=datetime(2021, 3, 1), 12 | schedule_interval='@daily', 13 | catchup=False, 14 | ) as dag: 15 | 16 | create_table = PostgresOperator( 17 | task_id='create_table_task', 18 | sql='sql/create_table.sql', 19 | postgres_conn_id='postgres_default', 20 | ) 21 | 22 | tasks = [] 23 | 24 | for base, currency in [ 25 | ('USD', 'KZT'), 26 | ('USD', 'RUB'), 27 | ('USD', 'EUR'), 28 | ('KZT', 'RUB'), 29 | ('RUB', 'KZT'), 30 | ('EUR', 'KZT'), 31 | ('EUR', 'RUB'), 32 | ]: 33 | get_rate_task = CurrencyScoopOperator( 34 | task_id=f'get_rate_{ base }_{ currency }', 35 | base_currency=base, 36 | currency=currency, 37 | conn_id='cur_scoop_conn_id', 38 | dag=dag, 39 | do_xcom_push=True, 40 | ) 41 | 42 | insert_rate = PostgresOperator( 43 | task_id=f'insert_rate_{ base }_{ currency }', 44 | postgres_conn_id='postgres_default', 45 | sql='sql/insert_rate.sql', 46 | params={ 47 | 'base_currency': base, 48 | 'currency': currency, 49 | 'get_rate_task_id': f'get_rate_{ base }_{ currency }' 50 | } 51 | ) 52 | 53 | get_rate_task >> insert_rate 54 | 55 | tasks.append(get_rate_task) 56 | 57 | create_table.set_downstream(tasks) 58 | -------------------------------------------------------------------------------- /dags/exchange_rates_dynamic/hook.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from airflow.exceptions import AirflowException 3 | from airflow.hooks.base import BaseHook 4 | 5 | 6 | class CurrencyScoopHook(BaseHook): 7 | 8 | def __init__(self, currency_conn_id: str): 9 | super().__init__() 10 
| self.conn_id = currency_conn_id 11 | 12 | def get_rate(self, date, base_currency: str, currency: str): 13 | url = 'https://api.currencyscoop.com/v1/historical' 14 | params = { 15 | 'base': base_currency.upper(), 16 | 'symbols': currency.upper(), 17 | 'api_key': self._get_api_key(), 18 | 'date': str(date), 19 | } 20 | response = requests.get(url, params=params) 21 | response.raise_for_status() 22 | return response.json()['response']['rates'][currency] 23 | 24 | def _get_api_key(self): 25 | conn = self.get_connection(self.conn_id) 26 | if not conn.password: 27 | raise AirflowException('Missing API key (password) in connection settings') 28 | return conn.password 29 | -------------------------------------------------------------------------------- /dags/exchange_rates_dynamic/operator.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from airflow.models.baseoperator import BaseOperator 4 | from airflow.utils.decorators import apply_defaults 5 | 6 | from .hook import CurrencyScoopHook 7 | 8 | 9 | class CurrencyScoopOperator(BaseOperator): 10 | 11 | @apply_defaults 12 | def __init__( 13 | self, 14 | base_currency: str, 15 | currency: str, 16 | conn_id: str = 'currency_scoop_conn_id', 17 | **kwargs) -> None: 18 | super().__init__(**kwargs) 19 | self.conn_id = conn_id 20 | self.base_currency = base_currency 21 | self.currency = currency 22 | 23 | def execute(self, context: Any): 24 | api = CurrencyScoopHook(self.conn_id) 25 | return api.get_rate(context['execution_date'].date(), self.base_currency, self.currency) 26 | -------------------------------------------------------------------------------- /dags/exchange_rates_dynamic/sql/create_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS currency_exchange_rates ( 2 | base VARCHAR(3) NOT NULL, 3 | currency VARCHAR(3) NOT NULL, 4 | rate NUMERIC(12, 3) NOT NULL, 5 | date DATE NOT NULL, 6 | UNIQUE (base, currency, date) 7 | ); 8 | -------------------------------------------------------------------------------- /dags/exchange_rates_dynamic/sql/insert_rate.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO 2 | currency_exchange_rates 3 | VALUES ('{{ params.base_currency }}', '{{ params.currency }}', {{ ti.xcom_pull(task_ids=params.get_rate_task_id) }}, '{{ execution_date.strftime("%Y-%m-%d") }}') 4 | ON CONFLICT (base, currency, date) DO 5 | UPDATE 6 | SET rate = excluded.rate; 7 | -------------------------------------------------------------------------------- /dags/first_dag.py: -------------------------------------------------------------------------------- 1 | import random 2 | import datetime as dt 3 | 4 | from airflow.models import DAG 5 | from airflow.operators.python import PythonOperator 6 | 7 | default_args = { 8 | 'owner': 'airflow', 9 | 'start_date': dt.datetime(2021, 1, 20), 10 | 'retries': 2, 11 | 'retry_delay': dt.timedelta(seconds=10), 12 | } 13 | 14 | 15 | def random_dice(): 16 | val = random.randint(1, 6) 17 | if val % 2 != 0: 18 | raise ValueError(f'Odd {val}') 19 | 20 | 21 | with DAG(dag_id='first_dag', 22 | schedule_interval='@daily', 23 | default_args=default_args) as dag: 24 | 25 | dice = PythonOperator( 26 | task_id='random_dice', 27 | python_callable=random_dice, 28 | dag=dag, 29 | ) 30 | -------------------------------------------------------------------------------- /dags/first_dag_execution_date.py: 
-------------------------------------------------------------------------------- 1 | import datetime as dt 2 | 3 | from airflow.models import DAG 4 | from airflow.operators.python import PythonOperator 5 | from airflow.operators.python import get_current_context 6 | 7 | default_args = { 8 | 'owner': 'airflow', 9 | 'start_date': dt.datetime(2021, 1, 20), 10 | } 11 | 12 | 13 | def even_only(): 14 | context = get_current_context() 15 | execution_date = context['execution_date'] 16 | 17 | if execution_date.day % 2 != 0: 18 | raise ValueError(f'Odd day: {execution_date}') 19 | 20 | 21 | with DAG(dag_id='first_dag_execution_date', 22 | schedule_interval='@daily', 23 | default_args=default_args) as dag: 24 | 25 | even_only = PythonOperator( 26 | task_id='even_only', 27 | python_callable=even_only, 28 | dag=dag, 29 | ) 30 | -------------------------------------------------------------------------------- /dags/nyc_taxi/__init__.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow.decorators import dag, task 4 | from airflow.operators.python import get_current_context 5 | from airflow.providers.http.operators.http import SimpleHttpOperator 6 | from .functions import download_dataset, convert_to_parquet 7 | 8 | default_args = { 9 | 'owner': 'airflow', 10 | } 11 | 12 | 13 | @dag(default_args=default_args, 14 | schedule_interval='@monthly', 15 | start_date=datetime(2020, 1, 1), 16 | catchup=False, 17 | ) 18 | def nyc_taxi_dataset_dag(): 19 | 20 | check_file = SimpleHttpOperator( 21 | method='HEAD', 22 | endpoint='yellow_tripdata_{{ execution_date.strftime("%Y-%m") }}.csv', 23 | task_id='check_file', 24 | http_conn_id='nyc_yellow_taxi_id' 25 | ) 26 | 27 | @task 28 | def download_file(): 29 | context = get_current_context() 30 | return download_dataset(context['execution_date'].strftime('%Y-%m')) 31 | 32 | @task 33 | def to_parquet(file_path: str): 34 | context = get_current_context() 35 | return convert_to_parquet(context['execution_date'].strftime('%Y-%m'), file_path) 36 | 37 | file_path = download_file() 38 | parquet_file_path = to_parquet(file_path) 39 | 40 | check_file >> file_path >> parquet_file_path 41 | 42 | 43 | nyc_dag = nyc_taxi_dataset_dag() 44 | -------------------------------------------------------------------------------- /dags/nyc_taxi/functions.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import datetime as dt 3 | from tempfile import NamedTemporaryFile 4 | 5 | import requests 6 | import pandas as pd 7 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook 8 | 9 | 10 | def download_dataset(year_month: str): 11 | url = ( 12 | f'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_{year_month}.csv' 13 | ) 14 | response = requests.get(url, stream=True) 15 | response.raise_for_status() 16 | 17 | s3 = S3Hook('aws_connection_id') 18 | 19 | s3_path = f's3://nyc-yellow-taxi-raw-data/yellow_tripdata_{year_month}.csv.gz' 20 | bucket, key = s3.parse_s3_url(s3_path) 21 | 22 | with NamedTemporaryFile('w', encoding='utf-8', delete=False) as f: 23 | for chunk in response.iter_lines(): 24 | f.write('{}\n'.format(chunk.decode('utf-8'))) 25 | s3.load_file(f.name, key, bucket, replace=True, gzip=True) 26 | 27 | return s3_path 28 | 29 | 30 | def convert_to_parquet(year_month: str, s3_path: str): 31 | s3 = S3Hook('aws_connection_id') 32 | bucket, key = s3.parse_s3_url(s3_path) 33 | file_path = s3.download_file(key, bucket) 34 | 35 | with 
gzip.open(file_path, mode='rb') as f: 36 | df = pd.read_csv(f) 37 | 38 | current_month = dt.datetime.strptime(year_month, '%Y-%m') 39 | next_month = current_month.replace(month=current_month.month + 1) 40 | 41 | df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime']) 42 | df = df[ 43 | (df['tpep_pickup_datetime'] >= f'{current_month:%Y-%m-%d}') & 44 | (df['tpep_pickup_datetime'] < f'{next_month:%Y-%m-%d}') 45 | ] 46 | df['pickup_date'] = df['tpep_pickup_datetime'].dt.strftime('%Y-%m-%d') 47 | 48 | s3_path = f's3://nyc-yellow-taxi-parquet-data/yellow_tripdata_{year_month}.parquet' 49 | bucket, key = s3.parse_s3_url(s3_path) 50 | 51 | with NamedTemporaryFile('wb', delete=False) as f: 52 | df.to_parquet(f) 53 | 54 | s3.load_file( 55 | f.name, 56 | key, 57 | bucket, 58 | replace=True, 59 | ) 60 | return s3_path 61 | -------------------------------------------------------------------------------- /dags/nyc_taxi_2021_sensor.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | 3 | from airflow import DAG 4 | from airflow.decorators import task 5 | from airflow.operators.python import get_current_context 6 | from airflow.providers.http.sensors.http import HttpSensor 7 | 8 | from nyc_taxi.functions import download_dataset, convert_to_parquet 9 | 10 | default_args = { 11 | 'owner': 'airflow', 12 | } 13 | 14 | 15 | with DAG( 16 | start_date=dt.datetime(2021, 1, 1), 17 | dag_id='nyc_taxi_2021_dag', 18 | schedule_interval='@monthly', 19 | default_args=default_args, 20 | ) as dag: 21 | 22 | check_if_exists = HttpSensor( 23 | method='HEAD', 24 | endpoint='yellow_tripdata_{{ execution_date.strftime("%Y-%m") }}.csv', 25 | http_conn_id='nyc_yellow_taxi_id', 26 | task_id='check_if_exists', 27 | poke_interval=60 * 60 * 24, # every 24 hours 28 | mode='reschedule', 29 | ) 30 | 31 | @task 32 | def download_file(): 33 | context = get_current_context() 34 | return download_dataset(context['execution_date'].strftime('%Y-%m')) 35 | 36 | @task 37 | def to_parquet(file_path: str): 38 | context = get_current_context() 39 | return convert_to_parquet(context['execution_date'].strftime('%Y-%m'), file_path) 40 | 41 | file_path = download_file() 42 | parquet_file_path = to_parquet(file_path) 43 | 44 | check_if_exists >> file_path 45 | -------------------------------------------------------------------------------- /dags/nyc_taxi_pre_2/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from datetime import datetime 4 | 5 | sys.path.append( 6 | os.path.join(os.path.dirname(__file__), '..'), 7 | ) 8 | 9 | from airflow import DAG 10 | from airflow.operators.python import PythonOperator 11 | from airflow.providers.http.operators.http import SimpleHttpOperator 12 | 13 | from nyc_taxi.functions import download_dataset, convert_to_parquet 14 | 15 | 16 | def to_parquet(**context): 17 | ti = context['ti'] 18 | execution_date = context['execution_date'] 19 | s3_path = ti.xcom_pull(task_ids='download_file') 20 | convert_to_parquet(execution_date.strftime('%Y-%m'), s3_path) 21 | 22 | 23 | with DAG( 24 | dag_id='nyc_taxi_pre_2', 25 | start_date=datetime(2020, 1, 1), 26 | schedule_interval='@monthly', 27 | catchup=False, 28 | ) as dag: 29 | check_file_task = SimpleHttpOperator( 30 | task_id='check_file', 31 | method='HEAD', 32 | http_conn_id='nyc_yellow_taxi_id', 33 | endpoint='yellow_tripdata_{{ execution_date.strftime("%Y-%m") }}.csv' 34 | ) 35 | 36 | download_file_task = 
PythonOperator( 37 | task_id='download_file', 38 | python_callable=download_dataset, 39 | op_args=['{{ execution_date.strftime("%Y-%m") }}'] 40 | ) 41 | 42 | to_parquet_task = PythonOperator( 43 | task_id='to_parquet', 44 | python_callable=to_parquet, 45 | ) 46 | 47 | check_file_task >> download_file_task >> to_parquet_task 48 | -------------------------------------------------------------------------------- /dags/sla_example_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta, datetime 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | 6 | 7 | def sla_miss_callback(dag, task_list, blocking_task_list, slas, blocking_tis): 8 | print(dag) 9 | print(task_list) 10 | print(blocking_task_list) 11 | print(slas) 12 | print(blocking_tis) 13 | 14 | 15 | default_args = { 16 | 'email': 'adil.khashtamov@gmail.com', 17 | 'sla': timedelta(seconds=5), 18 | } 19 | 20 | with DAG( 21 | dag_id='sla_example_dag', 22 | sla_miss_callback=sla_miss_callback, 23 | start_date=datetime(2021, 3, 16), 24 | schedule_interval='*/5 * * * *', 25 | catchup=False, 26 | default_args=default_args, 27 | ) as dag: 28 | 29 | cmd = BashOperator( 30 | task_id='slow_task', 31 | bash_command='sleep 15', 32 | dag=dag, 33 | ) 34 | -------------------------------------------------------------------------------- /dags/taskflow_dag_with_two_operators.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | 3 | from airflow.decorators import dag, task 4 | from airflow.operators.python import get_current_context 5 | from airflow.operators.dummy import DummyOperator 6 | 7 | default_args = { 8 | 'owner': 'airflow', 9 | 'start_date': dt.datetime(2021, 1, 20), 10 | } 11 | 12 | 13 | @dag( 14 | default_args=default_args, 15 | schedule_interval='@daily', 16 | dag_id='taskflow_dag_with_two_operators' 17 | ) 18 | def first_dag_taskflow(): 19 | @task 20 | def even_only(): 21 | context = get_current_context() 22 | execution_date = context['execution_date'] 23 | 24 | if execution_date.day % 2 != 0: 25 | raise ValueError(f'Odd day: {execution_date}') 26 | 27 | @task 28 | def dummy_task(): 29 | pass 30 | 31 | even_only() >> dummy_task() 32 | 33 | # вариант с Operator из Airflow 34 | # even_only() >> DummyOperator(task_id='dummy_task') 35 | 36 | 37 | main_dag = first_dag_taskflow() 38 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: '3' 3 | x-airflow-common: 4 | &airflow-common 5 | build: . 
6 | environment: 7 | &airflow-common-env 8 | AIRFLOW__CORE__EXECUTOR: LocalExecutor 9 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 10 | AIRFLOW__CORE__FERNET_KEY: '' 11 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 12 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 13 | AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL: 60 14 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth' 15 | AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG: 2 16 | AIRFLOW__CORE__LAZY_LOAD_PLUGINS: 'false' 17 | volumes: 18 | - ./dags:/opt/airflow/dags 19 | - ./logs:/opt/airflow/logs 20 | - ./plugins:/opt/airflow/plugins 21 | - ./tests:/opt/airflow/tests 22 | user: "${AIRFLOW_UID:-50000}:0" 23 | depends_on: 24 | &airflow-common-depends-on 25 | postgres: 26 | condition: service_healthy 27 | 28 | services: 29 | postgres: 30 | image: postgres:13 31 | environment: 32 | POSTGRES_USER: airflow 33 | POSTGRES_PASSWORD: airflow 34 | POSTGRES_DB: airflow 35 | volumes: 36 | - postgres-db-volume:/var/lib/postgresql/data 37 | healthcheck: 38 | test: ["CMD", "pg_isready", "-U", "airflow"] 39 | interval: 5s 40 | retries: 5 41 | restart: on-failure 42 | 43 | airflow-webserver: 44 | <<: *airflow-common 45 | command: webserver 46 | ports: 47 | - 8080:8080 48 | healthcheck: 49 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 50 | interval: 10s 51 | timeout: 10s 52 | retries: 5 53 | restart: on-failure 54 | depends_on: 55 | <<: *airflow-common-depends-on 56 | airflow-init: 57 | condition: service_completed_successfully 58 | 59 | airflow-scheduler: 60 | <<: *airflow-common 61 | command: scheduler 62 | healthcheck: 63 | test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"'] 64 | interval: 10s 65 | timeout: 10s 66 | retries: 5 67 | restart: on-failure 68 | depends_on: 69 | <<: *airflow-common-depends-on 70 | airflow-init: 71 | condition: service_completed_successfully 72 | 73 | airflow-triggerer: 74 | <<: *airflow-common 75 | command: triggerer 76 | healthcheck: 77 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 78 | interval: 10s 79 | timeout: 10s 80 | retries: 5 81 | restart: on-failure 82 | depends_on: 83 | <<: *airflow-common-depends-on 84 | airflow-init: 85 | condition: service_completed_successfully 86 | 87 | airflow-init: 88 | <<: *airflow-common 89 | entrypoint: /bin/bash 90 | # yamllint disable rule:line-length 91 | command: 92 | - -c 93 | - | 94 | function ver() { 95 | printf "%04d%04d%04d%04d" $${1//./ } 96 | } 97 | airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version) 98 | airflow_version_comparable=$$(ver $${airflow_version}) 99 | min_airflow_version=2.2.0 100 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 101 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 102 | echo 103 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 104 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 105 | echo 106 | exit 1 107 | fi 108 | if [[ -z "${AIRFLOW_UID}" ]]; then 109 | echo 110 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 111 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 112 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 
113 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 114 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user" 115 | echo 116 | fi 117 | one_meg=1048576 118 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 119 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 120 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 121 | warning_resources="false" 122 | if (( mem_available < 4000 )) ; then 123 | echo 124 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 125 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 126 | echo 127 | warning_resources="true" 128 | fi 129 | if (( cpus_available < 2 )); then 130 | echo 131 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 132 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 133 | echo 134 | warning_resources="true" 135 | fi 136 | if (( disk_available < one_meg * 10 )); then 137 | echo 138 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 139 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" 140 | echo 141 | warning_resources="true" 142 | fi 143 | if [[ $${warning_resources} == "true" ]]; then 144 | echo 145 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 146 | echo "Please follow the instructions to increase amount of resources available:" 147 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin" 148 | echo 149 | fi 150 | mkdir -p /sources/logs /sources/dags /sources/plugins 151 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 152 | exec /entrypoint airflow version 153 | # yamllint enable rule:line-length 154 | environment: 155 | <<: *airflow-common-env 156 | _AIRFLOW_DB_UPGRADE: 'true' 157 | _AIRFLOW_WWW_USER_CREATE: 'true' 158 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 159 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 160 | _PIP_ADDITIONAL_REQUIREMENTS: '' 161 | user: "0:0" 162 | volumes: 163 | - .:/sources 164 | 165 | airflow-cli: 166 | <<: *airflow-common 167 | profiles: 168 | - debug 169 | environment: 170 | <<: *airflow-common-env 171 | CONNECTION_CHECK_MAX_COUNT: "0" 172 | # Workaround for entrypoint issue. 
See: https://github.com/apache/airflow/issues/16252 173 | command: 174 | - bash 175 | - -c 176 | - airflow 177 | 178 | volumes: 179 | postgres-db-volume: 180 | -------------------------------------------------------------------------------- /fabfile.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from fabric import Connection, task 4 | 5 | ssh_key_path = os.path.join(os.path.expanduser('~'), '.ssh', 'id_rsa') 6 | HOSTNAME = 'flow.dataeng.ru' 7 | USERNAME = 'airflow' 8 | 9 | 10 | @task 11 | def deploy(context): 12 | with Connection( 13 | HOSTNAME, 14 | connect_kwargs={'key_filename': ssh_key_path}, 15 | user=USERNAME, 16 | ) as conn: 17 | with conn.cd('~/repositories/dataeng_dags'): 18 | with conn.prefix('source ~/venvs/.airflow/bin/activate'): 19 | _ = conn.run('git pull') 20 | print(f'{_.stdout.strip()}') 21 | _ = conn.run('pip install -r requirements.txt', hide=True) 22 | print(f'{_.stdout.strip()}') 23 | _ = conn.run('cp -R dags/* ~/airflow/dags/', hide=True) 24 | print(f'{_.stdout.strip()}') 25 | -------------------------------------------------------------------------------- /requirements-ci.txt: -------------------------------------------------------------------------------- 1 | fabric==2.6.0 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyarrow==13.0.0 2 | pyTelegramBotAPI==4.14.0 3 | apache-airflow-providers-postgres==5.6.1 4 | --------------------------------------------------------------------------------
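Note: docker-compose.yml mounts a ./tests directory into the Airflow containers (- ./tests:/opt/airflow/tests), but no tests/ directory appears in the tree above. A minimal DAG-integrity check that could live there is sketched below; the file name tests/test_dag_integrity.py and the use of pytest are assumptions for illustration, not part of the repository:

/tests/test_dag_integrity.py (suggested sketch, not in the repository):
--------------------------------------------------------------------------------
# Smoke test: every file under dags/ should import cleanly and register at least one DAG.
# Assumes pytest is available inside the container (it is not listed in requirements.txt).
from airflow.models import DagBag


def test_dags_import_without_errors():
    # Parse everything mounted at /opt/airflow/dags (./dags in docker-compose.yml)
    dag_bag = DagBag(dag_folder='/opt/airflow/dags', include_examples=False)
    # import_errors maps each failing DAG file to its traceback
    assert dag_bag.import_errors == {}
    # e.g. first_dag, sla_example_dag and the other DAGs defined above should be collected
    assert len(dag_bag.dags) > 0
--------------------------------------------------------------------------------
A possible invocation, once the stack from docker-compose.yml is up:
docker compose exec airflow-scheduler python -m pytest /opt/airflow/tests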