├── .gitignore ├── .prometheus ├── README.md ├── mapping.yml └── prometheus.yml ├── LICENSE ├── README.md ├── airflow.cfg ├── dags ├── bigquery_data_analytics.py ├── bigquery_data_load.py ├── bigquery_data_validation.py ├── core_concepts.py └── pyspark_subdag.py ├── data ├── 4649493c.csv └── c876bd01.csv ├── plugins ├── __init__.py └── bigquery_plugin.py ├── pyspark ├── weekday │ ├── avg_speed.py │ ├── avg_temperature.py │ └── avg_tire_pressure.py └── weekend │ └── gas_composition_count.py ├── tests ├── README.md ├── test_bigquery_data_validation.py └── test_core_concepts.py └── variables └── dev.json /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.prometheus/README.md: -------------------------------------------------------------------------------- 1 | # Monitor Airflow with StatsD, Prometheus and Grafana 2 | 3 | This is the source code for the lecture Monitor Airflow with StatsD, Prometheus and Grafana in Section 8: Airflow in Production. 
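Before starting the exporter stack it is worth checking that Airflow itself is emitting StatsD metrics. The `airflow.cfg` at the root of this repository already enables the integration; the relevant settings live in its `[scheduler]` section and look like this:

```ini
# StatsD settings from this repository's airflow.cfg ([scheduler] section)
statsd_on = True
statsd_host = localhost
statsd_port = 8125
statsd_prefix = airflow
```

With these values Airflow sends its metrics over UDP to localhost:8125, which is the port the statsd-exporter container below listens on.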
4 | 5 | Make sure you're in the `.prometheus/` directory while running these commands. We're using Docker to run these services, so make sure you have Docker installed. 6 | 7 | ## Run statsd-exporter 8 | 9 | This command configures statsd-exporter to listen for metrics sent on port 8125. It converts StatsD metrics to Prometheus format using the `mapping.yml` configuration file. It exposes the converted metrics for Prometheus to scrape on host port 9123 (container port 9102). 10 | 11 | ```Bash 12 | docker run --name=prom-statsd-exporter \ 13 | -p 9123:9102 \ 14 | -p 8125:8125/udp \ 15 | -v $PWD/mapping.yml:/tmp/mapping.yml \ 16 | prom/statsd-exporter \ 17 | --statsd.mapping-config=/tmp/mapping.yml \ 18 | --statsd.listen-udp=:8125 \ 19 | --web.listen-address=:9102 20 | ``` 21 | 22 | ## Run Prometheus 23 | 24 | This command runs Prometheus and configures it to scrape the metrics exposed on port 9123. It serves Prometheus on port 9090 so that Grafana can later use it as a data source. 25 | 26 | ```Bash 27 | docker run --name=prometheus \ 28 | -p 9090:9090 \ 29 | -v $PWD/prometheus.yml:/prometheus.yml \ 30 | prom/prometheus \ 31 | --config.file=/prometheus.yml \ 32 | --log.level=debug \ 33 | --web.listen-address=:9090 \ 34 | --web.page-title='Prometheus - Airflow Demo' 35 | ``` 36 | 37 | Now you can access the Prometheus web UI on http://localhost:9090. 38 | 39 | ## Run Grafana 40 | 41 | ```Bash 42 | docker run -d --name=grafana -p 3000:3000 grafana/grafana 43 | ``` 44 | 45 | Now you can access the Grafana web UI on http://localhost:3000. 46 | 47 | Follow the lecture to configure Prometheus as a data source in the Grafana UI. 48 | -------------------------------------------------------------------------------- /.prometheus/mapping.yml: -------------------------------------------------------------------------------- 1 | mappings: 2 | - match: 'airflow.*' 3 | name: 'airflow' 4 | labels: 5 | metric: '$1' 6 | -------------------------------------------------------------------------------- /.prometheus/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | evaluation_interval: 15s 4 | 5 | scrape_configs: 6 | - job_name: 'prometheus' 7 | static_configs: 8 | - targets: ['localhost:9090'] 9 | 10 | - job_name: 'airflow' 11 | static_configs: 12 | - targets: ['host.docker.internal:9123'] 13 | labels: {'host': 'airflow-statsd'} 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Alexandra Abbas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apache Airflow: Complete Hands-On Beginner to Advanced Class 2 | 3 | This repository holds the source code for the Udemy online course [Apache Airflow: Complete Hands-On Beginner to Advanced Class](https://www.udemy.com/course/apache-airflow-course/?referralCode=7A7192D2BDE0A30803F8) by Alexandra Abbas. 4 | 5 | ## Install Apache Airflow 6 | 7 | As explained in the course, before making use of this code base you need to install Apache Airflow locally on your machine. 8 | 9 | ```Bash 10 | pip install apache-airflow[gcp,statsd,sentry]==1.10.10 11 | ``` 12 | 13 | Install these extra packages as well. 14 | 15 | ```Bash 16 | pip install cryptography==2.9.2 17 | pip install pyspark==2.4.5 18 | ``` 19 | 20 | To validate your Airflow installation, check your Airflow version. This should print 1.10.10. 21 | 22 | ```Bash 23 | airflow version 24 | ``` 25 | 26 | If you have installed Airflow earlier, you might get a DeprecationWarning about having multiple airflow.cfg files, but that’s okay as long as you set the correct AIRFLOW_HOME environment variable in your Terminal. 27 | 28 | ## Initialise an Airflow environment 29 | 30 | As a next step you need to initialise an Airflow environment locally to run DAGs. 31 | 32 | Set the AIRFLOW_HOME variable. 33 | 34 | ```Bash 35 | export AIRFLOW_HOME=path/to/this/directory 36 | ``` 37 | 38 | Initialise Airflow and the metadata database. 39 | 40 | ```Bash 41 | airflow initdb 42 | ``` 43 | 44 | Now, you can run both the web server and the scheduler. 45 | 46 | Run the web server. 47 | 48 | ```Bash 49 | airflow webserver 50 | ``` 51 | 52 | In a different terminal window/session, where you set the AIRFLOW_HOME variable again, run the scheduler. 53 | 54 | ```Bash 55 | airflow scheduler 56 | ``` 57 | 58 | Great! 🎉 Now you can access the Airflow web UI on http://localhost:8080. 59 | -------------------------------------------------------------------------------- /airflow.cfg: -------------------------------------------------------------------------------- 1 | [core] 2 | # The folder where your airflow pipelines live, most likely a 3 | # subfolder in a code repository. This path must be absolute. 4 | dags_folder = /Users/alexaabbas/Desktop/airflow-tutorial/dags 5 | 6 | # The folder where airflow should store its log files 7 | # This path must be absolute 8 | base_log_folder = /Users/alexaabbas/Desktop/airflow-tutorial/logs 9 | 10 | # Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search. 11 | # Set this to True if you want to enable remote logging. 12 | remote_logging = True 13 | 14 | # Users must supply an Airflow connection id that provides access to the storage 15 | # location.
16 | remote_log_conn_id = google_cloud_default 17 | remote_base_log_folder = gs://aa-logistics-landing-bucket/airflow 18 | encrypt_s3_logs = False 19 | 20 | # Logging level 21 | logging_level = INFO 22 | 23 | # Logging level for Flask-appbuilder UI 24 | fab_logging_level = WARN 25 | 26 | # Logging class 27 | # Specify the class that will specify the logging configuration 28 | # This class has to be on the python classpath 29 | # Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG 30 | logging_config_class = 31 | 32 | # Flag to enable/disable Colored logs in Console 33 | # Colour the logs when the controlling terminal is a TTY. 34 | colored_console_log = True 35 | 36 | # Log format for when Colored logs is enabled 37 | colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s 38 | colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter 39 | 40 | # Format of Log line 41 | log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s 42 | simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s 43 | 44 | # Log filename format 45 | log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log 46 | log_processor_filename_template = {{ filename }}.log 47 | dag_processor_manager_log_location = /Users/alexaabbas/Desktop/airflow-tutorial/logs/dag_processor_manager/dag_processor_manager.log 48 | 49 | # Name of handler to read task instance logs. 50 | # Default to use task handler. 51 | task_log_reader = task 52 | 53 | # Hostname by providing a path to a callable, which will resolve the hostname. 54 | # The format is "package:function". 55 | # 56 | # For example, default value "socket:getfqdn" means that result from getfqdn() of "socket" 57 | # package will be used as hostname. 58 | # 59 | # No argument should be required in the function specified. 60 | # If using IP address as hostname is preferred, use value ``airflow.utils.net:get_host_ip_address`` 61 | hostname_callable = socket:getfqdn 62 | 63 | # Default timezone in case supplied date times are naive 64 | # can be utc (default), system, or any IANA timezone string (e.g. Europe/Amsterdam) 65 | default_timezone = utc 66 | 67 | # The executor class that airflow should use. Choices include 68 | # SequentialExecutor, LocalExecutor, CeleryExecutor, DaskExecutor, KubernetesExecutor 69 | executor = SequentialExecutor 70 | 71 | # The SqlAlchemy connection string to the metadata database. 72 | # SqlAlchemy supports many different database engine, more information 73 | # their website 74 | sql_alchemy_conn = sqlite:////Users/alexaabbas/Desktop/airflow-tutorial/airflow.db 75 | 76 | # The encoding for the databases 77 | sql_engine_encoding = utf-8 78 | 79 | # If SqlAlchemy should pool database connections. 80 | sql_alchemy_pool_enabled = True 81 | 82 | # The SqlAlchemy pool size is the maximum number of database connections 83 | # in the pool. 0 indicates no limit. 84 | sql_alchemy_pool_size = 5 85 | 86 | # The maximum overflow size of the pool. 87 | # When the number of checked-out connections reaches the size set in pool_size, 88 | # additional connections will be returned up to this limit. 89 | # When those additional connections are returned to the pool, they are disconnected and discarded. 
90 | # It follows then that the total number of simultaneous connections the pool will allow 91 | # is pool_size + max_overflow, 92 | # and the total number of "sleeping" connections the pool will allow is pool_size. 93 | # max_overflow can be set to -1 to indicate no overflow limit; 94 | # no limit will be placed on the total number of concurrent connections. Defaults to 10. 95 | sql_alchemy_max_overflow = 10 96 | 97 | # The SqlAlchemy pool recycle is the number of seconds a connection 98 | # can be idle in the pool before it is invalidated. This config does 99 | # not apply to sqlite. If the number of DB connections is ever exceeded, 100 | # a lower config value will allow the system to recover faster. 101 | sql_alchemy_pool_recycle = 1800 102 | 103 | # Check connection at the start of each connection pool checkout. 104 | # Typically, this is a simple statement like "SELECT 1". 105 | # More information here: 106 | # https://docs.sqlalchemy.org/en/13/core/pooling.html#disconnect-handling-pessimistic 107 | sql_alchemy_pool_pre_ping = True 108 | 109 | # The schema to use for the metadata database. 110 | # SqlAlchemy supports databases with the concept of multiple schemas. 111 | sql_alchemy_schema = 112 | 113 | # The amount of parallelism as a setting to the executor. This defines 114 | # the max number of task instances that should run simultaneously 115 | # on this airflow installation 116 | parallelism = 32 117 | 118 | # The number of task instances allowed to run concurrently by the scheduler 119 | dag_concurrency = 16 120 | 121 | # Are DAGs paused by default at creation 122 | dags_are_paused_at_creation = True 123 | 124 | # The maximum number of active DAG runs per DAG 125 | max_active_runs_per_dag = 16 126 | 127 | # Whether to load the DAG examples that ship with Airflow. It's good to 128 | # get started, but you probably want to set this to False in a production 129 | # environment 130 | load_examples = False 131 | 132 | # Whether to load the default connections that ship with Airflow. It's good to 133 | # get started, but you probably want to set this to False in a production 134 | # environment 135 | load_default_connections = True 136 | 137 | # Where your Airflow plugins are stored 138 | plugins_folder = /Users/alexaabbas/Desktop/airflow-tutorial/plugins 139 | 140 | # Secret key to save connection passwords in the db 141 | fernet_key = aMe6oy61nW2447cp76p6VLi3PN8almZL6J6nXC7cxSc= 142 | 143 | # Whether to disable pickling dags 144 | donot_pickle = False 145 | 146 | # How long before timing out a python file import 147 | dagbag_import_timeout = 30 148 | 149 | # How long before timing out a DagFileProcessor, which processes a dag file 150 | dag_file_processor_timeout = 50 151 | 152 | # The class to use for running task instances in a subprocess 153 | task_runner = StandardTaskRunner 154 | 155 | # If set, tasks without a ``run_as_user`` argument will be run with this user 156 | # Can be used to de-elevate a sudo user running Airflow when executing tasks 157 | default_impersonation = 158 | 159 | # What security module to use (for example kerberos) 160 | security = 161 | 162 | # If set to False enables some unsecure features like Charts and Ad Hoc Queries. 163 | # In 2.0 will default to True. 164 | secure_mode = False 165 | 166 | # Turn unit test mode on (overwrites many configuration options with test 167 | # values at runtime) 168 | unit_test_mode = False 169 | 170 | # Whether to enable pickling for xcom (note that this is insecure and allows for 171 | # RCE exploits). 
This will be deprecated in Airflow 2.0 (be forced to False). 172 | enable_xcom_pickling = True 173 | 174 | # When a task is killed forcefully, this is the amount of time in seconds that 175 | # it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED 176 | killed_task_cleanup_time = 60 177 | 178 | # Whether to override params with dag_run.conf. If you pass some key-value pairs 179 | # through ``airflow dags backfill -c`` or 180 | # ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params. 181 | dag_run_conf_overrides_params = False 182 | 183 | # Worker initialisation check to validate Metadata Database connection 184 | worker_precheck = False 185 | 186 | # When discovering DAGs, ignore any files that don't contain the strings ``DAG`` and ``airflow``. 187 | dag_discovery_safe_mode = True 188 | 189 | # The number of retries each task is going to have by default. Can be overridden at dag or task level. 190 | default_task_retries = 0 191 | 192 | # Whether to serialise DAGs and persist them in DB. 193 | # If set to True, Webserver reads from DB instead of parsing DAG files 194 | # More details: https://airflow.apache.org/docs/stable/dag-serialization.html 195 | store_serialized_dags = False 196 | 197 | # Updating serialized DAG can not be faster than a minimum interval to reduce database write rate. 198 | min_serialized_dag_update_interval = 30 199 | 200 | # Whether to persist DAG files code in DB. 201 | # If set to True, Webserver reads file contents from DB instead of 202 | # trying to access files in a DAG folder. Defaults to same as the 203 | # ``store_serialized_dags`` setting. 204 | store_dag_code = %(store_serialized_dags)s 205 | 206 | # Maximum number of Rendered Task Instance Fields (Template Fields) per task to store 207 | # in the Database. 208 | # When Dag Serialization is enabled (``store_serialized_dags=True``), all the template_fields 209 | # for each of Task Instance are stored in the Database. 210 | # Keeping this number small may cause an error when you try to view ``Rendered`` tab in 211 | # TaskInstance view for older tasks. 212 | max_num_rendered_ti_fields_per_task = 30 213 | 214 | # On each dagrun check against defined SLAs 215 | check_slas = True 216 | 217 | [secrets] 218 | # Full class name of secrets backend to enable (will precede env vars and metastore in search path) 219 | # Example: backend = airflow.contrib.secrets.aws_systems_manager.SystemsManagerParameterStoreBackend 220 | backend = 221 | 222 | # The backend_kwargs param is loaded into a dictionary and passed to __init__ of secrets backend class. 223 | # See documentation for the secrets backend you are using. JSON is expected. 224 | # Example for AWS Systems Manager ParameterStore: 225 | # ``{"connections_prefix": "/airflow/connections", "profile_name": "default"}`` 226 | backend_kwargs = 227 | 228 | [cli] 229 | # In what way should the cli access the API. The LocalClient will use the 230 | # database directly, while the json_client will use the api running on the 231 | # webserver 232 | api_client = airflow.api.client.local_client 233 | 234 | # If you set web_server_url_prefix, do NOT forget to append it here, ex: 235 | # ``endpoint_url = http://localhost:8080/myroot`` 236 | # So api will look like: ``http://localhost:8080/myroot/api/experimental/...`` 237 | endpoint_url = http://localhost:8080 238 | 239 | [debug] 240 | # Used only with DebugExecutor. If set to True DAG will fail with first 241 | # failed task. Helpful for debugging purposes. 
242 | fail_fast = False 243 | 244 | [api] 245 | # How to authenticate users of the API 246 | auth_backend = airflow.api.auth.backend.default 247 | 248 | [lineage] 249 | # what lineage backend to use 250 | backend = 251 | 252 | [atlas] 253 | sasl_enabled = False 254 | host = 255 | port = 21000 256 | username = 257 | password = 258 | 259 | [operators] 260 | # The default owner assigned to each new operator, unless 261 | # provided explicitly or passed via ``default_args`` 262 | default_owner = airflow 263 | default_cpus = 1 264 | default_ram = 512 265 | default_disk = 512 266 | default_gpus = 0 267 | 268 | [hive] 269 | # Default mapreduce queue for HiveOperator tasks 270 | default_hive_mapred_queue = 271 | 272 | [webserver] 273 | # The base url of your website as airflow cannot guess what domain or 274 | # cname you are using. This is used in automated emails that 275 | # airflow sends to point links to the right web server 276 | base_url = http://localhost:8080 277 | 278 | # Default timezone to display all dates in the RBAC UI, can be UTC, system, or 279 | # any IANA timezone string (e.g. Europe/Amsterdam). If left empty the 280 | # default value of core/default_timezone will be used 281 | # Example: default_ui_timezone = America/New_York 282 | default_ui_timezone = UTC 283 | 284 | # The ip specified when starting the web server 285 | web_server_host = 0.0.0.0 286 | 287 | # The port on which to run the web server 288 | web_server_port = 8080 289 | 290 | # Paths to the SSL certificate and key for the web server. When both are 291 | # provided SSL will be enabled. This does not change the web server port. 292 | web_server_ssl_cert = 293 | 294 | # Paths to the SSL certificate and key for the web server. When both are 295 | # provided SSL will be enabled. This does not change the web server port. 296 | web_server_ssl_key = 297 | 298 | # Number of seconds the webserver waits before killing gunicorn master that doesn't respond 299 | web_server_master_timeout = 120 300 | 301 | # Number of seconds the gunicorn webserver waits before timing out on a worker 302 | web_server_worker_timeout = 120 303 | 304 | # Number of workers to refresh at a time. When set to 0, worker refresh is 305 | # disabled. When nonzero, airflow periodically refreshes webserver workers by 306 | # bringing up new ones and killing old ones. 307 | worker_refresh_batch_size = 1 308 | 309 | # Number of seconds to wait before refreshing a batch of workers. 310 | worker_refresh_interval = 30 311 | 312 | # Secret key used to run your flask app 313 | # It should be as random as possible 314 | secret_key = temporary_key 315 | 316 | # Number of workers to run the Gunicorn web server 317 | workers = 4 318 | 319 | # The worker class gunicorn should use. Choices include 320 | # sync (default), eventlet, gevent 321 | worker_class = sync 322 | 323 | # Log files for the gunicorn webserver. '-' means log to stderr. 324 | access_logfile = - 325 | 326 | # Log files for the gunicorn webserver. '-' means log to stderr. 
327 | error_logfile = - 328 | 329 | # Expose the configuration file in the web server 330 | expose_config = False 331 | 332 | # Expose hostname in the web server 333 | expose_hostname = True 334 | 335 | # Expose stacktrace in the web server 336 | expose_stacktrace = True 337 | 338 | # Set to true to turn on authentication: 339 | # https://airflow.apache.org/security.html#web-authentication 340 | authenticate = True 341 | auth_backend = airflow.contrib.backends.password_auth 342 | 343 | # Filter the list of dags by owner name (requires authentication to be enabled) 344 | filter_by_owner = False 345 | 346 | # Filtering mode. Choices include user (default) and ldapgroup. 347 | # Ldap group filtering requires using the ldap backend 348 | # 349 | # Note that the ldap server needs the "memberOf" overlay to be set up 350 | # in order to user the ldapgroup mode. 351 | owner_mode = user 352 | 353 | # Default DAG view. Valid values are: 354 | # tree, graph, duration, gantt, landing_times 355 | dag_default_view = tree 356 | 357 | # "Default DAG orientation. Valid values are:" 358 | # LR (Left->Right), TB (Top->Bottom), RL (Right->Left), BT (Bottom->Top) 359 | dag_orientation = LR 360 | 361 | # Puts the webserver in demonstration mode; blurs the names of Operators for 362 | # privacy. 363 | demo_mode = False 364 | 365 | # The amount of time (in secs) webserver will wait for initial handshake 366 | # while fetching logs from other worker machine 367 | log_fetch_timeout_sec = 5 368 | 369 | # Time interval (in secs) to wait before next log fetching. 370 | log_fetch_delay_sec = 2 371 | 372 | # Distance away from page bottom to enable auto tailing. 373 | log_auto_tailing_offset = 30 374 | 375 | # Animation speed for auto tailing log display. 376 | log_animation_speed = 1000 377 | 378 | # By default, the webserver shows paused DAGs. Flip this to hide paused 379 | # DAGs by default 380 | hide_paused_dags_by_default = False 381 | 382 | # Consistent page size across all listing views in the UI 383 | page_size = 100 384 | 385 | # Use FAB-based webserver with RBAC feature 386 | rbac = True 387 | 388 | # Define the color of navigation bar 389 | navbar_color = #007A87 390 | 391 | # Default dagrun to show in UI 392 | default_dag_run_display_number = 25 393 | 394 | # Enable werkzeug ``ProxyFix`` middleware for reverse proxy 395 | enable_proxy_fix = False 396 | 397 | # Number of values to trust for ``X-Forwarded-For``. 398 | # More info: https://werkzeug.palletsprojects.com/en/0.16.x/middleware/proxy_fix/ 399 | proxy_fix_x_for = 1 400 | 401 | # Number of values to trust for ``X-Forwarded-Proto`` 402 | proxy_fix_x_proto = 1 403 | 404 | # Number of values to trust for ``X-Forwarded-Host`` 405 | proxy_fix_x_host = 1 406 | 407 | # Number of values to trust for ``X-Forwarded-Port`` 408 | proxy_fix_x_port = 1 409 | 410 | # Number of values to trust for ``X-Forwarded-Prefix`` 411 | proxy_fix_x_prefix = 1 412 | 413 | # Set secure flag on session cookie 414 | cookie_secure = False 415 | 416 | # Set samesite policy on session cookie 417 | cookie_samesite = 418 | 419 | # Default setting for wrap toggle on DAG code and TI log views. 
420 | default_wrap = False 421 | 422 | # Allow the UI to be rendered in a frame 423 | x_frame_enabled = True 424 | 425 | # Send anonymous user activity to your analytics tool 426 | # choose from google_analytics, segment, or metarouter 427 | # analytics_tool = 428 | 429 | # Unique ID of your account in the analytics tool 430 | # analytics_id = 431 | 432 | # Update FAB permissions and sync security manager roles 433 | # on webserver startup 434 | update_fab_perms = True 435 | 436 | # Minutes of non-activity before logged out from UI 437 | # 0 means never get forcibly logged out 438 | force_log_out_after = 0 439 | 440 | # The UI cookie lifetime in days 441 | session_lifetime_days = 30 442 | 443 | [email] 444 | email_backend = airflow.utils.email.send_email_smtp 445 | 446 | [smtp] 447 | 448 | # If you want airflow to send emails on retries, failure, and you want to use 449 | # the airflow.utils.email.send_email_smtp function, you have to configure an 450 | # smtp server here 451 | smtp_host = localhost 452 | smtp_starttls = True 453 | smtp_ssl = False 454 | # Example: smtp_user = airflow 455 | # smtp_user = 456 | # Example: smtp_password = airflow 457 | # smtp_password = 458 | smtp_port = 25 459 | smtp_mail_from = airflow@example.com 460 | 461 | [sentry] 462 | 463 | # Sentry (https://docs.sentry.io) integration 464 | sentry_dsn = https://2cb2cf66faae436992bc5d32b92604cb@o402350.ingest.sentry.io/5303573 465 | 466 | [celery] 467 | 468 | # This section only applies if you are using the CeleryExecutor in 469 | # ``[core]`` section above 470 | # The app name that will be used by celery 471 | celery_app_name = airflow.executors.celery_executor 472 | 473 | # The concurrency that will be used when starting workers with the 474 | # ``airflow celery worker`` command. This defines the number of task instances that 475 | # a worker will take, so size up your workers based on the resources on 476 | # your worker box and the nature of your tasks 477 | worker_concurrency = 16 478 | 479 | # The maximum and minimum concurrency that will be used when starting workers with the 480 | # ``airflow celery worker`` command (always keep minimum processes, but grow 481 | # to maximum if necessary). Note the value should be max_concurrency,min_concurrency 482 | # Pick these numbers based on resources on worker box and the nature of the task. 483 | # If autoscale option is available, worker_concurrency will be ignored. 484 | # http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale 485 | # Example: worker_autoscale = 16,12 486 | # worker_autoscale = 487 | 488 | # When you start an airflow worker, airflow starts a tiny web server 489 | # subprocess to serve the workers local log files to the airflow main 490 | # web server, who then builds pages and sends them to users. This defines 491 | # the port on which the logs are served. It needs to be unused, and open 492 | # visible from the main web server to connect into the workers. 493 | worker_log_server_port = 8793 494 | 495 | # The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally 496 | # a sqlalchemy database. Refer to the Celery documentation for more 497 | # information. 498 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#broker-settings 499 | broker_url = sqla+mysql://airflow:airflow@localhost:3306/airflow 500 | 501 | # The Celery result_backend. When a job finishes, it needs to update the 502 | # metadata of the job. 
Therefore it will post a message on a message bus, 503 | # or insert it into a database (depending of the backend) 504 | # This status is used by the scheduler to update the state of the task 505 | # The use of a database is highly recommended 506 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings 507 | result_backend = db+mysql://airflow:airflow@localhost:3306/airflow 508 | 509 | # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start 510 | # it ``airflow flower``. This defines the IP that Celery Flower runs on 511 | flower_host = 0.0.0.0 512 | 513 | # The root URL for Flower 514 | # Example: flower_url_prefix = /flower 515 | flower_url_prefix = 516 | 517 | # This defines the port that Celery Flower runs on 518 | flower_port = 5555 519 | 520 | # Securing Flower with Basic Authentication 521 | # Accepts user:password pairs separated by a comma 522 | # Example: flower_basic_auth = user1:password1,user2:password2 523 | flower_basic_auth = 524 | 525 | # Default queue that tasks get assigned to and that worker listen on. 526 | default_queue = default 527 | 528 | # How many processes CeleryExecutor uses to sync task state. 529 | # 0 means to use max(1, number of cores - 1) processes. 530 | sync_parallelism = 0 531 | 532 | # Import path for celery configuration options 533 | celery_config_options = airflow.config_templates.default_celery.DEFAULT_CELERY_CONFIG 534 | 535 | # In case of using SSL 536 | ssl_active = False 537 | ssl_key = 538 | ssl_cert = 539 | ssl_cacert = 540 | 541 | # Celery Pool implementation. 542 | # Choices include: prefork (default), eventlet, gevent or solo. 543 | # See: 544 | # https://docs.celeryproject.org/en/latest/userguide/workers.html#concurrency 545 | # https://docs.celeryproject.org/en/latest/userguide/concurrency/eventlet.html 546 | pool = prefork 547 | 548 | # The number of seconds to wait before timing out ``send_task_to_executor`` or 549 | # ``fetch_celery_task_state`` operations. 550 | operation_timeout = 2 551 | 552 | [celery_broker_transport_options] 553 | 554 | # This section is for specifying options which can be passed to the 555 | # underlying celery broker transport. See: 556 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_transport_options 557 | # The visibility timeout defines the number of seconds to wait for the worker 558 | # to acknowledge the task before the message is redelivered to another worker. 559 | # Make sure to increase the visibility timeout to match the time of the longest 560 | # ETA you're planning to use. 561 | # visibility_timeout is only supported for Redis and SQS celery brokers. 562 | # See: 563 | # http://docs.celeryproject.org/en/master/userguide/configuration.html#std:setting-broker_transport_options 564 | # Example: visibility_timeout = 21600 565 | # visibility_timeout = 566 | 567 | [dask] 568 | 569 | # This section only applies if you are using the DaskExecutor in 570 | # [core] section above 571 | # The IP address and port of the Dask cluster's scheduler. 572 | cluster_address = 127.0.0.1:8786 573 | 574 | # TLS/ SSL settings to access a secured Dask scheduler. 575 | tls_ca = 576 | tls_cert = 577 | tls_key = 578 | 579 | [scheduler] 580 | # Task instances listen for external kill signal (when you clear tasks 581 | # from the CLI or the UI), this defines the frequency at which they should 582 | # listen (in seconds). 
583 | job_heartbeat_sec = 5 584 | 585 | # The scheduler constantly tries to trigger new tasks (look at the 586 | # scheduler section in the docs for more information). This defines 587 | # how often the scheduler should run (in seconds). 588 | scheduler_heartbeat_sec = 5 589 | 590 | # After how much time should the scheduler terminate in seconds 591 | # -1 indicates to run continuously (see also num_runs) 592 | run_duration = -1 593 | 594 | # The number of times to try to schedule each DAG file 595 | # -1 indicates unlimited number 596 | num_runs = -1 597 | 598 | # The number of seconds to wait between consecutive DAG file processing 599 | processor_poll_interval = 1 600 | 601 | # after how much time (seconds) a new DAGs should be picked up from the filesystem 602 | min_file_process_interval = 0 603 | 604 | # How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes. 605 | dag_dir_list_interval = 300 606 | 607 | # How often should stats be printed to the logs. Setting to 0 will disable printing stats 608 | print_stats_interval = 30 609 | 610 | # If the last scheduler heartbeat happened more than scheduler_health_check_threshold 611 | # ago (in seconds), scheduler is considered unhealthy. 612 | # This is used by the health check in the "/health" endpoint 613 | scheduler_health_check_threshold = 30 614 | child_process_log_directory = /Users/alexaabbas/Desktop/airflow-tutorial/logs/scheduler 615 | 616 | # Local task jobs periodically heartbeat to the DB. If the job has 617 | # not heartbeat in this many seconds, the scheduler will mark the 618 | # associated task instance as failed and will re-schedule the task. 619 | scheduler_zombie_task_threshold = 300 620 | 621 | # Turn off scheduler catchup by setting this to False. 622 | # Default behavior is unchanged and 623 | # Command Line Backfills still work, but the scheduler 624 | # will not do scheduler catchup if this is False, 625 | # however it can be set on a per DAG basis in the 626 | # DAG definition (catchup) 627 | catchup_by_default = True 628 | 629 | # This changes the batch size of queries in the scheduling main loop. 630 | # If this is too high, SQL query performance may be impacted by one 631 | # or more of the following: 632 | # - reversion to full table scan 633 | # - complexity of query predicate 634 | # - excessive locking 635 | # Additionally, you may hit the maximum allowable query length for your db. 636 | # Set this to 0 for no limit (not advised) 637 | max_tis_per_query = 512 638 | 639 | # Statsd (https://github.com/etsy/statsd) integration settings 640 | statsd_on = True 641 | statsd_host = localhost 642 | statsd_port = 8125 643 | statsd_prefix = airflow 644 | 645 | # If you want to avoid send all the available metrics to StatsD, 646 | # you can configure an allow list of prefixes to send only the metrics that 647 | # start with the elements of the list (e.g: scheduler,executor,dagrun) 648 | statsd_allow_list = 649 | 650 | # The scheduler can run multiple threads in parallel to schedule dags. 651 | # This defines how many threads will run. 652 | max_threads = 2 653 | authenticate = False 654 | 655 | # Turn off scheduler use of cron intervals by setting this to False. 656 | # DAGs submitted manually in the web UI or with trigger_dag will still run. 
657 | use_job_schedule = True 658 | 659 | # Allow externally triggered DagRuns for Execution Dates in the future 660 | # Only has effect if schedule_interval is set to None in DAG 661 | allow_trigger_in_future = False 662 | 663 | [ldap] 664 | # set this to ldaps://: 665 | uri = 666 | user_filter = objectClass=* 667 | user_name_attr = uid 668 | group_member_attr = memberOf 669 | superuser_filter = 670 | data_profiler_filter = 671 | bind_user = cn=Manager,dc=example,dc=com 672 | bind_password = insecure 673 | basedn = dc=example,dc=com 674 | cacert = /etc/ca/ldap_ca.crt 675 | search_scope = LEVEL 676 | 677 | # This setting allows the use of LDAP servers that either return a 678 | # broken schema, or do not return a schema. 679 | ignore_malformed_schema = False 680 | 681 | [mesos] 682 | # Mesos master address which MesosExecutor will connect to. 683 | master = localhost:5050 684 | 685 | # The framework name which Airflow scheduler will register itself as on mesos 686 | framework_name = Airflow 687 | 688 | # Number of cpu cores required for running one task instance using 689 | # 'airflow run --local -p ' 690 | # command on a mesos slave 691 | task_cpu = 1 692 | 693 | # Memory in MB required for running one task instance using 694 | # 'airflow run --local -p ' 695 | # command on a mesos slave 696 | task_memory = 256 697 | 698 | # Enable framework checkpointing for mesos 699 | # See http://mesos.apache.org/documentation/latest/slave-recovery/ 700 | checkpoint = False 701 | 702 | # Failover timeout in milliseconds. 703 | # When checkpointing is enabled and this option is set, Mesos waits 704 | # until the configured timeout for 705 | # the MesosExecutor framework to re-register after a failover. Mesos 706 | # shuts down running tasks if the 707 | # MesosExecutor framework fails to re-register within this timeframe. 708 | # Example: failover_timeout = 604800 709 | # failover_timeout = 710 | 711 | # Enable framework authentication for mesos 712 | # See http://mesos.apache.org/documentation/latest/configuration/ 713 | authenticate = False 714 | 715 | # Mesos credentials, if authentication is enabled 716 | # Example: default_principal = admin 717 | # default_principal = 718 | # Example: default_secret = admin 719 | # default_secret = 720 | 721 | # Optional Docker Image to run on slave before running the command 722 | # This image should be accessible from mesos slave i.e mesos slave 723 | # should be able to pull this docker image before executing the command. 724 | # Example: docker_image_slave = puckel/docker-airflow 725 | # docker_image_slave = 726 | 727 | [kerberos] 728 | ccache = /tmp/airflow_krb5_ccache 729 | 730 | # gets augmented with fqdn 731 | principal = airflow 732 | reinit_frequency = 3600 733 | kinit_path = kinit 734 | keytab = airflow.keytab 735 | 736 | [github_enterprise] 737 | api_rev = v3 738 | 739 | [admin] 740 | # UI to hide sensitive variable fields when set to True 741 | hide_sensitive_variable_fields = True 742 | 743 | [elasticsearch] 744 | # Elasticsearch host 745 | host = 746 | 747 | # Format of the log_id, which is used to query for a given tasks logs 748 | log_id_template = {dag_id}-{task_id}-{execution_date}-{try_number} 749 | 750 | # Used to mark the end of a log stream for a task 751 | end_of_log_mark = end_of_log 752 | 753 | # Qualified URL for an elasticsearch frontend (like Kibana) with a template argument for log_id 754 | # Code will construct log_id using the log_id template from the argument above. 
755 | # NOTE: The code will prefix the https:// automatically, don't include that here. 756 | frontend = 757 | 758 | # Write the task logs to the stdout of the worker, rather than the default files 759 | write_stdout = False 760 | 761 | # Instead of the default log formatter, write the log lines as JSON 762 | json_format = False 763 | 764 | # Log fields to also attach to the json output, if enabled 765 | json_fields = asctime, filename, lineno, levelname, message 766 | 767 | [elasticsearch_configs] 768 | use_ssl = False 769 | verify_certs = True 770 | 771 | [kubernetes] 772 | # The repository, tag and imagePullPolicy of the Kubernetes Image for the Worker to Run 773 | worker_container_repository = 774 | worker_container_tag = 775 | worker_container_image_pull_policy = IfNotPresent 776 | 777 | # If True (default), worker pods will be deleted upon termination 778 | delete_worker_pods = True 779 | 780 | # Number of Kubernetes Worker Pod creation calls per scheduler loop 781 | worker_pods_creation_batch_size = 1 782 | 783 | # The Kubernetes namespace where airflow workers should be created. Defaults to ``default`` 784 | namespace = default 785 | 786 | # The name of the Kubernetes ConfigMap containing the Airflow Configuration (this file) 787 | # Example: airflow_configmap = airflow-configmap 788 | airflow_configmap = 789 | 790 | # The name of the Kubernetes ConfigMap containing ``airflow_local_settings.py`` file. 791 | # 792 | # For example: 793 | # 794 | # ``airflow_local_settings_configmap = "airflow-configmap"`` if you have the following ConfigMap. 795 | # 796 | # ``airflow-configmap.yaml``: 797 | # 798 | # .. code-block:: yaml 799 | # 800 | # --- 801 | # apiVersion: v1 802 | # kind: ConfigMap 803 | # metadata: 804 | # name: airflow-configmap 805 | # data: 806 | # airflow_local_settings.py: | 807 | # def pod_mutation_hook(pod): 808 | # ... 809 | # airflow.cfg: | 810 | # ... 811 | # Example: airflow_local_settings_configmap = airflow-configmap 812 | airflow_local_settings_configmap = 813 | 814 | # For docker image already contains DAGs, this is set to ``True``, and the worker will 815 | # search for dags in dags_folder, 816 | # otherwise use git sync or dags volume claim to mount DAGs 817 | dags_in_image = False 818 | 819 | # For either git sync or volume mounted DAGs, the worker will look in this subpath for DAGs 820 | dags_volume_subpath = 821 | 822 | # For DAGs mounted via a volume claim (mutually exclusive with git-sync and host path) 823 | dags_volume_claim = 824 | 825 | # For volume mounted logs, the worker will look in this subpath for logs 826 | logs_volume_subpath = 827 | 828 | # A shared volume claim for the logs 829 | logs_volume_claim = 830 | 831 | # For DAGs mounted via a hostPath volume (mutually exclusive with volume claim and git-sync) 832 | # Useful in local environment, discouraged in production 833 | dags_volume_host = 834 | 835 | # A hostPath volume for the logs 836 | # Useful in local environment, discouraged in production 837 | logs_volume_host = 838 | 839 | # A list of configMapsRefs to envFrom. If more than one configMap is 840 | # specified, provide a comma separated list: configmap_a,configmap_b 841 | env_from_configmap_ref = 842 | 843 | # A list of secretRefs to envFrom. 
If more than one secret is 844 | # specified, provide a comma separated list: secret_a,secret_b 845 | env_from_secret_ref = 846 | 847 | # Git credentials and repository for DAGs mounted via Git (mutually exclusive with volume claim) 848 | git_repo = 849 | git_branch = 850 | git_subpath = 851 | 852 | # The specific rev or hash the git_sync init container will checkout 853 | # This becomes GIT_SYNC_REV environment variable in the git_sync init container for worker pods 854 | git_sync_rev = 855 | 856 | # Use git_user and git_password for user authentication or git_ssh_key_secret_name 857 | # and git_ssh_key_secret_key for SSH authentication 858 | git_user = 859 | git_password = 860 | git_sync_root = /git 861 | git_sync_dest = repo 862 | 863 | # Mount point of the volume if git-sync is being used. 864 | # i.e. /Users/alexaabbas/Desktop/airflow-tutorial/dags 865 | git_dags_folder_mount_point = 866 | 867 | # To get Git-sync SSH authentication set up follow this format 868 | # 869 | # ``airflow-secrets.yaml``: 870 | # 871 | # .. code-block:: yaml 872 | # 873 | # --- 874 | # apiVersion: v1 875 | # kind: Secret 876 | # metadata: 877 | # name: airflow-secrets 878 | # data: 879 | # # key needs to be gitSshKey 880 | # gitSshKey: 881 | # Example: git_ssh_key_secret_name = airflow-secrets 882 | git_ssh_key_secret_name = 883 | 884 | # To get Git-sync SSH authentication set up follow this format 885 | # 886 | # ``airflow-configmap.yaml``: 887 | # 888 | # .. code-block:: yaml 889 | # 890 | # --- 891 | # apiVersion: v1 892 | # kind: ConfigMap 893 | # metadata: 894 | # name: airflow-configmap 895 | # data: 896 | # known_hosts: | 897 | # github.com ssh-rsa <...> 898 | # airflow.cfg: | 899 | # ... 900 | # Example: git_ssh_known_hosts_configmap_name = airflow-configmap 901 | git_ssh_known_hosts_configmap_name = 902 | 903 | # To give the git_sync init container credentials via a secret, create a secret 904 | # with two fields: GIT_SYNC_USERNAME and GIT_SYNC_PASSWORD (example below) and 905 | # add ``git_sync_credentials_secret = `` to your airflow config under the 906 | # ``kubernetes`` section 907 | # 908 | # Secret Example: 909 | # 910 | # .. code-block:: yaml 911 | # 912 | # --- 913 | # apiVersion: v1 914 | # kind: Secret 915 | # metadata: 916 | # name: git-credentials 917 | # data: 918 | # GIT_SYNC_USERNAME: 919 | # GIT_SYNC_PASSWORD: 920 | git_sync_credentials_secret = 921 | 922 | # For cloning DAGs from git repositories into volumes: https://github.com/kubernetes/git-sync 923 | git_sync_container_repository = k8s.gcr.io/git-sync 924 | git_sync_container_tag = v3.1.1 925 | git_sync_init_container_name = git-sync-clone 926 | git_sync_run_as_user = 65533 927 | 928 | # The name of the Kubernetes service account to be associated with airflow workers, if any. 929 | # Service accounts are required for workers that require access to secrets or cluster resources. 930 | # See the Kubernetes RBAC documentation for more: 931 | # https://kubernetes.io/docs/admin/authorization/rbac/ 932 | worker_service_account_name = 933 | 934 | # Any image pull secrets to be given to worker pods, If more than one secret is 935 | # required, provide a comma separated list: secret_a,secret_b 936 | image_pull_secrets = 937 | 938 | # GCP Service Account Keys to be provided to tasks run on Kubernetes Executors 939 | # Should be supplied in the format: key-name-1:key-path-1,key-name-2:key-path-2 940 | gcp_service_account_keys = 941 | 942 | # Use the service account kubernetes gives to pods to connect to kubernetes cluster. 
943 | # It's intended for clients that expect to be running inside a pod running on kubernetes. 944 | # It will raise an exception if called from a process not running in a kubernetes environment. 945 | in_cluster = True 946 | 947 | # When running with in_cluster=False change the default cluster_context or config_file 948 | # options to Kubernetes client. Leave blank these to use default behaviour like ``kubectl`` has. 949 | # cluster_context = 950 | # config_file = 951 | 952 | # Affinity configuration as a single line formatted JSON object. 953 | # See the affinity model for top-level key names (e.g. ``nodeAffinity``, etc.): 954 | # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.12/#affinity-v1-core 955 | affinity = 956 | 957 | # A list of toleration objects as a single line formatted JSON array 958 | # See: 959 | # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.12/#toleration-v1-core 960 | tolerations = 961 | 962 | # Keyword parameters to pass while calling a kubernetes client core_v1_api methods 963 | # from Kubernetes Executor provided as a single line formatted JSON dictionary string. 964 | # List of supported params are similar for all core_v1_apis, hence a single config 965 | # variable for all apis. 966 | # See: 967 | # https://raw.githubusercontent.com/kubernetes-client/python/master/kubernetes/client/apis/core_v1_api.py 968 | # Note that if no _request_timeout is specified, the kubernetes client will wait indefinitely 969 | # for kubernetes api responses, which will cause the scheduler to hang. 970 | # The timeout is specified as [connect timeout, read timeout] 971 | kube_client_request_args = 972 | 973 | # Specifies the uid to run the first process of the worker pods containers as 974 | run_as_user = 975 | 976 | # Specifies a gid to associate with all containers in the worker pods 977 | # if using a git_ssh_key_secret_name use an fs_group 978 | # that allows for the key to be read, e.g. 65533 979 | fs_group = 980 | 981 | [kubernetes_node_selectors] 982 | 983 | # The Key-value pairs to be given to worker pods. 984 | # The worker pods will be scheduled to the nodes of the specified key-value pairs. 985 | # Should be supplied in the format: key = value 986 | 987 | [kubernetes_annotations] 988 | 989 | # The Key-value annotations pairs to be given to worker pods. 990 | # Should be supplied in the format: key = value 991 | 992 | [kubernetes_environment_variables] 993 | 994 | # The scheduler sets the following environment variables into your workers. You may define as 995 | # many environment variables as needed and the kubernetes launcher will set them in the launched workers. 996 | # Environment variables in this section are defined as follows 997 | # `` = `` 998 | # 999 | # For example if you wanted to set an environment variable with value `prod` and key 1000 | # ``ENVIRONMENT`` you would follow the following format: 1001 | # ENVIRONMENT = prod 1002 | # 1003 | # Additionally you may override worker airflow settings with the ``AIRFLOW__
<section>__<key>`` 1004 | # formatting as supported by airflow normally. 1005 | 1006 | [kubernetes_secrets] 1007 | 1008 | # The scheduler mounts the following secrets into your workers as they are launched by the 1009 | # scheduler. You may define as many secrets as needed and the kubernetes launcher will parse the 1010 | # defined secrets and mount them as secret environment variables in the launched workers. 1011 | # Secrets in this section are defined as follows 1012 | # ``<environment_variable_mount> = <kubernetes_secret_object>=<kubernetes_secret_key>`` 1013 | # 1014 | # For example if you wanted to mount a kubernetes secret key named ``postgres_password`` from the 1015 | # kubernetes secret object ``airflow-secret`` as the environment variable ``POSTGRES_PASSWORD`` into 1016 | # your workers you would follow the following format: 1017 | # ``POSTGRES_PASSWORD = airflow-secret=postgres_credentials`` 1018 | # 1019 | # Additionally you may override worker airflow settings with the ``AIRFLOW__
__`` 1020 | # formatting as supported by airflow normally. 1021 | 1022 | [kubernetes_labels] 1023 | 1024 | # The Key-value pairs to be given to worker pods. 1025 | # The worker pods will be given these static labels, as well as some additional dynamic labels 1026 | # to identify the task. 1027 | # Should be supplied in the format: ``key = value`` -------------------------------------------------------------------------------- /dags/bigquery_data_analytics.py: -------------------------------------------------------------------------------- 1 | """ 2 | ## Example PySpark dag 3 | This example dag walks you through the concepts of branching, subdags and trigger rules. 4 | It creates a Dataproc cluster in Google Cloud and runs a series of PySpark jobs. 5 | """ 6 | from airflow import DAG 7 | from airflow.operators.python_operator import BranchPythonOperator 8 | from airflow.operators.subdag_operator import SubDagOperator 9 | from airflow.contrib.operators.dataproc_operator import ( 10 | DataprocClusterCreateOperator, 11 | DataProcPySparkOperator, 12 | DataprocClusterDeleteOperator, 13 | ) 14 | 15 | from airflow.utils.dates import days_ago 16 | from datetime import datetime 17 | 18 | from pyspark_subdag import weekday_subdag 19 | 20 | default_arguments = {"owner": "YOUR-NAME-HERE", "start_date": days_ago(1)} 21 | 22 | 23 | def assess_day(execution_date=None): 24 | date = datetime.strptime(execution_date, "%Y-%m-%d") 25 | 26 | if date.isoweekday() < 6: 27 | return "weekday_analytics" 28 | 29 | return "weekend_analytics" 30 | 31 | 32 | with DAG( 33 | "bigquery_data_analytics", 34 | schedule_interval="0 20 * * *", 35 | catchup=False, 36 | default_args=default_arguments, 37 | ) as dag: 38 | 39 | dag.doc_md = __doc__ 40 | 41 | create_cluster = DataprocClusterCreateOperator( 42 | task_id="create_cluster", 43 | project_id="YOUR-PROJECT-NAME-HERE", 44 | cluster_name="spark-cluster-{{ ds_nodash }}", 45 | num_workers=2, 46 | storage_bucket="YOUR-BUCKET-NAME-HERE", 47 | zone="europe-west2-a", 48 | ) 49 | 50 | create_cluster.doc_md = """## Create Dataproc cluster 51 | This task creates a Dataproc cluster in your project. 
52 | """ 53 | 54 | weekday_or_weekend = BranchPythonOperator( 55 | task_id="weekday_or_weekend", 56 | python_callable=assess_day, 57 | op_kwargs={"execution_date": "{{ ds }}"}, 58 | ) 59 | 60 | weekend_analytics = DataProcPySparkOperator( 61 | task_id="weekend_analytics", 62 | main="gs://YOUR-BUCKET-NAME-HERE/pyspark/weekend/gas_composition_count.py", 63 | cluster_name="spark-cluster-{{ ds_nodash }}", 64 | dataproc_pyspark_jars="gs://spark-lib/bigquery/spark-bigquery-latest.jar", 65 | ) 66 | 67 | weekday_analytics = SubDagOperator( 68 | task_id="weekday_analytics", 69 | subdag=weekday_subdag( 70 | parent_dag="bigquery_data_analytics", 71 | task_id="weekday_analytics", 72 | schedule_interval="0 20 * * *", 73 | default_args=default_arguments, 74 | ), 75 | ) 76 | 77 | delete_cluster = DataprocClusterDeleteOperator( 78 | task_id="delete_cluster", 79 | project_id="YOUR-PROJECT-NAME-HERE", 80 | cluster_name="spark-cluster-{{ ds_nodash }}", 81 | trigger_rule="all_done", 82 | ) 83 | 84 | create_cluster >> weekday_or_weekend >> [ 85 | weekend_analytics, 86 | weekday_analytics, 87 | ] >> delete_cluster 88 | -------------------------------------------------------------------------------- /dags/bigquery_data_load.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.models import Variable 3 | from airflow.operators.python_operator import PythonOperator 4 | from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook 5 | from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator 6 | from airflow.contrib.operators.bigquery_operator import BigQueryOperator 7 | 8 | from airflow.utils.dates import days_ago 9 | 10 | PROJECT_ID = Variable.get("project") 11 | LANDING_BUCKET = Variable.get("landing_bucket") 12 | BACKUP_BUCKET = Variable.get("backup_bucket") 13 | 14 | default_arguments = {"owner": "YOUR-NAME-HERE", "start_date": days_ago(1)} 15 | 16 | 17 | def list_objects(bucket=None): 18 | hook = GoogleCloudStorageHook() 19 | storage_objects = hook.list(bucket) 20 | 21 | return storage_objects 22 | 23 | 24 | def move_objects(source_bucket=None, destination_bucket=None, prefix=None, **kwargs): 25 | 26 | storage_objects = kwargs["ti"].xcom_pull(task_ids="list_files") 27 | 28 | hook = GoogleCloudStorageHook() 29 | 30 | for storage_object in storage_objects: 31 | destination_object = storage_object 32 | 33 | if prefix: 34 | destination_object = "{}/{}".format(prefix, storage_object) 35 | 36 | hook.copy(source_bucket, storage_object, destination_bucket, destination_object) 37 | hook.delete(source_bucket, storage_object) 38 | 39 | 40 | with DAG( 41 | "bigquery_data_load", 42 | schedule_interval="@hourly", 43 | catchup=False, 44 | default_args=default_arguments, 45 | max_active_runs=1, 46 | user_defined_macros={"project": PROJECT_ID}, 47 | ) as dag: 48 | 49 | list_files = PythonOperator( 50 | task_id="list_files", 51 | python_callable=list_objects, 52 | op_kwargs={"bucket": LANDING_BUCKET}, 53 | ) 54 | 55 | load_data = GoogleCloudStorageToBigQueryOperator( 56 | task_id="load_data", 57 | bucket=LANDING_BUCKET, 58 | source_objects=["*"], 59 | source_format="CSV", 60 | skip_leading_rows=1, 61 | field_delimiter=",", 62 | destination_project_dataset_table="{{ project }}.vehicle_analytics.history", 63 | create_disposition="CREATE_IF_NEEDED", 64 | write_disposition="WRITE_APPEND", 65 | bigquery_conn_id="google_cloud_default", 66 | google_cloud_storage_conn_id="google_cloud_default", 67 | ) 68 | 69 | query = """ 70 | 
SELECT * except (rank) 71 | FROM ( 72 | SELECT 73 | *, 74 | ROW_NUMBER() OVER ( 75 | PARTITION BY vehicle_id ORDER BY DATETIME(date, TIME(hour, minute, 0)) DESC 76 | ) as rank 77 | FROM `{{ project }}.vehicle_analytics.history`) as latest 78 | WHERE rank = 1; 79 | """ 80 | 81 | create_table = BigQueryOperator( 82 | task_id="create_table", 83 | sql=query, 84 | destination_dataset_table="{{ project }}.vehicle_analytics.latest", 85 | write_disposition="WRITE_TRUNCATE", 86 | create_disposition="CREATE_IF_NEEDED", 87 | use_legacy_sql=False, 88 | location="europe-west2", 89 | bigquery_conn_id="google_cloud_default", 90 | ) 91 | 92 | move_files = PythonOperator( 93 | task_id="move_files", 94 | python_callable=move_objects, 95 | op_kwargs={ 96 | "source_bucket": LANDING_BUCKET, 97 | "destination_bucket": BACKUP_BUCKET, 98 | "prefix": "{{ ts_nodash }}", 99 | }, 100 | provide_context=True, 101 | ) 102 | 103 | list_files >> load_data >> create_table >> move_files 104 | -------------------------------------------------------------------------------- /dags/bigquery_data_validation.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.utils.dates import days_ago 3 | 4 | from airflow.operators.bigquery_plugin import ( 5 | BigQueryDataValidationOperator, 6 | BigQueryDatasetSensor, 7 | ) 8 | 9 | default_arguments = {"owner": "YOUR-NAME-HERE", "start_date": days_ago(1)} 10 | 11 | with DAG( 12 | "bigquery_data_validation", 13 | schedule_interval="@daily", 14 | catchup=False, 15 | default_args=default_arguments, 16 | user_defined_macros={"project": "YOUR-PROJECT-NAME-HERE"}, 17 | ) as dag: 18 | 19 | is_table_empty = BigQueryDataValidationOperator( 20 | task_id="is_table_empty", 21 | sql="SELECT COUNT(*) FROM `{{ project }}.vehicle_analytics.history`", 22 | location="europe-west2", 23 | ) 24 | 25 | dataset_exists = BigQueryDatasetSensor( 26 | task_id="dataset_exists", 27 | project_id="{{ project }}", 28 | dataset_id="vehicle_analytics", 29 | ) 30 | 31 | dataset_exists >> is_table_empty 32 | -------------------------------------------------------------------------------- /dags/core_concepts.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.utils.dates import days_ago 3 | 4 | from airflow.operators.bash_operator import BashOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | 7 | from airflow.utils.helpers import chain, cross_downstream 8 | 9 | from random import seed, random 10 | 11 | from datetime import timedelta 12 | 13 | default_arguments = { 14 | "owner": "YOUR-NAME-HERE", 15 | "start_date": days_ago(1), 16 | "sla": timedelta(hours=1), 17 | } 18 | 19 | 20 | with DAG( 21 | "core_concepts", 22 | schedule_interval="@daily", 23 | catchup=False, 24 | default_args=default_arguments, 25 | ) as dag: 26 | 27 | bash_task = BashOperator( 28 | task_id="bash_command", 29 | bash_command="echo $TODAY", 30 | env={"TODAY": "2020-05-21"}, 31 | sla=timedelta(hours=2), 32 | ) 33 | 34 | def print_random_number(number): 35 | seed(number) 36 | print(random()) 37 | 38 | python_task = PythonOperator( 39 | task_id="python_function", python_callable=print_random_number, op_args=[1], 40 | ) 41 | 42 | bash_task >> python_task 43 | -------------------------------------------------------------------------------- /dags/pyspark_subdag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from 
airflow.contrib.operators.dataproc_operator import DataProcPySparkOperator 3 | 4 | 5 | def weekday_subdag( 6 | parent_dag=None, task_id=None, schedule_interval=None, default_args=None 7 | ): 8 | 9 | subdag = DAG( 10 | f"{parent_dag}.{task_id}", 11 | schedule_interval=schedule_interval, 12 | default_args=default_args, 13 | ) 14 | 15 | pyspark_jobs = ["avg_speed", "avg_temperature", "avg_tire_pressure"] 16 | 17 | for job in pyspark_jobs: 18 | 19 | DataProcPySparkOperator( 20 | task_id=f"{job}", 21 | main=f"gs://YOUR-BUCKET-NAME-HERE/pyspark/weekday/{job}.py", 22 | cluster_name="spark-cluster-{{ ds_nodash }}", 23 | dataproc_pyspark_jars="gs://spark-lib/bigquery/spark-bigquery-latest.jar", 24 | dag=subdag, 25 | ) 26 | 27 | return subdag 28 | -------------------------------------------------------------------------------- /data/4649493c.csv: -------------------------------------------------------------------------------- 1 | vehicle_id,date,hour,minute,latitude,longitude,tire_pressure,speed,temperature,gas_composition 2 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,0,34.995728,104.655337,10896.939,131.4,34.19,Blue 3 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,1,-6.8561144,107.5193473,15108.392,73.4,-8.36,Indigo 4 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,2,50.2923198,27.9814782,8668.885,53.9,29.0,Purple 5 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,3,22.636828,113.814606,13176.617,51.7,-3.03,Yellow 6 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,4,41.0872247,-8.1084595,9754.625,282.9,33.55,Maroon 7 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,5,22.919769,113.618216,7290.369,274.4,18.18,Mauv 8 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,6,-8.1448805,-79.0517936,12215.39,93.4,22.0,Pink 9 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,7,41.6088994,-8.7234712,1458.653,107.0,25.93,Fuscia 10 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,8,-6.4185424,106.8502879,11149.99,273.8,31.14,Purple 11 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,9,-6.4104949,107.0126076,9128.732,9.5,-6.3,Blue 12 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,10,25.185809,111.579535,3642.755,274.5,12.1,Pink 13 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,11,30.2827181,120.1245708,12015.249,2.4,37.96,Red 14 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,12,15.2586581,100.8677256,13419.904,243.0,36.63,Blue 15 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,13,46.987383,123.769368,464.688,11.5,39.62,Crimson 16 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,14,-13.8556,-73.758263,2109.403,288.4,12.52,Mauv 17 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,15,45.0417524,-73.9260044,4857.905,74.6,12.33,Violet 18 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,16,51.1434519,40.2997102,2400.634,199.2,12.0,Orange 19 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,17,8.490877,124.345771,5255.081,67.3,3.16,Violet 20 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,18,48.8493975,2.4751086,1680.678,265.0,33.61,Violet 21 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,19,40.8245332,-8.0435522,14994.36,74.5,14.93,Violet 22 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,20,49.5519718,17.3375604,4957.965,103.5,13.71,Green 23 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,21,-7.056066,108.4780523,9239.987,66.9,4.75,Khaki 24 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,22,32.1942601,35.3736237,18576.176,120.1,5.59,Pink 25 | 
4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,23,46.817914,-0.626516,8795.055,217.8,25.72,Blue 26 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,24,-23.4214264,-57.4344451,7004.375,234.7,16.34,Aquamarine 27 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,25,53.6040518,24.741899,18664.287,121.3,0.74,Red 28 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,26,61.7983586,34.3753781,6894.003,187.8,37.48,Goldenrod 29 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,27,-26.3018088,-54.7192376,9188.153,232.1,24.68,Purple 30 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,28,-6.6591023,106.2931013,10759.753,51.9,11.41,Orange 31 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,29,35.579427,109.262961,2214.161,87.3,30.42,Orange 32 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,30,39.851469,113.463703,943.845,276.4,39.16,Mauv 33 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,31,27.358123,110.514837,2393.395,208.5,30.98,Orange 34 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,32,32.119799,34.986653,2771.459,130.5,-4.73,Fuscia 35 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,33,45.4123755,20.8041177,2289.512,186.6,1.95,Maroon 36 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,34,50.3781731,15.5447069,18739.197,108.6,20.51,Khaki 37 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,35,-23.5893189,-46.0107997,174.271,62.5,2.29,Crimson 38 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,36,-43.7567555,172.0223196,4055.949,182.5,0.73,Crimson 39 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,37,59.4411274,30.1610699,16611.812,117.6,3.71,Blue 40 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,38,23.5989926,56.5448304,1330.887,257.2,2.46,Green 41 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,39,39.1251493,23.6799766,6137.24,257.7,34.03,Crimson 42 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,40,-26.0440358,28.1574467,16995.984,0.9,10.82,Crimson 43 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,41,20.9414842,105.9569025,2487.218,16.6,13.63,Green 44 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,42,-34.6698749,-58.5616502,3969.342,173.7,26.85,Mauv 45 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,43,32.6892815,-16.7907398,5789.196,186.5,9.3,Violet 46 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,44,-19.6818529,-49.0817124,12341.257,266.3,-4.35,Aquamarine 47 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,45,47.357916,88.027707,7922.845,264.7,18.4,Aquamarine 48 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,46,49.51688,-96.50029,16218.418,225.4,4.32,Maroon 49 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,47,15.4418966,-61.2583352,17530.821,276.7,8.49,Maroon 50 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,48,49.9808189,21.726063,18768.76,6.0,16.33,Maroon 51 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,49,36.097577,114.392392,1553.011,155.9,22.06,Red 52 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,50,34.379742,117.788836,13737.251,18.4,17.22,Puce 53 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,51,29.985295,122.207215,19613.727,114.8,0.93,Orange 54 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,52,-7.9636675,112.6225104,5630.25,64.2,31.86,Pink 55 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,53,50.2014324,14.8328189,1172.312,2.2,5.07,Yellow 56 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,54,8.0155697,-71.7637309,19388.976,242.6,-6.2,Khaki 57 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,55,-31.3699,27.03523,2508.02,37.4,4.01,Mauv 58 | 
4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,56,24.1301619,55.8023118,1030.756,188.4,7.93,Orange 59 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,57,36.0720984,49.7013486,6494.651,77.2,38.58,Fuscia 60 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,58,40.6945206,-7.8725232,11217.461,22.0,2.13,Blue 61 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,59,48.4691324,37.0871224,16649.352,173.5,21.07,Green 62 | -------------------------------------------------------------------------------- /data/c876bd01.csv: -------------------------------------------------------------------------------- 1 | vehicle_id,date,hour,minute,latitude,longitude,tire_pressure,speed,temperature,gas_composition 2 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,0,-8.1448805,-79.0517936,12215.39,93.4,22.0,Pink 3 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,1,34.199479,119.578364,17703.228,78.7,-0.06,Indigo 4 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,2,49.954706,15.0305859,6308.355,58.0,-4.19,Fuscia 5 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,3,34.995728,104.655337,10896.939,131.4,34.19,Blue 6 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,4,37.8925401,140.5266555,2504.253,227.5,14.05,Green 7 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,5,38.4687834,48.8728029,2300.072,207.4,5.16,Green 8 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,6,30.780217,120.644805,9852.028,0.1,12.21,Khaki 9 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,7,40.280218,-8.4788248,12868.242,213.9,-9.6,Puce 10 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,8,37.1878209,50.1575212,1277.421,96.4,29.91,Maroon 11 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,9,-6.4306844,106.7175669,19316.439,111.0,17.87,Pink 12 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,10,-5.8335001,34.9644426,9019.275,173.5,24.98,Aquamarine 13 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,11,38.013999,24.4198995,2293.132,204.8,25.63,Maroon 14 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,12,52.2163528,61.2809373,3104.701,67.3,38.86,Orange 15 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,13,38.734814,93.330613,4822.982,0.7,6.53,Pink 16 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,14,59.3462826,18.0843085,2680.192,119.3,-8.12,Turquoise 17 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,15,35.72154,111.350842,4813.115,118.4,-3.69,Indigo 18 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,16,37.79446,20.85188,8463.648,297.9,29.99,Maroon 19 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,17,31.191643,121.389262,4611.078,218.2,30.57,Puce 20 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,18,9.7457208,123.8401962,3290.653,63.9,10.85,Khaki 21 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,19,53.9762845,43.8688442,10250.0,34.2,28.02,Purple 22 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,20,24.513425,117.723153,18560.283,172.4,21.44,Pink 23 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,21,35.7017899,59.8468432,15401.556,180.3,33.62,Pink 24 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,22,44.840524,82.353656,6517.541,14.0,17.78,Goldenrod 25 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,23,18.9237513,-70.4144776,1764.466,205.4,23.52,Turquoise 26 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,24,15.4053048,-91.7142051,18070.46,185.8,5.36,Green 27 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,25,6.129226,102.236216,12329.13,264.8,28.18,Puce 28 | 
c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,26,-8.5294459,119.0109502,13301.391,47.4,3.82,Green 29 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,27,59.8863041,29.9085976,10543.427,286.2,5.3,Maroon 30 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,28,54.0297214,28.0892299,14565.844,269.1,31.19,Teal 31 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,29,55.6967262,39.2331589,15211.381,274.2,39.29,Purple 32 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,30,7.6447222,149.4208333,4035.918,294.7,-8.25,Purple 33 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,31,42.9895326,131.8411237,3914.634,167.6,24.91,Fuscia 34 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,32,55.6942718,74.3214928,13474.821,221.1,-9.11,Khaki 35 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,33,38.430793,100.812859,750.106,106.5,17.87,Indigo 36 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,34,41.0490009,39.513623,11325.001,157.9,0.91,Crimson 37 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,35,62.6571846,26.0472266,3598.447,159.3,27.45,Crimson 38 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,36,15.1539332,-87.8721602,11346.493,190.0,8.49,Mauv 39 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,37,57.7311038,12.0586612,18059.776,20.2,-2.16,Teal 40 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,38,60.3448681,17.4966539,14181.513,281.9,19.21,Indigo 41 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,39,29.306756,120.07514,9057.801,159.2,7.51,Maroon 42 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,40,40.417358,117.500558,11505.682,178.2,10.48,Green 43 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,41,10.3696393,-66.9571026,12820.516,35.4,4.39,Goldenrod 44 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,42,31.03094,103.183075,1314.934,22.5,32.19,Red 45 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,43,34.014215,105.298756,12460.738,231.2,26.25,Aquamarine 46 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,44,48.8610504,2.3237084,432.326,249.5,18.08,Turquoise 47 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,45,47.2952721,39.8734276,4035.261,227.4,32.39,Red 48 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,46,32.147679,114.091192,14946.176,194.9,10.54,Fuscia 49 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,47,58.2774681,11.4424559,10162.833,108.5,2.17,Turquoise 50 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,48,-6.8058522,111.9611919,16282.329,159.7,19.21,Turquoise 51 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,49,31.9963592,-5.1174039,1263.775,74.0,3.31,Pink 52 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,50,9.9497452,126.0068121,15802.549,125.7,37.82,Indigo 53 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,51,-26.4070347,-61.4128561,2934.254,221.2,17.79,Blue 54 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,52,14.0242809,-60.9758292,4843.068,124.3,25.85,Blue 55 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,53,34.6718873,133.8964708,15607.254,236.9,-5.56,Puce 56 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,54,53.3547991,-113.7233907,4433.983,205.5,30.7,Red 57 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,55,-7.2893545,-34.8403408,8702.503,189.4,20.44,Pink 58 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,56,23.284628,116.268675,15210.3,273.5,3.85,Green 59 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,57,22.654032,110.18122,19745.464,21.7,3.32,Crimson 60 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,58,22.270978,113.576677,14309.905,36.4,25.93,Red 61 | 
c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,59,7.193611,100.592145,8750.315,87.0,-7.97,Indigo 62 | -------------------------------------------------------------------------------- /plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandraabbas/apache-airflow-course/4280ef25705f38f74c2ac4aa2ac7afcbc8aba024/plugins/__init__.py -------------------------------------------------------------------------------- /plugins/bigquery_plugin.py: -------------------------------------------------------------------------------- 1 | from airflow.plugins_manager import AirflowPlugin 2 | 3 | from airflow.models import BaseOperator 4 | from airflow.contrib.hooks.bigquery_hook import BigQueryHook 5 | from airflow.sensors.base_sensor_operator import BaseSensorOperator 6 | 7 | from airflow.exceptions import AirflowException 8 | from airflow.utils.decorators import apply_defaults 9 | 10 | from googleapiclient.errors import HttpError 11 | from google.cloud import bigquery 12 | 13 | 14 | class BigQueryDataValidationOperator(BaseOperator): 15 | template_fields = ["sql"] 16 | ui_color = "#fcf197" 17 | 18 | @apply_defaults 19 | def __init__( 20 | self, 21 | sql, 22 | gcp_conn_id="google_cloud_default", 23 | use_legacy_sql=False, 24 | location=None, 25 | *args, 26 | **kwargs, 27 | ): 28 | 29 | super().__init__(*args, **kwargs) 30 | self.sql = sql 31 | self.gcp_conn_id = gcp_conn_id 32 | self.use_legacy_sql = use_legacy_sql 33 | self.location = location 34 | 35 | def run_query(self, project, credentials): 36 | client = bigquery.Client(project=project, credentials=credentials) 37 | 38 | query_job = client.query(self.sql) 39 | results = query_job.result() 40 | 41 | return [list(row.values()) for row in results][0] 42 | 43 | def execute(self, context): 44 | hook = BigQueryHook( 45 | bigquery_conn_id=self.gcp_conn_id, 46 | use_legacy_sql=self.use_legacy_sql, 47 | location=self.location, 48 | ) 49 | 50 | records = self.run_query( 51 | project=hook._get_field("project"), credentials=hook._get_credentials() 52 | ) 53 | 54 | if not records: 55 | raise AirflowException("Query returned no results.") 56 | elif not all([bool(record) for record in records]): 57 | raise AirflowException( 58 | f"Test failed\nQuery: {self.sql}\nRecords: {records}" 59 | ) 60 | 61 | self.log.info(f"Test passed\nQuery: {self.sql}\nRecords: {records}") 62 | 63 | 64 | class BigQueryDatasetSensor(BaseSensorOperator): 65 | template_fields = ["project_id", "dataset_id"] 66 | ui_color = "#feeef1" 67 | 68 | def __init__( 69 | self, 70 | project_id, 71 | dataset_id, 72 | gcp_conn_id="google_cloud_default", 73 | *args, 74 | **kwargs, 75 | ): 76 | super().__init__(*args, **kwargs) 77 | self.project_id = project_id 78 | self.dataset_id = dataset_id 79 | self.gcp_conn_id = gcp_conn_id 80 | 81 | def poke(self, context): 82 | hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id) 83 | service = hook.get_service() 84 | 85 | try: 86 | service.datasets().get( 87 | datasetId=self.dataset_id, projectId=self.project_id 88 | ).execute() 89 | 90 | return True 91 | except HttpError as e: 92 | if e.resp["status"] == "404": 93 | return False 94 | 95 | raise AirflowException(f"Error: {e}") 96 | 97 | 98 | class BigQueryPlugin(AirflowPlugin): 99 | name = "bigquery_plugin" 100 | operators = [BigQueryDataValidationOperator] 101 | sensors = [BigQueryDatasetSensor] 102 | -------------------------------------------------------------------------------- /pyspark/weekday/avg_speed.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession \ 5 | .builder \ 6 | .master('yarn') \ 7 | .appName('bigquery-analytics-avg-speed') \ 8 | .getOrCreate() 9 | 10 | bucket = 'YOUR-BUCKET-NAME-HERE' 11 | spark.conf.set('temporaryGcsBucket', bucket) 12 | 13 | history = spark.read.format('bigquery') \ 14 | .option('table', 'vehicle_analytics.history') \ 15 | .load() 16 | history.createOrReplaceTempView('history') 17 | 18 | avg_speed = spark.sql( 19 | 'SELECT vehicle_id, date, AVG(speed) AS avg_speed FROM history GROUP BY vehicle_id, date' 20 | ) 21 | avg_speed.show() 22 | avg_speed.printSchema() 23 | 24 | avg_speed.write.format('bigquery') \ 25 | .option('table', 'vehicle_analytics.avg_speed') \ 26 | .mode('append') \ 27 | .save() 28 | -------------------------------------------------------------------------------- /pyspark/weekday/avg_temperature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession \ 5 | .builder \ 6 | .master('yarn') \ 7 | .appName('bigquery-analytics-avg-temperature') \ 8 | .getOrCreate() 9 | 10 | bucket = 'YOUR-BUCKET-NAME-HERE' 11 | spark.conf.set('temporaryGcsBucket', bucket) 12 | 13 | history = spark.read.format('bigquery') \ 14 | .option('table', 'vehicle_analytics.history') \ 15 | .load() 16 | history.createOrReplaceTempView('history') 17 | 18 | avg_temperature = spark.sql( 19 | 'SELECT vehicle_id, date, AVG(temperature) AS avg_temperature FROM history GROUP BY vehicle_id, date' 20 | ) 21 | avg_temperature.show() 22 | avg_temperature.printSchema() 23 | 24 | avg_temperature.write.format('bigquery') \ 25 | .option('table', 'vehicle_analytics.avg_temperature') \ 26 | .mode('append') \ 27 | .save() 28 | -------------------------------------------------------------------------------- /pyspark/weekday/avg_tire_pressure.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession \ 5 | .builder \ 6 | .master('yarn') \ 7 | .appName('bigquery-analytics-avg-tire-pressure') \ 8 | .getOrCreate() 9 | 10 | bucket = 'YOUR-BUCKET-NAME-HERE' 11 | spark.conf.set('temporaryGcsBucket', bucket) 12 | 13 | history = spark.read.format('bigquery') \ 14 | .option('table', 'vehicle_analytics.history') \ 15 | .load() 16 | history.createOrReplaceTempView('history') 17 | 18 | avg_tire_pressure = spark.sql( 19 | 'SELECT vehicle_id, date, AVG(tire_pressure) AS avg_tire_pressure FROM history GROUP BY vehicle_id, date' 20 | ) 21 | avg_tire_pressure.show() 22 | avg_tire_pressure.printSchema() 23 | 24 | avg_tire_pressure.write.format('bigquery') \ 25 | .option('table', 'vehicle_analytics.avg_tire_pressure') \ 26 | .mode('append') \ 27 | .save() 28 | -------------------------------------------------------------------------------- /pyspark/weekend/gas_composition_count.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession \ 5 | .builder \ 6 | .master('yarn') \ 7 | .appName('bigquery-analytics-gas-composition-count') \ 8 | .getOrCreate() 9 | 10 | bucket = 'YOUR-BUCKET-NAME-HERE' 11 | spark.conf.set('temporaryGcsBucket', bucket) 12 | 13 | history = spark.read.format('bigquery') \ 14 | .option('table', 'vehicle_analytics.history') 
\ 15 | .load() 16 | history.createOrReplaceTempView('history') 17 | 18 | gas_composition_count = spark.sql( 19 | 'SELECT vehicle_id, date, COUNT(DISTINCT gas_composition) AS gas_composition_count FROM history GROUP BY vehicle_id, date' 20 | ) 21 | gas_composition_count.show() 22 | gas_composition_count.printSchema() 23 | 24 | gas_composition_count.write.format('bigquery') \ 25 | .option('table', 'vehicle_analytics.gas_composition_count') \ 26 | .mode('append') \ 27 | .save() 28 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Section 7: Testing Airflow DAGs 2 | 3 | This directory holds the source code of Section 7: Testing Airflow DAGs. 4 | 5 | ## How to run unit tests 6 | 7 | ```Bash 8 | python3 -m unittest -v {TEST-MODULE} 9 | ``` 10 | 11 | For example to test the `core_concepts` DAG and its operators run the following. 12 | 13 | ```Bash 14 | python3 -m unittest -v test_core_concepts 15 | ``` 16 | -------------------------------------------------------------------------------- /tests/test_bigquery_data_validation.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch 3 | 4 | from datetime import datetime 5 | 6 | from airflow import DAG 7 | from airflow.utils.state import State 8 | from airflow.models import TaskInstance 9 | from airflow.operators.bigquery_plugin import BigQueryDataValidationOperator 10 | 11 | from airflow.exceptions import AirflowException 12 | 13 | 14 | def mock_run_query(): 15 | def return_empty_list(*args, **kwargs): 16 | return [] 17 | 18 | return return_empty_list 19 | 20 | 21 | class TestBigQueryDataValidationOperator(unittest.TestCase): 22 | def setUp(self): 23 | EXEC_DATE = "2020-06-25" 24 | 25 | self.dag = DAG( 26 | "test_bigquery_data_validation", 27 | schedule_interval="@daily", 28 | default_args={"start_date": EXEC_DATE}, 29 | ) 30 | 31 | self.op = BigQueryDataValidationOperator( 32 | task_id="bigquery_op", 33 | sql="SELECT COUNT(*) FROM `example.example.example`", 34 | location="europe-west2", 35 | dag=self.dag, 36 | ) 37 | 38 | self.ti = TaskInstance( 39 | task=self.op, execution_date=datetime.strptime(EXEC_DATE, "%Y-%m-%d") 40 | ) 41 | 42 | @patch.object( 43 | BigQueryDataValidationOperator, "run_query", new_callable=mock_run_query 44 | ) 45 | def test_with_empty_result(self, mock): 46 | with self.assertRaises(AirflowException) as context: 47 | self.ti.run() 48 | self.assertEqual(self.ti.state, State.FAILED) 49 | self.assertEqual(str(context.exception), "Query returned no results.") 50 | -------------------------------------------------------------------------------- /tests/test_core_concepts.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from airflow.models import DagBag 3 | 4 | 5 | class TestCoreConceptsDAG(unittest.TestCase): 6 | def setUp(self): 7 | self.dagbag = DagBag() 8 | self.dag = self.dagbag.get_dag(dag_id="core_concepts") 9 | 10 | def test_dag_loaded(self): 11 | self.assertDictEqual(self.dagbag.import_errors, {}) 12 | self.assertIsNotNone(self.dag) 13 | 14 | def test_contain_tasks(self): 15 | self.assertListEqual(self.dag.task_ids, ["bash_command", "python_function"]) 16 | 17 | def test_dependencies_of_bash_command(self): 18 | bash_task = self.dag.get_task("bash_command") 19 | 20 | self.assertEqual(bash_task.upstream_task_ids, set()) 21 | 
self.assertEqual(bash_task.downstream_task_ids, set(["python_function"])) 22 | 23 | def assertDagDictEqual(self, structure, dag): 24 | self.assertEqual(dag.task_dict.keys(), structure.keys()) 25 | 26 | for task_id, downstream_list in structure.items(): 27 | self.assertTrue(dag.has_task(task_id)) 28 | 29 | task = dag.get_task(task_id) 30 | 31 | self.assertEqual(task.downstream_task_ids, set(downstream_list)) 32 | 33 | def test_dag_structure(self): 34 | self.assertDagDictEqual( 35 | {"bash_command": ["python_function"], "python_function": []}, self.dag 36 | ) 37 | 38 | -------------------------------------------------------------------------------- /variables/dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "project": "YOUR-PROJECT-NAME-HERE", 3 | "landing_bucket": "YOUR-LANDING-BUCKET-NAME-HERE", 4 | "backup_bucket": "YOUR-BACKUP-BUCKET-NAME-HERE" 5 | } --------------------------------------------------------------------------------
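The `bigquery_data_load` DAG reads the `project`, `landing_bucket` and `backup_bucket` Airflow Variables that `variables/dev.json` above defines. Assuming an Airflow 1.10.x installation and that the command is run from the repository root, the file can be imported through the CLI, for example:

```Bash
airflow variables -i variables/dev.json
```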