├── .gitignore ├── .prometheus ├── README.md ├── mapping.yml └── prometheus.yml ├── LICENSE ├── README.md ├── airflow.cfg ├── dags ├── bigquery_data_analytics.py ├── bigquery_data_load.py ├── bigquery_data_validation.py ├── core_concepts.py └── pyspark_subdag.py ├── data ├── 4649493c.csv └── c876bd01.csv ├── plugins ├── __init__.py └── bigquery_plugin.py ├── pyspark ├── weekday │ ├── avg_speed.py │ ├── avg_temperature.py │ └── avg_tire_pressure.py └── weekend │ └── gas_composition_count.py ├── tests ├── README.md ├── test_bigquery_data_validation.py └── test_core_concepts.py └── variables └── dev.json /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.prometheus/README.md: -------------------------------------------------------------------------------- 1 | # Monitor Airflow with StatsD, Prometheus and Grafana 2 | 3 | This is the source code for the lecture Monitor Airflow with StatsD, Prometheus and Grafana in Section 8: Airflow in Production. 
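Before starting the exporter stack it is worth checking that Airflow itself is emitting StatsD metrics. The `airflow.cfg` at the root of this repository already enables the integration; the relevant settings live in its `[scheduler]` section and look like this:

```ini
# StatsD settings from this repository's airflow.cfg ([scheduler] section)
statsd_on = True
statsd_host = localhost
statsd_port = 8125
statsd_prefix = airflow
```

With these values Airflow sends its metrics over UDP to localhost:8125, which is the port the statsd-exporter container below listens on.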
4 | 5 | Make sure you're in the `.prometheus/` directory while running these commands. We're using Docker to run these services, so make sure you have Docker installed. 6 | 7 | ## Run statsd-exporter 8 | 9 | This command configures statsd-exporter to listen for metrics sent on port 8125. It converts StatsD metrics to Prometheus format using the `mapping.yml` configuration file. It exposes the converted metrics for Prometheus to scrape on host port 9123 (container port 9102). 10 | 11 | ```Bash 12 | docker run --name=prom-statsd-exporter \ 13 | -p 9123:9102 \ 14 | -p 8125:8125/udp \ 15 | -v $PWD/mapping.yml:/tmp/mapping.yml \ 16 | prom/statsd-exporter \ 17 | --statsd.mapping-config=/tmp/mapping.yml \ 18 | --statsd.listen-udp=:8125 \ 19 | --web.listen-address=:9102 20 | ``` 21 | 22 | ## Run Prometheus 23 | 24 | This command runs Prometheus and configures it to scrape the metrics exposed on port 9123. It serves Prometheus on port 9090 so that Grafana can later use it as a data source. 25 | 26 | ```Bash 27 | docker run --name=prometheus \ 28 | -p 9090:9090 \ 29 | -v $PWD/prometheus.yml:/prometheus.yml \ 30 | prom/prometheus \ 31 | --config.file=/prometheus.yml \ 32 | --log.level=debug \ 33 | --web.listen-address=:9090 \ 34 | --web.page-title='Prometheus - Airflow Demo' 35 | ``` 36 | 37 | Now you can access the Prometheus web UI on http://localhost:9090. 38 | 39 | ## Run Grafana 40 | 41 | ```Bash 42 | docker run -d --name=grafana -p 3000:3000 grafana/grafana 43 | ``` 44 | 45 | Now you can access the Grafana web UI on http://localhost:3000. 46 | 47 | Follow the lecture to configure Prometheus as a data source in the Grafana UI. 48 | -------------------------------------------------------------------------------- /.prometheus/mapping.yml: -------------------------------------------------------------------------------- 1 | mappings: 2 | - match: 'airflow.*' 3 | name: 'airflow' 4 | labels: 5 | metric: '$1' 6 | -------------------------------------------------------------------------------- /.prometheus/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | evaluation_interval: 15s 4 | 5 | scrape_configs: 6 | - job_name: 'prometheus' 7 | static_configs: 8 | - targets: ['localhost:9090'] 9 | 10 | - job_name: 'airflow' 11 | static_configs: 12 | - targets: ['host.docker.internal:9123'] 13 | labels: {'host': 'airflow-statsd'} 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Alexandra Abbas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apache Airflow: Complete Hands-On Beginner to Advanced Class 2 | 3 | This repository holds the source code for the Udemy online course [Apache Airflow: Complete Hands-On Beginner to Advanced Class](https://www.udemy.com/course/apache-airflow-course/?referralCode=7A7192D2BDE0A30803F8) by Alexandra Abbas. 4 | 5 | ## Install Apache Airflow 6 | 7 | As explained in the course, before making use of this code base you need to install Apache Airflow locally on your machine. 8 | 9 | ```Bash 10 | pip install apache-airflow[gcp,statsd,sentry]==1.10.10 11 | ``` 12 | 13 | Install these extra packages as well. 14 | 15 | ```Bash 16 | pip install cryptography==2.9.2 17 | pip install pyspark==2.4.5 18 | ``` 19 | 20 | To validate your Airflow installation, check your Airflow version. This should print 1.10.10. 21 | 22 | ```Bash 23 | airflow version 24 | ``` 25 | 26 | If you have installed Airflow earlier, you might get a DeprecationWarning about having multiple airflow.cfg files, but that’s okay as long as you set the correct AIRFLOW_HOME environment variable in your Terminal. 27 | 28 | ## Initialise an Airflow environment 29 | 30 | As a next step you need to initialise an Airflow environment locally to run DAGs. 31 | 32 | Set the AIRFLOW_HOME variable. 33 | 34 | ```Bash 35 | export AIRFLOW_HOME=path/to/this/directory 36 | ``` 37 | 38 | Initialise Airflow and the metadata database. 39 | 40 | ```Bash 41 | airflow initdb 42 | ``` 43 | 44 | Now, you can run both the web server and the scheduler. 45 | 46 | Run the web server. 47 | 48 | ```Bash 49 | airflow webserver 50 | ``` 51 | 52 | In a different terminal window/session, where you set the AIRFLOW_HOME variable again, run the scheduler. 53 | 54 | ```Bash 55 | airflow scheduler 56 | ``` 57 | 58 | Great! 🎉 Now you can access the Airflow web UI on http://localhost:8080. 59 | -------------------------------------------------------------------------------- /airflow.cfg: -------------------------------------------------------------------------------- 1 | [core] 2 | # The folder where your airflow pipelines live, most likely a 3 | # subfolder in a code repository. This path must be absolute. 4 | dags_folder = /Users/alexaabbas/Desktop/airflow-tutorial/dags 5 | 6 | # The folder where airflow should store its log files 7 | # This path must be absolute 8 | base_log_folder = /Users/alexaabbas/Desktop/airflow-tutorial/logs 9 | 10 | # Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search. 11 | # Set this to True if you want to enable remote logging. 12 | remote_logging = True 13 | 14 | # Users must supply an Airflow connection id that provides access to the storage 15 | # location.
16 | remote_log_conn_id = google_cloud_default 17 | remote_base_log_folder = gs://aa-logistics-landing-bucket/airflow 18 | encrypt_s3_logs = False 19 | 20 | # Logging level 21 | logging_level = INFO 22 | 23 | # Logging level for Flask-appbuilder UI 24 | fab_logging_level = WARN 25 | 26 | # Logging class 27 | # Specify the class that will specify the logging configuration 28 | # This class has to be on the python classpath 29 | # Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG 30 | logging_config_class = 31 | 32 | # Flag to enable/disable Colored logs in Console 33 | # Colour the logs when the controlling terminal is a TTY. 34 | colored_console_log = True 35 | 36 | # Log format for when Colored logs is enabled 37 | colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s 38 | colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter 39 | 40 | # Format of Log line 41 | log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s 42 | simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s 43 | 44 | # Log filename format 45 | log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log 46 | log_processor_filename_template = {{ filename }}.log 47 | dag_processor_manager_log_location = /Users/alexaabbas/Desktop/airflow-tutorial/logs/dag_processor_manager/dag_processor_manager.log 48 | 49 | # Name of handler to read task instance logs. 50 | # Default to use task handler. 51 | task_log_reader = task 52 | 53 | # Hostname by providing a path to a callable, which will resolve the hostname. 54 | # The format is "package:function". 55 | # 56 | # For example, default value "socket:getfqdn" means that result from getfqdn() of "socket" 57 | # package will be used as hostname. 58 | # 59 | # No argument should be required in the function specified. 60 | # If using IP address as hostname is preferred, use value ``airflow.utils.net:get_host_ip_address`` 61 | hostname_callable = socket:getfqdn 62 | 63 | # Default timezone in case supplied date times are naive 64 | # can be utc (default), system, or any IANA timezone string (e.g. Europe/Amsterdam) 65 | default_timezone = utc 66 | 67 | # The executor class that airflow should use. Choices include 68 | # SequentialExecutor, LocalExecutor, CeleryExecutor, DaskExecutor, KubernetesExecutor 69 | executor = SequentialExecutor 70 | 71 | # The SqlAlchemy connection string to the metadata database. 72 | # SqlAlchemy supports many different database engine, more information 73 | # their website 74 | sql_alchemy_conn = sqlite:////Users/alexaabbas/Desktop/airflow-tutorial/airflow.db 75 | 76 | # The encoding for the databases 77 | sql_engine_encoding = utf-8 78 | 79 | # If SqlAlchemy should pool database connections. 80 | sql_alchemy_pool_enabled = True 81 | 82 | # The SqlAlchemy pool size is the maximum number of database connections 83 | # in the pool. 0 indicates no limit. 84 | sql_alchemy_pool_size = 5 85 | 86 | # The maximum overflow size of the pool. 87 | # When the number of checked-out connections reaches the size set in pool_size, 88 | # additional connections will be returned up to this limit. 89 | # When those additional connections are returned to the pool, they are disconnected and discarded. 
90 | # It follows then that the total number of simultaneous connections the pool will allow 91 | # is pool_size + max_overflow, 92 | # and the total number of "sleeping" connections the pool will allow is pool_size. 93 | # max_overflow can be set to -1 to indicate no overflow limit; 94 | # no limit will be placed on the total number of concurrent connections. Defaults to 10. 95 | sql_alchemy_max_overflow = 10 96 | 97 | # The SqlAlchemy pool recycle is the number of seconds a connection 98 | # can be idle in the pool before it is invalidated. This config does 99 | # not apply to sqlite. If the number of DB connections is ever exceeded, 100 | # a lower config value will allow the system to recover faster. 101 | sql_alchemy_pool_recycle = 1800 102 | 103 | # Check connection at the start of each connection pool checkout. 104 | # Typically, this is a simple statement like "SELECT 1". 105 | # More information here: 106 | # https://docs.sqlalchemy.org/en/13/core/pooling.html#disconnect-handling-pessimistic 107 | sql_alchemy_pool_pre_ping = True 108 | 109 | # The schema to use for the metadata database. 110 | # SqlAlchemy supports databases with the concept of multiple schemas. 111 | sql_alchemy_schema = 112 | 113 | # The amount of parallelism as a setting to the executor. This defines 114 | # the max number of task instances that should run simultaneously 115 | # on this airflow installation 116 | parallelism = 32 117 | 118 | # The number of task instances allowed to run concurrently by the scheduler 119 | dag_concurrency = 16 120 | 121 | # Are DAGs paused by default at creation 122 | dags_are_paused_at_creation = True 123 | 124 | # The maximum number of active DAG runs per DAG 125 | max_active_runs_per_dag = 16 126 | 127 | # Whether to load the DAG examples that ship with Airflow. It's good to 128 | # get started, but you probably want to set this to False in a production 129 | # environment 130 | load_examples = False 131 | 132 | # Whether to load the default connections that ship with Airflow. It's good to 133 | # get started, but you probably want to set this to False in a production 134 | # environment 135 | load_default_connections = True 136 | 137 | # Where your Airflow plugins are stored 138 | plugins_folder = /Users/alexaabbas/Desktop/airflow-tutorial/plugins 139 | 140 | # Secret key to save connection passwords in the db 141 | fernet_key = aMe6oy61nW2447cp76p6VLi3PN8almZL6J6nXC7cxSc= 142 | 143 | # Whether to disable pickling dags 144 | donot_pickle = False 145 | 146 | # How long before timing out a python file import 147 | dagbag_import_timeout = 30 148 | 149 | # How long before timing out a DagFileProcessor, which processes a dag file 150 | dag_file_processor_timeout = 50 151 | 152 | # The class to use for running task instances in a subprocess 153 | task_runner = StandardTaskRunner 154 | 155 | # If set, tasks without a ``run_as_user`` argument will be run with this user 156 | # Can be used to de-elevate a sudo user running Airflow when executing tasks 157 | default_impersonation = 158 | 159 | # What security module to use (for example kerberos) 160 | security = 161 | 162 | # If set to False enables some unsecure features like Charts and Ad Hoc Queries. 163 | # In 2.0 will default to True. 164 | secure_mode = False 165 | 166 | # Turn unit test mode on (overwrites many configuration options with test 167 | # values at runtime) 168 | unit_test_mode = False 169 | 170 | # Whether to enable pickling for xcom (note that this is insecure and allows for 171 | # RCE exploits). 
This will be deprecated in Airflow 2.0 (be forced to False). 172 | enable_xcom_pickling = True 173 | 174 | # When a task is killed forcefully, this is the amount of time in seconds that 175 | # it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED 176 | killed_task_cleanup_time = 60 177 | 178 | # Whether to override params with dag_run.conf. If you pass some key-value pairs 179 | # through ``airflow dags backfill -c`` or 180 | # ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params. 181 | dag_run_conf_overrides_params = False 182 | 183 | # Worker initialisation check to validate Metadata Database connection 184 | worker_precheck = False 185 | 186 | # When discovering DAGs, ignore any files that don't contain the strings ``DAG`` and ``airflow``. 187 | dag_discovery_safe_mode = True 188 | 189 | # The number of retries each task is going to have by default. Can be overridden at dag or task level. 190 | default_task_retries = 0 191 | 192 | # Whether to serialise DAGs and persist them in DB. 193 | # If set to True, Webserver reads from DB instead of parsing DAG files 194 | # More details: https://airflow.apache.org/docs/stable/dag-serialization.html 195 | store_serialized_dags = False 196 | 197 | # Updating serialized DAG can not be faster than a minimum interval to reduce database write rate. 198 | min_serialized_dag_update_interval = 30 199 | 200 | # Whether to persist DAG files code in DB. 201 | # If set to True, Webserver reads file contents from DB instead of 202 | # trying to access files in a DAG folder. Defaults to same as the 203 | # ``store_serialized_dags`` setting. 204 | store_dag_code = %(store_serialized_dags)s 205 | 206 | # Maximum number of Rendered Task Instance Fields (Template Fields) per task to store 207 | # in the Database. 208 | # When Dag Serialization is enabled (``store_serialized_dags=True``), all the template_fields 209 | # for each of Task Instance are stored in the Database. 210 | # Keeping this number small may cause an error when you try to view ``Rendered`` tab in 211 | # TaskInstance view for older tasks. 212 | max_num_rendered_ti_fields_per_task = 30 213 | 214 | # On each dagrun check against defined SLAs 215 | check_slas = True 216 | 217 | [secrets] 218 | # Full class name of secrets backend to enable (will precede env vars and metastore in search path) 219 | # Example: backend = airflow.contrib.secrets.aws_systems_manager.SystemsManagerParameterStoreBackend 220 | backend = 221 | 222 | # The backend_kwargs param is loaded into a dictionary and passed to __init__ of secrets backend class. 223 | # See documentation for the secrets backend you are using. JSON is expected. 224 | # Example for AWS Systems Manager ParameterStore: 225 | # ``{"connections_prefix": "/airflow/connections", "profile_name": "default"}`` 226 | backend_kwargs = 227 | 228 | [cli] 229 | # In what way should the cli access the API. The LocalClient will use the 230 | # database directly, while the json_client will use the api running on the 231 | # webserver 232 | api_client = airflow.api.client.local_client 233 | 234 | # If you set web_server_url_prefix, do NOT forget to append it here, ex: 235 | # ``endpoint_url = http://localhost:8080/myroot`` 236 | # So api will look like: ``http://localhost:8080/myroot/api/experimental/...`` 237 | endpoint_url = http://localhost:8080 238 | 239 | [debug] 240 | # Used only with DebugExecutor. If set to True DAG will fail with first 241 | # failed task. Helpful for debugging purposes. 
242 | fail_fast = False 243 | 244 | [api] 245 | # How to authenticate users of the API 246 | auth_backend = airflow.api.auth.backend.default 247 | 248 | [lineage] 249 | # what lineage backend to use 250 | backend = 251 | 252 | [atlas] 253 | sasl_enabled = False 254 | host = 255 | port = 21000 256 | username = 257 | password = 258 | 259 | [operators] 260 | # The default owner assigned to each new operator, unless 261 | # provided explicitly or passed via ``default_args`` 262 | default_owner = airflow 263 | default_cpus = 1 264 | default_ram = 512 265 | default_disk = 512 266 | default_gpus = 0 267 | 268 | [hive] 269 | # Default mapreduce queue for HiveOperator tasks 270 | default_hive_mapred_queue = 271 | 272 | [webserver] 273 | # The base url of your website as airflow cannot guess what domain or 274 | # cname you are using. This is used in automated emails that 275 | # airflow sends to point links to the right web server 276 | base_url = http://localhost:8080 277 | 278 | # Default timezone to display all dates in the RBAC UI, can be UTC, system, or 279 | # any IANA timezone string (e.g. Europe/Amsterdam). If left empty the 280 | # default value of core/default_timezone will be used 281 | # Example: default_ui_timezone = America/New_York 282 | default_ui_timezone = UTC 283 | 284 | # The ip specified when starting the web server 285 | web_server_host = 0.0.0.0 286 | 287 | # The port on which to run the web server 288 | web_server_port = 8080 289 | 290 | # Paths to the SSL certificate and key for the web server. When both are 291 | # provided SSL will be enabled. This does not change the web server port. 292 | web_server_ssl_cert = 293 | 294 | # Paths to the SSL certificate and key for the web server. When both are 295 | # provided SSL will be enabled. This does not change the web server port. 296 | web_server_ssl_key = 297 | 298 | # Number of seconds the webserver waits before killing gunicorn master that doesn't respond 299 | web_server_master_timeout = 120 300 | 301 | # Number of seconds the gunicorn webserver waits before timing out on a worker 302 | web_server_worker_timeout = 120 303 | 304 | # Number of workers to refresh at a time. When set to 0, worker refresh is 305 | # disabled. When nonzero, airflow periodically refreshes webserver workers by 306 | # bringing up new ones and killing old ones. 307 | worker_refresh_batch_size = 1 308 | 309 | # Number of seconds to wait before refreshing a batch of workers. 310 | worker_refresh_interval = 30 311 | 312 | # Secret key used to run your flask app 313 | # It should be as random as possible 314 | secret_key = temporary_key 315 | 316 | # Number of workers to run the Gunicorn web server 317 | workers = 4 318 | 319 | # The worker class gunicorn should use. Choices include 320 | # sync (default), eventlet, gevent 321 | worker_class = sync 322 | 323 | # Log files for the gunicorn webserver. '-' means log to stderr. 324 | access_logfile = - 325 | 326 | # Log files for the gunicorn webserver. '-' means log to stderr. 
327 | error_logfile = - 328 | 329 | # Expose the configuration file in the web server 330 | expose_config = False 331 | 332 | # Expose hostname in the web server 333 | expose_hostname = True 334 | 335 | # Expose stacktrace in the web server 336 | expose_stacktrace = True 337 | 338 | # Set to true to turn on authentication: 339 | # https://airflow.apache.org/security.html#web-authentication 340 | authenticate = True 341 | auth_backend = airflow.contrib.backends.password_auth 342 | 343 | # Filter the list of dags by owner name (requires authentication to be enabled) 344 | filter_by_owner = False 345 | 346 | # Filtering mode. Choices include user (default) and ldapgroup. 347 | # Ldap group filtering requires using the ldap backend 348 | # 349 | # Note that the ldap server needs the "memberOf" overlay to be set up 350 | # in order to user the ldapgroup mode. 351 | owner_mode = user 352 | 353 | # Default DAG view. Valid values are: 354 | # tree, graph, duration, gantt, landing_times 355 | dag_default_view = tree 356 | 357 | # "Default DAG orientation. Valid values are:" 358 | # LR (Left->Right), TB (Top->Bottom), RL (Right->Left), BT (Bottom->Top) 359 | dag_orientation = LR 360 | 361 | # Puts the webserver in demonstration mode; blurs the names of Operators for 362 | # privacy. 363 | demo_mode = False 364 | 365 | # The amount of time (in secs) webserver will wait for initial handshake 366 | # while fetching logs from other worker machine 367 | log_fetch_timeout_sec = 5 368 | 369 | # Time interval (in secs) to wait before next log fetching. 370 | log_fetch_delay_sec = 2 371 | 372 | # Distance away from page bottom to enable auto tailing. 373 | log_auto_tailing_offset = 30 374 | 375 | # Animation speed for auto tailing log display. 376 | log_animation_speed = 1000 377 | 378 | # By default, the webserver shows paused DAGs. Flip this to hide paused 379 | # DAGs by default 380 | hide_paused_dags_by_default = False 381 | 382 | # Consistent page size across all listing views in the UI 383 | page_size = 100 384 | 385 | # Use FAB-based webserver with RBAC feature 386 | rbac = True 387 | 388 | # Define the color of navigation bar 389 | navbar_color = #007A87 390 | 391 | # Default dagrun to show in UI 392 | default_dag_run_display_number = 25 393 | 394 | # Enable werkzeug ``ProxyFix`` middleware for reverse proxy 395 | enable_proxy_fix = False 396 | 397 | # Number of values to trust for ``X-Forwarded-For``. 398 | # More info: https://werkzeug.palletsprojects.com/en/0.16.x/middleware/proxy_fix/ 399 | proxy_fix_x_for = 1 400 | 401 | # Number of values to trust for ``X-Forwarded-Proto`` 402 | proxy_fix_x_proto = 1 403 | 404 | # Number of values to trust for ``X-Forwarded-Host`` 405 | proxy_fix_x_host = 1 406 | 407 | # Number of values to trust for ``X-Forwarded-Port`` 408 | proxy_fix_x_port = 1 409 | 410 | # Number of values to trust for ``X-Forwarded-Prefix`` 411 | proxy_fix_x_prefix = 1 412 | 413 | # Set secure flag on session cookie 414 | cookie_secure = False 415 | 416 | # Set samesite policy on session cookie 417 | cookie_samesite = 418 | 419 | # Default setting for wrap toggle on DAG code and TI log views. 
420 | default_wrap = False 421 | 422 | # Allow the UI to be rendered in a frame 423 | x_frame_enabled = True 424 | 425 | # Send anonymous user activity to your analytics tool 426 | # choose from google_analytics, segment, or metarouter 427 | # analytics_tool = 428 | 429 | # Unique ID of your account in the analytics tool 430 | # analytics_id = 431 | 432 | # Update FAB permissions and sync security manager roles 433 | # on webserver startup 434 | update_fab_perms = True 435 | 436 | # Minutes of non-activity before logged out from UI 437 | # 0 means never get forcibly logged out 438 | force_log_out_after = 0 439 | 440 | # The UI cookie lifetime in days 441 | session_lifetime_days = 30 442 | 443 | [email] 444 | email_backend = airflow.utils.email.send_email_smtp 445 | 446 | [smtp] 447 | 448 | # If you want airflow to send emails on retries, failure, and you want to use 449 | # the airflow.utils.email.send_email_smtp function, you have to configure an 450 | # smtp server here 451 | smtp_host = localhost 452 | smtp_starttls = True 453 | smtp_ssl = False 454 | # Example: smtp_user = airflow 455 | # smtp_user = 456 | # Example: smtp_password = airflow 457 | # smtp_password = 458 | smtp_port = 25 459 | smtp_mail_from = airflow@example.com 460 | 461 | [sentry] 462 | 463 | # Sentry (https://docs.sentry.io) integration 464 | sentry_dsn = https://2cb2cf66faae436992bc5d32b92604cb@o402350.ingest.sentry.io/5303573 465 | 466 | [celery] 467 | 468 | # This section only applies if you are using the CeleryExecutor in 469 | # ``[core]`` section above 470 | # The app name that will be used by celery 471 | celery_app_name = airflow.executors.celery_executor 472 | 473 | # The concurrency that will be used when starting workers with the 474 | # ``airflow celery worker`` command. This defines the number of task instances that 475 | # a worker will take, so size up your workers based on the resources on 476 | # your worker box and the nature of your tasks 477 | worker_concurrency = 16 478 | 479 | # The maximum and minimum concurrency that will be used when starting workers with the 480 | # ``airflow celery worker`` command (always keep minimum processes, but grow 481 | # to maximum if necessary). Note the value should be max_concurrency,min_concurrency 482 | # Pick these numbers based on resources on worker box and the nature of the task. 483 | # If autoscale option is available, worker_concurrency will be ignored. 484 | # http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale 485 | # Example: worker_autoscale = 16,12 486 | # worker_autoscale = 487 | 488 | # When you start an airflow worker, airflow starts a tiny web server 489 | # subprocess to serve the workers local log files to the airflow main 490 | # web server, who then builds pages and sends them to users. This defines 491 | # the port on which the logs are served. It needs to be unused, and open 492 | # visible from the main web server to connect into the workers. 493 | worker_log_server_port = 8793 494 | 495 | # The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally 496 | # a sqlalchemy database. Refer to the Celery documentation for more 497 | # information. 498 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#broker-settings 499 | broker_url = sqla+mysql://airflow:airflow@localhost:3306/airflow 500 | 501 | # The Celery result_backend. When a job finishes, it needs to update the 502 | # metadata of the job. 
Therefore it will post a message on a message bus, 503 | # or insert it into a database (depending of the backend) 504 | # This status is used by the scheduler to update the state of the task 505 | # The use of a database is highly recommended 506 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings 507 | result_backend = db+mysql://airflow:airflow@localhost:3306/airflow 508 | 509 | # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start 510 | # it ``airflow flower``. This defines the IP that Celery Flower runs on 511 | flower_host = 0.0.0.0 512 | 513 | # The root URL for Flower 514 | # Example: flower_url_prefix = /flower 515 | flower_url_prefix = 516 | 517 | # This defines the port that Celery Flower runs on 518 | flower_port = 5555 519 | 520 | # Securing Flower with Basic Authentication 521 | # Accepts user:password pairs separated by a comma 522 | # Example: flower_basic_auth = user1:password1,user2:password2 523 | flower_basic_auth = 524 | 525 | # Default queue that tasks get assigned to and that worker listen on. 526 | default_queue = default 527 | 528 | # How many processes CeleryExecutor uses to sync task state. 529 | # 0 means to use max(1, number of cores - 1) processes. 530 | sync_parallelism = 0 531 | 532 | # Import path for celery configuration options 533 | celery_config_options = airflow.config_templates.default_celery.DEFAULT_CELERY_CONFIG 534 | 535 | # In case of using SSL 536 | ssl_active = False 537 | ssl_key = 538 | ssl_cert = 539 | ssl_cacert = 540 | 541 | # Celery Pool implementation. 542 | # Choices include: prefork (default), eventlet, gevent or solo. 543 | # See: 544 | # https://docs.celeryproject.org/en/latest/userguide/workers.html#concurrency 545 | # https://docs.celeryproject.org/en/latest/userguide/concurrency/eventlet.html 546 | pool = prefork 547 | 548 | # The number of seconds to wait before timing out ``send_task_to_executor`` or 549 | # ``fetch_celery_task_state`` operations. 550 | operation_timeout = 2 551 | 552 | [celery_broker_transport_options] 553 | 554 | # This section is for specifying options which can be passed to the 555 | # underlying celery broker transport. See: 556 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_transport_options 557 | # The visibility timeout defines the number of seconds to wait for the worker 558 | # to acknowledge the task before the message is redelivered to another worker. 559 | # Make sure to increase the visibility timeout to match the time of the longest 560 | # ETA you're planning to use. 561 | # visibility_timeout is only supported for Redis and SQS celery brokers. 562 | # See: 563 | # http://docs.celeryproject.org/en/master/userguide/configuration.html#std:setting-broker_transport_options 564 | # Example: visibility_timeout = 21600 565 | # visibility_timeout = 566 | 567 | [dask] 568 | 569 | # This section only applies if you are using the DaskExecutor in 570 | # [core] section above 571 | # The IP address and port of the Dask cluster's scheduler. 572 | cluster_address = 127.0.0.1:8786 573 | 574 | # TLS/ SSL settings to access a secured Dask scheduler. 575 | tls_ca = 576 | tls_cert = 577 | tls_key = 578 | 579 | [scheduler] 580 | # Task instances listen for external kill signal (when you clear tasks 581 | # from the CLI or the UI), this defines the frequency at which they should 582 | # listen (in seconds). 
583 | job_heartbeat_sec = 5 584 | 585 | # The scheduler constantly tries to trigger new tasks (look at the 586 | # scheduler section in the docs for more information). This defines 587 | # how often the scheduler should run (in seconds). 588 | scheduler_heartbeat_sec = 5 589 | 590 | # After how much time should the scheduler terminate in seconds 591 | # -1 indicates to run continuously (see also num_runs) 592 | run_duration = -1 593 | 594 | # The number of times to try to schedule each DAG file 595 | # -1 indicates unlimited number 596 | num_runs = -1 597 | 598 | # The number of seconds to wait between consecutive DAG file processing 599 | processor_poll_interval = 1 600 | 601 | # after how much time (seconds) a new DAGs should be picked up from the filesystem 602 | min_file_process_interval = 0 603 | 604 | # How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes. 605 | dag_dir_list_interval = 300 606 | 607 | # How often should stats be printed to the logs. Setting to 0 will disable printing stats 608 | print_stats_interval = 30 609 | 610 | # If the last scheduler heartbeat happened more than scheduler_health_check_threshold 611 | # ago (in seconds), scheduler is considered unhealthy. 612 | # This is used by the health check in the "/health" endpoint 613 | scheduler_health_check_threshold = 30 614 | child_process_log_directory = /Users/alexaabbas/Desktop/airflow-tutorial/logs/scheduler 615 | 616 | # Local task jobs periodically heartbeat to the DB. If the job has 617 | # not heartbeat in this many seconds, the scheduler will mark the 618 | # associated task instance as failed and will re-schedule the task. 619 | scheduler_zombie_task_threshold = 300 620 | 621 | # Turn off scheduler catchup by setting this to False. 622 | # Default behavior is unchanged and 623 | # Command Line Backfills still work, but the scheduler 624 | # will not do scheduler catchup if this is False, 625 | # however it can be set on a per DAG basis in the 626 | # DAG definition (catchup) 627 | catchup_by_default = True 628 | 629 | # This changes the batch size of queries in the scheduling main loop. 630 | # If this is too high, SQL query performance may be impacted by one 631 | # or more of the following: 632 | # - reversion to full table scan 633 | # - complexity of query predicate 634 | # - excessive locking 635 | # Additionally, you may hit the maximum allowable query length for your db. 636 | # Set this to 0 for no limit (not advised) 637 | max_tis_per_query = 512 638 | 639 | # Statsd (https://github.com/etsy/statsd) integration settings 640 | statsd_on = True 641 | statsd_host = localhost 642 | statsd_port = 8125 643 | statsd_prefix = airflow 644 | 645 | # If you want to avoid send all the available metrics to StatsD, 646 | # you can configure an allow list of prefixes to send only the metrics that 647 | # start with the elements of the list (e.g: scheduler,executor,dagrun) 648 | statsd_allow_list = 649 | 650 | # The scheduler can run multiple threads in parallel to schedule dags. 651 | # This defines how many threads will run. 652 | max_threads = 2 653 | authenticate = False 654 | 655 | # Turn off scheduler use of cron intervals by setting this to False. 656 | # DAGs submitted manually in the web UI or with trigger_dag will still run. 
657 | use_job_schedule = True 658 | 659 | # Allow externally triggered DagRuns for Execution Dates in the future 660 | # Only has effect if schedule_interval is set to None in DAG 661 | allow_trigger_in_future = False 662 | 663 | [ldap] 664 | # set this to ldaps://: 665 | uri = 666 | user_filter = objectClass=* 667 | user_name_attr = uid 668 | group_member_attr = memberOf 669 | superuser_filter = 670 | data_profiler_filter = 671 | bind_user = cn=Manager,dc=example,dc=com 672 | bind_password = insecure 673 | basedn = dc=example,dc=com 674 | cacert = /etc/ca/ldap_ca.crt 675 | search_scope = LEVEL 676 | 677 | # This setting allows the use of LDAP servers that either return a 678 | # broken schema, or do not return a schema. 679 | ignore_malformed_schema = False 680 | 681 | [mesos] 682 | # Mesos master address which MesosExecutor will connect to. 683 | master = localhost:5050 684 | 685 | # The framework name which Airflow scheduler will register itself as on mesos 686 | framework_name = Airflow 687 | 688 | # Number of cpu cores required for running one task instance using 689 | # 'airflow run --local -p ' 690 | # command on a mesos slave 691 | task_cpu = 1 692 | 693 | # Memory in MB required for running one task instance using 694 | # 'airflow run --local -p ' 695 | # command on a mesos slave 696 | task_memory = 256 697 | 698 | # Enable framework checkpointing for mesos 699 | # See http://mesos.apache.org/documentation/latest/slave-recovery/ 700 | checkpoint = False 701 | 702 | # Failover timeout in milliseconds. 703 | # When checkpointing is enabled and this option is set, Mesos waits 704 | # until the configured timeout for 705 | # the MesosExecutor framework to re-register after a failover. Mesos 706 | # shuts down running tasks if the 707 | # MesosExecutor framework fails to re-register within this timeframe. 708 | # Example: failover_timeout = 604800 709 | # failover_timeout = 710 | 711 | # Enable framework authentication for mesos 712 | # See http://mesos.apache.org/documentation/latest/configuration/ 713 | authenticate = False 714 | 715 | # Mesos credentials, if authentication is enabled 716 | # Example: default_principal = admin 717 | # default_principal = 718 | # Example: default_secret = admin 719 | # default_secret = 720 | 721 | # Optional Docker Image to run on slave before running the command 722 | # This image should be accessible from mesos slave i.e mesos slave 723 | # should be able to pull this docker image before executing the command. 724 | # Example: docker_image_slave = puckel/docker-airflow 725 | # docker_image_slave = 726 | 727 | [kerberos] 728 | ccache = /tmp/airflow_krb5_ccache 729 | 730 | # gets augmented with fqdn 731 | principal = airflow 732 | reinit_frequency = 3600 733 | kinit_path = kinit 734 | keytab = airflow.keytab 735 | 736 | [github_enterprise] 737 | api_rev = v3 738 | 739 | [admin] 740 | # UI to hide sensitive variable fields when set to True 741 | hide_sensitive_variable_fields = True 742 | 743 | [elasticsearch] 744 | # Elasticsearch host 745 | host = 746 | 747 | # Format of the log_id, which is used to query for a given tasks logs 748 | log_id_template = {dag_id}-{task_id}-{execution_date}-{try_number} 749 | 750 | # Used to mark the end of a log stream for a task 751 | end_of_log_mark = end_of_log 752 | 753 | # Qualified URL for an elasticsearch frontend (like Kibana) with a template argument for log_id 754 | # Code will construct log_id using the log_id template from the argument above. 
755 | # NOTE: The code will prefix the https:// automatically, don't include that here. 756 | frontend = 757 | 758 | # Write the task logs to the stdout of the worker, rather than the default files 759 | write_stdout = False 760 | 761 | # Instead of the default log formatter, write the log lines as JSON 762 | json_format = False 763 | 764 | # Log fields to also attach to the json output, if enabled 765 | json_fields = asctime, filename, lineno, levelname, message 766 | 767 | [elasticsearch_configs] 768 | use_ssl = False 769 | verify_certs = True 770 | 771 | [kubernetes] 772 | # The repository, tag and imagePullPolicy of the Kubernetes Image for the Worker to Run 773 | worker_container_repository = 774 | worker_container_tag = 775 | worker_container_image_pull_policy = IfNotPresent 776 | 777 | # If True (default), worker pods will be deleted upon termination 778 | delete_worker_pods = True 779 | 780 | # Number of Kubernetes Worker Pod creation calls per scheduler loop 781 | worker_pods_creation_batch_size = 1 782 | 783 | # The Kubernetes namespace where airflow workers should be created. Defaults to ``default`` 784 | namespace = default 785 | 786 | # The name of the Kubernetes ConfigMap containing the Airflow Configuration (this file) 787 | # Example: airflow_configmap = airflow-configmap 788 | airflow_configmap = 789 | 790 | # The name of the Kubernetes ConfigMap containing ``airflow_local_settings.py`` file. 791 | # 792 | # For example: 793 | # 794 | # ``airflow_local_settings_configmap = "airflow-configmap"`` if you have the following ConfigMap. 795 | # 796 | # ``airflow-configmap.yaml``: 797 | # 798 | # .. code-block:: yaml 799 | # 800 | # --- 801 | # apiVersion: v1 802 | # kind: ConfigMap 803 | # metadata: 804 | # name: airflow-configmap 805 | # data: 806 | # airflow_local_settings.py: | 807 | # def pod_mutation_hook(pod): 808 | # ... 809 | # airflow.cfg: | 810 | # ... 811 | # Example: airflow_local_settings_configmap = airflow-configmap 812 | airflow_local_settings_configmap = 813 | 814 | # For docker image already contains DAGs, this is set to ``True``, and the worker will 815 | # search for dags in dags_folder, 816 | # otherwise use git sync or dags volume claim to mount DAGs 817 | dags_in_image = False 818 | 819 | # For either git sync or volume mounted DAGs, the worker will look in this subpath for DAGs 820 | dags_volume_subpath = 821 | 822 | # For DAGs mounted via a volume claim (mutually exclusive with git-sync and host path) 823 | dags_volume_claim = 824 | 825 | # For volume mounted logs, the worker will look in this subpath for logs 826 | logs_volume_subpath = 827 | 828 | # A shared volume claim for the logs 829 | logs_volume_claim = 830 | 831 | # For DAGs mounted via a hostPath volume (mutually exclusive with volume claim and git-sync) 832 | # Useful in local environment, discouraged in production 833 | dags_volume_host = 834 | 835 | # A hostPath volume for the logs 836 | # Useful in local environment, discouraged in production 837 | logs_volume_host = 838 | 839 | # A list of configMapsRefs to envFrom. If more than one configMap is 840 | # specified, provide a comma separated list: configmap_a,configmap_b 841 | env_from_configmap_ref = 842 | 843 | # A list of secretRefs to envFrom. 
If more than one secret is 844 | # specified, provide a comma separated list: secret_a,secret_b 845 | env_from_secret_ref = 846 | 847 | # Git credentials and repository for DAGs mounted via Git (mutually exclusive with volume claim) 848 | git_repo = 849 | git_branch = 850 | git_subpath = 851 | 852 | # The specific rev or hash the git_sync init container will checkout 853 | # This becomes GIT_SYNC_REV environment variable in the git_sync init container for worker pods 854 | git_sync_rev = 855 | 856 | # Use git_user and git_password for user authentication or git_ssh_key_secret_name 857 | # and git_ssh_key_secret_key for SSH authentication 858 | git_user = 859 | git_password = 860 | git_sync_root = /git 861 | git_sync_dest = repo 862 | 863 | # Mount point of the volume if git-sync is being used. 864 | # i.e. /Users/alexaabbas/Desktop/airflow-tutorial/dags 865 | git_dags_folder_mount_point = 866 | 867 | # To get Git-sync SSH authentication set up follow this format 868 | # 869 | # ``airflow-secrets.yaml``: 870 | # 871 | # .. code-block:: yaml 872 | # 873 | # --- 874 | # apiVersion: v1 875 | # kind: Secret 876 | # metadata: 877 | # name: airflow-secrets 878 | # data: 879 | # # key needs to be gitSshKey 880 | # gitSshKey: 881 | # Example: git_ssh_key_secret_name = airflow-secrets 882 | git_ssh_key_secret_name = 883 | 884 | # To get Git-sync SSH authentication set up follow this format 885 | # 886 | # ``airflow-configmap.yaml``: 887 | # 888 | # .. code-block:: yaml 889 | # 890 | # --- 891 | # apiVersion: v1 892 | # kind: ConfigMap 893 | # metadata: 894 | # name: airflow-configmap 895 | # data: 896 | # known_hosts: | 897 | # github.com ssh-rsa <...> 898 | # airflow.cfg: | 899 | # ... 900 | # Example: git_ssh_known_hosts_configmap_name = airflow-configmap 901 | git_ssh_known_hosts_configmap_name = 902 | 903 | # To give the git_sync init container credentials via a secret, create a secret 904 | # with two fields: GIT_SYNC_USERNAME and GIT_SYNC_PASSWORD (example below) and 905 | # add ``git_sync_credentials_secret = `` to your airflow config under the 906 | # ``kubernetes`` section 907 | # 908 | # Secret Example: 909 | # 910 | # .. code-block:: yaml 911 | # 912 | # --- 913 | # apiVersion: v1 914 | # kind: Secret 915 | # metadata: 916 | # name: git-credentials 917 | # data: 918 | # GIT_SYNC_USERNAME: 919 | # GIT_SYNC_PASSWORD: 920 | git_sync_credentials_secret = 921 | 922 | # For cloning DAGs from git repositories into volumes: https://github.com/kubernetes/git-sync 923 | git_sync_container_repository = k8s.gcr.io/git-sync 924 | git_sync_container_tag = v3.1.1 925 | git_sync_init_container_name = git-sync-clone 926 | git_sync_run_as_user = 65533 927 | 928 | # The name of the Kubernetes service account to be associated with airflow workers, if any. 929 | # Service accounts are required for workers that require access to secrets or cluster resources. 930 | # See the Kubernetes RBAC documentation for more: 931 | # https://kubernetes.io/docs/admin/authorization/rbac/ 932 | worker_service_account_name = 933 | 934 | # Any image pull secrets to be given to worker pods, If more than one secret is 935 | # required, provide a comma separated list: secret_a,secret_b 936 | image_pull_secrets = 937 | 938 | # GCP Service Account Keys to be provided to tasks run on Kubernetes Executors 939 | # Should be supplied in the format: key-name-1:key-path-1,key-name-2:key-path-2 940 | gcp_service_account_keys = 941 | 942 | # Use the service account kubernetes gives to pods to connect to kubernetes cluster. 
943 | # It's intended for clients that expect to be running inside a pod running on kubernetes. 944 | # It will raise an exception if called from a process not running in a kubernetes environment. 945 | in_cluster = True 946 | 947 | # When running with in_cluster=False change the default cluster_context or config_file 948 | # options to Kubernetes client. Leave blank these to use default behaviour like ``kubectl`` has. 949 | # cluster_context = 950 | # config_file = 951 | 952 | # Affinity configuration as a single line formatted JSON object. 953 | # See the affinity model for top-level key names (e.g. ``nodeAffinity``, etc.): 954 | # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.12/#affinity-v1-core 955 | affinity = 956 | 957 | # A list of toleration objects as a single line formatted JSON array 958 | # See: 959 | # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.12/#toleration-v1-core 960 | tolerations = 961 | 962 | # Keyword parameters to pass while calling a kubernetes client core_v1_api methods 963 | # from Kubernetes Executor provided as a single line formatted JSON dictionary string. 964 | # List of supported params are similar for all core_v1_apis, hence a single config 965 | # variable for all apis. 966 | # See: 967 | # https://raw.githubusercontent.com/kubernetes-client/python/master/kubernetes/client/apis/core_v1_api.py 968 | # Note that if no _request_timeout is specified, the kubernetes client will wait indefinitely 969 | # for kubernetes api responses, which will cause the scheduler to hang. 970 | # The timeout is specified as [connect timeout, read timeout] 971 | kube_client_request_args = 972 | 973 | # Specifies the uid to run the first process of the worker pods containers as 974 | run_as_user = 975 | 976 | # Specifies a gid to associate with all containers in the worker pods 977 | # if using a git_ssh_key_secret_name use an fs_group 978 | # that allows for the key to be read, e.g. 65533 979 | fs_group = 980 | 981 | [kubernetes_node_selectors] 982 | 983 | # The Key-value pairs to be given to worker pods. 984 | # The worker pods will be scheduled to the nodes of the specified key-value pairs. 985 | # Should be supplied in the format: key = value 986 | 987 | [kubernetes_annotations] 988 | 989 | # The Key-value annotations pairs to be given to worker pods. 990 | # Should be supplied in the format: key = value 991 | 992 | [kubernetes_environment_variables] 993 | 994 | # The scheduler sets the following environment variables into your workers. You may define as 995 | # many environment variables as needed and the kubernetes launcher will set them in the launched workers. 996 | # Environment variables in this section are defined as follows 997 | # `` = `` 998 | # 999 | # For example if you wanted to set an environment variable with value `prod` and key 1000 | # ``ENVIRONMENT`` you would follow the following format: 1001 | # ENVIRONMENT = prod 1002 | # 1003 | # Additionally you may override worker airflow settings with the ``AIRFLOW__
<section>__<key>`` 1004 | # formatting as supported by airflow normally. 1005 | 1006 | [kubernetes_secrets] 1007 | 1008 | # The scheduler mounts the following secrets into your workers as they are launched by the 1009 | # scheduler. You may define as many secrets as needed and the kubernetes launcher will parse the 1010 | # defined secrets and mount them as secret environment variables in the launched workers. 1011 | # Secrets in this section are defined as follows 1012 | # ``<environment_variable_mount> = <kubernetes_secret_object>=<kubernetes_secret_key>`` 1013 | # 1014 | # For example if you wanted to mount a kubernetes secret key named ``postgres_password`` from the 1015 | # kubernetes secret object ``airflow-secret`` as the environment variable ``POSTGRES_PASSWORD`` into 1016 | # your workers you would follow the following format: 1017 | # ``POSTGRES_PASSWORD = airflow-secret=postgres_credentials`` 1018 | # 1019 | # Additionally you may override worker airflow settings with the ``AIRFLOW__
__`` 1020 | # formatting as supported by airflow normally. 1021 | 1022 | [kubernetes_labels] 1023 | 1024 | # The Key-value pairs to be given to worker pods. 1025 | # The worker pods will be given these static labels, as well as some additional dynamic labels 1026 | # to identify the task. 1027 | # Should be supplied in the format: ``key = value`` -------------------------------------------------------------------------------- /dags/bigquery_data_analytics.py: -------------------------------------------------------------------------------- 1 | """ 2 | ## Example PySpark dag 3 | This example dag walks you through the concepts of branching, subdags and trigger rules. 4 | It creates a Dataproc cluster in Google Cloud and runs a series of PySpark jobs. 5 | """ 6 | from airflow import DAG 7 | from airflow.operators.python_operator import BranchPythonOperator 8 | from airflow.operators.subdag_operator import SubDagOperator 9 | from airflow.contrib.operators.dataproc_operator import ( 10 | DataprocClusterCreateOperator, 11 | DataProcPySparkOperator, 12 | DataprocClusterDeleteOperator, 13 | ) 14 | 15 | from airflow.utils.dates import days_ago 16 | from datetime import datetime 17 | 18 | from pyspark_subdag import weekday_subdag 19 | 20 | default_arguments = {"owner": "YOUR-NAME-HERE", "start_date": days_ago(1)} 21 | 22 | 23 | def assess_day(execution_date=None): 24 | date = datetime.strptime(execution_date, "%Y-%m-%d") 25 | 26 | if date.isoweekday() < 6: 27 | return "weekday_analytics" 28 | 29 | return "weekend_analytics" 30 | 31 | 32 | with DAG( 33 | "bigquery_data_analytics", 34 | schedule_interval="0 20 * * *", 35 | catchup=False, 36 | default_args=default_arguments, 37 | ) as dag: 38 | 39 | dag.doc_md = __doc__ 40 | 41 | create_cluster = DataprocClusterCreateOperator( 42 | task_id="create_cluster", 43 | project_id="YOUR-PROJECT-NAME-HERE", 44 | cluster_name="spark-cluster-{{ ds_nodash }}", 45 | num_workers=2, 46 | storage_bucket="YOUR-BUCKET-NAME-HERE", 47 | zone="europe-west2-a", 48 | ) 49 | 50 | create_cluster.doc_md = """## Create Dataproc cluster 51 | This task creates a Dataproc cluster in your project. 
52 | """ 53 | 54 | weekday_or_weekend = BranchPythonOperator( 55 | task_id="weekday_or_weekend", 56 | python_callable=assess_day, 57 | op_kwargs={"execution_date": "{{ ds }}"}, 58 | ) 59 | 60 | weekend_analytics = DataProcPySparkOperator( 61 | task_id="weekend_analytics", 62 | main="gs://YOUR-BUCKET-NAME-HERE/pyspark/weekend/gas_composition_count.py", 63 | cluster_name="spark-cluster-{{ ds_nodash }}", 64 | dataproc_pyspark_jars="gs://spark-lib/bigquery/spark-bigquery-latest.jar", 65 | ) 66 | 67 | weekday_analytics = SubDagOperator( 68 | task_id="weekday_analytics", 69 | subdag=weekday_subdag( 70 | parent_dag="bigquery_data_analytics", 71 | task_id="weekday_analytics", 72 | schedule_interval="0 20 * * *", 73 | default_args=default_arguments, 74 | ), 75 | ) 76 | 77 | delete_cluster = DataprocClusterDeleteOperator( 78 | task_id="delete_cluster", 79 | project_id="YOUR-PROJECT-NAME-HERE", 80 | cluster_name="spark-cluster-{{ ds_nodash }}", 81 | trigger_rule="all_done", 82 | ) 83 | 84 | create_cluster >> weekday_or_weekend >> [ 85 | weekend_analytics, 86 | weekday_analytics, 87 | ] >> delete_cluster 88 | -------------------------------------------------------------------------------- /dags/bigquery_data_load.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.models import Variable 3 | from airflow.operators.python_operator import PythonOperator 4 | from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook 5 | from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator 6 | from airflow.contrib.operators.bigquery_operator import BigQueryOperator 7 | 8 | from airflow.utils.dates import days_ago 9 | 10 | PROJECT_ID = Variable.get("project") 11 | LANDING_BUCKET = Variable.get("landing_bucket") 12 | BACKUP_BUCKET = Variable.get("backup_bucket") 13 | 14 | default_arguments = {"owner": "YOUR-NAME-HERE", "start_date": days_ago(1)} 15 | 16 | 17 | def list_objects(bucket=None): 18 | hook = GoogleCloudStorageHook() 19 | storage_objects = hook.list(bucket) 20 | 21 | return storage_objects 22 | 23 | 24 | def move_objects(source_bucket=None, destination_bucket=None, prefix=None, **kwargs): 25 | 26 | storage_objects = kwargs["ti"].xcom_pull(task_ids="list_files") 27 | 28 | hook = GoogleCloudStorageHook() 29 | 30 | for storage_object in storage_objects: 31 | destination_object = storage_object 32 | 33 | if prefix: 34 | destination_object = "{}/{}".format(prefix, storage_object) 35 | 36 | hook.copy(source_bucket, storage_object, destination_bucket, destination_object) 37 | hook.delete(source_bucket, storage_object) 38 | 39 | 40 | with DAG( 41 | "bigquery_data_load", 42 | schedule_interval="@hourly", 43 | catchup=False, 44 | default_args=default_arguments, 45 | max_active_runs=1, 46 | user_defined_macros={"project": PROJECT_ID}, 47 | ) as dag: 48 | 49 | list_files = PythonOperator( 50 | task_id="list_files", 51 | python_callable=list_objects, 52 | op_kwargs={"bucket": LANDING_BUCKET}, 53 | ) 54 | 55 | load_data = GoogleCloudStorageToBigQueryOperator( 56 | task_id="load_data", 57 | bucket=LANDING_BUCKET, 58 | source_objects=["*"], 59 | source_format="CSV", 60 | skip_leading_rows=1, 61 | field_delimiter=",", 62 | destination_project_dataset_table="{{ project }}.vehicle_analytics.history", 63 | create_disposition="CREATE_IF_NEEDED", 64 | write_disposition="WRITE_APPEND", 65 | bigquery_conn_id="google_cloud_default", 66 | google_cloud_storage_conn_id="google_cloud_default", 67 | ) 68 | 69 | query = """ 70 | 
SELECT * except (rank) 71 | FROM ( 72 | SELECT 73 | *, 74 | ROW_NUMBER() OVER ( 75 | PARTITION BY vehicle_id ORDER BY DATETIME(date, TIME(hour, minute, 0)) DESC 76 | ) as rank 77 | FROM `{{ project }}.vehicle_analytics.history`) as latest 78 | WHERE rank = 1; 79 | """ 80 | 81 | create_table = BigQueryOperator( 82 | task_id="create_table", 83 | sql=query, 84 | destination_dataset_table="{{ project }}.vehicle_analytics.latest", 85 | write_disposition="WRITE_TRUNCATE", 86 | create_disposition="CREATE_IF_NEEDED", 87 | use_legacy_sql=False, 88 | location="europe-west2", 89 | bigquery_conn_id="google_cloud_default", 90 | ) 91 | 92 | move_files = PythonOperator( 93 | task_id="move_files", 94 | python_callable=move_objects, 95 | op_kwargs={ 96 | "source_bucket": LANDING_BUCKET, 97 | "destination_bucket": BACKUP_BUCKET, 98 | "prefix": "{{ ts_nodash }}", 99 | }, 100 | provide_context=True, 101 | ) 102 | 103 | list_files >> load_data >> create_table >> move_files 104 | -------------------------------------------------------------------------------- /dags/bigquery_data_validation.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.utils.dates import days_ago 3 | 4 | from airflow.operators.bigquery_plugin import ( 5 | BigQueryDataValidationOperator, 6 | BigQueryDatasetSensor, 7 | ) 8 | 9 | default_arguments = {"owner": "YOUR-NAME-HERE", "start_date": days_ago(1)} 10 | 11 | with DAG( 12 | "bigquery_data_validation", 13 | schedule_interval="@daily", 14 | catchup=False, 15 | default_args=default_arguments, 16 | user_defined_macros={"project": "YOUR-PROJECT-NAME-HERE"}, 17 | ) as dag: 18 | 19 | is_table_empty = BigQueryDataValidationOperator( 20 | task_id="is_table_empty", 21 | sql="SELECT COUNT(*) FROM `{{ project }}.vehicle_analytics.history`", 22 | location="europe-west2", 23 | ) 24 | 25 | dataset_exists = BigQueryDatasetSensor( 26 | task_id="dataset_exists", 27 | project_id="{{ project }}", 28 | dataset_id="vehicle_analytics", 29 | ) 30 | 31 | dataset_exists >> is_table_empty 32 | -------------------------------------------------------------------------------- /dags/core_concepts.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.utils.dates import days_ago 3 | 4 | from airflow.operators.bash_operator import BashOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | 7 | from airflow.utils.helpers import chain, cross_downstream 8 | 9 | from random import seed, random 10 | 11 | from datetime import timedelta 12 | 13 | default_arguments = { 14 | "owner": "YOUR-NAME-HERE", 15 | "start_date": days_ago(1), 16 | "sla": timedelta(hours=1), 17 | } 18 | 19 | 20 | with DAG( 21 | "core_concepts", 22 | schedule_interval="@daily", 23 | catchup=False, 24 | default_args=default_arguments, 25 | ) as dag: 26 | 27 | bash_task = BashOperator( 28 | task_id="bash_command", 29 | bash_command="echo $TODAY", 30 | env={"TODAY": "2020-05-21"}, 31 | sla=timedelta(hours=2), 32 | ) 33 | 34 | def print_random_number(number): 35 | seed(number) 36 | print(random()) 37 | 38 | python_task = PythonOperator( 39 | task_id="python_function", python_callable=print_random_number, op_args=[1], 40 | ) 41 | 42 | bash_task >> python_task 43 | -------------------------------------------------------------------------------- /dags/pyspark_subdag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from 
airflow.contrib.operators.dataproc_operator import DataProcPySparkOperator 3 | 4 | 5 | def weekday_subdag( 6 | parent_dag=None, task_id=None, schedule_interval=None, default_args=None 7 | ): 8 | 9 | subdag = DAG( 10 | f"{parent_dag}.{task_id}", 11 | schedule_interval=schedule_interval, 12 | default_args=default_args, 13 | ) 14 | 15 | pyspark_jobs = ["avg_speed", "avg_temperature", "avg_tire_pressure"] 16 | 17 | for job in pyspark_jobs: 18 | 19 | DataProcPySparkOperator( 20 | task_id=f"{job}", 21 | main=f"gs://YOUR-BUCKET-NAME-HERE/pyspark/weekday/{job}.py", 22 | cluster_name="spark-cluster-{{ ds_nodash }}", 23 | dataproc_pyspark_jars="gs://spark-lib/bigquery/spark-bigquery-latest.jar", 24 | dag=subdag, 25 | ) 26 | 27 | return subdag 28 | -------------------------------------------------------------------------------- /data/4649493c.csv: -------------------------------------------------------------------------------- 1 | vehicle_id,date,hour,minute,latitude,longitude,tire_pressure,speed,temperature,gas_composition 2 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,0,34.995728,104.655337,10896.939,131.4,34.19,Blue 3 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,1,-6.8561144,107.5193473,15108.392,73.4,-8.36,Indigo 4 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,2,50.2923198,27.9814782,8668.885,53.9,29.0,Purple 5 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,3,22.636828,113.814606,13176.617,51.7,-3.03,Yellow 6 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,4,41.0872247,-8.1084595,9754.625,282.9,33.55,Maroon 7 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,5,22.919769,113.618216,7290.369,274.4,18.18,Mauv 8 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,6,-8.1448805,-79.0517936,12215.39,93.4,22.0,Pink 9 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,7,41.6088994,-8.7234712,1458.653,107.0,25.93,Fuscia 10 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,8,-6.4185424,106.8502879,11149.99,273.8,31.14,Purple 11 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,9,-6.4104949,107.0126076,9128.732,9.5,-6.3,Blue 12 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,10,25.185809,111.579535,3642.755,274.5,12.1,Pink 13 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,11,30.2827181,120.1245708,12015.249,2.4,37.96,Red 14 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,12,15.2586581,100.8677256,13419.904,243.0,36.63,Blue 15 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,13,46.987383,123.769368,464.688,11.5,39.62,Crimson 16 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,14,-13.8556,-73.758263,2109.403,288.4,12.52,Mauv 17 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,15,45.0417524,-73.9260044,4857.905,74.6,12.33,Violet 18 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,16,51.1434519,40.2997102,2400.634,199.2,12.0,Orange 19 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,17,8.490877,124.345771,5255.081,67.3,3.16,Violet 20 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,18,48.8493975,2.4751086,1680.678,265.0,33.61,Violet 21 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,19,40.8245332,-8.0435522,14994.36,74.5,14.93,Violet 22 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,20,49.5519718,17.3375604,4957.965,103.5,13.71,Green 23 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,21,-7.056066,108.4780523,9239.987,66.9,4.75,Khaki 24 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,22,32.1942601,35.3736237,18576.176,120.1,5.59,Pink 25 | 
4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,23,46.817914,-0.626516,8795.055,217.8,25.72,Blue 26 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,24,-23.4214264,-57.4344451,7004.375,234.7,16.34,Aquamarine 27 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,25,53.6040518,24.741899,18664.287,121.3,0.74,Red 28 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,26,61.7983586,34.3753781,6894.003,187.8,37.48,Goldenrod 29 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,27,-26.3018088,-54.7192376,9188.153,232.1,24.68,Purple 30 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,28,-6.6591023,106.2931013,10759.753,51.9,11.41,Orange 31 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,29,35.579427,109.262961,2214.161,87.3,30.42,Orange 32 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,30,39.851469,113.463703,943.845,276.4,39.16,Mauv 33 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,31,27.358123,110.514837,2393.395,208.5,30.98,Orange 34 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,32,32.119799,34.986653,2771.459,130.5,-4.73,Fuscia 35 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,33,45.4123755,20.8041177,2289.512,186.6,1.95,Maroon 36 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,34,50.3781731,15.5447069,18739.197,108.6,20.51,Khaki 37 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,35,-23.5893189,-46.0107997,174.271,62.5,2.29,Crimson 38 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,36,-43.7567555,172.0223196,4055.949,182.5,0.73,Crimson 39 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,37,59.4411274,30.1610699,16611.812,117.6,3.71,Blue 40 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,38,23.5989926,56.5448304,1330.887,257.2,2.46,Green 41 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,39,39.1251493,23.6799766,6137.24,257.7,34.03,Crimson 42 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,40,-26.0440358,28.1574467,16995.984,0.9,10.82,Crimson 43 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,41,20.9414842,105.9569025,2487.218,16.6,13.63,Green 44 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,42,-34.6698749,-58.5616502,3969.342,173.7,26.85,Mauv 45 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,43,32.6892815,-16.7907398,5789.196,186.5,9.3,Violet 46 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,44,-19.6818529,-49.0817124,12341.257,266.3,-4.35,Aquamarine 47 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,45,47.357916,88.027707,7922.845,264.7,18.4,Aquamarine 48 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,46,49.51688,-96.50029,16218.418,225.4,4.32,Maroon 49 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,47,15.4418966,-61.2583352,17530.821,276.7,8.49,Maroon 50 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,48,49.9808189,21.726063,18768.76,6.0,16.33,Maroon 51 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,49,36.097577,114.392392,1553.011,155.9,22.06,Red 52 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,50,34.379742,117.788836,13737.251,18.4,17.22,Puce 53 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,51,29.985295,122.207215,19613.727,114.8,0.93,Orange 54 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,52,-7.9636675,112.6225104,5630.25,64.2,31.86,Pink 55 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,53,50.2014324,14.8328189,1172.312,2.2,5.07,Yellow 56 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,54,8.0155697,-71.7637309,19388.976,242.6,-6.2,Khaki 57 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,55,-31.3699,27.03523,2508.02,37.4,4.01,Mauv 58 | 
4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,56,24.1301619,55.8023118,1030.756,188.4,7.93,Orange 59 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,57,36.0720984,49.7013486,6494.651,77.2,38.58,Fuscia 60 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,58,40.6945206,-7.8725232,11217.461,22.0,2.13,Blue 61 | 4649493c-e5ac-409e-adbc-3f98168db2da,2020-05-22,2,59,48.4691324,37.0871224,16649.352,173.5,21.07,Green 62 | -------------------------------------------------------------------------------- /data/c876bd01.csv: -------------------------------------------------------------------------------- 1 | vehicle_id,date,hour,minute,latitude,longitude,tire_pressure,speed,temperature,gas_composition 2 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,0,-8.1448805,-79.0517936,12215.39,93.4,22.0,Pink 3 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,1,34.199479,119.578364,17703.228,78.7,-0.06,Indigo 4 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,2,49.954706,15.0305859,6308.355,58.0,-4.19,Fuscia 5 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,3,34.995728,104.655337,10896.939,131.4,34.19,Blue 6 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,4,37.8925401,140.5266555,2504.253,227.5,14.05,Green 7 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,5,38.4687834,48.8728029,2300.072,207.4,5.16,Green 8 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,6,30.780217,120.644805,9852.028,0.1,12.21,Khaki 9 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,7,40.280218,-8.4788248,12868.242,213.9,-9.6,Puce 10 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,8,37.1878209,50.1575212,1277.421,96.4,29.91,Maroon 11 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,9,-6.4306844,106.7175669,19316.439,111.0,17.87,Pink 12 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,10,-5.8335001,34.9644426,9019.275,173.5,24.98,Aquamarine 13 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,11,38.013999,24.4198995,2293.132,204.8,25.63,Maroon 14 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,12,52.2163528,61.2809373,3104.701,67.3,38.86,Orange 15 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,13,38.734814,93.330613,4822.982,0.7,6.53,Pink 16 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,14,59.3462826,18.0843085,2680.192,119.3,-8.12,Turquoise 17 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,15,35.72154,111.350842,4813.115,118.4,-3.69,Indigo 18 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,16,37.79446,20.85188,8463.648,297.9,29.99,Maroon 19 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,17,31.191643,121.389262,4611.078,218.2,30.57,Puce 20 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,18,9.7457208,123.8401962,3290.653,63.9,10.85,Khaki 21 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,19,53.9762845,43.8688442,10250.0,34.2,28.02,Purple 22 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,20,24.513425,117.723153,18560.283,172.4,21.44,Pink 23 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,21,35.7017899,59.8468432,15401.556,180.3,33.62,Pink 24 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,22,44.840524,82.353656,6517.541,14.0,17.78,Goldenrod 25 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,23,18.9237513,-70.4144776,1764.466,205.4,23.52,Turquoise 26 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,24,15.4053048,-91.7142051,18070.46,185.8,5.36,Green 27 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,25,6.129226,102.236216,12329.13,264.8,28.18,Puce 28 | 
c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,26,-8.5294459,119.0109502,13301.391,47.4,3.82,Green 29 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,27,59.8863041,29.9085976,10543.427,286.2,5.3,Maroon 30 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,28,54.0297214,28.0892299,14565.844,269.1,31.19,Teal 31 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,29,55.6967262,39.2331589,15211.381,274.2,39.29,Purple 32 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,30,7.6447222,149.4208333,4035.918,294.7,-8.25,Purple 33 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,31,42.9895326,131.8411237,3914.634,167.6,24.91,Fuscia 34 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,32,55.6942718,74.3214928,13474.821,221.1,-9.11,Khaki 35 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,33,38.430793,100.812859,750.106,106.5,17.87,Indigo 36 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,34,41.0490009,39.513623,11325.001,157.9,0.91,Crimson 37 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,35,62.6571846,26.0472266,3598.447,159.3,27.45,Crimson 38 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,36,15.1539332,-87.8721602,11346.493,190.0,8.49,Mauv 39 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,37,57.7311038,12.0586612,18059.776,20.2,-2.16,Teal 40 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,38,60.3448681,17.4966539,14181.513,281.9,19.21,Indigo 41 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,39,29.306756,120.07514,9057.801,159.2,7.51,Maroon 42 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,40,40.417358,117.500558,11505.682,178.2,10.48,Green 43 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,41,10.3696393,-66.9571026,12820.516,35.4,4.39,Goldenrod 44 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,42,31.03094,103.183075,1314.934,22.5,32.19,Red 45 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,43,34.014215,105.298756,12460.738,231.2,26.25,Aquamarine 46 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,44,48.8610504,2.3237084,432.326,249.5,18.08,Turquoise 47 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,45,47.2952721,39.8734276,4035.261,227.4,32.39,Red 48 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,46,32.147679,114.091192,14946.176,194.9,10.54,Fuscia 49 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,47,58.2774681,11.4424559,10162.833,108.5,2.17,Turquoise 50 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,48,-6.8058522,111.9611919,16282.329,159.7,19.21,Turquoise 51 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,49,31.9963592,-5.1174039,1263.775,74.0,3.31,Pink 52 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,50,9.9497452,126.0068121,15802.549,125.7,37.82,Indigo 53 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,51,-26.4070347,-61.4128561,2934.254,221.2,17.79,Blue 54 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,52,14.0242809,-60.9758292,4843.068,124.3,25.85,Blue 55 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,53,34.6718873,133.8964708,15607.254,236.9,-5.56,Puce 56 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,54,53.3547991,-113.7233907,4433.983,205.5,30.7,Red 57 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,55,-7.2893545,-34.8403408,8702.503,189.4,20.44,Pink 58 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,56,23.284628,116.268675,15210.3,273.5,3.85,Green 59 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,57,22.654032,110.18122,19745.464,21.7,3.32,Crimson 60 | c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,58,22.270978,113.576677,14309.905,36.4,25.93,Red 61 | 
c876bd01-2830-499a-8430-8cc62389722d,2020-05-22,2,59,7.193611,100.592145,8750.315,87.0,-7.97,Indigo 62 | -------------------------------------------------------------------------------- /plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandraabbas/apache-airflow-course/4280ef25705f38f74c2ac4aa2ac7afcbc8aba024/plugins/__init__.py -------------------------------------------------------------------------------- /plugins/bigquery_plugin.py: -------------------------------------------------------------------------------- 1 | from airflow.plugins_manager import AirflowPlugin 2 | 3 | from airflow.models import BaseOperator 4 | from airflow.contrib.hooks.bigquery_hook import BigQueryHook 5 | from airflow.sensors.base_sensor_operator import BaseSensorOperator 6 | 7 | from airflow.exceptions import AirflowException 8 | from airflow.utils.decorators import apply_defaults 9 | 10 | from googleapiclient.errors import HttpError 11 | from google.cloud import bigquery 12 | 13 | 14 | class BigQueryDataValidationOperator(BaseOperator): 15 | template_fields = ["sql"] 16 | ui_color = "#fcf197" 17 | 18 | @apply_defaults 19 | def __init__( 20 | self, 21 | sql, 22 | gcp_conn_id="google_cloud_default", 23 | use_legacy_sql=False, 24 | location=None, 25 | *args, 26 | **kwargs, 27 | ): 28 | 29 | super().__init__(*args, **kwargs) 30 | self.sql = sql 31 | self.gcp_conn_id = gcp_conn_id 32 | self.use_legacy_sql = use_legacy_sql 33 | self.location = location 34 | 35 | def run_query(self, project, credentials): 36 | client = bigquery.Client(project=project, credentials=credentials) 37 | 38 | query_job = client.query(self.sql) 39 | results = query_job.result() 40 | 41 | return [list(row.values()) for row in results][0] 42 | 43 | def execute(self, context): 44 | hook = BigQueryHook( 45 | bigquery_conn_id=self.gcp_conn_id, 46 | use_legacy_sql=self.use_legacy_sql, 47 | location=self.location, 48 | ) 49 | 50 | records = self.run_query( 51 | project=hook._get_field("project"), credentials=hook._get_credentials() 52 | ) 53 | 54 | if not records: 55 | raise AirflowException("Query returned no results.") 56 | elif not all([bool(record) for record in records]): 57 | raise AirflowException( 58 | f"Test failed\nQuery: {self.sql}\nRecords: {records}" 59 | ) 60 | 61 | self.log.info(f"Test passed\nQuery: {self.sql}\nRecords: {records}") 62 | 63 | 64 | class BigQueryDatasetSensor(BaseSensorOperator): 65 | template_fields = ["project_id", "dataset_id"] 66 | ui_color = "#feeef1" 67 | 68 | def __init__( 69 | self, 70 | project_id, 71 | dataset_id, 72 | gcp_conn_id="google_cloud_default", 73 | *args, 74 | **kwargs, 75 | ): 76 | super().__init__(*args, **kwargs) 77 | self.project_id = project_id 78 | self.dataset_id = dataset_id 79 | self.gcp_conn_id = gcp_conn_id 80 | 81 | def poke(self, context): 82 | hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id) 83 | service = hook.get_service() 84 | 85 | try: 86 | service.datasets().get( 87 | datasetId=self.dataset_id, projectId=self.project_id 88 | ).execute() 89 | 90 | return True 91 | except HttpError as e: 92 | if e.resp["status"] == "404": 93 | return False 94 | 95 | raise AirflowException(f"Error: {e}") 96 | 97 | 98 | class BigQueryPlugin(AirflowPlugin): 99 | name = "bigquery_plugin" 100 | operators = [BigQueryDataValidationOperator] 101 | sensors = [BigQueryDatasetSensor] 102 | -------------------------------------------------------------------------------- /pyspark/weekday/avg_speed.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession \ 5 | .builder \ 6 | .master('yarn') \ 7 | .appName('bigquery-analytics-avg-speed') \ 8 | .getOrCreate() 9 | 10 | bucket = 'YOUR-BUCKET-NAME-HERE' 11 | spark.conf.set('temporaryGcsBucket', bucket) 12 | 13 | history = spark.read.format('bigquery') \ 14 | .option('table', 'vehicle_analytics.history') \ 15 | .load() 16 | history.createOrReplaceTempView('history') 17 | 18 | avg_speed = spark.sql( 19 | 'SELECT vehicle_id, date, AVG(speed) AS avg_speed FROM history GROUP BY vehicle_id, date' 20 | ) 21 | avg_speed.show() 22 | avg_speed.printSchema() 23 | 24 | avg_speed.write.format('bigquery') \ 25 | .option('table', 'vehicle_analytics.avg_speed') \ 26 | .mode('append') \ 27 | .save() 28 | -------------------------------------------------------------------------------- /pyspark/weekday/avg_temperature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession \ 5 | .builder \ 6 | .master('yarn') \ 7 | .appName('bigquery-analytics-avg-temperature') \ 8 | .getOrCreate() 9 | 10 | bucket = 'YOUR-BUCKET-NAME-HERE' 11 | spark.conf.set('temporaryGcsBucket', bucket) 12 | 13 | history = spark.read.format('bigquery') \ 14 | .option('table', 'vehicle_analytics.history') \ 15 | .load() 16 | history.createOrReplaceTempView('history') 17 | 18 | avg_temperature = spark.sql( 19 | 'SELECT vehicle_id, date, AVG(temperature) AS avg_temperature FROM history GROUP BY vehicle_id, date' 20 | ) 21 | avg_temperature.show() 22 | avg_temperature.printSchema() 23 | 24 | avg_temperature.write.format('bigquery') \ 25 | .option('table', 'vehicle_analytics.avg_temperature') \ 26 | .mode('append') \ 27 | .save() 28 | -------------------------------------------------------------------------------- /pyspark/weekday/avg_tire_pressure.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession \ 5 | .builder \ 6 | .master('yarn') \ 7 | .appName('bigquery-analytics-avg-tire-pressure') \ 8 | .getOrCreate() 9 | 10 | bucket = 'YOUR-BUCKET-NAME-HERE' 11 | spark.conf.set('temporaryGcsBucket', bucket) 12 | 13 | history = spark.read.format('bigquery') \ 14 | .option('table', 'vehicle_analytics.history') \ 15 | .load() 16 | history.createOrReplaceTempView('history') 17 | 18 | avg_tire_pressure = spark.sql( 19 | 'SELECT vehicle_id, date, AVG(tire_pressure) AS avg_tire_pressure FROM history GROUP BY vehicle_id, date' 20 | ) 21 | avg_tire_pressure.show() 22 | avg_tire_pressure.printSchema() 23 | 24 | avg_tire_pressure.write.format('bigquery') \ 25 | .option('table', 'vehicle_analytics.avg_tire_pressure') \ 26 | .mode('append') \ 27 | .save() 28 | -------------------------------------------------------------------------------- /pyspark/weekend/gas_composition_count.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession \ 5 | .builder \ 6 | .master('yarn') \ 7 | .appName('bigquery-analytics-gas-composition-count') \ 8 | .getOrCreate() 9 | 10 | bucket = 'YOUR-BUCKET-NAME-HERE' 11 | spark.conf.set('temporaryGcsBucket', bucket) 12 | 13 | history = spark.read.format('bigquery') \ 14 | .option('table', 'vehicle_analytics.history') 
\ 15 | .load() 16 | history.createOrReplaceTempView('history') 17 | 18 | gas_composition_count = spark.sql( 19 | 'SELECT vehicle_id, date, COUNT(DISTINCT gas_composition) AS gas_composition_count FROM history GROUP BY vehicle_id, date' 20 | ) 21 | gas_composition_count.show() 22 | gas_composition_count.printSchema() 23 | 24 | gas_composition_count.write.format('bigquery') \ 25 | .option('table', 'vehicle_analytics.gas_composition_count') \ 26 | .mode('append') \ 27 | .save() 28 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Section 7: Testing Airflow DAGs 2 | 3 | This directory holds the source code of Section 7: Testing Airflow DAGs. 4 | 5 | ## How to run unit tests 6 | 7 | ```Bash 8 | python3 -m unittest -v {TEST-MODULE} 9 | ``` 10 | 11 | For example to test the `core_concepts` DAG and its operators run the following. 12 | 13 | ```Bash 14 | python3 -m unittest -v test_core_concepts 15 | ``` 16 | -------------------------------------------------------------------------------- /tests/test_bigquery_data_validation.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch 3 | 4 | from datetime import datetime 5 | 6 | from airflow import DAG 7 | from airflow.utils.state import State 8 | from airflow.models import TaskInstance 9 | from airflow.operators.bigquery_plugin import BigQueryDataValidationOperator 10 | 11 | from airflow.exceptions import AirflowException 12 | 13 | 14 | def mock_run_query(): 15 | def return_empty_list(*args, **kwargs): 16 | return [] 17 | 18 | return return_empty_list 19 | 20 | 21 | class TestBigQueryDataValidationOperator(unittest.TestCase): 22 | def setUp(self): 23 | EXEC_DATE = "2020-06-25" 24 | 25 | self.dag = DAG( 26 | "test_bigquery_data_validation", 27 | schedule_interval="@daily", 28 | default_args={"start_date": EXEC_DATE}, 29 | ) 30 | 31 | self.op = BigQueryDataValidationOperator( 32 | task_id="bigquery_op", 33 | sql="SELECT COUNT(*) FROM `example.example.example`", 34 | location="europe-west2", 35 | dag=self.dag, 36 | ) 37 | 38 | self.ti = TaskInstance( 39 | task=self.op, execution_date=datetime.strptime(EXEC_DATE, "%Y-%m-%d") 40 | ) 41 | 42 | @patch.object( 43 | BigQueryDataValidationOperator, "run_query", new_callable=mock_run_query 44 | ) 45 | def test_with_empty_result(self, mock): 46 | with self.assertRaises(AirflowException) as context: 47 | self.ti.run() 48 | self.assertEqual(self.ti.state, State.FAILED) 49 | self.assertEqual(str(context.exception), "Query returned no results.") 50 | -------------------------------------------------------------------------------- /tests/test_core_concepts.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from airflow.models import DagBag 3 | 4 | 5 | class TestCoreConceptsDAG(unittest.TestCase): 6 | def setUp(self): 7 | self.dagbag = DagBag() 8 | self.dag = self.dagbag.get_dag(dag_id="core_concepts") 9 | 10 | def test_dag_loaded(self): 11 | self.assertDictEqual(self.dagbag.import_errors, {}) 12 | self.assertIsNotNone(self.dag) 13 | 14 | def test_contain_tasks(self): 15 | self.assertListEqual(self.dag.task_ids, ["bash_command", "python_function"]) 16 | 17 | def test_dependencies_of_bash_command(self): 18 | bash_task = self.dag.get_task("bash_command") 19 | 20 | self.assertEqual(bash_task.upstream_task_ids, set()) 21 | 
self.assertEqual(bash_task.downstream_task_ids, set(["python_function"])) 22 | 23 | def assertDagDictEqual(self, structure, dag): 24 | self.assertEqual(dag.task_dict.keys(), structure.keys()) 25 | 26 | for task_id, downstream_list in structure.items(): 27 | self.assertTrue(dag.has_task(task_id)) 28 | 29 | task = dag.get_task(task_id) 30 | 31 | self.assertEqual(task.downstream_task_ids, set(downstream_list)) 32 | 33 | def test_dag_structure(self): 34 | self.assertDagDictEqual( 35 | {"bash_command": ["python_function"], "python_function": []}, self.dag 36 | ) 37 | 38 | -------------------------------------------------------------------------------- /variables/dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "project": "YOUR-PROJECT-NAME-HERE", 3 | "landing_bucket": "YOUR-LANDING-BUCKET-NAME-HERE", 4 | "backup_bucket": "YOUR-BACKUP-BUCKET-NAME-HERE" 5 | } --------------------------------------------------------------------------------
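The `bigquery_data_load` DAG reads the `project`, `landing_bucket` and `backup_bucket` Airflow Variables that `variables/dev.json` above defines. Assuming an Airflow 1.10.x installation and that the command is run from the repository root, the file can be imported through the CLI, for example:

```Bash
airflow variables -i variables/dev.json
```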