├── images
│   └── sponser_banner.png
├── meta
│   └── crypto.sql
├── dags
│   ├── mysql_to_gcs.py
│   ├── tutorial.py
│   ├── eth_rates.py
│   └── sakila_main_tables.py
├── LICENSE
└── README.md
/images/sponser_banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikeghen/airflow-tutorial/HEAD/images/sponser_banner.png
--------------------------------------------------------------------------------
/meta/crypto.sql:
--------------------------------------------------------------------------------
1 | create table rates (
2 | id varchar(10),
3 | price numeric(15,2),
4 | last_price numeric(15,2),
5 | volume numeric(15,2),
6 | recorded_at timestamp without time zone default (now() at time zone 'utc')
7 | );
8 |
9 | create table markets (
10 | market varchar(32),
11 | id varchar(10),
12 | price numeric(15,2),
13 | volume_btc numeric(15,2),
14 | volume numeric(15,2),
15 | recorded_at timestamp without time zone default (now() at time zone 'utc')
16 | );
17 |
--------------------------------------------------------------------------------
/dags/mysql_to_gcs.py:
--------------------------------------------------------------------------------
1 | from airflow import DAG
2 | from airflow.contrib.operators.mysql_to_gcs import MySqlToGoogleCloudStorageOperator
3 | from airflow.operators.bash_operator import BashOperator
4 | from datetime import datetime, timedelta
5 |
6 | default_args = {
7 | 'owner': 'mikeghen',
8 | 'start_date': datetime(2017, 8, 11),
9 | 'depends_on_past': False,
10 | 'retries': 1,
11 | 'retry_delay': timedelta(minutes=5),
12 | }
13 |
14 | dag = DAG('mysql_to_gcs', default_args=default_args)
15 |
16 | export_actor = MySqlToGoogleCloudStorageOperator(
17 | task_id='extract_actors',
18 | mysql_conn_id='sakila_test',
19 | google_cloud_storage_conn_id='gcp_test',
20 | sql='SELECT * FROM sakila.actor',
21 | bucket='ghen-airflow',
22 | filename='sakila/actors/actors{}.json',
23 | schema_filename='sakila/schemas/actors.json',
24 | dag=dag)
25 |
--------------------------------------------------------------------------------
/dags/tutorial.py:
--------------------------------------------------------------------------------
1 | from airflow import DAG
2 | from airflow.operators.bash_operator import BashOperator
3 | from datetime import datetime, timedelta
4 |
5 |
6 | default_args = {
7 | 'owner': 'airflow',
8 | 'depends_on_past': False,
9 | 'start_date': datetime(2015, 6, 1),
10 | 'email': ['airflow@airflow.com'],
11 | 'email_on_failure': False,
12 | 'email_on_retry': False,
13 | 'retries': 1,
14 | 'retry_delay': timedelta(minutes=5),
15 | # 'queue': 'bash_queue',
16 | # 'pool': 'backfill',
17 | # 'priority_weight': 10,
18 | # 'end_date': datetime(2016, 1, 1),
19 | }
20 |
21 | dag = DAG('tutorial', default_args=default_args)
22 |
23 | # t1, t2 and t3 are examples of tasks created by instantiating operators
24 | t1 = BashOperator(
25 | task_id='print_date',
26 | bash_command='date',
27 | dag=dag)
28 |
29 | t2 = BashOperator(
30 | task_id='sleep',
31 | bash_command='sleep 5',
32 | retries=3,
33 | dag=dag)
34 |
35 | templated_command = """
36 | {% for i in range(5) %}
37 | echo "{{ ds }}"
38 | echo "{{ macros.ds_add(ds, 7)}}"
39 | echo "{{ params.my_param }}"
40 | {% endfor %}
41 | """
42 |
43 | t3 = BashOperator(
44 | task_id='templated',
45 | bash_command=templated_command,
46 | params={'my_param': 'Parameter I passed in'},
47 | dag=dag)
48 |
49 | t2.set_upstream(t1)
50 | t3.set_upstream(t1)
51 |
--------------------------------------------------------------------------------
/dags/eth_rates.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | import json
3 |
4 | from airflow.hooks.http_hook import HttpHook
5 | from airflow.hooks.postgres_hook import PostgresHook
6 | from airflow.operators.python_operator import PythonOperator
7 | from airflow.models import DAG
8 |
9 | PAIR = 'eth/usd'
10 |
11 | def get_rates(ds, **kwargs):
12 | # connection: a Airflow connection
13 | pg_hook = PostgresHook(postgres_conn_id='crypto')
14 | api_hook = HttpHook(http_conn_id='cryptocoincharts_eth', method='GET')
15 |
16 | resp = api_hook.run('')
17 | resp = json.loads(resp.content)
18 |
19 | rates_insert = """INSERT INTO rates (id, price, last_price, volume)
20 | VALUES (%s, %s, %s, %s);"""
21 | markets_insert = """INSERT INTO markets (market, id, price, volume_btc, volume)
22 | VALUES (%s, %s, %s, %s, %s);"""
23 |
24 | pg_hook.run(rates_insert, parameters=(resp['id'], resp['price'], resp['price_before_24h'], resp['volume_second']))
25 |
26 | for market in resp['markets']:
27 | pg_hook.run(markets_insert, parameters=(market['market'], PAIR, market['price'], market['volume_btc'], market['volume']))
28 |
29 |
30 | args = {
31 | 'owner': 'mikeghen',
32 | 'depends_on_past': False,
33 | 'start_date': datetime.utcnow(),
34 | 'retries': 1,
35 | 'retry_delay': timedelta(minutes=30),
36 | }
37 |
38 | dag = DAG(dag_id='eth_rates',
39 | default_args=args,
40 | schedule_interval='0 */5 * * *',
41 | dagrun_timeout=timedelta(seconds=5))
42 |
43 | get_rates_task = \
44 | PythonOperator(task_id='get_rates',
45 | provide_context=True,
46 | python_callable=get_rates,
47 | dag=dag)
48 |
--------------------------------------------------------------------------------
/dags/sakila_main_tables.py:
--------------------------------------------------------------------------------
1 | from airflow import DAG
2 | from airflow.contrib.operators.mysql_to_gcs import MySqlToGoogleCloudStorageOperator
3 | from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
4 | from airflow.operators.slack_operator import SlackAPIPostOperator
5 | from datetime import datetime, timedelta
6 |
7 | """
8 | Extracts most tables from sakila_1 and sakila_2 to BigQuery
9 |
10 | Prerequisites:
11 | 1. sakila_1, sakila_2 connection in Airflow (MySQL)
12 | 2. gcp_test connection in Airflow (Google Cloud Platform)
13 | 3. ghen-airflow bucket in Google Cloud Storage
14 | a. Create the paths ghen-airflow/sakila_1 and ghen-airflow/sakila_2
15 | 4. sakila_1, sakila_2 datasets in BigQuery
16 | """
17 |
18 | sakila_connections = [
19 | # NOTE: Also the names of the databases in MySQL and datasets in BigQuery
20 | 'sakila_1',
21 | 'sakila_2'
22 | ]
23 |
24 | sakila_tables = [
25 | 'actor',
26 | # 'address', NOTE: This table has some sketchy encoding, use another DAG to load
27 | 'category',
28 | 'city',
29 | 'country',
30 | 'customer',
31 | 'film',
32 | 'film_actor',
33 | 'film_category',
34 | 'film_text',
35 | 'inventory',
36 | 'language',
37 | 'payment',
38 | 'rental',
39 | # 'staff', NOTE: This table has some sketchy encoding, use another DAG to load
40 | 'store'
41 | ]
42 |
43 | default_args = {
44 | 'owner': 'mikeghen',
45 | 'start_date': datetime(2017, 8, 11),
46 | 'depends_on_past': False,
47 | 'retries': 1,
48 | 'retry_delay': timedelta(minutes=5),
49 | }
50 |
51 | dag = DAG('sakila_main_tables', default_args=default_args, schedule_interval=timedelta(days=1))
52 |
53 | slack_notify = SlackAPIPostOperator(
54 | task_id='slack_notify',
55 | token='xxxxxx',
56 | channel='data-status',
57 | username='airflow',
58 | text='Successfully performed sakila ETL operation',
59 | dag=dag)
60 |
61 | for connection in sakila_connections:
62 | for table in sakila_tables:
63 | extract = MySqlToGoogleCloudStorageOperator(
64 | task_id="extract_mysql_%s_%s"%(connection,table),
65 | mysql_conn_id=connection,
66 | google_cloud_storage_conn_id='gcp_test',
67 | sql="SELECT *, '%s' as source FROM sakila.%s"%(connection,table),
68 | bucket='ghen-airflow',
69 | filename="%s/%s/%s{}.json"%(connection,table,table),
70 | schema_filename="%s/schemas/%s.json"%(connection,table),
71 | dag=dag)
72 |
73 | load = GoogleCloudStorageToBigQueryOperator(
74 | task_id="load_bq_%s_%s"%(connection,table),
75 | bigquery_conn_id='gcp_test',
76 | google_cloud_storage_conn_id='gcp_test',
77 | bucket='ghen-airflow',
78 | destination_project_dataset_table="spark-test-173322.%s.%s"%(connection,table),
79 | source_objects=["%s/%s/%s*.json"%(connection,table,table)],
80 | schema_object="%s/schemas/%s.json"%(connection,table),
81 | source_format='NEWLINE_DELIMITED_JSON',
82 | create_disposition='CREATE_IF_NEEDED',
83 | write_disposition='WRITE_TRUNCATE',
84 | project_id='spark-test-173322',
85 | dag=dag)
86 |
87 | load.set_upstream(extract)
88 | slack_notify.set_upstream(load)
89 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright 2020 Michael Ghen
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Airflow Tutorial
2 | This documents some of the work I did getting started with Airflow on Google Cloud Platform.
3 |
4 | :warning: Work in progress :pencil:
5 |
6 | ## About this Tutorial
7 | I found the tutorial within the Airflow Documentation to be sparse, and to achieve what I was trying to do I would have had to read nearly all of the documentation. The purpose of this tutorial is to help others get started with Airflow without reading all of it first. _I'd still recommend reading all the documentation at some point_, but if all you're trying to do is use Airflow to move data out of an RDBMS like MySQL or Postgres, this is a great place to start.
8 |
9 | In this tutorial, I will walk you through setting up Airflow on Google Cloud Platform. I will cover creating a data flow that moves data from MySQL to BigQuery. My goal is to make this tutorial comprehensive enough so that it can be used to configure a production Airflow deployment.
10 |
11 | # Setup
12 | I'm using Google Cloud Platform for hosting. The end goal here is to take data from 5 MySQL databases and load it into Google BigQuery.
13 |
14 | ## Installation
15 | I installed Airflow on a Compute Engine instance running Ubuntu 16. The installation was pretty trivial:
16 | ```
17 | export AIRFLOW_HOME=~/airflow
18 | pip install airflow
19 | airflow initdb
20 | airflow webserver -p 8080
21 | ```
22 | and I was up and running. You can find [more on the installation in the Airflow Documentation](http://pythonhosted.org/airflow/installation.html).
23 |
24 | ### About the Installation
25 | Airflow is installed as a Python package, and all of its configuration files are stored in `~/airflow`.
26 |
27 | The primary file you need to know about is `~/airflow/airflow.cfg`, which stores Airflow's configuration. I will edit it in the next section to set up security.
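If you'd rather inspect the active configuration from Python than open the file, here is a minimal sketch; it assumes a reasonably recent Airflow where `airflow.configuration` exposes a `conf` parser object:

```python
# Minimal sketch: read values out of airflow.cfg programmatically.
# Assumes airflow.configuration exposes a `conf` parser (true on recent releases).
from airflow.configuration import conf

print(conf.get('core', 'dags_folder'))       # where Airflow looks for DAG files
print(conf.get('core', 'sql_alchemy_conn'))  # metadata database connection string
```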
28 |
29 | Airflow is a [Flask](http://flask.pocoo.org/) application by the way.
30 |
31 | ## Database Setup
32 | According to the Airflow Documentation:
33 | >If you want to take a real test drive of Airflow, you should consider setting up a real database backend and switching to the LocalExecutor.
34 |
35 | I decided I would just install Postgres on my Airflow instance (Ubuntu 16):
36 | ```
37 | sudo apt-get install postgresql postgresql-contrib
38 | pip install psycopg2
39 | ```
40 | Then, to create a user for Airflow:
41 | ```
42 | $ sudo -u postgres createuser --interactive
43 | Enter name of role to add: airflow
44 | Shall the new role be a superuser? (y/n) n
45 | Shall the new role be allowed to create databases? (y/n) n
46 | Shall the new role be allowed to create more new roles? (y/n) n
47 | ```
48 | Then set the user's password and create the database:
49 | ```
50 | sudo -u postgres psql
51 | psql (9.5.7)
52 | Type "help" for help.
53 |
54 | postgres=# ALTER USER airflow WITH PASSWORD 'airflow_password';
55 | ALTER ROLE
56 | postgres=# CREATE DATABASE airflow;
57 | CREATE DATABASE
58 | ```
59 | Next, edit `airflow.cfg` to use Postgres by adding:
60 | ```
61 | # The Postgres connection string
62 | sql_alchemy_conn = postgresql://airflow:airflow_password@localhost/airflow
63 | ```
64 | and comment out the SQLite config.
65 |
66 | Finally, reinitialize the database:
67 | ```
68 | airflow initdb
69 | ```
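If `airflow initdb` complains, it can help to confirm that the connection string works at all. Here is a minimal sketch using SQLAlchemy directly, assuming the `airflow` user and `airflow_password` created above:

```python
# Minimal sketch: confirm the Postgres metadata database is reachable using the
# same connection string configured as sql_alchemy_conn in airflow.cfg.
# Assumes the airflow user, password, and database created in the steps above.
from sqlalchemy import create_engine, text

engine = create_engine('postgresql://airflow:airflow_password@localhost/airflow')
with engine.connect() as conn:
    print(conn.execute(text('SELECT version()')).fetchone())
```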
70 | ### Restarting Airflow
71 | I had to restart Airflow, which wasn't as simple as I expected. I tried a few solutions posted on Stack Overflow, but eventually just killed all of the airflow processes with `kill -9` and restarted the webserver using:
72 | ```
73 | airflow webserver -p 8080 -D
74 | ```
75 | :warning: You should really configure systemd
76 |
77 | ## Starting the Scheduler
78 | The scheduler needs to be running in order for jobs and tasks to be executed. To start the scheduler, run:
79 | ```
80 | airflow scheduler
81 | ```
82 |
83 | ## Integration with systemd
84 | :pencil: http://pythonhosted.org/airflow/configuration.html#integration-with-systemd
85 |
86 | # Sponsorship Message
87 | This Airflow Tutorial is sponsored by the following tool; please help support us by taking a look and signing up for a free trial.
88 |
89 | 
90 |
91 |
92 | ## Security
93 | ### User Access Control
94 | One of my concerns was user access control, so after the install I jumped down to the [Security](http://pythonhosted.org/airflow/security.html?highlight=users) portion of the Airflow Documentation. Per the docs:
95 | > By default, all gates are opened. An easy way to restrict access to the web application is to do it at the network level, or by using SSH tunnels.
96 |
97 | I plan on setting up SSH tunneling in production (until we have a VPN in place) but I still want my users to have to authenticate.
98 |
99 | To set up Airflow to require usernames and passwords, I edited my `airflow.cfg` file under the `[webserver]` section per the documentation:
100 |
101 | ```
102 | authenticate = True
103 | auth_backend = airflow.contrib.auth.backends.password_auth
104 | ```
105 | Next, you'll need to install `flask_bcrypt`. Since Airflow's webserver is a Flask application, it needs this package to hash passwords for authentication:
106 | ```
107 | pip install flask_bcrypt
108 | ```
109 | Then I needed to create the initial user by starting a Python interpreter from the command line:
110 | ```
111 | cd ~/airflow
112 | python
113 | ```
114 | Then in the Python interpreter:
115 | ```python
116 | import airflow
117 | from airflow import models, settings
118 | from airflow.contrib.auth.backends.password_auth import PasswordUser
119 | user = PasswordUser(models.User())
120 | user.username = 'admin'
121 | user.email = 'admin@example.com'
122 | user.password = 'admin_password'
123 | session = settings.Session()
124 | session.add(user)
125 | session.commit()
126 | session.close()
127 | exit()
128 | ```
129 | Finally, restart Airflow.
130 |
131 | :pencil: I will probably come back later and setup [GitHub Enterprise Auth](http://pythonhosted.org/airflow/security.html#github-enterprise-ghe-authentication)
132 |
133 | ### Encryption
134 | When clicking around, I ran into this nasty message on `/admin/connections`:
135 | > Warning: Connection passwords are stored in plaintext until you install the Python "cryptography" library. You can find installation instructions here: https://cryptography.io/en/latest/installation/. Once installed, instructions for creating an encryption key will be displayed the next time you import Airflow.
136 |
137 | So I did a simple install of two encryption packages, both recommended in the Airflow Documentation:
138 | ```
139 | pip install cryptography
140 | pip install crypto
141 | ```
142 | and the message went away. I will revisit this when I start setting up connections.
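If the warning ever comes back, or you want to manage the key yourself, you can generate a Fernet key with the `cryptography` package and drop it into the `fernet_key` option under `[core]` in `airflow.cfg`. A minimal sketch:

```python
# Minimal sketch: generate a Fernet key for encrypting connection passwords.
# Copy the printed value into the fernet_key option under [core] in airflow.cfg,
# then restart the webserver and scheduler.
from cryptography.fernet import Fernet

print(Fernet.generate_key().decode())
```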
143 |
144 | :pencil: Still figuring this out...
145 |
146 | ## Clearing out the Default DAGs and Connections
147 | Clearing out the default connections was easy. I just selected them all and used `With Selected > Delete`.
148 |
149 | I added back a single connection to the local Airflow Postgres database.
150 |
151 | I made the mistake of trying to delete the DAGs directly from the Postgres CLI, :warning: bad idea... Instead, just edit `airflow.cfg` and set:
152 |
153 | ```
154 | load_examples = False
155 | ```
156 |
157 | :pencil: I had a problem where connections showed back up after I restarted Airflow; still figuring this out...
158 |
159 | ## Final Notes on Setup
160 | There are probably a few other setup-related things to do, like workers and backups, but I'm all set for now. Next, I move on to setting up my data flows.
161 |
162 | # Working with DAGs
163 | The Airflow Documentation talks a lot about "DAGs", but I found that documentation spread out all over the place. This section will walk you through configuring Airflow to move data from MySQL databases into BigQuery.
164 |
165 | ## Operators and Hooks
166 | Before I get into coding up some things, I think it's important to understand what Operators and Hooks are within Airflow.
167 |
168 | **Operators** generate certain types of tasks; when instantiated, a task becomes a node in the DAG.
169 |
170 | There are 3 main types of operators:
171 | * Operators that perform an action, or tell another system to perform an action
172 | * e.g. `BashOperator`
173 | * Transfer operators move data from one system to another
174 | * e.g. `MySqlToGoogleCloudStorageOperator` or `GoogleCloudStorageToBigQueryOperator`
175 | * Sensors are a certain type of operator that will keep running until a certain criterion is met.
176 | * e.g. `GoogleCloudStorageObjectSensor`
177 |
178 | **Hooks** manage the interaction between systems (e.g. BigQuery) and Airflow. Hooks provide methods like `get_conn` and `get_pandas_df` which interact with systems to access data.
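As a concrete illustration, here is a minimal sketch of using a hook directly, outside of any DAG; it assumes a Postgres connection with the conn id `crypto`, like the one used by `eth_rates.py`:

```python
# Minimal sketch: use a hook to pull data through an Airflow connection.
# Assumes a Postgres connection with conn id 'crypto' exists (as in eth_rates.py)
# and that the 'rates' table from meta/crypto.sql has been created.
from airflow.hooks.postgres_hook import PostgresHook

pg_hook = PostgresHook(postgres_conn_id='crypto')
df = pg_hook.get_pandas_df('SELECT * FROM rates LIMIT 10')  # returns a pandas DataFrame
print(df.head())
```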
179 |
180 | ## Creating and Testing the First DAGs
181 | DAGs are created using Python code. To make a DAG, you create a Python script and save it into the `dags_folder` specified in `airflow.cfg` (which defaults to `~/airflow/dags`). I'm going to create a simple DAG to test that Airflow is finding DAGs correctly, so I'm creating a file called `tutorial.py` in `~/airflow/dags`.
182 |
183 | To make sure Airflow finds the DAG I ran:
184 | ```
185 | airflow list_dags
186 | ```
187 | from the command line.
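If a DAG doesn't show up in `list_dags`, loading a `DagBag` from a Python shell will usually surface the import error. A minimal sketch, assuming the default DAG folder:

```python
# Minimal sketch: load the DAG folder and print any import errors.
# Handy when a DAG you just wrote doesn't appear in `airflow list_dags`.
from airflow.models import DagBag

dagbag = DagBag()                 # loads DAGs from the configured dags_folder
print(list(dagbag.dags.keys()))   # DAG ids that loaded successfully
print(dagbag.import_errors)       # file -> traceback for DAGs that failed to import
```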
188 |
189 | # Hands On Exercises :weight_lifting_man:
190 | The first real DAG I want to make is one that exports data from a MySQL Database and dumps it into Google Cloud Storage.
191 |
192 | ## Creating a MySQL to Google Cloud Storage DAG
193 | ### Setup
194 | To set up for this exercise, I first needed to create some infrastructure to simulate an operational system I could do some ETL on. My setup looked like this:
195 |
196 | 1. A MySQL database on GCP's Cloud SQL, into which I loaded MySQL's "sakila" sample database (see `sakila-database.sql` for the dump file I imported into my instance)
197 | 2. A Google Cloud Storage Bucket I could dump some data into
198 |
199 | ### Instructions
200 | The first task is to demonstrate that I could use the `MySqlToGoogleCloudStorageOperator` to export data from MySQL to a GCS bucket. I crafted this simple DAG `mysql_to_gcs.py`.
201 |
202 | There is some backend configuration work to do before this DAG will run.
203 |
204 | First, set up the connections in Airflow.
205 |
206 | 1. Create a MySQL connection:
207 | ```
208 | Conn Id: sakila_test
209 | Conn Type: MySQL
210 | Host: 10.10.10.10
211 | Schema: sakila
212 | Login: airflow
213 | Password: airflow_password
214 | ```
215 | 2. Create a GCP connection
216 | 1. Create a Service Account, download the credentials you need, and save them somewhere on the Airflow instance. I put mine in `/etc/gcp/creds.json`
217 | 2. Setup the connections:
218 | ```
219 | Conn Id: gcp_test
220 | Conn Type: Google Cloud Platform
221 | Project Id: my-gcp-project-id-00000
222 | Keyfile Path: /etc/gcp/creds.json
223 | Scopes (comma separated): https://www.googleapis.com/auth/cloud-platform
224 | ```
225 |
226 | 3. Install the MySQL dependencies on the Airflow instance:
227 | ```
228 | sudo apt-get install python-mysqldb
229 | pip install pymysql
230 | ```
231 |
232 | 4. Create the `mysql_to_gcs.py` DAG in `~/airflow/dags` (find code in `./dags`)
233 |
234 | 5. Test for python compilation to make sure you don't have any syntax errors:
235 | ```
236 | cd ~/airflow/dags
237 | python mysql_to_gcs.py
238 | ```
239 |
240 | 6. Now test-run the task using Airflow. This actually executes the DAG task as if it were running in Airflow, so expect to see a file created in the bucket you're using (if the MySQL side fails, see the connection check after this list):
241 | ```
242 | airflow test mysql_to_gcs extract_actors 2017-08-11
243 | ```
244 |
245 | 7. Once you've tested it, you're all set!
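If step 6 fails on the MySQL side, it can help to verify the `sakila_test` connection on its own. Here is a minimal sketch, assuming the connection from step 1 and the MySQL client packages from step 3:

```python
# Minimal sketch: verify the 'sakila_test' Airflow connection outside of the DAG.
# Assumes the MySQL connection created in step 1 and the client libraries from step 3.
from airflow.hooks.mysql_hook import MySqlHook

mysql_hook = MySqlHook(mysql_conn_id='sakila_test')
print(mysql_hook.get_records('SELECT COUNT(*) FROM sakila.actor'))  # e.g. [(200,)]
```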
246 |
247 | ## Creating a DAG to Extract multiple tables from multiple MySQL databases to BigQuery
248 | In this exercise, we'll pull data from two MySQL databases, dump it to GCS, and then load it from GCS into BigQuery.
249 | ### Setup
250 | To set up for this exercise, I first needed to create some infrastructure to simulate operational systems I could do some ETL on. My setup looked like this:
251 |
252 | 1. Create your first MySQL database (`sakila_1`) using GCP's Cloud SQL and load MySQL's "sakila" database into it (see `sakila-database.sql` for the dump file I imported into my instance)
253 | 2. Create your second MySQL database (`sakila_2`) using GCP's Cloud SQL and load MySQL's "sakila" database into it (see `sakila-database.sql` for the dump file I imported into my instance)
254 | 3. Create a Google Cloud Storage bucket you can dump some data into
255 | 4. Create BigQuery datasets `sakila_1` and `sakila_2` (could probably make one dataset now that I think about it)
256 |
257 | ### Instructions
258 | 1. Create two MySQL connections:
259 | ```
260 | Conn Id: sakila_1
261 | Conn Type: MySQL
262 | Host: 10.10.10.10
263 | Schema: sakila
264 | Login: airflow
265 | Password: airflow_password
266 | ```
267 | ```
268 | Conn Id: sakila_2
269 | Conn Type: MySQL
270 | Host: 10.10.10.11
271 | Schema: sakila
272 | Login: airflow
273 | Password: airflow_password
274 | ```
275 | 2. Create a GCP connection
276 | 1. Create a Service Account, download the credentials you need, and save them somewhere on the Airflow instance. I put mine in `/etc/gcp/creds.json`
277 | 2. Setup the connections:
278 | ```
279 | Conn Id: gcp_test
280 | Conn Type: Google Cloud Platform
281 | Project Id: my-gcp-project-id-00000
282 | Keyfile Path: /etc/gcp/creds.json
283 | Scopes (comma separated): https://www.googleapis.com/auth/cloud-platform
284 | ```
285 | 3. Create the `sakila_main_tables.py` DAG in `~/airflow/dags` (find code in `./dags`)
286 |
287 | 4. Test for python compilation to make sure you don't have any syntax errors:
288 | ```
289 | cd ~/airflow/dags
290 | python sakila_main_tables.py
291 | ```
292 |
293 | 5. Now test-run the two main tasks using Airflow. This actually executes the DAG tasks as if they were running in Airflow, so expect to see a file created in the bucket and a table created in BigQuery on success (see the verification sketch after this list):
294 | ```
295 | airflow test sakila_main_tables extract_mysql_sakila_1_actor 2017-08-11
296 | airflow test sakila_main_tables load_bq_sakila_1_actor 2017-08-11
297 | ```
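To verify the load actually landed, here is a minimal sketch using the contrib `BigQueryHook`; it assumes the `gcp_test` connection and the `spark-test-173322` project, dataset, and table names used in `sakila_main_tables.py`:

```python
# Minimal sketch: check that a loaded table exists in BigQuery.
# Assumes the 'gcp_test' connection and the project/dataset/table names
# used in sakila_main_tables.py.
from airflow.contrib.hooks.bigquery_hook import BigQueryHook

bq_hook = BigQueryHook(bigquery_conn_id='gcp_test')
print(bq_hook.table_exists(project_id='spark-test-173322',
                           dataset_id='sakila_1',
                           table_id='actor'))  # True once the load task has succeeded
```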
298 |
299 | # Future Work, To-Do Items
300 | - [ ] Add `eth_rates.py` exercise with an example showing how to use `plugins`
301 | - [ ] Figure out Encryption for connections
302 | - [ ] Document setting up for `CeleryExecutor`
303 | - [ ] Include instructions for setting up `systemd`
304 |
305 | # Sponsorship Message
306 | Future work is coming soon thanks to my sponsor:
307 | 
308 |
--------------------------------------------------------------------------------