├── images └── sponser_banner.png ├── meta └── crypto.sql ├── dags ├── mysql_to_gcs.py ├── tutorial.py ├── eth_rates.py └── sakila_main_tables.py ├── LICENSE └── README.md /images/sponser_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikeghen/airflow-tutorial/HEAD/images/sponser_banner.png -------------------------------------------------------------------------------- /meta/crypto.sql: -------------------------------------------------------------------------------- 1 | create table rates ( 2 | id varchar(10), 3 | price numeric(15,2), 4 | last_price numeric(15,2), 5 | volume numeric(15,2), 6 | recorded_at timestamp without time zone default (now() at time zone 'utc') 7 | ); 8 | 9 | create table markets ( 10 | market varchar(32), 11 | id varchar(10), 12 | price numeric(15,2), 13 | volume_btc numeric(15,2), 14 | volume numeric(15,2), 15 | recorded_at timestamp without time zone default (now() at time zone 'utc') 16 | ); 17 | -------------------------------------------------------------------------------- /dags/mysql_to_gcs.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.contrib.operators.mysql_to_gcs import MySqlToGoogleCloudStorageOperator 3 | from airflow.operators.bash_operator import BashOperator 4 | from datetime import datetime, timedelta 5 | 6 | default_args = { 7 | 'owner': 'mikeghen', 8 | 'start_date': datetime(2017, 8, 11), 9 | 'depends_on_past': False, 10 | 'retries': 1, 11 | 'retry_delay': timedelta(minutes=5), 12 | } 13 | 14 | dag = DAG('mysql_to_gcs', default_args=default_args) 15 | 16 | export_actor = MySqlToGoogleCloudStorageOperator( 17 | task_id='extract_actors', 18 | mysql_conn_id='sakila_test', 19 | google_cloud_storage_conn_id='gcp_test', 20 | sql='SELECT * FROM sakila.actor', 21 | bucket='ghen-airflow', 22 | filename='sakila/actors/actors{}.json', 23 | schema_filename='sakila/schemas/actors.json', 24 | dag=dag) 25 | -------------------------------------------------------------------------------- /dags/tutorial.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.bash_operator import BashOperator 3 | from datetime import datetime, timedelta 4 | 5 | 6 | default_args = { 7 | 'owner': 'airflow', 8 | 'depends_on_past': False, 9 | 'start_date': datetime(2015, 6, 1), 10 | 'email': ['airflow@airflow.com'], 11 | 'email_on_failure': False, 12 | 'email_on_retry': False, 13 | 'retries': 1, 14 | 'retry_delay': timedelta(minutes=5), 15 | # 'queue': 'bash_queue', 16 | # 'pool': 'backfill', 17 | # 'priority_weight': 10, 18 | # 'end_date': datetime(2016, 1, 1), 19 | } 20 | 21 | dag = DAG('tutorial', default_args=default_args) 22 | 23 | # t1, t2 and t3 are examples of tasks created by instantiating operators 24 | t1 = BashOperator( 25 | task_id='print_date', 26 | bash_command='date', 27 | dag=dag) 28 | 29 | t2 = BashOperator( 30 | task_id='sleep', 31 | bash_command='sleep 5', 32 | retries=3, 33 | dag=dag) 34 | 35 | templated_command = """ 36 | {% for i in range(5) %} 37 | echo "{{ ds }}" 38 | echo "{{ macros.ds_add(ds, 7)}}" 39 | echo "{{ params.my_param }}" 40 | {% endfor %} 41 | """ 42 | 43 | t3 = BashOperator( 44 | task_id='templated', 45 | bash_command=templated_command, 46 | params={'my_param': 'Parameter I passed in'}, 47 | dag=dag) 48 | 49 | t2.set_upstream(t1) 50 | t3.set_upstream(t1) 51 | 
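A quick way to sanity-check the `tutorial.py` DAG above (assuming it is saved under `~/airflow/dags`, as described in the README below) is to compile it, confirm Airflow picks it up, and test-run a single task; the DAG id `tutorial` and task id `print_date` come from the file itself:
```
python ~/airflow/dags/tutorial.py
airflow list_dags
airflow test tutorial print_date 2015-06-01
```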
--------------------------------------------------------------------------------
/dags/eth_rates.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | import json
3 | 
4 | from airflow.hooks.http_hook import HttpHook
5 | from airflow.hooks.postgres_hook import PostgresHook
6 | from airflow.operators.python_operator import PythonOperator
7 | from airflow.models import DAG
8 | 
9 | PAIR = 'eth/usd'
10 | 
11 | def get_rates(ds, **kwargs):
12 |     # 'crypto' is an Airflow Postgres connection; 'cryptocoincharts_eth' is an Airflow HTTP connection
13 |     pg_hook = PostgresHook(postgres_conn_id='crypto')
14 |     api_hook = HttpHook(http_conn_id='cryptocoincharts_eth', method='GET')
15 | 
16 |     resp = api_hook.run('')
17 |     resp = json.loads(resp.content)
18 | 
19 |     rates_insert = """INSERT INTO rates (id, price, last_price, volume)
20 |                       VALUES (%s, %s, %s, %s);"""
21 |     markets_insert = """INSERT INTO markets (market, id, price, volume_btc, volume)
22 |                         VALUES (%s, %s, %s, %s, %s);"""
23 | 
24 |     pg_hook.run(rates_insert, parameters=(resp['id'], resp['price'], resp['price_before_24h'], resp['volume_second']))
25 | 
26 |     for market in resp['markets']:
27 |         pg_hook.run(markets_insert, parameters=(market['market'], PAIR, market['price'], market['volume_btc'], market['volume']))
28 | 
29 | 
30 | args = {
31 |     'owner': 'mikeghen',
32 |     'depends_on_past': False,
33 |     'start_date': datetime.utcnow(),
34 |     'retries': 1,
35 |     'retry_delay': timedelta(minutes=30),
36 | }
37 | 
38 | dag = DAG(dag_id='eth_rates',
39 |           default_args=args,
40 |           schedule_interval='0 */5 * * *',
41 |           dagrun_timeout=timedelta(seconds=5))
42 | 
43 | get_rates_task = \
44 |     PythonOperator(task_id='get_rates',
45 |                    provide_context=True,
46 |                    python_callable=get_rates,
47 |                    dag=dag)
--------------------------------------------------------------------------------
/dags/sakila_main_tables.py:
--------------------------------------------------------------------------------
1 | from airflow import DAG
2 | from airflow.contrib.operators.mysql_to_gcs import MySqlToGoogleCloudStorageOperator
3 | from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
4 | from airflow.operators.slack_operator import SlackAPIPostOperator
5 | from datetime import datetime, timedelta
6 | 
7 | """
8 | Extracts most tables from sakila_1 and sakila_2 to BigQuery
9 | 
10 | Prerequisites:
11 | 1. sakila_1, sakila_2 connections in Airflow (MySQL)
12 | 2. gcp_test connection in Airflow (Google Cloud Platform)
13 | 3. ghen-airflow bucket in Google Cloud Storage
14 |     a. Make path to ghen-airflow/sakila_1 and ghen-airflow/sakila_2
15 | 4. 
sakila_1, sakila_2 datasets in BigQuerys 16 | """ 17 | 18 | sakila_connections = [ 19 | # NOTE: Also the names of the databases in MySQL and datasets in BigQuery 20 | 'sakila_1', 21 | 'sakila_2' 22 | ] 23 | 24 | sakila_tables = [ 25 | 'actor', 26 | # 'address', NOTE: This table has some sketchy encoding, use another DAG to load 27 | 'category', 28 | 'city', 29 | 'country', 30 | 'customer', 31 | 'film', 32 | 'film_actor', 33 | 'film_category', 34 | 'film_text', 35 | 'inventory', 36 | 'language', 37 | 'payment', 38 | 'rental', 39 | # 'staff', NOTE: This table has some sketchy encoding, use another DAG to load 40 | 'store' 41 | ] 42 | 43 | default_args = { 44 | 'owner': 'mikeghen', 45 | 'start_date': datetime(2017, 8, 11), 46 | 'depends_on_past': False, 47 | 'retries': 1, 48 | 'retry_delay': timedelta(minutes=5), 49 | } 50 | 51 | dag = DAG('sakila_main_tables', default_args=default_args, schedule_interval=timedelta(days=1)) 52 | 53 | slack_notify = SlackAPIPostOperator( 54 | task_id='slack_notify', 55 | token='xxxxxx', 56 | channel='data-status', 57 | username='airflow', 58 | text='Successfully performed sakila ETL operation', 59 | dag=dag) 60 | 61 | for connection in sakila_connections: 62 | for table in sakila_tables: 63 | extract = MySqlToGoogleCloudStorageOperator( 64 | task_id="extract_mysql_%s_%s"%(connection,table), 65 | mysql_conn_id=connection, 66 | google_cloud_storage_conn_id='gcp_test', 67 | sql="SELECT *, '%s' as source FROM sakila.%s"%(connection,table), 68 | bucket='ghen-airflow', 69 | filename="%s/%s/%s{}.json"%(connection,table,table), 70 | schema_filename="%s/schemas/%s.json"%(connection,table), 71 | dag=dag) 72 | 73 | load = GoogleCloudStorageToBigQueryOperator( 74 | task_id="load_bq_%s_%s"%(connection,table), 75 | bigquery_conn_id='gcp_test', 76 | google_cloud_storage_conn_id='gcp_test', 77 | bucket='ghen-airflow', 78 | destination_project_dataset_table="spark-test-173322.%s.%s"%(connection,table), 79 | source_objects=["%s/%s/%s*.json"%(connection,table,table)], 80 | schema_object="%s/schemas/%s.json"%(connection,table), 81 | source_format='NEWLINE_DELIMITED_JSON', 82 | create_disposition='CREATE_IF_NEEDED', 83 | write_disposition='WRITE_TRUNCATE', 84 | project_id='spark-test-173322', 85 | dag=dag) 86 | 87 | load.set_upstream(extract) 88 | slack_notify.set_upstream(load) 89 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 
23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2020 Michael Ghen 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Airflow Tutorial
2 | This documents some of the work I did getting started with Airflow on Google Cloud Platform.
3 | 
4 | :warning: Work in progress :pencil:
5 | 
6 | ## About this Tutorial
7 | I found the tutorial within the Airflow Documentation to be sparse, and I also found that in order to achieve what I was trying to do, I'd have to just read all the documentation. The purpose of this tutorial is to help others get started with Airflow without reading all the documentation. _I'd still recommend reading all the documentation at some point_, but if all you're trying to do is use Airflow to move data from an RDBMS like MySQL or Postgres, this is a great place to start.
8 | 
9 | In this tutorial, I will walk you through setting up Airflow on Google Cloud Platform. I will cover creating a data flow that moves data from MySQL to BigQuery. My goal is to make this tutorial comprehensive enough so that it can be used to configure a production Airflow deployment.
10 | 
11 | # Setup
12 | I'm using Google Cloud Platform for hosting. The end goal here is to take data from 5 MySQL databases and load it into Google BigQuery.
13 | 
14 | ## Installation
15 | I installed Airflow on a Compute Engine instance (running Ubuntu 16). The installation was pretty trivial; simply run:
16 | ```
17 | export AIRFLOW_HOME=~/airflow
18 | pip install airflow
19 | airflow initdb
20 | airflow webserver -p 8080
21 | ```
22 | and I was up and running. You can find [more on the installation in the Airflow Documentation](http://pythonhosted.org/airflow/installation.html).
23 | 
24 | ### About the Installation
25 | Airflow is installed as a Python package and all the configuration files are stored in `~/airflow`.
26 | 
27 | The primary file you need to know about is `~/airflow/airflow.cfg`, which stores the configuration information for Airflow. I will edit it in the next section to set up security.
28 | 
29 | Airflow is a [Flask](http://flask.pocoo.org/) application, by the way.
30 | 
31 | ## Database Setup
32 | According to the Airflow Documentation:
33 | >If you want to take a real test drive of Airflow, you should consider setting up a real database backend and switching to the LocalExecutor.
34 | 
35 | I decided I would just install Postgres on my Airflow instance (Ubuntu 16):
36 | ```
37 | sudo apt-get install postgresql postgresql-contrib
38 | pip install psycopg2
39 | ```
40 | Then, to create a user for Airflow:
41 | ```
42 | $ sudo -u postgres createuser --interactive
43 | Enter name of role to add: airflow
44 | Shall the new role be a superuser? (y/n) n
45 | Shall the new role be allowed to create databases? (y/n) n
46 | Shall the new role be allowed to create more new roles? (y/n) n
47 | ```
48 | Then set the user's password and create the database:
49 | ```
50 | sudo -u postgres psql
51 | psql (9.5.7)
52 | Type "help" for help.
53 | 
54 | postgres=# ALTER USER airflow WITH PASSWORD 'airflow_password';
55 | ALTER ROLE
56 | postgres=# CREATE DATABASE airflow;
57 | CREATE DATABASE
58 | ```
59 | Next, edit `airflow.cfg` to use Postgres by adding:
60 | ```
61 | # The Postgres connection string
62 | sql_alchemy_conn = postgresql://airflow:airflow_password@localhost/airflow
63 | ```
64 | and comment out the SQLite config.
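For reference, a sketch of what the relevant part of the `[core]` section of `airflow.cfg` might look like after the change (assuming you also switch to the `LocalExecutor`, as the quoted documentation suggests; the commented-out lines stand in for the SQLite/SequentialExecutor defaults, which may read slightly differently in your file):
```
[core]
# executor = SequentialExecutor
executor = LocalExecutor

# sql_alchemy_conn = sqlite:////path/to/airflow/airflow.db
sql_alchemy_conn = postgresql://airflow:airflow_password@localhost/airflow
```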
65 | 
66 | Finally, reinitialize the database:
67 | ```
68 | airflow initdb
69 | ```
70 | ### Restarting Airflow
71 | I had to restart Airflow, which wasn't as simple as I expected. I ended up using `kill -9` to kill all the Airflow processes. I tried other solutions posted on Stack Overflow, but eventually just killed the processes using `-9` and restarted with:
72 | ```
73 | airflow webserver -p 8080 -D
74 | ```
75 | :warning: You should really configure systemd (see "Integration with systemd" below).
76 | 
77 | ## Starting the Scheduler
78 | The scheduler needs to be running in order for jobs and tasks to be executed. To start the scheduler, run:
79 | ```
80 | airflow scheduler
81 | ```
82 | 
83 | ## Integration with systemd
84 | :pencil: http://pythonhosted.org/airflow/configuration.html#integration-with-systemd
85 | 
86 | # Sponsorship Message
87 | This Airflow Tutorial is being sponsored by the following tool; please help to support us by taking a look and signing up for a free trial.
88 | 
89 | GitAds
90 | 
91 | 
92 | ## Security
93 | ### User Access Control
94 | One of my concerns was user access control, so after the install I jumped down to the [Security](http://pythonhosted.org/airflow/security.html?highlight=users) portion of the Airflow Documentation. Per the docs:
95 | > By default, all gates are opened. An easy way to restrict access to the web application is to do it at the network level, or by using SSH tunnels.
96 | 
97 | I plan on setting up SSH tunneling in production (until we have a VPN in place), but I still want my users to have to authenticate.
98 | 
99 | To set up Airflow to require usernames and passwords, I edited my `airflow.cfg` file under the `[webserver]` section per the documentation:
100 | 
101 | ```
102 | authenticate = True
103 | auth_backend = airflow.contrib.auth.backends.password_auth
104 | ```
105 | Next, you'll need to install `flask_bcrypt`. Since Airflow is a Flask application, it needs this package to hash passwords for authentication:
106 | ```
107 | pip install flask_bcrypt
108 | ```
109 | Next, I needed to create the initial user by starting Python from the command line:
110 | ```
111 | cd ~/airflow
112 | python
113 | ```
114 | Then in the Python interpreter:
115 | ```python
116 | import airflow
117 | from airflow import models, settings
118 | from airflow.contrib.auth.backends.password_auth import PasswordUser
119 | user = PasswordUser(models.User())
120 | user.username = 'admin'
121 | user.email = 'admin@example.com'
122 | user.password = 'admin_password'
123 | session = settings.Session()
124 | session.add(user)
125 | session.commit()
126 | session.close()
127 | exit()
128 | ```
129 | Finally, restart Airflow.
130 | 
131 | :pencil: I will probably come back later and set up [GitHub Enterprise Auth](http://pythonhosted.org/airflow/security.html#github-enterprise-ghe-authentication)
132 | 
133 | ### Encryption
134 | When clicking around, I ran into this nasty message on `/admin/connections`:
135 | > Warning: Connection passwords are stored in plaintext until you install the Python "cryptography" library. You can find installation instructions here: https://cryptography.io/en/latest/installation/. Once installed, instructions for creating an encryption key will be displayed the next time you import Airflow.
136 | 
137 | So I did a simple install of two encryption packages, both recommended in the Airflow Documentation:
138 | ```
139 | pip install cryptography
140 | pip install crypto
141 | ```
142 | and the message went away. I will revisit this when I start setting up connections.
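If you want connection passwords actually encrypted at rest (rather than just silencing the warning), Airflow expects a Fernet key in the `fernet_key` setting under `[core]` in `airflow.cfg`. A minimal sketch of generating one with the `cryptography` package installed above (you may need to re-save existing connections afterwards so they get encrypted):
```python
from cryptography.fernet import Fernet

# Prints a base64-encoded key; paste it into the fernet_key setting under
# [core] in airflow.cfg, then restart the webserver and scheduler.
print(Fernet.generate_key().decode())
```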
143 | 
144 | :pencil: Still figuring this out...
145 | 
146 | ## Clearing out the Default DAGs and Connections
147 | Clearing out the default connections was easy. I just selected them all and chose `With Selected > Delete`.
148 | 
149 | I added back a single connection to the local Airflow Postgres database.
150 | 
151 | I made the mistake of trying to delete the DAGs from the Postgres CLI, :warning: bad idea... Just edit `airflow.cfg` and set:
152 | 
153 | ```
154 | load_examples = False
155 | ```
156 | 
157 | :pencil: I had a problem where connections showed back up after I restarted Airflow, still figuring this out...
158 | 
159 | ## Final Notes on Setup
160 | There are probably a few other setup-related things to do, like workers and backups, but I'm all set for now. Next, I move on to setting up my data flows.
161 | 
162 | # Working with DAGs
163 | The Airflow Documentation talks a lot about "DAGs", but I found the documentation spread out all over the place. This section will walk you through configuring Airflow to move data from MySQL databases into BigQuery.
164 | 
165 | ## Operators and Hooks
166 | Before I get into coding up some things, I think it's important to understand what Operators and Hooks are within Airflow.
167 | 
168 | **Operators** allow for generation of certain types of tasks that become nodes in the DAG when instantiated.
169 | 
170 | There are 3 main types of operators:
171 | * Operators that perform an action, or tell another system to perform an action
172 |   * e.g. `BashOperator`
173 | * Transfer operators that move data from one system to another
174 |   * e.g. `MySqlToGoogleCloudStorageOperator` or `GoogleCloudStorageToBigQueryOperator`
175 | * Sensors, a certain type of operator that will keep running until a certain criterion is met
176 |   * e.g. `GoogleCloudStorageObjectSensor`
177 | 
178 | **Hooks** manage the interaction between external systems (e.g. BigQuery) and Airflow. Hooks provide methods like `get_conn` and `get_pandas_df` that interact with those systems to access data.
179 | 
180 | ## Creating and Testing the First DAGs
181 | DAGs are created using Python code. To make a DAG, you create a Python script and save it into the `dags_folder` specified in `airflow.cfg` (defaults to `~/airflow/dags`). I'm going to create a simple DAG to test that Airflow is finding DAGs correctly, so I'm creating a file called `tutorial.py` in `~/airflow/dags`.
182 | 
183 | To make sure Airflow finds the DAG, I ran:
184 | ```
185 | airflow list_dags
186 | ```
187 | from the command line.
188 | 
189 | # Hands On Exercises :weight_lifting_man:
190 | The first real DAG I want to make is one that exports data from a MySQL database and dumps it into Google Cloud Storage.
191 | 
192 | ## Creating a MySQL to Google Cloud Storage DAG
193 | ### Setup
194 | To set up for this exercise, I first needed to create some infrastructure to simulate an operational system I could do some ETL on. My setup looked like this:
195 | 
196 | 1. A MySQL database using GCP's Cloud SQL, which I loaded MySQL's "sakila" database into (see `sakila-database.sql` for the dump file I imported into my instance)
197 | 2. A Google Cloud Storage bucket I could dump some data into
198 | 
199 | ### Instructions
200 | The first task is to demonstrate that I could use the `MySqlToGoogleCloudStorageOperator` to export data from MySQL to a GCS bucket. I crafted this simple DAG, `mysql_to_gcs.py`.
201 | 
202 | There is some backend configuration work to do before this DAG will run.
203 | 
204 | First, set up the connections in Airflow.
205 | 
206 | 1. 
Create a MySQL connection:
207 | ```
208 | Conn Id: sakila_test
209 | Conn Type: MySQL
210 | Host: 10.10.10.10
211 | Schema: sakila
212 | Login: airflow
213 | Password: airflow_password
214 | ```
215 | 2. Create a GCP connection
216 |     1. Create a Service Account and download the credentials you need, then save them somewhere on the Airflow instance. I put mine in `/etc/gcp/creds.json`
217 |     2. Set up the connection:
218 | ```
219 | Conn Id: gcp_test
220 | Conn Type: Google Cloud Platform
221 | Project Id: my-gcp-project-id-00000
222 | Keyfile Path: /etc/gcp/creds.json
223 | Scopes (comma separated): https://www.googleapis.com/auth/cloud-platform
224 | ```
225 | 
226 | 3. Install the MySQL dependencies on the Airflow instance:
227 | ```
228 | sudo apt-get install python-mysqldb
229 | pip install pymysql
230 | ```
231 | 
232 | 4. Create the `mysql_to_gcs.py` DAG in `~/airflow/dags` (find the code in `./dags`)
233 | 
234 | 5. Test for Python compilation to make sure you don't have any syntax errors:
235 | ```
236 | cd ~/airflow/dags
237 | python mysql_to_gcs.py
238 | ```
239 | 
240 | 6. Now test-run the task using Airflow. This will actually execute the task as if it were running in Airflow, so expect to see a file created in the bucket you're using. Note that `airflow test` takes the DAG id (`mysql_to_gcs`), the task id (`extract_actors`), and an execution date in `YYYY-MM-DD` format:
241 | ```
242 | airflow test mysql_to_gcs extract_actors 2017-08-11
243 | ```
244 | 
245 | 7. Once you've tested it, you're all set!
246 | 
247 | ## Creating a DAG to Extract Multiple Tables from Multiple MySQL Databases to BigQuery
248 | In this exercise, we'll pull data from two MySQL databases, dump it to GCS, then load it from GCS to BigQuery.
249 | ### Setup
250 | To set up for this exercise, I first needed to create some infrastructure to simulate an operational system I could do some ETL on. My setup looked like this:
251 | 
252 | 1. Create your first MySQL database (`sakila_1`) using GCP's Cloud SQL and load MySQL's "sakila" database into it (see `sakila-database.sql` for the dump file I imported into my instance)
253 | 2. Create your second MySQL database (`sakila_2`) using GCP's Cloud SQL and load MySQL's "sakila" database into it (see `sakila-database.sql` for the dump file I imported into my instance)
254 | 3. Create a Google Cloud Storage bucket you can dump some data into
255 | 4. Create a BigQuery dataset for each of `sakila_1` and `sakila_2` (could probably make one dataset now that I think about it)
256 | 
257 | ### Instructions
258 | 1. Create two MySQL connections:
259 | ```
260 | Conn Id: sakila_1
261 | Conn Type: MySQL
262 | Host: 10.10.10.10
263 | Schema: sakila
264 | Login: airflow
265 | Password: airflow_password
266 | ```
267 | ```
268 | Conn Id: sakila_2
269 | Conn Type: MySQL
270 | Host: 10.10.10.11
271 | Schema: sakila
272 | Login: airflow
273 | Password: airflow_password
274 | ```
275 | 2. Create a GCP connection
276 |     1. Create a Service Account and download the credentials you need, then save them somewhere on the Airflow instance. I put mine in `/etc/gcp/creds.json`
277 |     2. Set up the connection:
278 | ```
279 | Conn Id: gcp_test
280 | Conn Type: Google Cloud Platform
281 | Project Id: my-gcp-project-id-00000
282 | Keyfile Path: /etc/gcp/creds.json
283 | Scopes (comma separated): https://www.googleapis.com/auth/cloud-platform
284 | ```
285 | 3. Create the `sakila_main_tables.py` DAG in `~/airflow/dags` (find the code in `./dags`)
286 | 
287 | 4. Test for Python compilation to make sure you don't have any syntax errors:
288 | ```
289 | cd ~/airflow/dags
290 | python sakila_main_tables.py
291 | ```
292 | 
293 | 5. 
Now test-run the two main tasks using Airflow. This will actually execute each task as if it were running in Airflow, so expect to see a file created in the bucket and a table created in BigQuery when you get a success. As before, pass the DAG id, the task id, and an execution date:
294 | ```
295 | airflow test sakila_main_tables extract_mysql_sakila_1_actor 2017-08-11
296 | airflow test sakila_main_tables load_bq_sakila_1_actor 2017-08-11
297 | ```
298 | 
299 | # Future Work, To-Do Items
300 | - [ ] Add `eth_rates.py` exercise with an example showing how to use `plugins`
301 | - [ ] Figure out Encryption for connections
302 | - [ ] Document setting up for `CeleryExecutor`
303 | - [ ] Include instructions for setting up `systemd`
304 | 
305 | # Sponsorship Message
306 | Future work coming soon thanks to my sponsor:
307 | ![GitAds](https://images.gitads.io/airflow-tutorial)
308 | --------------------------------------------------------------------------------
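Until the `eth_rates.py` exercise is written up, note that the DAG in `./dags` appears to assume the tables from `meta/crypto.sql` already exist in a Postgres database and that two Airflow connections are defined with the connection IDs used in the code (the host below is a placeholder; point it at whatever price API endpoint you use for the `eth/usd` pair):
```
Conn Id: crypto
Conn Type: Postgres

Conn Id: cryptocoincharts_eth
Conn Type: HTTP
Host: <price API endpoint for the eth/usd pair>
```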