├── .gitignore ├── LICENSE ├── Makefile ├── Pipfile ├── Pipfile.lock ├── README.md ├── azure-pipelines.yml ├── dags ├── generate_twitter.py ├── parameters.py ├── simple_dag.py ├── subdags │ └── twitter_subdag.py └── twitter_airflow.py ├── docs ├── .buildinfo ├── .nojekyll ├── _sources │ ├── index.rst.txt │ └── setup.md.txt ├── _static │ ├── alabaster.css │ ├── basic.css │ ├── custom.css │ ├── doctools.js │ ├── documentation_options.js │ ├── file.png │ ├── jquery-3.2.1.js │ ├── jquery.js │ ├── language_data.js │ ├── minus.png │ ├── plus.png │ ├── pygments.css │ ├── python.png │ ├── searchtools.js │ ├── underscore-1.3.1.js │ └── underscore.js ├── genindex.html ├── html │ ├── .buildinfo │ ├── .nojekyll │ ├── _images │ │ ├── 12.png │ │ ├── 4.jpg │ │ ├── DAG.png │ │ ├── airflow-logo.jpeg │ │ ├── airflow.png │ │ ├── architecture.png │ │ ├── automation1.jpg │ │ ├── azure.png │ │ ├── gooddata.png │ │ ├── gooddata1.png │ │ ├── luigi.png │ │ ├── mssignin.png │ │ ├── twitter1.png │ │ ├── twitter2.png │ │ ├── twitter3.png │ │ └── uses.png │ ├── _sources │ │ ├── about.md.txt │ │ ├── airflow-intro.md.txt │ │ ├── azure.md.txt │ │ ├── first-airflow.md.txt │ │ ├── index.rst.txt │ │ ├── pipelines.md.txt │ │ └── setup.rst.txt │ ├── _static │ │ ├── 12.png │ │ ├── 4.jpg │ │ ├── DAG.png │ │ ├── GUI.png │ │ ├── airflow-logo.jpeg │ │ ├── airflow.png │ │ ├── alabaster.css │ │ ├── architecture.png │ │ ├── automation1.jpg │ │ ├── azure.png │ │ ├── basic.css │ │ ├── connection.png │ │ ├── custom.css │ │ ├── datapyramid.png │ │ ├── doctools.js │ │ ├── documentation_options.js │ │ ├── file.png │ │ ├── gooddata.png │ │ ├── gooddata1.png │ │ ├── jquery-3.2.1.js │ │ ├── jquery.js │ │ ├── language_data.js │ │ ├── luigi.png │ │ ├── minus.png │ │ ├── mssignin.png │ │ ├── pipeline1.png │ │ ├── plus.png │ │ ├── pygments.css │ │ ├── python.png │ │ ├── searchtools.js │ │ ├── twitter1.png │ │ ├── twitter2.png │ │ ├── twitter3.png │ │ ├── underscore-1.3.1.js │ │ ├── underscore.js │ │ └── uses.png │ ├── about.html │ ├── airflow-intro.html │ ├── azure.html │ ├── first-airflow.html │ ├── genindex.html │ ├── index.html │ ├── objects.inv │ ├── pipelines.html │ ├── search.html │ ├── searchindex.js │ └── setup.html ├── index.html ├── objects.inv ├── search.html ├── searchindex.js └── setup.html ├── environment.yaml ├── make.bat ├── requirements.txt ├── solutions ├── dags │ ├── dags │ │ ├── data │ │ │ └── tweets │ │ │ │ └── #pycon since:2019-04-30 until:2019-05-01_05012019070912.csv │ │ ├── generate_twitter.py │ │ ├── parameters.py │ │ ├── simple_dag.py │ │ ├── subdags │ │ │ └── twitter_subdag.py │ │ └── twitter_airflow.py │ └── twitter_airflow.py └── etl-basic │ ├── analyse_twitter.py │ ├── etl.sh │ ├── stream_twitter.py │ ├── stream_twitter_alt.py │ └── stream_twitter_timed.py └── source ├── _static ├── 12.png ├── 4.jpg ├── DAG.png ├── GUI.png ├── airflow-logo.jpeg ├── airflow.png ├── architecture.png ├── automation1.jpg ├── azure.png ├── connection.png ├── custom.css ├── datapyramid.png ├── gooddata.png ├── gooddata1.png ├── luigi.png ├── mssignin.png ├── pipeline1.png ├── python.png ├── twitter1.png ├── twitter2.png ├── twitter3.png └── uses.png ├── _templates └── sidebarlogo.html ├── about.md ├── airflow-intro.md ├── azure.md ├── conf.py ├── first-airflow.md ├── index.rst ├── pipelines.md └── setup.rst /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | \.vscode/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | 
*.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | source/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # celery beat schedule file 97 | celerybeat-schedule 98 | 99 | # SageMath parsed files 100 | *.sage.py 101 | 102 | # Environments 103 | .env 104 | .venv 105 | env/ 106 | venv/ 107 | ENV/ 108 | env.bak/ 109 | venv.bak/ 110 | 111 | # Spyder project settings 112 | .spyderproject 113 | .spyproject 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | # mkdocs documentation 119 | /site 120 | 121 | # mypy 122 | .mypy_cache/ 123 | .dmypy.json 124 | dmypy.json 125 | 126 | # Pyre type checker 127 | .pyre/ 128 | 129 | \.DS_Store 130 | 131 | docs/\.doctrees/ 132 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
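# For example, `make html` builds the HTML docs from ./source into ./build/html
# (assuming sphinx-build is installed and available on PATH).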
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | sphinx = "*" 8 | sphinxcontrib-inlinesyntaxhighlight = "*" 9 | pylint = "*" 10 | recommonmark = "*" 11 | 12 | [packages] 13 | jupyter = "*" 14 | jupyterlab = "*" 15 | papermill = "*" 16 | celery = "*" 17 | mysqlclient = "*" 18 | tweepy = "*" 19 | numpy = "*" 20 | pandas = "*" 21 | hypothesis = "*" 22 | matplotlib = "*" 23 | seaborn = "*" 24 | mysql-connector-python = "*" 25 | apache-airflow = {extras = ["celery", "mysql"],version = "*"} 26 | 27 | [requires] 28 | python_version = "3.7" 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Airflow Tutorial 2 | 3 | ![License: CC BY 4.0](https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg) 4 | 5 | This repo contains the materials for the pipelines tutorial on Pycon -> from scripts soups to Airflow. 6 | 7 | The tutorial covers: 8 | 9 | - Setting up local databases 10 | - Creating basic ETL pipelines in Python: query APIs, load data to databases, perform data cleaning and filtering and persist the consumption ready data 11 | - How to set a local instance of Airflow and get it running 12 | - Creating basic DAGS in Airflow 13 | - Transform script soups ETLS into Airflow dags 14 | - Set up an Airflow instance in Azure 15 | 16 | To add: 17 | - Setting a Kubernetes powered instance on Azure AKS 18 | - Adding CI/CD to using Azure pipelines 19 | 20 | If you are interested in following along visit: 21 | 22 | 23 | The setup instructions can be found at: [https://airflow-tutorial.readthedocs.io/en/latest/setup.html](https://airflow-tutorial.readthedocs.io/en/latest/setup.html) 24 | 25 | If you would like to experiment with Azure [follow this link](https://azure.microsoft.com/en-us/free//?wt.mc_id=PyCon-github-taallard) to get a free trial subscription with 150 dollars. 26 | 27 | 28 | 🚀 PRs and Issues are welcome 29 | 30 | ### License 31 | 32 | [![License: CC BY 4.0](https://licensebuttons.net/l/by/4.0/80x15.png)](https://creativecommons.org/licenses/by/4.0/) 33 | 34 | 35 | This repo is licensed using a CC-BY so you are free to use, remix, and share so long attribution is provided to the original author. 
36 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | # Publish the docs to GitHub pages 2 | 3 | # we only build and publish when changes ocurr to the master branch 4 | trigger: 5 | - master 6 | 7 | pool: 8 | vmImage: 'Ubuntu-16.04' 9 | 10 | steps: 11 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/yaml-schema?view=azdevops&tabs=schema#checkout 12 | - checkout: self 13 | persistCredentials: true # set to 'true' to leave the OAuth token in the Git config after the initial fetch 14 | 15 | - task: UsePythonVersion@0 16 | inputs: 17 | versionSpec: '3.7' 18 | addToPath: true 19 | displayName: 'Using defined Python version' 20 | 21 | - script: | 22 | python -m pip install --upgrade pip pipenv 23 | pipenv install --dev --system --deploy 24 | displayName: 'Install dependencies via Pipfile' 25 | 26 | - script: | 27 | sphinx-build -n -b html ./source $(Build.ArtifactStagingDirectory)/build/html 28 | displayName: 'Building Sphinx docs' 29 | 30 | - script: | 31 | git config --local user.name "Tania Allard" 32 | git config --local user.email "trallard@bitsandchips.me" 33 | cp -a $(Build.ArtifactStagingDirectory)/build/html/ $(Build.Repository.LocalPath)/docs 34 | rm -rf $(Build.Repository.LocalPath)/docs/html/.doctrees 35 | displayName: 'Copy artifacts to clean branch' 36 | 37 | - script: | 38 | cd $(Build.Repository.LocalPath) 39 | git add --all 40 | git commit -m "Build documentation [skip ci]" 41 | git push origin HEAD:master 42 | displayName: 'Publish GitHub Pages' 43 | condition: | 44 | and(not(eq(variables['Build.Reason'], 'PullRequest')), 45 | eq(variables['Build.SourceBranch'], 'refs/heads/master')) -------------------------------------------------------------------------------- /dags/generate_twitter.py: -------------------------------------------------------------------------------- 1 | """ Simple example of creating subdags and generating work dynamically""" 2 | from airflow import DAG 3 | from airflow.hooks import SqliteHook 4 | 5 | from airflow.hooks.mysql_hook import MySqlHook 6 | from airflow.models import Variable 7 | from airflow.operators.email_operator import EmailOperator 8 | from airflow.operators.python_operator import PythonOperator, BranchPythonOperator 9 | from airflow.operators.bash_operator import BashOperator 10 | from airflow.operators.subdag_operator import SubDagOperator 11 | 12 | 13 | from twitter_airflow import search_twitter, RAW_TWEET_DIR 14 | from subdags.twitter_subdag import subdag 15 | from datetime import datetime, timedelta 16 | import pandas as pd 17 | import re 18 | import random 19 | 20 | 21 | SEARCH_TERMS = ["#python", "#pydata", "#airflow", "data wrangling", "data pipelines"] 22 | 23 | 24 | default_args = { 25 | "owner": "admin", 26 | "depends_on_past": False, 27 | "start_date": datetime.now() - timedelta(days=4), 28 | "retries": 1, 29 | "retry_delay": timedelta(minutes=5), 30 | } 31 | 32 | dag = DAG( 33 | "generate_twitter_dags", default_args=default_args, schedule_interval="@daily" 34 | ) 35 | 36 | 37 | def fill_terms(my_terms=SEARCH_TERMS, **kwargs): 38 | """ Fill sqlite database with a few search terms. 
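The terms are written to a `twitter_terms` table through the `mysql_default` Airflow connection; if the table already exists, the ValueError raised by `DataFrame.to_sql` is swallowed, so re-running the task is effectively a no-op.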
""" 39 | dbconn = MySqlHook(mysql_conn_id="mysql_default") 40 | conn = dbconn.get_connection() 41 | cursor = conn.cursor() 42 | df = pd.DataFrame(my_terms, columns=["search_term"]) 43 | try: 44 | df.to_sql("twitter_terms", conn) 45 | except ValueError: 46 | # table already exists 47 | pass 48 | 49 | 50 | def generate_search_terms(**kwargs): 51 | """ Generate subdag to search twitter for terms. """ 52 | dbconn = MySqlHook(mysql_conn_id="mysql_default") 53 | conn = dbconn.get_connection() 54 | cursor = conn.cursor() 55 | query = "select * from twitter_terms" 56 | df = pd.read_sql_query(query, conn) 57 | return random.choice( 58 | [ 59 | "search_{}_twitter".format(re.sub(r"\W+", "", t)) 60 | for t in df.search_term.values 61 | ] 62 | ) 63 | 64 | 65 | fill_search_terms = PythonOperator( 66 | task_id="fill_terms", provide_context=True, python_callable=fill_terms, dag=dag 67 | ) 68 | 69 | 70 | gen_search_terms = BranchPythonOperator( 71 | task_id="generate_search_terms", 72 | provide_context=True, 73 | python_callable=generate_search_terms, 74 | dag=dag, 75 | ) 76 | 77 | 78 | email_links = EmailOperator( 79 | task_id="email_best_links", 80 | to="MYEMAIL@MYSITE.com", 81 | subject="Latest popular links", 82 | html_content="Check out the latest!!", 83 | files=["{}/latest_links.txt".format(RAW_TWEET_DIR)], 84 | dag=dag, 85 | ) 86 | 87 | 88 | sub = SubDagOperator( 89 | subdag=subdag, task_id="insert_and_id_pop", trigger_rule="one_success", dag=dag 90 | ) 91 | 92 | 93 | clear_latest = BashOperator( 94 | bash_command="rm -rf {}/latest_links.txt".format(RAW_TWEET_DIR), 95 | task_id="clear_latest", 96 | dag=dag, 97 | ) 98 | 99 | 100 | gen_search_terms.set_upstream(fill_search_terms) 101 | 102 | for term in SEARCH_TERMS: 103 | term_without_punctuation = re.sub(r"\W+", "", term) 104 | simple_search = PythonOperator( 105 | task_id="search_{}_twitter".format(term_without_punctuation), 106 | provide_context=True, 107 | python_callable=search_twitter, 108 | dag=dag, 109 | params={"query": term}, 110 | ) 111 | simple_search.set_upstream(gen_search_terms) 112 | simple_search.set_downstream(sub) 113 | 114 | sub.set_downstream(email_links) 115 | email_links.set_downstream(clear_latest) 116 | -------------------------------------------------------------------------------- /dags/parameters.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example uses the existing Dummy Operator and Variable model to 3 | demonstrate dynamic creation of DAGs based on a Variable setting. As 4 | shown below, a list of customer objects is retrieved and used to create 5 | unique dags based on the imput. 
6 | """ 7 | 8 | from datetime import datetime, timedelta 9 | from airflow.models import DAG 10 | from airflow.models import Variable 11 | from airflow.operators.dummy_operator import DummyOperator 12 | 13 | # Create JSON Variable if it doesn't exist 14 | 15 | CUSTOMERS = [ 16 | { 17 | "customer_name": "Faux Customer", 18 | "customer_id": "faux_customer", 19 | "email": ["admin@fauxcustomer.com", "admin@astronomer.io"], 20 | "schedule_interval": None, 21 | "enabled": True, 22 | }, 23 | { 24 | "customer_name": "Bogus Customer", 25 | "customer_id": "bogus_customer", 26 | "email": ["admin@boguscustomer.com", "admin@astronomer.io"], 27 | "schedule_interval": "@once", 28 | "enabled": True, 29 | }, 30 | ] 31 | 32 | # Get JSON Variable 33 | CUSTOMERS = Variable.get("customer_list", default_var=CUSTOMERS, deserialize_json=True) 34 | 35 | 36 | def create_dag(customer): 37 | """ 38 | Accepts a customer parameters dict and 39 | overrides default args to create a DAG object 40 | 41 | Returns: DAG() Object 42 | """ 43 | default_args = { 44 | "owner": "airflow", 45 | "depends_on_past": False, 46 | "email": "xyz@xyz.com", 47 | "retries": 1, 48 | "retry_delay": timedelta(minutes=5), 49 | "start_date": datetime(2017, 1, 1, 0, 0), 50 | "end_date": None, 51 | } 52 | 53 | """ 54 | This allows DAG parameters to be passed in from the Variable if 55 | a customer needs something specific overridden in their DAG. 56 | Consider how email being passed in from the customer object 57 | overrides email in the resulting replaced_args object. 58 | """ 59 | replaced_args = { 60 | k: default_args[k] if customer.get(k, None) is None else customer[k] 61 | for k in default_args 62 | } 63 | 64 | dag_id = "{base_name}_{id}".format( 65 | base_name="load_clickstream_data", id=customer["customer_id"] 66 | ) 67 | 68 | return DAG( 69 | dag_id=dag_id, 70 | default_args=replaced_args, 71 | schedule_interval=customer["schedule_interval"], 72 | ) 73 | 74 | # Loop customers array of containing customer objects 75 | for cust in CUSTOMERS: 76 | if cust["enabled"]: 77 | 78 | dag = create_dag(cust) 79 | 80 | globals()[dag.dag_id] = dag 81 | 82 | extract = DummyOperator(task_id="extract_data", dag=dag) 83 | 84 | transform = DummyOperator(task_id="transform_data", dag=dag) 85 | 86 | load = DummyOperator(task_id="load_data", dag=dag) 87 | 88 | extract >> transform >> load 89 | 90 | else: 91 | # TODO Create but programmatically pause 92 | pass 93 | -------------------------------------------------------------------------------- /dags/simple_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | 7 | 8 | def print_hello(): 9 | return "Hello world!" 
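# default_args below are applied to every task in this DAG unless an operator
# overrides them explicitly (e.g. retries=3 on the DummyOperator further down).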
10 | 11 | 12 | default_args = { 13 | "owner": "airflow", 14 | "depends_on_past": False, 15 | "start_date": datetime(2019, 4, 30), 16 | "email": ["airflow@example.com"], 17 | "email_on_failure": False, 18 | "email_on_retry": False, 19 | "retries": 1, 20 | "retry_delay": timedelta(minutes=2), 21 | } 22 | 23 | dag = DAG( 24 | "hello_world", 25 | description="Simple tutorial DAG", 26 | schedule_interval="0 12 * * *", 27 | default_args=default_args, 28 | catchup=False, 29 | ) 30 | 31 | t1 = DummyOperator(task_id="dummy_task", retries=3, dag=dag) 32 | 33 | t2 = PythonOperator(task_id="hello_task", python_callable=print_hello, dag=dag) 34 | 35 | # sets downstream foe t1 36 | t1 >> t2 37 | 38 | # equivalent 39 | # t2.set_upstream(t1) 40 | -------------------------------------------------------------------------------- /dags/subdags/twitter_subdag.py: -------------------------------------------------------------------------------- 1 | """ Simple subdag example """ 2 | from airflow import DAG 3 | from airflow.operators import PythonOperator 4 | from twitter_airflow import csv_to_sql, identify_popular_links 5 | from datetime import datetime, timedelta 6 | 7 | 8 | default_args = { 9 | "owner": "admin", 10 | "depends_on_past": False, 11 | "start_date": datetime(2016, 1, 1), 12 | "retries": 1, 13 | "retry_delay": timedelta(minutes=5), 14 | } 15 | 16 | subdag = DAG("generate_twitter_dags.insert_and_id_pop", default_args=default_args) 17 | 18 | move_tweets_to_sql = PythonOperator( 19 | task_id="csv_to_sqlite", 20 | provide_context=True, 21 | python_callable=csv_to_sql, 22 | dag=subdag, 23 | ) 24 | 25 | id_popular = PythonOperator( 26 | task_id="identify_popular_links", 27 | provide_context=True, 28 | python_callable=identify_popular_links, 29 | dag=subdag, 30 | params={"write_mode": "a"}, 31 | ) 32 | 33 | id_popular.set_upstream(move_tweets_to_sql) 34 | -------------------------------------------------------------------------------- /dags/twitter_airflow.py: -------------------------------------------------------------------------------- 1 | """ Simple Airflow data pipeline example using Twitter API """ 2 | import ast 3 | import glob 4 | import itertools 5 | import os.path 6 | import shutil 7 | from collections import Counter 8 | from configparser import ConfigParser 9 | from csv import DictWriter, writer 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | import MySQLdb 13 | import MySQLdb.cursors 14 | 15 | import pandas as pd 16 | from tweepy import API, Cursor, OAuthHandler 17 | 18 | from airflow import DAG 19 | from airflow.hooks import sqlite_hook 20 | from airflow.hooks.mysql_hook import MySqlHook 21 | from airflow.models import Variable 22 | from airflow.operators.email_operator import EmailOperator 23 | from airflow.operators.python_operator import PythonOperator 24 | 25 | RAW_TWEET_DIR = os.path.abspath(os.path.join(__file__, "../data/tweets/")) 26 | CONFIG_FILE = os.path.abspath(os.path.join(__file__, "../config/prod.cfg")) 27 | MAX_TWEEPY_PAGE = 2 28 | 29 | # since there do not exist task on their own we need to create the DAG 30 | default_args = { 31 | "owner": "admin", 32 | "depends_on_past": False, 33 | "start_date": datetime.now() - timedelta(days=5), 34 | "retries": 1, 35 | "retry_delay": timedelta(minutes=5), 36 | } 37 | 38 | dag = DAG("twitter_links", default_args=default_args, schedule_interval="@daily") 39 | 40 | 41 | def extract_tweet_data(tweepy_obj, query): 42 | """ Extract relevant and serializable data from a tweepy Tweet object 43 | params: 44 | 
tweepy_obj: Tweepy Tweet Object 45 | query: str 46 | returns dict 47 | """ 48 | return { 49 | "user_id": tweepy_obj.user.id, 50 | "user_name": tweepy_obj.user.name, 51 | "user_screenname": tweepy_obj.user.screen_name, 52 | "user_url": tweepy_obj.user.url, 53 | "user_description": tweepy_obj.user.description, 54 | "user_followers": tweepy_obj.user.followers_count, 55 | "user_friends": tweepy_obj.user.friends_count, 56 | "created": tweepy_obj.created_at.isoformat(), 57 | "text": tweepy_obj.text, 58 | "hashtags": [ht.get("text") for ht in tweepy_obj.entities.get("hashtags")], 59 | "mentions": [ 60 | (um.get("id"), um.get("screen_name")) 61 | for um in tweepy_obj.entities.get("user_mentions") 62 | ], 63 | "urls": [url.get("expanded_url") for url in tweepy_obj.entities.get("urls")], 64 | "tweet_id": tweepy_obj.id, 65 | "is_quote_status": tweepy_obj.is_quote_status, 66 | "favorite_count": tweepy_obj.favorite_count, 67 | "retweet_count": tweepy_obj.retweet_count, 68 | "reply_status_id": tweepy_obj.in_reply_to_status_id, 69 | "lang": tweepy_obj.lang, 70 | "source": tweepy_obj.source, 71 | "location": tweepy_obj.coordinates, 72 | "query": query, 73 | } 74 | 75 | 76 | def search_twitter(**kwargs): 77 | """ Search for a query in public tweets""" 78 | query = kwargs.get("params").get("query") 79 | 80 | auth = OAuthHandler(Variable.get("consumer_key"), Variable.get("consumer_secret")) 81 | auth.set_access_token( 82 | Variable.get("access_token"), Variable.get("access_token_secret") 83 | ) 84 | api = API(auth) 85 | 86 | all_tweets = [] 87 | page_num = 0 88 | since_date = datetime.strptime(kwargs.get("ds"), "%Y-%m-%d").date() - timedelta( 89 | days=1 90 | ) 91 | query += " since:{} until:{}".format( 92 | since_date.strftime("%Y-%m-%d"), kwargs.get("ds") 93 | ) 94 | print(f"searching twitter with: {query}") 95 | for page in Cursor( 96 | api.search, q=query, monitor_rate_limit=True, wait_on_rate_limit=True 97 | ).pages(): 98 | all_tweets.extend([extract_tweet_data(t, query) for t in page]) 99 | page_num += 1 100 | if page_num > MAX_TWEEPY_PAGE: 101 | break 102 | 103 | # if it's an empty list, stop here 104 | if not len(all_tweets): 105 | return 106 | 107 | filename = "{}/{}_{}.csv".format( 108 | RAW_TWEET_DIR, query, datetime.now().strftime("%m%d%Y%H%M%S") 109 | ) 110 | 111 | # check that the directory exists 112 | if not Path(filename).resolve().parent.exists(): 113 | 114 | os.mkdir(Path(filename).resolve().parent) 115 | 116 | with open(filename, "w") as raw_file: 117 | raw_wrtr = DictWriter(raw_file, fieldnames=all_tweets[0].keys()) 118 | raw_wrtr.writeheader() 119 | raw_wrtr.writerows(all_tweets) 120 | 121 | 122 | def csv_to_sql(directory=RAW_TWEET_DIR, **kwargs): 123 | """ csv to sql pipeline using pandas 124 | params: 125 | directory: str (file path to csv files) 126 | """ 127 | dbconn = MySqlHook(mysql_conn_id="mysql_default") 128 | conn = dbconn.get_connection() 129 | cursor = conn.cursor() 130 | 131 | for fname in glob.glob("{}/*.csv".format(directory)): 132 | if "_read" not in fname: 133 | try: 134 | df = pd.read_csv(fname) 135 | df.to_sql("tweets", dbconn, if_exists="append", index=False) 136 | shutil.move(fname, fname.replace(".csv", "_read.csv")) 137 | except pd.io.common.EmptyDataError: 138 | # probably an io error with another task / open file 139 | continue 140 | 141 | 142 | def identify_popular_links(directory=RAW_TWEET_DIR, write_mode="w", **kwargs): 143 | """ Identify the most popular links from the last day of tweest in the db 144 | Writes them to latest_links.txt in the RAW_TWEET_DIR 
145 | (or directory kwarg) 146 | """ 147 | dbconn = MySqlHook(mysql_conn_id="mysql_default") 148 | conn = dbconn.get_connection() 149 | cursor = conn.cursor() 150 | 151 | query = """select * from tweets where 152 | created > date('now', '-1 days') and urls is not null 153 | order by favorite_count""" 154 | df = pd.read_sql_query(query, conn) 155 | df.urls = df.urls.map(ast.literal_eval) 156 | cntr = Counter(itertools.chain.from_iterable(df.urls.values)) 157 | with open("{}/latest_links.txt".format(directory), write_mode) as latest: 158 | wrtr = writer(latest) 159 | wrtr.writerow(["url", "count"]) 160 | wrtr.writerows(cntr.most_common(5)) 161 | 162 | 163 | # -------------------------------------- 164 | # Tasks 165 | # ------------------------------------- 166 | simple_search = PythonOperator( 167 | task_id="search_twitter", 168 | provide_context=True, 169 | python_callable=search_twitter, 170 | dag=dag, 171 | # note we pass this as a params obj 172 | params={"query": "#pycon"}, 173 | ) 174 | 175 | 176 | move_tweets_to_sql = PythonOperator( 177 | task_id="csv_to_sql", 178 | # extra DAG context 179 | provide_context=True, 180 | # call the function 181 | python_callable=csv_to_sql, 182 | dag=dag, 183 | ) 184 | 185 | 186 | id_popular = PythonOperator( 187 | task_id="identify_popular_links", 188 | provide_context=True, 189 | python_callable=identify_popular_links, 190 | dag=dag, 191 | ) 192 | 193 | 194 | email_links = EmailOperator( 195 | task_id="email_best_links", 196 | to="trallard@bitsandchips.me", 197 | subject="Latest popular links", 198 | html_content="Check out the latest!!", 199 | files=["{}/latest_links.txt".format(RAW_TWEET_DIR)], 200 | dag=dag, 201 | ) 202 | 203 | 204 | simple_search.set_downstream(move_tweets_to_sql) 205 | id_popular.set_upstream(move_tweets_to_sql) 206 | email_links.set_upstream(id_popular) 207 | -------------------------------------------------------------------------------- /docs/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: 67cd5c5b948c82ac9d91d9479af6e978 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/.nojekyll -------------------------------------------------------------------------------- /docs/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. Airflow tutorial documentation master file, created by 2 | sphinx-quickstart on Mon Apr 15 15:52:00 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Airflow tutorial 7 | ============================================ 8 | This tutorial was originally developed for PyCon US 2019. 9 | 10 | .. toctree:: 11 | :caption: Table of Contents 12 | :hidden: 13 | :maxdepth: 2 14 | 15 | setup 16 | 17 | 18 | .. toctree:: 19 | :maxdepth: 2 20 | :caption: Contents: 21 | 22 | About your facilitator 23 | ====================== 24 | 25 | My name is Tania. I live in Manchester UK where I work as a 26 | Cloud Advocate for Microsoft. 
27 | 28 | Over the years, I have worked as a data engineer, machine learning engineer, 29 | and research software engineer. I love data intensive 30 | enviroments and I am particularly interested in the tools and workflows to 31 | deliver robust, reproducible data insights. 32 | 33 | If you have any questions or feedback about this tutorial please, 34 | file an issue using the following link: ``_. 35 | 36 | You can also contact me via the following channels: 37 | 38 | - E-mail: trallard@bitsandchips.me 39 | - Twitter: `@ixek `_ 40 | - `Tania on GitHub `_ 41 | 42 | Code of Conduct 43 | ================ 44 | All attendees to this workshop are expected to adhere to PyCon's Code of Conduct, 45 | in brief: 46 | **Be open, considerate, and respectful.** 47 | 48 | License 49 | ======= 50 | The content in this workshop is Licensed under `CC-BY-SA 4.0 `_. 51 | Which means that you can use, remix and re-distribute so long attribution to the original 52 | author is maintained (Tania Allard). 53 | 54 | The logo used here was designed by Ashley McNamara for the Microsoft Developer Advocates team use. 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /docs/_sources/setup.md.txt: -------------------------------------------------------------------------------- 1 | # Getting started 2 | 3 | -------------------------------------------------------------------------------- /docs/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* */ 2 | @import url('https://fonts.googleapis.com/css?family=Itim|Montserrat|Roboto+Mono'); 3 | 4 | a { 5 | color: rgb(96, 138, 197); 6 | } 7 | 8 | a:hover { 9 | color: rgb(65, 129, 218); 10 | } 11 | 12 | div.body h1 { 13 | color: #5F6366; 14 | font-family: 'Itim', cursive; 15 | font-weight: bold; 16 | font-size: 300%; 17 | } 18 | 19 | div.sphinxsidebarwrapper h1.logo { 20 | text-align: center; 21 | margin: 0 0 -20px 0; 22 | } 23 | 24 | div.sphinxsidebar p.blurb { 25 | font-size: 130%; 26 | text-align: center; 27 | font-family: 'Itim', cursive; 28 | color: rgb(151, 139, 196); 29 | } 30 | 31 | div.sphinxsidebar h1{ 32 | font-size: 160%; 33 | color: #5F6366; 34 | font-family: 'Itim', cursive; 35 | } 36 | 37 | div.sphinxsidebar h1 a { 38 | font-size: 160%; 39 | color: #5F6366; 40 | text-decoration: none; 41 | border: none; 42 | font-family: 'Itim', cursive; 43 | } 44 | 45 | div.sphinxsidebar h1 a:hover { 46 | border: none; 47 | } 48 | 49 | div.sphinxsidebar h3 { 50 | display: none; 51 | } 52 | 53 | div.sphinxsidebar a { 54 | color: #5F6366; 55 | } 56 | 57 | code.descname { 58 | color: rgb(151, 139, 196); 59 | } 60 | 61 | th.field-name { 62 | min-width: 100px; 63 | color: rgb(151, 139, 196); 64 | } 65 | 66 | tt, code { 67 | color: #F8F8F2; 68 | background: #015259; 69 | border-radius: 0.3em; 70 | padding: 0.0em 0.3em; 71 | } 72 | 73 | a.reference.internal code.xref span.pre { 74 | color: #F8F8F2; 75 | background: #015259; 76 | border-bottom: none; 77 | border-radius: 0; 78 | padding: 0; 79 | } 80 | 81 | a.reference.internal, a.reference.internal:hover { 82 | border-bottom: none; 83 | } 84 | 85 | a.reference.internal:hover code { 86 | background: #027bab 87 | } 88 | 89 | a.reference.internal:hover code.xref span.pre { 90 | color: #F8F8F2; 91 | background: #027bab; 92 | border-bottom: none; 93 | } 94 | 95 | tt.xref, code.xref, a tt { 96 | background: none; 97 | border-bottom: none; 98 | } 99 | 100 | code.literal { 101 | color: #F8F8F2; 102 | background: #015259; 103 | } 104 | 105 
| pre { 106 | padding: 20px 30px; 107 | background: #003038; 108 | } 109 | 110 | div > dl { 111 | border-left: 2px solid #00384021; 112 | padding-left: 5px; 113 | } 114 | 115 | dt { 116 | color: rgb(96, 138, 197); 117 | } 118 | 119 | 120 | div.footer::before { 121 | display: block; 122 | content: ''; 123 | border-top: 2px solid #EDB5BF; 124 | width: 50%; 125 | margin: 2em auto 2em auto; 126 | } 127 | 128 | div.footer { 129 | text-align: center; 130 | /* color: #029be2; */ 131 | } 132 | 133 | div.footer a { 134 | color: #027bab; 135 | text-decoration: none; 136 | } 137 | 138 | @media screen and (max-width: 875px) { 139 | div.sphinxsidebar { 140 | background: #4D6D9A; 141 | } 142 | div.sphinxsidebar h1.logo, div.sphinxsidebar p.blurb{ 143 | text-align: left; 144 | } 145 | div.sphinxsidebar h1 a { 146 | color: #1bc5e0; 147 | } 148 | div.sphinxsidebar a { 149 | /* color: rgb(151, 139, 196); */ 150 | color: white; 151 | } 152 | div.sphinxsidebar ul { 153 | /* color: rgb(151, 139, 196); */ 154 | color: white; 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /docs/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | var DOCUMENTATION_OPTIONS = { 2 | URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), 3 | VERSION: '', 4 | LANGUAGE: 'None', 5 | COLLAPSE_INDEX: false, 6 | FILE_SUFFIX: '.html', 7 | HAS_SOURCE: true, 8 | SOURCELINK_SUFFIX: '.txt', 9 | NAVIGATION_WITH_KEYS: false 10 | }; -------------------------------------------------------------------------------- /docs/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/_static/file.png -------------------------------------------------------------------------------- /docs/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/_static/minus.png -------------------------------------------------------------------------------- /docs/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/_static/plus.png -------------------------------------------------------------------------------- /docs/_static/pygments.css: -------------------------------------------------------------------------------- 1 | .highlight .hll { background-color: #49483e } 2 | .highlight { background: #272822; color: #f8f8f2 } 3 | .highlight .c { color: #75715e } /* Comment */ 4 | .highlight .err { color: #960050; background-color: #1e0010 } /* Error */ 5 | .highlight .k { color: #66d9ef } /* Keyword */ 6 | .highlight .l { color: #ae81ff } /* Literal */ 7 | .highlight .n { color: #f8f8f2 } /* Name */ 8 | .highlight .o { color: #f92672 } /* Operator */ 9 | .highlight .p { color: #f8f8f2 } /* Punctuation */ 10 | .highlight .ch { color: #75715e } /* Comment.Hashbang */ 11 | .highlight .cm { color: #75715e } /* Comment.Multiline */ 12 | .highlight .cp { color: #75715e } /* Comment.Preproc */ 13 | .highlight .cpf { color: #75715e } /* Comment.PreprocFile */ 14 | .highlight .c1 { color: #75715e } /* Comment.Single */ 15 | .highlight .cs { color: #75715e } /* Comment.Special */ 16 | .highlight 
.gd { color: #f92672 } /* Generic.Deleted */ 17 | .highlight .ge { font-style: italic } /* Generic.Emph */ 18 | .highlight .gi { color: #a6e22e } /* Generic.Inserted */ 19 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 20 | .highlight .gu { color: #75715e } /* Generic.Subheading */ 21 | .highlight .kc { color: #66d9ef } /* Keyword.Constant */ 22 | .highlight .kd { color: #66d9ef } /* Keyword.Declaration */ 23 | .highlight .kn { color: #f92672 } /* Keyword.Namespace */ 24 | .highlight .kp { color: #66d9ef } /* Keyword.Pseudo */ 25 | .highlight .kr { color: #66d9ef } /* Keyword.Reserved */ 26 | .highlight .kt { color: #66d9ef } /* Keyword.Type */ 27 | .highlight .ld { color: #e6db74 } /* Literal.Date */ 28 | .highlight .m { color: #ae81ff } /* Literal.Number */ 29 | .highlight .s { color: #e6db74 } /* Literal.String */ 30 | .highlight .na { color: #a6e22e } /* Name.Attribute */ 31 | .highlight .nb { color: #f8f8f2 } /* Name.Builtin */ 32 | .highlight .nc { color: #a6e22e } /* Name.Class */ 33 | .highlight .no { color: #66d9ef } /* Name.Constant */ 34 | .highlight .nd { color: #a6e22e } /* Name.Decorator */ 35 | .highlight .ni { color: #f8f8f2 } /* Name.Entity */ 36 | .highlight .ne { color: #a6e22e } /* Name.Exception */ 37 | .highlight .nf { color: #a6e22e } /* Name.Function */ 38 | .highlight .nl { color: #f8f8f2 } /* Name.Label */ 39 | .highlight .nn { color: #f8f8f2 } /* Name.Namespace */ 40 | .highlight .nx { color: #a6e22e } /* Name.Other */ 41 | .highlight .py { color: #f8f8f2 } /* Name.Property */ 42 | .highlight .nt { color: #f92672 } /* Name.Tag */ 43 | .highlight .nv { color: #f8f8f2 } /* Name.Variable */ 44 | .highlight .ow { color: #f92672 } /* Operator.Word */ 45 | .highlight .w { color: #f8f8f2 } /* Text.Whitespace */ 46 | .highlight .mb { color: #ae81ff } /* Literal.Number.Bin */ 47 | .highlight .mf { color: #ae81ff } /* Literal.Number.Float */ 48 | .highlight .mh { color: #ae81ff } /* Literal.Number.Hex */ 49 | .highlight .mi { color: #ae81ff } /* Literal.Number.Integer */ 50 | .highlight .mo { color: #ae81ff } /* Literal.Number.Oct */ 51 | .highlight .sa { color: #e6db74 } /* Literal.String.Affix */ 52 | .highlight .sb { color: #e6db74 } /* Literal.String.Backtick */ 53 | .highlight .sc { color: #e6db74 } /* Literal.String.Char */ 54 | .highlight .dl { color: #e6db74 } /* Literal.String.Delimiter */ 55 | .highlight .sd { color: #e6db74 } /* Literal.String.Doc */ 56 | .highlight .s2 { color: #e6db74 } /* Literal.String.Double */ 57 | .highlight .se { color: #ae81ff } /* Literal.String.Escape */ 58 | .highlight .sh { color: #e6db74 } /* Literal.String.Heredoc */ 59 | .highlight .si { color: #e6db74 } /* Literal.String.Interpol */ 60 | .highlight .sx { color: #e6db74 } /* Literal.String.Other */ 61 | .highlight .sr { color: #e6db74 } /* Literal.String.Regex */ 62 | .highlight .s1 { color: #e6db74 } /* Literal.String.Single */ 63 | .highlight .ss { color: #e6db74 } /* Literal.String.Symbol */ 64 | .highlight .bp { color: #f8f8f2 } /* Name.Builtin.Pseudo */ 65 | .highlight .fm { color: #a6e22e } /* Name.Function.Magic */ 66 | .highlight .vc { color: #f8f8f2 } /* Name.Variable.Class */ 67 | .highlight .vg { color: #f8f8f2 } /* Name.Variable.Global */ 68 | .highlight .vi { color: #f8f8f2 } /* Name.Variable.Instance */ 69 | .highlight .vm { color: #f8f8f2 } /* Name.Variable.Magic */ 70 | .highlight .il { color: #ae81ff } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /docs/_static/python.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/_static/python.png -------------------------------------------------------------------------------- /docs/genindex.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Index — Airflow tutorial documentation 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 |
117 | 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /docs/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: 7f6d2b706dda0a3b5cf0f2c68897deb7 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/html/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/.nojekyll -------------------------------------------------------------------------------- /docs/html/_images/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/12.png -------------------------------------------------------------------------------- /docs/html/_images/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/4.jpg -------------------------------------------------------------------------------- /docs/html/_images/DAG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/DAG.png -------------------------------------------------------------------------------- /docs/html/_images/airflow-logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/airflow-logo.jpeg -------------------------------------------------------------------------------- /docs/html/_images/airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/airflow.png -------------------------------------------------------------------------------- /docs/html/_images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/architecture.png -------------------------------------------------------------------------------- /docs/html/_images/automation1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/automation1.jpg -------------------------------------------------------------------------------- /docs/html/_images/azure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/azure.png -------------------------------------------------------------------------------- /docs/html/_images/gooddata.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/gooddata.png -------------------------------------------------------------------------------- /docs/html/_images/gooddata1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/gooddata1.png -------------------------------------------------------------------------------- /docs/html/_images/luigi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/luigi.png -------------------------------------------------------------------------------- /docs/html/_images/mssignin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/mssignin.png -------------------------------------------------------------------------------- /docs/html/_images/twitter1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/twitter1.png -------------------------------------------------------------------------------- /docs/html/_images/twitter2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/twitter2.png -------------------------------------------------------------------------------- /docs/html/_images/twitter3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/twitter3.png -------------------------------------------------------------------------------- /docs/html/_images/uses.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/uses.png -------------------------------------------------------------------------------- /docs/html/_sources/about.md.txt: -------------------------------------------------------------------------------- 1 | # About the workshop 2 | 3 | We will be taking a look at the basic concepts of data pipelines as well as practical use cases using Python. 4 | 5 | ## About you: 6 | - Some experience using the command line 7 | - Intermediate Python knowledge / use 8 | - Be able to apply what we learn and adopt to your use cases 9 | - Interested in data and systems 10 | - Aspring or current data engineering 11 | - Some knowledge about systems and databases (enough to be dangerous) 12 | 13 | ## Our focus for the day 14 | - Greater understanding on how to apply data pipelines using the Python toolset 15 | - Focus on concepts 16 | - Apply knowledge with each library 17 | - Will give you the building blocks 18 | 19 | ## Keeping on track 20 | 21 | You will find 🚦 across the tutorial examples. 
We will use this to identify how folks are doing over the workshop (if following along in person). 22 | Place the post it as follows: 23 | 24 | 🚦 Purple postit: all good, task has been completed 25 | 26 | 🚦 Orange postit: I need extra time or need help with the task in hand -------------------------------------------------------------------------------- /docs/html/_sources/airflow-intro.md.txt: -------------------------------------------------------------------------------- 1 | # Airflow basics 2 | 3 | ## What is Airflow? 4 | 5 | ![airflow logo](_static/airflow-logo.jpeg) 6 | 7 | Airflow is a Workflow engine which means: 8 | 9 | - Manage scheduling and running jobs and data pipelines 10 | - Ensures jobs are ordered correctly based on dependencies 11 | - Manage the allocation of scarce resources 12 | - Provides mechanisms for tracking the state of jobs and recovering from failure 13 | 14 | It is highly versatile and can be used across many many domains: 15 | ![](_static/uses.png) 16 | 17 | ## Basic Airflow concepts 18 | 19 | - **Task**: a defined unit of work (these are called operators in Airflow) 20 | - **Task instance**: an individual run of a single task. Task instances also have an indicative state, which could be “running”, “success”, “failed”, “skipped”, “up for retry”, etc. 21 | - **DAG**: Directed acyclic graph, 22 | a set of tasks with explicit execution order, beginning, and end 23 | - **DAG run**: individual execution/run of a DAG 24 | 25 | **Debunking the DAG** 26 | 27 | The vertices and edges (the arrows linking the nodes) have an order and direction associated to them 28 | 29 | ![](_static/DAG.png) 30 | 31 | each node in a DAG corresponds to a task, which in turn represents some sort of data processing. For example: 32 | 33 | Node A could be the code for pulling data from an API, node B could be the code for anonymizing the data. Node B could be the code for checking that there are no duplicate records, and so on. 34 | 35 | These 'pipelines' are acyclic since they need a point of completion. 36 | 37 | **Dependencies** 38 | 39 | Each of the vertices has a particular direction that shows the relationship between certain nodes. For example, we can only anonymize data once this has been pulled out from the API. 40 | 41 | ## Idempotency 42 | 43 | This is one of the most important characteristics of good ETL architectures. 44 | 45 | When we say that something is idempotent it means it will produce the same result regardless of how many times this is run (i.e. the results are reproducible). 46 | 47 | Reproducibility is particularly important in data-intensive environments as this ensures that the same inputs will always return the same outputs. 48 | 49 | ## Airflow components 50 | 51 | ![](_static/architecture.png) 52 | 53 | There are 4 main components to Apache Airflow: 54 | 55 | ### Web server 56 | 57 | The GUI. This is under the hood a Flask app where you can track the status of your jobs and read logs from a remote file store (e.g. [Azure Blobstorage](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-blobs-overview/?wt.mc_id=PyCon-github-taallard)). 58 | 59 | ### Scheduler 60 | 61 | This component is responsible for scheduling jobs. This is a multithreaded Python process that uses the DAGb object to decide what tasks need to be run, when and where. 62 | 63 | The task state is retrieved and updated from the database accordingly. The web server then uses these saved states to display job information. 
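
As a rough sketch of this behaviour (the snippet below is not part of the tutorial repo and assumes the Airflow 1.x semantics used throughout this tutorial), a DAG run for a given `execution_date` is only created once that date plus one `schedule_interval` has elapsed:

```python
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

# Hypothetical example DAG: with a daily schedule and this start_date, the run
# stamped with execution_date 2019-05-01 is created shortly after midnight on
# 2019-05-02, i.e. once the 2019-05-01 interval has fully elapsed.
dag = DAG(
    "scheduler_example",              # illustrative dag_id, not used elsewhere
    schedule_interval="@daily",
    start_date=datetime(2019, 5, 1),
    catchup=False,                    # only schedule the most recent interval
)

noop = DummyOperator(task_id="noop", dag=dag)
```

Keeping this lag in mind helps when reading scheduler logs later on: the `execution_date` you see is the start of the interval being processed, not the wall-clock time the run was triggered.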
64 | 65 | ### Executor 66 | 67 | The mechanism that gets the tasks done. 68 | 69 | ### Metadata database 70 | 71 | - Powers how the other components interact 72 | - Stores the Airflow states 73 | - All processes read and write from here 74 | 75 | ## Workflow as a code 76 | One of the main advantages of using a workflow system like Airflow is that all is code, which makes your workflows maintainable, versionable, testable, and collaborative. 77 | 78 | Thus your workflows become more explicit and maintainable (atomic tasks). 79 | 80 | Not only your code is dynamic but also is your infrastructure. 81 | 82 | ### Defining tasks 83 | 84 | Tasks are defined based on the abstraction of `Operators` (see Airflow docs [here](https://airflow.apache.org/concepts.html#operators)) which represent a single **idempotent task**. 85 | 86 | The best practice is to have atomic operators (i.e. can stand on their own and do not need to share resources among them). 87 | 88 | You can choose among; 89 | - `BashOperator` 90 | - `PythonOperator` 91 | - `EmailOperator` 92 | - `SimpleHttpOperator` 93 | - `MySqlOperator` (and other DB) 94 | 95 | Examples: 96 | 97 | ```python 98 | t1 = BashOperator(task_id='print_date', 99 | bash_command='date, 100 | dag=dag) 101 | ``` 102 | 103 | ```python 104 | def print_context(ds, **kwargs): 105 | pprint(kwargs) 106 | print(ds) 107 | return 'Whatever you return gets printed in the logs' 108 | 109 | 110 | run_this = PythonOperator( 111 | task_id='print_the_context', 112 | provide_context=True, 113 | python_callable=print_context, 114 | dag=dag, 115 | ) 116 | ``` 117 | 118 | ## Comparing Luigi and Airflow 119 | 120 | ### Luigi 121 | 122 | - Created at Spotify (named after the plumber) 123 | - Open sourced in late 2012 124 | - GNU make for data 125 | 126 | ### Airflow 127 | - Airbnb data team 128 | - Open-sourced mud 2015 129 | - Apache incubator mid-2016 130 | - ETL pipelines 131 | 132 | ### Similarities 133 | - Python open source projects for data pipelines 134 | - Integrate with a number of sources (databases, filesystems) 135 | - Tracking failure, retries, success 136 | - Ability to identify the dependencies and execution 137 | 138 | ### Differences 139 | - Scheduler support: Airflow has built-in support using schedulers 140 | - Scalability: Airflow has had stability issues in the past 141 | - Web interfaces 142 | 143 | ![](_static/luigi.png) 144 | 145 | 146 | ![](_static/airflow.png) 147 | 148 | 149 | | Airflow | Luigi | 150 | | ------------------------------------------------ | ------------------------------------------------------------------------------ | 151 | | Task are defined by`dag_id` defined by user name | Task are defined by task name and parameters | 152 | | Task retries based on definitions | Decide if a task is done via input/output | 153 | | Task code to the worker | Workers started by Python file where the tasks are defined | 154 | | Centralized scheduler (Celery spins up workers) | Centralized scheduler in charge of deduplication sending tasks (Tornado based) | -------------------------------------------------------------------------------- /docs/html/_sources/azure.md.txt: -------------------------------------------------------------------------------- 1 | ### Deploying to the cloud 2 | 3 | 4 | ![](_static/azure.png) 5 | 6 | [This Docker image](https://hub.docker.com/r/puckel/docker-airflow/) has been used as the base for many deployments. 
7 | 8 | 9 | Let's try and get Airflow running on Docker: 10 | 11 | ``` 12 | docker pull puckel/docker-airflow 13 | ``` 14 | 15 | Once you have the container you can run as 16 | 17 | ``` 18 | docker run -d --rm -p 8080:8080 puckel/docker-airflow webserver 19 | ``` 20 | 21 | To load the examples you can do: 22 | ``` 23 | docker run -d -p 8080:8080 -e LOAD_EX=y puckel/docker-airflow 24 | ``` 25 | 26 | Based on this container we can deploy to [Azure](https://azure.microsoft.com/en-us/blog/deploying-apache-airflow-in-azure-to-build-and-run-data-pipelines//?wt.mc_id=PyCon-github-taallard) 27 | 28 | 29 | [![Deploy to Azure](https://azuredeploy.net/deploybutton.svg)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fsavjani%2Fazure-quickstart-templates%2Fmaster%2F101-webapp-linux-airflow-postgresql%2Fazuredeploy.json/?wt.mc_id=PyCon-github-taallard) 30 | 31 | 32 | Note that this is a very basic deployment on Azure. -------------------------------------------------------------------------------- /docs/html/_sources/first-airflow.md.txt: -------------------------------------------------------------------------------- 1 | # Airflow 101: working locally and familiarise with the tool 2 | 3 | ### Pre-requisites 4 | 5 | The following prerequisites are needed: 6 | 7 | - Libraries detailed in the Setting up section (either via conda or pipenv) 8 | - MySQL installed 9 | - text editor 10 | - command line 11 | 12 | ## Getting your environment up and running 13 | 14 | If you followed the instructions you should have Airflow installed as well as the rest of the packages we will be using. 15 | 16 | So let's get our environment up and running: 17 | 18 | If you are using conda start your environment via: 19 | ``` 20 | $ source activate airflow-env 21 | ``` 22 | If using pipenv then: 23 | ``` 24 | $ pipenv shell 25 | ```` 26 | 27 | this will start a shell within a virtual environment, to exit the shell you need to type `exit` and this will exit the virtual environment. 28 | 29 | ## Starting Airflow locally 30 | 31 | Airflow home lives in `~/airflow` by default, but you can change the location before installing airflow. You first need to set the `AIRFLOW_HOME` environment variable and then install airflow. For example, using pip: 32 | 33 | ```sh 34 | export AIRFLOW_HOME=~/mydir/airflow 35 | 36 | # install from PyPI using pip 37 | pip install apache-airflow 38 | ``` 39 | 40 | once you have completed the installation you should see something like this in the `airflow` directory (wherever it lives for you) 41 | 42 | ``` 43 | drwxr-xr-x - myuser 18 Apr 14:02 . 44 | .rw-r--r-- 26k myuser 18 Apr 14:02 ├── airflow.cfg 45 | drwxr-xr-x - myuser 18 Apr 14:02 ├── logs 46 | drwxr-xr-x - myuser 18 Apr 14:02 │ └── scheduler 47 | drwxr-xr-x - myuser 18 Apr 14:02 │ ├── 2019-04-18 48 | lrwxr-xr-x 46 myuser 18 Apr 14:02 │ └── latest -> /Users/myuser/airflow/logs/scheduler/2019-04-18 49 | .rw-r--r-- 2.5k myuser 18 Apr 14:02 └── unittests.cfg 50 | ``` 51 | We need to create a local dag folder: 52 | 53 | ``` 54 | mkdir ~/airflow/dags 55 | ``` 56 | 57 | As your project evolves, your directory will look something like this: 58 | 59 | ``` 60 | airflow # the root directory. 61 | ├── dags # root folder for all dags. files inside folders are not searched for dags. 62 | │ ├── my_dag.py, # my dag (definitions of tasks/operators) including precedence. 63 | │ └── ... 
57 | As your project evolves, your directory will look something like this:
58 |
59 | ```
60 | airflow                 # the root directory.
61 | ├── dags                # root folder for all dags. files inside folders are not searched for dags.
62 | │   ├── my_dag.py       # my dag (definitions of tasks/operators) including precedence.
63 | │   └── ...
64 | ├── logs                # logs for the various tasks that are run
65 | │   └── my_dag          # DAG-specific logs
66 | │   │   ├── src1_s3     # folder for task-specific logs (log files are created by date of a run)
67 | │   │   ├── src2_hdfs
68 | │   │   ├── src3_s3
69 | │   │   └── spark_task_etl
70 | ├── airflow.db          # SQLite database used by Airflow internally to track the status of each DAG.
71 | ├── airflow.cfg         # global configuration for Airflow (this can be overridden by config inside the file)
72 | └── ...
73 | ```
74 |
75 | ## Prepare your database
76 |
77 | As we mentioned before, Airflow uses a database to keep track of the tasks and their statuses, so it is critical to have one set up.
78 |
79 | To initialize the default database we can run
80 | `airflow initdb`. This will initialize your database via Alembic so that it matches the latest Airflow release.
81 |
82 | The default database is `sqlite`, which means you cannot parallelize tasks while using it. Since we have MySQL and the MySQL client installed, we will set them up so that we can use them with Airflow.
83 |
84 | 🚦 Create an `airflow` database
85 |
86 | From the command line:
87 |
88 | ```
89 | mysql -u root -p
90 | mysql> CREATE DATABASE airflow CHARACTER SET utf8 COLLATE utf8_unicode_ci;
91 | mysql> GRANT ALL PRIVILEGES ON airflow.* TO 'airflow'@'localhost';
92 | mysql> FLUSH PRIVILEGES;
93 | ```
94 | and initialize the database:
95 |
96 | ```
97 | airflow initdb
98 | ```
99 |
100 | Notice that this will fail with the default `airflow.cfg`, so we need to update the configuration first.
101 |
102 |
103 | ## Update your local configuration
104 |
105 | Open your Airflow configuration file `~/airflow/airflow.cfg` and make the following changes:
106 |
107 |
108 | ```
109 | executor = CeleryExecutor
110 | ```
111 |
112 | ```
113 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#broker-settings
114 | # needs rabbitmq running
115 | broker_url = amqp://guest:guest@127.0.0.1/
116 |
117 |
118 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings
119 | result_backend = db+mysql://airflow:airflow@localhost:3306/airflow
120 |
121 | sql_alchemy_conn = mysql://airflow:python2019@localhost:3306/airflow
122 |
123 | ```
124 |
125 | Here we are replacing the default executor (`SequentialExecutor`) with the `CeleryExecutor` so that tasks can run in parallel.
126 | We also replace the default `sqlite` database with our newly created `airflow` database.
127 |
128 | Now we can initialize the database:
129 | ```
130 | airflow initdb
131 | ```
132 |
133 | Let's now start the web server locally:
134 |
135 |
136 | ```
137 | airflow webserver -p 8080
138 | ```
139 |
140 | We can now head over to [http://localhost:8080](http://localhost:8080), where you will see a number of example DAGs already loaded.
141 |
142 | 🚦 Take some time to familiarise yourself with the UI and get your local instance set up.
143 |
144 | Now let's have a look at the connections: go to `Admin > Connections` ([http://localhost:8080/admin/connection/](http://localhost:8080/admin/connection/)). You should be able to see a number of connections available. For this tutorial, we will use some of these connections, including `mysql`.
145 |
146 |
152 |
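As a preview of how a connection defined here is consumed from code, the sketch below uses the stock `mysql_default` connection id with a throwaway query (purely illustrative); the DAGs later in the tutorial rely on the same hook:

```python
from airflow.hooks.mysql_hook import MySqlHook

# the hook looks up host/user/password from the connection stored in the metadata database
hook = MySqlHook(mysql_conn_id="mysql_default")
print(hook.get_records("SELECT 1"))
```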
153 | ### Commands
154 | Let us go over some of the commands. Back on your command line:
155 |
156 | ```
157 | airflow list_dags
158 | ```
159 | We can list the tasks of a DAG in a tree view:
160 |
161 | ```
162 | airflow list_tasks tutorial --tree
163 | ```
164 |
165 | We can test the DAGs too, but we need to pass a date parameter for the test to execute:
166 |
167 | ```
168 | airflow test tutorial print_date 2019-05-01
169 | ```
170 | (note that you cannot use a future date or you will get an error)
171 | ```
172 | airflow test tutorial templated 2019-05-01
173 | ```
174 | Runs started with the `test` command are not saved in the database.
175 |
176 | Now let's start the scheduler:
177 | ```
178 | airflow scheduler
179 | ```
180 |
181 | Behind the scenes, it monitors the DAG folder and stays in sync with all the DAG objects it contains. The Airflow scheduler is designed to run as a service in an Airflow production environment.
182 |
183 | Now, with the scheduler up and running, we can trigger a task instance:
184 | ```
185 | $ airflow run example_bash_operator runme_0 2015-01-01
186 | ```
187 |
188 | This run will be stored in the database and you can see the status change straight away.
189 |
190 | What would happen, for example, if we wanted to run or trigger the `tutorial` DAG? 🤔
191 |
192 | Let's try from the CLI and see what happens.
193 |
194 | ```
195 | airflow trigger_dag tutorial
196 | ```
197 |
198 |
199 | ## Writing your first DAG
200 |
201 | Let's create our first simple DAG.
202 | Inside the DAG directory (`~/airflow/dags`) create a `simple_dag.py` file.
203 |
204 |
205 | ```python
206 | from datetime import datetime, timedelta
207 | from airflow import DAG
208 | from airflow.operators.dummy_operator import DummyOperator
209 | from airflow.operators.python_operator import PythonOperator
210 |
211 |
212 | def print_hello():
213 |     return "Hello world!"
214 |
215 |
216 | default_args = {
217 |     "owner": "airflow",
218 |     "depends_on_past": False,
219 |     "start_date": datetime(2019, 4, 30),
220 |     "email": ["airflow@example.com"],
221 |     "email_on_failure": False,
222 |     "email_on_retry": False,
223 |     "retries": 1,
224 |     "retry_delay": timedelta(minutes=2),
225 | }
226 |
227 | dag = DAG(
228 |     "hello_world",
229 |     description="Simple tutorial DAG",
230 |     schedule_interval="0 12 * * *",
231 |     default_args=default_args,
232 |     catchup=False,
233 | )
234 |
235 | t1 = DummyOperator(task_id="dummy_task", retries=3, dag=dag)
236 |
237 | t2 = PythonOperator(task_id="hello_task", python_callable=print_hello, dag=dag)
238 |
239 | # sets t2 downstream of t1
240 | t1 >> t2
241 |
242 | # equivalent to:
243 | # t2.set_upstream(t1)
244 |
245 | ```
246 | If it is properly set up, you should be able to see this DAG straight away on your instance.
247 |
248 |
249 | ### Now let's create a DAG from the previous ETL pipeline (kind of)
250 |
251 | All hands on - check the solutions
--------------------------------------------------------------------------------
/docs/html/_sources/index.rst.txt:
--------------------------------------------------------------------------------
1 | .. Airflow tutorial documentation master file, created by
2 |    sphinx-quickstart on Mon Apr 15 15:52:00 2019.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 |
6 | Airflow tutorial
7 | ============================================
8 | This tutorial was originally developed for PyCon US 2019.
9 |
10 | ..
toctree:: 11 | :caption: Table of Contents 12 | :hidden: 13 | :maxdepth: 2 14 | 15 | setup 16 | about 17 | pipelines 18 | airflow-intro 19 | first-airflow 20 | 21 | .. toctree:: 22 | :maxdepth: 2 23 | :caption: Contents: 24 | 25 | About your facilitator 26 | ====================== 27 | 28 | My name is Tania. I live in Manchester UK where I work as a 29 | Cloud Advocate for Microsoft. 30 | 31 | Over the years, I have worked as a data engineer, machine learning engineer, 32 | and research software engineer. I love data intensive 33 | enviroments and I am particularly interested in the tools and workflows to 34 | deliver robust, reproducible data insights. 35 | 36 | If you have any questions or feedback about this tutorial please, 37 | file an issue using the following link: ``_. 38 | 39 | You can also contact me via the following channels: 40 | 41 | - E-mail: trallard@bitsandchips.me 42 | - Twitter: `@ixek `_ 43 | - `Tania on GitHub `_ 44 | 45 | Code of Conduct 46 | ================ 47 | All attendees to this workshop are expected to adhere to PyCon's Code of Conduct, 48 | in brief: 49 | **Be open, considerate, and respectful.** 50 | 51 | License 52 | ======= 53 | The content in this workshop is Licensed under `CC-BY-SA 4.0 `_. 54 | Which means that you can use, remix and re-distribute so long attribution to the original 55 | author is maintained (Tania Allard). 56 | 57 | The logo used here was designed by Ashley McNamara for the Microsoft Developer Advocates team use. 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /docs/html/_static/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/12.png -------------------------------------------------------------------------------- /docs/html/_static/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/4.jpg -------------------------------------------------------------------------------- /docs/html/_static/DAG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/DAG.png -------------------------------------------------------------------------------- /docs/html/_static/GUI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/GUI.png -------------------------------------------------------------------------------- /docs/html/_static/airflow-logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/airflow-logo.jpeg -------------------------------------------------------------------------------- /docs/html/_static/airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/airflow.png -------------------------------------------------------------------------------- 
/docs/html/_static/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/architecture.png -------------------------------------------------------------------------------- /docs/html/_static/automation1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/automation1.jpg -------------------------------------------------------------------------------- /docs/html/_static/azure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/azure.png -------------------------------------------------------------------------------- /docs/html/_static/connection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/connection.png -------------------------------------------------------------------------------- /docs/html/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* */ 2 | @import url('https://fonts.googleapis.com/css?family=Itim|Nunito|Source+Code+Pro'); 3 | 4 | a { 5 | color: rgb(96, 138, 197); 6 | } 7 | 8 | a:hover { 9 | color: rgb(65, 129, 218); 10 | } 11 | 12 | div.body h1 { 13 | color: #5F6366; 14 | font-family: 'Itim', cursive; 15 | font-weight: bold; 16 | font-size: 300%; 17 | } 18 | 19 | div.body h2 { 20 | color: #5F6366; 21 | font-family: 'Itim', cursive; 22 | font-weight: bold; 23 | } 24 | div.body h3 { 25 | color: #5F6366; 26 | font-family: 'Itim', cursive; 27 | font-weight: bold; 28 | } 29 | 30 | div.sphinxsidebarwrapper h1.logo { 31 | text-align: center; 32 | margin: 0 0 -20px 0; 33 | } 34 | 35 | div.sphinxsidebar p.blurb { 36 | font-size: 130%; 37 | text-align: center; 38 | font-family: 'Itim', cursive; 39 | color: rgb(151, 139, 196); 40 | } 41 | 42 | div.sphinxsidebar h1{ 43 | font-size: 160%; 44 | color: #5F6366; 45 | font-family: 'Itim', cursive; 46 | } 47 | 48 | div.sphinxsidebar h1 a { 49 | font-size: 160%; 50 | color: #5F6366; 51 | text-decoration: none; 52 | border: none; 53 | font-family: 'Itim', cursive; 54 | } 55 | 56 | div.sphinxsidebar h1 a:hover { 57 | border: none; 58 | } 59 | 60 | div.sphinxsidebar h3 { 61 | display: none; 62 | } 63 | 64 | div.sphinxsidebar a { 65 | color: #5F6366; 66 | } 67 | 68 | code.descname { 69 | color: rgb(151, 139, 196); 70 | } 71 | 72 | th.field-name { 73 | min-width: 100px; 74 | color: rgb(151, 139, 196); 75 | } 76 | 77 | tt, code { 78 | color: #F8F8F2; 79 | background: #1d1941; 80 | border-radius: 0.3em; 81 | padding: 0.0em 0.3em; 82 | } 83 | 84 | a.reference.internal code.xref span.pre { 85 | color: #F8F8F2; 86 | background: #1d1941; 87 | border-bottom: none; 88 | border-radius: 0; 89 | padding: 0; 90 | } 91 | 92 | a.reference.internal, a.reference.internal:hover { 93 | border-bottom: none; 94 | } 95 | 96 | a.reference.internal:hover code { 97 | background: #027bab 98 | } 99 | 100 | a.reference.internal:hover code.xref span.pre { 101 | color: #F8F8F2; 102 | background: #027bab; 103 | border-bottom: none; 104 | } 105 | 106 | tt.xref, code.xref, a tt { 107 | background: none; 108 | 
border-bottom: none; 109 | } 110 | 111 | code.literal { 112 | color: #F8F8F2; 113 | background:#1d1941; 114 | } 115 | 116 | pre { 117 | padding: 20px 30px; 118 | background: #1d1941; 119 | } 120 | 121 | div > dl { 122 | border-left: 2px solid #00384021; 123 | padding-left: 5px; 124 | } 125 | 126 | dt { 127 | color: rgb(96, 138, 197); 128 | } 129 | 130 | 131 | div.footer::before { 132 | display: block; 133 | content: ''; 134 | border-top: 2px solid #EDB5BF; 135 | width: 50%; 136 | margin: 2em auto 2em auto; 137 | } 138 | 139 | div.footer { 140 | text-align: center; 141 | /* color: #029be2; */ 142 | } 143 | 144 | div.footer a { 145 | color: #027bab; 146 | text-decoration: none; 147 | } 148 | 149 | p.caption { 150 | font-family: 'Itim', cursive; 151 | font-size: inherit; 152 | font-size: 150%; 153 | } 154 | 155 | @media screen and (max-width: 875px) { 156 | div.sphinxsidebar { 157 | background: #4D6D9A; 158 | } 159 | div.sphinxsidebar h1.logo, div.sphinxsidebar p.blurb{ 160 | text-align: left; 161 | } 162 | div.sphinxsidebar h1 a { 163 | color: #1bc5e0; 164 | } 165 | div.sphinxsidebar a { 166 | /* color: rgb(151, 139, 196); */ 167 | color: white; 168 | } 169 | div.sphinxsidebar ul { 170 | /* color: rgb(151, 139, 196); */ 171 | color: white; 172 | } 173 | } 174 | 175 | 176 | /* other */ 177 | 178 | .alert { 179 | position: relative; 180 | padding: 10px; 181 | margin-bottom: 5px; 182 | border: 2px solid transparent; 183 | border-radius: 2px; 184 | } 185 | 186 | .alert-primary { 187 | color: #004085; 188 | background-color: #cce5ff; 189 | border-color: #b8daff; 190 | } 191 | .alert-custom { 192 | background-color: rgb(229, 224, 247); 193 | border-color:rgb(229, 224, 247); 194 | color: rgb(128, 117, 165); 195 | } -------------------------------------------------------------------------------- /docs/html/_static/datapyramid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/datapyramid.png -------------------------------------------------------------------------------- /docs/html/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | var DOCUMENTATION_OPTIONS = { 2 | URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), 3 | VERSION: '', 4 | LANGUAGE: 'None', 5 | COLLAPSE_INDEX: false, 6 | FILE_SUFFIX: '.html', 7 | HAS_SOURCE: true, 8 | SOURCELINK_SUFFIX: '.txt', 9 | NAVIGATION_WITH_KEYS: false 10 | }; -------------------------------------------------------------------------------- /docs/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/file.png -------------------------------------------------------------------------------- /docs/html/_static/gooddata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/gooddata.png -------------------------------------------------------------------------------- /docs/html/_static/gooddata1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/gooddata1.png -------------------------------------------------------------------------------- /docs/html/_static/luigi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/luigi.png -------------------------------------------------------------------------------- /docs/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/minus.png -------------------------------------------------------------------------------- /docs/html/_static/mssignin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/mssignin.png -------------------------------------------------------------------------------- /docs/html/_static/pipeline1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/pipeline1.png -------------------------------------------------------------------------------- /docs/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/plus.png -------------------------------------------------------------------------------- /docs/html/_static/pygments.css: -------------------------------------------------------------------------------- 1 | .highlight .hll { background-color: #49483e } 2 | .highlight { background: #272822; color: #f8f8f2 } 3 | .highlight .c { color: #75715e } /* Comment */ 4 | .highlight .err { color: #960050; background-color: #1e0010 } /* Error */ 5 | .highlight .k { color: #66d9ef } /* Keyword */ 6 | .highlight .l { color: #ae81ff } /* Literal */ 7 | .highlight .n { color: #f8f8f2 } /* Name */ 8 | .highlight .o { color: #f92672 } /* Operator */ 9 | .highlight .p { color: #f8f8f2 } /* Punctuation */ 10 | .highlight .ch { color: #75715e } /* Comment.Hashbang */ 11 | .highlight .cm { color: #75715e } /* Comment.Multiline */ 12 | .highlight .cp { color: #75715e } /* Comment.Preproc */ 13 | .highlight .cpf { color: #75715e } /* Comment.PreprocFile */ 14 | .highlight .c1 { color: #75715e } /* Comment.Single */ 15 | .highlight .cs { color: #75715e } /* Comment.Special */ 16 | .highlight .gd { color: #f92672 } /* Generic.Deleted */ 17 | .highlight .ge { font-style: italic } /* Generic.Emph */ 18 | .highlight .gi { color: #a6e22e } /* Generic.Inserted */ 19 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 20 | .highlight .gu { color: #75715e } /* Generic.Subheading */ 21 | .highlight .kc { color: #66d9ef } /* Keyword.Constant */ 22 | .highlight .kd { color: #66d9ef } /* Keyword.Declaration */ 23 | .highlight .kn { color: #f92672 } /* Keyword.Namespace */ 24 | .highlight .kp { color: #66d9ef } /* Keyword.Pseudo */ 25 | .highlight .kr { color: #66d9ef } /* Keyword.Reserved */ 26 | .highlight .kt { color: #66d9ef } /* Keyword.Type */ 27 | .highlight .ld { color: #e6db74 } /* Literal.Date */ 28 | 
.highlight .m { color: #ae81ff } /* Literal.Number */ 29 | .highlight .s { color: #e6db74 } /* Literal.String */ 30 | .highlight .na { color: #a6e22e } /* Name.Attribute */ 31 | .highlight .nb { color: #f8f8f2 } /* Name.Builtin */ 32 | .highlight .nc { color: #a6e22e } /* Name.Class */ 33 | .highlight .no { color: #66d9ef } /* Name.Constant */ 34 | .highlight .nd { color: #a6e22e } /* Name.Decorator */ 35 | .highlight .ni { color: #f8f8f2 } /* Name.Entity */ 36 | .highlight .ne { color: #a6e22e } /* Name.Exception */ 37 | .highlight .nf { color: #a6e22e } /* Name.Function */ 38 | .highlight .nl { color: #f8f8f2 } /* Name.Label */ 39 | .highlight .nn { color: #f8f8f2 } /* Name.Namespace */ 40 | .highlight .nx { color: #a6e22e } /* Name.Other */ 41 | .highlight .py { color: #f8f8f2 } /* Name.Property */ 42 | .highlight .nt { color: #f92672 } /* Name.Tag */ 43 | .highlight .nv { color: #f8f8f2 } /* Name.Variable */ 44 | .highlight .ow { color: #f92672 } /* Operator.Word */ 45 | .highlight .w { color: #f8f8f2 } /* Text.Whitespace */ 46 | .highlight .mb { color: #ae81ff } /* Literal.Number.Bin */ 47 | .highlight .mf { color: #ae81ff } /* Literal.Number.Float */ 48 | .highlight .mh { color: #ae81ff } /* Literal.Number.Hex */ 49 | .highlight .mi { color: #ae81ff } /* Literal.Number.Integer */ 50 | .highlight .mo { color: #ae81ff } /* Literal.Number.Oct */ 51 | .highlight .sa { color: #e6db74 } /* Literal.String.Affix */ 52 | .highlight .sb { color: #e6db74 } /* Literal.String.Backtick */ 53 | .highlight .sc { color: #e6db74 } /* Literal.String.Char */ 54 | .highlight .dl { color: #e6db74 } /* Literal.String.Delimiter */ 55 | .highlight .sd { color: #e6db74 } /* Literal.String.Doc */ 56 | .highlight .s2 { color: #e6db74 } /* Literal.String.Double */ 57 | .highlight .se { color: #ae81ff } /* Literal.String.Escape */ 58 | .highlight .sh { color: #e6db74 } /* Literal.String.Heredoc */ 59 | .highlight .si { color: #e6db74 } /* Literal.String.Interpol */ 60 | .highlight .sx { color: #e6db74 } /* Literal.String.Other */ 61 | .highlight .sr { color: #e6db74 } /* Literal.String.Regex */ 62 | .highlight .s1 { color: #e6db74 } /* Literal.String.Single */ 63 | .highlight .ss { color: #e6db74 } /* Literal.String.Symbol */ 64 | .highlight .bp { color: #f8f8f2 } /* Name.Builtin.Pseudo */ 65 | .highlight .fm { color: #a6e22e } /* Name.Function.Magic */ 66 | .highlight .vc { color: #f8f8f2 } /* Name.Variable.Class */ 67 | .highlight .vg { color: #f8f8f2 } /* Name.Variable.Global */ 68 | .highlight .vi { color: #f8f8f2 } /* Name.Variable.Instance */ 69 | .highlight .vm { color: #f8f8f2 } /* Name.Variable.Magic */ 70 | .highlight .il { color: #ae81ff } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /docs/html/_static/python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/python.png -------------------------------------------------------------------------------- /docs/html/_static/twitter1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/twitter1.png -------------------------------------------------------------------------------- /docs/html/_static/twitter2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/twitter2.png -------------------------------------------------------------------------------- /docs/html/_static/twitter3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/twitter3.png -------------------------------------------------------------------------------- /docs/html/_static/uses.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/uses.png -------------------------------------------------------------------------------- /docs/html/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | About the workshop — Airflow tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
30 |
31 |
32 | 47 | 48 | 49 |
50 | 51 |
52 |

About the workshop

53 |

We will be taking a look at the basic concepts of data pipelines as well as practical use cases using Python.

54 |
55 |

About you:

56 |
    57 |
  • Some experience using the command line

  • 58 |
  • Intermediate Python knowledge / use

  • 59 |
  • Be able to apply what we learn and adopt to your use cases

  • 60 |
  • Interested in data and systems

  • 61 |
  • Aspiring or current data engineers

  • 62 |
  • Some knowledge about systems and databases (enough to be dangerous)

  • 63 |
64 |
65 |
66 |

Our focus for the day

67 |
    68 |
  • Greater understanding on how to apply data pipelines using the Python toolset

  • 69 |
  • Focus on concepts

  • 70 |
  • Apply knowledge with each library

  • 71 |
  • Will give you the building blocks

  • 72 |
73 |
74 |
75 |

Keeping on track

76 |

You will find 🚦 across the tutorial examples. We will use this to identify how folks are doing during the workshop (if following along in person). 77 | Place the post-it as follows:

78 |

🚦 Purple postit: all good, task has been completed

79 |

🚦 Orange postit: I need extra time or need help with the task in hand

80 |
81 |
82 | 83 | 84 |
85 | 100 | 101 |
102 |
103 | 179 |
180 |
181 | 192 | 193 | 194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /docs/html/azure.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Deploying to the cloud — Airflow tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 |
28 |
29 |
30 | 37 | 38 | 39 |
40 | 41 |
42 |

Deploying to the cloud

43 |

_images/azure.png

44 |

This Docker image has been used as the base for many deployments.

45 |

Let’s try and get Airflow running on Docker:

46 |
docker pull puckel/docker-airflow
 47 | 
48 |
49 |

Once you have the container you can run as

50 |
docker run -d --rm -p 8080:8080 puckel/docker-airflow webserver
 51 | 
52 |
53 |

To load the examples you can do:

54 |
docker run -d -p 8080:8080 -e LOAD_EX=y puckel/docker-airflow
 55 | 
56 |
57 |

Based on this container we can deploy to Azure

58 |

https://azuredeploy.net/deploybutton.svgDeploy to Azure

59 |

Note that this is a very basic deployment on Azure.

60 |
61 | 62 | 63 |
64 | 71 | 72 |
73 |
74 | 133 |
134 |
135 | 146 | 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /docs/html/genindex.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Index — Airflow tutorial documentation 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 |
29 |
30 |
31 | 38 | 39 | 40 |
41 | 42 | 43 |

Index

44 | 45 |
46 | 47 |
48 | 49 | 50 |
51 | 58 | 59 |
60 |
61 | 120 |
121 |
122 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /docs/html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Airflow tutorial — Airflow tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 |
29 |
30 |
31 | 42 | 43 | 44 |
45 | 46 |
47 |

Airflow tutorial

48 |

This tutorial was originally developed for PyCon US 2019.

49 |
50 |
51 |
52 |
53 |
54 |
55 |

About your facilitator

56 |

My name is Tania. I live in Manchester UK where I work as a 57 | Cloud Advocate for Microsoft.

58 |

Over the years, I have worked as a data engineer, machine learning engineer, 59 | and research software engineer. I love data-intensive 60 | environments and I am particularly interested in the tools and workflows to 61 | deliver robust, reproducible data insights.

62 |

If you have any questions or feedback about this tutorial please, 63 | file an issue using the following link: https://github.com/trallard/airflow-tutorial/issues/new.

64 |

You can also contact me via the following channels:

65 | 70 |
71 |
72 |

Code of Conduct

73 |

All attendees to this workshop are expected to adhere to PyCon’s Code of Conduct, 74 | in brief: 75 | Be open, considerate, and respectful.

76 |
77 |
78 |

License

79 |

The content in this workshop is licensed under CC-BY-SA 4.0, 80 | which means that you can use, remix, and re-distribute it so long as attribution to the original 81 | author is maintained (Tania Allard).

82 |

The logo used here was designed by Ashley McNamara for the Microsoft Developer Advocates team use.

83 |
84 | 85 | 86 |
87 | 98 | 99 |
100 |
101 | 171 |
172 |
173 | 184 | 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /docs/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/objects.inv -------------------------------------------------------------------------------- /docs/html/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Search — Airflow tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 |
33 |
34 |
35 | 42 | 43 | 44 |
45 | 46 |

Search

47 |
48 | 49 |

50 | Please activate JavaScript to enable the search 51 | functionality. 52 |

53 |
54 |

55 | From here you can search these documents. Enter your search 56 | words into the box below and click "search". Note that the search 57 | function will automatically search for all of the words. Pages 58 | containing fewer words won't appear in the result list. 59 |

60 |
61 | 62 | 63 | 64 |
65 | 66 |
67 | 68 |
69 | 70 |
71 | 78 | 79 |
80 |
81 | 130 |
131 |
132 | 140 | 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Airflow tutorial — Airflow tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 |
28 |
29 |
30 | 41 | 42 | 43 |
44 | 45 |
46 |

Airflow tutorial

47 |

This tutorial was originally developed for PyCon US 2019.

48 |
49 |
50 |
51 |
52 |
53 |
54 |

About your facilitator

55 |

My name is Tania. I live in Manchester UK where I work as a 56 | Cloud Advocate for Microsoft.

57 |

Over the years, I have worked as a data engineer, machine learning engineer, 58 | and research software engineer. I love data-intensive 59 | environments and I am particularly interested in the tools and workflows to 60 | deliver robust, reproducible data insights.

61 |

If you have any questions or feedback about this tutorial please, 62 | file an issue using the following link: https://github.com/trallard/airflow-tutorial/issues/new.

63 |

You can also contact me via the following channels:

64 | 69 |
70 |
71 |

Code of Conduct

72 |

All attendees to this workshop are expected to adhere to PyCon’s Code of Conduct, 73 | in brief: 74 | Be open, considerate, and respectful.

75 |
76 |
77 |

License

78 |

The content in this workshop is licensed under CC-BY-SA 4.0, 79 | which means that you can use, remix, and re-distribute it so long as attribution to the original 80 | author is maintained (Tania Allard).

81 |

The logo used here was designed by Ashley McNamara for the Microsoft Developer Advocates team use.

82 |
83 | 84 | 85 |
86 | 97 | 98 |
99 |
100 | 166 |
167 |
168 | 179 | 180 | 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /docs/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/objects.inv -------------------------------------------------------------------------------- /docs/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Search — Airflow tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
32 |
33 |
34 | 41 | 42 | 43 |
44 | 45 |

Search

46 |
47 | 48 |

49 | Please activate JavaScript to enable the search 50 | functionality. 51 |

52 |
53 |

54 | From here you can search these documents. Enter your search 55 | words into the box below and click "search". Note that the search 56 | function will automatically search for all of the words. Pages 57 | containing fewer words won't appear in the result list. 58 |

59 |
60 | 61 | 62 | 63 |
64 | 65 |
66 | 67 |
68 | 69 |
70 | 77 | 78 |
79 |
80 | 125 |
126 |
127 | 135 | 136 | 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /docs/searchindex.js: -------------------------------------------------------------------------------- 1 | Search.setIndex({docnames:["index","setup"],envversion:{"sphinx.domains.c":1,"sphinx.domains.changeset":1,"sphinx.domains.cpp":1,"sphinx.domains.javascript":1,"sphinx.domains.math":2,"sphinx.domains.python":1,"sphinx.domains.rst":1,"sphinx.domains.std":1,"sphinx.ext.intersphinx":1,sphinx:56},filenames:["index.rst","setup.md"],objects:{},objnames:{},objtypes:{},terms:{"long":0,"new":0,The:0,adher:0,advoc:0,all:0,allard:0,also:0,ani:0,ashlei:0,attende:0,attribut:0,author:0,bitsandchip:0,brief:0,can:0,channel:0,cloud:0,com:0,consider:0,contact:0,content:0,data:0,deliv:0,design:0,develop:0,distribut:0,engin:0,enviro:0,expect:0,feedback:0,file:0,follow:0,github:0,have:0,here:0,http:0,insight:0,intens:0,interest:0,issu:0,ixek:0,learn:0,link:0,live:0,logo:0,love:0,machin:0,mail:0,maintain:0,manchest:0,mcnamara:0,mean:0,microsoft:0,name:0,open:0,origin:0,over:0,particularli:0,pleas:0,pycon:0,question:0,remix:0,reproduc:0,research:0,respect:0,robust:0,softwar:0,tania:0,team:0,thi:0,tool:0,trallard:0,twitter:0,under:0,use:0,used:0,using:0,via:0,where:0,which:0,work:0,workflow:0,workshop:0,year:0,you:0},titles:["Airflow tutorial","Getting started"],titleterms:{about:0,airflow:0,code:0,conduct:0,facilit:0,get:1,licens:0,start:1,tutori:0,your:0}}) -------------------------------------------------------------------------------- /docs/setup.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Getting started — Airflow tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 |
28 |
29 |
30 | 41 | 42 | 43 |
44 | 45 |
46 |

Getting started

47 |
48 | 49 | 50 |
51 | 62 | 63 |
64 |
65 | 121 |
122 |
123 | 134 | 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: airflow-env 2 | dependencies: 3 | - jupyter==1.0.0 4 | - jupyterlab==0.35.5 5 | - matplotlib==3.0.3 6 | - mysqlclient==1.3.14 7 | - numpy==1.16.3 8 | - pandas==0.24.2 9 | - scipy==1.2.1 10 | - seaborn==0.9.0 11 | - pip: 12 | - tweepy==3.7.0 13 | - hypothesis==4.18.0 14 | - celery==4.1.1 15 | - apache-airflow[celery,kubernetes,mysql,password,slack]==1.10.3 16 | - mysql-connector-python==8.0.16 17 | - papermill==1.0.0 -------------------------------------------------------------------------------- /make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -i https://pypi.org/simple 2 | alembic==0.9.10 3 | amqp==2.4.2 4 | ansiwrap==0.8.4 5 | apache-airflow[celery,kubernetes,mysql,password,slack]==1.10.3 6 | appnope==0.1.0 ; sys_platform == 'darwin' 7 | asn1crypto==0.24.0 8 | attrs==19.1.0 9 | babel==2.6.0 10 | backcall==0.1.0 11 | bcrypt==3.1.6 12 | billiard==3.5.0.5 13 | bleach==3.1.0 14 | cachetools==3.1.0 15 | celery==4.1.1 16 | certifi==2019.3.9 17 | cffi==1.12.3 18 | chardet==3.0.4 19 | click==7.0 20 | colorama==0.4.1 21 | configparser==3.5.3 22 | croniter==0.3.30 23 | cryptography==3.2 24 | cycler==0.10.0 25 | decorator==4.4.0 26 | defusedxml==0.6.0 27 | dill==0.2.9 28 | docutils==0.14 29 | entrypoints==0.3 30 | flask-admin==1.5.3 31 | flask-appbuilder==1.12.3 32 | flask-babel==0.12.2 33 | flask-bcrypt==0.7.1 34 | flask-caching==1.3.3 35 | flask-login==0.4.1 36 | flask-openid==1.2.5 37 | flask-sqlalchemy==2.4.0 38 | flask-swagger==0.2.13 39 | flask-wtf==0.14.2 40 | flask==1.0.2 41 | flower==0.9.3 42 | funcsigs==1.0.0 43 | future==0.16.0 44 | gitdb2==2.0.5 45 | gitpython==2.1.11 46 | google-auth==1.6.3 47 | gunicorn==19.9.0 48 | hypothesis==4.18.0 49 | idna==2.8 50 | ipykernel==5.1.0 51 | ipython-genutils==0.2.0 52 | ipython==7.5.0 ; python_version >= '3.3' 53 | ipywidgets==7.4.2 54 | iso8601==0.1.12 55 | itsdangerous==1.1.0 56 | jedi==0.13.3 57 | jinja2==2.10 58 | json-merge-patch==0.2 59 | jsonschema==3.0.1 60 | jupyter-client==5.2.4 61 | jupyter-console==6.0.0 62 | jupyter-core==4.4.0 63 | jupyter==1.0.0 64 | jupyterlab-server==0.2.0 65 | jupyterlab==0.35.5 66 | kiwisolver==1.1.0 67 | kombu==4.5.0 68 | kubernetes==9.0.0 69 | lockfile==0.12.2 70 | 
lxml==4.3.3 71 | mako==1.0.9 72 | markdown==2.6.11 73 | markupsafe==1.1.1 74 | matplotlib==3.0.3 75 | mistune==0.8.4 76 | mysql-connector-python==8.0.16 77 | mysqlclient==1.3.14 78 | nbconvert==5.5.0 79 | nbformat==4.4.0 80 | notebook==5.7.8 81 | numpy==1.16.3 82 | oauthlib==3.0.1 83 | ordereddict==1.1 84 | pandas==0.24.2 85 | pandocfilters==1.4.2 86 | papermill==1.0.0 87 | parso==0.4.0 88 | pendulum==1.4.4 89 | pexpect==4.7.0 ; sys_platform != 'win32' 90 | pickleshare==0.7.5 91 | prometheus-client==0.6.0 92 | prompt-toolkit==2.0.9 93 | protobuf==3.7.1 94 | psutil==5.6.2 95 | ptyprocess==0.6.0 ; os_name != 'nt' 96 | pyasn1-modules==0.2.5 97 | pyasn1==0.4.5 98 | pycparser==2.19 99 | pygments==2.3.1 100 | pyparsing==2.4.0 101 | pyrsistent==0.15.1 102 | pysocks==1.6.8 103 | python-daemon==2.1.2 104 | python-dateutil==2.8.0 105 | python-editor==1.0.4 106 | python3-openid==3.1.0 107 | pytz==2019.1 108 | pytzdata==2019.1 109 | pyyaml==5.1 110 | pyzmq==18.0.1 111 | qtconsole==4.4.3 112 | requests-oauthlib==1.2.0 113 | requests==2.21.0 114 | rsa==4.0 115 | scipy==1.2.1 116 | seaborn==0.9.0 117 | send2trash==1.5.0 118 | setproctitle==1.1.10 119 | six==1.12.0 120 | slackclient==1.3.1 121 | smmap2==2.0.5 122 | sqlalchemy==1.2.19 123 | tabulate==0.8.3 124 | tenacity==4.12.0 125 | terminado==0.8.2 126 | testpath==0.4.2 127 | text-unidecode==1.2 128 | textwrap3==0.9.2 129 | thrift==0.11.0 130 | tornado==5.1.1 131 | tqdm==4.31.1 132 | traitlets==4.3.2 133 | tweepy==3.7.0 134 | tzlocal==1.5.1 135 | unicodecsv==0.14.1 136 | urllib3==1.24.2 137 | vine==1.3.0 138 | wcwidth==0.1.7 139 | webencodings==0.5.1 140 | websocket-client==0.54.0 141 | werkzeug==0.14.1 142 | widgetsnbextension==3.4.2 143 | wtforms==2.2.1 144 | zope.deprecation==4.4.0 145 | -------------------------------------------------------------------------------- /solutions/dags/dags/generate_twitter.py: -------------------------------------------------------------------------------- 1 | """ Simple example of creating subdags and generating work dynamically""" 2 | from airflow import DAG 3 | from airflow.hooks import SqliteHook 4 | 5 | from airflow.hooks.mysql_hook import MySqlHook 6 | from airflow.models import Variable 7 | from airflow.operators.email_operator import EmailOperator 8 | from airflow.operators.python_operator import PythonOperator, BranchPythonOperator 9 | from airflow.operators.bash_operator import BashOperator 10 | from airflow.operators.subdag_operator import SubDagOperator 11 | 12 | 13 | from twitter_airflow import search_twitter, RAW_TWEET_DIR 14 | from subdags.twitter_subdag import subdag 15 | from datetime import datetime, timedelta 16 | import pandas as pd 17 | import re 18 | import random 19 | 20 | 21 | SEARCH_TERMS = ["#python", "#pydata", "#airflow", "data wrangling", "data pipelines"] 22 | 23 | 24 | default_args = { 25 | "owner": "admin", 26 | "depends_on_past": False, 27 | "start_date": datetime.now() - timedelta(days=4), 28 | "retries": 1, 29 | "retry_delay": timedelta(minutes=5), 30 | } 31 | 32 | dag = DAG( 33 | "generate_twitter_dags", default_args=default_args, schedule_interval="@daily" 34 | ) 35 | 36 | 37 | def fill_terms(my_terms=SEARCH_TERMS, **kwargs): 38 | """ Fill sqlite database with a few search terms. 
""" 39 | dbconn = MySqlHook(mysql_conn_id="mysql_default") 40 | conn = dbconn.get_connection() 41 | cursor = conn.cursor() 42 | df = pd.DataFrame(my_terms, columns=["search_term"]) 43 | try: 44 | df.to_sql("twitter_terms", conn) 45 | except ValueError: 46 | # table already exists 47 | pass 48 | 49 | 50 | def generate_search_terms(**kwargs): 51 | """ Generate subdag to search twitter for terms. """ 52 | dbconn = MySqlHook(mysql_conn_id="mysql_default") 53 | conn = dbconn.get_connection() 54 | cursor = conn.cursor() 55 | query = "select * from twitter_terms" 56 | df = pd.read_sql_query(query, conn) 57 | return random.choice( 58 | [ 59 | "search_{}_twitter".format(re.sub(r"\W+", "", t)) 60 | for t in df.search_term.values 61 | ] 62 | ) 63 | 64 | 65 | fill_search_terms = PythonOperator( 66 | task_id="fill_terms", provide_context=True, python_callable=fill_terms, dag=dag 67 | ) 68 | 69 | 70 | gen_search_terms = BranchPythonOperator( 71 | task_id="generate_search_terms", 72 | provide_context=True, 73 | python_callable=generate_search_terms, 74 | dag=dag, 75 | ) 76 | 77 | 78 | email_links = EmailOperator( 79 | task_id="email_best_links", 80 | to="MYEMAIL@MYSITE.com", 81 | subject="Latest popular links", 82 | html_content="Check out the latest!!", 83 | files=["{}/latest_links.txt".format(RAW_TWEET_DIR)], 84 | dag=dag, 85 | ) 86 | 87 | 88 | sub = SubDagOperator( 89 | subdag=subdag, task_id="insert_and_id_pop", trigger_rule="one_success", dag=dag 90 | ) 91 | 92 | 93 | clear_latest = BashOperator( 94 | bash_command="rm -rf {}/latest_links.txt".format(RAW_TWEET_DIR), 95 | task_id="clear_latest", 96 | dag=dag, 97 | ) 98 | 99 | 100 | gen_search_terms.set_upstream(fill_search_terms) 101 | 102 | for term in SEARCH_TERMS: 103 | term_without_punctuation = re.sub(r"\W+", "", term) 104 | simple_search = PythonOperator( 105 | task_id="search_{}_twitter".format(term_without_punctuation), 106 | provide_context=True, 107 | python_callable=search_twitter, 108 | dag=dag, 109 | params={"query": term}, 110 | ) 111 | simple_search.set_upstream(gen_search_terms) 112 | simple_search.set_downstream(sub) 113 | 114 | sub.set_downstream(email_links) 115 | email_links.set_downstream(clear_latest) 116 | -------------------------------------------------------------------------------- /solutions/dags/dags/parameters.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example uses the existing Dummy Operator and Variable model to 3 | demonstrate dynamic creation of DAGs based on a Variable setting. As 4 | shown below, a list of customer objects is retrieved and used to create 5 | unique dags based on the imput. 
6 | """ 7 | 8 | from datetime import datetime, timedelta 9 | from airflow.models import DAG 10 | from airflow.models import Variable 11 | from airflow.operators.dummy_operator import DummyOperator 12 | 13 | # Create JSON Variable if it doesn't exist 14 | 15 | CUSTOMERS = [ 16 | { 17 | "customer_name": "Faux Customer", 18 | "customer_id": "faux_customer", 19 | "email": ["admin@fauxcustomer.com", "admin@astronomer.io"], 20 | "schedule_interval": None, 21 | "enabled": True, 22 | }, 23 | { 24 | "customer_name": "Bogus Customer", 25 | "customer_id": "bogus_customer", 26 | "email": ["admin@boguscustomer.com", "admin@astronomer.io"], 27 | "schedule_interval": "@once", 28 | "enabled": True, 29 | }, 30 | ] 31 | 32 | # Get JSON Variable 33 | CUSTOMERS = Variable.get("customer_list", default_var=CUSTOMERS, deserialize_json=True) 34 | 35 | 36 | def create_dag(customer): 37 | """ 38 | Accepts a customer parameters dict and 39 | overrides default args to create a DAG object 40 | 41 | Returns: DAG() Object 42 | """ 43 | default_args = { 44 | "owner": "airflow", 45 | "depends_on_past": False, 46 | "email": "xyz@xyz.com", 47 | "retries": 1, 48 | "retry_delay": timedelta(minutes=5), 49 | "start_date": datetime(2017, 1, 1, 0, 0), 50 | "end_date": None, 51 | } 52 | 53 | """ 54 | This allows DAG parameters to be passed in from the Variable if 55 | a customer needs something specific overridden in their DAG. 56 | Consider how email being passed in from the customer object 57 | overrides email in the resulting replaced_args object. 58 | """ 59 | replaced_args = { 60 | k: default_args[k] if customer.get(k, None) is None else customer[k] 61 | for k in default_args 62 | } 63 | 64 | dag_id = "{base_name}_{id}".format( 65 | base_name="load_clickstream_data", id=customer["customer_id"] 66 | ) 67 | 68 | return DAG( 69 | dag_id=dag_id, 70 | default_args=replaced_args, 71 | schedule_interval=customer["schedule_interval"], 72 | ) 73 | 74 | # Loop customers array of containing customer objects 75 | for cust in CUSTOMERS: 76 | if cust["enabled"]: 77 | 78 | dag = create_dag(cust) 79 | 80 | globals()[dag.dag_id] = dag 81 | 82 | extract = DummyOperator(task_id="extract_data", dag=dag) 83 | 84 | transform = DummyOperator(task_id="transform_data", dag=dag) 85 | 86 | load = DummyOperator(task_id="load_data", dag=dag) 87 | 88 | extract >> transform >> load 89 | 90 | else: 91 | # TODO Create but programmatically pause 92 | pass 93 | -------------------------------------------------------------------------------- /solutions/dags/dags/simple_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | 7 | 8 | def print_hello(): 9 | return "Hello world!" 
10 | 11 | 12 | default_args = { 13 | "owner": "airflow", 14 | "depends_on_past": False, 15 | "start_date": datetime(2019, 4, 30), 16 | "email": ["airflow@example.com"], 17 | "email_on_failure": False, 18 | "email_on_retry": False, 19 | "retries": 1, 20 | "retry_delay": timedelta(minutes=2), 21 | } 22 | 23 | dag = DAG( 24 | "hello_world", 25 | description="Simple tutorial DAG", 26 | schedule_interval="0 12 * * *", 27 | default_args=default_args, 28 | catchup=False, 29 | ) 30 | 31 | t1 = DummyOperator(task_id="dummy_task", retries=3, dag=dag) 32 | 33 | t2 = PythonOperator(task_id="hello_task", python_callable=print_hello, dag=dag) 34 | 35 | # sets downstream foe t1 36 | t1 >> t2 37 | 38 | # equivalent 39 | # t2.set_upstream(t1) 40 | -------------------------------------------------------------------------------- /solutions/dags/dags/subdags/twitter_subdag.py: -------------------------------------------------------------------------------- 1 | """ Simple subdag example """ 2 | from airflow import DAG 3 | from airflow.operators import PythonOperator 4 | from twitter_airflow import csv_to_sql, identify_popular_links 5 | from datetime import datetime, timedelta 6 | 7 | 8 | default_args = { 9 | "owner": "admin", 10 | "depends_on_past": False, 11 | "start_date": datetime(2016, 1, 1), 12 | "retries": 1, 13 | "retry_delay": timedelta(minutes=5), 14 | } 15 | 16 | subdag = DAG("generate_twitter_dags.insert_and_id_pop", default_args=default_args) 17 | 18 | move_tweets_to_sql = PythonOperator( 19 | task_id="csv_to_sqlite", 20 | provide_context=True, 21 | python_callable=csv_to_sql, 22 | dag=subdag, 23 | ) 24 | 25 | id_popular = PythonOperator( 26 | task_id="identify_popular_links", 27 | provide_context=True, 28 | python_callable=identify_popular_links, 29 | dag=subdag, 30 | params={"write_mode": "a"}, 31 | ) 32 | 33 | id_popular.set_upstream(move_tweets_to_sql) 34 | -------------------------------------------------------------------------------- /solutions/dags/dags/twitter_airflow.py: -------------------------------------------------------------------------------- 1 | """ Simple Airflow data pipeline example using Twitter API """ 2 | import ast 3 | import glob 4 | import itertools 5 | import os.path 6 | import shutil 7 | from collections import Counter 8 | from configparser import ConfigParser 9 | from csv import DictWriter, writer 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | import MySQLdb 13 | import MySQLdb.cursors 14 | 15 | import pandas as pd 16 | from tweepy import API, Cursor, OAuthHandler 17 | 18 | from airflow import DAG 19 | from airflow.hooks import sqlite_hook 20 | from airflow.hooks.mysql_hook import MySqlHook 21 | from airflow.models import Variable 22 | from airflow.operators.email_operator import EmailOperator 23 | from airflow.operators.python_operator import PythonOperator 24 | 25 | RAW_TWEET_DIR = os.path.abspath(os.path.join(__file__, "../data/tweets/")) 26 | CONFIG_FILE = os.path.abspath(os.path.join(__file__, "../config/prod.cfg")) 27 | MAX_TWEEPY_PAGE = 2 28 | 29 | # since there do not exist task on their own we need to create the DAG 30 | default_args = { 31 | "owner": "admin", 32 | "depends_on_past": False, 33 | "start_date": datetime.now() - timedelta(days=5), 34 | "retries": 1, 35 | "retry_delay": timedelta(minutes=5), 36 | } 37 | 38 | dag = DAG("twitter_links", default_args=default_args, schedule_interval="@daily") 39 | 40 | 41 | def extract_tweet_data(tweepy_obj, query): 42 | """ Extract relevant and serializable data from a tweepy Tweet 
object 43 | params: 44 | tweepy_obj: Tweepy Tweet Object 45 | query: str 46 | returns dict 47 | """ 48 | return { 49 | "user_id": tweepy_obj.user.id, 50 | "user_name": tweepy_obj.user.name, 51 | "user_screenname": tweepy_obj.user.screen_name, 52 | "user_url": tweepy_obj.user.url, 53 | "user_description": tweepy_obj.user.description, 54 | "user_followers": tweepy_obj.user.followers_count, 55 | "user_friends": tweepy_obj.user.friends_count, 56 | "created": tweepy_obj.created_at.isoformat(), 57 | "text": tweepy_obj.text, 58 | "hashtags": [ht.get("text") for ht in tweepy_obj.entities.get("hashtags")], 59 | "mentions": [ 60 | (um.get("id"), um.get("screen_name")) 61 | for um in tweepy_obj.entities.get("user_mentions") 62 | ], 63 | "urls": [url.get("expanded_url") for url in tweepy_obj.entities.get("urls")], 64 | "tweet_id": tweepy_obj.id, 65 | "is_quote_status": tweepy_obj.is_quote_status, 66 | "favorite_count": tweepy_obj.favorite_count, 67 | "retweet_count": tweepy_obj.retweet_count, 68 | "reply_status_id": tweepy_obj.in_reply_to_status_id, 69 | "lang": tweepy_obj.lang, 70 | "source": tweepy_obj.source, 71 | "location": tweepy_obj.coordinates, 72 | "query": query, 73 | } 74 | 75 | 76 | def search_twitter(**kwargs): 77 | """ Search for a query in public tweets""" 78 | query = kwargs.get("params").get("query") 79 | 80 | auth = OAuthHandler(Variable.get("consumer_key"), Variable.get("consumer_secret")) 81 | auth.set_access_token( 82 | Variable.get("access_token"), Variable.get("access_token_secret") 83 | ) 84 | api = API(auth) 85 | 86 | all_tweets = [] 87 | page_num = 0 88 | since_date = datetime.strptime(kwargs.get("ds"), "%Y-%m-%d").date() - timedelta( 89 | days=1 90 | ) 91 | query += " since:{} until:{}".format( 92 | since_date.strftime("%Y-%m-%d"), kwargs.get("ds") 93 | ) 94 | print(f"searching twitter with: {query}") 95 | for page in Cursor( 96 | api.search, q=query, monitor_rate_limit=True, wait_on_rate_limit=True 97 | ).pages(): 98 | all_tweets.extend([extract_tweet_data(t, query) for t in page]) 99 | page_num += 1 100 | if page_num > MAX_TWEEPY_PAGE: 101 | break 102 | 103 | # if it's an empty list, stop here 104 | if not len(all_tweets): 105 | return 106 | 107 | filename = "{}/{}_{}.csv".format( 108 | RAW_TWEET_DIR, query, datetime.now().strftime("%m%d%Y%H%M%S") 109 | ) 110 | 111 | # check that the directory exists 112 | if not Path(filename).resolve().parent.exists(): 113 | 114 | os.mkdir(Path(filename).resolve().parent) 115 | 116 | with open(filename, "w") as raw_file: 117 | raw_wrtr = DictWriter(raw_file, fieldnames=all_tweets[0].keys()) 118 | raw_wrtr.writeheader() 119 | raw_wrtr.writerows(all_tweets) 120 | 121 | 122 | def csv_to_sql(directory=RAW_TWEET_DIR, **kwargs): 123 | """ csv to sql pipeline using pandas 124 | params: 125 | directory: str (file path to csv files) 126 | """ 127 | dbconn = MySqlHook(mysql_conn_id="mysql_default") 128 | conn = dbconn.get_connection() 129 | cursor = conn.cursor() 130 | 131 | for fname in glob.glob("{}/*.csv".format(directory)): 132 | if "_read" not in fname: 133 | try: 134 | df = pd.read_csv(fname) 135 | df.to_sql("tweets", dbconn, if_exists="append", index=False) 136 | shutil.move(fname, fname.replace(".csv", "_read.csv")) 137 | except pd.io.common.EmptyDataError: 138 | # probably an io error with another task / open file 139 | continue 140 | 141 | 142 | def identify_popular_links(directory=RAW_TWEET_DIR, write_mode="w", **kwargs): 143 | """ Identify the most popular links from the last day of tweest in the db 144 | Writes them to 
latest_links.txt in the RAW_TWEET_DIR 145 | (or directory kwarg) 146 | """ 147 | dbconn = MySqlHook(mysql_conn_id="mysql_default") 148 | conn = dbconn.get_connection() 149 | cursor = conn.cursor() 150 | 151 | query = """select * from tweets where 152 | created > date('now', '-1 days') and urls is not null 153 | order by favorite_count""" 154 | df = pd.read_sql_query(query, conn) 155 | df.urls = df.urls.map(ast.literal_eval) 156 | cntr = Counter(itertools.chain.from_iterable(df.urls.values)) 157 | with open("{}/latest_links.txt".format(directory), write_mode) as latest: 158 | wrtr = writer(latest) 159 | wrtr.writerow(["url", "count"]) 160 | wrtr.writerows(cntr.most_common(5)) 161 | 162 | 163 | # -------------------------------------- 164 | # Tasks 165 | # ------------------------------------- 166 | simple_search = PythonOperator( 167 | task_id="search_twitter", 168 | provide_context=True, 169 | python_callable=search_twitter, 170 | dag=dag, 171 | # note we pass this as a params obj 172 | params={"query": "#pycon"}, 173 | ) 174 | 175 | 176 | move_tweets_to_sql = PythonOperator( 177 | task_id="csv_to_sql", 178 | # extra DAG context 179 | provide_context=True, 180 | # call the function 181 | python_callable=csv_to_sql, 182 | dag=dag, 183 | ) 184 | 185 | 186 | id_popular = PythonOperator( 187 | task_id="identify_popular_links", 188 | provide_context=True, 189 | python_callable=identify_popular_links, 190 | dag=dag, 191 | ) 192 | 193 | 194 | email_links = EmailOperator( 195 | task_id="email_best_links", 196 | to="trallard@bitsandchips.me", 197 | subject="Latest popular links", 198 | html_content="Check out the latest!!", 199 | files=["{}/latest_links.txt".format(RAW_TWEET_DIR)], 200 | dag=dag, 201 | ) 202 | 203 | 204 | simple_search.set_downstream(move_tweets_to_sql) 205 | id_popular.set_upstream(move_tweets_to_sql) 206 | email_links.set_upstream(id_popular) 207 | -------------------------------------------------------------------------------- /solutions/dags/twitter_airflow.py: -------------------------------------------------------------------------------- 1 | """ Simple Airflow data pipeline example using Twitter API """ 2 | import ast 3 | import glob 4 | import itertools 5 | import os.path 6 | import shutil 7 | from collections import Counter 8 | from configparser import ConfigParser 9 | from csv import DictWriter, writer 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | import MySQLdb 13 | import MySQLdb.cursors 14 | 15 | import pandas as pd 16 | from tweepy import API, Cursor, OAuthHandler 17 | 18 | from airflow import DAG 19 | from airflow.hooks import sqlite_hook 20 | from airflow.hooks.mysql_hook import MySqlHook 21 | from airflow.models import Variable 22 | from airflow.operators.email_operator import EmailOperator 23 | from airflow.operators.python_operator import PythonOperator 24 | 25 | RAW_TWEET_DIR = os.path.abspath(os.path.join(__file__, "../data/tweets/")) 26 | CONFIG_FILE = os.path.abspath(os.path.join(__file__, "../config/prod.cfg")) 27 | MAX_TWEEPY_PAGE = 2 28 | 29 | # since there do not exist task on their own we need to create the DAG 30 | default_args = { 31 | "owner": "admin", 32 | "depends_on_past": False, 33 | "start_date": datetime.now() - timedelta(days=5), 34 | "retries": 1, 35 | "retry_delay": timedelta(minutes=5), 36 | } 37 | 38 | dag = DAG("twitter_links", default_args=default_args, schedule_interval="@daily") 39 | 40 | 41 | def extract_tweet_data(tweepy_obj, query): 42 | """ Extract relevant and serializable data from a tweepy Tweet 
object 43 | params: 44 | tweepy_obj: Tweepy Tweet Object 45 | query: str 46 | returns dict 47 | """ 48 | return { 49 | "user_id": tweepy_obj.user.id, 50 | "user_name": tweepy_obj.user.name, 51 | "user_screenname": tweepy_obj.user.screen_name, 52 | "user_url": tweepy_obj.user.url, 53 | "user_description": tweepy_obj.user.description, 54 | "user_followers": tweepy_obj.user.followers_count, 55 | "user_friends": tweepy_obj.user.friends_count, 56 | "created": tweepy_obj.created_at.isoformat(), 57 | "text": tweepy_obj.text, 58 | "hashtags": [ht.get("text") for ht in tweepy_obj.entities.get("hashtags")], 59 | "mentions": [ 60 | (um.get("id"), um.get("screen_name")) 61 | for um in tweepy_obj.entities.get("user_mentions") 62 | ], 63 | "urls": [url.get("expanded_url") for url in tweepy_obj.entities.get("urls")], 64 | "tweet_id": tweepy_obj.id, 65 | "is_quote_status": tweepy_obj.is_quote_status, 66 | "favorite_count": tweepy_obj.favorite_count, 67 | "retweet_count": tweepy_obj.retweet_count, 68 | "reply_status_id": tweepy_obj.in_reply_to_status_id, 69 | "lang": tweepy_obj.lang, 70 | "source": tweepy_obj.source, 71 | "location": tweepy_obj.coordinates, 72 | "query": query, 73 | } 74 | 75 | 76 | def search_twitter(**kwargs): 77 | """ Search for a query in public tweets""" 78 | query = kwargs.get("params").get("query") 79 | 80 | auth = OAuthHandler(Variable.get("consumer_key"), Variable.get("consumer_secret")) 81 | auth.set_access_token( 82 | Variable.get("access_token"), Variable.get("access_token_secret") 83 | ) 84 | api = API(auth) 85 | 86 | all_tweets = [] 87 | page_num = 0 88 | since_date = datetime.strptime(kwargs.get("ds"), "%Y-%m-%d").date() - timedelta( 89 | days=1 90 | ) 91 | query += " since:{} until:{}".format( 92 | since_date.strftime("%Y-%m-%d"), kwargs.get("ds") 93 | ) 94 | print(f"searching twitter with: {query}") 95 | for page in Cursor( 96 | api.search, q=query, monitor_rate_limit=True, wait_on_rate_limit=True 97 | ).pages(): 98 | all_tweets.extend([extract_tweet_data(t, query) for t in page]) 99 | page_num += 1 100 | if page_num > MAX_TWEEPY_PAGE: 101 | break 102 | 103 | # if it's an empty list, stop here 104 | if not len(all_tweets): 105 | return 106 | 107 | filename = "{}/{}_{}.csv".format( 108 | RAW_TWEET_DIR, query, datetime.now().strftime("%m%d%Y%H%M%S") 109 | ) 110 | 111 | # check that the directory exists 112 | if not Path(filename).resolve().parent.exists(): 113 | 114 | os.mkdir(Path(filename).resolve().parent) 115 | 116 | with open(filename, "w") as raw_file: 117 | raw_wrtr = DictWriter(raw_file, fieldnames=all_tweets[0].keys()) 118 | raw_wrtr.writeheader() 119 | raw_wrtr.writerows(all_tweets) 120 | 121 | 122 | def csv_to_sql(directory=RAW_TWEET_DIR, **kwargs): 123 | """ csv to sql pipeline using pandas 124 | params: 125 | directory: str (file path to csv files) 126 | """ 127 | dbconn = MySqlHook(mysl_conn_id="mysql_default") 128 | cursor = dbconn.get_cursor() 129 | 130 | for fname in glob.glob("{}/*.csv".format(directory)): 131 | if "_read" not in fname: 132 | try: 133 | df = pd.read_csv(fname) 134 | df.to_sql("tweets", dbconn, if_exists="append", index=False) 135 | shutil.move(fname, fname.replace(".csv", "_read.csv")) 136 | except pd.io.common.EmptyDataError: 137 | # probably an io error with another task / open file 138 | continue 139 | 140 | 141 | def identify_popular_links(directory=RAW_TWEET_DIR, write_mode="w", **kwargs): 142 | """ Identify the most popular links from the last day of tweest in the db 143 | Writes them to latest_links.txt in the RAW_TWEET_DIR 144 | (or 
directory kwarg) 145 | """ 146 | dbconn = MySqlHook(mysl_conn_id="mysql_default") 147 | cursor = dbconn.cursor() 148 | 149 | query = """select * from tweets where 150 | created > date('now', '-1 days') and urls is not null 151 | order by favorite_count""" 152 | df = pd.read_sql_query(query, conn) 153 | df.urls = df.urls.map(ast.literal_eval) 154 | cntr = Counter(itertools.chain.from_iterable(df.urls.values)) 155 | with open("{}/latest_links.txt".format(directory), write_mode) as latest: 156 | wrtr = writer(latest) 157 | wrtr.writerow(["url", "count"]) 158 | wrtr.writerows(cntr.most_common(5)) 159 | 160 | 161 | # -------------------------------------- 162 | # Tasks 163 | # ------------------------------------- 164 | simple_search = PythonOperator( 165 | task_id="search_twitter", 166 | provide_context=True, 167 | python_callable=search_twitter, 168 | dag=dag, 169 | # note we pass this as a params obj 170 | params={"query": "#pycon"}, 171 | ) 172 | 173 | 174 | move_tweets_to_sql = PythonOperator( 175 | task_id="csv_to_sql", 176 | # extra DAG context 177 | provide_context=True, 178 | # call the function 179 | python_callable=csv_to_sql, 180 | dag=dag, 181 | ) 182 | 183 | 184 | id_popular = PythonOperator( 185 | task_id="identify_popular_links", 186 | provide_context=True, 187 | python_callable=identify_popular_links, 188 | dag=dag, 189 | ) 190 | 191 | 192 | email_links = EmailOperator( 193 | task_id="email_best_links", 194 | to="trallard@bitsandchips.me", 195 | subject="Latest popular links", 196 | html_content="Check out the latest!!", 197 | files=["{}/latest_links.txt".format(RAW_TWEET_DIR)], 198 | dag=dag, 199 | ) 200 | 201 | 202 | simple_search.set_downstream(move_tweets_to_sql) 203 | id_popular.set_upstream(move_tweets_to_sql) 204 | email_links.set_upstream(id_popular) 205 | -------------------------------------------------------------------------------- /solutions/etl-basic/analyse_twitter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import re 4 | from datetime import datetime 5 | from pathlib import Path 6 | 7 | import matplotlib.pyplot as plt 8 | import mysql.connector as mysql 9 | import numpy as np 10 | import pandas as pd 11 | 12 | # import the previously created functions 13 | from stream_twitter import connect_db 14 | 15 | # Details for our MySql connection 16 | DATABASE = { 17 | "host": "localhost", 18 | "user": "airflow", 19 | "password": "python2019", 20 | "db": "airflowdb", 21 | } 22 | 23 | # ---------------------------------------------- 24 | # Database related functions 25 | # ---------------------------------------------- 26 | 27 | 28 | def sql_to_csv(my_database, my_table): 29 | 30 | dbconnect = connect_db(my_database) 31 | 32 | cursor = dbconnect.cursor() 33 | 34 | query = f"SELECT * FROM {table}" 35 | all_tweets = pd.read_sql_query(query, dbconnect) 36 | 37 | if os.path.exists("./data"): 38 | all_tweets.to_csv("./data/raw_tweets.csv", index=False) 39 | 40 | else: 41 | os.mkdir("./data") 42 | all_tweets.to_csv("./data/raw_tweets.csv", index=False) 43 | 44 | 45 | def sql_to_df(my_database, my_table): 46 | dbconnect = connect_db(my_database) 47 | 48 | cursor = dbconnect.cursor() 49 | 50 | query = f"SELECT * FROM {my_table}" 51 | 52 | # store in dataframe 53 | 54 | df = pd.read_sql_query(query, dbconnect, index_col="id") 55 | 56 | cursor.close() 57 | dbconnect.close() 58 | 59 | return df 60 | 61 | 62 | # ---------------------------------------------- 63 | # Data processing 64 | # 
---------------------------------------------- 65 | 66 | 67 | def clean_data(df): 68 | 69 | # Make all usernames lowercase 70 | clean_df = df.copy() 71 | clean_df["user"] = df["user"].str.lower() 72 | 73 | # keep only non RT 74 | clean_df = clean_df[~clean_df["tweet"].str.contains("RT")] 75 | 76 | return clean_df 77 | 78 | 79 | def create_plots(df): 80 | x = df["language"].unique() 81 | fig, ax = plt.subplots() 82 | countries = df["language"].value_counts() 83 | plt.bar(range(len(countries)), countries) 84 | fig.suptitle("Language counts") 85 | plt.xlabel("languages") 86 | plt.ylabel("count") 87 | ax.set_xticklabels(x) 88 | 89 | if os.path.exists("./plots"): 90 | fig.savefig("./plots/barchart_lang.png") 91 | 92 | else: 93 | os.mkdir("./plots") 94 | fig.savefig("./plots/barchart_lang.png") 95 | 96 | 97 | def save_df(df): 98 | today = datetime.today().strftime("%Y-%m-%d") 99 | 100 | if os.path.exists("./data"): 101 | df.to_csv(f"./data/{today}-clean-df.csv", index=None) 102 | 103 | else: 104 | os.mkdir("./data") 105 | df.to_csv(f"./data/{today}-clean-df.csv", index=None) 106 | 107 | 108 | if __name__ == "__main__": 109 | 110 | df = sql_to_df(DATABASE, "tweets_long") 111 | print("Database loaded in df") 112 | 113 | clean_df = clean_data(df) 114 | 115 | create_plots(clean_df) 116 | 117 | save_df(clean_df) 118 | -------------------------------------------------------------------------------- /solutions/etl-basic/etl.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python etl-basic/stream_twitter_timed.py 4 | 5 | echo "Completed extraction starting cleaning" 6 | 7 | python etl-basic/analyse_twitter.py -------------------------------------------------------------------------------- /solutions/etl-basic/stream_twitter.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------ 2 | # This script is used to stream Twitter data into a MySQL my_database 3 | # note that for this you need an approved Twitter develope account 4 | # an app and the keys for said app 5 | # ------------------------------------------------------------------ 6 | 7 | # Import libraries needed 8 | import sys 9 | import json 10 | import time 11 | from configparser import ConfigParser 12 | from pathlib import Path 13 | 14 | import tweepy 15 | from dateutil import parser 16 | from mysql import connector as mysql 17 | 18 | # Path to the config file with the keys make sure not to commit this file 19 | CONFIG_FILE = Path.cwd() / "config.cfg" 20 | 21 | # Details for our MySql connection 22 | DATABASE = { 23 | "host": "localhost", 24 | "user": "airflow", 25 | "password": "python2019", 26 | "db": "airflowdb", 27 | } 28 | 29 | # ---------------------------------------------- 30 | # Database related functions 31 | # ---------------------------------------------- 32 | 33 | 34 | def connect_db(my_database): 35 | """Connect to a given my_database 36 | 37 | Args: 38 | my_database(dict): dictionary with the my_database details 39 | 40 | Returns: 41 | dbconnect: MySql my_database connection object 42 | """ 43 | try: 44 | dbconnect = mysql.connect( 45 | host=my_database.get("host"), 46 | user=my_database.get("user"), 47 | password=my_database.get("password"), 48 | db=my_database.get("db"), 49 | ) 50 | print("connected") 51 | return dbconnect 52 | except mysql.Error as e: 53 | print(e) 54 | 55 | 56 | def create_table(my_database, new_table): 57 | """Create new table in a my_database 
58 | 59 | Args: 60 | my_database (dict): details for the db 61 | new_table (str): name of the table to create 62 | """ 63 | 64 | dbconnect = connect_db(my_database) 65 | 66 | # create a cursor for the queries 67 | cursor = dbconnect.cursor() 68 | cursor.execute("USE airflowdb") 69 | 70 | # here we delete the table, it can be kept or else 71 | cursor.execute(f"DROP TABLE IF EXISTS {new_table}") 72 | 73 | # these matches the Twitter data 74 | query = ( 75 | f"CREATE TABLE `{new_table}` (" 76 | " `id` INT(11) NOT NULL AUTO_INCREMENT," 77 | " `user` varchar(100) NOT NULL ," 78 | " `created_at` timestamp," 79 | " `tweet` varchar(255) NOT NULL," 80 | " `retweet_count` int(11) ," 81 | " `id_str` varchar(100)," 82 | " PRIMARY KEY (`id`))" 83 | ) 84 | 85 | cursor.execute(query) 86 | dbconnect.close() 87 | cursor.close() 88 | 89 | return print(f"Created {new_table} table") 90 | 91 | 92 | def populate_table( 93 | user, created_at, tweet, retweet_count, id_str, my_database=DATABASE 94 | ): 95 | """Populate a given table witht he Twitter collected data 96 | 97 | Args: 98 | user (str): username from the status 99 | created_at (datetime): when the tweet was created 100 | tweet (str): text 101 | retweet_count (int): number of retweets 102 | id_str (int): unique id for the tweet 103 | """ 104 | 105 | dbconnect = connect_db(DATABASE) 106 | 107 | cursor = dbconnect.cursor() 108 | cursor.execute("USE airflowdb") 109 | 110 | query = "INSERT INTO tweets (user, created_at, tweet, retweet_count, id_str) VALUES (%s, %s, %s, %s, %s)" 111 | 112 | try: 113 | cursor.execute(query, (user, created_at, tweet, retweet_count, id_str)) 114 | dbconnect.commit() 115 | print("commited") 116 | 117 | except mysql.Error as e: 118 | print(e) 119 | dbconnect.rollback() 120 | 121 | cursor.close() 122 | dbconnect.close() 123 | 124 | return 125 | 126 | 127 | # ---------------------------------------------- 128 | # Access the Twitter API 129 | # ---------------------------------------------- 130 | 131 | 132 | def connectTwitter(): 133 | config = ConfigParser() 134 | config.read(CONFIG_FILE) 135 | 136 | # Authenticate to Twitter 137 | auth = tweepy.OAuthHandler( 138 | config.get("twitter", "consumer_key"), config.get("twitter", "consumer_secret") 139 | ) 140 | auth.set_access_token( 141 | config.get("twitter", "access_token"), 142 | config.get("twitter", "access_token_secret"), 143 | ) 144 | 145 | # Create Twitter API object 146 | twitter = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True) 147 | 148 | print(f"🦄 Connected as {twitter.me().screen_name}") 149 | 150 | return twitter 151 | 152 | 153 | class customListener(tweepy.StreamListener): 154 | """We need to create an instance of the Stream Listener 155 | http://docs.tweepy.org/en/v3.4.0/streaming_how_to.html 156 | """ 157 | 158 | def on_error(self, status_code): 159 | if status_code == 420: 160 | # returning False in on_data disconnects the stream 161 | return False 162 | 163 | def on_status(self, status): 164 | print(status.text) 165 | return True 166 | 167 | def on_data(self, data): 168 | """ 169 | Automatic detection of the kind of data collected from Twitter 170 | This method reads in tweet data as Json and extracts the data we want. 
171 | """ 172 | try: 173 | # parse as json 174 | raw_data = json.loads(data) 175 | 176 | # extract the relevant data 177 | if "text" in raw_data: 178 | user = raw_data["user"]["screen_name"] 179 | created_at = parser.parse(raw_data["created_at"]) 180 | tweet = raw_data["text"] 181 | retweet_count = raw_data["retweet_count"] 182 | id_str = raw_data["id_str"] 183 | 184 | # insert data just collected into MySQL my_database 185 | populate_table(user, created_at, tweet, retweet_count, id_str) 186 | print(f"Tweet colleted at: {created_at}") 187 | 188 | except Error as e: 189 | print(e) 190 | 191 | 192 | def start_stream(stream, **kwargs): 193 | """Start the stream, prints the disconnection error 194 | 195 | Args: 196 | stream (obj): stream object to start 197 | """ 198 | 199 | try: 200 | stream.filter(**kwargs) 201 | except Exception: 202 | stream.disconnect() 203 | print("Fatal exception") 204 | 205 | 206 | if __name__ == "__main__": 207 | create_table(DATABASE, "tweets") 208 | # first we need to authenticate 209 | twitter = connectTwitter() 210 | 211 | # next: create stream listener 212 | myStreamListener = customListener() 213 | myStream = tweepy.Stream(auth=twitter.auth, listener=myStreamListener, timeout=30) 214 | 215 | # stream tweets using the filter method 216 | version = float(f"{sys.version_info[0]}.{sys.version_info[1]}") 217 | if version >= 3.7: 218 | kwargs = { 219 | 'track': ["python", "pycon", "jupyter", "#pycon2019"], 220 | 'is_async': True 221 | } 222 | else: 223 | kwargs = { 224 | 'track': ["python", "pycon", "jupyter", "#pycon2019"], 225 | 'async': True 226 | } 227 | pass 228 | start_stream(myStream, **kwargs) 229 | pass 230 | 231 | -------------------------------------------------------------------------------- /solutions/etl-basic/stream_twitter_alt.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------ 2 | # This script is used to stream Twitter data into a MySQL my_database 3 | # note that for this you need an approved Twitter develope account 4 | # an app and the keys for said app 5 | # ------------------------------------------------------------------ 6 | 7 | # Import libraries needed 8 | import sys 9 | import json 10 | import time 11 | from configparser import ConfigParser 12 | from pathlib import Path 13 | 14 | import tweepy 15 | from dateutil import parser 16 | from mysql import connector as mysql 17 | 18 | # Path to the config file with the keys make sure not to commit this file 19 | CONFIG_FILE = Path.cwd() / "config.cfg" 20 | 21 | # Details for our MySql connection 22 | DATABASE = { 23 | "host": "localhost", 24 | "user": "airflow", 25 | "password": "python2019", 26 | "db": "airflowdb", 27 | } 28 | 29 | MAX_TWEEPY_PAGE = 300 30 | 31 | 32 | # ---------------------------------------------- 33 | # Database related functions 34 | # ---------------------------------------------- 35 | 36 | 37 | def connect_db(my_database): 38 | """Connect to a given my_database 39 | 40 | Args: 41 | my_database(dict): dictionary with the my_database details 42 | 43 | Returns: 44 | dbconnect: MySql my_database connection object 45 | """ 46 | try: 47 | dbconnect = mysql.connect( 48 | host=my_database.get("host"), 49 | user=my_database.get("user"), 50 | password=my_database.get("password"), 51 | db=my_database.get("db"), 52 | ) 53 | print("connected") 54 | return dbconnect 55 | except mysql.Error as e: 56 | print(e) 57 | 58 | 59 | def create_table(my_database, new_table): 60 | """Create new 
table in a my_database 61 | 62 | Args: 63 | my_database (dict): details for the db 64 | new_table (str): name of the table to create 65 | """ 66 | 67 | dbconnect = connect_db(my_database) 68 | 69 | # create a cursor for the queries 70 | cursor = dbconnect.cursor() 71 | cursor.execute("USE airflowdb") 72 | 73 | # here we delete the table, it can be kept or else 74 | cursor.execute(f"DROP TABLE IF EXISTS {new_table}") 75 | 76 | # these matches the Twitter data 77 | query = ( 78 | f"CREATE TABLE `{new_table}` (" 79 | " `id` INT(11) NOT NULL AUTO_INCREMENT," 80 | " `user` varchar(100) NOT NULL ," 81 | " `created_at` timestamp," 82 | " `tweet` varchar(255) NOT NULL," 83 | " `retweet_count` int(11) ," 84 | " `id_str` varchar(100)," 85 | " `country` varchar(255)," 86 | " `followers` varchar(100)," 87 | " `language` varchar(100)," 88 | " PRIMARY KEY (`id`))" 89 | ) 90 | 91 | cursor.execute(query) 92 | dbconnect.close() 93 | cursor.close() 94 | 95 | return print(f"Created {new_table} table") 96 | 97 | 98 | def populate_table( 99 | user, 100 | created_at, 101 | tweet, 102 | retweet_count, 103 | id_str, 104 | country, 105 | followers, 106 | language, 107 | my_table, 108 | my_database=DATABASE, 109 | ): 110 | """Populate a given table witht he Twitter collected data 111 | 112 | Args: 113 | user (str): username from the status 114 | created_at (datetime): when the tweet was created 115 | tweet (str): text 116 | retweet_count (int): number of retweets 117 | id_str (int): unique id for the tweet 118 | """ 119 | 120 | dbconnect = connect_db(DATABASE) 121 | 122 | cursor = dbconnect.cursor() 123 | cursor.execute("USE airflowdb") 124 | 125 | query = f"INSERT INTO {my_table} (user, created_at, tweet, retweet_count, id_str, country, followers, language) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)" 126 | 127 | try: 128 | cursor.execute( 129 | query, 130 | ( 131 | user, 132 | created_at, 133 | tweet, 134 | retweet_count, 135 | id_str, 136 | country, 137 | followers, 138 | language, 139 | ), 140 | ) 141 | dbconnect.commit() 142 | print("commited") 143 | 144 | except mysql.Error as e: 145 | print(e) 146 | dbconnect.rollback() 147 | 148 | cursor.close() 149 | dbconnect.close() 150 | 151 | return 152 | 153 | 154 | # ---------------------------------------------- 155 | # Access the Twitter API 156 | # ---------------------------------------------- 157 | 158 | 159 | def connectTwitter(): 160 | config = ConfigParser() 161 | config.read(CONFIG_FILE) 162 | 163 | # Authenticate to Twitter 164 | auth = tweepy.OAuthHandler( 165 | config.get("twitter", "consumer_key"), config.get("twitter", "consumer_secret") 166 | ) 167 | auth.set_access_token( 168 | config.get("twitter", "access_token"), 169 | config.get("twitter", "access_token_secret"), 170 | ) 171 | 172 | # Create Twitter API object 173 | twitter = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True) 174 | 175 | print(f"🦄 Connected as {twitter.me().screen_name}") 176 | 177 | return twitter 178 | 179 | 180 | class customListener(tweepy.StreamListener): 181 | """We need to create an instance of the Stream Listener 182 | http://docs.tweepy.org/en/v3.4.0/streaming_how_to.html 183 | """ 184 | 185 | def on_error(self, status_code): 186 | if status_code == 420: 187 | # returning False in on_data disconnects the stream 188 | return False 189 | 190 | def on_status(self, status): 191 | print(status.text) 192 | return True 193 | 194 | def on_data(self, data): 195 | """ 196 | Automatic detection of the kind of data collected from Twitter 197 | This method reads in 
tweet data as Json and extracts the data we want. 198 | """ 199 | try: 200 | # parse as json 201 | json_data = json.loads(data) 202 | 203 | # extract the relevant data 204 | if "text" in json_data: 205 | user = json_data["user"]["screen_name"] 206 | created_at = parser.parse(json_data["created_at"]) 207 | tweet = json_data["text"] 208 | retweet_count = json_data["retweet_count"] 209 | id_str = json_data["id_str"] 210 | followers = json_data["user"]["followers_count"] 211 | language = json_data["user"]["lang"] 212 | if json_data["place"] is not None: 213 | country = json_data["place"]["country"] 214 | else: 215 | country = None 216 | 217 | # insert data just collected into MySQL my_database 218 | populate_table( 219 | user, 220 | created_at, 221 | tweet, 222 | retweet_count, 223 | id_str, 224 | country, 225 | followers, 226 | language, 227 | "tweets_long", 228 | ) 229 | print(f"Tweet colleted at: {created_at}") 230 | 231 | except Error as e: 232 | print(e) 233 | 234 | 235 | def start_stream(stream, **kwargs): 236 | """Start the stream, prints the disconnection error 237 | 238 | Args: 239 | stream (obj): stream object to start 240 | """ 241 | try: 242 | stream.filter(**kwargs) 243 | except Exception: 244 | stream.disconnect() 245 | print("Fatal exception") 246 | 247 | 248 | if __name__ == "__main__": 249 | 250 | create_table(DATABASE, "tweets_long") 251 | # first we need to authenticate 252 | twitter = connectTwitter() 253 | 254 | # next: create stream listener 255 | myStreamListener = customListener() 256 | myStream = tweepy.Stream(auth=twitter.auth, listener=myStreamListener, timeout=30) 257 | 258 | # stream tweets using the filter method 259 | version = float(f"{sys.version_info[0]}.{sys.version_info[1]}") 260 | if version >= 3.7: 261 | kwargs = { 262 | 'track': ["python", "pycon", "jupyter", "#pycon2019"], 263 | 'is_async': True 264 | } 265 | else: 266 | kwargs = { 267 | 'track': ["python", "pycon", "jupyter", "#pycon2019"], 268 | 'async': True 269 | } 270 | pass 271 | start_stream(myStream, **kwargs) 272 | pass 273 | -------------------------------------------------------------------------------- /solutions/etl-basic/stream_twitter_timed.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------ 2 | # This script is used to stream Twitter data into a MySQL my_database 3 | # note that for this you need an approved Twitter develope account 4 | # an app and the keys for said app 5 | # ------------------------------------------------------------------ 6 | 7 | # Import libraries needed 8 | import json 9 | import time 10 | from configparser import ConfigParser 11 | from pathlib import Path 12 | 13 | import tweepy 14 | from dateutil import parser 15 | from mysql import connector as mysql 16 | 17 | # Path to the config file with the keys make sure not to commit this file 18 | CONFIG_FILE = Path.cwd() / "config.cfg" 19 | 20 | # Details for our MySql connection 21 | DATABASE = { 22 | "host": "localhost", 23 | "user": "airflow", 24 | "password": "python2019", 25 | "db": "airflowdb", 26 | } 27 | 28 | MAX_TWEEPY_PAGE = 300 29 | 30 | 31 | # ---------------------------------------------- 32 | # Database related functions 33 | # ---------------------------------------------- 34 | 35 | 36 | def connect_db(my_database): 37 | """Connect to a given my_database 38 | 39 | Args: 40 | my_database(dict): dictionary with the my_database details 41 | 42 | Returns: 43 | dbconnect: MySql my_database connection object 
44 | """ 45 | try: 46 | dbconnect = mysql.connect( 47 | host=my_database.get("host"), 48 | user=my_database.get("user"), 49 | password=my_database.get("password"), 50 | db=my_database.get("db"), 51 | ) 52 | print("connected") 53 | return dbconnect 54 | except mysql.Error as e: 55 | print(e) 56 | 57 | 58 | def create_table(my_database, new_table): 59 | """Create new table in a my_database 60 | 61 | Args: 62 | my_database (dict): details for the db 63 | new_table (str): name of the table to create 64 | """ 65 | 66 | dbconnect = connect_db(my_database) 67 | 68 | # create a cursor for the queries 69 | cursor = dbconnect.cursor() 70 | cursor.execute("USE airflowdb") 71 | 72 | # here we delete the table, it can be kept or else 73 | cursor.execute(f"DROP TABLE IF EXISTS {new_table}") 74 | 75 | # these matches the Twitter data 76 | query = ( 77 | f"CREATE TABLE `{new_table}` (" 78 | " `id` INT(11) NOT NULL AUTO_INCREMENT," 79 | " `user` varchar(100) NOT NULL ," 80 | " `created_at` timestamp," 81 | " `tweet` varchar(255) NOT NULL," 82 | " `retweet_count` int(11) ," 83 | " `id_str` varchar(100)," 84 | " PRIMARY KEY (`id`))" 85 | ) 86 | 87 | cursor.execute(query) 88 | dbconnect.close() 89 | cursor.close() 90 | 91 | return print(f"Created {new_table} table") 92 | 93 | 94 | def populate_table( 95 | user, created_at, tweet, retweet_count, id_str, my_database=DATABASE 96 | ): 97 | """Populate a given table witht he Twitter collected data 98 | 99 | Args: 100 | user (str): username from the status 101 | created_at (datetime): when the tweet was created 102 | tweet (str): text 103 | retweet_count (int): number of retweets 104 | id_str (int): unique id for the tweet 105 | """ 106 | 107 | dbconnect = connect_db(DATABASE) 108 | 109 | cursor = dbconnect.cursor() 110 | cursor.execute("USE airflowdb") 111 | 112 | query = "INSERT INTO tweets (user, created_at, tweet, retweet_count, id_str) VALUES (%s, %s, %s, %s, %s)" 113 | 114 | try: 115 | cursor.execute(query, (user, created_at, tweet, retweet_count, id_str)) 116 | dbconnect.commit() 117 | print("commited") 118 | 119 | except mysql.Error as e: 120 | print(e) 121 | dbconnect.rollback() 122 | 123 | cursor.close() 124 | dbconnect.close() 125 | 126 | return 127 | 128 | 129 | # ---------------------------------------------- 130 | # Access the Twitter API 131 | # ---------------------------------------------- 132 | 133 | 134 | def connectTwitter(): 135 | config = ConfigParser() 136 | config.read(CONFIG_FILE) 137 | 138 | # Authenticate to Twitter 139 | auth = tweepy.OAuthHandler( 140 | config.get("twitter", "consumer_key"), config.get("twitter", "consumer_secret") 141 | ) 142 | auth.set_access_token( 143 | config.get("twitter", "access_token"), 144 | config.get("twitter", "access_token_secret"), 145 | ) 146 | 147 | # Create Twitter API object 148 | twitter = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True) 149 | 150 | print(f"🦄 Connected as {twitter.me().screen_name}") 151 | 152 | return twitter 153 | 154 | 155 | class customListener(tweepy.StreamListener): 156 | """We need to create an instance of the Stream Listener 157 | http://docs.tweepy.org/en/v3.4.0/streaming_how_to.html 158 | """ 159 | 160 | def __init__(self, time_limit=60): 161 | self.start_time = time.time() 162 | self.limit = time_limit 163 | super(customListener, self).__init__() 164 | 165 | def on_error(self, status_code): 166 | if status_code == 420: 167 | # returning False in on_data disconnects the stream 168 | return False 169 | 170 | def on_status(self, status): 171 | 
print(status.text) 172 | return True 173 | 174 | def on_data(self, data): 175 | """ 176 | Automatic detection of the kind of data collected from Twitter 177 | This method reads in tweet data as Json and extracts the data we want. 178 | """ 179 | if (time.time() - self.start_time) < self.limit: 180 | try: 181 | # parse as json 182 | raw_data = json.loads(data) 183 | 184 | # extract the relevant data 185 | if "text" in raw_data: 186 | user = raw_data["user"]["screen_name"] 187 | created_at = parser.parse(raw_data["created_at"]) 188 | tweet = raw_data["text"] 189 | retweet_count = raw_data["retweet_count"] 190 | id_str = raw_data["id_str"] 191 | 192 | # insert data just collected into MySQL my_database 193 | populate_table(user, created_at, tweet, retweet_count, id_str) 194 | print(f"Tweet colleted at: {created_at}") 195 | 196 | except Error as e: 197 | print(e) 198 | else: 199 | self.saveFile.close() 200 | return False 201 | 202 | 203 | def start_stream(stream, **kwargs): 204 | """Start the stream, prints the disconnection error 205 | 206 | Args: 207 | stream (obj): stream object to start 208 | """ 209 | 210 | try: 211 | stream.filter(**kwargs) 212 | except Exception: 213 | stream.disconnect() 214 | print("Fatal exception") 215 | 216 | 217 | if __name__ == "__main__": 218 | 219 | create_table(DATABASE, "tweets") 220 | # first we need to authenticate 221 | twitter = connectTwitter() 222 | 223 | # next: create stream listener 224 | myStreamListener = customListener() 225 | myStream = tweepy.Stream(auth=twitter.auth, listener=myStreamListener, timeout=30) 226 | 227 | # stream tweets using the filter method 228 | start_stream(myStream, track=["python", "pycon", "jupyter", "#pycon2019"]) 229 | 230 | -------------------------------------------------------------------------------- /source/_static/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/12.png -------------------------------------------------------------------------------- /source/_static/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/4.jpg -------------------------------------------------------------------------------- /source/_static/DAG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/DAG.png -------------------------------------------------------------------------------- /source/_static/GUI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/GUI.png -------------------------------------------------------------------------------- /source/_static/airflow-logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/airflow-logo.jpeg -------------------------------------------------------------------------------- /source/_static/airflow.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/airflow.png -------------------------------------------------------------------------------- /source/_static/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/architecture.png -------------------------------------------------------------------------------- /source/_static/automation1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/automation1.jpg -------------------------------------------------------------------------------- /source/_static/azure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/azure.png -------------------------------------------------------------------------------- /source/_static/connection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/connection.png -------------------------------------------------------------------------------- /source/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* */ 2 | @import url('https://fonts.googleapis.com/css?family=Itim|Nunito|Source+Code+Pro'); 3 | 4 | a { 5 | color: rgb(96, 138, 197); 6 | } 7 | 8 | a:hover { 9 | color: rgb(65, 129, 218); 10 | } 11 | 12 | div.body h1 { 13 | color: #5F6366; 14 | font-family: 'Itim', cursive; 15 | font-weight: bold; 16 | font-size: 300%; 17 | } 18 | 19 | div.body h2 { 20 | color: #5F6366; 21 | font-family: 'Itim', cursive; 22 | font-weight: bold; 23 | } 24 | div.body h3 { 25 | color: #5F6366; 26 | font-family: 'Itim', cursive; 27 | font-weight: bold; 28 | } 29 | 30 | div.sphinxsidebarwrapper h1.logo { 31 | text-align: center; 32 | margin: 0 0 -20px 0; 33 | } 34 | 35 | div.sphinxsidebar p.blurb { 36 | font-size: 130%; 37 | text-align: center; 38 | font-family: 'Itim', cursive; 39 | color: rgb(151, 139, 196); 40 | } 41 | 42 | div.sphinxsidebar h1{ 43 | font-size: 160%; 44 | color: #5F6366; 45 | font-family: 'Itim', cursive; 46 | } 47 | 48 | div.sphinxsidebar h1 a { 49 | font-size: 160%; 50 | color: #5F6366; 51 | text-decoration: none; 52 | border: none; 53 | font-family: 'Itim', cursive; 54 | } 55 | 56 | div.sphinxsidebar h1 a:hover { 57 | border: none; 58 | } 59 | 60 | div.sphinxsidebar h3 { 61 | display: none; 62 | } 63 | 64 | div.sphinxsidebar a { 65 | color: #5F6366; 66 | } 67 | 68 | code.descname { 69 | color: rgb(151, 139, 196); 70 | } 71 | 72 | th.field-name { 73 | min-width: 100px; 74 | color: rgb(151, 139, 196); 75 | } 76 | 77 | tt, code { 78 | color: #F8F8F2; 79 | background: #1d1941; 80 | border-radius: 0.3em; 81 | padding: 0.0em 0.3em; 82 | } 83 | 84 | a.reference.internal code.xref span.pre { 85 | color: #F8F8F2; 86 | background: #1d1941; 87 | border-bottom: none; 88 | border-radius: 0; 89 | padding: 0; 90 | } 91 | 92 | a.reference.internal, a.reference.internal:hover { 93 | border-bottom: none; 94 | } 95 | 96 | a.reference.internal:hover code { 97 | background: #027bab 98 | } 99 | 100 | 
a.reference.internal:hover code.xref span.pre { 101 | color: #F8F8F2; 102 | background: #027bab; 103 | border-bottom: none; 104 | } 105 | 106 | tt.xref, code.xref, a tt { 107 | background: none; 108 | border-bottom: none; 109 | } 110 | 111 | code.literal { 112 | color: #F8F8F2; 113 | background:#1d1941; 114 | } 115 | 116 | pre { 117 | padding: 20px 30px; 118 | background: #1d1941; 119 | } 120 | 121 | div > dl { 122 | border-left: 2px solid #00384021; 123 | padding-left: 5px; 124 | } 125 | 126 | dt { 127 | color: rgb(96, 138, 197); 128 | } 129 | 130 | 131 | div.footer::before { 132 | display: block; 133 | content: ''; 134 | border-top: 2px solid #EDB5BF; 135 | width: 50%; 136 | margin: 2em auto 2em auto; 137 | } 138 | 139 | div.footer { 140 | text-align: center; 141 | /* color: #029be2; */ 142 | } 143 | 144 | div.footer a { 145 | color: #027bab; 146 | text-decoration: none; 147 | } 148 | 149 | p.caption { 150 | font-family: 'Itim', cursive; 151 | font-size: inherit; 152 | font-size: 150%; 153 | } 154 | 155 | @media screen and (max-width: 875px) { 156 | div.sphinxsidebar { 157 | background: #4D6D9A; 158 | } 159 | div.sphinxsidebar h1.logo, div.sphinxsidebar p.blurb{ 160 | text-align: left; 161 | } 162 | div.sphinxsidebar h1 a { 163 | color: #1bc5e0; 164 | } 165 | div.sphinxsidebar a { 166 | /* color: rgb(151, 139, 196); */ 167 | color: white; 168 | } 169 | div.sphinxsidebar ul { 170 | /* color: rgb(151, 139, 196); */ 171 | color: white; 172 | } 173 | } 174 | 175 | 176 | /* other */ 177 | 178 | .alert { 179 | position: relative; 180 | padding: 10px; 181 | margin-bottom: 5px; 182 | border: 2px solid transparent; 183 | border-radius: 2px; 184 | } 185 | 186 | .alert-primary { 187 | color: #004085; 188 | background-color: #cce5ff; 189 | border-color: #b8daff; 190 | } 191 | .alert-custom { 192 | background-color: rgb(229, 224, 247); 193 | border-color:rgb(229, 224, 247); 194 | color: rgb(128, 117, 165); 195 | } -------------------------------------------------------------------------------- /source/_static/datapyramid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/datapyramid.png -------------------------------------------------------------------------------- /source/_static/gooddata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/gooddata.png -------------------------------------------------------------------------------- /source/_static/gooddata1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/gooddata1.png -------------------------------------------------------------------------------- /source/_static/luigi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/luigi.png -------------------------------------------------------------------------------- /source/_static/mssignin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/mssignin.png 
-------------------------------------------------------------------------------- /source/_static/pipeline1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/pipeline1.png -------------------------------------------------------------------------------- /source/_static/python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/python.png -------------------------------------------------------------------------------- /source/_static/twitter1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/twitter1.png -------------------------------------------------------------------------------- /source/_static/twitter2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/twitter2.png -------------------------------------------------------------------------------- /source/_static/twitter3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/twitter3.png -------------------------------------------------------------------------------- /source/_static/uses.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/uses.png -------------------------------------------------------------------------------- /source/_templates/sidebarlogo.html: -------------------------------------------------------------------------------- 1 |

7 | -------------------------------------------------------------------------------- /source/about.md: -------------------------------------------------------------------------------- 1 | # About the workshop 2 | 3 | We will be taking a look at the basic concepts of data pipelines as well as practical use cases using Python. 4 | 5 | ## About you: 6 | - Some experience using the command line 7 | - Intermediate Python knowledge / use 8 | - Be able to apply what we learn and adopt to your use cases 9 | - Interested in data and systems 10 | - Aspring or current data engineering 11 | - Some knowledge about systems and databases (enough to be dangerous) 12 | 13 | ## Our focus for the day 14 | - Greater understanding on how to apply data pipelines using the Python toolset 15 | - Focus on concepts 16 | - Apply knowledge with each library 17 | - Will give you the building blocks 18 | 19 | ## Keeping on track 20 | 21 | You will find 🚦 across the tutorial examples. We will use this to identify how folks are doing over the workshop (if following along in person). 22 | Place the post it as follows: 23 | 24 | 🚦 Purple postit: all good, task has been completed 25 | 26 | 🚦 Orange postit: I need extra time or need help with the task in hand -------------------------------------------------------------------------------- /source/airflow-intro.md: -------------------------------------------------------------------------------- 1 | # Airflow basics 2 | 3 | ## What is Airflow? 4 | 5 | ![airflow logo](_static/airflow-logo.jpeg) 6 | 7 | Airflow is a Workflow engine which means: 8 | 9 | - Manage scheduling and running jobs and data pipelines 10 | - Ensures jobs are ordered correctly based on dependencies 11 | - Manage the allocation of scarce resources 12 | - Provides mechanisms for tracking the state of jobs and recovering from failure 13 | 14 | It is highly versatile and can be used across many many domains: 15 | ![](_static/uses.png) 16 | 17 | ## Basic Airflow concepts 18 | 19 | - **Task**: a defined unit of work (these are called operators in Airflow) 20 | - **Task instance**: an individual run of a single task. Task instances also have an indicative state, which could be “running”, “success”, “failed”, “skipped”, “up for retry”, etc. 21 | - **DAG**: Directed acyclic graph, 22 | a set of tasks with explicit execution order, beginning, and end 23 | - **DAG run**: individual execution/run of a DAG 24 | 25 | **Debunking the DAG** 26 | 27 | The vertices and edges (the arrows linking the nodes) have an order and direction associated to them 28 | 29 | ![](_static/DAG.png) 30 | 31 | each node in a DAG corresponds to a task, which in turn represents some sort of data processing. For example: 32 | 33 | Node A could be the code for pulling data from an API, node B could be the code for anonymizing the data. Node B could be the code for checking that there are no duplicate records, and so on. 34 | 35 | These 'pipelines' are acyclic since they need a point of completion. 36 | 37 | **Dependencies** 38 | 39 | Each of the vertices has a particular direction that shows the relationship between certain nodes. For example, we can only anonymize data once this has been pulled out from the API. 40 | 41 | ## Idempotency 42 | 43 | This is one of the most important characteristics of good ETL architectures. 44 | 45 | When we say that something is idempotent it means it will produce the same result regardless of how many times this is run (i.e. the results are reproducible). 
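For example, here is a minimal sketch of an idempotent daily load (illustrative only, not part of the workshop code; the `daily_counts` table and the values are made up). Re-running the load for the same day leaves the table in exactly the same state, because the day's rows are replaced rather than blindly appended:

```python
# Minimal, self-contained sketch of an idempotent load (illustrative only).
import sqlite3


def load_day(conn, day, rows):
    """Replace the rows for `day` with `rows` -- safe to re-run."""
    with conn:  # run the delete + insert as a single transaction
        conn.execute("DELETE FROM daily_counts WHERE day = ?", (day,))
        conn.executemany(
            "INSERT INTO daily_counts (day, url, count) VALUES (?, ?, ?)",
            [(day, url, count) for url, count in rows],
        )


conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE daily_counts (day TEXT, url TEXT, count INTEGER)")

load_day(conn, "2019-05-01", [("https://pycon.org", 3)])
load_day(conn, "2019-05-01", [("https://pycon.org", 3)])  # re-run: same outcome

print(conn.execute("SELECT COUNT(*) FROM daily_counts").fetchone())  # -> (1,)
```

A plain append (the `INSERT` without the preceding `DELETE`) would not be idempotent: every re-run of the task would add duplicate rows.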
46 | 47 | Reproducibility is particularly important in data-intensive environments as this ensures that the same inputs will always return the same outputs. 48 | 49 | ## Airflow components 50 | 51 | ![](_static/architecture.png) 52 | 53 | There are 4 main components to Apache Airflow: 54 | 55 | ### Web server 56 | 57 | The GUI. This is under the hood a Flask app where you can track the status of your jobs and read logs from a remote file store (e.g. [Azure Blobstorage](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-blobs-overview/?wt.mc_id=PyCon-github-taallard)). 58 | 59 | ### Scheduler 60 | 61 | This component is responsible for scheduling jobs. This is a multithreaded Python process that uses the DAGb object to decide what tasks need to be run, when and where. 62 | 63 | The task state is retrieved and updated from the database accordingly. The web server then uses these saved states to display job information. 64 | 65 | ### Executor 66 | 67 | The mechanism that gets the tasks done. 68 | 69 | ### Metadata database 70 | 71 | - Powers how the other components interact 72 | - Stores the Airflow states 73 | - All processes read and write from here 74 | 75 | ## Workflow as a code 76 | One of the main advantages of using a workflow system like Airflow is that all is code, which makes your workflows maintainable, versionable, testable, and collaborative. 77 | 78 | Thus your workflows become more explicit and maintainable (atomic tasks). 79 | 80 | Not only your code is dynamic but also is your infrastructure. 81 | 82 | ### Defining tasks 83 | 84 | Tasks are defined based on the abstraction of `Operators` (see Airflow docs [here](https://airflow.apache.org/concepts.html#operators)) which represent a single **idempotent task**. 85 | 86 | The best practice is to have atomic operators (i.e. can stand on their own and do not need to share resources among them). 
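As a rough illustration of atomicity (a sketch only; the DAG, task ids and callables below are made up rather than taken from this tutorial), two small self-contained tasks chained together look like this, and each one can be retried or re-run on its own:

```python
# Sketch: two atomic tasks, each doing exactly one thing, with an explicit dependency.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator


def extract():
    # pull raw data and persist it somewhere durable (file, table, ...)
    return "raw data"


def transform():
    # read what the extract step produced and clean it
    return "clean data"


dag = DAG(
    "atomic_example",
    start_date=datetime(2019, 4, 30),
    schedule_interval="@daily",
)

extract_task = PythonOperator(task_id="extract", python_callable=extract, dag=dag)
transform_task = PythonOperator(task_id="transform", python_callable=transform, dag=dag)

extract_task >> transform_task  # transform only runs once extract has succeeded
```

This mirrors the dependency pattern used in the `simple_dag.py` example later in the tutorial.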
87 | 88 | You can choose among; 89 | - `BashOperator` 90 | - `PythonOperator` 91 | - `EmailOperator` 92 | - `SimpleHttpOperator` 93 | - `MySqlOperator` (and other DB) 94 | 95 | Examples: 96 | 97 | ```python 98 | t1 = BashOperator(task_id='print_date', 99 | bash_command='date, 100 | dag=dag) 101 | ``` 102 | 103 | ```python 104 | def print_context(ds, **kwargs): 105 | pprint(kwargs) 106 | print(ds) 107 | return 'Whatever you return gets printed in the logs' 108 | 109 | 110 | run_this = PythonOperator( 111 | task_id='print_the_context', 112 | provide_context=True, 113 | python_callable=print_context, 114 | dag=dag, 115 | ) 116 | ``` 117 | 118 | ## Comparing Luigi and Airflow 119 | 120 | ### Luigi 121 | 122 | - Created at Spotify (named after the plumber) 123 | - Open sourced in late 2012 124 | - GNU make for data 125 | 126 | ### Airflow 127 | - Airbnb data team 128 | - Open-sourced mud 2015 129 | - Apache incubator mid-2016 130 | - ETL pipelines 131 | 132 | ### Similarities 133 | - Python open source projects for data pipelines 134 | - Integrate with a number of sources (databases, filesystems) 135 | - Tracking failure, retries, success 136 | - Ability to identify the dependencies and execution 137 | 138 | ### Differences 139 | - Scheduler support: Airflow has built-in support using schedulers 140 | - Scalability: Airflow has had stability issues in the past 141 | - Web interfaces 142 | 143 | ![](_static/luigi.png) 144 | 145 | 146 | ![](_static/airflow.png) 147 | 148 | 149 | | Airflow | Luigi | 150 | | ------------------------------------------------ | ------------------------------------------------------------------------------ | 151 | | Task are defined by`dag_id` defined by user name | Task are defined by task name and parameters | 152 | | Task retries based on definitions | Decide if a task is done via input/output | 153 | | Task code to the worker | Workers started by Python file where the tasks are defined | 154 | | Centralized scheduler (Celery spins up workers) | Centralized scheduler in charge of deduplication sending tasks (Tornado based) | -------------------------------------------------------------------------------- /source/azure.md: -------------------------------------------------------------------------------- 1 | ### Deploying to the cloud 2 | 3 | 4 | ![](_static/azure.png) 5 | 6 | [This Docker image](https://hub.docker.com/r/puckel/docker-airflow/) has been used as the base for many deployments. 7 | 8 | 9 | Let's try and get Airflow running on Docker: 10 | 11 | ``` 12 | docker pull puckel/docker-airflow 13 | ``` 14 | 15 | Once you have the container you can run as 16 | 17 | ``` 18 | docker run -d --rm -p 8080:8080 puckel/docker-airflow webserver 19 | ``` 20 | 21 | To load the examples you can do: 22 | ``` 23 | docker run -d -p 8080:8080 -e LOAD_EX=y puckel/docker-airflow 24 | ``` 25 | 26 | Based on this container we can deploy to [Azure](https://azure.microsoft.com/en-us/blog/deploying-apache-airflow-in-azure-to-build-and-run-data-pipelines//?wt.mc_id=PyCon-github-taallard) 27 | 28 | 29 | [![Deploy to Azure](https://azuredeploy.net/deploybutton.svg)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fsavjani%2Fazure-quickstart-templates%2Fmaster%2F101-webapp-linux-airflow-postgresql%2Fazuredeploy.json/?wt.mc_id=PyCon-github-taallard) 30 | 31 | 32 | Note that this is a very basic deployment on Azure. 
-------------------------------------------------------------------------------- /source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "Airflow tutorial" 23 | copyright = "2019, Tania Allard" 24 | author = "Tania Allard" 25 | 26 | # The short X.Y version 27 | version = "" 28 | # The full version, including alpha/beta/rc tags 29 | release = "" 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | "sphinx.ext.doctest", 43 | "sphinx.ext.intersphinx", 44 | "sphinx.ext.mathjax", 45 | "sphinx.ext.githubpages", 46 | "recommonmark", 47 | ] 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ["_templates"] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | # 55 | source_suffix = [".rst", ".md"] 56 | 57 | # The master toctree document. 58 | master_doc = "index" 59 | 60 | # The language for content autogenerated by Sphinx. Refer to documentation 61 | # for a list of supported languages. 62 | # 63 | # This is also used if you do content translation via gettext catalogs. 64 | # Usually you set "language" from the command line for these cases. 65 | language = None 66 | 67 | # List of patterns, relative to source directory, that match files and 68 | # directories to ignore when looking for source files. 69 | # This pattern also affects html_static_path and html_extra_path. 70 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 71 | 72 | # The name of the Pygments (syntax highlighting) style to use. 73 | pygments_style = "monokai" 74 | 75 | 76 | # -- Options for HTML output ------------------------------------------------- 77 | 78 | # The theme to use for HTML and HTML Help pages. See the documentation for 79 | # a list of builtin themes. 80 | # 81 | html_theme = "alabaster" 82 | 83 | # Theme options are theme-specific and customize the look and feel of a theme 84 | # further. For a list of options available for each theme, see the 85 | # documentation. 
86 | # 87 | html_theme_options = { 88 | "github_banner": False, 89 | "github_button": True, 90 | "github_user": "trallard", 91 | "github_repo": "airflow-tutorial", 92 | "github_type": "star", 93 | "font_family": "Nunito, Georgia, sans", 94 | "head_font_family": "Nunito, Georgia, serif", 95 | "code_font_family": "'Source Code Pro', 'Consolas', monospace", 96 | "description": "a.k.a an introduction to all things DAGS and pipelines joy", 97 | "show_relbars": True, 98 | "logo": "python.png", 99 | } 100 | 101 | # Add any paths that contain custom static files (such as style sheets) here, 102 | # relative to this directory. They are copied after the builtin static files, 103 | # so a file named "default.css" will overwrite the builtin "default.css". 104 | html_static_path = ["_static"] 105 | 106 | # Custom sidebar templates, must be a dictionary that maps document names 107 | # to template names. 108 | # 109 | # The default sidebars (for documents that don't match any pattern) are 110 | # defined by theme itself. Builtin themes are using these templates by 111 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 112 | # 'searchbox.html']``. 113 | # 114 | # Custom sidebar templates, maps document names to template names. 115 | html_sidebars = { 116 | "**": [ 117 | "about.html", 118 | "localtoc.html", 119 | "searchbox.html", 120 | "navigation.html", 121 | "relations.html", 122 | "sidebarlogo.html", 123 | ] 124 | } 125 | 126 | # -- Options for HTMLHelp output --------------------------------------------- 127 | 128 | # Output file base name for HTML help builder. 129 | htmlhelp_basename = "Airflowtutorialdoc" 130 | 131 | 132 | # -- Options for LaTeX output ------------------------------------------------ 133 | 134 | latex_elements = { 135 | # The paper size ('letterpaper' or 'a4paper'). 136 | # 137 | # 'papersize': 'letterpaper', 138 | # The font size ('10pt', '11pt' or '12pt'). 139 | # 140 | # 'pointsize': '10pt', 141 | # Additional stuff for the LaTeX preamble. 142 | # 143 | # 'preamble': '', 144 | # Latex figure (float) alignment 145 | # 146 | # 'figure_align': 'htbp', 147 | } 148 | 149 | # Grouping the document tree into LaTeX files. List of tuples 150 | # (source start file, target name, title, 151 | # author, documentclass [howto, manual, or own class]). 152 | latex_documents = [ 153 | ( 154 | master_doc, 155 | "Airflowtutorial.tex", 156 | "Airflow tutorial Documentation", 157 | "Tania Allard", 158 | "manual", 159 | ) 160 | ] 161 | 162 | 163 | # -- Options for manual page output ------------------------------------------ 164 | 165 | # One entry per manual page. List of tuples 166 | # (source start file, name, description, authors, manual section). 167 | man_pages = [ 168 | (master_doc, "airflowtutorial", "Airflow tutorial Documentation", [author], 1) 169 | ] 170 | 171 | 172 | # -- Options for Texinfo output ---------------------------------------------- 173 | 174 | # Grouping the document tree into Texinfo files. List of tuples 175 | # (source start file, target name, title, author, 176 | # dir menu entry, description, category) 177 | texinfo_documents = [ 178 | ( 179 | master_doc, 180 | "Airflowtutorial", 181 | "Airflow tutorial Documentation", 182 | author, 183 | "Airflowtutorial", 184 | "One line description of project.", 185 | "Miscellaneous", 186 | ) 187 | ] 188 | 189 | 190 | # -- Options for Epub output ------------------------------------------------- 191 | 192 | # Bibliographic Dublin Core info. 
193 | epub_title = project 194 | 195 | # The unique identifier of the text. This can be a ISBN number 196 | # or the project homepage. 197 | # 198 | # epub_identifier = '' 199 | 200 | # A unique identification for the text. 201 | # 202 | # epub_uid = '' 203 | 204 | # A list of files that should not be packed into the epub file. 205 | epub_exclude_files = ["search.html"] 206 | 207 | 208 | # -- Extension configuration ------------------------------------------------- 209 | 210 | # -- Options for intersphinx extension --------------------------------------- 211 | 212 | # Example configuration for intersphinx: refer to the Python standard library. 213 | intersphinx_mapping = {"https://docs.python.org/": None} 214 | 215 | -------------------------------------------------------------------------------- /source/first-airflow.md: -------------------------------------------------------------------------------- 1 | # Airflow 101: working locally and familiarise with the tool 2 | 3 | ### Pre-requisites 4 | 5 | The following prerequisites are needed: 6 | 7 | - Libraries detailed in the Setting up section (either via conda or pipenv) 8 | - MySQL installed 9 | - text editor 10 | - command line 11 | 12 | ## Getting your environment up and running 13 | 14 | If you followed the instructions you should have Airflow installed as well as the rest of the packages we will be using. 15 | 16 | So let's get our environment up and running: 17 | 18 | If you are using conda start your environment via: 19 | ``` 20 | $ source activate airflow-env 21 | ``` 22 | If using pipenv then: 23 | ``` 24 | $ pipenv shell 25 | ```` 26 | 27 | this will start a shell within a virtual environment, to exit the shell you need to type `exit` and this will exit the virtual environment. 28 | 29 | ## Starting Airflow locally 30 | 31 | Airflow home lives in `~/airflow` by default, but you can change the location before installing airflow. You first need to set the `AIRFLOW_HOME` environment variable and then install airflow. For example, using pip: 32 | 33 | ```sh 34 | export AIRFLOW_HOME=~/mydir/airflow 35 | 36 | # install from PyPI using pip 37 | pip install apache-airflow 38 | ``` 39 | 40 | once you have completed the installation you should see something like this in the `airflow` directory (wherever it lives for you) 41 | 42 | ``` 43 | drwxr-xr-x - myuser 18 Apr 14:02 . 44 | .rw-r--r-- 26k myuser 18 Apr 14:02 ├── airflow.cfg 45 | drwxr-xr-x - myuser 18 Apr 14:02 ├── logs 46 | drwxr-xr-x - myuser 18 Apr 14:02 │ └── scheduler 47 | drwxr-xr-x - myuser 18 Apr 14:02 │ ├── 2019-04-18 48 | lrwxr-xr-x 46 myuser 18 Apr 14:02 │ └── latest -> /Users/myuser/airflow/logs/scheduler/2019-04-18 49 | .rw-r--r-- 2.5k myuser 18 Apr 14:02 └── unittests.cfg 50 | ``` 51 | We need to create a local dag folder: 52 | 53 | ``` 54 | mkdir ~/airflow/dags 55 | ``` 56 | 57 | As your project evolves, your directory will look something like this: 58 | 59 | ``` 60 | airflow # the root directory. 61 | ├── dags # root folder for all dags. files inside folders are not searched for dags. 62 | │ ├── my_dag.py, # my dag (definitions of tasks/operators) including precedence. 63 | │ └── ... 64 | ├── logs # logs for the various tasks that are run 65 | │ └── my_dag # DAG specific logs 66 | │ │ ├── src1_s3 # folder for task-specific logs (log files are created by date of a run) 67 | │ │ ├── src2_hdfs 68 | │ │ ├── src3_s3 69 | │ │ └── spark_task_etl 70 | ├── airflow.db # SQLite database used by Airflow internally to track the status of each DAG. 
71 | ├── airflow.cfg # global configuration for Airflow (individual settings can also be overridden with environment variables) 72 | └── ... 73 | ``` 74 | 75 | ## Prepare your database 76 | 77 | As we mentioned before, Airflow uses a database to keep track of tasks and their statuses, so it is critical to have one set up. 78 | 79 | To start the default database we can run 80 | `airflow initdb`. This will initialize your database via Alembic so that it matches the latest Airflow release. 81 | 82 | The default database used is `sqlite`, which means you cannot parallelize tasks using it. Since we have MySQL and the MySQL client installed, we will set them up so that we can use them with Airflow. 83 | 84 | 🚦 Create an airflow database 85 | 86 | From the command line: 87 | 88 | ``` 89 | mysql -u root -p 90 | mysql> CREATE DATABASE airflow CHARACTER SET utf8 COLLATE utf8_unicode_ci; 91 | mysql> CREATE USER 'airflow'@'localhost' IDENTIFIED BY 'python2019'; GRANT ALL PRIVILEGES ON airflow.* TO 'airflow'@'localhost'; 92 | mysql> FLUSH PRIVILEGES; 93 | ``` 94 | and initialize the database: 95 | 96 | ``` 97 | airflow initdb 98 | ``` 99 | 100 | Notice that this will fail with the default `airflow.cfg`. 101 | 102 | 103 | ## Update your local configuration 104 | 105 | Open your Airflow configuration file `~/airflow/airflow.cfg` and make the following changes: 106 | 107 | 108 | ``` 109 | executor = CeleryExecutor 110 | ``` 111 | 112 | ``` 113 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#broker-settings 114 | # needs rabbitmq running 115 | broker_url = amqp://guest:guest@127.0.0.1/ 116 | 117 | 118 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings 119 | result_backend = db+mysql://airflow:python2019@localhost:3306/airflow 120 | 121 | sql_alchemy_conn = mysql://airflow:python2019@localhost:3306/airflow 122 | 123 | ``` 124 | 125 | Here we are replacing the default executor (`SequentialExecutor`) with the `CeleryExecutor` so that tasks can run in parallel. 126 | We also replace the default `sqlite` database with our newly created `airflow` database (make sure the user and password in these connection strings match the MySQL user you created above). 127 | 128 | Now we can initialize the database: 129 | ``` 130 | airflow initdb 131 | ``` 132 | 133 | Let's now start the web server locally: 134 | 135 | 136 | ``` 137 | airflow webserver -p 8080 138 | ``` 139 | 140 | We can now head over to [http://localhost:8080](http://localhost:8080), where you will see that a number of example DAGs are already there. 141 | 142 | 🚦 Take some time to familiarise yourself with the UI and get your local instance set up. 143 | 144 | Now let's have a look at the connections ([http://localhost:8080/admin/connection/](http://localhost:8080/admin/connection/)): go to `Admin > Connections`. You should be able to see a number of connections available. For this tutorial, we will use some of these connections, including `mysql`. 145 | 146 | 152 | 153 | ### Commands 154 | Let us go over some of the commands. Back on your command line, we can list the existing DAGs: 155 | 156 | ``` 157 | airflow list_dags 158 | ``` 159 | We can list the tasks of a DAG in a tree view: 160 | 161 | ``` 162 | airflow list_tasks tutorial --tree 163 | ``` 164 | 165 | We can test the DAGs too, but we will need to set a date parameter so that this executes: 166 | 167 | ``` 168 | airflow test tutorial print_date 2019-05-01 169 | ``` 170 | (note that you cannot use a future date or you will get an error) 171 | ``` 172 | airflow test tutorial templated 2019-05-01 173 | ``` 174 | Note that when using the `test` command, these runs are not saved in the database.
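The general pattern here is `airflow test <dag_id> <task_id> <execution_date>`. As a minimal sketch (assuming the bundled `tutorial` example DAG is still loaded and, as in stock Airflow 1.10, includes a `sleep` task), any other task can be exercised the same way:

```
# general form: airflow test <dag_id> <task_id> <execution_date>
# runs a single task instance locally; nothing is recorded in the metadata database
airflow test tutorial sleep 2019-05-01
```

This makes `test` handy for iterating on a task's logic before letting the scheduler pick it up.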
175 | 176 | Now let's start the scheduler: 177 | ``` 178 | airflow scheduler 179 | ``` 180 | 181 | Behind the scenes, the scheduler monitors the DAG folder and stays in sync with the DAG objects it contains. The Airflow scheduler is designed to run as a service in an Airflow production environment. 182 | 183 | Now with the scheduler up and running we can trigger a task instance: 184 | ``` 185 | $ airflow run example_bash_operator runme_0 2015-01-01 186 | ``` 187 | 188 | This will be stored in the database and you can see the status change straight away. 189 | 190 | What would happen, for example, if we wanted to run or trigger the `tutorial` DAG? 🤔 191 | 192 | Let's try from the CLI and see what happens. 193 | 194 | ``` 195 | airflow trigger_dag tutorial 196 | ``` 197 | 198 | 199 | ## Writing your first DAG 200 | 201 | Let's create our first simple DAG. 202 | Inside the DAG directory (`~/airflow/dags`) create a `simple_dag.py` file. 203 | 204 | 205 | ```python 206 | from datetime import datetime, timedelta 207 | from airflow import DAG 208 | from airflow.operators.dummy_operator import DummyOperator 209 | from airflow.operators.python_operator import PythonOperator 210 | 211 | 212 | def print_hello(): 213 | return "Hello world!" 214 | 215 | 216 | default_args = { 217 | "owner": "airflow", 218 | "depends_on_past": False, 219 | "start_date": datetime(2019, 4, 30), 220 | "email": ["airflow@example.com"], 221 | "email_on_failure": False, 222 | "email_on_retry": False, 223 | "retries": 1, 224 | "retry_delay": timedelta(minutes=2), 225 | } 226 | 227 | dag = DAG( 228 | "hello_world", 229 | description="Simple tutorial DAG", 230 | schedule_interval="0 12 * * *", 231 | default_args=default_args, 232 | catchup=False, 233 | ) 234 | 235 | t1 = DummyOperator(task_id="dummy_task", retries=3, dag=dag) 236 | 237 | t2 = PythonOperator(task_id="hello_task", python_callable=print_hello, dag=dag) 238 | 239 | # sets t2 downstream of t1 240 | t1 >> t2 241 | 242 | # equivalent to 243 | # t2.set_upstream(t1) 244 | 245 | ``` 246 | If it is properly set up, you should be able to see this DAG straight away in your instance. 247 | 248 | 249 | ### Now let's create a DAG from the previous ETL pipeline (kind of) 250 | 251 | All hands on - check the `solutions` directory -------------------------------------------------------------------------------- /source/index.rst: -------------------------------------------------------------------------------- 1 | .. Airflow tutorial documentation master file, created by 2 | sphinx-quickstart on Mon Apr 15 15:52:00 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Airflow tutorial 7 | ============================================ 8 | This tutorial was originally developed for PyCon US 2019. 9 | 10 | .. toctree:: 11 | :caption: Table of Contents 12 | :hidden: 13 | :maxdepth: 2 14 | 15 | setup 16 | about 17 | pipelines 18 | airflow-intro 19 | first-airflow 20 | 21 | .. toctree:: 22 | :maxdepth: 2 23 | :caption: Contents: 24 | 25 | About your facilitator 26 | ====================== 27 | 28 | My name is Tania. I live in Manchester, UK, where I work as a 29 | Cloud Advocate for Microsoft. 30 | 31 | Over the years, I have worked as a data engineer, machine learning engineer, 32 | and research software engineer. I love data-intensive 33 | environments and I am particularly interested in the tools and workflows to 34 | deliver robust, reproducible data insights.
35 | 36 | If you have any questions or feedback about this tutorial, please 37 | file an issue using the following link: `trallard/airflow-tutorial issues <https://github.com/trallard/airflow-tutorial/issues>`_. 38 | 39 | You can also contact me via the following channels: 40 | 41 | - E-mail: trallard@bitsandchips.me 42 | - Twitter: `@ixek <https://twitter.com/ixek>`_ 43 | - `Tania on GitHub <https://github.com/trallard>`_ 44 | 45 | Code of Conduct 46 | ================ 47 | All attendees of this workshop are expected to adhere to PyCon's Code of Conduct, 48 | in brief: 49 | **Be open, considerate, and respectful.** 50 | 51 | License 52 | ======= 53 | The content in this workshop is licensed under `CC-BY-SA 4.0 <https://creativecommons.org/licenses/by-sa/4.0/>`_. 54 | This means that you can use, remix and re-distribute the content so long as attribution to the original 55 | author (Tania Allard) is maintained. 56 | 57 | The logo used here was designed by Ashley McNamara for use by the Microsoft Developer Advocates team. 58 | 59 | 60 | 61 | 62 | --------------------------------------------------------------------------------