├── .gitignore ├── LICENSE ├── Makefile ├── Pipfile ├── Pipfile.lock ├── README.md ├── azure-pipelines.yml ├── dags ├── generate_twitter.py ├── parameters.py ├── simple_dag.py ├── subdags │ └── twitter_subdag.py └── twitter_airflow.py ├── docs ├── .buildinfo ├── .nojekyll ├── _sources │ ├── index.rst.txt │ └── setup.md.txt ├── _static │ ├── alabaster.css │ ├── basic.css │ ├── custom.css │ ├── doctools.js │ ├── documentation_options.js │ ├── file.png │ ├── jquery-3.2.1.js │ ├── jquery.js │ ├── language_data.js │ ├── minus.png │ ├── plus.png │ ├── pygments.css │ ├── python.png │ ├── searchtools.js │ ├── underscore-1.3.1.js │ └── underscore.js ├── genindex.html ├── html │ ├── .buildinfo │ ├── .nojekyll │ ├── _images │ │ ├── 12.png │ │ ├── 4.jpg │ │ ├── DAG.png │ │ ├── airflow-logo.jpeg │ │ ├── airflow.png │ │ ├── architecture.png │ │ ├── automation1.jpg │ │ ├── azure.png │ │ ├── gooddata.png │ │ ├── gooddata1.png │ │ ├── luigi.png │ │ ├── mssignin.png │ │ ├── twitter1.png │ │ ├── twitter2.png │ │ ├── twitter3.png │ │ └── uses.png │ ├── _sources │ │ ├── about.md.txt │ │ ├── airflow-intro.md.txt │ │ ├── azure.md.txt │ │ ├── first-airflow.md.txt │ │ ├── index.rst.txt │ │ ├── pipelines.md.txt │ │ └── setup.rst.txt │ ├── _static │ │ ├── 12.png │ │ ├── 4.jpg │ │ ├── DAG.png │ │ ├── GUI.png │ │ ├── airflow-logo.jpeg │ │ ├── airflow.png │ │ ├── alabaster.css │ │ ├── architecture.png │ │ ├── automation1.jpg │ │ ├── azure.png │ │ ├── basic.css │ │ ├── connection.png │ │ ├── custom.css │ │ ├── datapyramid.png │ │ ├── doctools.js │ │ ├── documentation_options.js │ │ ├── file.png │ │ ├── gooddata.png │ │ ├── gooddata1.png │ │ ├── jquery-3.2.1.js │ │ ├── jquery.js │ │ ├── language_data.js │ │ ├── luigi.png │ │ ├── minus.png │ │ ├── mssignin.png │ │ ├── pipeline1.png │ │ ├── plus.png │ │ ├── pygments.css │ │ ├── python.png │ │ ├── searchtools.js │ │ ├── twitter1.png │ │ ├── twitter2.png │ │ ├── twitter3.png │ │ ├── underscore-1.3.1.js │ │ ├── underscore.js │ │ └── uses.png │ ├── about.html │ ├── airflow-intro.html │ ├── azure.html │ ├── first-airflow.html │ ├── genindex.html │ ├── index.html │ ├── objects.inv │ ├── pipelines.html │ ├── search.html │ ├── searchindex.js │ └── setup.html ├── index.html ├── objects.inv ├── search.html ├── searchindex.js └── setup.html ├── environment.yaml ├── make.bat ├── requirements.txt ├── solutions ├── dags │ ├── dags │ │ ├── data │ │ │ └── tweets │ │ │ │ └── #pycon since:2019-04-30 until:2019-05-01_05012019070912.csv │ │ ├── generate_twitter.py │ │ ├── parameters.py │ │ ├── simple_dag.py │ │ ├── subdags │ │ │ └── twitter_subdag.py │ │ └── twitter_airflow.py │ └── twitter_airflow.py └── etl-basic │ ├── analyse_twitter.py │ ├── etl.sh │ ├── stream_twitter.py │ ├── stream_twitter_alt.py │ └── stream_twitter_timed.py └── source ├── _static ├── 12.png ├── 4.jpg ├── DAG.png ├── GUI.png ├── airflow-logo.jpeg ├── airflow.png ├── architecture.png ├── automation1.jpg ├── azure.png ├── connection.png ├── custom.css ├── datapyramid.png ├── gooddata.png ├── gooddata1.png ├── luigi.png ├── mssignin.png ├── pipeline1.png ├── python.png ├── twitter1.png ├── twitter2.png ├── twitter3.png └── uses.png ├── _templates └── sidebarlogo.html ├── about.md ├── airflow-intro.md ├── azure.md ├── conf.py ├── first-airflow.md ├── index.rst ├── pipelines.md └── setup.rst /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | \.vscode/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | 
*.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | source/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # celery beat schedule file 97 | celerybeat-schedule 98 | 99 | # SageMath parsed files 100 | *.sage.py 101 | 102 | # Environments 103 | .env 104 | .venv 105 | env/ 106 | venv/ 107 | ENV/ 108 | env.bak/ 109 | venv.bak/ 110 | 111 | # Spyder project settings 112 | .spyderproject 113 | .spyproject 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | # mkdocs documentation 119 | /site 120 | 121 | # mypy 122 | .mypy_cache/ 123 | .dmypy.json 124 | dmypy.json 125 | 126 | # Pyre type checker 127 | .pyre/ 128 | 129 | \.DS_Store 130 | 131 | docs/\.doctrees/ 132 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
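# For example, `make html` builds the HTML docs from ./source into ./build/html
# (assuming sphinx-build is installed and available on PATH).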
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | sphinx = "*" 8 | sphinxcontrib-inlinesyntaxhighlight = "*" 9 | pylint = "*" 10 | recommonmark = "*" 11 | 12 | [packages] 13 | jupyter = "*" 14 | jupyterlab = "*" 15 | papermill = "*" 16 | celery = "*" 17 | mysqlclient = "*" 18 | tweepy = "*" 19 | numpy = "*" 20 | pandas = "*" 21 | hypothesis = "*" 22 | matplotlib = "*" 23 | seaborn = "*" 24 | mysql-connector-python = "*" 25 | apache-airflow = {extras = ["celery", "mysql"],version = "*"} 26 | 27 | [requires] 28 | python_version = "3.7" 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Airflow Tutorial 2 | 3 | ![License: CC BY 4.0](https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg) 4 | 5 | This repo contains the materials for the pipelines tutorial on Pycon -> from scripts soups to Airflow. 6 | 7 | The tutorial covers: 8 | 9 | - Setting up local databases 10 | - Creating basic ETL pipelines in Python: query APIs, load data to databases, perform data cleaning and filtering and persist the consumption ready data 11 | - How to set a local instance of Airflow and get it running 12 | - Creating basic DAGS in Airflow 13 | - Transform script soups ETLS into Airflow dags 14 | - Set up an Airflow instance in Azure 15 | 16 | To add: 17 | - Setting a Kubernetes powered instance on Azure AKS 18 | - Adding CI/CD to using Azure pipelines 19 | 20 | If you are interested in following along visit: 21 | 22 | 23 | The setup instructions can be found at: [https://airflow-tutorial.readthedocs.io/en/latest/setup.html](https://airflow-tutorial.readthedocs.io/en/latest/setup.html) 24 | 25 | If you would like to experiment with Azure [follow this link](https://azure.microsoft.com/en-us/free//?wt.mc_id=PyCon-github-taallard) to get a free trial subscription with 150 dollars. 26 | 27 | 28 | 🚀 PRs and Issues are welcome 29 | 30 | ### License 31 | 32 | [![License: CC BY 4.0](https://licensebuttons.net/l/by/4.0/80x15.png)](https://creativecommons.org/licenses/by/4.0/) 33 | 34 | 35 | This repo is licensed using a CC-BY so you are free to use, remix, and share so long attribution is provided to the original author. 
36 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | # Publish the docs to GitHub pages 2 | 3 | # we only build and publish when changes ocurr to the master branch 4 | trigger: 5 | - master 6 | 7 | pool: 8 | vmImage: 'Ubuntu-16.04' 9 | 10 | steps: 11 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/yaml-schema?view=azdevops&tabs=schema#checkout 12 | - checkout: self 13 | persistCredentials: true # set to 'true' to leave the OAuth token in the Git config after the initial fetch 14 | 15 | - task: UsePythonVersion@0 16 | inputs: 17 | versionSpec: '3.7' 18 | addToPath: true 19 | displayName: 'Using defined Python version' 20 | 21 | - script: | 22 | python -m pip install --upgrade pip pipenv 23 | pipenv install --dev --system --deploy 24 | displayName: 'Install dependencies via Pipfile' 25 | 26 | - script: | 27 | sphinx-build -n -b html ./source $(Build.ArtifactStagingDirectory)/build/html 28 | displayName: 'Building Sphinx docs' 29 | 30 | - script: | 31 | git config --local user.name "Tania Allard" 32 | git config --local user.email "trallard@bitsandchips.me" 33 | cp -a $(Build.ArtifactStagingDirectory)/build/html/ $(Build.Repository.LocalPath)/docs 34 | rm -rf $(Build.Repository.LocalPath)/docs/html/.doctrees 35 | displayName: 'Copy artifacts to clean branch' 36 | 37 | - script: | 38 | cd $(Build.Repository.LocalPath) 39 | git add --all 40 | git commit -m "Build documentation [skip ci]" 41 | git push origin HEAD:master 42 | displayName: 'Publish GitHub Pages' 43 | condition: | 44 | and(not(eq(variables['Build.Reason'], 'PullRequest')), 45 | eq(variables['Build.SourceBranch'], 'refs/heads/master')) -------------------------------------------------------------------------------- /dags/generate_twitter.py: -------------------------------------------------------------------------------- 1 | """ Simple example of creating subdags and generating work dynamically""" 2 | from airflow import DAG 3 | from airflow.hooks import SqliteHook 4 | 5 | from airflow.hooks.mysql_hook import MySqlHook 6 | from airflow.models import Variable 7 | from airflow.operators.email_operator import EmailOperator 8 | from airflow.operators.python_operator import PythonOperator, BranchPythonOperator 9 | from airflow.operators.bash_operator import BashOperator 10 | from airflow.operators.subdag_operator import SubDagOperator 11 | 12 | 13 | from twitter_airflow import search_twitter, RAW_TWEET_DIR 14 | from subdags.twitter_subdag import subdag 15 | from datetime import datetime, timedelta 16 | import pandas as pd 17 | import re 18 | import random 19 | 20 | 21 | SEARCH_TERMS = ["#python", "#pydata", "#airflow", "data wrangling", "data pipelines"] 22 | 23 | 24 | default_args = { 25 | "owner": "admin", 26 | "depends_on_past": False, 27 | "start_date": datetime.now() - timedelta(days=4), 28 | "retries": 1, 29 | "retry_delay": timedelta(minutes=5), 30 | } 31 | 32 | dag = DAG( 33 | "generate_twitter_dags", default_args=default_args, schedule_interval="@daily" 34 | ) 35 | 36 | 37 | def fill_terms(my_terms=SEARCH_TERMS, **kwargs): 38 | """ Fill sqlite database with a few search terms. 
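The terms are written to a `twitter_terms` table through the `mysql_default` Airflow connection; if the table already exists, the ValueError raised by `DataFrame.to_sql` is swallowed, so re-running the task is effectively a no-op.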
""" 39 | dbconn = MySqlHook(mysql_conn_id="mysql_default") 40 | conn = dbconn.get_connection() 41 | cursor = conn.cursor() 42 | df = pd.DataFrame(my_terms, columns=["search_term"]) 43 | try: 44 | df.to_sql("twitter_terms", conn) 45 | except ValueError: 46 | # table already exists 47 | pass 48 | 49 | 50 | def generate_search_terms(**kwargs): 51 | """ Generate subdag to search twitter for terms. """ 52 | dbconn = MySqlHook(mysql_conn_id="mysql_default") 53 | conn = dbconn.get_connection() 54 | cursor = conn.cursor() 55 | query = "select * from twitter_terms" 56 | df = pd.read_sql_query(query, conn) 57 | return random.choice( 58 | [ 59 | "search_{}_twitter".format(re.sub(r"\W+", "", t)) 60 | for t in df.search_term.values 61 | ] 62 | ) 63 | 64 | 65 | fill_search_terms = PythonOperator( 66 | task_id="fill_terms", provide_context=True, python_callable=fill_terms, dag=dag 67 | ) 68 | 69 | 70 | gen_search_terms = BranchPythonOperator( 71 | task_id="generate_search_terms", 72 | provide_context=True, 73 | python_callable=generate_search_terms, 74 | dag=dag, 75 | ) 76 | 77 | 78 | email_links = EmailOperator( 79 | task_id="email_best_links", 80 | to="MYEMAIL@MYSITE.com", 81 | subject="Latest popular links", 82 | html_content="Check out the latest!!", 83 | files=["{}/latest_links.txt".format(RAW_TWEET_DIR)], 84 | dag=dag, 85 | ) 86 | 87 | 88 | sub = SubDagOperator( 89 | subdag=subdag, task_id="insert_and_id_pop", trigger_rule="one_success", dag=dag 90 | ) 91 | 92 | 93 | clear_latest = BashOperator( 94 | bash_command="rm -rf {}/latest_links.txt".format(RAW_TWEET_DIR), 95 | task_id="clear_latest", 96 | dag=dag, 97 | ) 98 | 99 | 100 | gen_search_terms.set_upstream(fill_search_terms) 101 | 102 | for term in SEARCH_TERMS: 103 | term_without_punctuation = re.sub(r"\W+", "", term) 104 | simple_search = PythonOperator( 105 | task_id="search_{}_twitter".format(term_without_punctuation), 106 | provide_context=True, 107 | python_callable=search_twitter, 108 | dag=dag, 109 | params={"query": term}, 110 | ) 111 | simple_search.set_upstream(gen_search_terms) 112 | simple_search.set_downstream(sub) 113 | 114 | sub.set_downstream(email_links) 115 | email_links.set_downstream(clear_latest) 116 | -------------------------------------------------------------------------------- /dags/parameters.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example uses the existing Dummy Operator and Variable model to 3 | demonstrate dynamic creation of DAGs based on a Variable setting. As 4 | shown below, a list of customer objects is retrieved and used to create 5 | unique dags based on the imput. 
6 | """ 7 | 8 | from datetime import datetime, timedelta 9 | from airflow.models import DAG 10 | from airflow.models import Variable 11 | from airflow.operators.dummy_operator import DummyOperator 12 | 13 | # Create JSON Variable if it doesn't exist 14 | 15 | CUSTOMERS = [ 16 | { 17 | "customer_name": "Faux Customer", 18 | "customer_id": "faux_customer", 19 | "email": ["admin@fauxcustomer.com", "admin@astronomer.io"], 20 | "schedule_interval": None, 21 | "enabled": True, 22 | }, 23 | { 24 | "customer_name": "Bogus Customer", 25 | "customer_id": "bogus_customer", 26 | "email": ["admin@boguscustomer.com", "admin@astronomer.io"], 27 | "schedule_interval": "@once", 28 | "enabled": True, 29 | }, 30 | ] 31 | 32 | # Get JSON Variable 33 | CUSTOMERS = Variable.get("customer_list", default_var=CUSTOMERS, deserialize_json=True) 34 | 35 | 36 | def create_dag(customer): 37 | """ 38 | Accepts a customer parameters dict and 39 | overrides default args to create a DAG object 40 | 41 | Returns: DAG() Object 42 | """ 43 | default_args = { 44 | "owner": "airflow", 45 | "depends_on_past": False, 46 | "email": "xyz@xyz.com", 47 | "retries": 1, 48 | "retry_delay": timedelta(minutes=5), 49 | "start_date": datetime(2017, 1, 1, 0, 0), 50 | "end_date": None, 51 | } 52 | 53 | """ 54 | This allows DAG parameters to be passed in from the Variable if 55 | a customer needs something specific overridden in their DAG. 56 | Consider how email being passed in from the customer object 57 | overrides email in the resulting replaced_args object. 58 | """ 59 | replaced_args = { 60 | k: default_args[k] if customer.get(k, None) is None else customer[k] 61 | for k in default_args 62 | } 63 | 64 | dag_id = "{base_name}_{id}".format( 65 | base_name="load_clickstream_data", id=customer["customer_id"] 66 | ) 67 | 68 | return DAG( 69 | dag_id=dag_id, 70 | default_args=replaced_args, 71 | schedule_interval=customer["schedule_interval"], 72 | ) 73 | 74 | # Loop customers array of containing customer objects 75 | for cust in CUSTOMERS: 76 | if cust["enabled"]: 77 | 78 | dag = create_dag(cust) 79 | 80 | globals()[dag.dag_id] = dag 81 | 82 | extract = DummyOperator(task_id="extract_data", dag=dag) 83 | 84 | transform = DummyOperator(task_id="transform_data", dag=dag) 85 | 86 | load = DummyOperator(task_id="load_data", dag=dag) 87 | 88 | extract >> transform >> load 89 | 90 | else: 91 | # TODO Create but programmatically pause 92 | pass 93 | -------------------------------------------------------------------------------- /dags/simple_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | 7 | 8 | def print_hello(): 9 | return "Hello world!" 
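# default_args below are applied to every task in this DAG unless an operator
# overrides them explicitly (e.g. retries=3 on the DummyOperator further down).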
10 | 11 | 12 | default_args = { 13 | "owner": "airflow", 14 | "depends_on_past": False, 15 | "start_date": datetime(2019, 4, 30), 16 | "email": ["airflow@example.com"], 17 | "email_on_failure": False, 18 | "email_on_retry": False, 19 | "retries": 1, 20 | "retry_delay": timedelta(minutes=2), 21 | } 22 | 23 | dag = DAG( 24 | "hello_world", 25 | description="Simple tutorial DAG", 26 | schedule_interval="0 12 * * *", 27 | default_args=default_args, 28 | catchup=False, 29 | ) 30 | 31 | t1 = DummyOperator(task_id="dummy_task", retries=3, dag=dag) 32 | 33 | t2 = PythonOperator(task_id="hello_task", python_callable=print_hello, dag=dag) 34 | 35 | # sets downstream foe t1 36 | t1 >> t2 37 | 38 | # equivalent 39 | # t2.set_upstream(t1) 40 | -------------------------------------------------------------------------------- /dags/subdags/twitter_subdag.py: -------------------------------------------------------------------------------- 1 | """ Simple subdag example """ 2 | from airflow import DAG 3 | from airflow.operators import PythonOperator 4 | from twitter_airflow import csv_to_sql, identify_popular_links 5 | from datetime import datetime, timedelta 6 | 7 | 8 | default_args = { 9 | "owner": "admin", 10 | "depends_on_past": False, 11 | "start_date": datetime(2016, 1, 1), 12 | "retries": 1, 13 | "retry_delay": timedelta(minutes=5), 14 | } 15 | 16 | subdag = DAG("generate_twitter_dags.insert_and_id_pop", default_args=default_args) 17 | 18 | move_tweets_to_sql = PythonOperator( 19 | task_id="csv_to_sqlite", 20 | provide_context=True, 21 | python_callable=csv_to_sql, 22 | dag=subdag, 23 | ) 24 | 25 | id_popular = PythonOperator( 26 | task_id="identify_popular_links", 27 | provide_context=True, 28 | python_callable=identify_popular_links, 29 | dag=subdag, 30 | params={"write_mode": "a"}, 31 | ) 32 | 33 | id_popular.set_upstream(move_tweets_to_sql) 34 | -------------------------------------------------------------------------------- /dags/twitter_airflow.py: -------------------------------------------------------------------------------- 1 | """ Simple Airflow data pipeline example using Twitter API """ 2 | import ast 3 | import glob 4 | import itertools 5 | import os.path 6 | import shutil 7 | from collections import Counter 8 | from configparser import ConfigParser 9 | from csv import DictWriter, writer 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | import MySQLdb 13 | import MySQLdb.cursors 14 | 15 | import pandas as pd 16 | from tweepy import API, Cursor, OAuthHandler 17 | 18 | from airflow import DAG 19 | from airflow.hooks import sqlite_hook 20 | from airflow.hooks.mysql_hook import MySqlHook 21 | from airflow.models import Variable 22 | from airflow.operators.email_operator import EmailOperator 23 | from airflow.operators.python_operator import PythonOperator 24 | 25 | RAW_TWEET_DIR = os.path.abspath(os.path.join(__file__, "../data/tweets/")) 26 | CONFIG_FILE = os.path.abspath(os.path.join(__file__, "../config/prod.cfg")) 27 | MAX_TWEEPY_PAGE = 2 28 | 29 | # since there do not exist task on their own we need to create the DAG 30 | default_args = { 31 | "owner": "admin", 32 | "depends_on_past": False, 33 | "start_date": datetime.now() - timedelta(days=5), 34 | "retries": 1, 35 | "retry_delay": timedelta(minutes=5), 36 | } 37 | 38 | dag = DAG("twitter_links", default_args=default_args, schedule_interval="@daily") 39 | 40 | 41 | def extract_tweet_data(tweepy_obj, query): 42 | """ Extract relevant and serializable data from a tweepy Tweet object 43 | params: 44 | 
tweepy_obj: Tweepy Tweet Object 45 | query: str 46 | returns dict 47 | """ 48 | return { 49 | "user_id": tweepy_obj.user.id, 50 | "user_name": tweepy_obj.user.name, 51 | "user_screenname": tweepy_obj.user.screen_name, 52 | "user_url": tweepy_obj.user.url, 53 | "user_description": tweepy_obj.user.description, 54 | "user_followers": tweepy_obj.user.followers_count, 55 | "user_friends": tweepy_obj.user.friends_count, 56 | "created": tweepy_obj.created_at.isoformat(), 57 | "text": tweepy_obj.text, 58 | "hashtags": [ht.get("text") for ht in tweepy_obj.entities.get("hashtags")], 59 | "mentions": [ 60 | (um.get("id"), um.get("screen_name")) 61 | for um in tweepy_obj.entities.get("user_mentions") 62 | ], 63 | "urls": [url.get("expanded_url") for url in tweepy_obj.entities.get("urls")], 64 | "tweet_id": tweepy_obj.id, 65 | "is_quote_status": tweepy_obj.is_quote_status, 66 | "favorite_count": tweepy_obj.favorite_count, 67 | "retweet_count": tweepy_obj.retweet_count, 68 | "reply_status_id": tweepy_obj.in_reply_to_status_id, 69 | "lang": tweepy_obj.lang, 70 | "source": tweepy_obj.source, 71 | "location": tweepy_obj.coordinates, 72 | "query": query, 73 | } 74 | 75 | 76 | def search_twitter(**kwargs): 77 | """ Search for a query in public tweets""" 78 | query = kwargs.get("params").get("query") 79 | 80 | auth = OAuthHandler(Variable.get("consumer_key"), Variable.get("consumer_secret")) 81 | auth.set_access_token( 82 | Variable.get("access_token"), Variable.get("access_token_secret") 83 | ) 84 | api = API(auth) 85 | 86 | all_tweets = [] 87 | page_num = 0 88 | since_date = datetime.strptime(kwargs.get("ds"), "%Y-%m-%d").date() - timedelta( 89 | days=1 90 | ) 91 | query += " since:{} until:{}".format( 92 | since_date.strftime("%Y-%m-%d"), kwargs.get("ds") 93 | ) 94 | print(f"searching twitter with: {query}") 95 | for page in Cursor( 96 | api.search, q=query, monitor_rate_limit=True, wait_on_rate_limit=True 97 | ).pages(): 98 | all_tweets.extend([extract_tweet_data(t, query) for t in page]) 99 | page_num += 1 100 | if page_num > MAX_TWEEPY_PAGE: 101 | break 102 | 103 | # if it's an empty list, stop here 104 | if not len(all_tweets): 105 | return 106 | 107 | filename = "{}/{}_{}.csv".format( 108 | RAW_TWEET_DIR, query, datetime.now().strftime("%m%d%Y%H%M%S") 109 | ) 110 | 111 | # check that the directory exists 112 | if not Path(filename).resolve().parent.exists(): 113 | 114 | os.mkdir(Path(filename).resolve().parent) 115 | 116 | with open(filename, "w") as raw_file: 117 | raw_wrtr = DictWriter(raw_file, fieldnames=all_tweets[0].keys()) 118 | raw_wrtr.writeheader() 119 | raw_wrtr.writerows(all_tweets) 120 | 121 | 122 | def csv_to_sql(directory=RAW_TWEET_DIR, **kwargs): 123 | """ csv to sql pipeline using pandas 124 | params: 125 | directory: str (file path to csv files) 126 | """ 127 | dbconn = MySqlHook(mysql_conn_id="mysql_default") 128 | conn = dbconn.get_connection() 129 | cursor = conn.cursor() 130 | 131 | for fname in glob.glob("{}/*.csv".format(directory)): 132 | if "_read" not in fname: 133 | try: 134 | df = pd.read_csv(fname) 135 | df.to_sql("tweets", dbconn, if_exists="append", index=False) 136 | shutil.move(fname, fname.replace(".csv", "_read.csv")) 137 | except pd.io.common.EmptyDataError: 138 | # probably an io error with another task / open file 139 | continue 140 | 141 | 142 | def identify_popular_links(directory=RAW_TWEET_DIR, write_mode="w", **kwargs): 143 | """ Identify the most popular links from the last day of tweest in the db 144 | Writes them to latest_links.txt in the RAW_TWEET_DIR 
145 | (or directory kwarg) 146 | """ 147 | dbconn = MySqlHook(mysql_conn_id="mysql_default") 148 | conn = dbconn.get_connection() 149 | cursor = conn.cursor() 150 | 151 | query = """select * from tweets where 152 | created > date('now', '-1 days') and urls is not null 153 | order by favorite_count""" 154 | df = pd.read_sql_query(query, conn) 155 | df.urls = df.urls.map(ast.literal_eval) 156 | cntr = Counter(itertools.chain.from_iterable(df.urls.values)) 157 | with open("{}/latest_links.txt".format(directory), write_mode) as latest: 158 | wrtr = writer(latest) 159 | wrtr.writerow(["url", "count"]) 160 | wrtr.writerows(cntr.most_common(5)) 161 | 162 | 163 | # -------------------------------------- 164 | # Tasks 165 | # ------------------------------------- 166 | simple_search = PythonOperator( 167 | task_id="search_twitter", 168 | provide_context=True, 169 | python_callable=search_twitter, 170 | dag=dag, 171 | # note we pass this as a params obj 172 | params={"query": "#pycon"}, 173 | ) 174 | 175 | 176 | move_tweets_to_sql = PythonOperator( 177 | task_id="csv_to_sql", 178 | # extra DAG context 179 | provide_context=True, 180 | # call the function 181 | python_callable=csv_to_sql, 182 | dag=dag, 183 | ) 184 | 185 | 186 | id_popular = PythonOperator( 187 | task_id="identify_popular_links", 188 | provide_context=True, 189 | python_callable=identify_popular_links, 190 | dag=dag, 191 | ) 192 | 193 | 194 | email_links = EmailOperator( 195 | task_id="email_best_links", 196 | to="trallard@bitsandchips.me", 197 | subject="Latest popular links", 198 | html_content="Check out the latest!!", 199 | files=["{}/latest_links.txt".format(RAW_TWEET_DIR)], 200 | dag=dag, 201 | ) 202 | 203 | 204 | simple_search.set_downstream(move_tweets_to_sql) 205 | id_popular.set_upstream(move_tweets_to_sql) 206 | email_links.set_upstream(id_popular) 207 | -------------------------------------------------------------------------------- /docs/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: 67cd5c5b948c82ac9d91d9479af6e978 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/.nojekyll -------------------------------------------------------------------------------- /docs/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. Airflow tutorial documentation master file, created by 2 | sphinx-quickstart on Mon Apr 15 15:52:00 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Airflow tutorial 7 | ============================================ 8 | This tutorial was originally developed for PyCon US 2019. 9 | 10 | .. toctree:: 11 | :caption: Table of Contents 12 | :hidden: 13 | :maxdepth: 2 14 | 15 | setup 16 | 17 | 18 | .. toctree:: 19 | :maxdepth: 2 20 | :caption: Contents: 21 | 22 | About your facilitator 23 | ====================== 24 | 25 | My name is Tania. I live in Manchester UK where I work as a 26 | Cloud Advocate for Microsoft. 
27 | 28 | Over the years, I have worked as a data engineer, machine learning engineer, 29 | and research software engineer. I love data intensive 30 | enviroments and I am particularly interested in the tools and workflows to 31 | deliver robust, reproducible data insights. 32 | 33 | If you have any questions or feedback about this tutorial please, 34 | file an issue using the following link: ``_. 35 | 36 | You can also contact me via the following channels: 37 | 38 | - E-mail: trallard@bitsandchips.me 39 | - Twitter: `@ixek `_ 40 | - `Tania on GitHub `_ 41 | 42 | Code of Conduct 43 | ================ 44 | All attendees to this workshop are expected to adhere to PyCon's Code of Conduct, 45 | in brief: 46 | **Be open, considerate, and respectful.** 47 | 48 | License 49 | ======= 50 | The content in this workshop is Licensed under `CC-BY-SA 4.0 `_. 51 | Which means that you can use, remix and re-distribute so long attribution to the original 52 | author is maintained (Tania Allard). 53 | 54 | The logo used here was designed by Ashley McNamara for the Microsoft Developer Advocates team use. 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /docs/_sources/setup.md.txt: -------------------------------------------------------------------------------- 1 | # Getting started 2 | 3 | -------------------------------------------------------------------------------- /docs/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* */ 2 | @import url('https://fonts.googleapis.com/css?family=Itim|Montserrat|Roboto+Mono'); 3 | 4 | a { 5 | color: rgb(96, 138, 197); 6 | } 7 | 8 | a:hover { 9 | color: rgb(65, 129, 218); 10 | } 11 | 12 | div.body h1 { 13 | color: #5F6366; 14 | font-family: 'Itim', cursive; 15 | font-weight: bold; 16 | font-size: 300%; 17 | } 18 | 19 | div.sphinxsidebarwrapper h1.logo { 20 | text-align: center; 21 | margin: 0 0 -20px 0; 22 | } 23 | 24 | div.sphinxsidebar p.blurb { 25 | font-size: 130%; 26 | text-align: center; 27 | font-family: 'Itim', cursive; 28 | color: rgb(151, 139, 196); 29 | } 30 | 31 | div.sphinxsidebar h1{ 32 | font-size: 160%; 33 | color: #5F6366; 34 | font-family: 'Itim', cursive; 35 | } 36 | 37 | div.sphinxsidebar h1 a { 38 | font-size: 160%; 39 | color: #5F6366; 40 | text-decoration: none; 41 | border: none; 42 | font-family: 'Itim', cursive; 43 | } 44 | 45 | div.sphinxsidebar h1 a:hover { 46 | border: none; 47 | } 48 | 49 | div.sphinxsidebar h3 { 50 | display: none; 51 | } 52 | 53 | div.sphinxsidebar a { 54 | color: #5F6366; 55 | } 56 | 57 | code.descname { 58 | color: rgb(151, 139, 196); 59 | } 60 | 61 | th.field-name { 62 | min-width: 100px; 63 | color: rgb(151, 139, 196); 64 | } 65 | 66 | tt, code { 67 | color: #F8F8F2; 68 | background: #015259; 69 | border-radius: 0.3em; 70 | padding: 0.0em 0.3em; 71 | } 72 | 73 | a.reference.internal code.xref span.pre { 74 | color: #F8F8F2; 75 | background: #015259; 76 | border-bottom: none; 77 | border-radius: 0; 78 | padding: 0; 79 | } 80 | 81 | a.reference.internal, a.reference.internal:hover { 82 | border-bottom: none; 83 | } 84 | 85 | a.reference.internal:hover code { 86 | background: #027bab 87 | } 88 | 89 | a.reference.internal:hover code.xref span.pre { 90 | color: #F8F8F2; 91 | background: #027bab; 92 | border-bottom: none; 93 | } 94 | 95 | tt.xref, code.xref, a tt { 96 | background: none; 97 | border-bottom: none; 98 | } 99 | 100 | code.literal { 101 | color: #F8F8F2; 102 | background: #015259; 103 | } 104 | 105 
| pre { 106 | padding: 20px 30px; 107 | background: #003038; 108 | } 109 | 110 | div > dl { 111 | border-left: 2px solid #00384021; 112 | padding-left: 5px; 113 | } 114 | 115 | dt { 116 | color: rgb(96, 138, 197); 117 | } 118 | 119 | 120 | div.footer::before { 121 | display: block; 122 | content: ''; 123 | border-top: 2px solid #EDB5BF; 124 | width: 50%; 125 | margin: 2em auto 2em auto; 126 | } 127 | 128 | div.footer { 129 | text-align: center; 130 | /* color: #029be2; */ 131 | } 132 | 133 | div.footer a { 134 | color: #027bab; 135 | text-decoration: none; 136 | } 137 | 138 | @media screen and (max-width: 875px) { 139 | div.sphinxsidebar { 140 | background: #4D6D9A; 141 | } 142 | div.sphinxsidebar h1.logo, div.sphinxsidebar p.blurb{ 143 | text-align: left; 144 | } 145 | div.sphinxsidebar h1 a { 146 | color: #1bc5e0; 147 | } 148 | div.sphinxsidebar a { 149 | /* color: rgb(151, 139, 196); */ 150 | color: white; 151 | } 152 | div.sphinxsidebar ul { 153 | /* color: rgb(151, 139, 196); */ 154 | color: white; 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /docs/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | var DOCUMENTATION_OPTIONS = { 2 | URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), 3 | VERSION: '', 4 | LANGUAGE: 'None', 5 | COLLAPSE_INDEX: false, 6 | FILE_SUFFIX: '.html', 7 | HAS_SOURCE: true, 8 | SOURCELINK_SUFFIX: '.txt', 9 | NAVIGATION_WITH_KEYS: false 10 | }; -------------------------------------------------------------------------------- /docs/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/_static/file.png -------------------------------------------------------------------------------- /docs/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/_static/minus.png -------------------------------------------------------------------------------- /docs/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/_static/plus.png -------------------------------------------------------------------------------- /docs/_static/pygments.css: -------------------------------------------------------------------------------- 1 | .highlight .hll { background-color: #49483e } 2 | .highlight { background: #272822; color: #f8f8f2 } 3 | .highlight .c { color: #75715e } /* Comment */ 4 | .highlight .err { color: #960050; background-color: #1e0010 } /* Error */ 5 | .highlight .k { color: #66d9ef } /* Keyword */ 6 | .highlight .l { color: #ae81ff } /* Literal */ 7 | .highlight .n { color: #f8f8f2 } /* Name */ 8 | .highlight .o { color: #f92672 } /* Operator */ 9 | .highlight .p { color: #f8f8f2 } /* Punctuation */ 10 | .highlight .ch { color: #75715e } /* Comment.Hashbang */ 11 | .highlight .cm { color: #75715e } /* Comment.Multiline */ 12 | .highlight .cp { color: #75715e } /* Comment.Preproc */ 13 | .highlight .cpf { color: #75715e } /* Comment.PreprocFile */ 14 | .highlight .c1 { color: #75715e } /* Comment.Single */ 15 | .highlight .cs { color: #75715e } /* Comment.Special */ 16 | .highlight 
.gd { color: #f92672 } /* Generic.Deleted */ 17 | .highlight .ge { font-style: italic } /* Generic.Emph */ 18 | .highlight .gi { color: #a6e22e } /* Generic.Inserted */ 19 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 20 | .highlight .gu { color: #75715e } /* Generic.Subheading */ 21 | .highlight .kc { color: #66d9ef } /* Keyword.Constant */ 22 | .highlight .kd { color: #66d9ef } /* Keyword.Declaration */ 23 | .highlight .kn { color: #f92672 } /* Keyword.Namespace */ 24 | .highlight .kp { color: #66d9ef } /* Keyword.Pseudo */ 25 | .highlight .kr { color: #66d9ef } /* Keyword.Reserved */ 26 | .highlight .kt { color: #66d9ef } /* Keyword.Type */ 27 | .highlight .ld { color: #e6db74 } /* Literal.Date */ 28 | .highlight .m { color: #ae81ff } /* Literal.Number */ 29 | .highlight .s { color: #e6db74 } /* Literal.String */ 30 | .highlight .na { color: #a6e22e } /* Name.Attribute */ 31 | .highlight .nb { color: #f8f8f2 } /* Name.Builtin */ 32 | .highlight .nc { color: #a6e22e } /* Name.Class */ 33 | .highlight .no { color: #66d9ef } /* Name.Constant */ 34 | .highlight .nd { color: #a6e22e } /* Name.Decorator */ 35 | .highlight .ni { color: #f8f8f2 } /* Name.Entity */ 36 | .highlight .ne { color: #a6e22e } /* Name.Exception */ 37 | .highlight .nf { color: #a6e22e } /* Name.Function */ 38 | .highlight .nl { color: #f8f8f2 } /* Name.Label */ 39 | .highlight .nn { color: #f8f8f2 } /* Name.Namespace */ 40 | .highlight .nx { color: #a6e22e } /* Name.Other */ 41 | .highlight .py { color: #f8f8f2 } /* Name.Property */ 42 | .highlight .nt { color: #f92672 } /* Name.Tag */ 43 | .highlight .nv { color: #f8f8f2 } /* Name.Variable */ 44 | .highlight .ow { color: #f92672 } /* Operator.Word */ 45 | .highlight .w { color: #f8f8f2 } /* Text.Whitespace */ 46 | .highlight .mb { color: #ae81ff } /* Literal.Number.Bin */ 47 | .highlight .mf { color: #ae81ff } /* Literal.Number.Float */ 48 | .highlight .mh { color: #ae81ff } /* Literal.Number.Hex */ 49 | .highlight .mi { color: #ae81ff } /* Literal.Number.Integer */ 50 | .highlight .mo { color: #ae81ff } /* Literal.Number.Oct */ 51 | .highlight .sa { color: #e6db74 } /* Literal.String.Affix */ 52 | .highlight .sb { color: #e6db74 } /* Literal.String.Backtick */ 53 | .highlight .sc { color: #e6db74 } /* Literal.String.Char */ 54 | .highlight .dl { color: #e6db74 } /* Literal.String.Delimiter */ 55 | .highlight .sd { color: #e6db74 } /* Literal.String.Doc */ 56 | .highlight .s2 { color: #e6db74 } /* Literal.String.Double */ 57 | .highlight .se { color: #ae81ff } /* Literal.String.Escape */ 58 | .highlight .sh { color: #e6db74 } /* Literal.String.Heredoc */ 59 | .highlight .si { color: #e6db74 } /* Literal.String.Interpol */ 60 | .highlight .sx { color: #e6db74 } /* Literal.String.Other */ 61 | .highlight .sr { color: #e6db74 } /* Literal.String.Regex */ 62 | .highlight .s1 { color: #e6db74 } /* Literal.String.Single */ 63 | .highlight .ss { color: #e6db74 } /* Literal.String.Symbol */ 64 | .highlight .bp { color: #f8f8f2 } /* Name.Builtin.Pseudo */ 65 | .highlight .fm { color: #a6e22e } /* Name.Function.Magic */ 66 | .highlight .vc { color: #f8f8f2 } /* Name.Variable.Class */ 67 | .highlight .vg { color: #f8f8f2 } /* Name.Variable.Global */ 68 | .highlight .vi { color: #f8f8f2 } /* Name.Variable.Instance */ 69 | .highlight .vm { color: #f8f8f2 } /* Name.Variable.Magic */ 70 | .highlight .il { color: #ae81ff } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /docs/_static/python.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/_static/python.png -------------------------------------------------------------------------------- /docs/genindex.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Index — Airflow tutorial documentation 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 |
117 | 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /docs/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: 7f6d2b706dda0a3b5cf0f2c68897deb7 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/html/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/.nojekyll -------------------------------------------------------------------------------- /docs/html/_images/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/12.png -------------------------------------------------------------------------------- /docs/html/_images/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/4.jpg -------------------------------------------------------------------------------- /docs/html/_images/DAG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/DAG.png -------------------------------------------------------------------------------- /docs/html/_images/airflow-logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/airflow-logo.jpeg -------------------------------------------------------------------------------- /docs/html/_images/airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/airflow.png -------------------------------------------------------------------------------- /docs/html/_images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/architecture.png -------------------------------------------------------------------------------- /docs/html/_images/automation1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/automation1.jpg -------------------------------------------------------------------------------- /docs/html/_images/azure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/azure.png -------------------------------------------------------------------------------- /docs/html/_images/gooddata.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/gooddata.png -------------------------------------------------------------------------------- /docs/html/_images/gooddata1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/gooddata1.png -------------------------------------------------------------------------------- /docs/html/_images/luigi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/luigi.png -------------------------------------------------------------------------------- /docs/html/_images/mssignin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/mssignin.png -------------------------------------------------------------------------------- /docs/html/_images/twitter1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/twitter1.png -------------------------------------------------------------------------------- /docs/html/_images/twitter2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/twitter2.png -------------------------------------------------------------------------------- /docs/html/_images/twitter3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/twitter3.png -------------------------------------------------------------------------------- /docs/html/_images/uses.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_images/uses.png -------------------------------------------------------------------------------- /docs/html/_sources/about.md.txt: -------------------------------------------------------------------------------- 1 | # About the workshop 2 | 3 | We will be taking a look at the basic concepts of data pipelines as well as practical use cases using Python. 4 | 5 | ## About you: 6 | - Some experience using the command line 7 | - Intermediate Python knowledge / use 8 | - Be able to apply what we learn and adopt to your use cases 9 | - Interested in data and systems 10 | - Aspring or current data engineering 11 | - Some knowledge about systems and databases (enough to be dangerous) 12 | 13 | ## Our focus for the day 14 | - Greater understanding on how to apply data pipelines using the Python toolset 15 | - Focus on concepts 16 | - Apply knowledge with each library 17 | - Will give you the building blocks 18 | 19 | ## Keeping on track 20 | 21 | You will find 🚦 across the tutorial examples. 
We will use this to identify how folks are doing over the workshop (if following along in person). 22 | Place the post it as follows: 23 | 24 | 🚦 Purple postit: all good, task has been completed 25 | 26 | 🚦 Orange postit: I need extra time or need help with the task in hand -------------------------------------------------------------------------------- /docs/html/_sources/airflow-intro.md.txt: -------------------------------------------------------------------------------- 1 | # Airflow basics 2 | 3 | ## What is Airflow? 4 | 5 | ![airflow logo](_static/airflow-logo.jpeg) 6 | 7 | Airflow is a Workflow engine which means: 8 | 9 | - Manage scheduling and running jobs and data pipelines 10 | - Ensures jobs are ordered correctly based on dependencies 11 | - Manage the allocation of scarce resources 12 | - Provides mechanisms for tracking the state of jobs and recovering from failure 13 | 14 | It is highly versatile and can be used across many many domains: 15 | ![](_static/uses.png) 16 | 17 | ## Basic Airflow concepts 18 | 19 | - **Task**: a defined unit of work (these are called operators in Airflow) 20 | - **Task instance**: an individual run of a single task. Task instances also have an indicative state, which could be “running”, “success”, “failed”, “skipped”, “up for retry”, etc. 21 | - **DAG**: Directed acyclic graph, 22 | a set of tasks with explicit execution order, beginning, and end 23 | - **DAG run**: individual execution/run of a DAG 24 | 25 | **Debunking the DAG** 26 | 27 | The vertices and edges (the arrows linking the nodes) have an order and direction associated to them 28 | 29 | ![](_static/DAG.png) 30 | 31 | each node in a DAG corresponds to a task, which in turn represents some sort of data processing. For example: 32 | 33 | Node A could be the code for pulling data from an API, node B could be the code for anonymizing the data. Node B could be the code for checking that there are no duplicate records, and so on. 34 | 35 | These 'pipelines' are acyclic since they need a point of completion. 36 | 37 | **Dependencies** 38 | 39 | Each of the vertices has a particular direction that shows the relationship between certain nodes. For example, we can only anonymize data once this has been pulled out from the API. 40 | 41 | ## Idempotency 42 | 43 | This is one of the most important characteristics of good ETL architectures. 44 | 45 | When we say that something is idempotent it means it will produce the same result regardless of how many times this is run (i.e. the results are reproducible). 46 | 47 | Reproducibility is particularly important in data-intensive environments as this ensures that the same inputs will always return the same outputs. 48 | 49 | ## Airflow components 50 | 51 | ![](_static/architecture.png) 52 | 53 | There are 4 main components to Apache Airflow: 54 | 55 | ### Web server 56 | 57 | The GUI. This is under the hood a Flask app where you can track the status of your jobs and read logs from a remote file store (e.g. [Azure Blobstorage](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-blobs-overview/?wt.mc_id=PyCon-github-taallard)). 58 | 59 | ### Scheduler 60 | 61 | This component is responsible for scheduling jobs. This is a multithreaded Python process that uses the DAGb object to decide what tasks need to be run, when and where. 62 | 63 | The task state is retrieved and updated from the database accordingly. The web server then uses these saved states to display job information. 
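
As a rough sketch of this behaviour (the snippet below is not part of the tutorial repo and assumes the Airflow 1.x semantics used throughout this tutorial), a DAG run for a given `execution_date` is only created once that date plus one `schedule_interval` has elapsed:

```python
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

# Hypothetical example DAG: with a daily schedule and this start_date, the run
# stamped with execution_date 2019-05-01 is created shortly after midnight on
# 2019-05-02, i.e. once the 2019-05-01 interval has fully elapsed.
dag = DAG(
    "scheduler_example",              # illustrative dag_id, not used elsewhere
    schedule_interval="@daily",
    start_date=datetime(2019, 5, 1),
    catchup=False,                    # only schedule the most recent interval
)

noop = DummyOperator(task_id="noop", dag=dag)
```

Keeping this lag in mind helps when reading scheduler logs later on: the `execution_date` you see is the start of the interval being processed, not the wall-clock time the run was triggered.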
64 | 65 | ### Executor 66 | 67 | The mechanism that gets the tasks done. 68 | 69 | ### Metadata database 70 | 71 | - Powers how the other components interact 72 | - Stores the Airflow states 73 | - All processes read and write from here 74 | 75 | ## Workflow as a code 76 | One of the main advantages of using a workflow system like Airflow is that all is code, which makes your workflows maintainable, versionable, testable, and collaborative. 77 | 78 | Thus your workflows become more explicit and maintainable (atomic tasks). 79 | 80 | Not only your code is dynamic but also is your infrastructure. 81 | 82 | ### Defining tasks 83 | 84 | Tasks are defined based on the abstraction of `Operators` (see Airflow docs [here](https://airflow.apache.org/concepts.html#operators)) which represent a single **idempotent task**. 85 | 86 | The best practice is to have atomic operators (i.e. can stand on their own and do not need to share resources among them). 87 | 88 | You can choose among; 89 | - `BashOperator` 90 | - `PythonOperator` 91 | - `EmailOperator` 92 | - `SimpleHttpOperator` 93 | - `MySqlOperator` (and other DB) 94 | 95 | Examples: 96 | 97 | ```python 98 | t1 = BashOperator(task_id='print_date', 99 | bash_command='date, 100 | dag=dag) 101 | ``` 102 | 103 | ```python 104 | def print_context(ds, **kwargs): 105 | pprint(kwargs) 106 | print(ds) 107 | return 'Whatever you return gets printed in the logs' 108 | 109 | 110 | run_this = PythonOperator( 111 | task_id='print_the_context', 112 | provide_context=True, 113 | python_callable=print_context, 114 | dag=dag, 115 | ) 116 | ``` 117 | 118 | ## Comparing Luigi and Airflow 119 | 120 | ### Luigi 121 | 122 | - Created at Spotify (named after the plumber) 123 | - Open sourced in late 2012 124 | - GNU make for data 125 | 126 | ### Airflow 127 | - Airbnb data team 128 | - Open-sourced mud 2015 129 | - Apache incubator mid-2016 130 | - ETL pipelines 131 | 132 | ### Similarities 133 | - Python open source projects for data pipelines 134 | - Integrate with a number of sources (databases, filesystems) 135 | - Tracking failure, retries, success 136 | - Ability to identify the dependencies and execution 137 | 138 | ### Differences 139 | - Scheduler support: Airflow has built-in support using schedulers 140 | - Scalability: Airflow has had stability issues in the past 141 | - Web interfaces 142 | 143 | ![](_static/luigi.png) 144 | 145 | 146 | ![](_static/airflow.png) 147 | 148 | 149 | | Airflow | Luigi | 150 | | ------------------------------------------------ | ------------------------------------------------------------------------------ | 151 | | Task are defined by`dag_id` defined by user name | Task are defined by task name and parameters | 152 | | Task retries based on definitions | Decide if a task is done via input/output | 153 | | Task code to the worker | Workers started by Python file where the tasks are defined | 154 | | Centralized scheduler (Celery spins up workers) | Centralized scheduler in charge of deduplication sending tasks (Tornado based) | -------------------------------------------------------------------------------- /docs/html/_sources/azure.md.txt: -------------------------------------------------------------------------------- 1 | ### Deploying to the cloud 2 | 3 | 4 | ![](_static/azure.png) 5 | 6 | [This Docker image](https://hub.docker.com/r/puckel/docker-airflow/) has been used as the base for many deployments. 
7 | 8 | 9 | Let's try and get Airflow running on Docker: 10 | 11 | ``` 12 | docker pull puckel/docker-airflow 13 | ``` 14 | 15 | Once you have the container you can run as 16 | 17 | ``` 18 | docker run -d --rm -p 8080:8080 puckel/docker-airflow webserver 19 | ``` 20 | 21 | To load the examples you can do: 22 | ``` 23 | docker run -d -p 8080:8080 -e LOAD_EX=y puckel/docker-airflow 24 | ``` 25 | 26 | Based on this container we can deploy to [Azure](https://azure.microsoft.com/en-us/blog/deploying-apache-airflow-in-azure-to-build-and-run-data-pipelines//?wt.mc_id=PyCon-github-taallard) 27 | 28 | 29 | [![Deploy to Azure](https://azuredeploy.net/deploybutton.svg)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fsavjani%2Fazure-quickstart-templates%2Fmaster%2F101-webapp-linux-airflow-postgresql%2Fazuredeploy.json/?wt.mc_id=PyCon-github-taallard) 30 | 31 | 32 | Note that this is a very basic deployment on Azure. -------------------------------------------------------------------------------- /docs/html/_sources/first-airflow.md.txt: -------------------------------------------------------------------------------- 1 | # Airflow 101: working locally and familiarise with the tool 2 | 3 | ### Pre-requisites 4 | 5 | The following prerequisites are needed: 6 | 7 | - Libraries detailed in the Setting up section (either via conda or pipenv) 8 | - MySQL installed 9 | - text editor 10 | - command line 11 | 12 | ## Getting your environment up and running 13 | 14 | If you followed the instructions you should have Airflow installed as well as the rest of the packages we will be using. 15 | 16 | So let's get our environment up and running: 17 | 18 | If you are using conda start your environment via: 19 | ``` 20 | $ source activate airflow-env 21 | ``` 22 | If using pipenv then: 23 | ``` 24 | $ pipenv shell 25 | ```` 26 | 27 | this will start a shell within a virtual environment, to exit the shell you need to type `exit` and this will exit the virtual environment. 28 | 29 | ## Starting Airflow locally 30 | 31 | Airflow home lives in `~/airflow` by default, but you can change the location before installing airflow. You first need to set the `AIRFLOW_HOME` environment variable and then install airflow. For example, using pip: 32 | 33 | ```sh 34 | export AIRFLOW_HOME=~/mydir/airflow 35 | 36 | # install from PyPI using pip 37 | pip install apache-airflow 38 | ``` 39 | 40 | once you have completed the installation you should see something like this in the `airflow` directory (wherever it lives for you) 41 | 42 | ``` 43 | drwxr-xr-x - myuser 18 Apr 14:02 . 44 | .rw-r--r-- 26k myuser 18 Apr 14:02 ├── airflow.cfg 45 | drwxr-xr-x - myuser 18 Apr 14:02 ├── logs 46 | drwxr-xr-x - myuser 18 Apr 14:02 │ └── scheduler 47 | drwxr-xr-x - myuser 18 Apr 14:02 │ ├── 2019-04-18 48 | lrwxr-xr-x 46 myuser 18 Apr 14:02 │ └── latest -> /Users/myuser/airflow/logs/scheduler/2019-04-18 49 | .rw-r--r-- 2.5k myuser 18 Apr 14:02 └── unittests.cfg 50 | ``` 51 | We need to create a local dag folder: 52 | 53 | ``` 54 | mkdir ~/airflow/dags 55 | ``` 56 | 57 | As your project evolves, your directory will look something like this: 58 | 59 | ``` 60 | airflow # the root directory. 61 | ├── dags # root folder for all dags. files inside folders are not searched for dags. 62 | │ ├── my_dag.py, # my dag (definitions of tasks/operators) including precedence. 63 | │ └── ... 
57 | As your project evolves, your directory will look something like this:
58 |
59 | ```
60 | airflow                 # the root directory.
61 | ├── dags                # root folder for all dags. files inside folders are not searched for dags.
62 | │   ├── my_dag.py       # my dag (definitions of tasks/operators) including precedence.
63 | │   └── ...
64 | ├── logs                # logs for the various tasks that are run
65 | │   └── my_dag          # DAG-specific logs
66 | │   │   ├── src1_s3     # folder for task-specific logs (log files are created by date of a run)
67 | │   │   ├── src2_hdfs
68 | │   │   ├── src3_s3
69 | │   │   └── spark_task_etl
70 | ├── airflow.db          # SQLite database used by Airflow internally to track the status of each DAG.
71 | ├── airflow.cfg         # global configuration for Airflow (this can be overridden by config inside the file)
72 | └── ...
73 | ```
74 |
75 | ## Prepare your database
76 |
77 | As we mentioned before, Airflow uses a database to keep track of the tasks and their statuses, so it is critical to have one set up.
78 |
79 | To initialize the default database we can run
80 | `airflow initdb`. This will initialize your database via Alembic so that it matches the latest Airflow release.
81 |
82 | The default database is `sqlite`, which means you cannot parallelize tasks while using it. Since we have MySQL and the MySQL client installed, we will set them up so that we can use them with Airflow.
83 |
84 | 🚦 Create an `airflow` database
85 |
86 | From the command line:
87 |
88 | ```
89 | mysql -u root -p
90 | mysql> CREATE DATABASE airflow CHARACTER SET utf8 COLLATE utf8_unicode_ci;
91 | mysql> GRANT ALL PRIVILEGES ON airflow.* TO 'airflow'@'localhost';
92 | mysql> FLUSH PRIVILEGES;
93 | ```
94 | and initialize the database:
95 |
96 | ```
97 | airflow initdb
98 | ```
99 |
100 | Notice that this will fail with the default `airflow.cfg`, so we need to update the configuration first.
101 |
102 |
103 | ## Update your local configuration
104 |
105 | Open your Airflow configuration file `~/airflow/airflow.cfg` and make the following changes:
106 |
107 |
108 | ```
109 | executor = CeleryExecutor
110 | ```
111 |
112 | ```
113 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#broker-settings
114 | # needs rabbitmq running
115 | broker_url = amqp://guest:guest@127.0.0.1/
116 |
117 |
118 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings
119 | result_backend = db+mysql://airflow:airflow@localhost:3306/airflow
120 |
121 | sql_alchemy_conn = mysql://airflow:python2019@localhost:3306/airflow
122 |
123 | ```
124 |
125 | Here we are replacing the default executor (`SequentialExecutor`) with the `CeleryExecutor` so that tasks can run in parallel.
126 | We also replace the default `sqlite` database with our newly created `airflow` database.
127 |
128 | Now we can initialize the database:
129 | ```
130 | airflow initdb
131 | ```
132 |
133 | Let's now start the web server locally:
134 |
135 |
136 | ```
137 | airflow webserver -p 8080
138 | ```
139 |
140 | We can now head over to [http://localhost:8080](http://localhost:8080), where you will see a number of example DAGs already loaded.
141 |
142 | 🚦 Take some time to familiarise yourself with the UI and get your local instance set up.
143 |
144 | Now let's have a look at the connections: go to `Admin > Connections` ([http://localhost:8080/admin/connection/](http://localhost:8080/admin/connection/)). You should be able to see a number of connections available. For this tutorial, we will use some of these connections, including `mysql`.
145 |
146 |
152 |
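As a preview of how a connection defined here is consumed from code, the sketch below uses the stock `mysql_default` connection id with a throwaway query (purely illustrative); the DAGs later in the tutorial rely on the same hook:

```python
from airflow.hooks.mysql_hook import MySqlHook

# the hook looks up host/user/password from the connection stored in the metadata database
hook = MySqlHook(mysql_conn_id="mysql_default")
print(hook.get_records("SELECT 1"))
```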
153 | ### Commands
154 | Let us go over some of the commands. Back on your command line:
155 |
156 | ```
157 | airflow list_dags
158 | ```
159 | We can list the tasks of a DAG in a tree view:
160 |
161 | ```
162 | airflow list_tasks tutorial --tree
163 | ```
164 |
165 | We can test the DAGs too, but we need to pass a date parameter for the test to execute:
166 |
167 | ```
168 | airflow test tutorial print_date 2019-05-01
169 | ```
170 | (note that you cannot use a future date or you will get an error)
171 | ```
172 | airflow test tutorial templated 2019-05-01
173 | ```
174 | Runs started with the `test` command are not saved in the database.
175 |
176 | Now let's start the scheduler:
177 | ```
178 | airflow scheduler
179 | ```
180 |
181 | Behind the scenes, it monitors the DAG folder and stays in sync with all the DAG objects it contains. The Airflow scheduler is designed to run as a service in an Airflow production environment.
182 |
183 | Now, with the scheduler up and running, we can trigger a task instance:
184 | ```
185 | $ airflow run example_bash_operator runme_0 2015-01-01
186 | ```
187 |
188 | This run will be stored in the database and you can see the status change straight away.
189 |
190 | What would happen, for example, if we wanted to run or trigger the `tutorial` DAG? 🤔
191 |
192 | Let's try from the CLI and see what happens.
193 |
194 | ```
195 | airflow trigger_dag tutorial
196 | ```
197 |
198 |
199 | ## Writing your first DAG
200 |
201 | Let's create our first simple DAG.
202 | Inside the DAG directory (`~/airflow/dags`) create a `simple_dag.py` file.
203 |
204 |
205 | ```python
206 | from datetime import datetime, timedelta
207 | from airflow import DAG
208 | from airflow.operators.dummy_operator import DummyOperator
209 | from airflow.operators.python_operator import PythonOperator
210 |
211 |
212 | def print_hello():
213 |     return "Hello world!"
214 |
215 |
216 | default_args = {
217 |     "owner": "airflow",
218 |     "depends_on_past": False,
219 |     "start_date": datetime(2019, 4, 30),
220 |     "email": ["airflow@example.com"],
221 |     "email_on_failure": False,
222 |     "email_on_retry": False,
223 |     "retries": 1,
224 |     "retry_delay": timedelta(minutes=2),
225 | }
226 |
227 | dag = DAG(
228 |     "hello_world",
229 |     description="Simple tutorial DAG",
230 |     schedule_interval="0 12 * * *",
231 |     default_args=default_args,
232 |     catchup=False,
233 | )
234 |
235 | t1 = DummyOperator(task_id="dummy_task", retries=3, dag=dag)
236 |
237 | t2 = PythonOperator(task_id="hello_task", python_callable=print_hello, dag=dag)
238 |
239 | # sets t2 downstream of t1
240 | t1 >> t2
241 |
242 | # equivalent to:
243 | # t2.set_upstream(t1)
244 |
245 | ```
246 | If it is properly set up, you should be able to see this DAG straight away on your instance.
247 |
248 |
249 | ### Now let's create a DAG from the previous ETL pipeline (kind of)
250 |
251 | All hands on - check the solutions
--------------------------------------------------------------------------------
/docs/html/_sources/index.rst.txt:
--------------------------------------------------------------------------------
1 | .. Airflow tutorial documentation master file, created by
2 |    sphinx-quickstart on Mon Apr 15 15:52:00 2019.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 |
6 | Airflow tutorial
7 | ============================================
8 | This tutorial was originally developed for PyCon US 2019.
9 |
10 | ..
toctree:: 11 | :caption: Table of Contents 12 | :hidden: 13 | :maxdepth: 2 14 | 15 | setup 16 | about 17 | pipelines 18 | airflow-intro 19 | first-airflow 20 | 21 | .. toctree:: 22 | :maxdepth: 2 23 | :caption: Contents: 24 | 25 | About your facilitator 26 | ====================== 27 | 28 | My name is Tania. I live in Manchester UK where I work as a 29 | Cloud Advocate for Microsoft. 30 | 31 | Over the years, I have worked as a data engineer, machine learning engineer, 32 | and research software engineer. I love data intensive 33 | enviroments and I am particularly interested in the tools and workflows to 34 | deliver robust, reproducible data insights. 35 | 36 | If you have any questions or feedback about this tutorial please, 37 | file an issue using the following link: ``_. 38 | 39 | You can also contact me via the following channels: 40 | 41 | - E-mail: trallard@bitsandchips.me 42 | - Twitter: `@ixek `_ 43 | - `Tania on GitHub `_ 44 | 45 | Code of Conduct 46 | ================ 47 | All attendees to this workshop are expected to adhere to PyCon's Code of Conduct, 48 | in brief: 49 | **Be open, considerate, and respectful.** 50 | 51 | License 52 | ======= 53 | The content in this workshop is Licensed under `CC-BY-SA 4.0 `_. 54 | Which means that you can use, remix and re-distribute so long attribution to the original 55 | author is maintained (Tania Allard). 56 | 57 | The logo used here was designed by Ashley McNamara for the Microsoft Developer Advocates team use. 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /docs/html/_static/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/12.png -------------------------------------------------------------------------------- /docs/html/_static/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/4.jpg -------------------------------------------------------------------------------- /docs/html/_static/DAG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/DAG.png -------------------------------------------------------------------------------- /docs/html/_static/GUI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/GUI.png -------------------------------------------------------------------------------- /docs/html/_static/airflow-logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/airflow-logo.jpeg -------------------------------------------------------------------------------- /docs/html/_static/airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/airflow.png -------------------------------------------------------------------------------- 
/docs/html/_static/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/architecture.png -------------------------------------------------------------------------------- /docs/html/_static/automation1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/automation1.jpg -------------------------------------------------------------------------------- /docs/html/_static/azure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/azure.png -------------------------------------------------------------------------------- /docs/html/_static/connection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/connection.png -------------------------------------------------------------------------------- /docs/html/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* */ 2 | @import url('https://fonts.googleapis.com/css?family=Itim|Nunito|Source+Code+Pro'); 3 | 4 | a { 5 | color: rgb(96, 138, 197); 6 | } 7 | 8 | a:hover { 9 | color: rgb(65, 129, 218); 10 | } 11 | 12 | div.body h1 { 13 | color: #5F6366; 14 | font-family: 'Itim', cursive; 15 | font-weight: bold; 16 | font-size: 300%; 17 | } 18 | 19 | div.body h2 { 20 | color: #5F6366; 21 | font-family: 'Itim', cursive; 22 | font-weight: bold; 23 | } 24 | div.body h3 { 25 | color: #5F6366; 26 | font-family: 'Itim', cursive; 27 | font-weight: bold; 28 | } 29 | 30 | div.sphinxsidebarwrapper h1.logo { 31 | text-align: center; 32 | margin: 0 0 -20px 0; 33 | } 34 | 35 | div.sphinxsidebar p.blurb { 36 | font-size: 130%; 37 | text-align: center; 38 | font-family: 'Itim', cursive; 39 | color: rgb(151, 139, 196); 40 | } 41 | 42 | div.sphinxsidebar h1{ 43 | font-size: 160%; 44 | color: #5F6366; 45 | font-family: 'Itim', cursive; 46 | } 47 | 48 | div.sphinxsidebar h1 a { 49 | font-size: 160%; 50 | color: #5F6366; 51 | text-decoration: none; 52 | border: none; 53 | font-family: 'Itim', cursive; 54 | } 55 | 56 | div.sphinxsidebar h1 a:hover { 57 | border: none; 58 | } 59 | 60 | div.sphinxsidebar h3 { 61 | display: none; 62 | } 63 | 64 | div.sphinxsidebar a { 65 | color: #5F6366; 66 | } 67 | 68 | code.descname { 69 | color: rgb(151, 139, 196); 70 | } 71 | 72 | th.field-name { 73 | min-width: 100px; 74 | color: rgb(151, 139, 196); 75 | } 76 | 77 | tt, code { 78 | color: #F8F8F2; 79 | background: #1d1941; 80 | border-radius: 0.3em; 81 | padding: 0.0em 0.3em; 82 | } 83 | 84 | a.reference.internal code.xref span.pre { 85 | color: #F8F8F2; 86 | background: #1d1941; 87 | border-bottom: none; 88 | border-radius: 0; 89 | padding: 0; 90 | } 91 | 92 | a.reference.internal, a.reference.internal:hover { 93 | border-bottom: none; 94 | } 95 | 96 | a.reference.internal:hover code { 97 | background: #027bab 98 | } 99 | 100 | a.reference.internal:hover code.xref span.pre { 101 | color: #F8F8F2; 102 | background: #027bab; 103 | border-bottom: none; 104 | } 105 | 106 | tt.xref, code.xref, a tt { 107 | background: none; 108 | 
border-bottom: none; 109 | } 110 | 111 | code.literal { 112 | color: #F8F8F2; 113 | background:#1d1941; 114 | } 115 | 116 | pre { 117 | padding: 20px 30px; 118 | background: #1d1941; 119 | } 120 | 121 | div > dl { 122 | border-left: 2px solid #00384021; 123 | padding-left: 5px; 124 | } 125 | 126 | dt { 127 | color: rgb(96, 138, 197); 128 | } 129 | 130 | 131 | div.footer::before { 132 | display: block; 133 | content: ''; 134 | border-top: 2px solid #EDB5BF; 135 | width: 50%; 136 | margin: 2em auto 2em auto; 137 | } 138 | 139 | div.footer { 140 | text-align: center; 141 | /* color: #029be2; */ 142 | } 143 | 144 | div.footer a { 145 | color: #027bab; 146 | text-decoration: none; 147 | } 148 | 149 | p.caption { 150 | font-family: 'Itim', cursive; 151 | font-size: inherit; 152 | font-size: 150%; 153 | } 154 | 155 | @media screen and (max-width: 875px) { 156 | div.sphinxsidebar { 157 | background: #4D6D9A; 158 | } 159 | div.sphinxsidebar h1.logo, div.sphinxsidebar p.blurb{ 160 | text-align: left; 161 | } 162 | div.sphinxsidebar h1 a { 163 | color: #1bc5e0; 164 | } 165 | div.sphinxsidebar a { 166 | /* color: rgb(151, 139, 196); */ 167 | color: white; 168 | } 169 | div.sphinxsidebar ul { 170 | /* color: rgb(151, 139, 196); */ 171 | color: white; 172 | } 173 | } 174 | 175 | 176 | /* other */ 177 | 178 | .alert { 179 | position: relative; 180 | padding: 10px; 181 | margin-bottom: 5px; 182 | border: 2px solid transparent; 183 | border-radius: 2px; 184 | } 185 | 186 | .alert-primary { 187 | color: #004085; 188 | background-color: #cce5ff; 189 | border-color: #b8daff; 190 | } 191 | .alert-custom { 192 | background-color: rgb(229, 224, 247); 193 | border-color:rgb(229, 224, 247); 194 | color: rgb(128, 117, 165); 195 | } -------------------------------------------------------------------------------- /docs/html/_static/datapyramid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/datapyramid.png -------------------------------------------------------------------------------- /docs/html/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | var DOCUMENTATION_OPTIONS = { 2 | URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), 3 | VERSION: '', 4 | LANGUAGE: 'None', 5 | COLLAPSE_INDEX: false, 6 | FILE_SUFFIX: '.html', 7 | HAS_SOURCE: true, 8 | SOURCELINK_SUFFIX: '.txt', 9 | NAVIGATION_WITH_KEYS: false 10 | }; -------------------------------------------------------------------------------- /docs/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/file.png -------------------------------------------------------------------------------- /docs/html/_static/gooddata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/gooddata.png -------------------------------------------------------------------------------- /docs/html/_static/gooddata1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/gooddata1.png -------------------------------------------------------------------------------- /docs/html/_static/luigi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/luigi.png -------------------------------------------------------------------------------- /docs/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/minus.png -------------------------------------------------------------------------------- /docs/html/_static/mssignin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/mssignin.png -------------------------------------------------------------------------------- /docs/html/_static/pipeline1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/pipeline1.png -------------------------------------------------------------------------------- /docs/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/plus.png -------------------------------------------------------------------------------- /docs/html/_static/pygments.css: -------------------------------------------------------------------------------- 1 | .highlight .hll { background-color: #49483e } 2 | .highlight { background: #272822; color: #f8f8f2 } 3 | .highlight .c { color: #75715e } /* Comment */ 4 | .highlight .err { color: #960050; background-color: #1e0010 } /* Error */ 5 | .highlight .k { color: #66d9ef } /* Keyword */ 6 | .highlight .l { color: #ae81ff } /* Literal */ 7 | .highlight .n { color: #f8f8f2 } /* Name */ 8 | .highlight .o { color: #f92672 } /* Operator */ 9 | .highlight .p { color: #f8f8f2 } /* Punctuation */ 10 | .highlight .ch { color: #75715e } /* Comment.Hashbang */ 11 | .highlight .cm { color: #75715e } /* Comment.Multiline */ 12 | .highlight .cp { color: #75715e } /* Comment.Preproc */ 13 | .highlight .cpf { color: #75715e } /* Comment.PreprocFile */ 14 | .highlight .c1 { color: #75715e } /* Comment.Single */ 15 | .highlight .cs { color: #75715e } /* Comment.Special */ 16 | .highlight .gd { color: #f92672 } /* Generic.Deleted */ 17 | .highlight .ge { font-style: italic } /* Generic.Emph */ 18 | .highlight .gi { color: #a6e22e } /* Generic.Inserted */ 19 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 20 | .highlight .gu { color: #75715e } /* Generic.Subheading */ 21 | .highlight .kc { color: #66d9ef } /* Keyword.Constant */ 22 | .highlight .kd { color: #66d9ef } /* Keyword.Declaration */ 23 | .highlight .kn { color: #f92672 } /* Keyword.Namespace */ 24 | .highlight .kp { color: #66d9ef } /* Keyword.Pseudo */ 25 | .highlight .kr { color: #66d9ef } /* Keyword.Reserved */ 26 | .highlight .kt { color: #66d9ef } /* Keyword.Type */ 27 | .highlight .ld { color: #e6db74 } /* Literal.Date */ 28 | 
.highlight .m { color: #ae81ff } /* Literal.Number */ 29 | .highlight .s { color: #e6db74 } /* Literal.String */ 30 | .highlight .na { color: #a6e22e } /* Name.Attribute */ 31 | .highlight .nb { color: #f8f8f2 } /* Name.Builtin */ 32 | .highlight .nc { color: #a6e22e } /* Name.Class */ 33 | .highlight .no { color: #66d9ef } /* Name.Constant */ 34 | .highlight .nd { color: #a6e22e } /* Name.Decorator */ 35 | .highlight .ni { color: #f8f8f2 } /* Name.Entity */ 36 | .highlight .ne { color: #a6e22e } /* Name.Exception */ 37 | .highlight .nf { color: #a6e22e } /* Name.Function */ 38 | .highlight .nl { color: #f8f8f2 } /* Name.Label */ 39 | .highlight .nn { color: #f8f8f2 } /* Name.Namespace */ 40 | .highlight .nx { color: #a6e22e } /* Name.Other */ 41 | .highlight .py { color: #f8f8f2 } /* Name.Property */ 42 | .highlight .nt { color: #f92672 } /* Name.Tag */ 43 | .highlight .nv { color: #f8f8f2 } /* Name.Variable */ 44 | .highlight .ow { color: #f92672 } /* Operator.Word */ 45 | .highlight .w { color: #f8f8f2 } /* Text.Whitespace */ 46 | .highlight .mb { color: #ae81ff } /* Literal.Number.Bin */ 47 | .highlight .mf { color: #ae81ff } /* Literal.Number.Float */ 48 | .highlight .mh { color: #ae81ff } /* Literal.Number.Hex */ 49 | .highlight .mi { color: #ae81ff } /* Literal.Number.Integer */ 50 | .highlight .mo { color: #ae81ff } /* Literal.Number.Oct */ 51 | .highlight .sa { color: #e6db74 } /* Literal.String.Affix */ 52 | .highlight .sb { color: #e6db74 } /* Literal.String.Backtick */ 53 | .highlight .sc { color: #e6db74 } /* Literal.String.Char */ 54 | .highlight .dl { color: #e6db74 } /* Literal.String.Delimiter */ 55 | .highlight .sd { color: #e6db74 } /* Literal.String.Doc */ 56 | .highlight .s2 { color: #e6db74 } /* Literal.String.Double */ 57 | .highlight .se { color: #ae81ff } /* Literal.String.Escape */ 58 | .highlight .sh { color: #e6db74 } /* Literal.String.Heredoc */ 59 | .highlight .si { color: #e6db74 } /* Literal.String.Interpol */ 60 | .highlight .sx { color: #e6db74 } /* Literal.String.Other */ 61 | .highlight .sr { color: #e6db74 } /* Literal.String.Regex */ 62 | .highlight .s1 { color: #e6db74 } /* Literal.String.Single */ 63 | .highlight .ss { color: #e6db74 } /* Literal.String.Symbol */ 64 | .highlight .bp { color: #f8f8f2 } /* Name.Builtin.Pseudo */ 65 | .highlight .fm { color: #a6e22e } /* Name.Function.Magic */ 66 | .highlight .vc { color: #f8f8f2 } /* Name.Variable.Class */ 67 | .highlight .vg { color: #f8f8f2 } /* Name.Variable.Global */ 68 | .highlight .vi { color: #f8f8f2 } /* Name.Variable.Instance */ 69 | .highlight .vm { color: #f8f8f2 } /* Name.Variable.Magic */ 70 | .highlight .il { color: #ae81ff } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /docs/html/_static/python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/python.png -------------------------------------------------------------------------------- /docs/html/_static/twitter1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/twitter1.png -------------------------------------------------------------------------------- /docs/html/_static/twitter2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/twitter2.png -------------------------------------------------------------------------------- /docs/html/_static/twitter3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/twitter3.png -------------------------------------------------------------------------------- /docs/html/_static/uses.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/_static/uses.png -------------------------------------------------------------------------------- /docs/html/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | About the workshop — Airflow tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
30 |
31 |
32 | 47 | 48 | 49 |
50 | 51 |
52 |

About the workshop

53 |

We will be taking a look at the basic concepts of data pipelines as well as practical use cases using Python.

54 |
55 |

About you:

56 |
    57 |
  • Some experience using the command line

  • 58 |
  • Intermediate Python knowledge / use

  • 59 |
  • Be able to apply what we learn and adopt to your use cases

  • 60 |
  • Interested in data and systems

  • 61 |
  • Aspiring or current data engineers

  • 62 |
  • Some knowledge about systems and databases (enough to be dangerous)

  • 63 |
64 |
65 |
66 |

Our focus for the day

67 |
    68 |
  • Greater understanding on how to apply data pipelines using the Python toolset

  • 69 |
  • Focus on concepts

  • 70 |
  • Apply knowledge with each library

  • 71 |
  • Will give you the building blocks

  • 72 |
73 |
74 |
75 |

Keeping on track

76 |

You will find 🚦 across the tutorial examples. We will use this to identify how folks are doing during the workshop (if following along in person). 77 | Place the post-it as follows:

78 |

🚦 Purple postit: all good, task has been completed

79 |

🚦 Orange postit: I need extra time or need help with the task in hand

80 |
81 |
82 | 83 | 84 |
85 | 100 | 101 |
102 |
103 | 179 |
180 |
181 | 192 | 193 | 194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /docs/html/azure.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Deploying to the cloud — Airflow tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 |
28 |
29 |
30 | 37 | 38 | 39 |
40 | 41 |
42 |

Deploying to the cloud

43 |

_images/azure.png

44 |

This Docker image has been used as the base for many deployments.

45 |

Let’s try and get Airflow running on Docker:

46 |
docker pull puckel/docker-airflow
 47 | 
48 |
49 |

Once you have the container you can run as

50 |
docker run -d --rm -p 8080:8080 puckel/docker-airflow webserver
 51 | 
52 |
53 |

To load the examples you can do:

54 |
docker run -d -p 8080:8080 -e LOAD_EX=y puckel/docker-airflow
 55 | 
56 |
57 |

Based on this container we can deploy to Azure

58 |

https://azuredeploy.net/deploybutton.svgDeploy to Azure

59 |

Note that this is a very basic deployment on Azure.

60 |
61 | 62 | 63 |
64 | 71 | 72 |
73 |
74 | 133 |
134 |
135 | 146 | 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /docs/html/genindex.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Index — Airflow tutorial documentation 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 |
29 |
30 |
31 | 38 | 39 | 40 |
41 | 42 | 43 |

Index

44 | 45 |
46 | 47 |
48 | 49 | 50 |
51 | 58 | 59 |
60 |
61 | 120 |
121 |
122 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /docs/html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Airflow tutorial — Airflow tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 |
29 |
30 |
31 | 42 | 43 | 44 |
45 | 46 |
47 |

Airflow tutorial

48 |

This tutorial was originally developed for PyCon US 2019.

49 |
50 |
51 |
52 |
53 |
54 |
55 |

About your facilitator

56 |

My name is Tania. I live in Manchester UK where I work as a 57 | Cloud Advocate for Microsoft.

58 |

Over the years, I have worked as a data engineer, machine learning engineer, 59 | and research software engineer. I love data-intensive 60 | environments and I am particularly interested in the tools and workflows to 61 | deliver robust, reproducible data insights.

62 |

If you have any questions or feedback about this tutorial please, 63 | file an issue using the following link: https://github.com/trallard/airflow-tutorial/issues/new.

64 |

You can also contact me via the following channels:

65 | 70 |
71 |
72 |

Code of Conduct

73 |

All attendees to this workshop are expected to adhere to PyCon’s Code of Conduct, 74 | in brief: 75 | Be open, considerate, and respectful.

76 |
77 |
78 |

License

79 |

The content in this workshop is licensed under CC-BY-SA 4.0, 80 | which means that you can use, remix, and re-distribute it so long as attribution to the original 81 | author is maintained (Tania Allard).

82 |

The logo used here was designed by Ashley McNamara for the Microsoft Developer Advocates team use.

83 |
84 | 85 | 86 |
87 | 98 | 99 |
100 |
101 | 171 |
172 |
173 | 184 | 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /docs/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/html/objects.inv -------------------------------------------------------------------------------- /docs/html/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Search — Airflow tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 |
33 |
34 |
35 | 42 | 43 | 44 |
45 | 46 |

Search

47 |
48 | 49 |

50 | Please activate JavaScript to enable the search 51 | functionality. 52 |

53 |
54 |

55 | From here you can search these documents. Enter your search 56 | words into the box below and click "search". Note that the search 57 | function will automatically search for all of the words. Pages 58 | containing fewer words won't appear in the result list. 59 |

60 |
61 | 62 | 63 | 64 |
65 | 66 |
67 | 68 |
69 | 70 |
71 | 78 | 79 |
80 |
81 | 130 |
131 |
132 | 140 | 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Airflow tutorial — Airflow tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 |
28 |
29 |
30 | 41 | 42 | 43 |
44 | 45 |
46 |

Airflow tutorial

47 |

This tutorial was originally developed for PyCon US 2019.

48 |
49 |
50 |
51 |
52 |
53 |
54 |

About your facilitator

55 |

My name is Tania. I live in Manchester UK where I work as a 56 | Cloud Advocate for Microsoft.

57 |

Over the years, I have worked as a data engineer, machine learning engineer, 58 | and research software engineer. I love data-intensive 59 | environments and I am particularly interested in the tools and workflows to 60 | deliver robust, reproducible data insights.

61 |

If you have any questions or feedback about this tutorial please, 62 | file an issue using the following link: https://github.com/trallard/airflow-tutorial/issues/new.

63 |

You can also contact me via the following channels:

64 | 69 |
70 |
71 |

Code of Conduct

72 |

All attendees to this workshop are expected to adhere to PyCon’s Code of Conduct, 73 | in brief: 74 | Be open, considerate, and respectful.

75 |
76 |
77 |

License

78 |

The content in this workshop is licensed under CC-BY-SA 4.0, 79 | which means that you can use, remix, and re-distribute it so long as attribution to the original 80 | author is maintained (Tania Allard).

81 |

The logo used here was designed by Ashley McNamara for the Microsoft Developer Advocates team use.

82 |
83 | 84 | 85 |
86 | 97 | 98 |
99 |
100 | 166 |
167 |
168 | 179 | 180 | 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /docs/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/docs/objects.inv -------------------------------------------------------------------------------- /docs/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Search — Airflow tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
32 |
33 |
34 | 41 | 42 | 43 |
44 | 45 |

Search

46 |
47 | 48 |

49 | Please activate JavaScript to enable the search 50 | functionality. 51 |

52 |
53 |

54 | From here you can search these documents. Enter your search 55 | words into the box below and click "search". Note that the search 56 | function will automatically search for all of the words. Pages 57 | containing fewer words won't appear in the result list. 58 |

59 |
60 | 61 | 62 | 63 |
64 | 65 |
66 | 67 |
68 | 69 |
70 | 77 | 78 |
79 |
80 | 125 |
126 |
127 | 135 | 136 | 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /docs/searchindex.js: -------------------------------------------------------------------------------- 1 | Search.setIndex({docnames:["index","setup"],envversion:{"sphinx.domains.c":1,"sphinx.domains.changeset":1,"sphinx.domains.cpp":1,"sphinx.domains.javascript":1,"sphinx.domains.math":2,"sphinx.domains.python":1,"sphinx.domains.rst":1,"sphinx.domains.std":1,"sphinx.ext.intersphinx":1,sphinx:56},filenames:["index.rst","setup.md"],objects:{},objnames:{},objtypes:{},terms:{"long":0,"new":0,The:0,adher:0,advoc:0,all:0,allard:0,also:0,ani:0,ashlei:0,attende:0,attribut:0,author:0,bitsandchip:0,brief:0,can:0,channel:0,cloud:0,com:0,consider:0,contact:0,content:0,data:0,deliv:0,design:0,develop:0,distribut:0,engin:0,enviro:0,expect:0,feedback:0,file:0,follow:0,github:0,have:0,here:0,http:0,insight:0,intens:0,interest:0,issu:0,ixek:0,learn:0,link:0,live:0,logo:0,love:0,machin:0,mail:0,maintain:0,manchest:0,mcnamara:0,mean:0,microsoft:0,name:0,open:0,origin:0,over:0,particularli:0,pleas:0,pycon:0,question:0,remix:0,reproduc:0,research:0,respect:0,robust:0,softwar:0,tania:0,team:0,thi:0,tool:0,trallard:0,twitter:0,under:0,use:0,used:0,using:0,via:0,where:0,which:0,work:0,workflow:0,workshop:0,year:0,you:0},titles:["Airflow tutorial","Getting started"],titleterms:{about:0,airflow:0,code:0,conduct:0,facilit:0,get:1,licens:0,start:1,tutori:0,your:0}}) -------------------------------------------------------------------------------- /docs/setup.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Getting started — Airflow tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 |
28 |
29 |
30 | 41 | 42 | 43 |
44 | 45 |
46 |

Getting started

47 |
48 | 49 | 50 |
51 | 62 | 63 |
64 |
65 | 121 |
122 |
123 | 134 | 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: airflow-env 2 | dependencies: 3 | - jupyter==1.0.0 4 | - jupyterlab==0.35.5 5 | - matplotlib==3.0.3 6 | - mysqlclient==1.3.14 7 | - numpy==1.16.3 8 | - pandas==0.24.2 9 | - scipy==1.2.1 10 | - seaborn==0.9.0 11 | - pip: 12 | - tweepy==3.7.0 13 | - hypothesis==4.18.0 14 | - celery==4.1.1 15 | - apache-airflow[celery,kubernetes,mysql,password,slack]==1.10.3 16 | - mysql-connector-python==8.0.16 17 | - papermill==1.0.0 -------------------------------------------------------------------------------- /make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -i https://pypi.org/simple 2 | alembic==0.9.10 3 | amqp==2.4.2 4 | ansiwrap==0.8.4 5 | apache-airflow[celery,kubernetes,mysql,password,slack]==1.10.3 6 | appnope==0.1.0 ; sys_platform == 'darwin' 7 | asn1crypto==0.24.0 8 | attrs==19.1.0 9 | babel==2.6.0 10 | backcall==0.1.0 11 | bcrypt==3.1.6 12 | billiard==3.5.0.5 13 | bleach==3.1.0 14 | cachetools==3.1.0 15 | celery==4.1.1 16 | certifi==2019.3.9 17 | cffi==1.12.3 18 | chardet==3.0.4 19 | click==7.0 20 | colorama==0.4.1 21 | configparser==3.5.3 22 | croniter==0.3.30 23 | cryptography==3.2 24 | cycler==0.10.0 25 | decorator==4.4.0 26 | defusedxml==0.6.0 27 | dill==0.2.9 28 | docutils==0.14 29 | entrypoints==0.3 30 | flask-admin==1.5.3 31 | flask-appbuilder==1.12.3 32 | flask-babel==0.12.2 33 | flask-bcrypt==0.7.1 34 | flask-caching==1.3.3 35 | flask-login==0.4.1 36 | flask-openid==1.2.5 37 | flask-sqlalchemy==2.4.0 38 | flask-swagger==0.2.13 39 | flask-wtf==0.14.2 40 | flask==1.0.2 41 | flower==0.9.3 42 | funcsigs==1.0.0 43 | future==0.16.0 44 | gitdb2==2.0.5 45 | gitpython==2.1.11 46 | google-auth==1.6.3 47 | gunicorn==19.9.0 48 | hypothesis==4.18.0 49 | idna==2.8 50 | ipykernel==5.1.0 51 | ipython-genutils==0.2.0 52 | ipython==7.5.0 ; python_version >= '3.3' 53 | ipywidgets==7.4.2 54 | iso8601==0.1.12 55 | itsdangerous==1.1.0 56 | jedi==0.13.3 57 | jinja2==2.10 58 | json-merge-patch==0.2 59 | jsonschema==3.0.1 60 | jupyter-client==5.2.4 61 | jupyter-console==6.0.0 62 | jupyter-core==4.4.0 63 | jupyter==1.0.0 64 | jupyterlab-server==0.2.0 65 | jupyterlab==0.35.5 66 | kiwisolver==1.1.0 67 | kombu==4.5.0 68 | kubernetes==9.0.0 69 | lockfile==0.12.2 70 | 
lxml==4.3.3 71 | mako==1.0.9 72 | markdown==2.6.11 73 | markupsafe==1.1.1 74 | matplotlib==3.0.3 75 | mistune==0.8.4 76 | mysql-connector-python==8.0.16 77 | mysqlclient==1.3.14 78 | nbconvert==5.5.0 79 | nbformat==4.4.0 80 | notebook==5.7.8 81 | numpy==1.16.3 82 | oauthlib==3.0.1 83 | ordereddict==1.1 84 | pandas==0.24.2 85 | pandocfilters==1.4.2 86 | papermill==1.0.0 87 | parso==0.4.0 88 | pendulum==1.4.4 89 | pexpect==4.7.0 ; sys_platform != 'win32' 90 | pickleshare==0.7.5 91 | prometheus-client==0.6.0 92 | prompt-toolkit==2.0.9 93 | protobuf==3.7.1 94 | psutil==5.6.2 95 | ptyprocess==0.6.0 ; os_name != 'nt' 96 | pyasn1-modules==0.2.5 97 | pyasn1==0.4.5 98 | pycparser==2.19 99 | pygments==2.3.1 100 | pyparsing==2.4.0 101 | pyrsistent==0.15.1 102 | pysocks==1.6.8 103 | python-daemon==2.1.2 104 | python-dateutil==2.8.0 105 | python-editor==1.0.4 106 | python3-openid==3.1.0 107 | pytz==2019.1 108 | pytzdata==2019.1 109 | pyyaml==5.1 110 | pyzmq==18.0.1 111 | qtconsole==4.4.3 112 | requests-oauthlib==1.2.0 113 | requests==2.21.0 114 | rsa==4.0 115 | scipy==1.2.1 116 | seaborn==0.9.0 117 | send2trash==1.5.0 118 | setproctitle==1.1.10 119 | six==1.12.0 120 | slackclient==1.3.1 121 | smmap2==2.0.5 122 | sqlalchemy==1.2.19 123 | tabulate==0.8.3 124 | tenacity==4.12.0 125 | terminado==0.8.2 126 | testpath==0.4.2 127 | text-unidecode==1.2 128 | textwrap3==0.9.2 129 | thrift==0.11.0 130 | tornado==5.1.1 131 | tqdm==4.31.1 132 | traitlets==4.3.2 133 | tweepy==3.7.0 134 | tzlocal==1.5.1 135 | unicodecsv==0.14.1 136 | urllib3==1.24.2 137 | vine==1.3.0 138 | wcwidth==0.1.7 139 | webencodings==0.5.1 140 | websocket-client==0.54.0 141 | werkzeug==0.14.1 142 | widgetsnbextension==3.4.2 143 | wtforms==2.2.1 144 | zope.deprecation==4.4.0 145 | -------------------------------------------------------------------------------- /solutions/dags/dags/generate_twitter.py: -------------------------------------------------------------------------------- 1 | """ Simple example of creating subdags and generating work dynamically""" 2 | from airflow import DAG 3 | from airflow.hooks import SqliteHook 4 | 5 | from airflow.hooks.mysql_hook import MySqlHook 6 | from airflow.models import Variable 7 | from airflow.operators.email_operator import EmailOperator 8 | from airflow.operators.python_operator import PythonOperator, BranchPythonOperator 9 | from airflow.operators.bash_operator import BashOperator 10 | from airflow.operators.subdag_operator import SubDagOperator 11 | 12 | 13 | from twitter_airflow import search_twitter, RAW_TWEET_DIR 14 | from subdags.twitter_subdag import subdag 15 | from datetime import datetime, timedelta 16 | import pandas as pd 17 | import re 18 | import random 19 | 20 | 21 | SEARCH_TERMS = ["#python", "#pydata", "#airflow", "data wrangling", "data pipelines"] 22 | 23 | 24 | default_args = { 25 | "owner": "admin", 26 | "depends_on_past": False, 27 | "start_date": datetime.now() - timedelta(days=4), 28 | "retries": 1, 29 | "retry_delay": timedelta(minutes=5), 30 | } 31 | 32 | dag = DAG( 33 | "generate_twitter_dags", default_args=default_args, schedule_interval="@daily" 34 | ) 35 | 36 | 37 | def fill_terms(my_terms=SEARCH_TERMS, **kwargs): 38 | """ Fill sqlite database with a few search terms. 
""" 39 | dbconn = MySqlHook(mysql_conn_id="mysql_default") 40 | conn = dbconn.get_connection() 41 | cursor = conn.cursor() 42 | df = pd.DataFrame(my_terms, columns=["search_term"]) 43 | try: 44 | df.to_sql("twitter_terms", conn) 45 | except ValueError: 46 | # table already exists 47 | pass 48 | 49 | 50 | def generate_search_terms(**kwargs): 51 | """ Generate subdag to search twitter for terms. """ 52 | dbconn = MySqlHook(mysql_conn_id="mysql_default") 53 | conn = dbconn.get_connection() 54 | cursor = conn.cursor() 55 | query = "select * from twitter_terms" 56 | df = pd.read_sql_query(query, conn) 57 | return random.choice( 58 | [ 59 | "search_{}_twitter".format(re.sub(r"\W+", "", t)) 60 | for t in df.search_term.values 61 | ] 62 | ) 63 | 64 | 65 | fill_search_terms = PythonOperator( 66 | task_id="fill_terms", provide_context=True, python_callable=fill_terms, dag=dag 67 | ) 68 | 69 | 70 | gen_search_terms = BranchPythonOperator( 71 | task_id="generate_search_terms", 72 | provide_context=True, 73 | python_callable=generate_search_terms, 74 | dag=dag, 75 | ) 76 | 77 | 78 | email_links = EmailOperator( 79 | task_id="email_best_links", 80 | to="MYEMAIL@MYSITE.com", 81 | subject="Latest popular links", 82 | html_content="Check out the latest!!", 83 | files=["{}/latest_links.txt".format(RAW_TWEET_DIR)], 84 | dag=dag, 85 | ) 86 | 87 | 88 | sub = SubDagOperator( 89 | subdag=subdag, task_id="insert_and_id_pop", trigger_rule="one_success", dag=dag 90 | ) 91 | 92 | 93 | clear_latest = BashOperator( 94 | bash_command="rm -rf {}/latest_links.txt".format(RAW_TWEET_DIR), 95 | task_id="clear_latest", 96 | dag=dag, 97 | ) 98 | 99 | 100 | gen_search_terms.set_upstream(fill_search_terms) 101 | 102 | for term in SEARCH_TERMS: 103 | term_without_punctuation = re.sub(r"\W+", "", term) 104 | simple_search = PythonOperator( 105 | task_id="search_{}_twitter".format(term_without_punctuation), 106 | provide_context=True, 107 | python_callable=search_twitter, 108 | dag=dag, 109 | params={"query": term}, 110 | ) 111 | simple_search.set_upstream(gen_search_terms) 112 | simple_search.set_downstream(sub) 113 | 114 | sub.set_downstream(email_links) 115 | email_links.set_downstream(clear_latest) 116 | -------------------------------------------------------------------------------- /solutions/dags/dags/parameters.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example uses the existing Dummy Operator and Variable model to 3 | demonstrate dynamic creation of DAGs based on a Variable setting. As 4 | shown below, a list of customer objects is retrieved and used to create 5 | unique dags based on the imput. 
6 | """ 7 | 8 | from datetime import datetime, timedelta 9 | from airflow.models import DAG 10 | from airflow.models import Variable 11 | from airflow.operators.dummy_operator import DummyOperator 12 | 13 | # Create JSON Variable if it doesn't exist 14 | 15 | CUSTOMERS = [ 16 | { 17 | "customer_name": "Faux Customer", 18 | "customer_id": "faux_customer", 19 | "email": ["admin@fauxcustomer.com", "admin@astronomer.io"], 20 | "schedule_interval": None, 21 | "enabled": True, 22 | }, 23 | { 24 | "customer_name": "Bogus Customer", 25 | "customer_id": "bogus_customer", 26 | "email": ["admin@boguscustomer.com", "admin@astronomer.io"], 27 | "schedule_interval": "@once", 28 | "enabled": True, 29 | }, 30 | ] 31 | 32 | # Get JSON Variable 33 | CUSTOMERS = Variable.get("customer_list", default_var=CUSTOMERS, deserialize_json=True) 34 | 35 | 36 | def create_dag(customer): 37 | """ 38 | Accepts a customer parameters dict and 39 | overrides default args to create a DAG object 40 | 41 | Returns: DAG() Object 42 | """ 43 | default_args = { 44 | "owner": "airflow", 45 | "depends_on_past": False, 46 | "email": "xyz@xyz.com", 47 | "retries": 1, 48 | "retry_delay": timedelta(minutes=5), 49 | "start_date": datetime(2017, 1, 1, 0, 0), 50 | "end_date": None, 51 | } 52 | 53 | """ 54 | This allows DAG parameters to be passed in from the Variable if 55 | a customer needs something specific overridden in their DAG. 56 | Consider how email being passed in from the customer object 57 | overrides email in the resulting replaced_args object. 58 | """ 59 | replaced_args = { 60 | k: default_args[k] if customer.get(k, None) is None else customer[k] 61 | for k in default_args 62 | } 63 | 64 | dag_id = "{base_name}_{id}".format( 65 | base_name="load_clickstream_data", id=customer["customer_id"] 66 | ) 67 | 68 | return DAG( 69 | dag_id=dag_id, 70 | default_args=replaced_args, 71 | schedule_interval=customer["schedule_interval"], 72 | ) 73 | 74 | # Loop customers array of containing customer objects 75 | for cust in CUSTOMERS: 76 | if cust["enabled"]: 77 | 78 | dag = create_dag(cust) 79 | 80 | globals()[dag.dag_id] = dag 81 | 82 | extract = DummyOperator(task_id="extract_data", dag=dag) 83 | 84 | transform = DummyOperator(task_id="transform_data", dag=dag) 85 | 86 | load = DummyOperator(task_id="load_data", dag=dag) 87 | 88 | extract >> transform >> load 89 | 90 | else: 91 | # TODO Create but programmatically pause 92 | pass 93 | -------------------------------------------------------------------------------- /solutions/dags/dags/simple_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | 7 | 8 | def print_hello(): 9 | return "Hello world!" 
10 | 11 | 12 | default_args = { 13 | "owner": "airflow", 14 | "depends_on_past": False, 15 | "start_date": datetime(2019, 4, 30), 16 | "email": ["airflow@example.com"], 17 | "email_on_failure": False, 18 | "email_on_retry": False, 19 | "retries": 1, 20 | "retry_delay": timedelta(minutes=2), 21 | } 22 | 23 | dag = DAG( 24 | "hello_world", 25 | description="Simple tutorial DAG", 26 | schedule_interval="0 12 * * *", 27 | default_args=default_args, 28 | catchup=False, 29 | ) 30 | 31 | t1 = DummyOperator(task_id="dummy_task", retries=3, dag=dag) 32 | 33 | t2 = PythonOperator(task_id="hello_task", python_callable=print_hello, dag=dag) 34 | 35 | # sets downstream foe t1 36 | t1 >> t2 37 | 38 | # equivalent 39 | # t2.set_upstream(t1) 40 | -------------------------------------------------------------------------------- /solutions/dags/dags/subdags/twitter_subdag.py: -------------------------------------------------------------------------------- 1 | """ Simple subdag example """ 2 | from airflow import DAG 3 | from airflow.operators import PythonOperator 4 | from twitter_airflow import csv_to_sql, identify_popular_links 5 | from datetime import datetime, timedelta 6 | 7 | 8 | default_args = { 9 | "owner": "admin", 10 | "depends_on_past": False, 11 | "start_date": datetime(2016, 1, 1), 12 | "retries": 1, 13 | "retry_delay": timedelta(minutes=5), 14 | } 15 | 16 | subdag = DAG("generate_twitter_dags.insert_and_id_pop", default_args=default_args) 17 | 18 | move_tweets_to_sql = PythonOperator( 19 | task_id="csv_to_sqlite", 20 | provide_context=True, 21 | python_callable=csv_to_sql, 22 | dag=subdag, 23 | ) 24 | 25 | id_popular = PythonOperator( 26 | task_id="identify_popular_links", 27 | provide_context=True, 28 | python_callable=identify_popular_links, 29 | dag=subdag, 30 | params={"write_mode": "a"}, 31 | ) 32 | 33 | id_popular.set_upstream(move_tweets_to_sql) 34 | -------------------------------------------------------------------------------- /solutions/dags/dags/twitter_airflow.py: -------------------------------------------------------------------------------- 1 | """ Simple Airflow data pipeline example using Twitter API """ 2 | import ast 3 | import glob 4 | import itertools 5 | import os.path 6 | import shutil 7 | from collections import Counter 8 | from configparser import ConfigParser 9 | from csv import DictWriter, writer 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | import MySQLdb 13 | import MySQLdb.cursors 14 | 15 | import pandas as pd 16 | from tweepy import API, Cursor, OAuthHandler 17 | 18 | from airflow import DAG 19 | from airflow.hooks import sqlite_hook 20 | from airflow.hooks.mysql_hook import MySqlHook 21 | from airflow.models import Variable 22 | from airflow.operators.email_operator import EmailOperator 23 | from airflow.operators.python_operator import PythonOperator 24 | 25 | RAW_TWEET_DIR = os.path.abspath(os.path.join(__file__, "../data/tweets/")) 26 | CONFIG_FILE = os.path.abspath(os.path.join(__file__, "../config/prod.cfg")) 27 | MAX_TWEEPY_PAGE = 2 28 | 29 | # since there do not exist task on their own we need to create the DAG 30 | default_args = { 31 | "owner": "admin", 32 | "depends_on_past": False, 33 | "start_date": datetime.now() - timedelta(days=5), 34 | "retries": 1, 35 | "retry_delay": timedelta(minutes=5), 36 | } 37 | 38 | dag = DAG("twitter_links", default_args=default_args, schedule_interval="@daily") 39 | 40 | 41 | def extract_tweet_data(tweepy_obj, query): 42 | """ Extract relevant and serializable data from a tweepy Tweet 
object 43 | params: 44 | tweepy_obj: Tweepy Tweet Object 45 | query: str 46 | returns dict 47 | """ 48 | return { 49 | "user_id": tweepy_obj.user.id, 50 | "user_name": tweepy_obj.user.name, 51 | "user_screenname": tweepy_obj.user.screen_name, 52 | "user_url": tweepy_obj.user.url, 53 | "user_description": tweepy_obj.user.description, 54 | "user_followers": tweepy_obj.user.followers_count, 55 | "user_friends": tweepy_obj.user.friends_count, 56 | "created": tweepy_obj.created_at.isoformat(), 57 | "text": tweepy_obj.text, 58 | "hashtags": [ht.get("text") for ht in tweepy_obj.entities.get("hashtags")], 59 | "mentions": [ 60 | (um.get("id"), um.get("screen_name")) 61 | for um in tweepy_obj.entities.get("user_mentions") 62 | ], 63 | "urls": [url.get("expanded_url") for url in tweepy_obj.entities.get("urls")], 64 | "tweet_id": tweepy_obj.id, 65 | "is_quote_status": tweepy_obj.is_quote_status, 66 | "favorite_count": tweepy_obj.favorite_count, 67 | "retweet_count": tweepy_obj.retweet_count, 68 | "reply_status_id": tweepy_obj.in_reply_to_status_id, 69 | "lang": tweepy_obj.lang, 70 | "source": tweepy_obj.source, 71 | "location": tweepy_obj.coordinates, 72 | "query": query, 73 | } 74 | 75 | 76 | def search_twitter(**kwargs): 77 | """ Search for a query in public tweets""" 78 | query = kwargs.get("params").get("query") 79 | 80 | auth = OAuthHandler(Variable.get("consumer_key"), Variable.get("consumer_secret")) 81 | auth.set_access_token( 82 | Variable.get("access_token"), Variable.get("access_token_secret") 83 | ) 84 | api = API(auth) 85 | 86 | all_tweets = [] 87 | page_num = 0 88 | since_date = datetime.strptime(kwargs.get("ds"), "%Y-%m-%d").date() - timedelta( 89 | days=1 90 | ) 91 | query += " since:{} until:{}".format( 92 | since_date.strftime("%Y-%m-%d"), kwargs.get("ds") 93 | ) 94 | print(f"searching twitter with: {query}") 95 | for page in Cursor( 96 | api.search, q=query, monitor_rate_limit=True, wait_on_rate_limit=True 97 | ).pages(): 98 | all_tweets.extend([extract_tweet_data(t, query) for t in page]) 99 | page_num += 1 100 | if page_num > MAX_TWEEPY_PAGE: 101 | break 102 | 103 | # if it's an empty list, stop here 104 | if not len(all_tweets): 105 | return 106 | 107 | filename = "{}/{}_{}.csv".format( 108 | RAW_TWEET_DIR, query, datetime.now().strftime("%m%d%Y%H%M%S") 109 | ) 110 | 111 | # check that the directory exists 112 | if not Path(filename).resolve().parent.exists(): 113 | 114 | os.mkdir(Path(filename).resolve().parent) 115 | 116 | with open(filename, "w") as raw_file: 117 | raw_wrtr = DictWriter(raw_file, fieldnames=all_tweets[0].keys()) 118 | raw_wrtr.writeheader() 119 | raw_wrtr.writerows(all_tweets) 120 | 121 | 122 | def csv_to_sql(directory=RAW_TWEET_DIR, **kwargs): 123 | """ csv to sql pipeline using pandas 124 | params: 125 | directory: str (file path to csv files) 126 | """ 127 | dbconn = MySqlHook(mysql_conn_id="mysql_default") 128 | conn = dbconn.get_connection() 129 | cursor = conn.cursor() 130 | 131 | for fname in glob.glob("{}/*.csv".format(directory)): 132 | if "_read" not in fname: 133 | try: 134 | df = pd.read_csv(fname) 135 | df.to_sql("tweets", dbconn, if_exists="append", index=False) 136 | shutil.move(fname, fname.replace(".csv", "_read.csv")) 137 | except pd.io.common.EmptyDataError: 138 | # probably an io error with another task / open file 139 | continue 140 | 141 | 142 | def identify_popular_links(directory=RAW_TWEET_DIR, write_mode="w", **kwargs): 143 | """ Identify the most popular links from the last day of tweest in the db 144 | Writes them to 
latest_links.txt in the RAW_TWEET_DIR 145 | (or directory kwarg) 146 | """ 147 | dbconn = MySqlHook(mysql_conn_id="mysql_default") 148 | conn = dbconn.get_connection() 149 | cursor = conn.cursor() 150 | 151 | query = """select * from tweets where 152 | created > date('now', '-1 days') and urls is not null 153 | order by favorite_count""" 154 | df = pd.read_sql_query(query, conn) 155 | df.urls = df.urls.map(ast.literal_eval) 156 | cntr = Counter(itertools.chain.from_iterable(df.urls.values)) 157 | with open("{}/latest_links.txt".format(directory), write_mode) as latest: 158 | wrtr = writer(latest) 159 | wrtr.writerow(["url", "count"]) 160 | wrtr.writerows(cntr.most_common(5)) 161 | 162 | 163 | # -------------------------------------- 164 | # Tasks 165 | # ------------------------------------- 166 | simple_search = PythonOperator( 167 | task_id="search_twitter", 168 | provide_context=True, 169 | python_callable=search_twitter, 170 | dag=dag, 171 | # note we pass this as a params obj 172 | params={"query": "#pycon"}, 173 | ) 174 | 175 | 176 | move_tweets_to_sql = PythonOperator( 177 | task_id="csv_to_sql", 178 | # extra DAG context 179 | provide_context=True, 180 | # call the function 181 | python_callable=csv_to_sql, 182 | dag=dag, 183 | ) 184 | 185 | 186 | id_popular = PythonOperator( 187 | task_id="identify_popular_links", 188 | provide_context=True, 189 | python_callable=identify_popular_links, 190 | dag=dag, 191 | ) 192 | 193 | 194 | email_links = EmailOperator( 195 | task_id="email_best_links", 196 | to="trallard@bitsandchips.me", 197 | subject="Latest popular links", 198 | html_content="Check out the latest!!", 199 | files=["{}/latest_links.txt".format(RAW_TWEET_DIR)], 200 | dag=dag, 201 | ) 202 | 203 | 204 | simple_search.set_downstream(move_tweets_to_sql) 205 | id_popular.set_upstream(move_tweets_to_sql) 206 | email_links.set_upstream(id_popular) 207 | -------------------------------------------------------------------------------- /solutions/dags/twitter_airflow.py: -------------------------------------------------------------------------------- 1 | """ Simple Airflow data pipeline example using Twitter API """ 2 | import ast 3 | import glob 4 | import itertools 5 | import os.path 6 | import shutil 7 | from collections import Counter 8 | from configparser import ConfigParser 9 | from csv import DictWriter, writer 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | import MySQLdb 13 | import MySQLdb.cursors 14 | 15 | import pandas as pd 16 | from tweepy import API, Cursor, OAuthHandler 17 | 18 | from airflow import DAG 19 | from airflow.hooks import sqlite_hook 20 | from airflow.hooks.mysql_hook import MySqlHook 21 | from airflow.models import Variable 22 | from airflow.operators.email_operator import EmailOperator 23 | from airflow.operators.python_operator import PythonOperator 24 | 25 | RAW_TWEET_DIR = os.path.abspath(os.path.join(__file__, "../data/tweets/")) 26 | CONFIG_FILE = os.path.abspath(os.path.join(__file__, "../config/prod.cfg")) 27 | MAX_TWEEPY_PAGE = 2 28 | 29 | # since there do not exist task on their own we need to create the DAG 30 | default_args = { 31 | "owner": "admin", 32 | "depends_on_past": False, 33 | "start_date": datetime.now() - timedelta(days=5), 34 | "retries": 1, 35 | "retry_delay": timedelta(minutes=5), 36 | } 37 | 38 | dag = DAG("twitter_links", default_args=default_args, schedule_interval="@daily") 39 | 40 | 41 | def extract_tweet_data(tweepy_obj, query): 42 | """ Extract relevant and serializable data from a tweepy Tweet 
object 43 | params: 44 | tweepy_obj: Tweepy Tweet Object 45 | query: str 46 | returns dict 47 | """ 48 | return { 49 | "user_id": tweepy_obj.user.id, 50 | "user_name": tweepy_obj.user.name, 51 | "user_screenname": tweepy_obj.user.screen_name, 52 | "user_url": tweepy_obj.user.url, 53 | "user_description": tweepy_obj.user.description, 54 | "user_followers": tweepy_obj.user.followers_count, 55 | "user_friends": tweepy_obj.user.friends_count, 56 | "created": tweepy_obj.created_at.isoformat(), 57 | "text": tweepy_obj.text, 58 | "hashtags": [ht.get("text") for ht in tweepy_obj.entities.get("hashtags")], 59 | "mentions": [ 60 | (um.get("id"), um.get("screen_name")) 61 | for um in tweepy_obj.entities.get("user_mentions") 62 | ], 63 | "urls": [url.get("expanded_url") for url in tweepy_obj.entities.get("urls")], 64 | "tweet_id": tweepy_obj.id, 65 | "is_quote_status": tweepy_obj.is_quote_status, 66 | "favorite_count": tweepy_obj.favorite_count, 67 | "retweet_count": tweepy_obj.retweet_count, 68 | "reply_status_id": tweepy_obj.in_reply_to_status_id, 69 | "lang": tweepy_obj.lang, 70 | "source": tweepy_obj.source, 71 | "location": tweepy_obj.coordinates, 72 | "query": query, 73 | } 74 | 75 | 76 | def search_twitter(**kwargs): 77 | """ Search for a query in public tweets""" 78 | query = kwargs.get("params").get("query") 79 | 80 | auth = OAuthHandler(Variable.get("consumer_key"), Variable.get("consumer_secret")) 81 | auth.set_access_token( 82 | Variable.get("access_token"), Variable.get("access_token_secret") 83 | ) 84 | api = API(auth) 85 | 86 | all_tweets = [] 87 | page_num = 0 88 | since_date = datetime.strptime(kwargs.get("ds"), "%Y-%m-%d").date() - timedelta( 89 | days=1 90 | ) 91 | query += " since:{} until:{}".format( 92 | since_date.strftime("%Y-%m-%d"), kwargs.get("ds") 93 | ) 94 | print(f"searching twitter with: {query}") 95 | for page in Cursor( 96 | api.search, q=query, monitor_rate_limit=True, wait_on_rate_limit=True 97 | ).pages(): 98 | all_tweets.extend([extract_tweet_data(t, query) for t in page]) 99 | page_num += 1 100 | if page_num > MAX_TWEEPY_PAGE: 101 | break 102 | 103 | # if it's an empty list, stop here 104 | if not len(all_tweets): 105 | return 106 | 107 | filename = "{}/{}_{}.csv".format( 108 | RAW_TWEET_DIR, query, datetime.now().strftime("%m%d%Y%H%M%S") 109 | ) 110 | 111 | # check that the directory exists 112 | if not Path(filename).resolve().parent.exists(): 113 | 114 | os.mkdir(Path(filename).resolve().parent) 115 | 116 | with open(filename, "w") as raw_file: 117 | raw_wrtr = DictWriter(raw_file, fieldnames=all_tweets[0].keys()) 118 | raw_wrtr.writeheader() 119 | raw_wrtr.writerows(all_tweets) 120 | 121 | 122 | def csv_to_sql(directory=RAW_TWEET_DIR, **kwargs): 123 | """ csv to sql pipeline using pandas 124 | params: 125 | directory: str (file path to csv files) 126 | """ 127 | dbconn = MySqlHook(mysl_conn_id="mysql_default") 128 | cursor = dbconn.get_cursor() 129 | 130 | for fname in glob.glob("{}/*.csv".format(directory)): 131 | if "_read" not in fname: 132 | try: 133 | df = pd.read_csv(fname) 134 | df.to_sql("tweets", dbconn, if_exists="append", index=False) 135 | shutil.move(fname, fname.replace(".csv", "_read.csv")) 136 | except pd.io.common.EmptyDataError: 137 | # probably an io error with another task / open file 138 | continue 139 | 140 | 141 | def identify_popular_links(directory=RAW_TWEET_DIR, write_mode="w", **kwargs): 142 | """ Identify the most popular links from the last day of tweest in the db 143 | Writes them to latest_links.txt in the RAW_TWEET_DIR 144 | (or 
directory kwarg) 145 | """ 146 | dbconn = MySqlHook(mysl_conn_id="mysql_default") 147 | cursor = dbconn.cursor() 148 | 149 | query = """select * from tweets where 150 | created > date('now', '-1 days') and urls is not null 151 | order by favorite_count""" 152 | df = pd.read_sql_query(query, conn) 153 | df.urls = df.urls.map(ast.literal_eval) 154 | cntr = Counter(itertools.chain.from_iterable(df.urls.values)) 155 | with open("{}/latest_links.txt".format(directory), write_mode) as latest: 156 | wrtr = writer(latest) 157 | wrtr.writerow(["url", "count"]) 158 | wrtr.writerows(cntr.most_common(5)) 159 | 160 | 161 | # -------------------------------------- 162 | # Tasks 163 | # ------------------------------------- 164 | simple_search = PythonOperator( 165 | task_id="search_twitter", 166 | provide_context=True, 167 | python_callable=search_twitter, 168 | dag=dag, 169 | # note we pass this as a params obj 170 | params={"query": "#pycon"}, 171 | ) 172 | 173 | 174 | move_tweets_to_sql = PythonOperator( 175 | task_id="csv_to_sql", 176 | # extra DAG context 177 | provide_context=True, 178 | # call the function 179 | python_callable=csv_to_sql, 180 | dag=dag, 181 | ) 182 | 183 | 184 | id_popular = PythonOperator( 185 | task_id="identify_popular_links", 186 | provide_context=True, 187 | python_callable=identify_popular_links, 188 | dag=dag, 189 | ) 190 | 191 | 192 | email_links = EmailOperator( 193 | task_id="email_best_links", 194 | to="trallard@bitsandchips.me", 195 | subject="Latest popular links", 196 | html_content="Check out the latest!!", 197 | files=["{}/latest_links.txt".format(RAW_TWEET_DIR)], 198 | dag=dag, 199 | ) 200 | 201 | 202 | simple_search.set_downstream(move_tweets_to_sql) 203 | id_popular.set_upstream(move_tweets_to_sql) 204 | email_links.set_upstream(id_popular) 205 | -------------------------------------------------------------------------------- /solutions/etl-basic/analyse_twitter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import re 4 | from datetime import datetime 5 | from pathlib import Path 6 | 7 | import matplotlib.pyplot as plt 8 | import mysql.connector as mysql 9 | import numpy as np 10 | import pandas as pd 11 | 12 | # import the previously created functions 13 | from stream_twitter import connect_db 14 | 15 | # Details for our MySql connection 16 | DATABASE = { 17 | "host": "localhost", 18 | "user": "airflow", 19 | "password": "python2019", 20 | "db": "airflowdb", 21 | } 22 | 23 | # ---------------------------------------------- 24 | # Database related functions 25 | # ---------------------------------------------- 26 | 27 | 28 | def sql_to_csv(my_database, my_table): 29 | 30 | dbconnect = connect_db(my_database) 31 | 32 | cursor = dbconnect.cursor() 33 | 34 | query = f"SELECT * FROM {table}" 35 | all_tweets = pd.read_sql_query(query, dbconnect) 36 | 37 | if os.path.exists("./data"): 38 | all_tweets.to_csv("./data/raw_tweets.csv", index=False) 39 | 40 | else: 41 | os.mkdir("./data") 42 | all_tweets.to_csv("./data/raw_tweets.csv", index=False) 43 | 44 | 45 | def sql_to_df(my_database, my_table): 46 | dbconnect = connect_db(my_database) 47 | 48 | cursor = dbconnect.cursor() 49 | 50 | query = f"SELECT * FROM {my_table}" 51 | 52 | # store in dataframe 53 | 54 | df = pd.read_sql_query(query, dbconnect, index_col="id") 55 | 56 | cursor.close() 57 | dbconnect.close() 58 | 59 | return df 60 | 61 | 62 | # ---------------------------------------------- 63 | # Data processing 64 | # 
---------------------------------------------- 65 | 66 | 67 | def clean_data(df): 68 | 69 | # Make all usernames lowercase 70 | clean_df = df.copy() 71 | clean_df["user"] = df["user"].str.lower() 72 | 73 | # keep only non RT 74 | clean_df = clean_df[~clean_df["tweet"].str.contains("RT")] 75 | 76 | return clean_df 77 | 78 | 79 | def create_plots(df): 80 | x = df["language"].unique() 81 | fig, ax = plt.subplots() 82 | countries = df["language"].value_counts() 83 | plt.bar(range(len(countries)), countries) 84 | fig.suptitle("Language counts") 85 | plt.xlabel("languages") 86 | plt.ylabel("count") 87 | ax.set_xticklabels(x) 88 | 89 | if os.path.exists("./plots"): 90 | fig.savefig("./plots/barchart_lang.png") 91 | 92 | else: 93 | os.mkdir("./plots") 94 | fig.savefig("./plots/barchart_lang.png") 95 | 96 | 97 | def save_df(df): 98 | today = datetime.today().strftime("%Y-%m-%d") 99 | 100 | if os.path.exists("./data"): 101 | df.to_csv(f"./data/{today}-clean-df.csv", index=None) 102 | 103 | else: 104 | os.mkdir("./data") 105 | df.to_csv(f"./data/{today}-clean-df.csv", index=None) 106 | 107 | 108 | if __name__ == "__main__": 109 | 110 | df = sql_to_df(DATABASE, "tweets_long") 111 | print("Database loaded in df") 112 | 113 | clean_df = clean_data(df) 114 | 115 | create_plots(clean_df) 116 | 117 | save_df(clean_df) 118 | -------------------------------------------------------------------------------- /solutions/etl-basic/etl.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python etl-basic/stream_twitter_timed.py 4 | 5 | echo "Completed extraction starting cleaning" 6 | 7 | python etl-basic/analyse_twitter.py -------------------------------------------------------------------------------- /solutions/etl-basic/stream_twitter.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------ 2 | # This script is used to stream Twitter data into a MySQL my_database 3 | # note that for this you need an approved Twitter develope account 4 | # an app and the keys for said app 5 | # ------------------------------------------------------------------ 6 | 7 | # Import libraries needed 8 | import sys 9 | import json 10 | import time 11 | from configparser import ConfigParser 12 | from pathlib import Path 13 | 14 | import tweepy 15 | from dateutil import parser 16 | from mysql import connector as mysql 17 | 18 | # Path to the config file with the keys make sure not to commit this file 19 | CONFIG_FILE = Path.cwd() / "config.cfg" 20 | 21 | # Details for our MySql connection 22 | DATABASE = { 23 | "host": "localhost", 24 | "user": "airflow", 25 | "password": "python2019", 26 | "db": "airflowdb", 27 | } 28 | 29 | # ---------------------------------------------- 30 | # Database related functions 31 | # ---------------------------------------------- 32 | 33 | 34 | def connect_db(my_database): 35 | """Connect to a given my_database 36 | 37 | Args: 38 | my_database(dict): dictionary with the my_database details 39 | 40 | Returns: 41 | dbconnect: MySql my_database connection object 42 | """ 43 | try: 44 | dbconnect = mysql.connect( 45 | host=my_database.get("host"), 46 | user=my_database.get("user"), 47 | password=my_database.get("password"), 48 | db=my_database.get("db"), 49 | ) 50 | print("connected") 51 | return dbconnect 52 | except mysql.Error as e: 53 | print(e) 54 | 55 | 56 | def create_table(my_database, new_table): 57 | """Create new table in a my_database 
58 | 59 | Args: 60 | my_database (dict): details for the db 61 | new_table (str): name of the table to create 62 | """ 63 | 64 | dbconnect = connect_db(my_database) 65 | 66 | # create a cursor for the queries 67 | cursor = dbconnect.cursor() 68 | cursor.execute("USE airflowdb") 69 | 70 | # here we delete the table, it can be kept or else 71 | cursor.execute(f"DROP TABLE IF EXISTS {new_table}") 72 | 73 | # these matches the Twitter data 74 | query = ( 75 | f"CREATE TABLE `{new_table}` (" 76 | " `id` INT(11) NOT NULL AUTO_INCREMENT," 77 | " `user` varchar(100) NOT NULL ," 78 | " `created_at` timestamp," 79 | " `tweet` varchar(255) NOT NULL," 80 | " `retweet_count` int(11) ," 81 | " `id_str` varchar(100)," 82 | " PRIMARY KEY (`id`))" 83 | ) 84 | 85 | cursor.execute(query) 86 | dbconnect.close() 87 | cursor.close() 88 | 89 | return print(f"Created {new_table} table") 90 | 91 | 92 | def populate_table( 93 | user, created_at, tweet, retweet_count, id_str, my_database=DATABASE 94 | ): 95 | """Populate a given table witht he Twitter collected data 96 | 97 | Args: 98 | user (str): username from the status 99 | created_at (datetime): when the tweet was created 100 | tweet (str): text 101 | retweet_count (int): number of retweets 102 | id_str (int): unique id for the tweet 103 | """ 104 | 105 | dbconnect = connect_db(DATABASE) 106 | 107 | cursor = dbconnect.cursor() 108 | cursor.execute("USE airflowdb") 109 | 110 | query = "INSERT INTO tweets (user, created_at, tweet, retweet_count, id_str) VALUES (%s, %s, %s, %s, %s)" 111 | 112 | try: 113 | cursor.execute(query, (user, created_at, tweet, retweet_count, id_str)) 114 | dbconnect.commit() 115 | print("commited") 116 | 117 | except mysql.Error as e: 118 | print(e) 119 | dbconnect.rollback() 120 | 121 | cursor.close() 122 | dbconnect.close() 123 | 124 | return 125 | 126 | 127 | # ---------------------------------------------- 128 | # Access the Twitter API 129 | # ---------------------------------------------- 130 | 131 | 132 | def connectTwitter(): 133 | config = ConfigParser() 134 | config.read(CONFIG_FILE) 135 | 136 | # Authenticate to Twitter 137 | auth = tweepy.OAuthHandler( 138 | config.get("twitter", "consumer_key"), config.get("twitter", "consumer_secret") 139 | ) 140 | auth.set_access_token( 141 | config.get("twitter", "access_token"), 142 | config.get("twitter", "access_token_secret"), 143 | ) 144 | 145 | # Create Twitter API object 146 | twitter = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True) 147 | 148 | print(f"🦄 Connected as {twitter.me().screen_name}") 149 | 150 | return twitter 151 | 152 | 153 | class customListener(tweepy.StreamListener): 154 | """We need to create an instance of the Stream Listener 155 | http://docs.tweepy.org/en/v3.4.0/streaming_how_to.html 156 | """ 157 | 158 | def on_error(self, status_code): 159 | if status_code == 420: 160 | # returning False in on_data disconnects the stream 161 | return False 162 | 163 | def on_status(self, status): 164 | print(status.text) 165 | return True 166 | 167 | def on_data(self, data): 168 | """ 169 | Automatic detection of the kind of data collected from Twitter 170 | This method reads in tweet data as Json and extracts the data we want. 
171 | """ 172 | try: 173 | # parse as json 174 | raw_data = json.loads(data) 175 | 176 | # extract the relevant data 177 | if "text" in raw_data: 178 | user = raw_data["user"]["screen_name"] 179 | created_at = parser.parse(raw_data["created_at"]) 180 | tweet = raw_data["text"] 181 | retweet_count = raw_data["retweet_count"] 182 | id_str = raw_data["id_str"] 183 | 184 | # insert data just collected into MySQL my_database 185 | populate_table(user, created_at, tweet, retweet_count, id_str) 186 | print(f"Tweet colleted at: {created_at}") 187 | 188 | except Error as e: 189 | print(e) 190 | 191 | 192 | def start_stream(stream, **kwargs): 193 | """Start the stream, prints the disconnection error 194 | 195 | Args: 196 | stream (obj): stream object to start 197 | """ 198 | 199 | try: 200 | stream.filter(**kwargs) 201 | except Exception: 202 | stream.disconnect() 203 | print("Fatal exception") 204 | 205 | 206 | if __name__ == "__main__": 207 | create_table(DATABASE, "tweets") 208 | # first we need to authenticate 209 | twitter = connectTwitter() 210 | 211 | # next: create stream listener 212 | myStreamListener = customListener() 213 | myStream = tweepy.Stream(auth=twitter.auth, listener=myStreamListener, timeout=30) 214 | 215 | # stream tweets using the filter method 216 | version = float(f"{sys.version_info[0]}.{sys.version_info[1]}") 217 | if version >= 3.7: 218 | kwargs = { 219 | 'track': ["python", "pycon", "jupyter", "#pycon2019"], 220 | 'is_async': True 221 | } 222 | else: 223 | kwargs = { 224 | 'track': ["python", "pycon", "jupyter", "#pycon2019"], 225 | 'async': True 226 | } 227 | pass 228 | start_stream(myStream, **kwargs) 229 | pass 230 | 231 | -------------------------------------------------------------------------------- /solutions/etl-basic/stream_twitter_alt.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------ 2 | # This script is used to stream Twitter data into a MySQL my_database 3 | # note that for this you need an approved Twitter develope account 4 | # an app and the keys for said app 5 | # ------------------------------------------------------------------ 6 | 7 | # Import libraries needed 8 | import sys 9 | import json 10 | import time 11 | from configparser import ConfigParser 12 | from pathlib import Path 13 | 14 | import tweepy 15 | from dateutil import parser 16 | from mysql import connector as mysql 17 | 18 | # Path to the config file with the keys make sure not to commit this file 19 | CONFIG_FILE = Path.cwd() / "config.cfg" 20 | 21 | # Details for our MySql connection 22 | DATABASE = { 23 | "host": "localhost", 24 | "user": "airflow", 25 | "password": "python2019", 26 | "db": "airflowdb", 27 | } 28 | 29 | MAX_TWEEPY_PAGE = 300 30 | 31 | 32 | # ---------------------------------------------- 33 | # Database related functions 34 | # ---------------------------------------------- 35 | 36 | 37 | def connect_db(my_database): 38 | """Connect to a given my_database 39 | 40 | Args: 41 | my_database(dict): dictionary with the my_database details 42 | 43 | Returns: 44 | dbconnect: MySql my_database connection object 45 | """ 46 | try: 47 | dbconnect = mysql.connect( 48 | host=my_database.get("host"), 49 | user=my_database.get("user"), 50 | password=my_database.get("password"), 51 | db=my_database.get("db"), 52 | ) 53 | print("connected") 54 | return dbconnect 55 | except mysql.Error as e: 56 | print(e) 57 | 58 | 59 | def create_table(my_database, new_table): 60 | """Create new 
table in a my_database 61 | 62 | Args: 63 | my_database (dict): details for the db 64 | new_table (str): name of the table to create 65 | """ 66 | 67 | dbconnect = connect_db(my_database) 68 | 69 | # create a cursor for the queries 70 | cursor = dbconnect.cursor() 71 | cursor.execute("USE airflowdb") 72 | 73 | # here we delete the table, it can be kept or else 74 | cursor.execute(f"DROP TABLE IF EXISTS {new_table}") 75 | 76 | # these matches the Twitter data 77 | query = ( 78 | f"CREATE TABLE `{new_table}` (" 79 | " `id` INT(11) NOT NULL AUTO_INCREMENT," 80 | " `user` varchar(100) NOT NULL ," 81 | " `created_at` timestamp," 82 | " `tweet` varchar(255) NOT NULL," 83 | " `retweet_count` int(11) ," 84 | " `id_str` varchar(100)," 85 | " `country` varchar(255)," 86 | " `followers` varchar(100)," 87 | " `language` varchar(100)," 88 | " PRIMARY KEY (`id`))" 89 | ) 90 | 91 | cursor.execute(query) 92 | dbconnect.close() 93 | cursor.close() 94 | 95 | return print(f"Created {new_table} table") 96 | 97 | 98 | def populate_table( 99 | user, 100 | created_at, 101 | tweet, 102 | retweet_count, 103 | id_str, 104 | country, 105 | followers, 106 | language, 107 | my_table, 108 | my_database=DATABASE, 109 | ): 110 | """Populate a given table witht he Twitter collected data 111 | 112 | Args: 113 | user (str): username from the status 114 | created_at (datetime): when the tweet was created 115 | tweet (str): text 116 | retweet_count (int): number of retweets 117 | id_str (int): unique id for the tweet 118 | """ 119 | 120 | dbconnect = connect_db(DATABASE) 121 | 122 | cursor = dbconnect.cursor() 123 | cursor.execute("USE airflowdb") 124 | 125 | query = f"INSERT INTO {my_table} (user, created_at, tweet, retweet_count, id_str, country, followers, language) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)" 126 | 127 | try: 128 | cursor.execute( 129 | query, 130 | ( 131 | user, 132 | created_at, 133 | tweet, 134 | retweet_count, 135 | id_str, 136 | country, 137 | followers, 138 | language, 139 | ), 140 | ) 141 | dbconnect.commit() 142 | print("commited") 143 | 144 | except mysql.Error as e: 145 | print(e) 146 | dbconnect.rollback() 147 | 148 | cursor.close() 149 | dbconnect.close() 150 | 151 | return 152 | 153 | 154 | # ---------------------------------------------- 155 | # Access the Twitter API 156 | # ---------------------------------------------- 157 | 158 | 159 | def connectTwitter(): 160 | config = ConfigParser() 161 | config.read(CONFIG_FILE) 162 | 163 | # Authenticate to Twitter 164 | auth = tweepy.OAuthHandler( 165 | config.get("twitter", "consumer_key"), config.get("twitter", "consumer_secret") 166 | ) 167 | auth.set_access_token( 168 | config.get("twitter", "access_token"), 169 | config.get("twitter", "access_token_secret"), 170 | ) 171 | 172 | # Create Twitter API object 173 | twitter = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True) 174 | 175 | print(f"🦄 Connected as {twitter.me().screen_name}") 176 | 177 | return twitter 178 | 179 | 180 | class customListener(tweepy.StreamListener): 181 | """We need to create an instance of the Stream Listener 182 | http://docs.tweepy.org/en/v3.4.0/streaming_how_to.html 183 | """ 184 | 185 | def on_error(self, status_code): 186 | if status_code == 420: 187 | # returning False in on_data disconnects the stream 188 | return False 189 | 190 | def on_status(self, status): 191 | print(status.text) 192 | return True 193 | 194 | def on_data(self, data): 195 | """ 196 | Automatic detection of the kind of data collected from Twitter 197 | This method reads in 
tweet data as Json and extracts the data we want. 198 | """ 199 | try: 200 | # parse as json 201 | json_data = json.loads(data) 202 | 203 | # extract the relevant data 204 | if "text" in json_data: 205 | user = json_data["user"]["screen_name"] 206 | created_at = parser.parse(json_data["created_at"]) 207 | tweet = json_data["text"] 208 | retweet_count = json_data["retweet_count"] 209 | id_str = json_data["id_str"] 210 | followers = json_data["user"]["followers_count"] 211 | language = json_data["user"]["lang"] 212 | if json_data["place"] is not None: 213 | country = json_data["place"]["country"] 214 | else: 215 | country = None 216 | 217 | # insert data just collected into MySQL my_database 218 | populate_table( 219 | user, 220 | created_at, 221 | tweet, 222 | retweet_count, 223 | id_str, 224 | country, 225 | followers, 226 | language, 227 | "tweets_long", 228 | ) 229 | print(f"Tweet colleted at: {created_at}") 230 | 231 | except Error as e: 232 | print(e) 233 | 234 | 235 | def start_stream(stream, **kwargs): 236 | """Start the stream, prints the disconnection error 237 | 238 | Args: 239 | stream (obj): stream object to start 240 | """ 241 | try: 242 | stream.filter(**kwargs) 243 | except Exception: 244 | stream.disconnect() 245 | print("Fatal exception") 246 | 247 | 248 | if __name__ == "__main__": 249 | 250 | create_table(DATABASE, "tweets_long") 251 | # first we need to authenticate 252 | twitter = connectTwitter() 253 | 254 | # next: create stream listener 255 | myStreamListener = customListener() 256 | myStream = tweepy.Stream(auth=twitter.auth, listener=myStreamListener, timeout=30) 257 | 258 | # stream tweets using the filter method 259 | version = float(f"{sys.version_info[0]}.{sys.version_info[1]}") 260 | if version >= 3.7: 261 | kwargs = { 262 | 'track': ["python", "pycon", "jupyter", "#pycon2019"], 263 | 'is_async': True 264 | } 265 | else: 266 | kwargs = { 267 | 'track': ["python", "pycon", "jupyter", "#pycon2019"], 268 | 'async': True 269 | } 270 | pass 271 | start_stream(myStream, **kwargs) 272 | pass 273 | -------------------------------------------------------------------------------- /solutions/etl-basic/stream_twitter_timed.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------ 2 | # This script is used to stream Twitter data into a MySQL my_database 3 | # note that for this you need an approved Twitter develope account 4 | # an app and the keys for said app 5 | # ------------------------------------------------------------------ 6 | 7 | # Import libraries needed 8 | import json 9 | import time 10 | from configparser import ConfigParser 11 | from pathlib import Path 12 | 13 | import tweepy 14 | from dateutil import parser 15 | from mysql import connector as mysql 16 | 17 | # Path to the config file with the keys make sure not to commit this file 18 | CONFIG_FILE = Path.cwd() / "config.cfg" 19 | 20 | # Details for our MySql connection 21 | DATABASE = { 22 | "host": "localhost", 23 | "user": "airflow", 24 | "password": "python2019", 25 | "db": "airflowdb", 26 | } 27 | 28 | MAX_TWEEPY_PAGE = 300 29 | 30 | 31 | # ---------------------------------------------- 32 | # Database related functions 33 | # ---------------------------------------------- 34 | 35 | 36 | def connect_db(my_database): 37 | """Connect to a given my_database 38 | 39 | Args: 40 | my_database(dict): dictionary with the my_database details 41 | 42 | Returns: 43 | dbconnect: MySql my_database connection object 
44 | """ 45 | try: 46 | dbconnect = mysql.connect( 47 | host=my_database.get("host"), 48 | user=my_database.get("user"), 49 | password=my_database.get("password"), 50 | db=my_database.get("db"), 51 | ) 52 | print("connected") 53 | return dbconnect 54 | except mysql.Error as e: 55 | print(e) 56 | 57 | 58 | def create_table(my_database, new_table): 59 | """Create new table in a my_database 60 | 61 | Args: 62 | my_database (dict): details for the db 63 | new_table (str): name of the table to create 64 | """ 65 | 66 | dbconnect = connect_db(my_database) 67 | 68 | # create a cursor for the queries 69 | cursor = dbconnect.cursor() 70 | cursor.execute("USE airflowdb") 71 | 72 | # here we delete the table, it can be kept or else 73 | cursor.execute(f"DROP TABLE IF EXISTS {new_table}") 74 | 75 | # these matches the Twitter data 76 | query = ( 77 | f"CREATE TABLE `{new_table}` (" 78 | " `id` INT(11) NOT NULL AUTO_INCREMENT," 79 | " `user` varchar(100) NOT NULL ," 80 | " `created_at` timestamp," 81 | " `tweet` varchar(255) NOT NULL," 82 | " `retweet_count` int(11) ," 83 | " `id_str` varchar(100)," 84 | " PRIMARY KEY (`id`))" 85 | ) 86 | 87 | cursor.execute(query) 88 | dbconnect.close() 89 | cursor.close() 90 | 91 | return print(f"Created {new_table} table") 92 | 93 | 94 | def populate_table( 95 | user, created_at, tweet, retweet_count, id_str, my_database=DATABASE 96 | ): 97 | """Populate a given table witht he Twitter collected data 98 | 99 | Args: 100 | user (str): username from the status 101 | created_at (datetime): when the tweet was created 102 | tweet (str): text 103 | retweet_count (int): number of retweets 104 | id_str (int): unique id for the tweet 105 | """ 106 | 107 | dbconnect = connect_db(DATABASE) 108 | 109 | cursor = dbconnect.cursor() 110 | cursor.execute("USE airflowdb") 111 | 112 | query = "INSERT INTO tweets (user, created_at, tweet, retweet_count, id_str) VALUES (%s, %s, %s, %s, %s)" 113 | 114 | try: 115 | cursor.execute(query, (user, created_at, tweet, retweet_count, id_str)) 116 | dbconnect.commit() 117 | print("commited") 118 | 119 | except mysql.Error as e: 120 | print(e) 121 | dbconnect.rollback() 122 | 123 | cursor.close() 124 | dbconnect.close() 125 | 126 | return 127 | 128 | 129 | # ---------------------------------------------- 130 | # Access the Twitter API 131 | # ---------------------------------------------- 132 | 133 | 134 | def connectTwitter(): 135 | config = ConfigParser() 136 | config.read(CONFIG_FILE) 137 | 138 | # Authenticate to Twitter 139 | auth = tweepy.OAuthHandler( 140 | config.get("twitter", "consumer_key"), config.get("twitter", "consumer_secret") 141 | ) 142 | auth.set_access_token( 143 | config.get("twitter", "access_token"), 144 | config.get("twitter", "access_token_secret"), 145 | ) 146 | 147 | # Create Twitter API object 148 | twitter = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True) 149 | 150 | print(f"🦄 Connected as {twitter.me().screen_name}") 151 | 152 | return twitter 153 | 154 | 155 | class customListener(tweepy.StreamListener): 156 | """We need to create an instance of the Stream Listener 157 | http://docs.tweepy.org/en/v3.4.0/streaming_how_to.html 158 | """ 159 | 160 | def __init__(self, time_limit=60): 161 | self.start_time = time.time() 162 | self.limit = time_limit 163 | super(customListener, self).__init__() 164 | 165 | def on_error(self, status_code): 166 | if status_code == 420: 167 | # returning False in on_data disconnects the stream 168 | return False 169 | 170 | def on_status(self, status): 171 | 
print(status.text) 172 | return True 173 | 174 | def on_data(self, data): 175 | """ 176 | Automatic detection of the kind of data collected from Twitter 177 | This method reads in tweet data as Json and extracts the data we want. 178 | """ 179 | if (time.time() - self.start_time) < self.limit: 180 | try: 181 | # parse as json 182 | raw_data = json.loads(data) 183 | 184 | # extract the relevant data 185 | if "text" in raw_data: 186 | user = raw_data["user"]["screen_name"] 187 | created_at = parser.parse(raw_data["created_at"]) 188 | tweet = raw_data["text"] 189 | retweet_count = raw_data["retweet_count"] 190 | id_str = raw_data["id_str"] 191 | 192 | # insert data just collected into MySQL my_database 193 | populate_table(user, created_at, tweet, retweet_count, id_str) 194 | print(f"Tweet colleted at: {created_at}") 195 | 196 | except Error as e: 197 | print(e) 198 | else: 199 | self.saveFile.close() 200 | return False 201 | 202 | 203 | def start_stream(stream, **kwargs): 204 | """Start the stream, prints the disconnection error 205 | 206 | Args: 207 | stream (obj): stream object to start 208 | """ 209 | 210 | try: 211 | stream.filter(**kwargs) 212 | except Exception: 213 | stream.disconnect() 214 | print("Fatal exception") 215 | 216 | 217 | if __name__ == "__main__": 218 | 219 | create_table(DATABASE, "tweets") 220 | # first we need to authenticate 221 | twitter = connectTwitter() 222 | 223 | # next: create stream listener 224 | myStreamListener = customListener() 225 | myStream = tweepy.Stream(auth=twitter.auth, listener=myStreamListener, timeout=30) 226 | 227 | # stream tweets using the filter method 228 | start_stream(myStream, track=["python", "pycon", "jupyter", "#pycon2019"]) 229 | 230 | -------------------------------------------------------------------------------- /source/_static/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/12.png -------------------------------------------------------------------------------- /source/_static/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/4.jpg -------------------------------------------------------------------------------- /source/_static/DAG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/DAG.png -------------------------------------------------------------------------------- /source/_static/GUI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/GUI.png -------------------------------------------------------------------------------- /source/_static/airflow-logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/airflow-logo.jpeg -------------------------------------------------------------------------------- /source/_static/airflow.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/airflow.png -------------------------------------------------------------------------------- /source/_static/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/architecture.png -------------------------------------------------------------------------------- /source/_static/automation1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/automation1.jpg -------------------------------------------------------------------------------- /source/_static/azure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/azure.png -------------------------------------------------------------------------------- /source/_static/connection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/connection.png -------------------------------------------------------------------------------- /source/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* */ 2 | @import url('https://fonts.googleapis.com/css?family=Itim|Nunito|Source+Code+Pro'); 3 | 4 | a { 5 | color: rgb(96, 138, 197); 6 | } 7 | 8 | a:hover { 9 | color: rgb(65, 129, 218); 10 | } 11 | 12 | div.body h1 { 13 | color: #5F6366; 14 | font-family: 'Itim', cursive; 15 | font-weight: bold; 16 | font-size: 300%; 17 | } 18 | 19 | div.body h2 { 20 | color: #5F6366; 21 | font-family: 'Itim', cursive; 22 | font-weight: bold; 23 | } 24 | div.body h3 { 25 | color: #5F6366; 26 | font-family: 'Itim', cursive; 27 | font-weight: bold; 28 | } 29 | 30 | div.sphinxsidebarwrapper h1.logo { 31 | text-align: center; 32 | margin: 0 0 -20px 0; 33 | } 34 | 35 | div.sphinxsidebar p.blurb { 36 | font-size: 130%; 37 | text-align: center; 38 | font-family: 'Itim', cursive; 39 | color: rgb(151, 139, 196); 40 | } 41 | 42 | div.sphinxsidebar h1{ 43 | font-size: 160%; 44 | color: #5F6366; 45 | font-family: 'Itim', cursive; 46 | } 47 | 48 | div.sphinxsidebar h1 a { 49 | font-size: 160%; 50 | color: #5F6366; 51 | text-decoration: none; 52 | border: none; 53 | font-family: 'Itim', cursive; 54 | } 55 | 56 | div.sphinxsidebar h1 a:hover { 57 | border: none; 58 | } 59 | 60 | div.sphinxsidebar h3 { 61 | display: none; 62 | } 63 | 64 | div.sphinxsidebar a { 65 | color: #5F6366; 66 | } 67 | 68 | code.descname { 69 | color: rgb(151, 139, 196); 70 | } 71 | 72 | th.field-name { 73 | min-width: 100px; 74 | color: rgb(151, 139, 196); 75 | } 76 | 77 | tt, code { 78 | color: #F8F8F2; 79 | background: #1d1941; 80 | border-radius: 0.3em; 81 | padding: 0.0em 0.3em; 82 | } 83 | 84 | a.reference.internal code.xref span.pre { 85 | color: #F8F8F2; 86 | background: #1d1941; 87 | border-bottom: none; 88 | border-radius: 0; 89 | padding: 0; 90 | } 91 | 92 | a.reference.internal, a.reference.internal:hover { 93 | border-bottom: none; 94 | } 95 | 96 | a.reference.internal:hover code { 97 | background: #027bab 98 | } 99 | 100 | 
a.reference.internal:hover code.xref span.pre { 101 | color: #F8F8F2; 102 | background: #027bab; 103 | border-bottom: none; 104 | } 105 | 106 | tt.xref, code.xref, a tt { 107 | background: none; 108 | border-bottom: none; 109 | } 110 | 111 | code.literal { 112 | color: #F8F8F2; 113 | background:#1d1941; 114 | } 115 | 116 | pre { 117 | padding: 20px 30px; 118 | background: #1d1941; 119 | } 120 | 121 | div > dl { 122 | border-left: 2px solid #00384021; 123 | padding-left: 5px; 124 | } 125 | 126 | dt { 127 | color: rgb(96, 138, 197); 128 | } 129 | 130 | 131 | div.footer::before { 132 | display: block; 133 | content: ''; 134 | border-top: 2px solid #EDB5BF; 135 | width: 50%; 136 | margin: 2em auto 2em auto; 137 | } 138 | 139 | div.footer { 140 | text-align: center; 141 | /* color: #029be2; */ 142 | } 143 | 144 | div.footer a { 145 | color: #027bab; 146 | text-decoration: none; 147 | } 148 | 149 | p.caption { 150 | font-family: 'Itim', cursive; 151 | font-size: inherit; 152 | font-size: 150%; 153 | } 154 | 155 | @media screen and (max-width: 875px) { 156 | div.sphinxsidebar { 157 | background: #4D6D9A; 158 | } 159 | div.sphinxsidebar h1.logo, div.sphinxsidebar p.blurb{ 160 | text-align: left; 161 | } 162 | div.sphinxsidebar h1 a { 163 | color: #1bc5e0; 164 | } 165 | div.sphinxsidebar a { 166 | /* color: rgb(151, 139, 196); */ 167 | color: white; 168 | } 169 | div.sphinxsidebar ul { 170 | /* color: rgb(151, 139, 196); */ 171 | color: white; 172 | } 173 | } 174 | 175 | 176 | /* other */ 177 | 178 | .alert { 179 | position: relative; 180 | padding: 10px; 181 | margin-bottom: 5px; 182 | border: 2px solid transparent; 183 | border-radius: 2px; 184 | } 185 | 186 | .alert-primary { 187 | color: #004085; 188 | background-color: #cce5ff; 189 | border-color: #b8daff; 190 | } 191 | .alert-custom { 192 | background-color: rgb(229, 224, 247); 193 | border-color:rgb(229, 224, 247); 194 | color: rgb(128, 117, 165); 195 | } -------------------------------------------------------------------------------- /source/_static/datapyramid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/datapyramid.png -------------------------------------------------------------------------------- /source/_static/gooddata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/gooddata.png -------------------------------------------------------------------------------- /source/_static/gooddata1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/gooddata1.png -------------------------------------------------------------------------------- /source/_static/luigi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/luigi.png -------------------------------------------------------------------------------- /source/_static/mssignin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/mssignin.png 
-------------------------------------------------------------------------------- /source/_static/pipeline1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/pipeline1.png -------------------------------------------------------------------------------- /source/_static/python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/python.png -------------------------------------------------------------------------------- /source/_static/twitter1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/twitter1.png -------------------------------------------------------------------------------- /source/_static/twitter2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/twitter2.png -------------------------------------------------------------------------------- /source/_static/twitter3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/twitter3.png -------------------------------------------------------------------------------- /source/_static/uses.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/airflow-tutorial/eb16a3c2cebc898bc439058598f912c31d54eecd/source/_static/uses.png -------------------------------------------------------------------------------- /source/_templates/sidebarlogo.html: -------------------------------------------------------------------------------- 1 |

7 | -------------------------------------------------------------------------------- /source/about.md: -------------------------------------------------------------------------------- 1 | # About the workshop 2 | 3 | We will be taking a look at the basic concepts of data pipelines as well as practical use cases using Python. 4 | 5 | ## About you: 6 | - Some experience using the command line 7 | - Intermediate Python knowledge / use 8 | - Be able to apply what we learn and adopt to your use cases 9 | - Interested in data and systems 10 | - Aspring or current data engineering 11 | - Some knowledge about systems and databases (enough to be dangerous) 12 | 13 | ## Our focus for the day 14 | - Greater understanding on how to apply data pipelines using the Python toolset 15 | - Focus on concepts 16 | - Apply knowledge with each library 17 | - Will give you the building blocks 18 | 19 | ## Keeping on track 20 | 21 | You will find 🚦 across the tutorial examples. We will use this to identify how folks are doing over the workshop (if following along in person). 22 | Place the post it as follows: 23 | 24 | 🚦 Purple postit: all good, task has been completed 25 | 26 | 🚦 Orange postit: I need extra time or need help with the task in hand -------------------------------------------------------------------------------- /source/airflow-intro.md: -------------------------------------------------------------------------------- 1 | # Airflow basics 2 | 3 | ## What is Airflow? 4 | 5 | ![airflow logo](_static/airflow-logo.jpeg) 6 | 7 | Airflow is a Workflow engine which means: 8 | 9 | - Manage scheduling and running jobs and data pipelines 10 | - Ensures jobs are ordered correctly based on dependencies 11 | - Manage the allocation of scarce resources 12 | - Provides mechanisms for tracking the state of jobs and recovering from failure 13 | 14 | It is highly versatile and can be used across many many domains: 15 | ![](_static/uses.png) 16 | 17 | ## Basic Airflow concepts 18 | 19 | - **Task**: a defined unit of work (these are called operators in Airflow) 20 | - **Task instance**: an individual run of a single task. Task instances also have an indicative state, which could be “running”, “success”, “failed”, “skipped”, “up for retry”, etc. 21 | - **DAG**: Directed acyclic graph, 22 | a set of tasks with explicit execution order, beginning, and end 23 | - **DAG run**: individual execution/run of a DAG 24 | 25 | **Debunking the DAG** 26 | 27 | The vertices and edges (the arrows linking the nodes) have an order and direction associated to them 28 | 29 | ![](_static/DAG.png) 30 | 31 | each node in a DAG corresponds to a task, which in turn represents some sort of data processing. For example: 32 | 33 | Node A could be the code for pulling data from an API, node B could be the code for anonymizing the data. Node B could be the code for checking that there are no duplicate records, and so on. 34 | 35 | These 'pipelines' are acyclic since they need a point of completion. 36 | 37 | **Dependencies** 38 | 39 | Each of the vertices has a particular direction that shows the relationship between certain nodes. For example, we can only anonymize data once this has been pulled out from the API. 40 | 41 | ## Idempotency 42 | 43 | This is one of the most important characteristics of good ETL architectures. 44 | 45 | When we say that something is idempotent it means it will produce the same result regardless of how many times this is run (i.e. the results are reproducible). 
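For example, here is a minimal sketch of an idempotent daily load (illustrative only, not part of the workshop code; the `daily_counts` table and the values are made up). Re-running the load for the same day leaves the table in exactly the same state, because the day's rows are replaced rather than blindly appended:

```python
# Minimal, self-contained sketch of an idempotent load (illustrative only).
import sqlite3


def load_day(conn, day, rows):
    """Replace the rows for `day` with `rows` -- safe to re-run."""
    with conn:  # run the delete + insert as a single transaction
        conn.execute("DELETE FROM daily_counts WHERE day = ?", (day,))
        conn.executemany(
            "INSERT INTO daily_counts (day, url, count) VALUES (?, ?, ?)",
            [(day, url, count) for url, count in rows],
        )


conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE daily_counts (day TEXT, url TEXT, count INTEGER)")

load_day(conn, "2019-05-01", [("https://pycon.org", 3)])
load_day(conn, "2019-05-01", [("https://pycon.org", 3)])  # re-run: same outcome

print(conn.execute("SELECT COUNT(*) FROM daily_counts").fetchone())  # -> (1,)
```

A plain append (the `INSERT` without the preceding `DELETE`) would not be idempotent: every re-run of the task would add duplicate rows.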
46 | 47 | Reproducibility is particularly important in data-intensive environments as this ensures that the same inputs will always return the same outputs. 48 | 49 | ## Airflow components 50 | 51 | ![](_static/architecture.png) 52 | 53 | There are 4 main components to Apache Airflow: 54 | 55 | ### Web server 56 | 57 | The GUI. This is under the hood a Flask app where you can track the status of your jobs and read logs from a remote file store (e.g. [Azure Blobstorage](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-blobs-overview/?wt.mc_id=PyCon-github-taallard)). 58 | 59 | ### Scheduler 60 | 61 | This component is responsible for scheduling jobs. This is a multithreaded Python process that uses the DAGb object to decide what tasks need to be run, when and where. 62 | 63 | The task state is retrieved and updated from the database accordingly. The web server then uses these saved states to display job information. 64 | 65 | ### Executor 66 | 67 | The mechanism that gets the tasks done. 68 | 69 | ### Metadata database 70 | 71 | - Powers how the other components interact 72 | - Stores the Airflow states 73 | - All processes read and write from here 74 | 75 | ## Workflow as a code 76 | One of the main advantages of using a workflow system like Airflow is that all is code, which makes your workflows maintainable, versionable, testable, and collaborative. 77 | 78 | Thus your workflows become more explicit and maintainable (atomic tasks). 79 | 80 | Not only your code is dynamic but also is your infrastructure. 81 | 82 | ### Defining tasks 83 | 84 | Tasks are defined based on the abstraction of `Operators` (see Airflow docs [here](https://airflow.apache.org/concepts.html#operators)) which represent a single **idempotent task**. 85 | 86 | The best practice is to have atomic operators (i.e. can stand on their own and do not need to share resources among them). 
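As a rough illustration of atomicity (a sketch only; the DAG, task ids and callables below are made up rather than taken from this tutorial), two small self-contained tasks chained together look like this, and each one can be retried or re-run on its own:

```python
# Sketch: two atomic tasks, each doing exactly one thing, with an explicit dependency.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator


def extract():
    # pull raw data and persist it somewhere durable (file, table, ...)
    return "raw data"


def transform():
    # read what the extract step produced and clean it
    return "clean data"


dag = DAG(
    "atomic_example",
    start_date=datetime(2019, 4, 30),
    schedule_interval="@daily",
)

extract_task = PythonOperator(task_id="extract", python_callable=extract, dag=dag)
transform_task = PythonOperator(task_id="transform", python_callable=transform, dag=dag)

extract_task >> transform_task  # transform only runs once extract has succeeded
```

This mirrors the dependency pattern used in the `simple_dag.py` example later in the tutorial.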
87 | 88 | You can choose among; 89 | - `BashOperator` 90 | - `PythonOperator` 91 | - `EmailOperator` 92 | - `SimpleHttpOperator` 93 | - `MySqlOperator` (and other DB) 94 | 95 | Examples: 96 | 97 | ```python 98 | t1 = BashOperator(task_id='print_date', 99 | bash_command='date, 100 | dag=dag) 101 | ``` 102 | 103 | ```python 104 | def print_context(ds, **kwargs): 105 | pprint(kwargs) 106 | print(ds) 107 | return 'Whatever you return gets printed in the logs' 108 | 109 | 110 | run_this = PythonOperator( 111 | task_id='print_the_context', 112 | provide_context=True, 113 | python_callable=print_context, 114 | dag=dag, 115 | ) 116 | ``` 117 | 118 | ## Comparing Luigi and Airflow 119 | 120 | ### Luigi 121 | 122 | - Created at Spotify (named after the plumber) 123 | - Open sourced in late 2012 124 | - GNU make for data 125 | 126 | ### Airflow 127 | - Airbnb data team 128 | - Open-sourced mud 2015 129 | - Apache incubator mid-2016 130 | - ETL pipelines 131 | 132 | ### Similarities 133 | - Python open source projects for data pipelines 134 | - Integrate with a number of sources (databases, filesystems) 135 | - Tracking failure, retries, success 136 | - Ability to identify the dependencies and execution 137 | 138 | ### Differences 139 | - Scheduler support: Airflow has built-in support using schedulers 140 | - Scalability: Airflow has had stability issues in the past 141 | - Web interfaces 142 | 143 | ![](_static/luigi.png) 144 | 145 | 146 | ![](_static/airflow.png) 147 | 148 | 149 | | Airflow | Luigi | 150 | | ------------------------------------------------ | ------------------------------------------------------------------------------ | 151 | | Task are defined by`dag_id` defined by user name | Task are defined by task name and parameters | 152 | | Task retries based on definitions | Decide if a task is done via input/output | 153 | | Task code to the worker | Workers started by Python file where the tasks are defined | 154 | | Centralized scheduler (Celery spins up workers) | Centralized scheduler in charge of deduplication sending tasks (Tornado based) | -------------------------------------------------------------------------------- /source/azure.md: -------------------------------------------------------------------------------- 1 | ### Deploying to the cloud 2 | 3 | 4 | ![](_static/azure.png) 5 | 6 | [This Docker image](https://hub.docker.com/r/puckel/docker-airflow/) has been used as the base for many deployments. 7 | 8 | 9 | Let's try and get Airflow running on Docker: 10 | 11 | ``` 12 | docker pull puckel/docker-airflow 13 | ``` 14 | 15 | Once you have the container you can run as 16 | 17 | ``` 18 | docker run -d --rm -p 8080:8080 puckel/docker-airflow webserver 19 | ``` 20 | 21 | To load the examples you can do: 22 | ``` 23 | docker run -d -p 8080:8080 -e LOAD_EX=y puckel/docker-airflow 24 | ``` 25 | 26 | Based on this container we can deploy to [Azure](https://azure.microsoft.com/en-us/blog/deploying-apache-airflow-in-azure-to-build-and-run-data-pipelines//?wt.mc_id=PyCon-github-taallard) 27 | 28 | 29 | [![Deploy to Azure](https://azuredeploy.net/deploybutton.svg)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fsavjani%2Fazure-quickstart-templates%2Fmaster%2F101-webapp-linux-airflow-postgresql%2Fazuredeploy.json/?wt.mc_id=PyCon-github-taallard) 30 | 31 | 32 | Note that this is a very basic deployment on Azure. 
-------------------------------------------------------------------------------- /source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "Airflow tutorial" 23 | copyright = "2019, Tania Allard" 24 | author = "Tania Allard" 25 | 26 | # The short X.Y version 27 | version = "" 28 | # The full version, including alpha/beta/rc tags 29 | release = "" 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | "sphinx.ext.doctest", 43 | "sphinx.ext.intersphinx", 44 | "sphinx.ext.mathjax", 45 | "sphinx.ext.githubpages", 46 | "recommonmark", 47 | ] 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ["_templates"] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | # 55 | source_suffix = [".rst", ".md"] 56 | 57 | # The master toctree document. 58 | master_doc = "index" 59 | 60 | # The language for content autogenerated by Sphinx. Refer to documentation 61 | # for a list of supported languages. 62 | # 63 | # This is also used if you do content translation via gettext catalogs. 64 | # Usually you set "language" from the command line for these cases. 65 | language = None 66 | 67 | # List of patterns, relative to source directory, that match files and 68 | # directories to ignore when looking for source files. 69 | # This pattern also affects html_static_path and html_extra_path. 70 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 71 | 72 | # The name of the Pygments (syntax highlighting) style to use. 73 | pygments_style = "monokai" 74 | 75 | 76 | # -- Options for HTML output ------------------------------------------------- 77 | 78 | # The theme to use for HTML and HTML Help pages. See the documentation for 79 | # a list of builtin themes. 80 | # 81 | html_theme = "alabaster" 82 | 83 | # Theme options are theme-specific and customize the look and feel of a theme 84 | # further. For a list of options available for each theme, see the 85 | # documentation. 
86 | # 87 | html_theme_options = { 88 | "github_banner": False, 89 | "github_button": True, 90 | "github_user": "trallard", 91 | "github_repo": "airflow-tutorial", 92 | "github_type": "star", 93 | "font_family": "Nunito, Georgia, sans", 94 | "head_font_family": "Nunito, Georgia, serif", 95 | "code_font_family": "'Source Code Pro', 'Consolas', monospace", 96 | "description": "a.k.a an introduction to all things DAGS and pipelines joy", 97 | "show_relbars": True, 98 | "logo": "python.png", 99 | } 100 | 101 | # Add any paths that contain custom static files (such as style sheets) here, 102 | # relative to this directory. They are copied after the builtin static files, 103 | # so a file named "default.css" will overwrite the builtin "default.css". 104 | html_static_path = ["_static"] 105 | 106 | # Custom sidebar templates, must be a dictionary that maps document names 107 | # to template names. 108 | # 109 | # The default sidebars (for documents that don't match any pattern) are 110 | # defined by theme itself. Builtin themes are using these templates by 111 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 112 | # 'searchbox.html']``. 113 | # 114 | # Custom sidebar templates, maps document names to template names. 115 | html_sidebars = { 116 | "**": [ 117 | "about.html", 118 | "localtoc.html", 119 | "searchbox.html", 120 | "navigation.html", 121 | "relations.html", 122 | "sidebarlogo.html", 123 | ] 124 | } 125 | 126 | # -- Options for HTMLHelp output --------------------------------------------- 127 | 128 | # Output file base name for HTML help builder. 129 | htmlhelp_basename = "Airflowtutorialdoc" 130 | 131 | 132 | # -- Options for LaTeX output ------------------------------------------------ 133 | 134 | latex_elements = { 135 | # The paper size ('letterpaper' or 'a4paper'). 136 | # 137 | # 'papersize': 'letterpaper', 138 | # The font size ('10pt', '11pt' or '12pt'). 139 | # 140 | # 'pointsize': '10pt', 141 | # Additional stuff for the LaTeX preamble. 142 | # 143 | # 'preamble': '', 144 | # Latex figure (float) alignment 145 | # 146 | # 'figure_align': 'htbp', 147 | } 148 | 149 | # Grouping the document tree into LaTeX files. List of tuples 150 | # (source start file, target name, title, 151 | # author, documentclass [howto, manual, or own class]). 152 | latex_documents = [ 153 | ( 154 | master_doc, 155 | "Airflowtutorial.tex", 156 | "Airflow tutorial Documentation", 157 | "Tania Allard", 158 | "manual", 159 | ) 160 | ] 161 | 162 | 163 | # -- Options for manual page output ------------------------------------------ 164 | 165 | # One entry per manual page. List of tuples 166 | # (source start file, name, description, authors, manual section). 167 | man_pages = [ 168 | (master_doc, "airflowtutorial", "Airflow tutorial Documentation", [author], 1) 169 | ] 170 | 171 | 172 | # -- Options for Texinfo output ---------------------------------------------- 173 | 174 | # Grouping the document tree into Texinfo files. List of tuples 175 | # (source start file, target name, title, author, 176 | # dir menu entry, description, category) 177 | texinfo_documents = [ 178 | ( 179 | master_doc, 180 | "Airflowtutorial", 181 | "Airflow tutorial Documentation", 182 | author, 183 | "Airflowtutorial", 184 | "One line description of project.", 185 | "Miscellaneous", 186 | ) 187 | ] 188 | 189 | 190 | # -- Options for Epub output ------------------------------------------------- 191 | 192 | # Bibliographic Dublin Core info. 
193 | epub_title = project 194 | 195 | # The unique identifier of the text. This can be a ISBN number 196 | # or the project homepage. 197 | # 198 | # epub_identifier = '' 199 | 200 | # A unique identification for the text. 201 | # 202 | # epub_uid = '' 203 | 204 | # A list of files that should not be packed into the epub file. 205 | epub_exclude_files = ["search.html"] 206 | 207 | 208 | # -- Extension configuration ------------------------------------------------- 209 | 210 | # -- Options for intersphinx extension --------------------------------------- 211 | 212 | # Example configuration for intersphinx: refer to the Python standard library. 213 | intersphinx_mapping = {"https://docs.python.org/": None} 214 | 215 | -------------------------------------------------------------------------------- /source/first-airflow.md: -------------------------------------------------------------------------------- 1 | # Airflow 101: working locally and familiarise with the tool 2 | 3 | ### Pre-requisites 4 | 5 | The following prerequisites are needed: 6 | 7 | - Libraries detailed in the Setting up section (either via conda or pipenv) 8 | - MySQL installed 9 | - text editor 10 | - command line 11 | 12 | ## Getting your environment up and running 13 | 14 | If you followed the instructions you should have Airflow installed as well as the rest of the packages we will be using. 15 | 16 | So let's get our environment up and running: 17 | 18 | If you are using conda start your environment via: 19 | ``` 20 | $ source activate airflow-env 21 | ``` 22 | If using pipenv then: 23 | ``` 24 | $ pipenv shell 25 | ```` 26 | 27 | this will start a shell within a virtual environment, to exit the shell you need to type `exit` and this will exit the virtual environment. 28 | 29 | ## Starting Airflow locally 30 | 31 | Airflow home lives in `~/airflow` by default, but you can change the location before installing airflow. You first need to set the `AIRFLOW_HOME` environment variable and then install airflow. For example, using pip: 32 | 33 | ```sh 34 | export AIRFLOW_HOME=~/mydir/airflow 35 | 36 | # install from PyPI using pip 37 | pip install apache-airflow 38 | ``` 39 | 40 | once you have completed the installation you should see something like this in the `airflow` directory (wherever it lives for you) 41 | 42 | ``` 43 | drwxr-xr-x - myuser 18 Apr 14:02 . 44 | .rw-r--r-- 26k myuser 18 Apr 14:02 ├── airflow.cfg 45 | drwxr-xr-x - myuser 18 Apr 14:02 ├── logs 46 | drwxr-xr-x - myuser 18 Apr 14:02 │ └── scheduler 47 | drwxr-xr-x - myuser 18 Apr 14:02 │ ├── 2019-04-18 48 | lrwxr-xr-x 46 myuser 18 Apr 14:02 │ └── latest -> /Users/myuser/airflow/logs/scheduler/2019-04-18 49 | .rw-r--r-- 2.5k myuser 18 Apr 14:02 └── unittests.cfg 50 | ``` 51 | We need to create a local dag folder: 52 | 53 | ``` 54 | mkdir ~/airflow/dags 55 | ``` 56 | 57 | As your project evolves, your directory will look something like this: 58 | 59 | ``` 60 | airflow # the root directory. 61 | ├── dags # root folder for all dags. files inside folders are not searched for dags. 62 | │ ├── my_dag.py, # my dag (definitions of tasks/operators) including precedence. 63 | │ └── ... 64 | ├── logs # logs for the various tasks that are run 65 | │ └── my_dag # DAG specific logs 66 | │ │ ├── src1_s3 # folder for task-specific logs (log files are created by date of a run) 67 | │ │ ├── src2_hdfs 68 | │ │ ├── src3_s3 69 | │ │ └── spark_task_etl 70 | ├── airflow.db # SQLite database used by Airflow internally to track the status of each DAG. 
71 | ├── airflow.cfg # global configuration for Airflow (individual settings can also be overridden with environment variables) 72 | └── ... 73 | ``` 74 | 75 | ## Prepare your database 76 | 77 | As we mentioned before, Airflow uses a database to keep track of tasks and their statuses, so it is critical to have one set up. 78 | 79 | To start the default database we can run 80 | `airflow initdb`. This will initialize your database via Alembic so that it matches the latest Airflow release. 81 | 82 | The default database used is `sqlite`, which means you cannot parallelize tasks using it. Since we have MySQL and the MySQL client installed, we will set them up so that we can use them with Airflow. 83 | 84 | 🚦 Create an airflow database 85 | 86 | From the command line: 87 | 88 | ``` 89 | mysql -u root -p 90 | mysql> CREATE DATABASE airflow CHARACTER SET utf8 COLLATE utf8_unicode_ci; 91 | mysql> CREATE USER 'airflow'@'localhost' IDENTIFIED BY 'python2019'; GRANT ALL PRIVILEGES ON airflow.* TO 'airflow'@'localhost'; 92 | mysql> FLUSH PRIVILEGES; 93 | ``` 94 | and initialize the database: 95 | 96 | ``` 97 | airflow initdb 98 | ``` 99 | 100 | Notice that this will fail with the default `airflow.cfg`. 101 | 102 | 103 | ## Update your local configuration 104 | 105 | Open your Airflow configuration file `~/airflow/airflow.cfg` and make the following changes: 106 | 107 | 108 | ``` 109 | executor = CeleryExecutor 110 | ``` 111 | 112 | ``` 113 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#broker-settings 114 | # needs rabbitmq running 115 | broker_url = amqp://guest:guest@127.0.0.1/ 116 | 117 | 118 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings 119 | result_backend = db+mysql://airflow:python2019@localhost:3306/airflow 120 | 121 | sql_alchemy_conn = mysql://airflow:python2019@localhost:3306/airflow 122 | 123 | ``` 124 | 125 | Here we are replacing the default executor (`SequentialExecutor`) with the `CeleryExecutor` so that tasks can run in parallel. 126 | We also replace the default `sqlite` database with our newly created `airflow` database (make sure the user and password in these connection strings match the MySQL user you created above). 127 | 128 | Now we can initialize the database: 129 | ``` 130 | airflow initdb 131 | ``` 132 | 133 | Let's now start the web server locally: 134 | 135 | 136 | ``` 137 | airflow webserver -p 8080 138 | ``` 139 | 140 | We can now head over to [http://localhost:8080](http://localhost:8080), where you will see that a number of example DAGs are already there. 141 | 142 | 🚦 Take some time to familiarise yourself with the UI and get your local instance set up. 143 | 144 | Now let's have a look at the connections ([http://localhost:8080/admin/connection/](http://localhost:8080/admin/connection/)): go to `Admin > Connections`. You should be able to see a number of connections available. For this tutorial, we will use some of these connections, including `mysql`. 145 | 146 | 152 | 153 | ### Commands 154 | Let us go over some of the commands. Back on your command line, we can list the existing DAGs: 155 | 156 | ``` 157 | airflow list_dags 158 | ``` 159 | We can list the tasks of a DAG in a tree view: 160 | 161 | ``` 162 | airflow list_tasks tutorial --tree 163 | ``` 164 | 165 | We can test the DAGs too, but we will need to set a date parameter so that this executes: 166 | 167 | ``` 168 | airflow test tutorial print_date 2019-05-01 169 | ``` 170 | (note that you cannot use a future date or you will get an error) 171 | ``` 172 | airflow test tutorial templated 2019-05-01 173 | ``` 174 | Note that when using the `test` command, these runs are not saved in the database.
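The general pattern here is `airflow test <dag_id> <task_id> <execution_date>`. As a minimal sketch (assuming the bundled `tutorial` example DAG is still loaded and, as in stock Airflow 1.10, includes a `sleep` task), any other task can be exercised the same way:

```
# general form: airflow test <dag_id> <task_id> <execution_date>
# runs a single task instance locally; nothing is recorded in the metadata database
airflow test tutorial sleep 2019-05-01
```

This makes `test` handy for iterating on a task's logic before letting the scheduler pick it up.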
175 | 176 | Now let's start the scheduler: 177 | ``` 178 | airflow scheduler 179 | ``` 180 | 181 | Behind the scenes, the scheduler monitors the DAG folder and stays in sync with the DAG objects it contains. The Airflow scheduler is designed to run as a service in an Airflow production environment. 182 | 183 | Now with the scheduler up and running we can trigger a task instance: 184 | ``` 185 | $ airflow run example_bash_operator runme_0 2015-01-01 186 | ``` 187 | 188 | This will be stored in the database and you can see the status change straight away. 189 | 190 | What would happen, for example, if we wanted to run or trigger the `tutorial` DAG? 🤔 191 | 192 | Let's try from the CLI and see what happens. 193 | 194 | ``` 195 | airflow trigger_dag tutorial 196 | ``` 197 | 198 | 199 | ## Writing your first DAG 200 | 201 | Let's create our first simple DAG. 202 | Inside the DAG directory (`~/airflow/dags`) create a `simple_dag.py` file. 203 | 204 | 205 | ```python 206 | from datetime import datetime, timedelta 207 | from airflow import DAG 208 | from airflow.operators.dummy_operator import DummyOperator 209 | from airflow.operators.python_operator import PythonOperator 210 | 211 | 212 | def print_hello(): 213 | return "Hello world!" 214 | 215 | 216 | default_args = { 217 | "owner": "airflow", 218 | "depends_on_past": False, 219 | "start_date": datetime(2019, 4, 30), 220 | "email": ["airflow@example.com"], 221 | "email_on_failure": False, 222 | "email_on_retry": False, 223 | "retries": 1, 224 | "retry_delay": timedelta(minutes=2), 225 | } 226 | 227 | dag = DAG( 228 | "hello_world", 229 | description="Simple tutorial DAG", 230 | schedule_interval="0 12 * * *", 231 | default_args=default_args, 232 | catchup=False, 233 | ) 234 | 235 | t1 = DummyOperator(task_id="dummy_task", retries=3, dag=dag) 236 | 237 | t2 = PythonOperator(task_id="hello_task", python_callable=print_hello, dag=dag) 238 | 239 | # sets t2 downstream of t1 240 | t1 >> t2 241 | 242 | # equivalent to 243 | # t2.set_upstream(t1) 244 | 245 | ``` 246 | If it is properly set up, you should be able to see this DAG straight away in your instance. 247 | 248 | 249 | ### Now let's create a DAG from the previous ETL pipeline (kind of) 250 | 251 | All hands on - check the `solutions` directory -------------------------------------------------------------------------------- /source/index.rst: -------------------------------------------------------------------------------- 1 | .. Airflow tutorial documentation master file, created by 2 | sphinx-quickstart on Mon Apr 15 15:52:00 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Airflow tutorial 7 | ============================================ 8 | This tutorial was originally developed for PyCon US 2019. 9 | 10 | .. toctree:: 11 | :caption: Table of Contents 12 | :hidden: 13 | :maxdepth: 2 14 | 15 | setup 16 | about 17 | pipelines 18 | airflow-intro 19 | first-airflow 20 | 21 | .. toctree:: 22 | :maxdepth: 2 23 | :caption: Contents: 24 | 25 | About your facilitator 26 | ====================== 27 | 28 | My name is Tania. I live in Manchester, UK, where I work as a 29 | Cloud Advocate for Microsoft. 30 | 31 | Over the years, I have worked as a data engineer, machine learning engineer, 32 | and research software engineer. I love data-intensive 33 | environments and I am particularly interested in the tools and workflows to 34 | deliver robust, reproducible data insights.
35 | 36 | If you have any questions or feedback about this tutorial, please 37 | file an issue using the following link: `trallard/airflow-tutorial issues <https://github.com/trallard/airflow-tutorial/issues>`_. 38 | 39 | You can also contact me via the following channels: 40 | 41 | - E-mail: trallard@bitsandchips.me 42 | - Twitter: `@ixek <https://twitter.com/ixek>`_ 43 | - `Tania on GitHub <https://github.com/trallard>`_ 44 | 45 | Code of Conduct 46 | ================ 47 | All attendees of this workshop are expected to adhere to PyCon's Code of Conduct, 48 | in brief: 49 | **Be open, considerate, and respectful.** 50 | 51 | License 52 | ======= 53 | The content in this workshop is licensed under `CC-BY-SA 4.0 <https://creativecommons.org/licenses/by-sa/4.0/>`_. 54 | This means that you can use, remix and re-distribute the content so long as attribution to the original 55 | author (Tania Allard) is maintained. 56 | 57 | The logo used here was designed by Ashley McNamara for use by the Microsoft Developer Advocates team. 58 | 59 | 60 | 61 | 62 | --------------------------------------------------------------------------------