├── .gitignore ├── LICENSE ├── README.md ├── census_analysis ├── dags │ ├── census_pipeline.py │ └── snippets.py ├── data │ └── raw │ │ ├── acs_data.csv.gz │ │ └── acs_data.dta.gz └── src │ ├── analysis.py │ ├── clean_data.py │ ├── get_data.py │ ├── solutions-Analysis.ipynb │ └── solutions-Data_Prep.ipynb ├── dag_example └── simple_dag.py ├── deployments └── jupyterhub-cluster │ ├── config │ └── config.yaml │ └── image │ └── Dockerfile ├── docs ├── Makefile ├── make.bat └── source │ ├── _build │ └── html │ │ ├── _static │ │ └── uses.png │ │ ├── about.html │ │ ├── airflow-intro.html │ │ ├── first-airflow.html │ │ ├── genindex.html │ │ ├── index.html │ │ ├── objects.inv │ │ ├── pipelines.html │ │ ├── search.html │ │ ├── searchindex.js │ │ └── setup.html │ ├── _static │ ├── 12.png │ ├── 4.jpg │ ├── DAG.png │ ├── GUI.png │ ├── airflow-logo.jpeg │ ├── airflow.png │ ├── architecture.png │ ├── automate.png │ ├── automation1.jpg │ ├── azure.png │ ├── connection.png │ ├── custom.css │ ├── dag-time.png │ ├── datapyramid.png │ ├── gooddata.png │ ├── gooddata1.png │ ├── luigi.png │ ├── mssignin.png │ ├── pipeline1.png │ ├── python.png │ ├── twitter1.png │ ├── twitter2.png │ ├── twitter3.png │ └── uses.png │ ├── _templates │ └── sidebarlogo.html │ ├── about.md │ ├── airflow-intro.md │ ├── conf.py │ ├── first-airflow.md │ ├── index.rst │ ├── pipelines.md │ └── setup.rst ├── environment.yml ├── extra_tfx_example ├── dags │ ├── taxi_pipeline.py │ └── taxi_utils.py ├── data │ └── taxi_data │ │ └── data.csv └── setup │ ├── chicago_data │ ├── taxi_pipeline_simple.py │ └── taxi_utils.py │ ├── reset_env.sh │ └── setup.sh └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .vscode/ 3 | 4 | *.azcli 5 | 6 | deployments/jupyterhub-cluster/secrets/ 7 | 8 | 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | local_settings.py 69 | db.sqlite3 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | source/_build/ 81 | 82 | # PyBuilder 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # celery beat schedule file 103 | celerybeat-schedule 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | \.DS_Store 135 | 136 | docs/\.doctrees/ 137 | 138 | census_analysis/data/interim/ 139 | 140 | census_analysis/data/raw/counties/ 141 | 142 | docs/source/_build 143 | 144 | deployments/* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Tania Allard 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Airflow tutorials with open data sets 2 | 3 | 4 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/trallard/opendata-airflow-tutorial/master) 5 | 6 | 7 | This repo contains a tutorial on Airflow using census data and the Chicago taxi dataset. 8 | 9 | For a detailed overview of the requirements, setup and contents visit the docs URL: 10 | 11 | 12 | _Note_: this is still much in progress and I plan to add more pipelines, use cases and a how-to deploy to Azure Kubernetes services. 
13 | 14 | -------------------------------------------------------------------------------- /census_analysis/dags/census_pipeline.py: -------------------------------------------------------------------------------- 1 | """Airflow dag to demonstrate a simple analysis pipeline""" 2 | 3 | import io 4 | import os 5 | from datetime import datetime 6 | from datetime import timedelta 7 | from pathlib import Path 8 | from zipfile import ZipFile 9 | 10 | import requests 11 | from airflow import DAG 12 | from airflow.operators.email_operator import EmailOperator 13 | from airflow.operators.python_operator import PythonOperator 14 | 15 | _dags_root = os.path.join(os.environ["HOME"], "airflow") 16 | _data_root = os.path.join(_dags_root, "data/raw") 17 | 18 | # Airflow-specific configs; these will be passed directly to airflow 19 | default_args = { 20 | "owner": "admin", 21 | "depends_on_past": False, 22 | "start_date": datetime.now() - timedelta(days=5), 23 | "retries": 1, 24 | "retry_delay": timedelta(minutes=2), 25 | "email_on_failure": False, 26 | } 27 | 28 | # -------------- 29 | # DAG methods 30 | # -------------- 31 | 32 | 33 | def collect_data(): 34 | url = "https://www2.census.gov/geo/tiger/TIGER2018/COUNTY/tl_2018_us_county.zip" 35 | site = requests.get(url) 36 | 37 | z = ZipFile(io.BytesIO(site.content)) 38 | z.extractall(_data_root) 39 | 40 | print("Data collected") 41 | 42 | 43 | # --------------------- 44 | # DAG implementation 45 | # --------------------- 46 | 47 | dag = DAG( 48 | "census_pipeline", 49 | default_args=default_args, 50 | schedule_interval="@daily", 51 | catchup=False, 52 | ) 53 | 54 | 55 | t1 = PythonOperator(task_id="collect_data", python_callable=collect_data(), dag=dag) 56 | -------------------------------------------------------------------------------- /census_analysis/dags/snippets.py: -------------------------------------------------------------------------------- 1 | from airflow.operators.email_operator import EmailOperator 2 | from datetime import timedelta, datetime 3 | 4 | email_task = EmailOperator( 5 | to="some@email.com", 6 | task_id="email_task", 7 | subject="Templated Subject: start_date {{ ds }}", 8 | params={"content1": "random"}, 9 | html_content="Templated Content: content1 - {{ params.content1 }} task_key - {{ task_instance_key_str }} test_mode - {{ test_mode }} task_owner - {{ task.owner}} hostname - {{ ti.hostname }}", 10 | dag=dag, 11 | ) 12 | 13 | # run 14 | 15 | airflow test dag_name email_task 16 | 17 | 18 | # Adding params 19 | 20 | # You can pass `params` dict to DAG object 21 | default_args = { 22 | 'owner': 'airflow', 23 | 'depends_on_past': False, 24 | 'start_date': airflow.utils.dates.days_ago(2), 25 | } 26 | 27 | dag = DAG( 28 | dag_id='airflow_tutorial_2', 29 | default_args=default_args, 30 | schedule_interval=None, 31 | params={ 32 | "param1": "value1", 33 | "param2": "value2" 34 | } 35 | ) 36 | 37 | bash = BashOperator( 38 | task_id='bash', 39 | bash_command='echo {{ params.param1 }}', # Output: value1 40 | dag=dag 41 | ) 42 | 43 | 44 | # accessing sensitive data in connections 45 | # install pip install apache-airflow[crypto] 46 | 47 | from airflow.hooks.base_hook import BaseHook 48 | slack_token = BaseHook.get_connection('slack').password 49 | 50 | 51 | # accesing variables 52 | from airflow.models import Variable 53 | 54 | # Common (Not-so-nice way) 55 | # 3 DB connections when the file is parsed 56 | var1 = Variable.get("var1") 57 | var2 = Variable.get("var2") 58 | var3 = Variable.get("var3") 59 | 60 | # Recommended Way 61 | # Just 
1 Database call 62 | dag_config = Variable.get("dag1_config", deserialize_json=True) 63 | dag_config["var1"] 64 | dag_config["var2"] 65 | dag_config["var3"] 66 | 67 | # You can directly use it Templated arguments {{ var.json.my_var.path }} 68 | bash_task = BashOperator( 69 | task_id="bash_task", 70 | bash_command='{{ var.json.dag1_config.var1 }} ', 71 | dag=dag, 72 | ) 73 | 74 | # macros reference 75 | 76 | # https://airflow.apache.org/macros.html 77 | 78 | { 79 | 'dag': task.dag, 80 | 'ds': ds, 81 | 'next_ds': next_ds, 82 | 'next_ds_nodash': next_ds_nodash, 83 | 'prev_ds': prev_ds, 84 | 'prev_ds_nodash': prev_ds_nodash, 85 | 'ds_nodash': ds_nodash, 86 | 'ts': ts, 87 | 'ts_nodash': ts_nodash, 88 | 'ts_nodash_with_tz': ts_nodash_with_tz, 89 | 'yesterday_ds': yesterday_ds, 90 | 'yesterday_ds_nodash': yesterday_ds_nodash, 91 | 'tomorrow_ds': tomorrow_ds, 92 | 'tomorrow_ds_nodash': tomorrow_ds_nodash, 93 | 'END_DATE': ds, 94 | 'end_date': ds, 95 | 'dag_run': dag_run, 96 | 'run_id': run_id, 97 | 'execution_date': self.execution_date, 98 | 'prev_execution_date': prev_execution_date, 99 | 'next_execution_date': next_execution_date, 100 | 'latest_date': ds, 101 | 'macros': macros, 102 | 'params': params, 103 | 'tables': tables, 104 | 'task': task, 105 | 'task_instance': self, 106 | 'ti': self, 107 | 'task_instance_key_str': ti_key_str, 108 | 'conf': configuration, 109 | 'test_mode': self.test_mode, 110 | 'var': { 111 | 'value': VariableAccessor(), 112 | 'json': VariableJsonAccessor() 113 | }, 114 | 'inlets': task.inlets, 115 | 'outlets': task.outlets, 116 | } 117 | 118 | # dynamic dags 119 | 120 | # Using DummyOperator 121 | a = [] 122 | for i in range(0,10): 123 | a.append(DummyOperator( 124 | task_id='Component'+str(i), 125 | dag=dag)) 126 | if i != 0: 127 | a[i-1] >> a[i] 128 | 129 | # From a List 130 | sample_list = ["val1", "val2", "val3"] 131 | tasks_list = [] 132 | for index, value in enumerate(sample_list): 133 | tasks_list.append(DummyOperator( 134 | task_id='Component'+str(index), 135 | dag=dag)) 136 | if index != 0: 137 | tasks_list[index-1] >> tasks_list[index] 138 | 139 | # database 140 | 141 | airflow initdb # first time only 142 | 143 | airflow upgradedb # apply missing migrations -------------------------------------------------------------------------------- /census_analysis/data/raw/acs_data.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/census_analysis/data/raw/acs_data.csv.gz -------------------------------------------------------------------------------- /census_analysis/data/raw/acs_data.dta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/census_analysis/data/raw/acs_data.dta.gz -------------------------------------------------------------------------------- /census_analysis/src/analysis.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | from datetime import datetime as dt 4 | from pathlib import Path 5 | 6 | import pandas as pd 7 | 8 | # data folder and paths 9 | RAW_DATA_PATH = Path("../data/raw/") 10 | INTERIM_DATA_PATH = Path("../data/interim/") 11 | PROCESSED_DATA_PATH = Path("../data/processed/") 12 | FINAL_DATA_PATH = Path("../data/final/") 13 | 14 | 15 | # analysis methods 16 | 17 | 18 | def load_data(date): 
19 | data = pd.read_stata(INTERIM_DATA_PATH / f"working_data-{date}.dta") 20 | 21 | return data 22 | 23 | 24 | def drop_rows(data): 25 | """Drop observations where pernum does not equal 1 26 | """ 27 | mask_pernum = data["pernum"] == 1 28 | return data[mask_pernum].copy() 29 | 30 | 31 | def define_groups(data): 32 | mask_latino = data["hispan"] != "not hispanic" 33 | mask_white = (data["hispan"] == "not hispanic") & (data["race"] == "white") 34 | mask_black = (data["hispan"] == "not hispanic") & ( 35 | data["race"].str.contains("black") 36 | ) 37 | mask_native = (data["hispan"] == "not hispanic") & ( 38 | data["race"] == "american indian or alaska native" 39 | ) 40 | mask_API = (data["hispan"] == "not hispanic") & ( 41 | (data["race"] >= "chinese") 42 | & (data["race"] <= "other asian or pacific islander") 43 | ) 44 | mask_other = (data["hispan"] == "not hispanic") & ( 45 | data["race"] >= "other race, nec" 46 | ) 47 | 48 | data.loc[mask_latino, "racen"] = "Latino" 49 | data.loc[mask_white, "racen"] = "White" 50 | data.loc[mask_black, "racen"] = "Black/African-American" 51 | data.loc[mask_native, "racen"] = "Am. Indian / Alaska Native" 52 | data.loc[mask_API, "racen"] = "Asian / Pacific Islander" 53 | data.loc[mask_other, "racen"] = "other" 54 | 55 | return data 56 | 57 | 58 | def analyse_data(data): 59 | cihispeed_by_racen = data.groupby(["racen", "cihispeed"])[["hhwt"]].sum() 60 | households_by_racen = data.groupby("racen")[["hhwt"]].sum() 61 | 62 | shares_cihispeed_by_racen = cihispeed_by_racen / households_by_racen 63 | shares_cihispeed_by_racen = shares_cihispeed_by_racen.reset_index() 64 | 65 | mask_yes_cihispeed = ( 66 | shares_cihispeed_by_racen["cihispeed"] 67 | == "yes (cable modem, fiber optic or dsl service)" 68 | ) 69 | 70 | return shares_cihispeed_by_racen[mask_yes_cihispeed] 71 | 72 | 73 | if __name__ == "__main__": 74 | 75 | date = dt.today().strftime("%d-%b-%y") 76 | raw_data = load_data(date) 77 | data = drop_rows(raw_data) 78 | data_groups = define_groups(data) 79 | speed_data = analyse_data(data_groups) 80 | speed_data.to_csv(f"{FINAL_DATA_PATH}/{date}-internet-speed.csv", "r") 81 | -------------------------------------------------------------------------------- /census_analysis/src/clean_data.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | from datetime import datetime as dt 4 | from pathlib import Path 5 | 6 | import pandas as pd 7 | 8 | # we will use today date 9 | today = dt.today().strftime("%d-%b-%y") 10 | 11 | 12 | # data folder and paths 13 | RAW_DATA_PATH = Path("../data/raw/") 14 | INTERIM_DATA_PATH = Path("../data/interim/") 15 | PROCESSED_DATA_PATH = Path("../data/processed/") 16 | FINAL_DATA_PATH = Path("../data/final/") 17 | 18 | 19 | # supporting functions 20 | # ------------------------ 21 | 22 | 23 | def dir_exists(dir_path): 24 | if not os.path.exists(dir_path): 25 | os.makedirs(dir_path) 26 | else: 27 | print(f"{dir_path} found, skipping") 28 | 29 | 30 | def load_data(data_path): 31 | """load data into a pd dataframe 32 | 33 | Args: 34 | data_path (path): path to the gzipped data 35 | """ 36 | with gzip.open(RAW_DATA_PATH / "acs_data.dta.gz") as file: 37 | data = pd.read_stata(file) 38 | return data 39 | 40 | 41 | def state_mask(state, df): 42 | """Used to select only one state 43 | 44 | Args: 45 | state (string): state to be masked 46 | 47 | Returns: 48 | df: subset of the data 49 | """ 50 | mask_state = df["statefip"] == f"{state}" 51 | return df[mask_state].copy() 52 | 53 | 54 | 
def clean_masked(df): 55 | df.drop(columns=["related", "raced", "hispand"], inplace=True) 56 | mask_household = (df["gq"] == "households under 1970 definition") | ( 57 | df["gq"] == "additional households under 1990 definition" 58 | ) 59 | return df[mask_household].copy() 60 | 61 | 62 | def save_df(data_path, df): 63 | df.to_stata(f"{data_path}/state_data-{today}.dta", write_index=False) 64 | 65 | 66 | if __name__ == "__main__": 67 | state = "ohio" 68 | raw_data = load_data(RAW_DATA_PATH) 69 | state_data = state_mask(state, raw_data) 70 | clean_state = clean_masked(state_data) 71 | save_df(INTERIM_DATA_PATH, clean_state) 72 | 73 | print(f"Completed cleaning for {state}") 74 | -------------------------------------------------------------------------------- /census_analysis/src/get_data.py: -------------------------------------------------------------------------------- 1 | import io 2 | from pathlib import Path 3 | from zipfile import ZipFile 4 | 5 | import requests 6 | 7 | RAW_DATA_PATH = Path("data/raw/counties/") 8 | 9 | url = "https://www2.census.gov/geo/tiger/TIGER2018/COUNTY/tl_2018_us_county.zip" 10 | site = requests.get(url) 11 | 12 | z = ZipFile(io.BytesIO(site.content)) 13 | z.extractall(RAW_DATA_PATH) 14 | -------------------------------------------------------------------------------- /dag_example/simple_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | 7 | 8 | def print_hello(): 9 | return "Hello world!" 10 | 11 | 12 | default_args = { 13 | "owner": "airflow", 14 | "depends_on_past": False, 15 | "start_date": datetime(2019, 8, 30), 16 | "email": ["airflow@example.com"], 17 | "email_on_failure": False, 18 | "email_on_retry": False, 19 | "retries": 1, 20 | "retry_delay": timedelta(minutes=2), 21 | } 22 | 23 | dag = DAG( 24 | "hello_world", 25 | description="Simple tutorial DAG", 26 | schedule_interval="0 12 * * *", 27 | default_args=default_args, 28 | catchup=False, 29 | ) 30 | 31 | t1 = DummyOperator(task_id="dummy_task", retries=3, dag=dag) 32 | 33 | t2 = PythonOperator(task_id="hello_task", python_callable=print_hello, dag=dag) 34 | 35 | # sets downstream for t1 36 | t1 >> t2 37 | 38 | # equivalent 39 | # t2.set_upstream(t1) 40 | -------------------------------------------------------------------------------- /deployments/jupyterhub-cluster/config/config.yaml: -------------------------------------------------------------------------------- 1 | singleuser: 2 | defaultUrl: "/lab" 3 | memory: 4 | guarantee: 512M 5 | limit: 1G 6 | image: 7 | name: trallard/jupyter-rserver 8 | tag: 1.4 9 | lifecycleHooks: 10 | postStart: 11 | exec: 12 | command: ["gitpuller", "https://github.com/pyladies-nwuk/Python_meets_R", "master", "reticulate-ws"] 13 | 14 | hub: 15 | extraConfig: 16 | jupyterlab: | 17 | c.Spawner.cmd = ['jupyter-labhub'] 18 | 19 | # prepare added nodes for arriving users 20 | prepuller: 21 | continuous: 22 | enabled: true -------------------------------------------------------------------------------- /deployments/jupyterhub-cluster/image/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jupyter/scipy-notebook 2 | 3 | LABEL maintainer="Tania Allard trallard[at]bitsandchips.me" 4 | 5 | ENV SLUGIFY_USES_TEXT_UNIDECODE yes 6 | ARG AIRFLOW_USER_HOME=/home/jovyan/work 7 | 8 | 9 | COPY 
requirements.txt /tmp/requirements.txt 10 | RUN pip install --no-cache-dir -r /tmp/requirements.txt 11 | 12 | EXPOSE 8080 8888 5555 8793 -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/_build/html/_static/uses.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_build/html/_static/uses.png -------------------------------------------------------------------------------- /docs/source/_build/html/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | About the workshop — EuroScipy tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 |
28 |
29 |
30 | 37 | 38 | 39 |
40 | 41 |
42 |

About the workshop

43 |

We will be taking a look at the basic concepts of data pipelines as well as practical use cases using Python.

44 |
45 |

About you:

46 |
    47 |
  • Some experience using the command line

  • 48 |
  • Intermediate Python knowledge / use

  • 49 |
  • Be able to apply what we learn and adapt it to your own use cases

  • 50 |
  • Interested in data and systems

  • 51 |
  • Aspiring or current data engineers

  • 52 |
  • Some knowledge about systems and databases (enough to be dangerous)

  • 53 |
54 |
55 |
56 |

Our focus for the day

57 |
    58 |
  • A greater understanding of how to build data pipelines using the Python toolset

  • 59 |
  • Focus on concepts

  • 60 |
  • Apply knowledge with each library

  • 61 |
  • Will give you the building blocks

  • 62 |
63 |
64 |
65 |

Keeping on track

66 |

You will find 🚦 markers across the tutorial examples. We will use these to gauge how folks are doing during the workshop (if following along in person). Place your post-it as follows:

68 |

🚦 Purple post-it: all good, the task has been completed

69 |

🚦 Orange post-it: I need extra time or help with the task at hand

70 |
71 |
72 | 73 | 74 |
75 | 82 | 83 |
84 |
85 | 146 |
147 |
148 | 159 | 160 | 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /docs/source/_build/html/airflow-intro.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Airflow basics — EuroScipy tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
30 |
31 |
32 | 47 | 48 | 49 |
50 | 51 |
52 |

Airflow basics

53 |
54 |

What is Airflow?

55 |

airflow logo

56 |

Airflow is a Workflow engine which means:

57 |
    58 |
  • Manages scheduling and running jobs and data pipelines

  • 59 |
  • Ensures jobs are ordered correctly based on dependencies

  • 60 |
  • Manages the allocation of scarce resources

  • 61 |
  • Provides mechanisms for tracking the state of jobs and recovering from failure

  • 62 |
63 |

It is highly versatile and can be used across many domains: _images/uses.png

65 |
66 |
67 |

Basic Airflow concepts

68 |
    69 |
  • Task: a defined unit of work (these are called operators in Airflow)

  • 70 |
  • Task instance: an individual run of a single task. Task instances also have an indicative state, which could be “running”, “success”, “failed”, “skipped”, “up for retry”, etc.

  • 71 |
  • DAG: Directed Acyclic Graph, a set of tasks with explicit execution order, beginning, and end

  • 73 |
  • DAG run: individual execution/run of a DAG

  • 74 |
75 |

Debunking the DAG

76 |

The vertices and edges (the arrows linking the nodes) have an order and a direction associated with them.

77 |

_images/DAG.png

78 |

Each node in a DAG corresponds to a task, which in turn represents some sort of data processing. For example:

79 |

Node A could be the code for pulling data from an API, node B the code for anonymizing that data, node C the code for checking that there are no duplicate records, and so on.

80 |

These ‘pipelines’ are acyclic since they need a point of completion.

81 |

Dependencies

82 |

Each of the vertices has a particular direction that shows the relationship between certain nodes. For example, we can only anonymize data once this has been pulled out from the API.
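As a sketch of how such dependencies look in code (the task names and callables here are made up for illustration and are not part of the tutorial repo):

from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG("dependencies_example", start_date=datetime(2019, 1, 1), schedule_interval=None)

def pull_data():         # node A: pull data from the API (placeholder)
    pass

def anonymize_data():    # node B: anonymize the pulled data (placeholder)
    pass

def check_duplicates():  # node C: check for duplicate records (placeholder)
    pass

pull = PythonOperator(task_id="pull_data", python_callable=pull_data, dag=dag)
anonymize = PythonOperator(task_id="anonymize_data", python_callable=anonymize_data, dag=dag)
check = PythonOperator(task_id="check_duplicates", python_callable=check_duplicates, dag=dag)

# anonymize only runs after pull, and check only after anonymize
pull >> anonymize >> check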

83 |
84 |
85 |

Idempotency

86 |

This is one of the most important characteristics of good ETL architectures.

87 |

When we say that something is idempotent it means it will produce the same result regardless of how many times this is run (i.e. the results are reproducible).

88 |

Reproducibility is particularly important in data-intensive environments as this ensures that the same inputs will always return the same outputs.
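A sketch of what idempotency can look like in practice (the build_counts_for helper and the output path are hypothetical; dag and PythonOperator are assumed from the other snippets): the task keys its output on the execution date, so re-running the same task instance rewrites the same file instead of appending or duplicating data.

def export_daily_counts(ds, **kwargs):
    # 'ds' is the execution date Airflow injects; using it in the output path
    # makes the task idempotent: the same run always produces the same file
    df = build_counts_for(ds)  # hypothetical helper returning a DataFrame
    df.to_csv(f"/data/processed/counts-{ds}.csv", index=False)

export = PythonOperator(
    task_id="export_daily_counts",
    python_callable=export_daily_counts,
    provide_context=True,  # Airflow 1.10: pass the context so 'ds' is available
    dag=dag,
)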

89 |
90 |
91 |

Airflow components

92 |

_images/architecture.png

93 |

There are 4 main components to Apache Airflow:

94 |
95 |

Web server

96 |

The GUI. Under the hood this is a Flask app where you can track the status of your jobs and read logs from a remote file store (e.g. Azure Blob Storage).

97 |
98 |
99 |

Scheduler

100 |

This component is responsible for scheduling jobs. It is a multithreaded Python process that uses the DAG object to decide what tasks need to be run, when, and where.

101 |

The task state is retrieved and updated from the database accordingly. The web server then uses these saved states to display job information.

102 |
103 |
104 |

Executor

105 |

The mechanism that gets the tasks done.

106 |
107 |
108 |

Metadata database

109 |
    110 |
  • Powers how the other components interact

  • 111 |
  • Stores the Airflow states

  • 112 |
  • All processes read and write from here

  • 113 |
114 |
115 |
116 |
117 |

Workflow as code

118 |

One of the main advantages of using a workflow system like Airflow is that everything is code, which makes your workflows maintainable, versionable, testable, and collaborative.
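Because DAGs are plain Python, they can be tested like any other code; a minimal sketch of a DAG integrity check (assuming pytest is installed and your DAG files live in dags/):

from airflow.models import DagBag

def test_dags_import_without_errors():
    # parsing every file in dags/ should not record any import errors
    dag_bag = DagBag(dag_folder="dags/", include_examples=False)
    assert dag_bag.import_errors == {}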

119 |

Thus your workflows become more explicit and maintainable (atomic tasks).

120 |

Not only is your code dynamic but so is your infrastructure.
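Dynamic here means the workflow itself can be generated by code; a minimal sketch mirroring the loop in census_analysis/dags/snippets.py (it assumes a dag object is already defined):

from airflow.operators.dummy_operator import DummyOperator

previous = None
for i in range(5):
    task = DummyOperator(task_id=f"component_{i}", dag=dag)
    if previous is not None:
        previous >> task  # chain each generated task to the previous one
    previous = task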

121 |
122 |

Defining tasks

123 |

Tasks are defined based on the abstraction of Operators (see Airflow docs here) which represent a single idempotent task.

124 |

The best practice is to have atomic operators (i.e. can stand on their own and do not need to share resources among them).

125 |

You can choose among:

126 |
    127 |
  • BashOperator

  • 128 |
  • PythonOperator

  • 129 |
  • EmailOperator

  • 130 |
  • SimpleHttpOperator

  • 131 |
  • MySqlOperator (and other DB)

  • 132 |
133 |

Examples:

134 |
t1 = BashOperator(task_id='print_date',
135 |     bash_command='date',
136 |     dag=dag) 
137 | 
138 |
139 |
def print_context(ds, **kwargs):
140 |     pprint(kwargs)
141 |     print(ds)
142 |     return 'Whatever you return gets printed in the logs'
143 | 
144 | 
145 | run_this = PythonOperator(
146 |     task_id='print_the_context',
147 |     provide_context=True,
148 |     python_callable=print_context,
149 |     dag=dag,
150 | )
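An EmailOperator task follows the same pattern (a sketch along the lines of census_analysis/dags/snippets.py; the address is a placeholder and sending requires an [smtp] section configured in airflow.cfg):

from airflow.operators.email_operator import EmailOperator

email_task = EmailOperator(
    task_id="email_task",
    to="some@email.com",                       # placeholder address
    subject="Run report for {{ ds }}",          # templated with the execution date
    html_content="All tasks finished for {{ ds }}",
    dag=dag,
)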
151 | 
152 |
153 |
154 |
155 |
156 |

Comparing Luigi and Airflow

157 |
158 |

Luigi

159 |
    160 |
  • Created at Spotify (named after the plumber)

  • 161 |
  • Open sourced in late 2012

  • 162 |
  • GNU make for data

  • 163 |
164 |
165 |
166 |

Airflow

167 |
    168 |
  • Airbnb data team

  • 169 |
  • Open-sourced mid-2015

  • 170 |
  • Apache incubator mid-2016

  • 171 |
  • ETL pipelines

  • 172 |
173 |
174 |
175 |

Similarities

176 |
    177 |
  • Python open source projects for data pipelines

  • 178 |
  • Integrate with a number of sources (databases, filesystems)

  • 179 |
  • Tracking failure, retries, success

  • 180 |
  • Ability to identify the dependencies and execution

  • 181 |
182 |
183 |
184 |

Differences

185 |
    186 |
  • Scheduler support: Airflow has a built-in scheduler

  • 187 |
  • Scalability: Airflow has had stability issues in the past

  • 188 |
  • Web interfaces

  • 189 |
190 |

_images/luigi.png

191 |

_images/airflow.png

192 |

| Airflow | Luigi |
| --- | --- |
| Tasks are defined by a dag_id and a user-defined name | Tasks are defined by task name and parameters |
| Task retries based on definitions | Decides whether a task is done via its input/output |
| Task code is sent to the workers | Workers are started by the Python file where the tasks are defined |
| Centralized scheduler (Celery spins up workers) | Centralized scheduler in charge of deduplicating and sending tasks (Tornado based) |

198 |
199 |
200 |
201 | 202 | 203 |
204 | 219 | 220 |
221 |
222 | 319 |
320 |
321 | 332 | 333 | 334 | 335 | 336 | 337 | -------------------------------------------------------------------------------- /docs/source/_build/html/first-airflow.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Airflow 101: working locally and familiarise with the tool — EuroScipy tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 |
28 |
29 |
30 | 37 | 38 | 39 |
40 | 41 |
42 |

Airflow 101: working locally and familiarise with the tool

43 |
44 |

Pre-requisites

45 |

The following prerequisites are needed:

46 |
    47 |
  • Libraries detailed in the Setting up section (either via conda or pipenv)

  • 48 |
  • MySQL installed

  • 49 |
  • text editor

  • 50 |
  • command line

  • 51 |
52 |
53 |
54 |

Getting your environment up and running

55 |

If you followed the instructions you should have Airflow installed as well as the rest of the packages we will be using.

56 |

So let’s get our environment up and running:

57 |

If you are using conda start your environment via:

58 |
$ source activate airflow-env
 59 | 
60 |
61 |

If using pipenv then:

62 |
$ pipenv shell
 63 | 
64 |
65 |

This will start a shell within the virtual environment; to leave it, type exit.

66 |
67 |
68 |

Starting Airflow locally

69 |

Airflow home lives in ~/airflow by default, but you can change the location before installing airflow. You first need to set the AIRFLOW_HOME environment variable and then install airflow. For example, using pip:

70 |
export AIRFLOW_HOME=~/mydir/airflow
 71 | 
 72 | # install from PyPI using pip
 73 | pip install apache-airflow
 74 | 
75 |
76 |

Once you have completed the installation you should see something like this in the airflow directory (wherever it lives for you):

77 |
drwxr-xr-x    - myuser 18 Apr 14:02 .
 78 | .rw-r--r--  26k myuser 18 Apr 14:02 ├── airflow.cfg
 79 | drwxr-xr-x    - myuser 18 Apr 14:02 ├── logs
 80 | drwxr-xr-x    - myuser 18 Apr 14:02 │  └── scheduler
 81 | drwxr-xr-x    - myuser 18 Apr 14:02 │     ├── 2019-04-18
 82 | lrwxr-xr-x   46 myuser 18 Apr 14:02 │     └── latest -> /Users/myuser/airflow/logs/scheduler/2019-04-18
 83 | .rw-r--r-- 2.5k myuser 18 Apr 14:02 └── unittests.cfg
 84 | 
85 |
86 |

We need to create a local dag folder:

87 |
mkdir ~/airflow/dags
 88 | 
89 |
90 |

As your project evolves, your directory will look something like this:

91 |
airflow                  # the root directory.
 92 | ├── dags                 # root folder for all dags. files inside folders are not searched for dags.
 93 | │   ├── my_dag.py, # my dag (definitions of tasks/operators) including precedence.
 94 | │   └── ...
 95 | ├── logs                 # logs for the various tasks that are run
 96 | │   └── my_dag           # DAG specific logs
 97 | │   │   ├── src1_s3      # folder for task-specific logs (log files are created by date of a run)
 98 | │   │   ├── src2_hdfs
 99 | │   │   ├── src3_s3
100 | │   │   └── spark_task_etl
101 | ├── airflow.db           # SQLite database used by Airflow internally to track the status of each DAG.
102 | ├── airflow.cfg          # global configuration for Airflow (this can be overridden by config inside the file.)
103 | └── ...
104 | 
105 |
106 |
107 |
108 |

Prepare your database

109 |

As we mentioned before, Airflow uses a database to keep track of the tasks and their statuses, so it is critical to have one set up.

110 |

To start the default database we can run airflow initdb. This will initialize your database via alembic so that it matches the latest Airflow release.

112 |

The default database used is sqlite which means you cannot parallelize tasks using this database. Since we have MySQL and MySQL client installed we will set them up so that we can use them with airflow.

113 |

🚦Create an airflow database

114 |

From the command line:

115 |
mysql -u root -p
116 | mysql> CREATE DATABASE airflow CHARACTER SET utf8 COLLATE utf8_unicode_ci;
117 | mysql> GRANT ALL PRIVILEGES ON airflow.* To 'airflow'@'localhost';
118 | mysql> FLUSH PRIVILEGES;
119 | 
120 |
121 |

and initialize the database:

122 |
airflow initdb
123 | 
124 |
125 |

Notice that this will fail with the default airflow.cfg

126 |
127 |
128 |

Update your local configuration

129 |

Open your airflow configuration file ~/airflow/airflow.cfg and make the following changes:

130 |
executor = CeleryExecutor
131 | 
132 |
133 |
# http://docs.celeryproject.org/en/latest/userguide/configuration.html#broker-settings
134 | # needs rabbitmq running
135 | broker_url = amqp://guest:guest@127.0.0.1/
136 | 
137 | 
138 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings
139 | result_backend = db+mysql://airflow:airflow@localhost:3306/airflow
140 | 
141 | sql_alchemy_conn = mysql://airflow:python2019@localhost:3306/airflow
142 | 
143 |
144 |

Here we are replacing the default executor (SequentialExecutor) with the CeleryExecutor so that tasks can run in parallel. We also replace the default sqlite database with our newly created airflow database.

146 |

Now we can initialize the database:

147 |
airflow initdb
148 | 
149 |
150 |

Let’s now start the web server locally:

151 |
airflow webserver -p 8080
152 | 
153 |
154 |

We can head over to http://localhost:8080 now and you will see that there are a number of example DAGs already there.

155 |

🚦 Take some time to familiarise yourself with the UI and get your local instance set up

156 |

Now let's have a look at the connections (http://localhost:8080/admin/connection/): go to Admin > Connections. You should see a number of connections available. For this tutorial, we will use some of these connections, including mysql.
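Connections can also be read from task code through hooks, so credentials stay in the metadata database rather than in the DAG file; a minimal sketch (assuming the default mysql_default connection shown in the UI is present):

from airflow.hooks.base_hook import BaseHook

conn = BaseHook.get_connection("mysql_default")
print(conn.host, conn.schema, conn.login)  # the password is available as conn.password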

157 |
163 |

Commands

164 |

Let us go over some of the commands. Back on your command line:

165 |
airflow list_dags
166 | 
167 |
168 |

We can list the tasks within a DAG in a tree view:

169 |
airflow list_tasks tutorial --tree
170 | 
171 |
172 |

We can test the DAGs too, but we will need to set a date parameter so that this executes:

173 |
airflow test tutorial print_date 2019-05-01
174 | 
175 |
176 |

(note that you cannot use a future date or you will get an error)

177 |
airflow test tutorial templated 2019-05-01
178 | 
179 |
180 |

Runs started with the test command are not saved in the database.
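For instance, once the hello_world DAG from the next section exists, a single task can be exercised in isolation (the date is arbitrary, as long as it is not in the future):

airflow test hello_world hello_task 2019-05-01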

181 |

Now let’s start the scheduler:

182 |
airflow scheduler
183 | 
184 |
185 |

Behind the scenes, the scheduler monitors the DAG folder and stays in sync with the DAG objects it contains. The Airflow scheduler is designed to run as a service in an Airflow production environment.

186 |

Now with the scheduler up and running we can trigger a task instance:

187 |
$ airflow run example_bash_operator runme_0 2015-01-01
188 | 
189 |
190 |

This will be stored in the database and you can see the status change straight away.

191 |

What would happen for example if we wanted to run or trigger the tutorial task? 🤔

192 |

Let’s try from the CLI and see what happens.

193 |
airflow trigger_dag tutorial
194 | 
195 |
196 |
197 |
198 |
199 |

Writing your first DAG

200 |

Let's create our first simple DAG. Inside the dags directory (~/airflow/dags) create a simple_dag.py file.

202 |
from datetime import datetime, timedelta
203 | from airflow import DAG
204 | from airflow.operators.dummy_operator import DummyOperator
205 | from airflow.operators.python_operator import PythonOperator
206 | 
207 | 
208 | def print_hello():
209 |     return "Hello world!"
210 | 
211 | 
212 | default_args = {
213 |     "owner": "airflow",
214 |     "depends_on_past": False,
215 |     "start_date": datetime(2019, 4, 30),
216 |     "email": ["airflow@example.com"],
217 |     "email_on_failure": False,
218 |     "email_on_retry": False,
219 |     "retries": 1,
220 |     "retry_delay": timedelta(minutes=2),
221 | }
222 | 
223 | dag = DAG(
224 |     "hello_world",
225 |     description="Simple tutorial DAG",
226 |     schedule_interval="0 12 * * *",
227 |     default_args=default_args,
228 |     catchup=False,
229 | )
230 | 
231 | t1 = DummyOperator(task_id="dummy_task", retries=3, dag=dag)
232 | 
233 | t2 = PythonOperator(task_id="hello_task", python_callable=print_hello, dag=dag)
234 | 
235 | # sets downstream for t1
236 | t1 >> t2
237 | 
238 | # equivalent
239 | # t2.set_upstream(t1)
240 | 
241 |
242 |

If it is properly set up you should be able to see this straight away on your instance.

243 |
244 |

Now let’s create a DAG from the previous ETL pipeline (kind of)

245 |

All hands on - check the solutions
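One possible shape for that DAG, as a sketch rather than the actual solution file (it assumes the logic from census_analysis/src/get_data.py, clean_data.py and analysis.py has been wrapped into the three callables below, and that a dag object is defined as above):

from airflow.operators.python_operator import PythonOperator

# hypothetical wrappers around the census scripts
collect = PythonOperator(task_id="collect_data", python_callable=collect_data, dag=dag)
clean = PythonOperator(task_id="clean_data", python_callable=clean_census_data, dag=dag)
analyse = PythonOperator(task_id="analyse_data", python_callable=analyse_census_data, dag=dag)

collect >> clean >> analyse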

246 |
247 |
248 |
249 | 250 | 251 |
252 | 259 | 260 |
261 |
262 | 332 |
333 |
334 | 345 | 346 | 347 | 348 | 349 | 350 | -------------------------------------------------------------------------------- /docs/source/_build/html/genindex.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Index — EuroScipy tutorial documentation 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 |
29 |
30 |
31 | 38 | 39 | 40 |
41 | 42 | 43 |

Index

44 | 45 |
46 | 47 |
48 | 49 | 50 |
51 | 58 | 59 |
60 |
61 | 120 |
121 |
122 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /docs/source/_build/html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Welcome to the EuroScipy Airflow tutorial — EuroScipy tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 |
29 |
30 |
31 | 42 | 43 | 44 |
45 | 46 |
47 |

Welcome to the EuroScipy Airflow tutorial

48 |

This tutorial was originally developed for EuroScipy 2019.

49 |
50 |
51 |
52 |
53 |
54 |
55 |

About your facilitator

56 |

My name is Tania. I live in Manchester, UK, where I work as a Cloud Advocate for Microsoft.

58 |

Over the years, I have worked as a data engineer, machine learning engineer, and research software engineer. I love data-intensive environments and I am particularly interested in the tools and workflows to deliver robust, reproducible data insights.

62 |

If you have any questions or feedback about this tutorial, please file an issue using the following link: https://github.com/trallard/euroscipy-airflow/issues/new.

64 |

You can also contact me via the following channels:

65 | 70 |
71 |
72 |

Code of Conduct

73 |

All attendees of this workshop are expected to adhere to EuroScipy's Code of Conduct, in brief: Be open, considerate, and respectful.

76 |
77 |
78 |

License

79 |

The content in this workshop is licensed under CC-BY-SA 4.0, which means that you can use, remix, and re-distribute it so long as attribution to the original author (Tania Allard) is maintained.

82 |

The logo used here was designed by Ashley McNamara for the Microsoft Developer Advocates team use.

83 |
84 | 85 | 86 |
87 | 98 | 99 |
100 |
101 | 171 |
172 |
173 | 184 | 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /docs/source/_build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_build/html/objects.inv -------------------------------------------------------------------------------- /docs/source/_build/html/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Search — EuroScipy tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 |
33 |
34 |
35 | 42 | 43 | 44 |
45 | 46 |

Search

47 |
48 | 49 |

50 | Please activate JavaScript to enable the search 51 | functionality. 52 |

53 |
54 |

55 | From here you can search these documents. Enter your search 56 | words into the box below and click "search". Note that the search 57 | function will automatically search for all of the words. Pages 58 | containing fewer words won't appear in the result list. 59 |

60 |
61 | 62 | 63 | 64 |
65 | 66 |
67 | 68 |
69 | 70 |
71 | 78 | 79 |
80 |
81 | 130 |
131 |
132 | 140 | 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /docs/source/_build/html/searchindex.js: -------------------------------------------------------------------------------- 1 | Search.setIndex({docnames:["about","airflow-intro","first-airflow","index","pipelines","setup"],envversion:{"sphinx.domains.c":1,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":1,"sphinx.domains.javascript":1,"sphinx.domains.math":2,"sphinx.domains.python":1,"sphinx.domains.rst":1,"sphinx.domains.std":1,"sphinx.ext.intersphinx":1,sphinx:56},filenames:["about.md","airflow-intro.md","first-airflow.md","index.rst","pipelines.md","setup.rst"],objects:{},objnames:{},objtypes:{},terms:{"26k":2,"3f57fc15ded7dddddcc4e82fe137b58":5,"abstract":1,"break":4,"case":[0,4],"class":4,"default":[2,5],"export":[2,5],"function":4,"import":[1,2,4],"int":4,"long":[3,4],"new":[3,5],"null":4,"public":4,"return":[1,2,4],"true":[1,4],"try":[2,4,5],"while":5,And:4,For:[1,2,4,5],NOT:4,Not:1,One:[1,4],The:[1,2,3,4,5],Then:5,There:[1,4,5],These:1,USE:4,Will:0,__main__:4,__name__:4,abil:[1,4],abl:[0,2,5],about:[],access:4,access_token:4,access_token_secret:4,accordingli:[1,5],achiev:4,across:[0,1],activ:[2,5],actual:4,acycl:1,add:[4,5],added:5,addit:5,address:5,adher:3,admin:2,adopt:0,advanc:5,advantag:1,advoc:3,affili:4,after:[1,4,5],again:5,airbnb:1,airflow:[4,5],airflow_hom:2,airflowdb:4,alemb:2,alert:4,all:[0,1,2,3,4,5],all_tweet:4,allard:[3,5],alloc:1,allow:[4,5],along:[0,5],alreadi:[2,4,5],also:[1,2,3,4,5],alwai:[1,4],among:1,amqp:2,analyse_twitt:4,analysi:[4,5],analyt:4,ani:[3,4,5],anonym:1,anyon:5,anyth:4,apach:[1,2],api:[1,4,5],app:1,appli:0,applic:[4,5],approach:5,appropri:4,approv:[],apr:2,apt:5,architectur:1,arg:4,around:4,arrow:1,asap:5,ashlei:3,ask:[4,5],aspr:0,associ:1,assum:4,async:4,atom:[1,4,5],attach:5,attempt:4,attende:3,attribut:3,auth:4,authent:4,author:[3,4],auto_incr:4,automat:4,avail:2,awai:2,awar:4,azur:1,back:2,backend:2,bar:4,barchart_lang:4,base:[1,5],bash:5,bash_command:1,bashoper:1,bashrc:5,basic:[0,4,5],batch:4,becom:1,been:[0,1,5],befor:[2,4,5],beforehand:5,begin:[1,4],behind:2,being:4,below:[],best:[1,5],between:[1,4],bin:5,bit:[4,5],bitsandchip:[3,5],blobstorag:1,block:0,bore:4,both:5,brew:5,brief:3,broken:4,broker:2,broker_url:2,build:0,built:1,button:5,call:[1,4],can:[1,2,3,4,5],cannot:2,card:5,care:4,carefulli:5,catchup:2,celeri:1,celeryexecutor:2,celeryproject:2,cento:5,central:1,certain:1,cfg:[2,4],chain:4,chang:[2,4,5],channel:3,charact:[2,4],characterist:1,charg:1,check:[1,2,5],checkout:5,choos:[1,5],clean:4,clean_data:4,clean_df:4,cleanup:4,clear:4,clearli:4,cli:2,click:5,client:2,close:[4,5],cloud:3,code:[4,5],collabor:1,collat:[2,4],collect:5,collet:4,column:4,com:[2,3,4,5],come:5,command:[0,4,5],commit:[4,5],compil:5,complet:[0,1,2,4,5],complex:4,concept:0,conclud:4,conda:[2,5],conduct:5,confer:5,config:[2,4],config_fil:4,configpars:4,configur:5,confirm:5,connect:2,connect_db:4,connector:4,connecttwitt:4,consid:4,consider:3,consist:4,consumer_kei:4,consumer_secret:4,contact:3,contain:[2,4],content:[3,4],control:5,copi:[4,5],copyright:4,corpor:4,correct:[4,5],correctli:1,correspond:[1,4],could:1,count:4,countri:4,cppflag:5,creat:1,create_plot:4,create_t:4,created_at:4,creation:4,credit:5,critic:[2,4],csv:4,current:[0,4],cursor:4,customlisten:4,cwd:4,cycl:4,dag:1,dag_id:1,dagb:1,dai:[4,5],danger:0,dashboard:[],data:[0,1,3,5],databas:[0,5],datafram:4,dataset:4,date:[1,2],datetim:[2,4],dateutil:4,dbconnec
t:4,deactiv:5,deal:4,debian:5,debunk:1,decid:[1,5],dedupl:1,deeper:4,def:[1,2,4],default_arg:2,defin:4,definit:[1,2],delet:4,deliv:3,demand:4,depend:[1,4,5],depends_on_past:2,deploi:5,deposit:4,descript:2,design:[2,3,5],detail:[2,4,5],detect:4,determin:4,dev:5,devel:5,develop:3,dict:4,dictionari:4,differ:4,difficult:[4,5],direct:[1,4],directori:[2,5],disconnect:4,discuss:4,displai:1,distinct:4,distribut:3,doc:[1,2,4,5],docker:[],doe:4,doing:[0,5],dollar:5,domain:1,done:[1,4,5],down:5,download:[4,5],downstream:[2,4],drop:4,drwxr:2,dummy_oper:2,dummy_task:2,dummyoper:2,duplic:1,dure:5,dynam:1,each:[0,1,2,4],earlier:4,easi:4,easier:5,echo:5,edg:1,editor:2,either:[2,5],els:4,email:[2,5],email_on_failur:2,email_on_retri:2,emailoper:1,enabl:5,end:[1,4,5],engin:[0,1,3,4],enough:[0,4],ensur:[1,5],env:[2,5],enviro:3,environ:[1,4],equival:2,error:[2,4,5],especi:5,etc:1,etl:[1,5],eucipi:[],euroscipi:5,euroswcipi:[],event:4,eventu:4,everi:4,evolv:2,exampl:[0,1,2,4,5],example_bash_oper:2,except:4,execut:[1,2,4,5],executor:2,exist:[4,5],exit:[2,5],expect:3,expedit:5,experi:[0,5],explicit:1,extens:5,extra:0,extract:4,extrem:4,facilit:5,fail:[1,2,4],failur:[1,4],fals:[2,4],far:4,fast:4,favourit:5,featur:5,feedback:3,few:4,fig:4,figur:4,file:[1,2,3,4,5],filesystem:1,fill:5,filter:4,find:0,first:5,flask:1,flush:[2,4],focu:4,foe:2,folder:[2,4,5],folk:0,follow:[0,2,3,4,5],forget:4,foundat:4,frame:4,free:5,from:[1,5],further:[4,5],futur:2,get:[1,4,5],gist:5,github:[3,4],give:[0,5],given:4,global:2,gnu:[1,4],going:[4,5],good:[0,1,5],grant:[2,4],graph:1,great:4,greater:0,group:4,guest:2,gui:1,guid:5,had:1,hand:[0,2],handl:4,happen:[2,4],has:[0,1,4,5],hat:5,have:[1,2,3,4,5],head:2,header:5,hello:2,hello_task:2,hello_world:2,help:[0,4,5],here:[1,2,3,4,5],highli:1,home:2,home_timelin:4,homebrew:4,hood:1,host:4,how:[0,1,4],html:[2,4],http:[2,3,4,5],human:4,id_str:4,identifi:[0,1,4,5],imag:5,includ:[2,5],incognito:5,increas:4,incub:1,index:4,index_col:4,indic:1,individu:[1,5],inform:1,infoschema:4,infrastructur:1,initdb:2,initi:2,input:[1,4],insert:4,insid:2,insight:3,instal:[2,5],instanc:[1,2,4,5],instruct:[2,5],instructor:5,integr:[1,5],intens:[1,3,4],interact:1,interest:[0,3],interfac:1,intermedi:0,intern:2,invalid:5,invest:4,investig:4,involv:4,issu:[1,3,4,5],its:4,ixek:3,job:[1,4],json:4,jupyt:4,just:4,keep:[2,4],kei:4,kept:4,kind:4,know:[4,5],knowledg:0,known:5,kwarg:1,languag:4,larg:4,late:1,later:[4,5],latest:[2,4],launch:5,ldflag:5,lead:5,learn:[0,3],leav:5,len:4,let:[4,5],lib:5,libmysqlcli:5,librari:[0,2,4,5],life:5,like:[1,2,4,5],limit:4,line:[0,2,4,5],link:[1,3,5],list:2,list_dag:2,list_task:2,listen:4,live:[2,3,5],load:4,local:5,localhost:[2,4],locat:2,log:[1,2,4],login:[],logo:3,look:[0,2,4],loos:4,lot:[4,5],love:3,lower:4,lowercas:4,lrwxr:2,machin:[3,4,5],made:4,mai:4,mail:3,main:[1,4],mainli:4,maintain:[1,3,4],make:[1,2,4,5],manag:[1,4],manchest:3,mani:[1,4,5],manual:4,match:[2,4],matplotlib:4,mcnamara:3,mean:[1,2,3,4],meantim:5,measur:4,mechan:[1,4],mention:2,messag:4,method:4,microsoft:3,mid:1,might:5,minim:4,minut:2,miss:4,mission:4,mkdir:[2,4,5],modif:4,modifi:4,modul:[],monitor:[2,4],more:[1,4],most:[1,4,5],much:5,mud:1,multipl:2,multithread:1,my_dag:2,my_databas:4,my_tabl:4,mydir:2,mysql:[2,4],mysqlclient:[4,5],mysqloper:1,mystream:4,mystreamlisten:4,myuser:2,name:[1,3,4],need:[0,1,2,4,5],new_tabl:4,newli:[2,5],next:[4,5],nifti:5,node:1,non:4,none:4,note:[2,4,5],notic:2,notifi:[],now:[4,5],nrollr:5,number:[1,2,4],oauthhandl:4,object:[1,2,4],off:4,offici:5,often:4,on_data:4,on_error:4,on_statu:4,o
nc:[1,2,5],one:[1,2,4,5],ones:[],onli:[1,4],open:[1,2,3],opendata:[],openssl:5,oper:[1,2,5],opt:5,optim:5,option:5,oracl:4,orang:0,orchestr:4,order:1,org:[2,4],origin:3,other:[1,4],our:[2,4],out:[1,4],outcom:4,output:[1,4],over:[0,2,3],overridden:2,own:1,owner:[2,4],packag:[2,5],page:[],pair:4,panda:4,parallel:2,paramet:[1,2,4],pars:4,parser:4,part:[4,5],particular:1,particularli:[1,3],password:[4,5],past:1,path:[4,5],pathlib:4,peopl:5,per:5,perform:4,perhap:4,perman:5,permiss:[],person:[0,5],pip:[2,5],pipe:4,pipelin:[0,1,5],pipenv:2,pipfil:5,place:0,plan:5,pleas:[3,5],plot:4,plt:4,plumber:1,png:4,point:[1,5],popul:4,popular:5,populate_t:4,portal:5,possibl:4,post:0,postit:0,power:[1,4,5],pprint:1,practic:[0,1],pre:5,preced:2,prefer:5,prepar:4,prerequisit:2,prevent:4,previous:4,primari:4,print:[1,4],print_context:1,print_dat:[1,2],print_hello:2,print_the_context:1,privat:5,privileg:[2,4],proce:5,process:[1,4,5],produc:[1,4],product:[2,4],productis:4,program:4,prohibit:4,project:[1,2,4,5],promo:5,prompt:[4,5],properli:2,provid:[1,4,5],provide_context:1,public_tweet:4,pull:1,purpl:0,pycharm:5,pycon2019:4,pycon:4,pypi:2,pyplot:4,python2019:[2,4],python3:5,python:[0,1],python_cal:[1,2],python_oper:2,pythonoper:[1,2],qualiti:4,queri:4,question:[3,4],queu:4,queue:4,quickli:4,quit:5,rabbitmq:2,rang:4,rate:4,raw_data:4,raw_tweet:4,read:[1,4,5],read_sql_queri:4,readi:5,readthedoc:4,reboot:5,receiv:5,recommend:5,record:[1,5],recov:1,red:5,redeem:5,refresh:5,regardless:1,regist:[4,5],regularli:4,relat:4,relationship:1,releas:2,relev:4,reload:5,rememb:4,remix:3,remot:1,remov:4,replac:2,report:4,repositori:5,repres:1,reproduc:[1,3,4],requir:[4,5],requisit:5,research:3,reserv:4,resourc:1,respect:[3,4],respons:[1,4],rest:2,restrict:4,result:[1,2,4],result_backend:2,retri:[1,2],retriev:1,retry_delai:2,retweet:4,retweet_count:4,right:[4,5],robust:3,rollback:4,root:[2,4,5],roughli:4,row:4,run:[1,4,5],run_thi:1,runme_0:2,safe:5,sai:1,same:[1,4,5],save:[1,2,4],save_df:4,savefig:4,scalabl:1,scarc:1,scene:2,schedul:[2,4],schedule_interv:2,scm:5,screen_nam:4,script:[4,5],search:2,sec:4,secret:[],section:[2,4,5],see:[1,2,4,5],select:[4,5],self:4,send:[1,4,5],sent:[4,5],sequentialexecutor:2,server:[2,4],servic:[2,5],session:[4,5],set:[1,2,5],set_access_token:4,set_upstream:2,set_xticklabel:4,settl:5,setup:[2,4],sever:5,share:[1,4,5],shell:[2,5],should:[2,4,5],show:[1,4],simpl:2,simple_dag:2,simplehttpoper:1,sinc:[1,2],singl:[1,5],site:5,skip:1,slow:5,smaller:4,snippet:4,softwar:[3,5],solut:[2,4],some:[0,1,2,4,5],someth:[1,2],sort:[1,5],sourc:[1,2,4,5],spark_task_etl:2,specif:[2,4,5],specifi:[],speed:4,spend:4,spin:1,spotifi:1,sql:4,sql_alchemy_conn:2,sql_to_csv:4,sql_to_df:4,sqlite:2,src1_s3:2,src2_hdf:2,src3_s3:2,stabil:1,stai:2,stand:1,start:[1,4,5],start_dat:2,start_stream:4,state:1,statement:4,statu:[1,2,4],status:2,status_cod:4,step:5,steroid:4,stop:4,store:[1,2,4],str:4,straight:2,stream_twitt:4,streaming_how_to:4,streamlisten:4,strftime:4,style:5,subject:5,subplot:4,subscript:5,subsequ:4,substitut:4,subtask:4,success:[1,4,5],sudo:5,suggest:5,suit:5,support:1,suptitl:4,sure:[4,5],surveil:5,sync:2,sys:4,system:[0,1,5],systemctl:5,take:[0,2,4,5],tania:[3,5],task:[0,2,4],task_id:[1,2],team:[1,3,5],templat:2,test:2,testabl:1,text:[2,4],than:4,thankfulli:4,thei:[1,4,5],them:[1,2],thi:[0,1,2,3,4,5],thing:[4,5],think:4,those:4,thought:4,three:4,through:[4,5],thu:1,time:[0,1,2,4,5],timedelta:2,timelin:4,timeout:4,timestamp:4,tip:4,to_csv:4,todai:4,togeth:4,token:4,too:[2,4,5],tool:[3,5],toolset:0,top:4,tornado:1,track
:[1,2,4,5],trademark:4,trallard:[3,4,5],transact:4,transform:4,tree:2,trigger:2,trigger_dag:2,troubleshoot:4,turn:1,tutori:[0,2,4,5],tweepi:4,tweet:5,tweets_long:4,twitter:3,two:4,txt:5,type:[2,4],ubuntu:5,unclear:4,under:[1,3],understand:0,uniqu:[4,5],unit:1,unittest:2,unix:[4,5],until:5,updat:1,upstream:4,use:[0,2,3,4,5],used:[1,2,3,4,5],useful:5,user:[1,2,4],userguid:2,usernam:4,uses:[1,2],using:[0,1,2,3,4,5],usr:5,usual:5,utf8:[2,4],utf8_unicode_ci:[2,4],v17:5,valid:5,value_count:4,varchar:4,variabl:[2,5],varieti:5,variou:2,venv:5,versatil:1,version:[1,4,5],vertic:1,via:[1,2,3,4,5],view:2,virtual:2,visit:[4,5],vscode:5,wai:4,wait_on_rate_limit:4,wait_on_rate_limit_notifi:4,want:[2,4,5],web:[2,5],webserv:2,websit:5,welcom:4,well:[0,2,4],were:5,what:[0,2,5],whatev:1,when:[1,4,5],whenev:4,where:[1,3,4],wherev:2,which:[1,2,3,4,5],who:5,whoever:4,whole:4,whom:4,wifi:5,within:[2,4],witht:4,work:[1,3,4,5],worker:1,workflow:[3,4],workshop:[3,5],world:2,worri:5,worth:5,would:[2,4],wrap:4,write:[1,4,5],written:5,www:5,xlabel:4,xxxxxxxxxxxxxxxxxx:4,yaml:5,year:3,yet:5,ylabel:4,yml:5,you:[1,2,3,4,5],your:[0,1,5],yourself:5,yum:5,zsh:5,zshrc:5},titles:["About the workshop","Airflow basics","Airflow 101: working locally and familiarise with the tool","Welcome to the EuroScipy Airflow tutorial","Pipelines","Setup"],titleterms:{"new":4,IDEs:5,about:[0,3],account:5,airflow:[1,2,3],anaconda:5,app:5,attende:5,autom:4,azur:5,basic:1,check:4,code:[1,3],collect:4,command:2,compar:1,compon:1,concept:1,conduct:3,configur:2,connect:4,creat:[2,4,5],dag:2,dai:0,data:4,databas:[1,2,4],defin:1,develop:5,differ:1,docker:5,document:[],editor:5,environ:[2,5],etl:[2,4],euroscipi:3,executor:1,extend:4,facilit:3,familiaris:2,first:[2,4],focu:0,from:[2,4],get:2,git:5,github:5,good:4,idempot:1,indic:[],keep:0,kind:2,let:2,licens:3,linux:5,local:[2,4],luigi:1,mac:5,matter:4,metadata:1,microsoft:5,mysql:5,now:2,our:0,pass:5,pipelin:[2,4],pipenv:5,pre:2,prepar:2,previou:2,pycon:5,python:[4,5],requisit:2,run:2,schedul:1,server:1,set:4,setup:5,similar:1,start:2,step:4,stream:4,tabl:4,task:1,text:5,tool:2,track:0,troubleshoot:5,tutori:3,tweet:4,twitter:[4,5],updat:2,user:5,virtual:5,virtualenv:5,web:1,welcom:3,what:[1,4],why:4,window:5,work:2,workflow:1,workshop:0,write:2,you:0,your:[2,3,4]}}) -------------------------------------------------------------------------------- /docs/source/_static/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/12.png -------------------------------------------------------------------------------- /docs/source/_static/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/4.jpg -------------------------------------------------------------------------------- /docs/source/_static/DAG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/DAG.png -------------------------------------------------------------------------------- /docs/source/_static/GUI.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/GUI.png -------------------------------------------------------------------------------- /docs/source/_static/airflow-logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/airflow-logo.jpeg -------------------------------------------------------------------------------- /docs/source/_static/airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/airflow.png -------------------------------------------------------------------------------- /docs/source/_static/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/architecture.png -------------------------------------------------------------------------------- /docs/source/_static/automate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/automate.png -------------------------------------------------------------------------------- /docs/source/_static/automation1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/automation1.jpg -------------------------------------------------------------------------------- /docs/source/_static/azure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/azure.png -------------------------------------------------------------------------------- /docs/source/_static/connection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/connection.png -------------------------------------------------------------------------------- /docs/source/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* */ 2 | @import url('https://fonts.googleapis.com/css?family=Itim|Nunito|Source+Code+Pro'); 3 | 4 | a { 5 | color: rgb(96, 138, 197); 6 | } 7 | 8 | a:hover { 9 | color: rgb(65, 129, 218); 10 | } 11 | 12 | div.body h1 { 13 | color: #5F6366; 14 | font-family: 'Itim', cursive; 15 | font-weight: bold; 16 | font-size: 300%; 17 | } 18 | 19 | div.body h2 { 20 | color: #5F6366; 21 | font-family: 'Itim', cursive; 22 | font-weight: bold; 23 | } 24 | div.body h3 { 25 | color: #5F6366; 26 | font-family: 'Itim', cursive; 27 | font-weight: bold; 28 | } 29 | 30 | div.sphinxsidebarwrapper h1.logo { 31 | text-align: center; 32 | margin: 0 0 -20px 0; 33 | } 34 | 35 | div.sphinxsidebar p.blurb { 36 | font-size: 130%; 37 | text-align: center; 38 | font-family: 'Itim', cursive; 39 | color: rgb(151, 139, 196); 40 | } 41 | 42 
| div.sphinxsidebar h1{ 43 | font-size: 160%; 44 | color: #5F6366; 45 | font-family: 'Itim', cursive; 46 | } 47 | 48 | div.sphinxsidebar h1 a { 49 | font-size: 160%; 50 | color: #5F6366; 51 | text-decoration: none; 52 | border: none; 53 | font-family: 'Itim', cursive; 54 | } 55 | 56 | div.sphinxsidebar h1 a:hover { 57 | border: none; 58 | } 59 | 60 | div.sphinxsidebar h3 { 61 | display: none; 62 | } 63 | 64 | div.sphinxsidebar a { 65 | color: #5F6366; 66 | } 67 | 68 | code.descname { 69 | color: rgb(151, 139, 196); 70 | } 71 | 72 | th.field-name { 73 | min-width: 100px; 74 | color: rgb(151, 139, 196); 75 | } 76 | 77 | tt, code { 78 | color: #F8F8F2; 79 | background: #1d1941; 80 | border-radius: 0.3em; 81 | padding: 0.0em 0.3em; 82 | } 83 | 84 | a.reference.internal code.xref span.pre { 85 | color: #F8F8F2; 86 | background: #1d1941; 87 | border-bottom: none; 88 | border-radius: 0; 89 | padding: 0; 90 | } 91 | 92 | a.reference.internal, a.reference.internal:hover { 93 | border-bottom: none; 94 | } 95 | 96 | a.reference.internal:hover code { 97 | background: #027bab 98 | } 99 | 100 | a.reference.internal:hover code.xref span.pre { 101 | color: #F8F8F2; 102 | background: #027bab; 103 | border-bottom: none; 104 | } 105 | 106 | tt.xref, code.xref, a tt { 107 | background: none; 108 | border-bottom: none; 109 | } 110 | 111 | code.literal { 112 | color: #F8F8F2; 113 | background:#1d1941; 114 | } 115 | 116 | pre { 117 | padding: 20px 30px; 118 | background: #1d1941; 119 | } 120 | 121 | div > dl { 122 | border-left: 2px solid #00384021; 123 | padding-left: 5px; 124 | } 125 | 126 | dt { 127 | color: rgb(96, 138, 197); 128 | } 129 | 130 | 131 | div.footer::before { 132 | display: block; 133 | content: ''; 134 | border-top: 2px solid #EDB5BF; 135 | width: 50%; 136 | margin: 2em auto 2em auto; 137 | } 138 | 139 | div.footer { 140 | text-align: center; 141 | /* color: #029be2; */ 142 | } 143 | 144 | div.footer a { 145 | color: #027bab; 146 | text-decoration: none; 147 | } 148 | 149 | p.caption { 150 | font-family: 'Itim', cursive; 151 | font-size: inherit; 152 | font-size: 150%; 153 | } 154 | 155 | @media screen and (max-width: 875px) { 156 | div.sphinxsidebar { 157 | background: #4D6D9A; 158 | } 159 | div.sphinxsidebar h1.logo, div.sphinxsidebar p.blurb{ 160 | text-align: left; 161 | } 162 | div.sphinxsidebar h1 a { 163 | color: #1bc5e0; 164 | } 165 | div.sphinxsidebar a { 166 | /* color: rgb(151, 139, 196); */ 167 | color: white; 168 | } 169 | div.sphinxsidebar ul { 170 | /* color: rgb(151, 139, 196); */ 171 | color: white; 172 | } 173 | } 174 | 175 | 176 | /* other */ 177 | 178 | .alert { 179 | position: relative; 180 | padding: 10px; 181 | margin-bottom: 5px; 182 | border: 2px solid transparent; 183 | border-radius: 2px; 184 | } 185 | 186 | .alert-primary { 187 | color: #004085; 188 | background-color: #cce5ff; 189 | border-color: #b8daff; 190 | } 191 | .alert-custom { 192 | background-color: rgb(229, 224, 247); 193 | border-color:rgb(229, 224, 247); 194 | color: rgb(128, 117, 165); 195 | } -------------------------------------------------------------------------------- /docs/source/_static/dag-time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/dag-time.png -------------------------------------------------------------------------------- /docs/source/_static/datapyramid.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/datapyramid.png -------------------------------------------------------------------------------- /docs/source/_static/gooddata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/gooddata.png -------------------------------------------------------------------------------- /docs/source/_static/gooddata1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/gooddata1.png -------------------------------------------------------------------------------- /docs/source/_static/luigi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/luigi.png -------------------------------------------------------------------------------- /docs/source/_static/mssignin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/mssignin.png -------------------------------------------------------------------------------- /docs/source/_static/pipeline1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/pipeline1.png -------------------------------------------------------------------------------- /docs/source/_static/python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/python.png -------------------------------------------------------------------------------- /docs/source/_static/twitter1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/twitter1.png -------------------------------------------------------------------------------- /docs/source/_static/twitter2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/twitter2.png -------------------------------------------------------------------------------- /docs/source/_static/twitter3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/twitter3.png -------------------------------------------------------------------------------- /docs/source/_static/uses.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/uses.png -------------------------------------------------------------------------------- /docs/source/_templates/sidebarlogo.html: -------------------------------------------------------------------------------- 1 |

3 | 4 |

5 | 6 |

7 | -------------------------------------------------------------------------------- /docs/source/about.md: -------------------------------------------------------------------------------- 1 | # About the workshop 2 | 3 | We will be taking a look at the basic concepts of data pipelines as well as practical use cases using Python and libraries like pandas, matplotlib, and tensorflow. 4 | 5 | ## About you: 6 | - Some experience using the command line 7 | - Intermediate Python knowledge / use 8 | - Be able to apply what we learn and adapt it to your use cases 9 | - Interested in data and systems 10 | - Aspiring or current data engineer 11 | - Some knowledge about systems and databases (enough to be dangerous) 12 | 13 | ## Our focus for the day 14 | - Greater understanding of how to build data pipelines using Python and libraries in the Python scientific ecosystem 15 | - Focus on concepts (rather than complex implementations) 16 | - Practical knowledge application 17 | - Create the building blocks needed for your day-to-day work 18 | 19 | ## Keeping on track 20 | 21 | You will find 🚦 across the tutorial examples. We will use this to identify how folks are doing throughout the workshop (if following along in person). 22 | These will indicate practical or hands-on portions of the tutorial. 23 | 24 | ## Additional tutorial (PyCon US) 25 | 26 | For another (much longer) tutorial integrating MySQL and Twitter stream data check out 27 | 28 | Also, in the upcoming months I have planned: 29 | - Deploying Airflow in Kubernetes (AKS) 30 | - In-depth programmatic report generation with Airflow and papermill 31 | - Airflow + dagster 32 | - Airflow + R? -------------------------------------------------------------------------------- /docs/source/airflow-intro.md: -------------------------------------------------------------------------------- 1 | # Airflow basics 2 | 3 | ## What is Airflow? 4 | 5 | ![airflow logo](_static/airflow-logo.jpeg) 6 | 7 | Airflow is a workflow engine, which means it: 8 | 9 | - Manages scheduling and running jobs and data pipelines 10 | - Ensures jobs are ordered correctly based on dependencies 11 | - Manages the allocation of scarce resources 12 | - Provides mechanisms for tracking the state of jobs and recovering from failure 13 | 14 | It is highly versatile and can be used across many domains: 15 | ![](_static/uses.png) 16 | 17 | ## Basic Airflow concepts 18 | 19 | - **Task**: a defined unit of work (these are called operators in Airflow) 20 | - **Task instance**: an individual run of a single task. Task instances also have an indicative state, which could be “running”, “success”, “failed”, “skipped”, “up for retry”, etc. 21 | - **DAG**: Directed acyclic graph, 22 | a set of tasks with explicit execution order, beginning, and end 23 | - **DAG run**: individual execution/run of a DAG 24 | 25 | **Debunking the DAG** 26 | 27 | The vertices and edges (the arrows linking the nodes) have an order and direction associated with them. 28 | 29 | ![](_static/DAG.png) 30 | 31 | Each node in a DAG corresponds to a task, which in turn represents some sort of data processing. For example: 32 | 33 | Node A could be the code for pulling data from an API, node B could be the code for anonymizing the data, node C could be the code for checking that there are no duplicate records, and so on. 34 | 35 | These 'pipelines' are acyclic since they need a point of completion. 36 | 37 | The DAG does not care about what is in its tasks - since it does not do the processing itself.
But it ensures that things happen in the right order. 38 | 39 | ![](https://www.polidea.com/static/bce5fcc8a3c0ead34ab459d243a26349/beee6/image2.png) 40 | 41 | **Dependencies** 42 | 43 | Each of the vertices has a particular direction that shows the relationship between certain nodes. For example, we can only anonymize data once this has been pulled out from the API. 44 | 45 | ## Operators 46 | While DAGs describe how to run a workflow, Airflow operators determine what actually gets done. There are several types of operators: 47 | 48 | - action operators, which perform a single operation and return (e.g. `BashOperator`), 49 | - sensors, which pause the execution (or execute) until a certain criterion is met (e.g. `sql_sensor`) 50 | - transfer operators, which connect two services and enable sending data between them (e.g. `GoogleCloudStorageToS3Operator`). 51 | 52 | 53 | ## Idempotency 54 | 55 | This is one of the most important characteristics of good ETL architectures. 56 | 57 | When we say that something is idempotent, it means it will produce the same result regardless of how many times it is run (i.e. the results are reproducible). 58 | 59 | Reproducibility is particularly important in data-intensive environments as this ensures that the same inputs will always return the same outputs. 60 | 61 | ## Airflow components 62 | 63 | ![](_static/architecture.png) 64 | 65 | There are 4 main components to Apache Airflow: 66 | 67 | ### Web server 68 | 69 | The GUI. Under the hood this is a Flask app where you can track the status of your jobs and read logs from a remote file store (e.g. [Azure Blobstorage](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-blobs-overview/?wt.mc_id=PyCon-github-taallard)). 70 | 71 | ### Scheduler 72 | 73 | This component is responsible for scheduling jobs. It is a multithreaded Python process that uses the DAG object to decide what tasks need to be run, when and where. 74 | 75 | The task state is retrieved and updated from the database accordingly. The web server then uses these saved states to display job information. 76 | 77 | ### Executor 78 | 79 | The mechanism that gets the tasks done. 80 | 81 | ### Metadata database 82 | 83 | - Powers how the other components interact 84 | - Stores the Airflow states 85 | - All processes read and write from here 86 | 87 | ## Workflow as code 88 | One of the main advantages of using a workflow system like Airflow is that everything is code, which makes your workflows maintainable, versionable, testable, and collaborative. 89 | 90 | Thus your workflows become more explicit and maintainable (atomic tasks). 91 | 92 | Not only is your code dynamic, but so is your infrastructure. 93 | 94 | ### Defining tasks 95 | 96 | Tasks are defined based on the abstraction of `Operators` (see Airflow docs [here](https://airflow.apache.org/concepts.html#operators)) which represent a single **idempotent task**. 97 | 98 | The best practice is to have atomic operators (i.e. they can stand on their own and do not need to share resources among them).
99 | 100 | You can choose among: 101 | - `BashOperator` 102 | - `PythonOperator` 103 | - `EmailOperator` 104 | - `SimpleHttpOperator` 105 | - `MySqlOperator` (and other DB operators) 106 | 107 | Examples: 108 | 109 | If you have a DAG like this: 110 | 111 | ![](https://miro.medium.com/max/2120/1*Oqvm3jsGqfHDWoGOd3iB1A.png) 112 | 113 | Your DAG will be formed by the following operators: 114 | 115 | ``` 116 | source = DummyOperator(task_id='source', dag=dag) 117 | a_task = DummyOperator(task_id='a', dag=dag) 118 | b_task = DummyOperator(task_id='b', dag=dag) 119 | 120 | source >> a_task >> b_task 121 | 122 | ``` 123 | 124 | ```python 125 | t1 = BashOperator(task_id='print_date', 126 | bash_command='date', 127 | dag=dag) 128 | ``` 129 | 130 | ```python 131 | def print_context(ds, **kwargs): 132 | pprint(kwargs) 133 | print(ds) 134 | return 'Whatever you return gets printed in the logs' 135 | 136 | 137 | run_this = PythonOperator( 138 | task_id='print_the_context', 139 | provide_context=True, 140 | python_callable=print_context, 141 | dag=dag, 142 | ) 143 | ``` 144 | 145 | ## Comparing Luigi and Airflow 146 | 147 | ### Luigi 148 | 149 | - Created at Spotify (named after the plumber) 150 | - Open-sourced in late 2012 151 | - GNU make for data 152 | 153 | ### Airflow 154 | - Airbnb data team 155 | - Open-sourced mid-2015 156 | - Apache incubator mid-2016 157 | - ETL pipelines 158 | 159 | ### Similarities 160 | - Python open source projects for data pipelines 161 | - Integrate with a number of sources (databases, filesystems) 162 | - Tracking failure, retries, success 163 | - Ability to identify the dependencies and execution 164 | 165 | ### Differences 166 | - Scheduler support: Airflow has built-in scheduling support 167 | - Scalability: Airflow has had stability issues in the past 168 | - Web interfaces 169 | 170 | ![](_static/luigi.png) 171 | 172 | 173 | ![](_static/airflow.png) 174 | 175 | 176 | | Airflow | Luigi | 177 | | ------------------------------------------------ | ------------------------------------------------------------------------------ | 178 | | Tasks are identified by a user-defined `dag_id` | Tasks are identified by task name and parameters | 179 | | Task retries based on definitions | Decides whether a task is done via its input/output | 180 | | Task code is sent to the workers | Workers are started by the Python file where the tasks are defined | 181 | | Centralized scheduler (Celery spins up workers) | Centralized scheduler in charge of deduplicating and sending tasks (Tornado based) | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here.
14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "EuroScipy tutorial" 23 | copyright = "2019, Tania Allard" 24 | author = "Tania Allard" 25 | 26 | # The short X.Y version 27 | version = "" 28 | # The full version, including alpha/beta/rc tags 29 | release = "" 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | "sphinx.ext.doctest", 43 | "sphinx.ext.intersphinx", 44 | "sphinx.ext.mathjax", 45 | "sphinx.ext.githubpages", 46 | "recommonmark", 47 | ] 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ["_templates"] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | # 55 | source_suffix = [".rst", ".md"] 56 | 57 | # The master toctree document. 58 | master_doc = "index" 59 | 60 | # The language for content autogenerated by Sphinx. Refer to documentation 61 | # for a list of supported languages. 62 | # 63 | # This is also used if you do content translation via gettext catalogs. 64 | # Usually you set "language" from the command line for these cases. 65 | language = None 66 | 67 | # List of patterns, relative to source directory, that match files and 68 | # directories to ignore when looking for source files. 69 | # This pattern also affects html_static_path and html_extra_path. 70 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 71 | 72 | # The name of the Pygments (syntax highlighting) style to use. 73 | pygments_style = "monokai" 74 | 75 | 76 | # -- Options for HTML output ------------------------------------------------- 77 | 78 | # The theme to use for HTML and HTML Help pages. See the documentation for 79 | # a list of builtin themes. 80 | # 81 | html_theme = "alabaster" 82 | 83 | # Theme options are theme-specific and customize the look and feel of a theme 84 | # further. For a list of options available for each theme, see the 85 | # documentation. 86 | # 87 | html_theme_options = { 88 | "github_banner": False, 89 | "github_button": True, 90 | "github_user": "trallard", 91 | "github_repo": "opendata-airflow-tutorial", 92 | "github_type": "star", 93 | "font_family": "Nunito, Georgia, sans", 94 | "head_font_family": "Nunito, Georgia, serif", 95 | "code_font_family": "'Source Code Pro', 'Consolas', monospace", 96 | "description": "a.k.a an introduction to all things DAGS and pipelines joy", 97 | "show_relbars": True, 98 | "logo": "python.png", 99 | } 100 | 101 | # Add any paths that contain custom static files (such as style sheets) here, 102 | # relative to this directory. They are copied after the builtin static files, 103 | # so a file named "default.css" will overwrite the builtin "default.css". 104 | html_static_path = ["_static"] 105 | 106 | # Custom sidebar templates, must be a dictionary that maps document names 107 | # to template names. 108 | # 109 | # The default sidebars (for documents that don't match any pattern) are 110 | # defined by theme itself. 
Builtin themes are using these templates by 111 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 112 | # 'searchbox.html']``. 113 | # 114 | # Custom sidebar templates, maps document names to template names. 115 | html_sidebars = { 116 | "**": [ 117 | "about.html", 118 | "localtoc.html", 119 | "searchbox.html", 120 | "navigation.html", 121 | "relations.html", 122 | "sidebarlogo.html", 123 | ] 124 | } 125 | 126 | # -- Options for HTMLHelp output --------------------------------------------- 127 | 128 | # Output file base name for HTML help builder. 129 | htmlhelp_basename = "EuroScipytutorialdoc" 130 | 131 | 132 | # -- Options for LaTeX output ------------------------------------------------ 133 | 134 | latex_elements = { 135 | # The paper size ('letterpaper' or 'a4paper'). 136 | # 137 | # 'papersize': 'letterpaper', 138 | # The font size ('10pt', '11pt' or '12pt'). 139 | # 140 | # 'pointsize': '10pt', 141 | # Additional stuff for the LaTeX preamble. 142 | # 143 | # 'preamble': '', 144 | # Latex figure (float) alignment 145 | # 146 | # 'figure_align': 'htbp', 147 | } 148 | 149 | # Grouping the document tree into LaTeX files. List of tuples 150 | # (source start file, target name, title, 151 | # author, documentclass [howto, manual, or own class]). 152 | latex_documents = [ 153 | ( 154 | master_doc, 155 | "EuroScipytutorial.tex", 156 | "EuroScipy tutorial Documentation", 157 | "Tania Allard", 158 | "manual", 159 | ) 160 | ] 161 | 162 | 163 | # -- Options for manual page output ------------------------------------------ 164 | 165 | # One entry per manual page. List of tuples 166 | # (source start file, name, description, authors, manual section). 167 | man_pages = [ 168 | (master_doc, "euroscipytutorial", "EuroScipy tutorial Documentation", [author], 1) 169 | ] 170 | 171 | 172 | # -- Options for Texinfo output ---------------------------------------------- 173 | 174 | # Grouping the document tree into Texinfo files. List of tuples 175 | # (source start file, target name, title, author, 176 | # dir menu entry, description, category) 177 | texinfo_documents = [ 178 | ( 179 | master_doc, 180 | "EuroScipytutorial", 181 | "EuroScipy tutorial Documentation", 182 | author, 183 | "EuroScipytutorial", 184 | "One line description of project.", 185 | "Miscellaneous", 186 | ) 187 | ] 188 | 189 | 190 | # -- Options for Epub output ------------------------------------------------- 191 | 192 | # Bibliographic Dublin Core info. 193 | epub_title = project 194 | 195 | # The unique identifier of the text. This can be a ISBN number 196 | # or the project homepage. 197 | # 198 | # epub_identifier = '' 199 | 200 | # A unique identification for the text. 201 | # 202 | # epub_uid = '' 203 | 204 | # A list of files that should not be packed into the epub file. 205 | epub_exclude_files = ["search.html"] 206 | 207 | 208 | # -- Extension configuration ------------------------------------------------- 209 | 210 | # -- Options for intersphinx extension --------------------------------------- 211 | 212 | # Example configuration for intersphinx: refer to the Python standard library. 
213 | intersphinx_mapping = {"https://docs.python.org/": None} 214 | -------------------------------------------------------------------------------- /docs/source/first-airflow.md: -------------------------------------------------------------------------------- 1 | # Airflow 101: working locally and familiarise with the tool 2 | 3 | ## Pre-requisites 4 | 5 | The following prerequisites are needed: 6 | 7 | - Libraries detailed in the Setting up section (either via conda or pipenv) 8 | - MySQL installed 9 | - text editor 10 | - command line 11 | 12 | ## Getting your environment up and running 13 | 14 | If you followed the instructions you should have Airflow installed as well as the rest of the packages we will be using. 15 | 16 | So let's get our environment up and running: 17 | 18 | If you are using conda start your environment via: 19 | ``` 20 | $ source activate airflow-env 21 | ``` 22 | If using pipenv then: 23 | ``` 24 | $ pipenv shell 25 | ```` 26 | 27 | this will start a shell within a virtual environment, to exit the shell you need to type `exit` and this will exit the virtual environment. 28 | 29 | ## Starting Airflow locally 30 | 31 | Airflow home lives in `~/airflow` by default, but you can change the location before installing airflow. You first need to set the `AIRFLOW_HOME` environment variable and then install airflow. For example, using pip: 32 | 33 | ```sh 34 | export AIRFLOW_HOME=~/mydir/airflow 35 | 36 | # install from PyPI using pip 37 | pip install apache-airflow 38 | ``` 39 | 40 | once you have completed the installation you should see something like this in the `airflow` directory (wherever it lives for you) 41 | 42 | ``` 43 | drwxr-xr-x - myuser 18 Apr 14:02 . 44 | .rw-r--r-- 26k myuser 18 Apr 14:02 ├── airflow.cfg 45 | drwxr-xr-x - myuser 18 Apr 14:02 ├── logs 46 | drwxr-xr-x - myuser 18 Apr 14:02 │ └── scheduler 47 | drwxr-xr-x - myuser 18 Apr 14:02 │ ├── 2019-04-18 48 | lrwxr-xr-x 46 myuser 18 Apr 14:02 │ └── latest -> /Users/myuser/airflow/logs/scheduler/2019-04-18 49 | .rw-r--r-- 2.5k myuser 18 Apr 14:02 └── unittests.cfg 50 | ``` 51 | We need to create a local dag folder: 52 | 53 | ``` 54 | mkdir ~/airflow/dags 55 | ``` 56 | 57 | As your project evolves, your directory will look something like this: 58 | 59 | ``` 60 | airflow # the root directory. 61 | ├── dags # root folder for all dags. files inside folders are not searched for dags. 62 | │ ├── my_dag.py, # my dag (definitions of tasks/operators) including precedence. 63 | │ └── ... 64 | ├── logs # logs for the various tasks that are run 65 | │ └── my_dag # DAG specific logs 66 | │ │ ├── src1_s3 # folder for task-specific logs (log files are created by date of a run) 67 | │ │ ├── src2_hdfs 68 | │ │ ├── src3_s3 69 | │ │ └── spark_task_etl 70 | ├── airflow.db # SQLite database used by Airflow internally to track the status of each DAG. 71 | ├── airflow.cfg # global configuration for Airflow (this can be overridden by config inside the file.) 72 | └── ... 73 | ``` 74 | 75 | --- 76 | 77 | ## Spinning up a local airflow instance 78 | 79 | ➡️ The first thing we need to do is initialize Airflow database: 80 | 81 | ``` 82 | airflow initdb 83 | ``` 84 | 85 | This will be cfrated in `airflow.db` by default. 86 | 87 | ``` 88 | airflow_home 89 | ├── airflow.cfg 90 | ├── airflow.db <- Airflow SQLite DB 91 | └── unittests.cfg 92 | ``` 93 | 94 | 💡Using SQLite is an adequate solution for local testing and development, but it does not support concurrent access. 
In a production environment you will most certainly want to use a more robust database solution such as Postgres or MySQL (see optional section at the bottom on how to do this locally). 95 | 96 | Now we need to launch a terminal an start the Airflow web server (which is a Flask application): 97 | 98 | ``` 99 | airflow webserver -p 8080 100 | ``` 101 | 102 | Now we can head over to [http://localhost:8080](http://localhost:8080) now and you will see that there are a number of examples DAGS already there. 103 | 104 | #### Troubleshooting 105 | 106 | If you have any issues with loading the Airflow console in your web browser, or if there were any errors when you ran airflow webserver, then you may have another application running on port 8080. That's the default port for Airflow, but you can change it to any other user port that's not being used. For example, to run Airflow on port 7070 you could run: 107 | 108 | ``` 109 | airflow webserver -p 7070 110 | ``` 111 | 112 | 113 | 🚦 Take some time to familiarise with the UI and get your local instance set up 114 | 115 | ![](https://www.tensorflow.org/tfx/tutorials/tfx/images/workshop/airflow_dag_buttons.png) 116 | 117 | These are the buttons that allow you to enable, trigger and refresh dags. 118 | 119 | --- 120 | 121 | ### Airflow connections 122 | Now let's have a look at the connections ([http://localhost:8080/admin/connection/](http://localhost:8080/admin/connection/)) go to `admin > connections`. You should be able to see a number of connections available. 123 | These allows you to add services or integrate tools with your airflow server. 124 | 125 | ### Commands 126 | Let us go over some of the commands. Back on your command line: 127 | 128 | ``` 129 | airflow list_dags 130 | ``` 131 | we can list the DAG tasks in a tree view 132 | 133 | ``` 134 | airflow list_tasks tutorial --tree 135 | ``` 136 | 137 | we can tests the dags too, but we will need to set a date parameter so that this executes: 138 | 139 | ``` 140 | airflow test tutorial print_date 2019-09-02 141 | ``` 142 | (note that you cannot use a future date or you will get an error) 143 | ``` 144 | airflow test tutorial templated 2019-09-02 145 | ``` 146 | By using the test commands these are not saved in the database. 147 | 148 | You can also use the command line to enable and trigger DAGS, similar to the buttons in the GUI above: 149 | ``` 150 | # enable/disable 151 | airflow unpause 152 | airflow pause 153 | 154 | # trigger 155 | airflow trigger_dag 156 | ``` 157 | 158 | Now let's start the scheduler: 159 | ``` 160 | airflow scheduler 161 | ``` 162 | 163 | Behind the scenes, it monitors and stays in sync with a folder for all DAG objects it contains. The Airflow scheduler is designed to run as a service in an Airflow production environment. 164 | 165 | Now with the schedule up and running we can trigger an instance: 166 | ``` 167 | $ airflow run airflow run example_bash_operator runme_0 2015-01-01 168 | ``` 169 | 170 | This will be stored in the database and you can see the change of the status change straight away. 171 | 172 | What would happen for example if we wanted to run or trigger the `tutorial` task? 🤔 173 | 174 | Let's try from the CLI and see what happens. 175 | 176 | ``` 177 | airflow trigger_dag tutorial 178 | ``` 179 | 180 | 181 | ## Writing your first DAG 182 | 183 | Let's create our first simple DAG. 184 | Inside the dag directory (`~/airflow/dags)` create a `simple_dag.py` file. 185 | 186 | 1. 
Import Python dependencies 187 | ```python 188 | from datetime import datetime, timedelta 189 | from airflow import DAG 190 | from airflow.operators.dummy_operator import DummyOperator 191 | from airflow.operators.python_operator import PythonOperator 192 | ``` 193 | 194 | 2. Default Airflow arguments 195 | ```python 196 | default_args = { 197 | "owner": "airflow", 198 | "depends_on_past": False, 199 | "start_date": datetime(2019, 4, 30), 200 | "email": ["airflow@example.com"], 201 | "email_on_failure": False, 202 | "email_on_retry": False, 203 | # If a task fails, retry it once after waiting 204 | # at least 2 minutes 205 | "retries": 1, 206 | "retry_delay": timedelta(minutes=2), 207 | } 208 | ``` 209 | 210 | 3. Instantiate the DAG:Give the DAG name, configure the schedule, and set the DAG settings 211 | 212 | ```python 213 | dag = DAG( 214 | "hello_world", 215 | description="Simple tutorial DAG", 216 | schedule_interval="0 12 * * *", 217 | default_args=default_args, 218 | catchup=False, 219 | ) 220 | ``` 221 | Here are a couple of options you can use for your `schedule_interval`. You can choose to use some preset argument or cron-like argument: 222 | 223 | ![](_static/dag-time.png) 224 | 225 | For example 226 | `schedule_interval='@daily' ` 227 | `schedule_interval='0 0 * * *'` 228 | 229 | For reference or 230 | 231 | 4. Layout your tasks 232 | 233 | ```python 234 | t1 = DummyOperator(task_id="dummy_task", retries=3, dag=dag) 235 | 236 | t2 = PythonOperator(task_id="hello_task", python_callable=print_hello, dag=dag) 237 | ``` 238 | 239 | 5. Setting dependencies 240 | Set the order of the tasks 241 | 242 | ```python 243 | t1 >> t2 244 | ``` 245 | 246 | Other ways 247 | ```python 248 | # This means that t2 will depend on t1 249 | # running successfully to run. 250 | t1.set_downstream(t2) 251 | 252 | # similar to above where t3 will depend on t1 253 | t3.set_upstream(t1) 254 | ``` 255 | 256 | ```python 257 | # And the upstream dependency with the 258 | # bit shift operator: 259 | t2 << t1 260 | ``` 261 | ```python 262 | # A list of tasks can also be set as 263 | # dependencies. These operations 264 | # all have the same effect: 265 | t1.set_downstream([t2, t3]) 266 | t1 >> [t2, t3] 267 | [t2, t3] << t1 268 | 269 | ``` 270 | 271 | Your final DAG should look like this 272 | 273 | ```python 274 | from datetime import datetime, timedelta 275 | from airflow import DAG 276 | from airflow.operators.dummy_operator import DummyOperator 277 | from airflow.operators.python_operator import PythonOperator 278 | 279 | 280 | def print_hello(): 281 | return "Hello world!" 
282 | 283 | 284 | default_args = { 285 | "owner": "airflow", 286 | "depends_on_past": False, 287 | "start_date": datetime(2019, 8, 31), 288 | "email": ["airflow@example.com"], 289 | "email_on_failure": False, 290 | "email_on_retry": False, 291 | # If a task fails, retry it once after waiting 292 | # at least 2 minutes 293 | "retries": 1, 294 | "retry_delay": timedelta(minutes=2), 295 | } 296 | 297 | dag = DAG( 298 | "hello_world", 299 | description="Simple tutorial DAG", 300 | schedule_interval="0 12 * * *", 301 | default_args=default_args, 302 | catchup=False, 303 | ) 304 | 305 | t1 = DummyOperator(task_id="dummy_task", retries=3, dag=dag) 306 | 307 | t2 = PythonOperator(task_id="hello_task", python_callable=print_hello, dag=dag) 308 | 309 | # sets downstream foe t1 310 | t1 >> t2 311 | 312 | # equivalent 313 | # t2.set_upstream(t1) 314 | 315 | ``` 316 | 317 | If it is properly setup you should be able to see this straight away on your instance. 318 | 319 | You should be able to trigger this DAG straight away. 320 | 321 | ### Your first operator 322 | 323 | An Operator is an atomic block of workflow logic, which performs a single action. Operators are written as Python classes (subclasses of `BaseOperator`), where the `__init__` function can be used to configure settings for the task and a method named execute is called when the task instance is executed. 324 | 325 | 326 | The execute method may also raise the `AirflowSkipException` from `airflow.exceptions`. In such a case the task instance would transition to the Skipped status. 327 | 328 | If another exception is raised, the task will be retried until the maximum number of `retries` is reached. 329 | 330 | 🚦We need to create a new directory: 331 | 332 | ``` 333 | mkdir /plugins 334 | ``` 335 | 336 | Then `my_operators.py` 337 | 338 | 339 | ``` 340 | import logging 341 | 342 | from airflow.models import BaseOperator 343 | from airflow.plugins_manager import AirflowPlugin 344 | from airflow.utils.decorators import apply_defaults 345 | 346 | log = logging.getLogger(__name__) 347 | 348 | class MyFirstOperator(BaseOperator): 349 | 350 | @apply_defaults 351 | def __init__(self, my_operator_param, *args, **kwargs): 352 | self.operator_param = my_operator_param 353 | super(MyFirstOperator, self).__init__(*args, **kwargs) 354 | 355 | def execute(self, context): 356 | log.info("Hello World!") 357 | log.info('operator_param: %s', self.operator_param) 358 | 359 | class MyFirstPlugin(AirflowPlugin): 360 | name = "my_first_plugin" 361 | operators = [MyFirstOperator] 362 | ``` 363 | In this file we are defining a new operator named `MyFirstOperator`. Its execute method is very simple, all it does is log “Hello World!” and the value of its own single parameter. The parameter is set in the `__init__` function. 364 | 365 | Now, we’ll need to create a new DAG to test our operator. 
Create a `dags/test_operators.py` file and fill it with the following content: 366 | 367 | ``` 368 | from datetime import datetime 369 | from airflow import DAG 370 | from airflow.operators.dummy_operator import DummyOperator 371 | from my_operators import MyFirstOperator 372 | 373 | dag = DAG('my_test_dag', description='Another tutorial DAG', 374 | schedule_interval='0 12 * * *', 375 | start_date=datetime(2019, 8, 31), catchup=False) 376 | 377 | dummy_task = DummyOperator(task_id='dummy_task', dag=dag) 378 | 379 | operator_task = MyFirstOperator(my_operator_param='This is a test.', 380 | task_id='my_first_operator_task', dag=dag) 381 | 382 | dummy_task >> operator_task 383 | ``` 384 | 385 | --- 386 | 387 | ## 🧪 OPTIONAL: Changing your database for a MySQL database 388 | 389 | As we mentioned before Airflow uses a database to keep track of the tasks and their statuses. So it is critical to have one set up. 390 | 391 | To start the default database we can run 392 | ` airflow initdb`. This will initialize your database via alembic so that it matches the latest Airflow release. 393 | 394 | The default database used is `sqlite` which means you cannot parallelize tasks using this database. Since we have MySQL and MySQL client installed we will set them up so that we can use them with airflow. 395 | 396 | 🚦Create an airflow database 397 | 398 | From the command line: 399 | 400 | ``` 401 | MySQL -u root -p 402 | mysql> CREATE DATABASE airflow CHARACTER SET utf8 COLLATE utf8_unicode_ci; 403 | mysql> GRANT ALL PRIVILEGES ON airflow.* To 'airflow'@'localhost'; 404 | mysql> FLUSH PRIVILEGES; 405 | ``` 406 | and initialize the database: 407 | 408 | ``` 409 | airflow initdb 410 | ``` 411 | 412 | Notice that this will fail with the default `airflow.cfg` 413 | 414 | 415 | ## Update your local configuration 416 | 417 | Open your airflow configuration file `~/airflow/airflow.cf` and make the following changes: 418 | 419 | 420 | ``` 421 | executor = CeleryExecutor 422 | ``` 423 | 424 | ``` 425 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#broker-settings 426 | # needs rabbitmq running 427 | broker_url = amqp://guest:guest@127.0.0.1/ 428 | 429 | 430 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings 431 | result_backend = db+mysql://airflow:airflow@localhost:3306/airflow 432 | 433 | sql_alchemy_conn = mysql://airflow:python2019@localhost:3306/airflow 434 | 435 | ``` 436 | 437 | Here we are replacing the default executor (`SequentialExecutor`) with the `CeleryExecutor` so that we can run multiple DAGs in parallel. 438 | We also replace the default `sqlite` database with our newly created `airflow` database. 439 | 440 | Now we can initialize the database: 441 | ``` 442 | airflow initdb 443 | ``` 444 | 445 | --- -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. EuroScipy tutorial documentation master file, created by 2 | sphinx-quickstart on Sun Sep 1 21:47:51 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to the EuroScipy Airflow tutorial 7 | ============================================== 8 | This tutorial was originally developed for EuroScipy 2019. 9 | 10 | 11 | .. 
toctree:: 12 | :caption: Table of Contents 13 | :hidden: 14 | :maxdepth: 2 15 | 16 | setup 17 | about 18 | pipelines 19 | airflow-intro 20 | first-airflow 21 | 22 | .. toctree:: 23 | :maxdepth: 2 24 | :caption: Contents: 25 | 26 | About your facilitator 27 | ====================== 28 | 29 | My name is Tania. I live in Manchester UK where I work as a 30 | Cloud Advocate for Microsoft. 31 | 32 | Over the years, I have worked as a data engineer, machine learning engineer, 33 | and research software engineer. I love data intensive 34 | enviroments and I am particularly interested in the tools and workflows to 35 | deliver robust, reproducible data insights. 36 | 37 | If you have any questions or feedback about this tutorial please, 38 | file an issue using the following link: ``_. 39 | 40 | You can also contact me via the following channels: 41 | 42 | - E-mail: trallard@bitsandchips.me 43 | - Twitter: `@ixek `_ 44 | - `Tania on GitHub `_ 45 | 46 | Code of Conduct 47 | ================ 48 | All attendees to this workshop are expected to adhere to EuroScipy's Code of Conduct, 49 | in brief: 50 | **Be open, considerate, and respectful.** 51 | 52 | License 53 | ======= 54 | The content in this workshop is Licensed under `CC-BY-SA 4.0 `_. 55 | Which means that you can use, remix and re-distribute so long attribution to the original 56 | author is maintained (Tania Allard). 57 | 58 | The logo used here was designed by Ashley McNamara for the Microsoft Developer Advocates team use. 59 | 60 | 61 | -------------------------------------------------------------------------------- /docs/source/pipelines.md: -------------------------------------------------------------------------------- 1 | # Pipelines 2 | 3 | ![](_static/automation1.jpg) 4 | 5 | Automation helps us speed those manual boring tasks. The ability to automate means you can spend time working on other more thought-intensive projects. 6 | 7 | Automation adds monitoring and logging tasks: 8 | 9 | 10 | 11 | ![](_static/automate.png) 12 | 13 | ## Steps to automation 14 | 15 | Whenever you consider automating a task ask the following questions: 16 | - When should this task begin? 17 | - Does this task have a time limit? 18 | - What are the inputs for this task? 19 | - What is success or failure within this task? (How can we clearly identify the outcomes?) 20 | - If the task fails what should happen? 21 | - What does the task provide or produce? In what way? To whom? 22 | - What (if anything) should happen after the task concludes? 23 | 24 |
25 | **Top tip**
26 | If your project is too large or loosely defined, try breaking it up into smaller tasks and automating a few of those tasks. Perhaps your task involves a report which downloads two datasets, runs cleanup and analysis, and then sends the results to different groups depending on the outcome. 27 | You can break this task into subtasks, automating each step. If any of these subtasks fail, stop the chain and alert whoever is responsible for maintaining the script so it can be investigated further. 28 |
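As a rough illustration of this tip, here is a minimal sketch of such a report job broken into subtasks, where the chain stops and a maintainer is alerted if any step fails. Every function name and the stand-in data below are hypothetical placeholders, not code from this repository:

```python
# Minimal sketch: a report job broken into small, well-defined subtasks.
# All functions are hypothetical placeholders for real work.
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("report-pipeline")


def download_dataset(name):
    log.info("downloading %s", name)
    return [1, 2, None, 3]  # stand-in for a real download


def clean(rows):
    return [r for r in rows if r is not None]


def analyse(rows):
    return {"total": sum(rows)}


def send_report(results):
    log.info("sending report: %s", results)


def alert_maintainer(message):
    # stand-in for e.g. an email or chat notification
    log.error("ALERT: %s", message)


if __name__ == "__main__":
    try:
        raw_a = download_dataset("dataset_a")
        raw_b = download_dataset("dataset_b")
        results = analyse(clean(raw_a) + clean(raw_b))
        send_report(results)
    except Exception as exc:
        # if any subtask fails, stop the chain and alert the maintainer
        alert_maintainer(f"report pipeline failed: {exc}")
        raise
```

Once each subtask is its own function (or script), handing the chain over to a workflow manager later becomes much easier.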
29 | 30 | ## What is a data pipeline? 31 | 32 | Roughly, this is what all pipelines look like: 33 | 34 | ![](https://i1.wp.com/datapipesoft.com/wp-content/uploads/2017/05/data-pipeline.png?fit=651%2C336&ssl=1) 35 | 36 | They consist mainly of three distinct parts: data engineering processes, data preparation, and analytics. The upstream steps and the quality of the data largely determine the performance and quality of the subsequent steps. 37 | 38 | ## Why do pipelines matter? 39 | 40 | - Analytics and batch processing are mission-critical as they power all data-intensive applications 41 | - The complexity of the data sources and demands increases every day 42 | - A lot of time is invested in writing and monitoring jobs, and in troubleshooting issues. 43 | 44 | This makes data engineering one of the most critical foundations of the whole analytics cycle. 45 | 46 | ### Good data pipelines are: 47 | 48 | - Reproducible: same code, same data, same environment -> same outcome 49 | - Easy to productise: they need minimal modifications from R&D to production 50 | - Atomic: broken into smaller well-defined tasks 51 | 52 | When working with data pipelines, always remember these two statements: 53 | 54 | 55 | ![](_static/gooddata.png) 56 | 57 | --- 58 | 59 | ![](_static/gooddata1.png) 60 | 61 | As your data engineering and data quality demands increase, so does the complexity of the processes. More often than not, you will eventually need a workflow manager to help you orchestrate these processes. 62 | 63 |
64 | Think of a workflow manager as: 65 | 66 | GNU Make + Unix pipes + Steroids 67 |
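To make that idea concrete, here is a minimal, hypothetical sketch of how a get data -> clean data -> analyse data chain could be handed to a workflow manager such as Airflow. The callables are placeholder stubs rather than the scripts shipped in this repository, and writing real DAGs is covered properly in the Airflow sections of this tutorial:

```python
# Minimal sketch of a three-step pipeline expressed as an Airflow DAG.
# The callables are placeholder stubs, not the scripts in this repo.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator


def get_data():
    return "raw data"


def clean_data():
    return "clean data"


def analyse_data():
    return "analysis results"


dag = DAG(
    "simple_pipeline",
    start_date=datetime(2019, 8, 31),
    schedule_interval="@daily",
    catchup=False,
)

get_task = PythonOperator(task_id="get_data", python_callable=get_data, dag=dag)
clean_task = PythonOperator(task_id="clean_data", python_callable=clean_data, dag=dag)
analyse_task = PythonOperator(task_id="analyse_data", python_callable=analyse_data, dag=dag)

# the workflow manager now owns scheduling, retries, logging and monitoring
get_task >> clean_task >> analyse_task
```

The same chain would still work as a plain script; the workflow manager adds the scheduling, retry, and monitoring "steroids" on top.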
68 | 69 | 70 | --- 71 | 72 | ## Creating a simple data analysis pipeline 73 | 74 | 75 | 76 | Let's start by cloning the repository 77 | 78 | ``` 79 | git clone https://github.com/trallard/opendata-airflow-tutorial.git 80 | ``` 81 | 82 | You will notice that you have a `census_data` directory. This contains both the scripts and the notebooks versions of the analysis we are going to use. 83 | 84 | Let's have a look at the notebooks! 85 | 86 | ``` 87 | jupyter lab 88 | ``` 89 | 90 | Alternatively: 91 | 92 | ``` 93 | jupyter 94 | ``` 95 | 96 | 97 | ### Create your own pipeline 98 | 99 | Note that there is not a single correct answer for this. Many will have different approaches. 100 | 101 | 🚦 Create a local script/pipeline that will run: 102 | 103 | get data -> clean data -> analyse data -> generate report / generate plots 104 | 105 | You already have `get_data.py`, `clean_data.py` and `analysis.py` as a simplified version of the notebooks. You can add a `create_plots.py` or `create_report.py`. -------------------------------------------------------------------------------- /docs/source/setup.rst: -------------------------------------------------------------------------------- 1 | Setup 2 | =============== 3 | This section will guide you through the pre requisites for the workshop. 4 | Please make sure to install the libraries before the workshop as the conference WiFi 5 | can get quite slow when having too many people downloading and installing things at the same 6 | time. 7 | 8 | Make sure to follow all the steps as detailed here. 9 | 10 | Python 3.x 11 | ++++++++++ 12 | 13 | 3.7 Preferred 14 | 15 | We will be using `Python `_. 16 | Installing all of Python's packages individually can be a bit 17 | difficult, so we recommend using `Anaconda `_ which 18 | provides a variety of useful packages/tools. 19 | 20 | To download Anaconda, follow the link https://www.anaconda.com/download/ and select 21 | Python 3. Following the download, run the installer as per usual on your machine. 22 | 23 | If you prefer not using Anaconda then this `tutorial `_ can help you with the installation and 24 | setup. 25 | 26 | If you already have Python installed but not via Anaconda do not worry. 27 | Make sure to have either ``venv`` or ``pipenv`` installed. Then follow the instructions to set 28 | your virtual environment further down. 29 | 30 | Git 31 | +++ 32 | 33 | `Git `_ is a version control software that records changes 34 | to a file or set of files. Git is especially helpful for software developers 35 | as it allows changes to be tracked (including who and when) when working on a 36 | project. 37 | 38 | To download Git, go to the following link and choose the correct version for your 39 | operating system: https://git-scm.com/downloads. 40 | 41 | Windows 42 | -------- 43 | 44 | Download the `git for Windows installer `_ . 45 | Make sure to select "use Git from the Windows command prompt" 46 | this will ensure that Git is permanently added to your PATH. 47 | 48 | Also select "Checkout Windows-style, commit Unix-style line endings" selected and click on "Next". 49 | 50 | This will provide you both git and git bash. We will use the command line quite a lot during the workshop 51 | so using git bash is a good option. 52 | 53 | GitHub 54 | ++++++ 55 | 56 | GitHub is a web-based service for version control using Git. You will need 57 | to set up an account at `https://github.com `_. Basic GitHub accounts are 58 | free and you can now also have private repositories. 
59 | 60 | Text Editors/IDEs 61 | ++++++++++++ 62 | 63 | Text editors are tools with powerful features designed to optimize writing code. 64 | There are several text editors that you can choose from. 65 | Here are some we recommend: 66 | 67 | - `VS code `_: this is your facilitator's favourite 💜 and it is worth trying if you have not checked it yet 68 | - `Pycharm `_ 69 | - `Atom `_ 70 | 71 | We suggest trying several editors before settling on one. 72 | 73 | If you decide to go for VSCode make sure to also 74 | have the `Python extension `_ 75 | installed. This will make your life so much easier (and it comes with a lot of nifty 76 | features 😎). 77 | 78 | Creating a virtual environment 79 | +++++++++++++++++++++++++++++++ 80 | 81 | You will need to create a virtual environment to make sure that you have the right packages and setup needed to follow along the tutorial. 82 | Follow the instructions that best suit your installation. 83 | 84 | Anaconda 85 | -------- 86 | 87 | Clone the repository: 88 | :: 89 | git clone https://github.com/trallard/opendata-airflow-tutorial 90 | 91 | Change into the repo 92 | :: 93 | cd opendata-airflow-tutorial 94 | 95 | Create a conda environment: 96 | :: 97 | conda env create -f environment.yml 98 | 99 | Once all the dependencies are installed you can activate your environment through the following commands 100 | :: 101 | source activate airflow-env # Mac 102 | activate airflow-env # Windows and Linux 103 | To exit the environment you can use 104 | :: 105 | conda deactivate 106 | 107 | virtualenv 108 | ----------- 109 | Create a directory for the tutorial, for example : 110 | :: 111 | mkdir airflow-tutorial 112 | and change directories into it (``cd airflow-tutorial``). 113 | Now you need to run venv 114 | :: 115 | python3 -m venv env/airflow # Mac and Linux 116 | python -m venv env/airflow # Windows 117 | 118 | this will create a virtual Python environment in the ``env/airflow`` folder. 119 | Before installing the required packages you need to activate your virtual environment: 120 | :: 121 | source env/bin/activate # Mac and Linux 122 | .\env\Scripts\activate # Windows 123 | 124 | 125 | Now you can install the packages using via pip ``pip install -r requirements.txt`` 126 | 127 | To leave the virtual environment run ``deactivate`` 128 | 129 | Docker 130 | +++++++ 131 | 132 | There is a Docker image built with all the needed libraries. 
133 | 134 | You can run it locally with: 135 | :: 136 | docker run --rm -it -p 5555:5555/tcp -p 8080:8080/tcp -p 8793:8793/tcp -p 8888:8888/tcp -e JUPYTER_ENABLE_LAB=yes trallard/airflow-tutorial:1.0 -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: airflow-env 2 | dependencies: 3 | - jupyter==1.0.0 4 | - jupyterlab==0.35.5 5 | - matplotlib==3.0.3 6 | - pandas==0.24.2 7 | - pip: 8 | - apache-airflow==1.10.3 9 | -------------------------------------------------------------------------------- /extra_tfx_example/dags/taxi_pipeline.py: -------------------------------------------------------------------------------- 1 | """Chicago taxi example using TFX.""" 2 | 3 | from __future__ import absolute_import, division, print_function 4 | 5 | import datetime 6 | import logging 7 | import os 8 | 9 | from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen 10 | from tfx.orchestration.airflow.airflow_runner import AirflowDAGRunner 11 | from tfx.orchestration.pipeline import PipelineDecorator 12 | from tfx.utils.dsl_utils import csv_input 13 | 14 | # pylint: disable=line-too-long 15 | # from tfx.components.statistics_gen.component import StatisticsGen # Step 3 16 | # from tfx.components.schema_gen.component import SchemaGen # Step 3 17 | # from tfx.components.example_validator.component import ExampleValidator # Step 3 18 | 19 | # from tfx.components.transform.component import Transform # Step 4 20 | 21 | # from tfx.proto import trainer_pb2 # Step 5 22 | # from tfx.components.trainer.component import Trainer # Step 5 23 | 24 | # from tfx.proto import evaluator_pb2 # Step 6 25 | # from tfx.components.evaluator.component import Evaluator # Step 6 26 | 27 | # from tfx.proto import pusher_pb2 # Step 7 28 | # from tfx.components.model_validator.component import ModelValidator # Step 7 29 | # from tfx.components.pusher.component import Pusher # Step 7 30 | 31 | 32 | # pylint: enable=line-too-long 33 | 34 | # This example assumes that the taxi data is stored in ~/taxi/data and the 35 | # taxi utility function is in ~/taxi. Feel free to customize this as needed. 36 | _taxi_root = os.path.join(os.environ["HOME"], "airflow") 37 | _data_root = os.path.join(_taxi_root, "data/taxi_data") 38 | # Python module file to inject customized logic into the TFX components. The 39 | # Transform and Trainer both require user-defined functions to run successfully. 40 | _taxi_module_file = os.path.join(_taxi_root, "dags/taxi_utils.py") 41 | # Path which can be listened to by the model server. Pusher will output the 42 | # trained model here. 43 | _serving_model_dir = os.path.join(_taxi_root, "saved_models/taxi") 44 | 45 | # Directory and data locations. This example assumes all of the chicago taxi 46 | # example code and metadata library is relative to $HOME, but you can store 47 | # these files anywhere on your local filesystem. 
48 | _tfx_root = os.path.join(_taxi_root, "tfx") 49 | _pipeline_root = os.path.join(_tfx_root, "pipelines") 50 | _metadata_db_root = os.path.join(_tfx_root, "metadata") 51 | _log_root = os.path.join(_tfx_root, "logs") 52 | 53 | # Airflow-specific configs; these will be passed directly to airflow 54 | _airflow_config = { 55 | "schedule_interval": None, 56 | "start_date": datetime.datetime(2019, 1, 1), 57 | } 58 | 59 | # Logging overrides 60 | logger_overrides = {"log_root": _log_root, "log_level": logging.INFO} 61 | 62 | 63 | @PipelineDecorator( 64 | pipeline_name="taxi", 65 | enable_cache=True, 66 | metadata_db_root=_metadata_db_root, 67 | additional_pipeline_args={"logger_args": logger_overrides}, 68 | pipeline_root=_pipeline_root, 69 | ) 70 | def _create_pipeline(): 71 | """Implements the chicago taxi pipeline with TFX.""" 72 | examples = csv_input(_data_root) 73 | 74 | # Brings data into the pipeline or otherwise joins/converts training data. 75 | example_gen = CsvExampleGen(input_base=examples) 76 | 77 | # Computes statistics over data for visualization and example validation. 78 | # pylint: disable=line-too-long 79 | # statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples) # Step 3 80 | # pylint: enable=line-too-long 81 | 82 | # Generates schema based on statistics files. 83 | # infer_schema = SchemaGen(stats=statistics_gen.outputs.output) # Step 3 84 | 85 | # Performs anomaly detection based on statistics and data schema. 86 | # validate_stats = ExampleValidator( # Step 3 87 | # stats=statistics_gen.outputs.output, # Step 3 88 | # schema=infer_schema.outputs.output) # Step 3 89 | 90 | # Performs transformations and feature engineering in training and serving. 91 | # transform = Transform( # Step 4 92 | # input_data=example_gen.outputs.examples, # Step 4 93 | # schema=infer_schema.outputs.output, # Step 4 94 | # module_file=_taxi_module_file) # Step 4 95 | 96 | # Uses user-provided Python function that implements a model using TF-Learn. 97 | # trainer = Trainer( # Step 5 98 | # module_file=_taxi_module_file, # Step 5 99 | # transformed_examples=transform.outputs.transformed_examples, # Step 5 100 | # schema=infer_schema.outputs.output, # Step 5 101 | # transform_output=transform.outputs.transform_output, # Step 5 102 | # train_args=trainer_pb2.TrainArgs(num_steps=10000), # Step 5 103 | # eval_args=trainer_pb2.EvalArgs(num_steps=5000)) # Step 5 104 | 105 | # Uses TFMA to compute a evaluation statistics over features of a model. 106 | # model_analyzer = Evaluator( # Step 6 107 | # examples=example_gen.outputs.examples, # Step 6 108 | # model_exports=trainer.outputs.output, # Step 6 109 | # feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[ # Step 6 110 | # evaluator_pb2.SingleSlicingSpec( # Step 6 111 | # column_for_slicing=['trip_start_hour']) # Step 6 112 | # ])) # Step 6 113 | 114 | # Performs quality validation of a candidate model (compared to a baseline). 115 | # model_validator = ModelValidator( # Step 7 116 | # examples=example_gen.outputs.examples, # Step 7 117 | # model=trainer.outputs.output) # Step 7 118 | 119 | # Checks whether the model passed the validation steps and pushes the model 120 | # to a file destination if check passed. 
121 | # pusher = Pusher( # Step 7 122 | # model_export=trainer.outputs.output, # Step 7 123 | # model_blessing=model_validator.outputs.blessing, # Step 7 124 | # push_destination=pusher_pb2.PushDestination( # Step 7 125 | # filesystem=pusher_pb2.PushDestination.Filesystem( # Step 7 126 | # base_directory=_serving_model_dir))) # Step 7 127 | 128 | return [ 129 | example_gen, 130 | # statistics_gen, infer_schema, validate_stats, # Step 3 131 | # transform, # Step 4 132 | # trainer, # Step 5 133 | # model_analyzer, # Step 6 134 | # model_validator, pusher # Step 7 135 | ] 136 | 137 | 138 | pipeline = AirflowDAGRunner(_airflow_config).run(_create_pipeline()) 139 | -------------------------------------------------------------------------------- /extra_tfx_example/dags/taxi_utils.py: -------------------------------------------------------------------------------- 1 | """Python source file include taxi pipeline functions and necesasry utils. 2 | 3 | For a TFX pipeline to successfully run, a preprocessing_fn and a 4 | _build_estimator function needs to be provided. This file contains both. 5 | """ 6 | 7 | from __future__ import division, print_function 8 | 9 | import os # pylint: disable=unused-import 10 | 11 | import tensorflow as tf # pylint: disable=unused-import 12 | 13 | # import tensorflow_transform as tft # Step 4 14 | # from tensorflow_transform.beam.tft_beam_io import transform_fn_io # Step 4 15 | # from tensorflow_transform.saved import saved_transform_io # Step 4 16 | # from tensorflow_transform.tf_metadata import metadata_io # Step 4 17 | # from tensorflow_transform.tf_metadata import schema_utils # Step 4 18 | 19 | # import tensorflow_model_analysis as tfma # Step 5 20 | 21 | 22 | # Categorical features are assumed to each have a maximum value in the dataset. 23 | _MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12] 24 | 25 | _CATEGORICAL_FEATURE_KEYS = [ 26 | "trip_start_hour", 27 | "trip_start_day", 28 | "trip_start_month", 29 | "pickup_census_tract", 30 | "dropoff_census_tract", 31 | "pickup_community_area", 32 | "dropoff_community_area", 33 | ] 34 | 35 | _DENSE_FLOAT_FEATURE_KEYS = ["trip_miles", "fare", "trip_seconds"] 36 | 37 | # Number of buckets used by tf.transform for encoding each feature. 38 | _FEATURE_BUCKET_COUNT = 10 39 | 40 | _BUCKET_FEATURE_KEYS = [ 41 | "pickup_latitude", 42 | "pickup_longitude", 43 | "dropoff_latitude", 44 | "dropoff_longitude", 45 | ] 46 | 47 | # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform 48 | _VOCAB_SIZE = 1000 49 | 50 | # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed. 51 | _OOV_SIZE = 10 52 | 53 | _VOCAB_FEATURE_KEYS = ["payment_type", "company"] 54 | 55 | # Keys 56 | _LABEL_KEY = "tips" 57 | _FARE_KEY = "fare" 58 | 59 | # Step 4 START -------------------------- 60 | # def _transformed_name(key): 61 | # return key + '_xf' 62 | 63 | 64 | # def _transformed_names(keys): 65 | # return [_transformed_name(key) for key in keys] 66 | 67 | 68 | # # Tf.Transform considers these features as "raw" 69 | # def _get_raw_feature_spec(schema): 70 | # return schema_utils.schema_as_feature_spec(schema).feature_spec 71 | 72 | 73 | # def _gzip_reader_fn(): 74 | # """Small utility returning a record reader that can read gzip'ed files.""" 75 | # return tf.TFRecordReader( 76 | # options=tf.python_io.TFRecordOptions( 77 | # compression_type=tf.python_io.TFRecordCompressionType.GZIP)) 78 | 79 | 80 | # def _fill_in_missing(x): 81 | # """Replace missing values in a SparseTensor. 
82 | 83 | # Fills in missing values of `x` with '' or 0, and converts to a dense tensor. 84 | 85 | # Args: 86 | # x: A `SparseTensor` of rank 2. Its dense shape should have size at most 1 87 | # in the second dimension. 88 | 89 | # Returns: 90 | # A rank 1 tensor where missing values of `x` have been filled in. 91 | # """ 92 | # default_value = '' if x.dtype == tf.string else 0 93 | # return tf.squeeze( 94 | # tf.sparse_to_dense(x.indices, [x.dense_shape[0], 1], x.values, 95 | # default_value), 96 | # axis=1) 97 | 98 | 99 | # def preprocessing_fn(inputs): 100 | # """tf.transform's callback function for preprocessing inputs. 101 | 102 | # Args: 103 | # inputs: map from feature keys to raw not-yet-transformed features. 104 | 105 | # Returns: 106 | # Map from string feature key to transformed feature operations. 107 | # """ 108 | # outputs = {} 109 | # for key in _DENSE_FLOAT_FEATURE_KEYS: 110 | # # Preserve this feature as a dense float, setting nan's to the mean. 111 | # outputs[_transformed_name(key)] = tft.scale_to_z_score( 112 | # _fill_in_missing(inputs[key])) 113 | 114 | # for key in _VOCAB_FEATURE_KEYS: 115 | # # Build a vocabulary for this feature. 116 | # outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary( 117 | # _fill_in_missing(inputs[key]), 118 | # top_k=_VOCAB_SIZE, 119 | # num_oov_buckets=_OOV_SIZE) 120 | 121 | # for key in _BUCKET_FEATURE_KEYS: 122 | # outputs[_transformed_name(key)] = tft.bucketize( 123 | # _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT) 124 | 125 | # for key in _CATEGORICAL_FEATURE_KEYS: 126 | # outputs[_transformed_name(key)] = _fill_in_missing(inputs[key]) 127 | 128 | # # Was this passenger a big tipper? 129 | # taxi_fare = _fill_in_missing(inputs[_FARE_KEY]) 130 | # tips = _fill_in_missing(inputs[_LABEL_KEY]) 131 | # outputs[_transformed_name(_LABEL_KEY)] = tf.where( 132 | # tf.is_nan(taxi_fare), 133 | # tf.cast(tf.zeros_like(taxi_fare), tf.int64), 134 | # # Test if the tip was > 20% of the fare. 135 | # tf.cast( 136 | # tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), 137 | # tf.int64)) 138 | 139 | # return outputs 140 | # Step 4 END -------------------------- 141 | 142 | # Step 5 START -------------------------- 143 | # def _build_estimator(transform_output, 144 | # config, 145 | # hidden_units=None, 146 | # warm_start_from=None): 147 | # """Build an estimator for predicting the tipping behavior of taxi riders. 148 | 149 | # Args: 150 | # transform_output: directory in which the tf-transform model was written 151 | # during the preprocessing step. 152 | # config: tf.contrib.learn.RunConfig defining the runtime environment for 153 | # the estimator (including model_dir). 154 | # hidden_units: [int], the layer sizes of the DNN (input layer first) 155 | # warm_start_from: Optional directory to warm start from. 156 | 157 | # Returns: 158 | # A dict of the following: 159 | # - estimator: The estimator that will be used for training and eval. 160 | # - train_spec: Spec for training. 161 | # - eval_spec: Spec for eval. 162 | # - eval_input_receiver_fn: Input function for eval. 
163 | # """ 164 | # metadata_dir = os.path.join(transform_output, 165 | # transform_fn_io.TRANSFORMED_METADATA_DIR) 166 | # transformed_metadata = metadata_io.read_metadata(metadata_dir) 167 | # transformed_feature_spec = transformed_metadata.schema.as_feature_spec() 168 | 169 | # transformed_feature_spec.pop(_transformed_name(_LABEL_KEY)) 170 | 171 | # real_valued_columns = [ 172 | # tf.feature_column.numeric_column(key, shape=()) 173 | # for key in _transformed_names(_DENSE_FLOAT_FEATURE_KEYS) 174 | # ] 175 | # categorical_columns = [ 176 | # tf.feature_column.categorical_column_with_identity( 177 | # key, num_buckets=_VOCAB_SIZE + _OOV_SIZE, default_value=0) 178 | # for key in _transformed_names(_VOCAB_FEATURE_KEYS) 179 | # ] 180 | # categorical_columns += [ 181 | # tf.feature_column.categorical_column_with_identity( 182 | # key, num_buckets=_FEATURE_BUCKET_COUNT, default_value=0) 183 | # for key in _transformed_names(_BUCKET_FEATURE_KEYS) 184 | # ] 185 | # categorical_columns += [ 186 | # tf.feature_column.categorical_column_with_identity( # pylint: disable=g-complex-comprehension 187 | # key, 188 | # num_buckets=num_buckets, 189 | # default_value=0) for key, num_buckets in zip( 190 | # _transformed_names(_CATEGORICAL_FEATURE_KEYS), 191 | # _MAX_CATEGORICAL_FEATURE_VALUES) 192 | # ] 193 | # return tf.estimator.DNNLinearCombinedClassifier( 194 | # config=config, 195 | # linear_feature_columns=categorical_columns, 196 | # dnn_feature_columns=real_valued_columns, 197 | # dnn_hidden_units=hidden_units or [100, 70, 50, 25], 198 | # warm_start_from=warm_start_from) 199 | 200 | 201 | # def _example_serving_receiver_fn(transform_output, schema): 202 | # """Build the serving in inputs. 203 | 204 | # Args: 205 | # transform_output: directory in which the tf-transform model was written 206 | # during the preprocessing step. 207 | # schema: the schema of the input data. 208 | 209 | # Returns: 210 | # Tensorflow graph which parses examples, applying tf-transform to them. 211 | # """ 212 | # raw_feature_spec = _get_raw_feature_spec(schema) 213 | # raw_feature_spec.pop(_LABEL_KEY) 214 | 215 | # raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn( 216 | # raw_feature_spec, default_batch_size=None) 217 | # serving_input_receiver = raw_input_fn() 218 | 219 | # _, transformed_features = ( 220 | # saved_transform_io.partially_apply_saved_transform( 221 | # os.path.join(transform_output, transform_fn_io.TRANSFORM_FN_DIR), 222 | # serving_input_receiver.features)) 223 | 224 | # return tf.estimator.export.ServingInputReceiver( 225 | # transformed_features, serving_input_receiver.receiver_tensors) 226 | 227 | 228 | # def _eval_input_receiver_fn(transform_output, schema): 229 | # """Build everything needed for the tf-model-analysis to run the model. 230 | 231 | # Args: 232 | # transform_output: directory in which the tf-transform model was written 233 | # during the preprocessing step. 234 | # schema: the schema of the input data. 235 | 236 | # Returns: 237 | # EvalInputReceiver function, which contains: 238 | # - Tensorflow graph which parses raw untransformed features, applies the 239 | # tf-transform preprocessing operators. 240 | # - Set of raw, untransformed features. 241 | # - Label against which predictions will be compared. 242 | # """ 243 | # # Notice that the inputs are raw features, not transformed features here. 
244 | # raw_feature_spec = _get_raw_feature_spec(schema) 245 | 246 | # serialized_tf_example = tf.placeholder( 247 | # dtype=tf.string, shape=[None], name='input_example_tensor') 248 | 249 | # # Add a parse_example operator to the tensorflow graph, which will parse 250 | # # raw, untransformed, tf examples. 251 | # features = tf.parse_example(serialized_tf_example, raw_feature_spec) 252 | 253 | # # Now that we have our raw examples, process them through the tf-transform 254 | # # function computed during the preprocessing step. 255 | # _, transformed_features = ( 256 | # saved_transform_io.partially_apply_saved_transform( 257 | # os.path.join(transform_output, transform_fn_io.TRANSFORM_FN_DIR), 258 | # features)) 259 | 260 | # # The key name MUST be 'examples'. 261 | # receiver_tensors = {'examples': serialized_tf_example} 262 | 263 | # # NOTE: Model is driven by transformed features (since training works on the 264 | # # materialized output of TFT, but slicing will happen on raw features. 265 | # features.update(transformed_features) 266 | 267 | # return tfma.export.EvalInputReceiver( 268 | # features=features, 269 | # receiver_tensors=receiver_tensors, 270 | # labels=transformed_features[_transformed_name(_LABEL_KEY)]) 271 | 272 | 273 | # def _input_fn(filenames, transform_output, batch_size=200): 274 | # """Generates features and labels for training or evaluation. 275 | 276 | # Args: 277 | # filenames: [str] list of CSV files to read data from. 278 | # transform_output: directory in which the tf-transform model was written 279 | # during the preprocessing step. 280 | # batch_size: int First dimension size of the Tensors returned by input_fn 281 | 282 | # Returns: 283 | # A (features, indices) tuple where features is a dictionary of 284 | # Tensors, and indices is a single Tensor of label indices. 285 | # """ 286 | # metadata_dir = os.path.join(transform_output, 287 | # transform_fn_io.TRANSFORMED_METADATA_DIR) 288 | # transformed_metadata = metadata_io.read_metadata(metadata_dir) 289 | # transformed_feature_spec = transformed_metadata.schema.as_feature_spec() 290 | 291 | # transformed_features = tf.contrib.learn.io.read_batch_features( 292 | # filenames, batch_size, transformed_feature_spec, reader=_gzip_reader_fn) 293 | 294 | # # We pop the label because we do not want to use it as a feature while we're 295 | # # training. 296 | # return transformed_features, transformed_features.pop( 297 | # _transformed_name(_LABEL_KEY)) 298 | 299 | 300 | # # TFX will call this function 301 | # def trainer_fn(hparams, schema): 302 | # """Build the estimator using the high level API. 303 | 304 | # Args: 305 | # hparams: Holds hyperparameters used to train the model as name/value pairs 306 | # schema: Holds the schema of the training examples. 307 | 308 | # Returns: 309 | # A dict of the following: 310 | # - estimator: The estimator that will be used for training and eval. 311 | # - train_spec: Spec for training. 312 | # - eval_spec: Spec for eval. 313 | # - eval_input_receiver_fn: Input function for eval. 
314 | # """ 315 | # # Number of nodes in the first layer of the DNN 316 | # first_dnn_layer_size = 100 317 | # num_dnn_layers = 4 318 | # dnn_decay_factor = 0.7 319 | 320 | # train_batch_size = 40 321 | # eval_batch_size = 40 322 | 323 | # train_input_fn = lambda: _input_fn( # pylint: disable=g-long-lambda 324 | # hparams.train_files, 325 | # hparams.transform_output, 326 | # batch_size=train_batch_size) 327 | 328 | # eval_input_fn = lambda: _input_fn( # pylint: disable=g-long-lambda 329 | # hparams.eval_files, 330 | # hparams.transform_output, 331 | # batch_size=eval_batch_size) 332 | 333 | # train_spec = tf.estimator.TrainSpec( # pylint: disable=g-long-lambda 334 | # train_input_fn, 335 | # max_steps=hparams.train_steps) 336 | 337 | # serving_receiver_fn = lambda: _example_serving_receiver_fn( # pylint: disable=g-long-lambda 338 | # hparams.transform_output, schema) 339 | 340 | # exporter = tf.estimator.FinalExporter('chicago-taxi', serving_receiver_fn) 341 | # eval_spec = tf.estimator.EvalSpec( 342 | # eval_input_fn, 343 | # steps=hparams.eval_steps, 344 | # exporters=[exporter], 345 | # name='chicago-taxi-eval') 346 | 347 | # run_config = tf.estimator.RunConfig( 348 | # save_checkpoints_steps=999, keep_checkpoint_max=1) 349 | 350 | # run_config = run_config.replace(model_dir=hparams.serving_model_dir) 351 | 352 | # estimator = _build_estimator( 353 | # transform_output=hparams.transform_output, 354 | 355 | # # Construct layers sizes with exponetial decay 356 | # hidden_units=[ 357 | # max(2, int(first_dnn_layer_size * dnn_decay_factor**i)) 358 | # for i in range(num_dnn_layers) 359 | # ], 360 | # config=run_config, 361 | # warm_start_from=hparams.warm_start_from) 362 | 363 | # # Create an input receiver for TFMA processing 364 | # receiver_fn = lambda: _eval_input_receiver_fn( # pylint: disable=g-long-lambda 365 | # hparams.transform_output, schema) 366 | 367 | # return { 368 | # 'estimator': estimator, 369 | # 'train_spec': train_spec, 370 | # 'eval_spec': eval_spec, 371 | # 'eval_input_receiver_fn': receiver_fn 372 | # } 373 | # Step 5 END -------------------------- 374 | -------------------------------------------------------------------------------- /extra_tfx_example/setup/chicago_data/taxi_pipeline_simple.py: -------------------------------------------------------------------------------- 1 | """Chicago taxi example using TFX.""" 2 | 3 | from __future__ import absolute_import, division, print_function 4 | 5 | import datetime 6 | import logging 7 | import os 8 | 9 | from tfx.components.evaluator.component import Evaluator 10 | from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen 11 | from tfx.components.example_validator.component import ExampleValidator 12 | from tfx.components.model_validator.component import ModelValidator 13 | from tfx.components.pusher.component import Pusher 14 | from tfx.components.schema_gen.component import SchemaGen 15 | from tfx.components.statistics_gen.component import StatisticsGen 16 | from tfx.components.trainer.component import Trainer 17 | from tfx.components.transform.component import Transform 18 | from tfx.orchestration.airflow.airflow_runner import AirflowDAGRunner 19 | from tfx.orchestration.pipeline import PipelineDecorator 20 | from tfx.proto import evaluator_pb2, pusher_pb2, trainer_pb2 21 | from tfx.utils.dsl_utils import csv_input 22 | 23 | # This example assumes that the taxi data is stored in ~/taxi/data and the 24 | # taxi utility function is in ~/taxi. Feel free to customize this as needed. 
25 | _taxi_root = os.path.join(os.environ["HOME"], "taxi") 26 | _data_root = os.path.join(_taxi_root, "data/simple") 27 | # Python module file to inject customized logic into the TFX components. The 28 | # Transform and Trainer both require user-defined functions to run successfully. 29 | _taxi_module_file = os.path.join(_taxi_root, "taxi_utils.py") 30 | # Path which can be listened to by the model server. Pusher will output the 31 | # trained model here. 32 | _serving_model_dir = os.path.join(_taxi_root, "serving_model/taxi_simple") 33 | 34 | # Directory and data locations. This example assumes all of the chicago taxi 35 | # example code and metadata library is relative to $HOME, but you can store 36 | # these files anywhere on your local filesystem. 37 | _tfx_root = os.path.join(os.environ["HOME"], "tfx") 38 | _pipeline_root = os.path.join(_tfx_root, "pipelines") 39 | _metadata_db_root = os.path.join(_tfx_root, "metadata") 40 | _log_root = os.path.join(_tfx_root, "logs") 41 | 42 | # Airflow-specific configs; these will be passed directly to airflow 43 | _airflow_config = { 44 | "schedule_interval": None, 45 | "start_date": datetime.datetime(2019, 1, 1), 46 | } 47 | 48 | # Logging overrides 49 | logger_overrides = {"log_root": _log_root, "log_level": logging.INFO} 50 | 51 | 52 | # TODO(b/124066911): Centralize tfx related config into one place. 53 | # TODO(zhitaoli): Remove PipelineDecorator after 0.13.0. 54 | @PipelineDecorator( 55 | pipeline_name="chicago_taxi_simple", 56 | enable_cache=True, 57 | metadata_db_root=_metadata_db_root, 58 | additional_pipeline_args={"logger_args": logger_overrides}, 59 | pipeline_root=_pipeline_root, 60 | ) 61 | def _create_pipeline(): 62 | """Implements the chicago taxi pipeline with TFX.""" 63 | examples = csv_input(_data_root) 64 | 65 | # Brings data into the pipeline or otherwise joins/converts training data. 66 | example_gen = CsvExampleGen(input_base=examples) 67 | 68 | # Computes statistics over data for visualization and example validation. 69 | statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples) 70 | 71 | # Generates schema based on statistics files. 72 | infer_schema = SchemaGen(stats=statistics_gen.outputs.output) 73 | 74 | # Performs anomaly detection based on statistics and data schema. 75 | validate_stats = ExampleValidator( 76 | stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output 77 | ) 78 | 79 | # Performs transformations and feature engineering in training and serving. 80 | transform = Transform( 81 | input_data=example_gen.outputs.examples, 82 | schema=infer_schema.outputs.output, 83 | module_file=_taxi_module_file, 84 | ) 85 | 86 | # Uses user-provided Python function that implements a model using TF-Learn. 87 | trainer = Trainer( 88 | module_file=_taxi_module_file, 89 | transformed_examples=transform.outputs.transformed_examples, 90 | schema=infer_schema.outputs.output, 91 | transform_output=transform.outputs.transform_output, 92 | train_args=trainer_pb2.TrainArgs(num_steps=10000), 93 | eval_args=trainer_pb2.EvalArgs(num_steps=5000), 94 | ) 95 | 96 | # Uses TFMA to compute a evaluation statistics over features of a model. 
97 | model_analyzer = Evaluator( 98 | examples=example_gen.outputs.examples, 99 | model_exports=trainer.outputs.output, 100 | feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec( 101 | specs=[ 102 | evaluator_pb2.SingleSlicingSpec(column_for_slicing=["trip_start_hour"]) 103 | ] 104 | ), 105 | ) 106 | 107 | # Performs quality validation of a candidate model (compared to a baseline). 108 | model_validator = ModelValidator( 109 | examples=example_gen.outputs.examples, model=trainer.outputs.output 110 | ) 111 | 112 | # Checks whether the model passed the validation steps and pushes the model 113 | # to a file destination if check passed. 114 | pusher = Pusher( 115 | model_export=trainer.outputs.output, 116 | model_blessing=model_validator.outputs.blessing, 117 | push_destination=pusher_pb2.PushDestination( 118 | filesystem=pusher_pb2.PushDestination.Filesystem( 119 | base_directory=_serving_model_dir 120 | ) 121 | ), 122 | ) 123 | 124 | return [ 125 | example_gen, 126 | statistics_gen, 127 | infer_schema, 128 | validate_stats, 129 | transform, 130 | trainer, 131 | model_analyzer, 132 | model_validator, 133 | pusher, 134 | ] 135 | 136 | 137 | pipeline = AirflowDAGRunner(_airflow_config).run(_create_pipeline()) 138 | -------------------------------------------------------------------------------- /extra_tfx_example/setup/chicago_data/taxi_utils.py: -------------------------------------------------------------------------------- 1 | """Python source file include taxi pipeline functions and necesasry utils. 2 | 3 | For a TFX pipeline to successfully run, a preprocessing_fn and a 4 | _build_estimator function needs to be provided. This file contains both. 5 | 6 | This file is equivalent to examples/chicago_taxi/trainer/model.py and 7 | examples/chicago_taxi/preprocess.py. 8 | """ 9 | 10 | from __future__ import division, print_function 11 | 12 | import os 13 | 14 | import tensorflow as tf 15 | import tensorflow_model_analysis as tfma 16 | import tensorflow_transform as tft 17 | from tensorflow_transform.beam.tft_beam_io import transform_fn_io 18 | from tensorflow_transform.saved import saved_transform_io 19 | from tensorflow_transform.tf_metadata import metadata_io, schema_utils 20 | 21 | # Categorical features are assumed to each have a maximum value in the dataset. 22 | _MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12] 23 | 24 | _CATEGORICAL_FEATURE_KEYS = [ 25 | "trip_start_hour", 26 | "trip_start_day", 27 | "trip_start_month", 28 | "pickup_census_tract", 29 | "dropoff_census_tract", 30 | "pickup_community_area", 31 | "dropoff_community_area", 32 | ] 33 | 34 | _DENSE_FLOAT_FEATURE_KEYS = ["trip_miles", "fare", "trip_seconds"] 35 | 36 | # Number of buckets used by tf.transform for encoding each feature. 37 | _FEATURE_BUCKET_COUNT = 10 38 | 39 | _BUCKET_FEATURE_KEYS = [ 40 | "pickup_latitude", 41 | "pickup_longitude", 42 | "dropoff_latitude", 43 | "dropoff_longitude", 44 | ] 45 | 46 | # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform 47 | _VOCAB_SIZE = 1000 48 | 49 | # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed. 
50 | _OOV_SIZE = 10 51 | 52 | _VOCAB_FEATURE_KEYS = ["payment_type", "company"] 53 | 54 | # Keys 55 | _LABEL_KEY = "tips" 56 | _FARE_KEY = "fare" 57 | 58 | 59 | def _transformed_name(key): 60 | return key + "_xf" 61 | 62 | 63 | def _transformed_names(keys): 64 | return [_transformed_name(key) for key in keys] 65 | 66 | 67 | # Tf.Transform considers these features as "raw" 68 | def _get_raw_feature_spec(schema): 69 | return schema_utils.schema_as_feature_spec(schema).feature_spec 70 | 71 | 72 | def _gzip_reader_fn(): 73 | """Small utility returning a record reader that can read gzip'ed files.""" 74 | return tf.TFRecordReader( 75 | options=tf.python_io.TFRecordOptions( 76 | compression_type=tf.python_io.TFRecordCompressionType.GZIP 77 | ) 78 | ) 79 | 80 | 81 | def _fill_in_missing(x): 82 | """Replace missing values in a SparseTensor. 83 | 84 | Fills in missing values of `x` with '' or 0, and converts to a dense tensor. 85 | 86 | Args: 87 | x: A `SparseTensor` of rank 2. Its dense shape should have size at most 1 88 | in the second dimension. 89 | 90 | Returns: 91 | A rank 1 tensor where missing values of `x` have been filled in. 92 | """ 93 | default_value = "" if x.dtype == tf.string else 0 94 | return tf.squeeze( 95 | tf.sparse.to_dense( 96 | tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]), default_value 97 | ), 98 | axis=1, 99 | ) 100 | 101 | 102 | def preprocessing_fn(inputs): 103 | """tf.transform's callback function for preprocessing inputs. 104 | 105 | Args: 106 | inputs: map from feature keys to raw not-yet-transformed features. 107 | 108 | Returns: 109 | Map from string feature key to transformed feature operations. 110 | """ 111 | outputs = {} 112 | for key in _DENSE_FLOAT_FEATURE_KEYS: 113 | # Preserve this feature as a dense float, setting nan's to the mean. 114 | outputs[_transformed_name(key)] = tft.scale_to_z_score( 115 | _fill_in_missing(inputs[key]) 116 | ) 117 | 118 | for key in _VOCAB_FEATURE_KEYS: 119 | # Build a vocabulary for this feature. 120 | outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary( 121 | _fill_in_missing(inputs[key]), top_k=_VOCAB_SIZE, num_oov_buckets=_OOV_SIZE 122 | ) 123 | 124 | for key in _BUCKET_FEATURE_KEYS: 125 | outputs[_transformed_name(key)] = tft.bucketize( 126 | _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT 127 | ) 128 | 129 | for key in _CATEGORICAL_FEATURE_KEYS: 130 | outputs[_transformed_name(key)] = _fill_in_missing(inputs[key]) 131 | 132 | # Was this passenger a big tipper? 133 | taxi_fare = _fill_in_missing(inputs[_FARE_KEY]) 134 | tips = _fill_in_missing(inputs[_LABEL_KEY]) 135 | outputs[_transformed_name(_LABEL_KEY)] = tf.where( 136 | tf.is_nan(taxi_fare), 137 | tf.cast(tf.zeros_like(taxi_fare), tf.int64), 138 | # Test if the tip was > 20% of the fare. 139 | tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64), 140 | ) 141 | 142 | return outputs 143 | 144 | 145 | def _build_estimator(config, hidden_units=None, warm_start_from=None): 146 | """Build an estimator for predicting the tipping behavior of taxi riders. 147 | 148 | Args: 149 | config: tf.contrib.learn.RunConfig defining the runtime environment for the 150 | estimator (including model_dir). 151 | hidden_units: [int], the layer sizes of the DNN (input layer first) 152 | warm_start_from: Optional directory to warm start from. 153 | 154 | Returns: 155 | A dict of the following: 156 | - estimator: The estimator that will be used for training and eval. 157 | - train_spec: Spec for training. 158 | - eval_spec: Spec for eval. 
159 | - eval_input_receiver_fn: Input function for eval. 160 | """ 161 | real_valued_columns = [ 162 | tf.feature_column.numeric_column(key, shape=()) 163 | for key in _transformed_names(_DENSE_FLOAT_FEATURE_KEYS) 164 | ] 165 | categorical_columns = [ 166 | tf.feature_column.categorical_column_with_identity( 167 | key, num_buckets=_VOCAB_SIZE + _OOV_SIZE, default_value=0 168 | ) 169 | for key in _transformed_names(_VOCAB_FEATURE_KEYS) 170 | ] 171 | categorical_columns += [ 172 | tf.feature_column.categorical_column_with_identity( 173 | key, num_buckets=_FEATURE_BUCKET_COUNT, default_value=0 174 | ) 175 | for key in _transformed_names(_BUCKET_FEATURE_KEYS) 176 | ] 177 | categorical_columns += [ 178 | tf.feature_column.categorical_column_with_identity( # pylint: disable=g-complex-comprehension 179 | key, num_buckets=num_buckets, default_value=0 180 | ) 181 | for key, num_buckets in zip( 182 | _transformed_names(_CATEGORICAL_FEATURE_KEYS), 183 | _MAX_CATEGORICAL_FEATURE_VALUES, 184 | ) 185 | ] 186 | return tf.estimator.DNNLinearCombinedClassifier( 187 | config=config, 188 | linear_feature_columns=categorical_columns, 189 | dnn_feature_columns=real_valued_columns, 190 | dnn_hidden_units=hidden_units or [100, 70, 50, 25], 191 | warm_start_from=warm_start_from, 192 | ) 193 | 194 | 195 | def _example_serving_receiver_fn(transform_output, schema): 196 | """Build the serving in inputs. 197 | 198 | Args: 199 | transform_output: directory in which the tf-transform model was written 200 | during the preprocessing step. 201 | schema: the schema of the input data. 202 | 203 | Returns: 204 | Tensorflow graph which parses examples, applying tf-transform to them. 205 | """ 206 | raw_feature_spec = _get_raw_feature_spec(schema) 207 | raw_feature_spec.pop(_LABEL_KEY) 208 | 209 | raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn( 210 | raw_feature_spec, default_batch_size=None 211 | ) 212 | serving_input_receiver = raw_input_fn() 213 | 214 | _, transformed_features = saved_transform_io.partially_apply_saved_transform( 215 | os.path.join(transform_output, transform_fn_io.TRANSFORM_FN_DIR), 216 | serving_input_receiver.features, 217 | ) 218 | 219 | return tf.estimator.export.ServingInputReceiver( 220 | transformed_features, serving_input_receiver.receiver_tensors 221 | ) 222 | 223 | 224 | def _eval_input_receiver_fn(transform_output, schema): 225 | """Build everything needed for the tf-model-analysis to run the model. 226 | 227 | Args: 228 | transform_output: directory in which the tf-transform model was written 229 | during the preprocessing step. 230 | schema: the schema of the input data. 231 | 232 | Returns: 233 | EvalInputReceiver function, which contains: 234 | - Tensorflow graph which parses raw untransformed features, applies the 235 | tf-transform preprocessing operators. 236 | - Set of raw, untransformed features. 237 | - Label against which predictions will be compared. 238 | """ 239 | # Notice that the inputs are raw features, not transformed features here. 240 | raw_feature_spec = _get_raw_feature_spec(schema) 241 | 242 | serialized_tf_example = tf.placeholder( 243 | dtype=tf.string, shape=[None], name="input_example_tensor" 244 | ) 245 | 246 | # Add a parse_example operator to the tensorflow graph, which will parse 247 | # raw, untransformed, tf examples. 248 | features = tf.parse_example(serialized_tf_example, raw_feature_spec) 249 | 250 | # Now that we have our raw examples, process them through the tf-transform 251 | # function computed during the preprocessing step. 
252 | _, transformed_features = saved_transform_io.partially_apply_saved_transform( 253 | os.path.join(transform_output, transform_fn_io.TRANSFORM_FN_DIR), features 254 | ) 255 | 256 | # The key name MUST be 'examples'. 257 | receiver_tensors = {"examples": serialized_tf_example} 258 | 259 | # NOTE: Model is driven by transformed features (since training works on the 260 | # materialized output of TFT, but slicing will happen on raw features. 261 | features.update(transformed_features) 262 | 263 | return tfma.export.EvalInputReceiver( 264 | features=features, 265 | receiver_tensors=receiver_tensors, 266 | labels=transformed_features[_transformed_name(_LABEL_KEY)], 267 | ) 268 | 269 | 270 | def _input_fn(filenames, transform_output, batch_size=200): 271 | """Generates features and labels for training or evaluation. 272 | 273 | Args: 274 | filenames: [str] list of CSV files to read data from. 275 | transform_output: directory in which the tf-transform model was written 276 | during the preprocessing step. 277 | batch_size: int First dimension size of the Tensors returned by input_fn 278 | 279 | Returns: 280 | A (features, indices) tuple where features is a dictionary of 281 | Tensors, and indices is a single Tensor of label indices. 282 | """ 283 | metadata_dir = os.path.join( 284 | transform_output, transform_fn_io.TRANSFORMED_METADATA_DIR 285 | ) 286 | transformed_metadata = metadata_io.read_metadata(metadata_dir) 287 | transformed_feature_spec = transformed_metadata.schema.as_feature_spec() 288 | 289 | transformed_features = tf.contrib.learn.io.read_batch_features( 290 | filenames, batch_size, transformed_feature_spec, reader=_gzip_reader_fn 291 | ) 292 | 293 | # We pop the label because we do not want to use it as a feature while we're 294 | # training. 295 | return transformed_features, transformed_features.pop(_transformed_name(_LABEL_KEY)) 296 | 297 | 298 | # TFX will call this function 299 | def trainer_fn(hparams, schema): 300 | """Build the estimator using the high level API. 301 | 302 | Args: 303 | hparams: Holds hyperparameters used to train the model as name/value pairs. 304 | schema: Holds the schema of the training examples. 305 | 306 | Returns: 307 | A dict of the following: 308 | - estimator: The estimator that will be used for training and eval. 309 | - train_spec: Spec for training. 310 | - eval_spec: Spec for eval. 311 | - eval_input_receiver_fn: Input function for eval. 
312 | """ 313 | # Number of nodes in the first layer of the DNN 314 | first_dnn_layer_size = 100 315 | num_dnn_layers = 4 316 | dnn_decay_factor = 0.7 317 | 318 | train_batch_size = 40 319 | eval_batch_size = 40 320 | 321 | train_input_fn = lambda: _input_fn( # pylint: disable=g-long-lambda 322 | hparams.train_files, hparams.transform_output, batch_size=train_batch_size 323 | ) 324 | 325 | eval_input_fn = lambda: _input_fn( # pylint: disable=g-long-lambda 326 | hparams.eval_files, hparams.transform_output, batch_size=eval_batch_size 327 | ) 328 | 329 | train_spec = tf.estimator.TrainSpec( # pylint: disable=g-long-lambda 330 | train_input_fn, max_steps=hparams.train_steps 331 | ) 332 | 333 | serving_receiver_fn = lambda: _example_serving_receiver_fn( # pylint: disable=g-long-lambda 334 | hparams.transform_output, schema 335 | ) 336 | 337 | exporter = tf.estimator.FinalExporter("chicago-taxi", serving_receiver_fn) 338 | eval_spec = tf.estimator.EvalSpec( 339 | eval_input_fn, 340 | steps=hparams.eval_steps, 341 | exporters=[exporter], 342 | name="chicago-taxi-eval", 343 | ) 344 | 345 | run_config = tf.estimator.RunConfig( 346 | save_checkpoints_steps=999, keep_checkpoint_max=1 347 | ) 348 | 349 | run_config = run_config.replace(model_dir=hparams.serving_model_dir) 350 | 351 | estimator = _build_estimator( 352 | # Construct layers sizes with exponetial decay 353 | hidden_units=[ 354 | max(2, int(first_dnn_layer_size * dnn_decay_factor ** i)) 355 | for i in range(num_dnn_layers) 356 | ], 357 | config=run_config, 358 | warm_start_from=hparams.warm_start_from, 359 | ) 360 | 361 | # Create an input receiver for TFMA processing 362 | receiver_fn = lambda: _eval_input_receiver_fn( # pylint: disable=g-long-lambda 363 | hparams.transform_output, schema 364 | ) 365 | 366 | return { 367 | "estimator": estimator, 368 | "train_spec": train_spec, 369 | "eval_spec": eval_spec, 370 | "eval_input_receiver_fn": receiver_fn, 371 | } 372 | -------------------------------------------------------------------------------- /extra_tfx_example/setup/reset_env.sh: -------------------------------------------------------------------------------- 1 | # Use this to completely nuke the pypi libraries that TFX requires 2 | # and start with a 'clean' environment. This will uninstall TF/TFX 3 | # libraries and airflow libraries. 4 | # 5 | # It will not delete the Airflow install itself. You'll want to delete 6 | # ~/airflow on your own. 
7 | # 8 | 9 | 10 | GREEN=$(tput setaf 2) 11 | NORMAL=$(tput sgr0) 12 | 13 | printf "${GREEN}Resetting TFX workshop${NORMAL}\n\n" 14 | 15 | pip uninstall tensorflow 16 | pip uninstall tfx 17 | pip uninstall tensorflow-model-analysis 18 | pip uninstall tensorflow-data-validation 19 | pip uninstall tensorflow-metadata 20 | pip uninstall tensorflow-transform 21 | pip uninstall apache-airflow 22 | 23 | printf "\n\n${GREEN}TFX workshop has been reset${NORMAL}\n" 24 | printf "${GREEN}Remember to delete ~/airflow${NORMAL}\n" 25 | 26 | -------------------------------------------------------------------------------- /extra_tfx_example/setup/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set up the environment for the tutorial 4 | 5 | 6 | GREEN=$(tput setaf 2) 7 | NORMAL=$(tput sgr0) 8 | 9 | printf "${GREEN}Installing TFX workshop${NORMAL}\n\n" 10 | 11 | printf "${GREEN}Refreshing setuptools to avoid _NamespacePath issues${NORMAL}\n" 12 | pip uninstall setuptools -y && pip install setuptools 13 | 14 | printf "${GREEN}Installing httplib2 for Beam compatibility${NORMAL}\n" 15 | pip install httplib2==0.12.0 16 | 17 | printf "${GREEN}Installing pendulum to avoid problem with tzlocal${NORMAL}\n" 18 | pip install pendulum==1.4.4 19 | 20 | # TODO: Use range or pin for pip installs. 21 | printf "${GREEN}Installing TensorFlow${NORMAL}\n" 22 | pip install tensorflow==1.14.0 23 | 24 | printf "${GREEN}Installing TFX${NORMAL}\n" 25 | pip install tfx==0.14.0rc1 26 | 27 | printf "${GREEN}Installing Google API Client${NORMAL}\n" 28 | pip install google-api-python-client 29 | 30 | printf "${GREEN}Installing required Jupyter version${NORMAL}\n" 31 | pip install ipykernel 32 | ipython kernel install --user --name=tfx 33 | pip install --upgrade notebook==5.7.8 34 | jupyter nbextension install --py --symlink --sys-prefix tensorflow_model_analysis 35 | jupyter nbextension enable --py --sys-prefix tensorflow_model_analysis 36 | 37 | printf "${GREEN}Installing packages used by the notebooks${NORMAL}\n" 38 | pip install matplotlib 39 | pip install papermill 40 | pip install pandas 41 | pip install networkx 42 | 43 | # # Docker images 44 | printf "${GREEN}Installing docker${NORMAL}\n" 45 | pip install docker 46 | 47 | # Airflow 48 | # Set this to avoid the GPL version; no functionality difference either way 49 | printf "${GREEN}Preparing environment for Airflow${NORMAL}\n" 50 | export SLUGIFY_USES_TEXT_UNIDECODE=yes 51 | printf "${GREEN}Installing Airflow${NORMAL}\n" 52 | 53 | # TODO(b/136777165): Remove pinned version of Flask and Werkzeug 54 | # after newer version of Airflow: see AIRFLOW-4900. 
55 | pip install apache-airflow==1.10.3 Flask==1.0.4 Werkzeug==0.14.1 56 | printf "${GREEN}Initializing Airflow database${NORMAL}\n" 57 | airflow initdb 58 | 59 | # Adjust configuration 60 | printf "${GREEN}Adjusting Airflow config${NORMAL}\n" 61 | sed -i'.orig' 's/dag_dir_list_interval = 300/dag_dir_list_interval = 1/g' ~/airflow/airflow.cfg 62 | sed -i'.orig' 's/job_heartbeat_sec = 5/job_heartbeat_sec = 1/g' ~/airflow/airflow.cfg 63 | sed -i'.orig' 's/scheduler_heartbeat_sec = 5/scheduler_heartbeat_sec = 1/g' ~/airflow/airflow.cfg 64 | sed -i'.orig' 's/dag_default_view = tree/dag_default_view = graph/g' ~/airflow/airflow.cfg 65 | # sed -i'.orig' 's/load_examples = True/load_examples = False/g' ~/airflow/airflow.cfg 66 | sed -i'.orig' 's/max_threads = 2/max_threads = 1/g' ~/airflow/airflow.cfg 67 | 68 | printf "${GREEN}Refreshing Airflow to pick up new config${NORMAL}\n" 69 | airflow resetdb --yes 70 | airflow initdb 71 | 72 | # Copy Dag to ~/airflow/dags 73 | mkdir -p ~/airflow/dags 74 | cp ./dags/taxi_pipeline.py ~/airflow/dags/ 75 | cp ./dags/taxi_utils.py ~/airflow/dags/ 76 | 77 | # Copy the simple pipeline example and adjust for user's environment 78 | cp ./chicago_data/taxi_pipeline_simple.py ~/airflow/dags/taxi_pipeline_solution.py 79 | cp ./chicago_data/taxi_utils.py ~/airflow/dags/taxi_utils_solution.py 80 | sed -i'.orig' 's/os.environ\["HOME"\], "taxi"/os.environ\["HOME"\], "airflow"/g' ~/airflow/dags/taxi_pipeline_solution.py 81 | sed -i'.orig' 's|_taxi_root, "data/simple"|_taxi_root, "data/taxi_data"|g' ~/airflow/dags/taxi_pipeline_solution.py 82 | sed -i'.orig' "s/taxi_utils.py/dags\/taxi_utils_solution.py/g" ~/airflow/dags/taxi_pipeline_solution.py 83 | sed -i'.orig' 's/os.environ\["HOME"\], "tfx"/_taxi_root, "tfx"/g' ~/airflow/dags/taxi_pipeline_solution.py 84 | sed -i'.orig' "s/chicago_taxi_simple/taxi_solution/g" ~/airflow/dags/taxi_pipeline_solution.py 85 | 86 | # Copy data to ~/airflow/data 87 | # TODO(): Combine Chicago Taxi data files 88 | cp -R data ~/airflow 89 | 90 | printf "\n${GREEN}TFX workshop installed${NORMAL}\n" 91 | 92 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | rows 3 | papermill 4 | jupyterlab 5 | apache-airflow 6 | --------------------------------------------------------------------------------