├── .gitignore ├── LICENSE ├── README.md ├── census_analysis ├── dags │ ├── census_pipeline.py │ └── snippets.py ├── data │ └── raw │ │ ├── acs_data.csv.gz │ │ └── acs_data.dta.gz └── src │ ├── analysis.py │ ├── clean_data.py │ ├── get_data.py │ ├── solutions-Analysis.ipynb │ └── solutions-Data_Prep.ipynb ├── dag_example └── simple_dag.py ├── deployments └── jupyterhub-cluster │ ├── config │ └── config.yaml │ └── image │ └── Dockerfile ├── docs ├── Makefile ├── make.bat └── source │ ├── _build │ └── html │ │ ├── _static │ │ └── uses.png │ │ ├── about.html │ │ ├── airflow-intro.html │ │ ├── first-airflow.html │ │ ├── genindex.html │ │ ├── index.html │ │ ├── objects.inv │ │ ├── pipelines.html │ │ ├── search.html │ │ ├── searchindex.js │ │ └── setup.html │ ├── _static │ ├── 12.png │ ├── 4.jpg │ ├── DAG.png │ ├── GUI.png │ ├── airflow-logo.jpeg │ ├── airflow.png │ ├── architecture.png │ ├── automate.png │ ├── automation1.jpg │ ├── azure.png │ ├── connection.png │ ├── custom.css │ ├── dag-time.png │ ├── datapyramid.png │ ├── gooddata.png │ ├── gooddata1.png │ ├── luigi.png │ ├── mssignin.png │ ├── pipeline1.png │ ├── python.png │ ├── twitter1.png │ ├── twitter2.png │ ├── twitter3.png │ └── uses.png │ ├── _templates │ └── sidebarlogo.html │ ├── about.md │ ├── airflow-intro.md │ ├── conf.py │ ├── first-airflow.md │ ├── index.rst │ ├── pipelines.md │ └── setup.rst ├── environment.yml ├── extra_tfx_example ├── dags │ ├── taxi_pipeline.py │ └── taxi_utils.py ├── data │ └── taxi_data │ │ └── data.csv └── setup │ ├── chicago_data │ ├── taxi_pipeline_simple.py │ └── taxi_utils.py │ ├── reset_env.sh │ └── setup.sh └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .vscode/ 3 | 4 | *.azcli 5 | 6 | deployments/jupyterhub-cluster/secrets/ 7 | 8 | 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | local_settings.py 69 | db.sqlite3 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | source/_build/ 81 | 82 | # PyBuilder 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # celery beat schedule file 103 | celerybeat-schedule 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | \.DS_Store 135 | 136 | docs/\.doctrees/ 137 | 138 | census_analysis/data/interim/ 139 | 140 | census_analysis/data/raw/counties/ 141 | 142 | docs/source/_build 143 | 144 | deployments/* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Tania Allard 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Airflow tutorials with open data sets 2 | 3 | 4 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/trallard/opendata-airflow-tutorial/master) 5 | 6 | 7 | This repo contains a tutorial on Airflow using census data and the Chicago taxi dataset. 8 | 9 | For a detailed overview of the requirements, setup and contents visit the docs URL: 10 | 11 | 12 | _Note_: this is still much in progress and I plan to add more pipelines, use cases and a how-to deploy to Azure Kubernetes services. 
13 | 14 | -------------------------------------------------------------------------------- /census_analysis/dags/census_pipeline.py: -------------------------------------------------------------------------------- 1 | """Airflow dag to demonstrate a simple analysis pipeline""" 2 | 3 | import io 4 | import os 5 | from datetime import datetime 6 | from datetime import timedelta 7 | from pathlib import Path 8 | from zipfile import ZipFile 9 | 10 | import requests 11 | from airflow import DAG 12 | from airflow.operators.email_operator import EmailOperator 13 | from airflow.operators.python_operator import PythonOperator 14 | 15 | _dags_root = os.path.join(os.environ["HOME"], "airflow") 16 | _data_root = os.path.join(_dags_root, "data/raw") 17 | 18 | # Airflow-specific configs; these will be passed directly to airflow 19 | default_args = { 20 | "owner": "admin", 21 | "depends_on_past": False, 22 | "start_date": datetime.now() - timedelta(days=5), 23 | "retries": 1, 24 | "retry_delay": timedelta(minutes=2), 25 | "email_on_failure": False, 26 | } 27 | 28 | # -------------- 29 | # DAG methods 30 | # -------------- 31 | 32 | 33 | def collect_data(): 34 | url = "https://www2.census.gov/geo/tiger/TIGER2018/COUNTY/tl_2018_us_county.zip" 35 | site = requests.get(url) 36 | 37 | z = ZipFile(io.BytesIO(site.content)) 38 | z.extractall(_data_root) 39 | 40 | print("Data collected") 41 | 42 | 43 | # --------------------- 44 | # DAG implementation 45 | # --------------------- 46 | 47 | dag = DAG( 48 | "census_pipeline", 49 | default_args=default_args, 50 | schedule_interval="@daily", 51 | catchup=False, 52 | ) 53 | 54 | 55 | t1 = PythonOperator(task_id="collect_data", python_callable=collect_data(), dag=dag) 56 | -------------------------------------------------------------------------------- /census_analysis/dags/snippets.py: -------------------------------------------------------------------------------- 1 | from airflow.operators.email_operator import EmailOperator 2 | from datetime import timedelta, datetime 3 | 4 | email_task = EmailOperator( 5 | to="some@email.com", 6 | task_id="email_task", 7 | subject="Templated Subject: start_date {{ ds }}", 8 | params={"content1": "random"}, 9 | html_content="Templated Content: content1 - {{ params.content1 }} task_key - {{ task_instance_key_str }} test_mode - {{ test_mode }} task_owner - {{ task.owner}} hostname - {{ ti.hostname }}", 10 | dag=dag, 11 | ) 12 | 13 | # run 14 | 15 | airflow test dag_name email_task 16 | 17 | 18 | # Adding params 19 | 20 | # You can pass `params` dict to DAG object 21 | default_args = { 22 | 'owner': 'airflow', 23 | 'depends_on_past': False, 24 | 'start_date': airflow.utils.dates.days_ago(2), 25 | } 26 | 27 | dag = DAG( 28 | dag_id='airflow_tutorial_2', 29 | default_args=default_args, 30 | schedule_interval=None, 31 | params={ 32 | "param1": "value1", 33 | "param2": "value2" 34 | } 35 | ) 36 | 37 | bash = BashOperator( 38 | task_id='bash', 39 | bash_command='echo {{ params.param1 }}', # Output: value1 40 | dag=dag 41 | ) 42 | 43 | 44 | # accessing sensitive data in connections 45 | # install pip install apache-airflow[crypto] 46 | 47 | from airflow.hooks.base_hook import BaseHook 48 | slack_token = BaseHook.get_connection('slack').password 49 | 50 | 51 | # accesing variables 52 | from airflow.models import Variable 53 | 54 | # Common (Not-so-nice way) 55 | # 3 DB connections when the file is parsed 56 | var1 = Variable.get("var1") 57 | var2 = Variable.get("var2") 58 | var3 = Variable.get("var3") 59 | 60 | # Recommended Way 61 | # Just 
1 Database call 62 | dag_config = Variable.get("dag1_config", deserialize_json=True) 63 | dag_config["var1"] 64 | dag_config["var2"] 65 | dag_config["var3"] 66 | 67 | # You can directly use it Templated arguments {{ var.json.my_var.path }} 68 | bash_task = BashOperator( 69 | task_id="bash_task", 70 | bash_command='{{ var.json.dag1_config.var1 }} ', 71 | dag=dag, 72 | ) 73 | 74 | # macros reference 75 | 76 | # https://airflow.apache.org/macros.html 77 | 78 | { 79 | 'dag': task.dag, 80 | 'ds': ds, 81 | 'next_ds': next_ds, 82 | 'next_ds_nodash': next_ds_nodash, 83 | 'prev_ds': prev_ds, 84 | 'prev_ds_nodash': prev_ds_nodash, 85 | 'ds_nodash': ds_nodash, 86 | 'ts': ts, 87 | 'ts_nodash': ts_nodash, 88 | 'ts_nodash_with_tz': ts_nodash_with_tz, 89 | 'yesterday_ds': yesterday_ds, 90 | 'yesterday_ds_nodash': yesterday_ds_nodash, 91 | 'tomorrow_ds': tomorrow_ds, 92 | 'tomorrow_ds_nodash': tomorrow_ds_nodash, 93 | 'END_DATE': ds, 94 | 'end_date': ds, 95 | 'dag_run': dag_run, 96 | 'run_id': run_id, 97 | 'execution_date': self.execution_date, 98 | 'prev_execution_date': prev_execution_date, 99 | 'next_execution_date': next_execution_date, 100 | 'latest_date': ds, 101 | 'macros': macros, 102 | 'params': params, 103 | 'tables': tables, 104 | 'task': task, 105 | 'task_instance': self, 106 | 'ti': self, 107 | 'task_instance_key_str': ti_key_str, 108 | 'conf': configuration, 109 | 'test_mode': self.test_mode, 110 | 'var': { 111 | 'value': VariableAccessor(), 112 | 'json': VariableJsonAccessor() 113 | }, 114 | 'inlets': task.inlets, 115 | 'outlets': task.outlets, 116 | } 117 | 118 | # dynamic dags 119 | 120 | # Using DummyOperator 121 | a = [] 122 | for i in range(0,10): 123 | a.append(DummyOperator( 124 | task_id='Component'+str(i), 125 | dag=dag)) 126 | if i != 0: 127 | a[i-1] >> a[i] 128 | 129 | # From a List 130 | sample_list = ["val1", "val2", "val3"] 131 | tasks_list = [] 132 | for index, value in enumerate(sample_list): 133 | tasks_list.append(DummyOperator( 134 | task_id='Component'+str(index), 135 | dag=dag)) 136 | if index != 0: 137 | tasks_list[index-1] >> tasks_list[index] 138 | 139 | # database 140 | 141 | airflow initdb # first time only 142 | 143 | airflow upgradedb # apply missing migrations -------------------------------------------------------------------------------- /census_analysis/data/raw/acs_data.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/census_analysis/data/raw/acs_data.csv.gz -------------------------------------------------------------------------------- /census_analysis/data/raw/acs_data.dta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/census_analysis/data/raw/acs_data.dta.gz -------------------------------------------------------------------------------- /census_analysis/src/analysis.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | from datetime import datetime as dt 4 | from pathlib import Path 5 | 6 | import pandas as pd 7 | 8 | # data folder and paths 9 | RAW_DATA_PATH = Path("../data/raw/") 10 | INTERIM_DATA_PATH = Path("../data/interim/") 11 | PROCESSED_DATA_PATH = Path("../data/processed/") 12 | FINAL_DATA_PATH = Path("../data/final/") 13 | 14 | 15 | # analysis methods 16 | 17 | 18 | def load_data(date): 
19 | data = pd.read_stata(INTERIM_DATA_PATH / f"working_data-{date}.dta") 20 | 21 | return data 22 | 23 | 24 | def drop_rows(data): 25 | """Drop observations where pernum does not equal 1 26 | """ 27 | mask_pernum = data["pernum"] == 1 28 | return data[mask_pernum].copy() 29 | 30 | 31 | def define_groups(data): 32 | mask_latino = data["hispan"] != "not hispanic" 33 | mask_white = (data["hispan"] == "not hispanic") & (data["race"] == "white") 34 | mask_black = (data["hispan"] == "not hispanic") & ( 35 | data["race"].str.contains("black") 36 | ) 37 | mask_native = (data["hispan"] == "not hispanic") & ( 38 | data["race"] == "american indian or alaska native" 39 | ) 40 | mask_API = (data["hispan"] == "not hispanic") & ( 41 | (data["race"] >= "chinese") 42 | & (data["race"] <= "other asian or pacific islander") 43 | ) 44 | mask_other = (data["hispan"] == "not hispanic") & ( 45 | data["race"] >= "other race, nec" 46 | ) 47 | 48 | data.loc[mask_latino, "racen"] = "Latino" 49 | data.loc[mask_white, "racen"] = "White" 50 | data.loc[mask_black, "racen"] = "Black/African-American" 51 | data.loc[mask_native, "racen"] = "Am. Indian / Alaska Native" 52 | data.loc[mask_API, "racen"] = "Asian / Pacific Islander" 53 | data.loc[mask_other, "racen"] = "other" 54 | 55 | return data 56 | 57 | 58 | def analyse_data(data): 59 | cihispeed_by_racen = data.groupby(["racen", "cihispeed"])[["hhwt"]].sum() 60 | households_by_racen = data.groupby("racen")[["hhwt"]].sum() 61 | 62 | shares_cihispeed_by_racen = cihispeed_by_racen / households_by_racen 63 | shares_cihispeed_by_racen = shares_cihispeed_by_racen.reset_index() 64 | 65 | mask_yes_cihispeed = ( 66 | shares_cihispeed_by_racen["cihispeed"] 67 | == "yes (cable modem, fiber optic or dsl service)" 68 | ) 69 | 70 | return shares_cihispeed_by_racen[mask_yes_cihispeed] 71 | 72 | 73 | if __name__ == "__main__": 74 | 75 | date = dt.today().strftime("%d-%b-%y") 76 | raw_data = load_data(date) 77 | data = drop_rows(raw_data) 78 | data_groups = define_groups(data) 79 | speed_data = analyse_data(data_groups) 80 | speed_data.to_csv(f"{FINAL_DATA_PATH}/{date}-internet-speed.csv", "r") 81 | -------------------------------------------------------------------------------- /census_analysis/src/clean_data.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | from datetime import datetime as dt 4 | from pathlib import Path 5 | 6 | import pandas as pd 7 | 8 | # we will use today date 9 | today = dt.today().strftime("%d-%b-%y") 10 | 11 | 12 | # data folder and paths 13 | RAW_DATA_PATH = Path("../data/raw/") 14 | INTERIM_DATA_PATH = Path("../data/interim/") 15 | PROCESSED_DATA_PATH = Path("../data/processed/") 16 | FINAL_DATA_PATH = Path("../data/final/") 17 | 18 | 19 | # supporting functions 20 | # ------------------------ 21 | 22 | 23 | def dir_exists(dir_path): 24 | if not os.path.exists(dir_path): 25 | os.makedirs(dir_path) 26 | else: 27 | print(f"{dir_path} found, skipping") 28 | 29 | 30 | def load_data(data_path): 31 | """load data into a pd dataframe 32 | 33 | Args: 34 | data_path (path): path to the gzipped data 35 | """ 36 | with gzip.open(RAW_DATA_PATH / "acs_data.dta.gz") as file: 37 | data = pd.read_stata(file) 38 | return data 39 | 40 | 41 | def state_mask(state, df): 42 | """Used to select only one state 43 | 44 | Args: 45 | state (string): state to be masked 46 | 47 | Returns: 48 | df: subset of the data 49 | """ 50 | mask_state = df["statefip"] == f"{state}" 51 | return df[mask_state].copy() 52 | 53 | 54 | 
def clean_masked(df): 55 | df.drop(columns=["related", "raced", "hispand"], inplace=True) 56 | mask_household = (df["gq"] == "households under 1970 definition") | ( 57 | df["gq"] == "additional households under 1990 definition" 58 | ) 59 | return df[mask_household].copy() 60 | 61 | 62 | def save_df(data_path, df): 63 | df.to_stata(f"{data_path}/state_data-{today}.dta", write_index=False) 64 | 65 | 66 | if __name__ == "__main__": 67 | state = "ohio" 68 | raw_data = load_data(RAW_DATA_PATH) 69 | state_data = state_mask(state, raw_data) 70 | clean_state = clean_masked(state_data) 71 | save_df(INTERIM_DATA_PATH, clean_state) 72 | 73 | print(f"Completed cleaning for {state}") 74 | -------------------------------------------------------------------------------- /census_analysis/src/get_data.py: -------------------------------------------------------------------------------- 1 | import io 2 | from pathlib import Path 3 | from zipfile import ZipFile 4 | 5 | import requests 6 | 7 | RAW_DATA_PATH = Path("data/raw/counties/") 8 | 9 | url = "https://www2.census.gov/geo/tiger/TIGER2018/COUNTY/tl_2018_us_county.zip" 10 | site = requests.get(url) 11 | 12 | z = ZipFile(io.BytesIO(site.content)) 13 | z.extractall(RAW_DATA_PATH) 14 | -------------------------------------------------------------------------------- /dag_example/simple_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | 7 | 8 | def print_hello(): 9 | return "Hello world!" 10 | 11 | 12 | default_args = { 13 | "owner": "airflow", 14 | "depends_on_past": False, 15 | "start_date": datetime(2019, 8, 30), 16 | "email": ["airflow@example.com"], 17 | "email_on_failure": False, 18 | "email_on_retry": False, 19 | "retries": 1, 20 | "retry_delay": timedelta(minutes=2), 21 | } 22 | 23 | dag = DAG( 24 | "hello_world", 25 | description="Simple tutorial DAG", 26 | schedule_interval="0 12 * * *", 27 | default_args=default_args, 28 | catchup=False, 29 | ) 30 | 31 | t1 = DummyOperator(task_id="dummy_task", retries=3, dag=dag) 32 | 33 | t2 = PythonOperator(task_id="hello_task", python_callable=print_hello, dag=dag) 34 | 35 | # sets downstream for t1 36 | t1 >> t2 37 | 38 | # equivalent 39 | # t2.set_upstream(t1) 40 | -------------------------------------------------------------------------------- /deployments/jupyterhub-cluster/config/config.yaml: -------------------------------------------------------------------------------- 1 | singleuser: 2 | defaultUrl: "/lab" 3 | memory: 4 | guarantee: 512M 5 | limit: 1G 6 | image: 7 | name: trallard/jupyter-rserver 8 | tag: 1.4 9 | lifecycleHooks: 10 | postStart: 11 | exec: 12 | command: ["gitpuller", "https://github.com/pyladies-nwuk/Python_meets_R", "master", "reticulate-ws"] 13 | 14 | hub: 15 | extraConfig: 16 | jupyterlab: | 17 | c.Spawner.cmd = ['jupyter-labhub'] 18 | 19 | # prepare added nodes for arriving users 20 | prepuller: 21 | continuous: 22 | enabled: true -------------------------------------------------------------------------------- /deployments/jupyterhub-cluster/image/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jupyter/scipy-notebook 2 | 3 | LABEL maintainer="Tania Allard trallard[at]bitsandchips.me" 4 | 5 | ENV SLUGIFY_USES_TEXT_UNIDECODE yes 6 | ARG AIRFLOW_USER_HOME=/home/jovyan/work 7 | 8 | 9 | COPY 
requirements.txt /tmp/requirements.txt 10 | RUN pip install --no-cache-dir -r /tmp/requirements.txt 11 | 12 | EXPOSE 8080 8888 5555 8793 -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/_build/html/_static/uses.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_build/html/_static/uses.png -------------------------------------------------------------------------------- /docs/source/_build/html/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | About the workshop — EuroScipy tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 |
28 |
29 |
30 | 37 | 38 | 39 |
40 | 41 |
42 |

About the workshop

43 |

We will be taking a look at the basic concepts of data pipelines as well as practical use cases using Python.

44 |
45 |

About you:

46 |
    47 |
  • Some experience using the command line

  • 48 |
  • Intermediate Python knowledge / use

  • 49 |
  • Be able to apply what we learn and adapt it to your own use cases

  • 50 |
  • Interested in data and systems

  • 51 |
  • Aspiring or current data engineers

  • 52 |
  • Some knowledge about systems and databases (enough to be dangerous)

  • 53 |
54 |
55 |
56 |

Our focus for the day

57 |
    58 |
  • A greater understanding of how to build data pipelines using the Python toolset

  • 59 |
  • Focus on concepts

  • 60 |
  • Apply knowledge with each library

  • 61 |
  • Will give you the building blocks

  • 62 |
63 |
64 |
65 |

Keeping on track

66 |

You will find 🚦 markers across the tutorial examples. We will use these to gauge how folks are doing during the workshop (if following along in person). Place your post-it as follows:

68 |

🚦 Purple post-it: all good, the task has been completed

69 |

🚦 Orange post-it: I need extra time or help with the task at hand

70 |
71 |
72 | 73 | 74 |
75 | 82 | 83 |
84 |
85 | 146 |
147 |
148 | 159 | 160 | 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /docs/source/_build/html/airflow-intro.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Airflow basics — EuroScipy tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
30 |
31 |
32 | 47 | 48 | 49 |
50 | 51 |
52 |

Airflow basics

53 |
54 |

What is Airflow?

55 |

airflow logo

56 |

Airflow is a Workflow engine which means:

57 |
    58 |
  • Manages scheduling and running jobs and data pipelines

  • 59 |
  • Ensures jobs are ordered correctly based on dependencies

  • 60 |
  • Manages the allocation of scarce resources

  • 61 |
  • Provides mechanisms for tracking the state of jobs and recovering from failure

  • 62 |
63 |

It is highly versatile and can be used across many domains: _images/uses.png

65 |
66 |
67 |

Basic Airflow concepts

68 |
    69 |
  • Task: a defined unit of work (these are called operators in Airflow)

  • 70 |
  • Task instance: an individual run of a single task. Task instances also have an indicative state, which could be “running”, “success”, “failed”, “skipped”, “up for retry”, etc.

  • 71 |
  • DAG: Directed Acyclic Graph, a set of tasks with explicit execution order, beginning, and end

  • 73 |
  • DAG run: individual execution/run of a DAG

  • 74 |
75 |

Debunking the DAG

76 |

The vertices and edges (the arrows linking the nodes) have an order and a direction associated with them.

77 |

_images/DAG.png

78 |

Each node in a DAG corresponds to a task, which in turn represents some sort of data processing. For example:

79 |

Node A could be the code for pulling data from an API, node B the code for anonymizing that data, node C the code for checking that there are no duplicate records, and so on.

80 |

These ‘pipelines’ are acyclic since they need a point of completion.

81 |

Dependencies

82 |

Each of the vertices has a particular direction that shows the relationship between certain nodes. For example, we can only anonymize data once this has been pulled out from the API.
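As a sketch of how such dependencies look in code (the task names and callables here are made up for illustration and are not part of the tutorial repo):

from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG("dependencies_example", start_date=datetime(2019, 1, 1), schedule_interval=None)

def pull_data():         # node A: pull data from the API (placeholder)
    pass

def anonymize_data():    # node B: anonymize the pulled data (placeholder)
    pass

def check_duplicates():  # node C: check for duplicate records (placeholder)
    pass

pull = PythonOperator(task_id="pull_data", python_callable=pull_data, dag=dag)
anonymize = PythonOperator(task_id="anonymize_data", python_callable=anonymize_data, dag=dag)
check = PythonOperator(task_id="check_duplicates", python_callable=check_duplicates, dag=dag)

# anonymize only runs after pull, and check only after anonymize
pull >> anonymize >> check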

83 |
84 |
85 |

Idempotency

86 |

This is one of the most important characteristics of good ETL architectures.

87 |

When we say that something is idempotent it means it will produce the same result regardless of how many times this is run (i.e. the results are reproducible).

88 |

Reproducibility is particularly important in data-intensive environments as this ensures that the same inputs will always return the same outputs.
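A sketch of what idempotency can look like in practice (the build_counts_for helper and the output path are hypothetical; dag and PythonOperator are assumed from the other snippets): the task keys its output on the execution date, so re-running the same task instance rewrites the same file instead of appending or duplicating data.

def export_daily_counts(ds, **kwargs):
    # 'ds' is the execution date Airflow injects; using it in the output path
    # makes the task idempotent: the same run always produces the same file
    df = build_counts_for(ds)  # hypothetical helper returning a DataFrame
    df.to_csv(f"/data/processed/counts-{ds}.csv", index=False)

export = PythonOperator(
    task_id="export_daily_counts",
    python_callable=export_daily_counts,
    provide_context=True,  # Airflow 1.10: pass the context so 'ds' is available
    dag=dag,
)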

89 |
90 |
91 |

Airflow components

92 |

_images/architecture.png

93 |

There are 4 main components to Apache Airflow:

94 |
95 |

Web server

96 |

The GUI. Under the hood this is a Flask app where you can track the status of your jobs and read logs from a remote file store (e.g. Azure Blob Storage).

97 |
98 |
99 |

Scheduler

100 |

This component is responsible for scheduling jobs. It is a multithreaded Python process that uses the DAG object to decide what tasks need to be run, when, and where.

101 |

The task state is retrieved and updated from the database accordingly. The web server then uses these saved states to display job information.

102 |
103 |
104 |

Executor

105 |

The mechanism that gets the tasks done.

106 |
107 |
108 |

Metadata database

109 |
    110 |
  • Powers how the other components interact

  • 111 |
  • Stores the Airflow states

  • 112 |
  • All processes read and write from here

  • 113 |
114 |
115 |
116 |
117 |

Workflow as code

118 |

One of the main advantages of using a workflow system like Airflow is that everything is code, which makes your workflows maintainable, versionable, testable, and collaborative.
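Because DAGs are plain Python, they can be tested like any other code; a minimal sketch of a DAG integrity check (assuming pytest is installed and your DAG files live in dags/):

from airflow.models import DagBag

def test_dags_import_without_errors():
    # parsing every file in dags/ should not record any import errors
    dag_bag = DagBag(dag_folder="dags/", include_examples=False)
    assert dag_bag.import_errors == {}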

119 |

Thus your workflows become more explicit and maintainable (atomic tasks).

120 |

Not only is your code dynamic but so is your infrastructure.
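Dynamic here means the workflow itself can be generated by code; a minimal sketch mirroring the loop in census_analysis/dags/snippets.py (it assumes a dag object is already defined):

from airflow.operators.dummy_operator import DummyOperator

previous = None
for i in range(5):
    task = DummyOperator(task_id=f"component_{i}", dag=dag)
    if previous is not None:
        previous >> task  # chain each generated task to the previous one
    previous = task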

121 |
122 |

Defining tasks

123 |

Tasks are defined based on the abstraction of Operators (see Airflow docs here) which represent a single idempotent task.

124 |

The best practice is to have atomic operators (i.e. can stand on their own and do not need to share resources among them).

125 |

You can choose among:

126 |
    127 |
  • BashOperator

  • 128 |
  • PythonOperator

  • 129 |
  • EmailOperator

  • 130 |
  • SimpleHttpOperator

  • 131 |
  • MySqlOperator (and other DB)

  • 132 |
133 |

Examples:

134 |
t1 = BashOperator(task_id='print_date',
135 |     bash_command='date',
136 |     dag=dag) 
137 | 
138 |
139 |
def print_context(ds, **kwargs):
140 |     pprint(kwargs)
141 |     print(ds)
142 |     return 'Whatever you return gets printed in the logs'
143 | 
144 | 
145 | run_this = PythonOperator(
146 |     task_id='print_the_context',
147 |     provide_context=True,
148 |     python_callable=print_context,
149 |     dag=dag,
150 | )
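An EmailOperator task follows the same pattern (a sketch along the lines of census_analysis/dags/snippets.py; the address is a placeholder and sending requires an [smtp] section configured in airflow.cfg):

from airflow.operators.email_operator import EmailOperator

email_task = EmailOperator(
    task_id="email_task",
    to="some@email.com",                       # placeholder address
    subject="Run report for {{ ds }}",          # templated with the execution date
    html_content="All tasks finished for {{ ds }}",
    dag=dag,
)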
151 | 
152 |
153 |
154 |
155 |
156 |

Comparing Luigi and Airflow

157 |
158 |

Luigi

159 |
    160 |
  • Created at Spotify (named after the plumber)

  • 161 |
  • Open sourced in late 2012

  • 162 |
  • GNU make for data

  • 163 |
164 |
165 |
166 |

Airflow

167 |
    168 |
  • Airbnb data team

  • 169 |
  • Open-sourced mid-2015

  • 170 |
  • Apache incubator mid-2016

  • 171 |
  • ETL pipelines

  • 172 |
173 |
174 |
175 |

Similarities

176 |
    177 |
  • Python open source projects for data pipelines

  • 178 |
  • Integrate with a number of sources (databases, filesystems)

  • 179 |
  • Tracking failure, retries, success

  • 180 |
  • Ability to identify the dependencies and execution

  • 181 |
182 |
183 |
184 |

Differences

185 |
    186 |
  • Scheduler support: Airflow has a built-in scheduler

  • 187 |
  • Scalability: Airflow has had stability issues in the past

  • 188 |
  • Web interfaces

  • 189 |
190 |

_images/luigi.png

191 |

_images/airflow.png

192 |

| Airflow | Luigi |
| --- | --- |
| Tasks are defined by a dag_id and a user-defined name | Tasks are defined by task name and parameters |
| Task retries based on definitions | Decides whether a task is done via its input/output |
| Task code is sent to the workers | Workers are started by the Python file where the tasks are defined |
| Centralized scheduler (Celery spins up workers) | Centralized scheduler in charge of deduplicating and sending tasks (Tornado based) |

198 |
199 |
200 |
201 | 202 | 203 |
204 | 219 | 220 |
221 |
222 | 319 |
320 |
321 | 332 | 333 | 334 | 335 | 336 | 337 | -------------------------------------------------------------------------------- /docs/source/_build/html/first-airflow.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Airflow 101: working locally and familiarise with the tool — EuroScipy tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 |
28 |
29 |
30 | 37 | 38 | 39 |
40 | 41 |
42 |

Airflow 101: working locally and familiarise with the tool

43 |
44 |

Pre-requisites

45 |

The following prerequisites are needed:

46 |
    47 |
  • Libraries detailed in the Setting up section (either via conda or pipenv)

  • 48 |
  • MySQL installed

  • 49 |
  • text editor

  • 50 |
  • command line

  • 51 |
52 |
53 |
54 |

Getting your environment up and running

55 |

If you followed the instructions you should have Airflow installed as well as the rest of the packages we will be using.

56 |

So let’s get our environment up and running:

57 |

If you are using conda start your environment via:

58 |
$ source activate airflow-env
 59 | 
60 |
61 |

If using pipenv then:

62 |
$ pipenv shell
 63 | 
64 |
65 |

This will start a shell within the virtual environment; to leave it, type exit.

66 |
67 |
68 |

Starting Airflow locally

69 |

Airflow home lives in ~/airflow by default, but you can change the location before installing airflow. You first need to set the AIRFLOW_HOME environment variable and then install airflow. For example, using pip:

70 |
export AIRFLOW_HOME=~/mydir/airflow
 71 | 
 72 | # install from PyPI using pip
 73 | pip install apache-airflow
 74 | 
75 |
76 |

Once you have completed the installation you should see something like this in the airflow directory (wherever it lives for you):

77 |
drwxr-xr-x    - myuser 18 Apr 14:02 .
 78 | .rw-r--r--  26k myuser 18 Apr 14:02 ├── airflow.cfg
 79 | drwxr-xr-x    - myuser 18 Apr 14:02 ├── logs
 80 | drwxr-xr-x    - myuser 18 Apr 14:02 │  └── scheduler
 81 | drwxr-xr-x    - myuser 18 Apr 14:02 │     ├── 2019-04-18
 82 | lrwxr-xr-x   46 myuser 18 Apr 14:02 │     └── latest -> /Users/myuser/airflow/logs/scheduler/2019-04-18
 83 | .rw-r--r-- 2.5k myuser 18 Apr 14:02 └── unittests.cfg
 84 | 
85 |
86 |

We need to create a local dag folder:

87 |
mkdir ~/airflow/dags
 88 | 
89 |
90 |

As your project evolves, your directory will look something like this:

91 |
airflow                  # the root directory.
 92 | ├── dags                 # root folder for all dags. files inside folders are not searched for dags.
 93 | │   ├── my_dag.py, # my dag (definitions of tasks/operators) including precedence.
 94 | │   └── ...
 95 | ├── logs                 # logs for the various tasks that are run
 96 | │   └── my_dag           # DAG specific logs
 97 | │   │   ├── src1_s3      # folder for task-specific logs (log files are created by date of a run)
 98 | │   │   ├── src2_hdfs
 99 | │   │   ├── src3_s3
100 | │   │   └── spark_task_etl
101 | ├── airflow.db           # SQLite database used by Airflow internally to track the status of each DAG.
102 | ├── airflow.cfg          # global configuration for Airflow (this can be overridden by config inside the file.)
103 | └── ...
104 | 
105 |
106 |
107 |
108 |

Prepare your database

109 |

As we mentioned before, Airflow uses a database to keep track of the tasks and their statuses, so it is critical to have one set up.

110 |

To start the default database we can run airflow initdb. This will initialize your database via alembic so that it matches the latest Airflow release.

112 |

The default database used is sqlite which means you cannot parallelize tasks using this database. Since we have MySQL and MySQL client installed we will set them up so that we can use them with airflow.

113 |

🚦Create an airflow database

114 |

From the command line:

115 |
mysql -u root -p
116 | mysql> CREATE DATABASE airflow CHARACTER SET utf8 COLLATE utf8_unicode_ci;
117 | mysql> GRANT ALL PRIVILEGES ON airflow.* To 'airflow'@'localhost';
118 | mysql> FLUSH PRIVILEGES;
119 | 
120 |
121 |

and initialize the database:

122 |
airflow initdb
123 | 
124 |
125 |

Notice that this will fail with the default airflow.cfg

126 |
127 |
128 |

Update your local configuration

129 |

Open your airflow configuration file ~/airflow/airflow.cfg and make the following changes:

130 |
executor = CeleryExecutor
131 | 
132 |
133 |
# http://docs.celeryproject.org/en/latest/userguide/configuration.html#broker-settings
134 | # needs rabbitmq running
135 | broker_url = amqp://guest:guest@127.0.0.1/
136 | 
137 | 
138 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings
139 | result_backend = db+mysql://airflow:airflow@localhost:3306/airflow
140 | 
141 | sql_alchemy_conn = mysql://airflow:python2019@localhost:3306/airflow
142 | 
143 |
144 |

Here we are replacing the default executor (SequentialExecutor) with the CeleryExecutor so that tasks can run in parallel. We also replace the default sqlite database with our newly created airflow database.

146 |

Now we can initialize the database:

147 |
airflow initdb
148 | 
149 |
150 |

Let’s now start the web server locally:

151 |
airflow webserver -p 8080
152 | 
153 |
154 |

We can head over to http://localhost:8080 now and you will see that there are a number of example DAGs already there.

155 |

🚦 Take some time to familiarise yourself with the UI and get your local instance set up

156 |

Now let's have a look at the connections (http://localhost:8080/admin/connection/): go to Admin > Connections. You should see a number of connections available. For this tutorial, we will use some of these connections, including mysql.
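Connections can also be read from task code through hooks, so credentials stay in the metadata database rather than in the DAG file; a minimal sketch (assuming the default mysql_default connection shown in the UI is present):

from airflow.hooks.base_hook import BaseHook

conn = BaseHook.get_connection("mysql_default")
print(conn.host, conn.schema, conn.login)  # the password is available as conn.password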

157 |
163 |

Commands

164 |

Let us go over some of the commands. Back on your command line:

165 |
airflow list_dags
166 | 
167 |
168 |

We can list the tasks within a DAG in a tree view:

169 |
airflow list_tasks tutorial --tree
170 | 
171 |
172 |

We can test the DAGs too, but we will need to set a date parameter so that this executes:

173 |
airflow test tutorial print_date 2019-05-01
174 | 
175 |
176 |

(note that you cannot use a future date or you will get an error)

177 |
airflow test tutorial templated 2019-05-01
178 | 
179 |
180 |

Runs started with the test command are not saved in the database.
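For instance, once the hello_world DAG from the next section exists, a single task can be exercised in isolation (the date is arbitrary, as long as it is not in the future):

airflow test hello_world hello_task 2019-05-01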

181 |

Now let’s start the scheduler:

182 |
airflow scheduler
183 | 
184 |
185 |

Behind the scenes, the scheduler monitors the DAG folder and stays in sync with the DAG objects it contains. The Airflow scheduler is designed to run as a service in an Airflow production environment.

186 |

Now with the scheduler up and running we can trigger a task instance:

187 |
$ airflow run example_bash_operator runme_0 2015-01-01
188 | 
189 |
190 |

This will be stored in the database and you can see the status change straight away.

191 |

What would happen for example if we wanted to run or trigger the tutorial task? 🤔

192 |

Let’s try from the CLI and see what happens.

193 |
airflow trigger_dag tutorial
194 | 
195 |
196 |
197 |
198 |
199 |

Writing your first DAG

200 |

Let's create our first simple DAG. Inside the dags directory (~/airflow/dags) create a simple_dag.py file.

202 |
from datetime import datetime, timedelta
203 | from airflow import DAG
204 | from airflow.operators.dummy_operator import DummyOperator
205 | from airflow.operators.python_operator import PythonOperator
206 | 
207 | 
208 | def print_hello():
209 |     return "Hello world!"
210 | 
211 | 
212 | default_args = {
213 |     "owner": "airflow",
214 |     "depends_on_past": False,
215 |     "start_date": datetime(2019, 4, 30),
216 |     "email": ["airflow@example.com"],
217 |     "email_on_failure": False,
218 |     "email_on_retry": False,
219 |     "retries": 1,
220 |     "retry_delay": timedelta(minutes=2),
221 | }
222 | 
223 | dag = DAG(
224 |     "hello_world",
225 |     description="Simple tutorial DAG",
226 |     schedule_interval="0 12 * * *",
227 |     default_args=default_args,
228 |     catchup=False,
229 | )
230 | 
231 | t1 = DummyOperator(task_id="dummy_task", retries=3, dag=dag)
232 | 
233 | t2 = PythonOperator(task_id="hello_task", python_callable=print_hello, dag=dag)
234 | 
235 | # sets downstream for t1
236 | t1 >> t2
237 | 
238 | # equivalent
239 | # t2.set_upstream(t1)
240 | 
241 |
242 |

If it is properly set up you should be able to see this straight away on your instance.

243 |
244 |

Now let’s create a DAG from the previous ETL pipeline (kind of)

245 |

All hands on - check the solutions
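One possible shape for that DAG, as a sketch rather than the actual solution file (it assumes the logic from census_analysis/src/get_data.py, clean_data.py and analysis.py has been wrapped into the three callables below, and that a dag object is defined as above):

from airflow.operators.python_operator import PythonOperator

# hypothetical wrappers around the census scripts
collect = PythonOperator(task_id="collect_data", python_callable=collect_data, dag=dag)
clean = PythonOperator(task_id="clean_data", python_callable=clean_census_data, dag=dag)
analyse = PythonOperator(task_id="analyse_data", python_callable=analyse_census_data, dag=dag)

collect >> clean >> analyse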

246 |
247 |
248 |
249 | 250 | 251 |
252 | 259 | 260 |
261 |
262 | 332 |
333 |
334 | 345 | 346 | 347 | 348 | 349 | 350 | -------------------------------------------------------------------------------- /docs/source/_build/html/genindex.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Index — EuroScipy tutorial documentation 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 |
29 |
30 |
31 | 38 | 39 | 40 |
41 | 42 | 43 |

Index

44 | 45 |
46 | 47 |
48 | 49 | 50 |
51 | 58 | 59 |
60 |
61 | 120 |
121 |
122 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /docs/source/_build/html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Welcome to the EuroScipy Airflow tutorial — EuroScipy tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 |
29 |
30 |
31 | 42 | 43 | 44 |
45 | 46 |
47 |

Welcome to the EuroScipy Airflow tutorial

48 |

This tutorial was originally developed for EuroScipy 2019.

49 |
50 |
51 |
52 |
53 |
54 |
55 |

About your facilitator

56 |

My name is Tania. I live in Manchester, UK, where I work as a Cloud Advocate for Microsoft.

58 |

Over the years, I have worked as a data engineer, machine learning engineer, and research software engineer. I love data-intensive environments and I am particularly interested in the tools and workflows to deliver robust, reproducible data insights.

62 |

If you have any questions or feedback about this tutorial, please file an issue using the following link: https://github.com/trallard/euroscipy-airflow/issues/new.

64 |

You can also contact me via the following channels:

65 | 70 |
71 |
72 |

Code of Conduct

73 |

All attendees of this workshop are expected to adhere to EuroScipy's Code of Conduct, in brief: Be open, considerate, and respectful.

76 |
77 |
78 |

License

79 |

The content in this workshop is licensed under CC-BY-SA 4.0, which means that you can use, remix, and re-distribute it so long as attribution to the original author (Tania Allard) is maintained.

82 |

The logo used here was designed by Ashley McNamara for the Microsoft Developer Advocates team use.

83 |
84 | 85 | 86 |
87 | 98 | 99 |
100 |
101 | 171 |
172 |
173 | 184 | 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /docs/source/_build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_build/html/objects.inv -------------------------------------------------------------------------------- /docs/source/_build/html/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Search — EuroScipy tutorial documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 |
33 |
34 |
35 | 42 | 43 | 44 |
45 | 46 |

Search

47 |
48 | 49 |

50 | Please activate JavaScript to enable the search 51 | functionality. 52 |

53 |
54 |

55 | From here you can search these documents. Enter your search 56 | words into the box below and click "search". Note that the search 57 | function will automatically search for all of the words. Pages 58 | containing fewer words won't appear in the result list. 59 |

60 |
61 | 62 | 63 | 64 |
65 | 66 |
67 | 68 |
69 | 70 |
71 | 78 | 79 |
80 |
81 | 130 |
131 |
132 | 140 | 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /docs/source/_build/html/searchindex.js: -------------------------------------------------------------------------------- 1 | Search.setIndex({docnames:["about","airflow-intro","first-airflow","index","pipelines","setup"],envversion:{"sphinx.domains.c":1,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":1,"sphinx.domains.javascript":1,"sphinx.domains.math":2,"sphinx.domains.python":1,"sphinx.domains.rst":1,"sphinx.domains.std":1,"sphinx.ext.intersphinx":1,sphinx:56},filenames:["about.md","airflow-intro.md","first-airflow.md","index.rst","pipelines.md","setup.rst"],objects:{},objnames:{},objtypes:{},terms:{"26k":2,"3f57fc15ded7dddddcc4e82fe137b58":5,"abstract":1,"break":4,"case":[0,4],"class":4,"default":[2,5],"export":[2,5],"function":4,"import":[1,2,4],"int":4,"long":[3,4],"new":[3,5],"null":4,"public":4,"return":[1,2,4],"true":[1,4],"try":[2,4,5],"while":5,And:4,For:[1,2,4,5],NOT:4,Not:1,One:[1,4],The:[1,2,3,4,5],Then:5,There:[1,4,5],These:1,USE:4,Will:0,__main__:4,__name__:4,abil:[1,4],abl:[0,2,5],about:[],access:4,access_token:4,access_token_secret:4,accordingli:[1,5],achiev:4,across:[0,1],activ:[2,5],actual:4,acycl:1,add:[4,5],added:5,addit:5,address:5,adher:3,admin:2,adopt:0,advanc:5,advantag:1,advoc:3,affili:4,after:[1,4,5],again:5,airbnb:1,airflow:[4,5],airflow_hom:2,airflowdb:4,alemb:2,alert:4,all:[0,1,2,3,4,5],all_tweet:4,allard:[3,5],alloc:1,allow:[4,5],along:[0,5],alreadi:[2,4,5],also:[1,2,3,4,5],alwai:[1,4],among:1,amqp:2,analyse_twitt:4,analysi:[4,5],analyt:4,ani:[3,4,5],anonym:1,anyon:5,anyth:4,apach:[1,2],api:[1,4,5],app:1,appli:0,applic:[4,5],approach:5,appropri:4,approv:[],apr:2,apt:5,architectur:1,arg:4,around:4,arrow:1,asap:5,ashlei:3,ask:[4,5],aspr:0,associ:1,assum:4,async:4,atom:[1,4,5],attach:5,attempt:4,attende:3,attribut:3,auth:4,authent:4,author:[3,4],auto_incr:4,automat:4,avail:2,awai:2,awar:4,azur:1,back:2,backend:2,bar:4,barchart_lang:4,base:[1,5],bash:5,bash_command:1,bashoper:1,bashrc:5,basic:[0,4,5],batch:4,becom:1,been:[0,1,5],befor:[2,4,5],beforehand:5,begin:[1,4],behind:2,being:4,below:[],best:[1,5],between:[1,4],bin:5,bit:[4,5],bitsandchip:[3,5],blobstorag:1,block:0,bore:4,both:5,brew:5,brief:3,broken:4,broker:2,broker_url:2,build:0,built:1,button:5,call:[1,4],can:[1,2,3,4,5],cannot:2,card:5,care:4,carefulli:5,catchup:2,celeri:1,celeryexecutor:2,celeryproject:2,cento:5,central:1,certain:1,cfg:[2,4],chain:4,chang:[2,4,5],channel:3,charact:[2,4],characterist:1,charg:1,check:[1,2,5],checkout:5,choos:[1,5],clean:4,clean_data:4,clean_df:4,cleanup:4,clear:4,clearli:4,cli:2,click:5,client:2,close:[4,5],cloud:3,code:[4,5],collabor:1,collat:[2,4],collect:5,collet:4,column:4,com:[2,3,4,5],come:5,command:[0,4,5],commit:[4,5],compil:5,complet:[0,1,2,4,5],complex:4,concept:0,conclud:4,conda:[2,5],conduct:5,confer:5,config:[2,4],config_fil:4,configpars:4,configur:5,confirm:5,connect:2,connect_db:4,connector:4,connecttwitt:4,consid:4,consider:3,consist:4,consumer_kei:4,consumer_secret:4,contact:3,contain:[2,4],content:[3,4],control:5,copi:[4,5],copyright:4,corpor:4,correct:[4,5],correctli:1,correspond:[1,4],could:1,count:4,countri:4,cppflag:5,creat:1,create_plot:4,create_t:4,created_at:4,creation:4,credit:5,critic:[2,4],csv:4,current:[0,4],cursor:4,customlisten:4,cwd:4,cycl:4,dag:1,dag_id:1,dagb:1,dai:[4,5],danger:0,dashboard:[],data:[0,1,3,5],databas:[0,5],datafram:4,dataset:4,date:[1,2],datetim:[2,4],dateutil:4,dbconnec
t:4,deactiv:5,deal:4,debian:5,debunk:1,decid:[1,5],dedupl:1,deeper:4,def:[1,2,4],default_arg:2,defin:4,definit:[1,2],delet:4,deliv:3,demand:4,depend:[1,4,5],depends_on_past:2,deploi:5,deposit:4,descript:2,design:[2,3,5],detail:[2,4,5],detect:4,determin:4,dev:5,devel:5,develop:3,dict:4,dictionari:4,differ:4,difficult:[4,5],direct:[1,4],directori:[2,5],disconnect:4,discuss:4,displai:1,distinct:4,distribut:3,doc:[1,2,4,5],docker:[],doe:4,doing:[0,5],dollar:5,domain:1,done:[1,4,5],down:5,download:[4,5],downstream:[2,4],drop:4,drwxr:2,dummy_oper:2,dummy_task:2,dummyoper:2,duplic:1,dure:5,dynam:1,each:[0,1,2,4],earlier:4,easi:4,easier:5,echo:5,edg:1,editor:2,either:[2,5],els:4,email:[2,5],email_on_failur:2,email_on_retri:2,emailoper:1,enabl:5,end:[1,4,5],engin:[0,1,3,4],enough:[0,4],ensur:[1,5],env:[2,5],enviro:3,environ:[1,4],equival:2,error:[2,4,5],especi:5,etc:1,etl:[1,5],eucipi:[],euroscipi:5,euroswcipi:[],event:4,eventu:4,everi:4,evolv:2,exampl:[0,1,2,4,5],example_bash_oper:2,except:4,execut:[1,2,4,5],executor:2,exist:[4,5],exit:[2,5],expect:3,expedit:5,experi:[0,5],explicit:1,extens:5,extra:0,extract:4,extrem:4,facilit:5,fail:[1,2,4],failur:[1,4],fals:[2,4],far:4,fast:4,favourit:5,featur:5,feedback:3,few:4,fig:4,figur:4,file:[1,2,3,4,5],filesystem:1,fill:5,filter:4,find:0,first:5,flask:1,flush:[2,4],focu:4,foe:2,folder:[2,4,5],folk:0,follow:[0,2,3,4,5],forget:4,foundat:4,frame:4,free:5,from:[1,5],further:[4,5],futur:2,get:[1,4,5],gist:5,github:[3,4],give:[0,5],given:4,global:2,gnu:[1,4],going:[4,5],good:[0,1,5],grant:[2,4],graph:1,great:4,greater:0,group:4,guest:2,gui:1,guid:5,had:1,hand:[0,2],handl:4,happen:[2,4],has:[0,1,4,5],hat:5,have:[1,2,3,4,5],head:2,header:5,hello:2,hello_task:2,hello_world:2,help:[0,4,5],here:[1,2,3,4,5],highli:1,home:2,home_timelin:4,homebrew:4,hood:1,host:4,how:[0,1,4],html:[2,4],http:[2,3,4,5],human:4,id_str:4,identifi:[0,1,4,5],imag:5,includ:[2,5],incognito:5,increas:4,incub:1,index:4,index_col:4,indic:1,individu:[1,5],inform:1,infoschema:4,infrastructur:1,initdb:2,initi:2,input:[1,4],insert:4,insid:2,insight:3,instal:[2,5],instanc:[1,2,4,5],instruct:[2,5],instructor:5,integr:[1,5],intens:[1,3,4],interact:1,interest:[0,3],interfac:1,intermedi:0,intern:2,invalid:5,invest:4,investig:4,involv:4,issu:[1,3,4,5],its:4,ixek:3,job:[1,4],json:4,jupyt:4,just:4,keep:[2,4],kei:4,kept:4,kind:4,know:[4,5],knowledg:0,known:5,kwarg:1,languag:4,larg:4,late:1,later:[4,5],latest:[2,4],launch:5,ldflag:5,lead:5,learn:[0,3],leav:5,len:4,let:[4,5],lib:5,libmysqlcli:5,librari:[0,2,4,5],life:5,like:[1,2,4,5],limit:4,line:[0,2,4,5],link:[1,3,5],list:2,list_dag:2,list_task:2,listen:4,live:[2,3,5],load:4,local:5,localhost:[2,4],locat:2,log:[1,2,4],login:[],logo:3,look:[0,2,4],loos:4,lot:[4,5],love:3,lower:4,lowercas:4,lrwxr:2,machin:[3,4,5],made:4,mai:4,mail:3,main:[1,4],mainli:4,maintain:[1,3,4],make:[1,2,4,5],manag:[1,4],manchest:3,mani:[1,4,5],manual:4,match:[2,4],matplotlib:4,mcnamara:3,mean:[1,2,3,4],meantim:5,measur:4,mechan:[1,4],mention:2,messag:4,method:4,microsoft:3,mid:1,might:5,minim:4,minut:2,miss:4,mission:4,mkdir:[2,4,5],modif:4,modifi:4,modul:[],monitor:[2,4],more:[1,4],most:[1,4,5],much:5,mud:1,multipl:2,multithread:1,my_dag:2,my_databas:4,my_tabl:4,mydir:2,mysql:[2,4],mysqlclient:[4,5],mysqloper:1,mystream:4,mystreamlisten:4,myuser:2,name:[1,3,4],need:[0,1,2,4,5],new_tabl:4,newli:[2,5],next:[4,5],nifti:5,node:1,non:4,none:4,note:[2,4,5],notic:2,notifi:[],now:[4,5],nrollr:5,number:[1,2,4],oauthhandl:4,object:[1,2,4],off:4,offici:5,often:4,on_data:4,on_error:4,on_statu:4,o
nc:[1,2,5],one:[1,2,4,5],ones:[],onli:[1,4],open:[1,2,3],opendata:[],openssl:5,oper:[1,2,5],opt:5,optim:5,option:5,oracl:4,orang:0,orchestr:4,order:1,org:[2,4],origin:3,other:[1,4],our:[2,4],out:[1,4],outcom:4,output:[1,4],over:[0,2,3],overridden:2,own:1,owner:[2,4],packag:[2,5],page:[],pair:4,panda:4,parallel:2,paramet:[1,2,4],pars:4,parser:4,part:[4,5],particular:1,particularli:[1,3],password:[4,5],past:1,path:[4,5],pathlib:4,peopl:5,per:5,perform:4,perhap:4,perman:5,permiss:[],person:[0,5],pip:[2,5],pipe:4,pipelin:[0,1,5],pipenv:2,pipfil:5,place:0,plan:5,pleas:[3,5],plot:4,plt:4,plumber:1,png:4,point:[1,5],popul:4,popular:5,populate_t:4,portal:5,possibl:4,post:0,postit:0,power:[1,4,5],pprint:1,practic:[0,1],pre:5,preced:2,prefer:5,prepar:4,prerequisit:2,prevent:4,previous:4,primari:4,print:[1,4],print_context:1,print_dat:[1,2],print_hello:2,print_the_context:1,privat:5,privileg:[2,4],proce:5,process:[1,4,5],produc:[1,4],product:[2,4],productis:4,program:4,prohibit:4,project:[1,2,4,5],promo:5,prompt:[4,5],properli:2,provid:[1,4,5],provide_context:1,public_tweet:4,pull:1,purpl:0,pycharm:5,pycon2019:4,pycon:4,pypi:2,pyplot:4,python2019:[2,4],python3:5,python:[0,1],python_cal:[1,2],python_oper:2,pythonoper:[1,2],qualiti:4,queri:4,question:[3,4],queu:4,queue:4,quickli:4,quit:5,rabbitmq:2,rang:4,rate:4,raw_data:4,raw_tweet:4,read:[1,4,5],read_sql_queri:4,readi:5,readthedoc:4,reboot:5,receiv:5,recommend:5,record:[1,5],recov:1,red:5,redeem:5,refresh:5,regardless:1,regist:[4,5],regularli:4,relat:4,relationship:1,releas:2,relev:4,reload:5,rememb:4,remix:3,remot:1,remov:4,replac:2,report:4,repositori:5,repres:1,reproduc:[1,3,4],requir:[4,5],requisit:5,research:3,reserv:4,resourc:1,respect:[3,4],respons:[1,4],rest:2,restrict:4,result:[1,2,4],result_backend:2,retri:[1,2],retriev:1,retry_delai:2,retweet:4,retweet_count:4,right:[4,5],robust:3,rollback:4,root:[2,4,5],roughli:4,row:4,run:[1,4,5],run_thi:1,runme_0:2,safe:5,sai:1,same:[1,4,5],save:[1,2,4],save_df:4,savefig:4,scalabl:1,scarc:1,scene:2,schedul:[2,4],schedule_interv:2,scm:5,screen_nam:4,script:[4,5],search:2,sec:4,secret:[],section:[2,4,5],see:[1,2,4,5],select:[4,5],self:4,send:[1,4,5],sent:[4,5],sequentialexecutor:2,server:[2,4],servic:[2,5],session:[4,5],set:[1,2,5],set_access_token:4,set_upstream:2,set_xticklabel:4,settl:5,setup:[2,4],sever:5,share:[1,4,5],shell:[2,5],should:[2,4,5],show:[1,4],simpl:2,simple_dag:2,simplehttpoper:1,sinc:[1,2],singl:[1,5],site:5,skip:1,slow:5,smaller:4,snippet:4,softwar:[3,5],solut:[2,4],some:[0,1,2,4,5],someth:[1,2],sort:[1,5],sourc:[1,2,4,5],spark_task_etl:2,specif:[2,4,5],specifi:[],speed:4,spend:4,spin:1,spotifi:1,sql:4,sql_alchemy_conn:2,sql_to_csv:4,sql_to_df:4,sqlite:2,src1_s3:2,src2_hdf:2,src3_s3:2,stabil:1,stai:2,stand:1,start:[1,4,5],start_dat:2,start_stream:4,state:1,statement:4,statu:[1,2,4],status:2,status_cod:4,step:5,steroid:4,stop:4,store:[1,2,4],str:4,straight:2,stream_twitt:4,streaming_how_to:4,streamlisten:4,strftime:4,style:5,subject:5,subplot:4,subscript:5,subsequ:4,substitut:4,subtask:4,success:[1,4,5],sudo:5,suggest:5,suit:5,support:1,suptitl:4,sure:[4,5],surveil:5,sync:2,sys:4,system:[0,1,5],systemctl:5,take:[0,2,4,5],tania:[3,5],task:[0,2,4],task_id:[1,2],team:[1,3,5],templat:2,test:2,testabl:1,text:[2,4],than:4,thankfulli:4,thei:[1,4,5],them:[1,2],thi:[0,1,2,3,4,5],thing:[4,5],think:4,those:4,thought:4,three:4,through:[4,5],thu:1,time:[0,1,2,4,5],timedelta:2,timelin:4,timeout:4,timestamp:4,tip:4,to_csv:4,todai:4,togeth:4,token:4,too:[2,4,5],tool:[3,5],toolset:0,top:4,tornado:1,track
:[1,2,4,5],trademark:4,trallard:[3,4,5],transact:4,transform:4,tree:2,trigger:2,trigger_dag:2,troubleshoot:4,turn:1,tutori:[0,2,4,5],tweepi:4,tweet:5,tweets_long:4,twitter:3,two:4,txt:5,type:[2,4],ubuntu:5,unclear:4,under:[1,3],understand:0,uniqu:[4,5],unit:1,unittest:2,unix:[4,5],until:5,updat:1,upstream:4,use:[0,2,3,4,5],used:[1,2,3,4,5],useful:5,user:[1,2,4],userguid:2,usernam:4,uses:[1,2],using:[0,1,2,3,4,5],usr:5,usual:5,utf8:[2,4],utf8_unicode_ci:[2,4],v17:5,valid:5,value_count:4,varchar:4,variabl:[2,5],varieti:5,variou:2,venv:5,versatil:1,version:[1,4,5],vertic:1,via:[1,2,3,4,5],view:2,virtual:2,visit:[4,5],vscode:5,wai:4,wait_on_rate_limit:4,wait_on_rate_limit_notifi:4,want:[2,4,5],web:[2,5],webserv:2,websit:5,welcom:4,well:[0,2,4],were:5,what:[0,2,5],whatev:1,when:[1,4,5],whenev:4,where:[1,3,4],wherev:2,which:[1,2,3,4,5],who:5,whoever:4,whole:4,whom:4,wifi:5,within:[2,4],witht:4,work:[1,3,4,5],worker:1,workflow:[3,4],workshop:[3,5],world:2,worri:5,worth:5,would:[2,4],wrap:4,write:[1,4,5],written:5,www:5,xlabel:4,xxxxxxxxxxxxxxxxxx:4,yaml:5,year:3,yet:5,ylabel:4,yml:5,you:[1,2,3,4,5],your:[0,1,5],yourself:5,yum:5,zsh:5,zshrc:5},titles:["About the workshop","Airflow basics","Airflow 101: working locally and familiarise with the tool","Welcome to the EuroScipy Airflow tutorial","Pipelines","Setup"],titleterms:{"new":4,IDEs:5,about:[0,3],account:5,airflow:[1,2,3],anaconda:5,app:5,attende:5,autom:4,azur:5,basic:1,check:4,code:[1,3],collect:4,command:2,compar:1,compon:1,concept:1,conduct:3,configur:2,connect:4,creat:[2,4,5],dag:2,dai:0,data:4,databas:[1,2,4],defin:1,develop:5,differ:1,docker:5,document:[],editor:5,environ:[2,5],etl:[2,4],euroscipi:3,executor:1,extend:4,facilit:3,familiaris:2,first:[2,4],focu:0,from:[2,4],get:2,git:5,github:5,good:4,idempot:1,indic:[],keep:0,kind:2,let:2,licens:3,linux:5,local:[2,4],luigi:1,mac:5,matter:4,metadata:1,microsoft:5,mysql:5,now:2,our:0,pass:5,pipelin:[2,4],pipenv:5,pre:2,prepar:2,previou:2,pycon:5,python:[4,5],requisit:2,run:2,schedul:1,server:1,set:4,setup:5,similar:1,start:2,step:4,stream:4,tabl:4,task:1,text:5,tool:2,track:0,troubleshoot:5,tutori:3,tweet:4,twitter:[4,5],updat:2,user:5,virtual:5,virtualenv:5,web:1,welcom:3,what:[1,4],why:4,window:5,work:2,workflow:1,workshop:0,write:2,you:0,your:[2,3,4]}}) -------------------------------------------------------------------------------- /docs/source/_static/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/12.png -------------------------------------------------------------------------------- /docs/source/_static/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/4.jpg -------------------------------------------------------------------------------- /docs/source/_static/DAG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/DAG.png -------------------------------------------------------------------------------- /docs/source/_static/GUI.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/GUI.png -------------------------------------------------------------------------------- /docs/source/_static/airflow-logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/airflow-logo.jpeg -------------------------------------------------------------------------------- /docs/source/_static/airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/airflow.png -------------------------------------------------------------------------------- /docs/source/_static/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/architecture.png -------------------------------------------------------------------------------- /docs/source/_static/automate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/automate.png -------------------------------------------------------------------------------- /docs/source/_static/automation1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/automation1.jpg -------------------------------------------------------------------------------- /docs/source/_static/azure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/azure.png -------------------------------------------------------------------------------- /docs/source/_static/connection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/connection.png -------------------------------------------------------------------------------- /docs/source/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* */ 2 | @import url('https://fonts.googleapis.com/css?family=Itim|Nunito|Source+Code+Pro'); 3 | 4 | a { 5 | color: rgb(96, 138, 197); 6 | } 7 | 8 | a:hover { 9 | color: rgb(65, 129, 218); 10 | } 11 | 12 | div.body h1 { 13 | color: #5F6366; 14 | font-family: 'Itim', cursive; 15 | font-weight: bold; 16 | font-size: 300%; 17 | } 18 | 19 | div.body h2 { 20 | color: #5F6366; 21 | font-family: 'Itim', cursive; 22 | font-weight: bold; 23 | } 24 | div.body h3 { 25 | color: #5F6366; 26 | font-family: 'Itim', cursive; 27 | font-weight: bold; 28 | } 29 | 30 | div.sphinxsidebarwrapper h1.logo { 31 | text-align: center; 32 | margin: 0 0 -20px 0; 33 | } 34 | 35 | div.sphinxsidebar p.blurb { 36 | font-size: 130%; 37 | text-align: center; 38 | font-family: 'Itim', cursive; 39 | color: rgb(151, 139, 196); 40 | } 41 | 42 
| div.sphinxsidebar h1{ 43 | font-size: 160%; 44 | color: #5F6366; 45 | font-family: 'Itim', cursive; 46 | } 47 | 48 | div.sphinxsidebar h1 a { 49 | font-size: 160%; 50 | color: #5F6366; 51 | text-decoration: none; 52 | border: none; 53 | font-family: 'Itim', cursive; 54 | } 55 | 56 | div.sphinxsidebar h1 a:hover { 57 | border: none; 58 | } 59 | 60 | div.sphinxsidebar h3 { 61 | display: none; 62 | } 63 | 64 | div.sphinxsidebar a { 65 | color: #5F6366; 66 | } 67 | 68 | code.descname { 69 | color: rgb(151, 139, 196); 70 | } 71 | 72 | th.field-name { 73 | min-width: 100px; 74 | color: rgb(151, 139, 196); 75 | } 76 | 77 | tt, code { 78 | color: #F8F8F2; 79 | background: #1d1941; 80 | border-radius: 0.3em; 81 | padding: 0.0em 0.3em; 82 | } 83 | 84 | a.reference.internal code.xref span.pre { 85 | color: #F8F8F2; 86 | background: #1d1941; 87 | border-bottom: none; 88 | border-radius: 0; 89 | padding: 0; 90 | } 91 | 92 | a.reference.internal, a.reference.internal:hover { 93 | border-bottom: none; 94 | } 95 | 96 | a.reference.internal:hover code { 97 | background: #027bab 98 | } 99 | 100 | a.reference.internal:hover code.xref span.pre { 101 | color: #F8F8F2; 102 | background: #027bab; 103 | border-bottom: none; 104 | } 105 | 106 | tt.xref, code.xref, a tt { 107 | background: none; 108 | border-bottom: none; 109 | } 110 | 111 | code.literal { 112 | color: #F8F8F2; 113 | background:#1d1941; 114 | } 115 | 116 | pre { 117 | padding: 20px 30px; 118 | background: #1d1941; 119 | } 120 | 121 | div > dl { 122 | border-left: 2px solid #00384021; 123 | padding-left: 5px; 124 | } 125 | 126 | dt { 127 | color: rgb(96, 138, 197); 128 | } 129 | 130 | 131 | div.footer::before { 132 | display: block; 133 | content: ''; 134 | border-top: 2px solid #EDB5BF; 135 | width: 50%; 136 | margin: 2em auto 2em auto; 137 | } 138 | 139 | div.footer { 140 | text-align: center; 141 | /* color: #029be2; */ 142 | } 143 | 144 | div.footer a { 145 | color: #027bab; 146 | text-decoration: none; 147 | } 148 | 149 | p.caption { 150 | font-family: 'Itim', cursive; 151 | font-size: inherit; 152 | font-size: 150%; 153 | } 154 | 155 | @media screen and (max-width: 875px) { 156 | div.sphinxsidebar { 157 | background: #4D6D9A; 158 | } 159 | div.sphinxsidebar h1.logo, div.sphinxsidebar p.blurb{ 160 | text-align: left; 161 | } 162 | div.sphinxsidebar h1 a { 163 | color: #1bc5e0; 164 | } 165 | div.sphinxsidebar a { 166 | /* color: rgb(151, 139, 196); */ 167 | color: white; 168 | } 169 | div.sphinxsidebar ul { 170 | /* color: rgb(151, 139, 196); */ 171 | color: white; 172 | } 173 | } 174 | 175 | 176 | /* other */ 177 | 178 | .alert { 179 | position: relative; 180 | padding: 10px; 181 | margin-bottom: 5px; 182 | border: 2px solid transparent; 183 | border-radius: 2px; 184 | } 185 | 186 | .alert-primary { 187 | color: #004085; 188 | background-color: #cce5ff; 189 | border-color: #b8daff; 190 | } 191 | .alert-custom { 192 | background-color: rgb(229, 224, 247); 193 | border-color:rgb(229, 224, 247); 194 | color: rgb(128, 117, 165); 195 | } -------------------------------------------------------------------------------- /docs/source/_static/dag-time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/dag-time.png -------------------------------------------------------------------------------- /docs/source/_static/datapyramid.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/datapyramid.png -------------------------------------------------------------------------------- /docs/source/_static/gooddata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/gooddata.png -------------------------------------------------------------------------------- /docs/source/_static/gooddata1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/gooddata1.png -------------------------------------------------------------------------------- /docs/source/_static/luigi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/luigi.png -------------------------------------------------------------------------------- /docs/source/_static/mssignin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/mssignin.png -------------------------------------------------------------------------------- /docs/source/_static/pipeline1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/pipeline1.png -------------------------------------------------------------------------------- /docs/source/_static/python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/python.png -------------------------------------------------------------------------------- /docs/source/_static/twitter1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/twitter1.png -------------------------------------------------------------------------------- /docs/source/_static/twitter2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/twitter2.png -------------------------------------------------------------------------------- /docs/source/_static/twitter3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/twitter3.png -------------------------------------------------------------------------------- /docs/source/_static/uses.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trallard/opendata-airflow-tutorial/942109ec797b3dc579296465b0d27f93d3b53422/docs/source/_static/uses.png -------------------------------------------------------------------------------- /docs/source/_templates/sidebarlogo.html: -------------------------------------------------------------------------------- 1 |

3 | 4 |

5 | 6 |

7 | -------------------------------------------------------------------------------- /docs/source/about.md: -------------------------------------------------------------------------------- 1 | # About the workshop 2 | 3 | We will be taking a look at the basic concepts of data pipelines as well as practical use cases using Python and libraries like pandas, matplotlib, and tensorflow. 4 | 5 | ## About you: 6 | - Some experience using the command line 7 | - Intermediate Python knowledge / use 8 | - Be able to apply what we learn and adapt it to your use cases 9 | - Interested in data and systems 10 | - Aspiring or current data engineer 11 | - Some knowledge about systems and databases (enough to be dangerous) 12 | 13 | ## Our focus for the day 14 | - Greater understanding of how to build data pipelines using Python and libraries in the Python scientific ecosystem 15 | - Focus on concepts (rather than complex implementations) 16 | - Practical knowledge application 17 | - Create the building blocks needed for your day-to-day work 18 | 19 | ## Keeping on track 20 | 21 | You will find 🚦 across the tutorial examples. We will use this to identify how folks are doing throughout the workshop (if following along in person). 22 | These will indicate practical or hands-on portions of the tutorial. 23 | 24 | ## Additional tutorial (PyCon US) 25 | 26 | For another (much longer) tutorial integrating MySQL and Twitter stream data check out 27 | 28 | Also, in the upcoming months I have planned: 29 | - Deploying Airflow in Kubernetes (AKS) 30 | - In-depth programmatic report generation with Airflow and papermill 31 | - Airflow + dagster 32 | - Airflow + R? -------------------------------------------------------------------------------- /docs/source/airflow-intro.md: -------------------------------------------------------------------------------- 1 | # Airflow basics 2 | 3 | ## What is Airflow? 4 | 5 | ![airflow logo](_static/airflow-logo.jpeg) 6 | 7 | Airflow is a workflow engine, which means it: 8 | 9 | - Manages scheduling and running jobs and data pipelines 10 | - Ensures jobs are ordered correctly based on dependencies 11 | - Manages the allocation of scarce resources 12 | - Provides mechanisms for tracking the state of jobs and recovering from failure 13 | 14 | It is highly versatile and can be used across many domains: 15 | ![](_static/uses.png) 16 | 17 | ## Basic Airflow concepts 18 | 19 | - **Task**: a defined unit of work (these are called operators in Airflow) 20 | - **Task instance**: an individual run of a single task. Task instances also have an indicative state, which could be “running”, “success”, “failed”, “skipped”, “up for retry”, etc. 21 | - **DAG**: Directed acyclic graph, 22 | a set of tasks with explicit execution order, beginning, and end 23 | - **DAG run**: individual execution/run of a DAG 24 | 25 | **Debunking the DAG** 26 | 27 | The vertices and edges (the arrows linking the nodes) have an order and direction associated with them. 28 | 29 | ![](_static/DAG.png) 30 | 31 | Each node in a DAG corresponds to a task, which in turn represents some sort of data processing. For example: 32 | 33 | Node A could be the code for pulling data from an API, node B could be the code for anonymizing the data, node C could be the code for checking that there are no duplicate records, and so on. 34 | 35 | These 'pipelines' are acyclic since they need a point of completion. 36 | 37 | The DAG does not care about what is in its tasks - since it does not do the processing itself.
But it ensures that things happen in the right order. 38 | 39 | ![](https://www.polidea.com/static/bce5fcc8a3c0ead34ab459d243a26349/beee6/image2.png) 40 | 41 | **Dependencies** 42 | 43 | Each of the vertices has a particular direction that shows the relationship between certain nodes. For example, we can only anonymize data once this has been pulled out from the API. 44 | 45 | ## Operators 46 | While DAGs describe how to run a workflow, Airflow operators determine what actually gets done. There are several types of operators: 47 | 48 | - action operators, which perform a single operation and return (e.g. `BashOperator`), 49 | - sensors, which pause the execution (or execute) until a certain criterion is met (e.g. `sql_sensor`) 50 | - transfer operators, which connect two services and enable sending data between them (e.g. `GoogleCloudStorageToS3Operator`). 51 | 52 | 53 | ## Idempotency 54 | 55 | This is one of the most important characteristics of good ETL architectures. 56 | 57 | When we say that something is idempotent, it means it will produce the same result regardless of how many times it is run (i.e. the results are reproducible). 58 | 59 | Reproducibility is particularly important in data-intensive environments as this ensures that the same inputs will always return the same outputs. 60 | 61 | ## Airflow components 62 | 63 | ![](_static/architecture.png) 64 | 65 | There are 4 main components to Apache Airflow: 66 | 67 | ### Web server 68 | 69 | The GUI. Under the hood this is a Flask app where you can track the status of your jobs and read logs from a remote file store (e.g. [Azure Blobstorage](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-blobs-overview/?wt.mc_id=PyCon-github-taallard)). 70 | 71 | ### Scheduler 72 | 73 | This component is responsible for scheduling jobs. It is a multithreaded Python process that uses the DAG object to decide what tasks need to be run, when and where. 74 | 75 | The task state is retrieved and updated from the database accordingly. The web server then uses these saved states to display job information. 76 | 77 | ### Executor 78 | 79 | The mechanism that gets the tasks done. 80 | 81 | ### Metadata database 82 | 83 | - Powers how the other components interact 84 | - Stores the Airflow states 85 | - All processes read and write from here 86 | 87 | ## Workflow as code 88 | One of the main advantages of using a workflow system like Airflow is that everything is code, which makes your workflows maintainable, versionable, testable, and collaborative. 89 | 90 | Thus your workflows become more explicit and maintainable (atomic tasks). 91 | 92 | Not only is your code dynamic, but so is your infrastructure. 93 | 94 | ### Defining tasks 95 | 96 | Tasks are defined based on the abstraction of `Operators` (see Airflow docs [here](https://airflow.apache.org/concepts.html#operators)) which represent a single **idempotent task**. 97 | 98 | The best practice is to have atomic operators (i.e. they can stand on their own and do not need to share resources among them).
99 | 100 | You can choose among: 101 | - `BashOperator` 102 | - `PythonOperator` 103 | - `EmailOperator` 104 | - `SimpleHttpOperator` 105 | - `MySqlOperator` (and other DB operators) 106 | 107 | Examples: 108 | 109 | If you have a DAG like this: 110 | 111 | ![](https://miro.medium.com/max/2120/1*Oqvm3jsGqfHDWoGOd3iB1A.png) 112 | 113 | Your DAG will be formed by the following operators: 114 | 115 | ``` 116 | source = DummyOperator(task_id='source', dag=dag) 117 | a_task = DummyOperator(task_id='a', dag=dag) 118 | b_task = DummyOperator(task_id='b', dag=dag) 119 | 120 | source >> a_task >> b_task 121 | 122 | ``` 123 | 124 | ```python 125 | t1 = BashOperator(task_id='print_date', 126 | bash_command='date', 127 | dag=dag) 128 | ``` 129 | 130 | ```python 131 | def print_context(ds, **kwargs): 132 | pprint(kwargs) 133 | print(ds) 134 | return 'Whatever you return gets printed in the logs' 135 | 136 | 137 | run_this = PythonOperator( 138 | task_id='print_the_context', 139 | provide_context=True, 140 | python_callable=print_context, 141 | dag=dag, 142 | ) 143 | ``` 144 | 145 | ## Comparing Luigi and Airflow 146 | 147 | ### Luigi 148 | 149 | - Created at Spotify (named after the plumber) 150 | - Open-sourced in late 2012 151 | - GNU make for data 152 | 153 | ### Airflow 154 | - Airbnb data team 155 | - Open-sourced mid-2015 156 | - Apache incubator mid-2016 157 | - ETL pipelines 158 | 159 | ### Similarities 160 | - Python open source projects for data pipelines 161 | - Integrate with a number of sources (databases, filesystems) 162 | - Tracking failure, retries, success 163 | - Ability to identify the dependencies and execution 164 | 165 | ### Differences 166 | - Scheduler support: Airflow has built-in scheduling support 167 | - Scalability: Airflow has had stability issues in the past 168 | - Web interfaces 169 | 170 | ![](_static/luigi.png) 171 | 172 | 173 | ![](_static/airflow.png) 174 | 175 | 176 | | Airflow | Luigi | 177 | | ------------------------------------------------ | ------------------------------------------------------------------------------ | 178 | | Tasks are identified by a user-defined `dag_id` | Tasks are identified by task name and parameters | 179 | | Task retries based on definitions | Decides whether a task is done via its input/output | 180 | | Task code is sent to the workers | Workers are started by the Python file where the tasks are defined | 181 | | Centralized scheduler (Celery spins up workers) | Centralized scheduler in charge of deduplicating and sending tasks (Tornado based) | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here.
14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "EuroScipy tutorial" 23 | copyright = "2019, Tania Allard" 24 | author = "Tania Allard" 25 | 26 | # The short X.Y version 27 | version = "" 28 | # The full version, including alpha/beta/rc tags 29 | release = "" 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | "sphinx.ext.doctest", 43 | "sphinx.ext.intersphinx", 44 | "sphinx.ext.mathjax", 45 | "sphinx.ext.githubpages", 46 | "recommonmark", 47 | ] 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ["_templates"] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | # 55 | source_suffix = [".rst", ".md"] 56 | 57 | # The master toctree document. 58 | master_doc = "index" 59 | 60 | # The language for content autogenerated by Sphinx. Refer to documentation 61 | # for a list of supported languages. 62 | # 63 | # This is also used if you do content translation via gettext catalogs. 64 | # Usually you set "language" from the command line for these cases. 65 | language = None 66 | 67 | # List of patterns, relative to source directory, that match files and 68 | # directories to ignore when looking for source files. 69 | # This pattern also affects html_static_path and html_extra_path. 70 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 71 | 72 | # The name of the Pygments (syntax highlighting) style to use. 73 | pygments_style = "monokai" 74 | 75 | 76 | # -- Options for HTML output ------------------------------------------------- 77 | 78 | # The theme to use for HTML and HTML Help pages. See the documentation for 79 | # a list of builtin themes. 80 | # 81 | html_theme = "alabaster" 82 | 83 | # Theme options are theme-specific and customize the look and feel of a theme 84 | # further. For a list of options available for each theme, see the 85 | # documentation. 86 | # 87 | html_theme_options = { 88 | "github_banner": False, 89 | "github_button": True, 90 | "github_user": "trallard", 91 | "github_repo": "opendata-airflow-tutorial", 92 | "github_type": "star", 93 | "font_family": "Nunito, Georgia, sans", 94 | "head_font_family": "Nunito, Georgia, serif", 95 | "code_font_family": "'Source Code Pro', 'Consolas', monospace", 96 | "description": "a.k.a an introduction to all things DAGS and pipelines joy", 97 | "show_relbars": True, 98 | "logo": "python.png", 99 | } 100 | 101 | # Add any paths that contain custom static files (such as style sheets) here, 102 | # relative to this directory. They are copied after the builtin static files, 103 | # so a file named "default.css" will overwrite the builtin "default.css". 104 | html_static_path = ["_static"] 105 | 106 | # Custom sidebar templates, must be a dictionary that maps document names 107 | # to template names. 108 | # 109 | # The default sidebars (for documents that don't match any pattern) are 110 | # defined by theme itself. 
Builtin themes are using these templates by 111 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 112 | # 'searchbox.html']``. 113 | # 114 | # Custom sidebar templates, maps document names to template names. 115 | html_sidebars = { 116 | "**": [ 117 | "about.html", 118 | "localtoc.html", 119 | "searchbox.html", 120 | "navigation.html", 121 | "relations.html", 122 | "sidebarlogo.html", 123 | ] 124 | } 125 | 126 | # -- Options for HTMLHelp output --------------------------------------------- 127 | 128 | # Output file base name for HTML help builder. 129 | htmlhelp_basename = "EuroScipytutorialdoc" 130 | 131 | 132 | # -- Options for LaTeX output ------------------------------------------------ 133 | 134 | latex_elements = { 135 | # The paper size ('letterpaper' or 'a4paper'). 136 | # 137 | # 'papersize': 'letterpaper', 138 | # The font size ('10pt', '11pt' or '12pt'). 139 | # 140 | # 'pointsize': '10pt', 141 | # Additional stuff for the LaTeX preamble. 142 | # 143 | # 'preamble': '', 144 | # Latex figure (float) alignment 145 | # 146 | # 'figure_align': 'htbp', 147 | } 148 | 149 | # Grouping the document tree into LaTeX files. List of tuples 150 | # (source start file, target name, title, 151 | # author, documentclass [howto, manual, or own class]). 152 | latex_documents = [ 153 | ( 154 | master_doc, 155 | "EuroScipytutorial.tex", 156 | "EuroScipy tutorial Documentation", 157 | "Tania Allard", 158 | "manual", 159 | ) 160 | ] 161 | 162 | 163 | # -- Options for manual page output ------------------------------------------ 164 | 165 | # One entry per manual page. List of tuples 166 | # (source start file, name, description, authors, manual section). 167 | man_pages = [ 168 | (master_doc, "euroscipytutorial", "EuroScipy tutorial Documentation", [author], 1) 169 | ] 170 | 171 | 172 | # -- Options for Texinfo output ---------------------------------------------- 173 | 174 | # Grouping the document tree into Texinfo files. List of tuples 175 | # (source start file, target name, title, author, 176 | # dir menu entry, description, category) 177 | texinfo_documents = [ 178 | ( 179 | master_doc, 180 | "EuroScipytutorial", 181 | "EuroScipy tutorial Documentation", 182 | author, 183 | "EuroScipytutorial", 184 | "One line description of project.", 185 | "Miscellaneous", 186 | ) 187 | ] 188 | 189 | 190 | # -- Options for Epub output ------------------------------------------------- 191 | 192 | # Bibliographic Dublin Core info. 193 | epub_title = project 194 | 195 | # The unique identifier of the text. This can be a ISBN number 196 | # or the project homepage. 197 | # 198 | # epub_identifier = '' 199 | 200 | # A unique identification for the text. 201 | # 202 | # epub_uid = '' 203 | 204 | # A list of files that should not be packed into the epub file. 205 | epub_exclude_files = ["search.html"] 206 | 207 | 208 | # -- Extension configuration ------------------------------------------------- 209 | 210 | # -- Options for intersphinx extension --------------------------------------- 211 | 212 | # Example configuration for intersphinx: refer to the Python standard library. 
213 | intersphinx_mapping = {"https://docs.python.org/": None} 214 | -------------------------------------------------------------------------------- /docs/source/first-airflow.md: -------------------------------------------------------------------------------- 1 | # Airflow 101: working locally and familiarise with the tool 2 | 3 | ## Pre-requisites 4 | 5 | The following prerequisites are needed: 6 | 7 | - Libraries detailed in the Setting up section (either via conda or pipenv) 8 | - MySQL installed 9 | - text editor 10 | - command line 11 | 12 | ## Getting your environment up and running 13 | 14 | If you followed the instructions you should have Airflow installed as well as the rest of the packages we will be using. 15 | 16 | So let's get our environment up and running: 17 | 18 | If you are using conda start your environment via: 19 | ``` 20 | $ source activate airflow-env 21 | ``` 22 | If using pipenv then: 23 | ``` 24 | $ pipenv shell 25 | ```` 26 | 27 | this will start a shell within a virtual environment, to exit the shell you need to type `exit` and this will exit the virtual environment. 28 | 29 | ## Starting Airflow locally 30 | 31 | Airflow home lives in `~/airflow` by default, but you can change the location before installing airflow. You first need to set the `AIRFLOW_HOME` environment variable and then install airflow. For example, using pip: 32 | 33 | ```sh 34 | export AIRFLOW_HOME=~/mydir/airflow 35 | 36 | # install from PyPI using pip 37 | pip install apache-airflow 38 | ``` 39 | 40 | once you have completed the installation you should see something like this in the `airflow` directory (wherever it lives for you) 41 | 42 | ``` 43 | drwxr-xr-x - myuser 18 Apr 14:02 . 44 | .rw-r--r-- 26k myuser 18 Apr 14:02 ├── airflow.cfg 45 | drwxr-xr-x - myuser 18 Apr 14:02 ├── logs 46 | drwxr-xr-x - myuser 18 Apr 14:02 │ └── scheduler 47 | drwxr-xr-x - myuser 18 Apr 14:02 │ ├── 2019-04-18 48 | lrwxr-xr-x 46 myuser 18 Apr 14:02 │ └── latest -> /Users/myuser/airflow/logs/scheduler/2019-04-18 49 | .rw-r--r-- 2.5k myuser 18 Apr 14:02 └── unittests.cfg 50 | ``` 51 | We need to create a local dag folder: 52 | 53 | ``` 54 | mkdir ~/airflow/dags 55 | ``` 56 | 57 | As your project evolves, your directory will look something like this: 58 | 59 | ``` 60 | airflow # the root directory. 61 | ├── dags # root folder for all dags. files inside folders are not searched for dags. 62 | │ ├── my_dag.py, # my dag (definitions of tasks/operators) including precedence. 63 | │ └── ... 64 | ├── logs # logs for the various tasks that are run 65 | │ └── my_dag # DAG specific logs 66 | │ │ ├── src1_s3 # folder for task-specific logs (log files are created by date of a run) 67 | │ │ ├── src2_hdfs 68 | │ │ ├── src3_s3 69 | │ │ └── spark_task_etl 70 | ├── airflow.db # SQLite database used by Airflow internally to track the status of each DAG. 71 | ├── airflow.cfg # global configuration for Airflow (this can be overridden by config inside the file.) 72 | └── ... 73 | ``` 74 | 75 | --- 76 | 77 | ## Spinning up a local airflow instance 78 | 79 | ➡️ The first thing we need to do is initialize Airflow database: 80 | 81 | ``` 82 | airflow initdb 83 | ``` 84 | 85 | This will be cfrated in `airflow.db` by default. 86 | 87 | ``` 88 | airflow_home 89 | ├── airflow.cfg 90 | ├── airflow.db <- Airflow SQLite DB 91 | └── unittests.cfg 92 | ``` 93 | 94 | 💡Using SQLite is an adequate solution for local testing and development, but it does not support concurrent access. 
In a production environment you will most certainly want to use a more robust database solution such as Postgres or MySQL (see optional section at the bottom on how to do this locally). 95 | 96 | Now we need to launch a terminal an start the Airflow web server (which is a Flask application): 97 | 98 | ``` 99 | airflow webserver -p 8080 100 | ``` 101 | 102 | Now we can head over to [http://localhost:8080](http://localhost:8080) now and you will see that there are a number of examples DAGS already there. 103 | 104 | #### Troubleshooting 105 | 106 | If you have any issues with loading the Airflow console in your web browser, or if there were any errors when you ran airflow webserver, then you may have another application running on port 8080. That's the default port for Airflow, but you can change it to any other user port that's not being used. For example, to run Airflow on port 7070 you could run: 107 | 108 | ``` 109 | airflow webserver -p 7070 110 | ``` 111 | 112 | 113 | 🚦 Take some time to familiarise with the UI and get your local instance set up 114 | 115 | ![](https://www.tensorflow.org/tfx/tutorials/tfx/images/workshop/airflow_dag_buttons.png) 116 | 117 | These are the buttons that allow you to enable, trigger and refresh dags. 118 | 119 | --- 120 | 121 | ### Airflow connections 122 | Now let's have a look at the connections ([http://localhost:8080/admin/connection/](http://localhost:8080/admin/connection/)) go to `admin > connections`. You should be able to see a number of connections available. 123 | These allows you to add services or integrate tools with your airflow server. 124 | 125 | ### Commands 126 | Let us go over some of the commands. Back on your command line: 127 | 128 | ``` 129 | airflow list_dags 130 | ``` 131 | we can list the DAG tasks in a tree view 132 | 133 | ``` 134 | airflow list_tasks tutorial --tree 135 | ``` 136 | 137 | we can tests the dags too, but we will need to set a date parameter so that this executes: 138 | 139 | ``` 140 | airflow test tutorial print_date 2019-09-02 141 | ``` 142 | (note that you cannot use a future date or you will get an error) 143 | ``` 144 | airflow test tutorial templated 2019-09-02 145 | ``` 146 | By using the test commands these are not saved in the database. 147 | 148 | You can also use the command line to enable and trigger DAGS, similar to the buttons in the GUI above: 149 | ``` 150 | # enable/disable 151 | airflow unpause 152 | airflow pause 153 | 154 | # trigger 155 | airflow trigger_dag 156 | ``` 157 | 158 | Now let's start the scheduler: 159 | ``` 160 | airflow scheduler 161 | ``` 162 | 163 | Behind the scenes, it monitors and stays in sync with a folder for all DAG objects it contains. The Airflow scheduler is designed to run as a service in an Airflow production environment. 164 | 165 | Now with the schedule up and running we can trigger an instance: 166 | ``` 167 | $ airflow run airflow run example_bash_operator runme_0 2015-01-01 168 | ``` 169 | 170 | This will be stored in the database and you can see the change of the status change straight away. 171 | 172 | What would happen for example if we wanted to run or trigger the `tutorial` task? 🤔 173 | 174 | Let's try from the CLI and see what happens. 175 | 176 | ``` 177 | airflow trigger_dag tutorial 178 | ``` 179 | 180 | 181 | ## Writing your first DAG 182 | 183 | Let's create our first simple DAG. 184 | Inside the dag directory (`~/airflow/dags)` create a `simple_dag.py` file. 185 | 186 | 1. 
Import Python dependencies 187 | ```python 188 | from datetime import datetime, timedelta 189 | from airflow import DAG 190 | from airflow.operators.dummy_operator import DummyOperator 191 | from airflow.operators.python_operator import PythonOperator 192 | ``` 193 | 194 | 2. Default Airflow arguments 195 | ```python 196 | default_args = { 197 | "owner": "airflow", 198 | "depends_on_past": False, 199 | "start_date": datetime(2019, 4, 30), 200 | "email": ["airflow@example.com"], 201 | "email_on_failure": False, 202 | "email_on_retry": False, 203 | # If a task fails, retry it once after waiting 204 | # at least 2 minutes 205 | "retries": 1, 206 | "retry_delay": timedelta(minutes=2), 207 | } 208 | ``` 209 | 210 | 3. Instantiate the DAG:Give the DAG name, configure the schedule, and set the DAG settings 211 | 212 | ```python 213 | dag = DAG( 214 | "hello_world", 215 | description="Simple tutorial DAG", 216 | schedule_interval="0 12 * * *", 217 | default_args=default_args, 218 | catchup=False, 219 | ) 220 | ``` 221 | Here are a couple of options you can use for your `schedule_interval`. You can choose to use some preset argument or cron-like argument: 222 | 223 | ![](_static/dag-time.png) 224 | 225 | For example 226 | `schedule_interval='@daily' ` 227 | `schedule_interval='0 0 * * *'` 228 | 229 | For reference or 230 | 231 | 4. Layout your tasks 232 | 233 | ```python 234 | t1 = DummyOperator(task_id="dummy_task", retries=3, dag=dag) 235 | 236 | t2 = PythonOperator(task_id="hello_task", python_callable=print_hello, dag=dag) 237 | ``` 238 | 239 | 5. Setting dependencies 240 | Set the order of the tasks 241 | 242 | ```python 243 | t1 >> t2 244 | ``` 245 | 246 | Other ways 247 | ```python 248 | # This means that t2 will depend on t1 249 | # running successfully to run. 250 | t1.set_downstream(t2) 251 | 252 | # similar to above where t3 will depend on t1 253 | t3.set_upstream(t1) 254 | ``` 255 | 256 | ```python 257 | # And the upstream dependency with the 258 | # bit shift operator: 259 | t2 << t1 260 | ``` 261 | ```python 262 | # A list of tasks can also be set as 263 | # dependencies. These operations 264 | # all have the same effect: 265 | t1.set_downstream([t2, t3]) 266 | t1 >> [t2, t3] 267 | [t2, t3] << t1 268 | 269 | ``` 270 | 271 | Your final DAG should look like this 272 | 273 | ```python 274 | from datetime import datetime, timedelta 275 | from airflow import DAG 276 | from airflow.operators.dummy_operator import DummyOperator 277 | from airflow.operators.python_operator import PythonOperator 278 | 279 | 280 | def print_hello(): 281 | return "Hello world!" 
282 | 283 | 284 | default_args = { 285 | "owner": "airflow", 286 | "depends_on_past": False, 287 | "start_date": datetime(2019, 8, 31), 288 | "email": ["airflow@example.com"], 289 | "email_on_failure": False, 290 | "email_on_retry": False, 291 | # If a task fails, retry it once after waiting 292 | # at least 2 minutes 293 | "retries": 1, 294 | "retry_delay": timedelta(minutes=2), 295 | } 296 | 297 | dag = DAG( 298 | "hello_world", 299 | description="Simple tutorial DAG", 300 | schedule_interval="0 12 * * *", 301 | default_args=default_args, 302 | catchup=False, 303 | ) 304 | 305 | t1 = DummyOperator(task_id="dummy_task", retries=3, dag=dag) 306 | 307 | t2 = PythonOperator(task_id="hello_task", python_callable=print_hello, dag=dag) 308 | 309 | # sets downstream foe t1 310 | t1 >> t2 311 | 312 | # equivalent 313 | # t2.set_upstream(t1) 314 | 315 | ``` 316 | 317 | If it is properly setup you should be able to see this straight away on your instance. 318 | 319 | You should be able to trigger this DAG straight away. 320 | 321 | ### Your first operator 322 | 323 | An Operator is an atomic block of workflow logic, which performs a single action. Operators are written as Python classes (subclasses of `BaseOperator`), where the `__init__` function can be used to configure settings for the task and a method named execute is called when the task instance is executed. 324 | 325 | 326 | The execute method may also raise the `AirflowSkipException` from `airflow.exceptions`. In such a case the task instance would transition to the Skipped status. 327 | 328 | If another exception is raised, the task will be retried until the maximum number of `retries` is reached. 329 | 330 | 🚦We need to create a new directory: 331 | 332 | ``` 333 | mkdir /plugins 334 | ``` 335 | 336 | Then `my_operators.py` 337 | 338 | 339 | ``` 340 | import logging 341 | 342 | from airflow.models import BaseOperator 343 | from airflow.plugins_manager import AirflowPlugin 344 | from airflow.utils.decorators import apply_defaults 345 | 346 | log = logging.getLogger(__name__) 347 | 348 | class MyFirstOperator(BaseOperator): 349 | 350 | @apply_defaults 351 | def __init__(self, my_operator_param, *args, **kwargs): 352 | self.operator_param = my_operator_param 353 | super(MyFirstOperator, self).__init__(*args, **kwargs) 354 | 355 | def execute(self, context): 356 | log.info("Hello World!") 357 | log.info('operator_param: %s', self.operator_param) 358 | 359 | class MyFirstPlugin(AirflowPlugin): 360 | name = "my_first_plugin" 361 | operators = [MyFirstOperator] 362 | ``` 363 | In this file we are defining a new operator named `MyFirstOperator`. Its execute method is very simple, all it does is log “Hello World!” and the value of its own single parameter. The parameter is set in the `__init__` function. 364 | 365 | Now, we’ll need to create a new DAG to test our operator. 
Create a `dags/test_operators.py` file and fill it with the following content: 366 | 367 | ``` 368 | from datetime import datetime 369 | from airflow import DAG 370 | from airflow.operators.dummy_operator import DummyOperator 371 | from my_operators import MyFirstOperator 372 | 373 | dag = DAG('my_test_dag', description='Another tutorial DAG', 374 | schedule_interval='0 12 * * *', 375 | start_date=datetime(2019, 8, 31), catchup=False) 376 | 377 | dummy_task = DummyOperator(task_id='dummy_task', dag=dag) 378 | 379 | operator_task = MyFirstOperator(my_operator_param='This is a test.', 380 | task_id='my_first_operator_task', dag=dag) 381 | 382 | dummy_task >> operator_task 383 | ``` 384 | 385 | --- 386 | 387 | ## 🧪 OPTIONAL: Changing your database for a MySQL database 388 | 389 | As we mentioned before Airflow uses a database to keep track of the tasks and their statuses. So it is critical to have one set up. 390 | 391 | To start the default database we can run 392 | ` airflow initdb`. This will initialize your database via alembic so that it matches the latest Airflow release. 393 | 394 | The default database used is `sqlite` which means you cannot parallelize tasks using this database. Since we have MySQL and MySQL client installed we will set them up so that we can use them with airflow. 395 | 396 | 🚦Create an airflow database 397 | 398 | From the command line: 399 | 400 | ``` 401 | MySQL -u root -p 402 | mysql> CREATE DATABASE airflow CHARACTER SET utf8 COLLATE utf8_unicode_ci; 403 | mysql> GRANT ALL PRIVILEGES ON airflow.* To 'airflow'@'localhost'; 404 | mysql> FLUSH PRIVILEGES; 405 | ``` 406 | and initialize the database: 407 | 408 | ``` 409 | airflow initdb 410 | ``` 411 | 412 | Notice that this will fail with the default `airflow.cfg` 413 | 414 | 415 | ## Update your local configuration 416 | 417 | Open your airflow configuration file `~/airflow/airflow.cf` and make the following changes: 418 | 419 | 420 | ``` 421 | executor = CeleryExecutor 422 | ``` 423 | 424 | ``` 425 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#broker-settings 426 | # needs rabbitmq running 427 | broker_url = amqp://guest:guest@127.0.0.1/ 428 | 429 | 430 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings 431 | result_backend = db+mysql://airflow:airflow@localhost:3306/airflow 432 | 433 | sql_alchemy_conn = mysql://airflow:python2019@localhost:3306/airflow 434 | 435 | ``` 436 | 437 | Here we are replacing the default executor (`SequentialExecutor`) with the `CeleryExecutor` so that we can run multiple DAGs in parallel. 438 | We also replace the default `sqlite` database with our newly created `airflow` database. 439 | 440 | Now we can initialize the database: 441 | ``` 442 | airflow initdb 443 | ``` 444 | 445 | --- -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. EuroScipy tutorial documentation master file, created by 2 | sphinx-quickstart on Sun Sep 1 21:47:51 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to the EuroScipy Airflow tutorial 7 | ============================================== 8 | This tutorial was originally developed for EuroScipy 2019. 9 | 10 | 11 | .. 
toctree:: 12 | :caption: Table of Contents 13 | :hidden: 14 | :maxdepth: 2 15 | 16 | setup 17 | about 18 | pipelines 19 | airflow-intro 20 | first-airflow 21 | 22 | .. toctree:: 23 | :maxdepth: 2 24 | :caption: Contents: 25 | 26 | About your facilitator 27 | ====================== 28 | 29 | My name is Tania. I live in Manchester UK where I work as a 30 | Cloud Advocate for Microsoft. 31 | 32 | Over the years, I have worked as a data engineer, machine learning engineer, 33 | and research software engineer. I love data intensive 34 | enviroments and I am particularly interested in the tools and workflows to 35 | deliver robust, reproducible data insights. 36 | 37 | If you have any questions or feedback about this tutorial please, 38 | file an issue using the following link: ``_. 39 | 40 | You can also contact me via the following channels: 41 | 42 | - E-mail: trallard@bitsandchips.me 43 | - Twitter: `@ixek `_ 44 | - `Tania on GitHub `_ 45 | 46 | Code of Conduct 47 | ================ 48 | All attendees to this workshop are expected to adhere to EuroScipy's Code of Conduct, 49 | in brief: 50 | **Be open, considerate, and respectful.** 51 | 52 | License 53 | ======= 54 | The content in this workshop is Licensed under `CC-BY-SA 4.0 `_. 55 | Which means that you can use, remix and re-distribute so long attribution to the original 56 | author is maintained (Tania Allard). 57 | 58 | The logo used here was designed by Ashley McNamara for the Microsoft Developer Advocates team use. 59 | 60 | 61 | -------------------------------------------------------------------------------- /docs/source/pipelines.md: -------------------------------------------------------------------------------- 1 | # Pipelines 2 | 3 | ![](_static/automation1.jpg) 4 | 5 | Automation helps us speed those manual boring tasks. The ability to automate means you can spend time working on other more thought-intensive projects. 6 | 7 | Automation adds monitoring and logging tasks: 8 | 9 | 10 | 11 | ![](_static/automate.png) 12 | 13 | ## Steps to automation 14 | 15 | Whenever you consider automating a task ask the following questions: 16 | - When should this task begin? 17 | - Does this task have a time limit? 18 | - What are the inputs for this task? 19 | - What is success or failure within this task? (How can we clearly identify the outcomes?) 20 | - If the task fails what should happen? 21 | - What does the task provide or produce? In what way? To whom? 22 | - What (if anything) should happen after the task concludes? 23 | 24 |
25 | **Top tip**
26 | If your project is too large or loosely defined, try breaking it up into smaller tasks and automating a few of those tasks. Perhaps your task involves a report which downloads two datasets, runs cleanup and analysis, and then sends the results to different groups depending on the outcome. 27 | You can break this task into subtasks, automating each step. If any of these subtasks fail, stop the chain and alert whoever is responsible for maintaining the script so it can be investigated further. 28 |
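As a rough illustration of this tip, here is a minimal sketch of such a report job broken into subtasks, where the chain stops and a maintainer is alerted if any step fails. Every function name and the stand-in data below are hypothetical placeholders, not code from this repository:

```python
# Minimal sketch: a report job broken into small, well-defined subtasks.
# All functions are hypothetical placeholders for real work.
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("report-pipeline")


def download_dataset(name):
    log.info("downloading %s", name)
    return [1, 2, None, 3]  # stand-in for a real download


def clean(rows):
    return [r for r in rows if r is not None]


def analyse(rows):
    return {"total": sum(rows)}


def send_report(results):
    log.info("sending report: %s", results)


def alert_maintainer(message):
    # stand-in for e.g. an email or chat notification
    log.error("ALERT: %s", message)


if __name__ == "__main__":
    try:
        raw_a = download_dataset("dataset_a")
        raw_b = download_dataset("dataset_b")
        results = analyse(clean(raw_a) + clean(raw_b))
        send_report(results)
    except Exception as exc:
        # if any subtask fails, stop the chain and alert the maintainer
        alert_maintainer(f"report pipeline failed: {exc}")
        raise
```

Once each subtask is its own function (or script), handing the chain over to a workflow manager later becomes much easier.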
29 | 30 | ## What is a data pipeline? 31 | 32 | Roughly, this is what all pipelines look like: 33 | 34 | ![](https://i1.wp.com/datapipesoft.com/wp-content/uploads/2017/05/data-pipeline.png?fit=651%2C336&ssl=1) 35 | 36 | They consist mainly of three distinct parts: data engineering processes, data preparation, and analytics. The upstream steps and the quality of the data largely determine the performance and quality of the subsequent steps. 37 | 38 | ## Why do pipelines matter? 39 | 40 | - Analytics and batch processing are mission-critical as they power all data-intensive applications 41 | - The complexity of the data sources and demands increases every day 42 | - A lot of time is invested in writing and monitoring jobs, and in troubleshooting issues. 43 | 44 | This makes data engineering one of the most critical foundations of the whole analytics cycle. 45 | 46 | ### Good data pipelines are: 47 | 48 | - Reproducible: same code, same data, same environment -> same outcome 49 | - Easy to productise: they need minimal modifications from R&D to production 50 | - Atomic: broken into smaller well-defined tasks 51 | 52 | When working with data pipelines, always remember these two statements: 53 | 54 | 55 | ![](_static/gooddata.png) 56 | 57 | --- 58 | 59 | ![](_static/gooddata1.png) 60 | 61 | As your data engineering and data quality demands increase, so does the complexity of the processes. More often than not, you will eventually need a workflow manager to help you orchestrate these processes. 62 | 63 |
64 | Think of a workflow manager as: 65 | 66 | GNU Make + Unix pipes + Steroids 67 |
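To make that idea concrete, here is a minimal, hypothetical sketch of how a get data -> clean data -> analyse data chain could be handed to a workflow manager such as Airflow. The callables are placeholder stubs rather than the scripts shipped in this repository, and writing real DAGs is covered properly in the Airflow sections of this tutorial:

```python
# Minimal sketch of a three-step pipeline expressed as an Airflow DAG.
# The callables are placeholder stubs, not the scripts in this repo.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator


def get_data():
    return "raw data"


def clean_data():
    return "clean data"


def analyse_data():
    return "analysis results"


dag = DAG(
    "simple_pipeline",
    start_date=datetime(2019, 8, 31),
    schedule_interval="@daily",
    catchup=False,
)

get_task = PythonOperator(task_id="get_data", python_callable=get_data, dag=dag)
clean_task = PythonOperator(task_id="clean_data", python_callable=clean_data, dag=dag)
analyse_task = PythonOperator(task_id="analyse_data", python_callable=analyse_data, dag=dag)

# the workflow manager now owns scheduling, retries, logging and monitoring
get_task >> clean_task >> analyse_task
```

The same chain would still work as a plain script; the workflow manager adds the scheduling, retry, and monitoring "steroids" on top.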
68 | 69 | 70 | --- 71 | 72 | ## Creating a simple data analysis pipeline 73 | 74 | 75 | 76 | Let's start by cloning the repository 77 | 78 | ``` 79 | git clone https://github.com/trallard/opendata-airflow-tutorial.git 80 | ``` 81 | 82 | You will notice that you have a `census_data` directory. This contains both the scripts and the notebooks versions of the analysis we are going to use. 83 | 84 | Let's have a look at the notebooks! 85 | 86 | ``` 87 | jupyter lab 88 | ``` 89 | 90 | Alternatively: 91 | 92 | ``` 93 | jupyter 94 | ``` 95 | 96 | 97 | ### Create your own pipeline 98 | 99 | Note that there is not a single correct answer for this. Many will have different approaches. 100 | 101 | 🚦 Create a local script/pipeline that will run: 102 | 103 | get data -> clean data -> analyse data -> generate report / generate plots 104 | 105 | You already have `get_data.py`, `clean_data.py` and `analysis.py` as a simplified version of the notebooks. You can add a `create_plots.py` or `create_report.py`. -------------------------------------------------------------------------------- /docs/source/setup.rst: -------------------------------------------------------------------------------- 1 | Setup 2 | =============== 3 | This section will guide you through the pre requisites for the workshop. 4 | Please make sure to install the libraries before the workshop as the conference WiFi 5 | can get quite slow when having too many people downloading and installing things at the same 6 | time. 7 | 8 | Make sure to follow all the steps as detailed here. 9 | 10 | Python 3.x 11 | ++++++++++ 12 | 13 | 3.7 Preferred 14 | 15 | We will be using `Python `_. 16 | Installing all of Python's packages individually can be a bit 17 | difficult, so we recommend using `Anaconda `_ which 18 | provides a variety of useful packages/tools. 19 | 20 | To download Anaconda, follow the link https://www.anaconda.com/download/ and select 21 | Python 3. Following the download, run the installer as per usual on your machine. 22 | 23 | If you prefer not using Anaconda then this `tutorial `_ can help you with the installation and 24 | setup. 25 | 26 | If you already have Python installed but not via Anaconda do not worry. 27 | Make sure to have either ``venv`` or ``pipenv`` installed. Then follow the instructions to set 28 | your virtual environment further down. 29 | 30 | Git 31 | +++ 32 | 33 | `Git `_ is a version control software that records changes 34 | to a file or set of files. Git is especially helpful for software developers 35 | as it allows changes to be tracked (including who and when) when working on a 36 | project. 37 | 38 | To download Git, go to the following link and choose the correct version for your 39 | operating system: https://git-scm.com/downloads. 40 | 41 | Windows 42 | -------- 43 | 44 | Download the `git for Windows installer `_ . 45 | Make sure to select "use Git from the Windows command prompt" 46 | this will ensure that Git is permanently added to your PATH. 47 | 48 | Also select "Checkout Windows-style, commit Unix-style line endings" selected and click on "Next". 49 | 50 | This will provide you both git and git bash. We will use the command line quite a lot during the workshop 51 | so using git bash is a good option. 52 | 53 | GitHub 54 | ++++++ 55 | 56 | GitHub is a web-based service for version control using Git. You will need 57 | to set up an account at `https://github.com `_. Basic GitHub accounts are 58 | free and you can now also have private repositories. 
59 | 60 | Text Editors/IDEs 61 | ++++++++++++ 62 | 63 | Text editors are tools with powerful features designed to optimize writing code. 64 | There are several text editors that you can choose from. 65 | Here are some we recommend: 66 | 67 | - `VS code `_: this is your facilitator's favourite 💜 and it is worth trying if you have not checked it yet 68 | - `Pycharm `_ 69 | - `Atom `_ 70 | 71 | We suggest trying several editors before settling on one. 72 | 73 | If you decide to go for VSCode make sure to also 74 | have the `Python extension `_ 75 | installed. This will make your life so much easier (and it comes with a lot of nifty 76 | features 😎). 77 | 78 | Creating a virtual environment 79 | +++++++++++++++++++++++++++++++ 80 | 81 | You will need to create a virtual environment to make sure that you have the right packages and setup needed to follow along the tutorial. 82 | Follow the instructions that best suit your installation. 83 | 84 | Anaconda 85 | -------- 86 | 87 | Clone the repository: 88 | :: 89 | git clone https://github.com/trallard/opendata-airflow-tutorial 90 | 91 | Change into the repo 92 | :: 93 | cd opendata-airflow-tutorial 94 | 95 | Create a conda environment: 96 | :: 97 | conda env create -f environment.yml 98 | 99 | Once all the dependencies are installed you can activate your environment through the following commands 100 | :: 101 | source activate airflow-env # Mac 102 | activate airflow-env # Windows and Linux 103 | To exit the environment you can use 104 | :: 105 | conda deactivate 106 | 107 | virtualenv 108 | ----------- 109 | Create a directory for the tutorial, for example : 110 | :: 111 | mkdir airflow-tutorial 112 | and change directories into it (``cd airflow-tutorial``). 113 | Now you need to run venv 114 | :: 115 | python3 -m venv env/airflow # Mac and Linux 116 | python -m venv env/airflow # Windows 117 | 118 | this will create a virtual Python environment in the ``env/airflow`` folder. 119 | Before installing the required packages you need to activate your virtual environment: 120 | :: 121 | source env/bin/activate # Mac and Linux 122 | .\env\Scripts\activate # Windows 123 | 124 | 125 | Now you can install the packages using via pip ``pip install -r requirements.txt`` 126 | 127 | To leave the virtual environment run ``deactivate`` 128 | 129 | Docker 130 | +++++++ 131 | 132 | There is a Docker image built with all the needed libraries. 
133 | 134 | You can run it locally with: 135 | :: 136 | docker run --rm -it -p 5555:5555/tcp -p 8080:8080/tcp -p 8793:8793/tcp -p 8888:8888/tcp -e JUPYTER_ENABLE_LAB=yes trallard/airflow-tutorial:1.0 -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: airflow-env 2 | dependencies: 3 | - jupyter==1.0.0 4 | - jupyterlab==0.35.5 5 | - matplotlib==3.0.3 6 | - pandas==0.24.2 7 | - pip: 8 | - apache-airflow==1.10.3 9 | -------------------------------------------------------------------------------- /extra_tfx_example/dags/taxi_pipeline.py: -------------------------------------------------------------------------------- 1 | """Chicago taxi example using TFX.""" 2 | 3 | from __future__ import absolute_import, division, print_function 4 | 5 | import datetime 6 | import logging 7 | import os 8 | 9 | from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen 10 | from tfx.orchestration.airflow.airflow_runner import AirflowDAGRunner 11 | from tfx.orchestration.pipeline import PipelineDecorator 12 | from tfx.utils.dsl_utils import csv_input 13 | 14 | # pylint: disable=line-too-long 15 | # from tfx.components.statistics_gen.component import StatisticsGen # Step 3 16 | # from tfx.components.schema_gen.component import SchemaGen # Step 3 17 | # from tfx.components.example_validator.component import ExampleValidator # Step 3 18 | 19 | # from tfx.components.transform.component import Transform # Step 4 20 | 21 | # from tfx.proto import trainer_pb2 # Step 5 22 | # from tfx.components.trainer.component import Trainer # Step 5 23 | 24 | # from tfx.proto import evaluator_pb2 # Step 6 25 | # from tfx.components.evaluator.component import Evaluator # Step 6 26 | 27 | # from tfx.proto import pusher_pb2 # Step 7 28 | # from tfx.components.model_validator.component import ModelValidator # Step 7 29 | # from tfx.components.pusher.component import Pusher # Step 7 30 | 31 | 32 | # pylint: enable=line-too-long 33 | 34 | # This example assumes that the taxi data is stored in ~/taxi/data and the 35 | # taxi utility function is in ~/taxi. Feel free to customize this as needed. 36 | _taxi_root = os.path.join(os.environ["HOME"], "airflow") 37 | _data_root = os.path.join(_taxi_root, "data/taxi_data") 38 | # Python module file to inject customized logic into the TFX components. The 39 | # Transform and Trainer both require user-defined functions to run successfully. 40 | _taxi_module_file = os.path.join(_taxi_root, "dags/taxi_utils.py") 41 | # Path which can be listened to by the model server. Pusher will output the 42 | # trained model here. 43 | _serving_model_dir = os.path.join(_taxi_root, "saved_models/taxi") 44 | 45 | # Directory and data locations. This example assumes all of the chicago taxi 46 | # example code and metadata library is relative to $HOME, but you can store 47 | # these files anywhere on your local filesystem. 
48 | _tfx_root = os.path.join(_taxi_root, "tfx") 49 | _pipeline_root = os.path.join(_tfx_root, "pipelines") 50 | _metadata_db_root = os.path.join(_tfx_root, "metadata") 51 | _log_root = os.path.join(_tfx_root, "logs") 52 | 53 | # Airflow-specific configs; these will be passed directly to airflow 54 | _airflow_config = { 55 | "schedule_interval": None, 56 | "start_date": datetime.datetime(2019, 1, 1), 57 | } 58 | 59 | # Logging overrides 60 | logger_overrides = {"log_root": _log_root, "log_level": logging.INFO} 61 | 62 | 63 | @PipelineDecorator( 64 | pipeline_name="taxi", 65 | enable_cache=True, 66 | metadata_db_root=_metadata_db_root, 67 | additional_pipeline_args={"logger_args": logger_overrides}, 68 | pipeline_root=_pipeline_root, 69 | ) 70 | def _create_pipeline(): 71 | """Implements the chicago taxi pipeline with TFX.""" 72 | examples = csv_input(_data_root) 73 | 74 | # Brings data into the pipeline or otherwise joins/converts training data. 75 | example_gen = CsvExampleGen(input_base=examples) 76 | 77 | # Computes statistics over data for visualization and example validation. 78 | # pylint: disable=line-too-long 79 | # statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples) # Step 3 80 | # pylint: enable=line-too-long 81 | 82 | # Generates schema based on statistics files. 83 | # infer_schema = SchemaGen(stats=statistics_gen.outputs.output) # Step 3 84 | 85 | # Performs anomaly detection based on statistics and data schema. 86 | # validate_stats = ExampleValidator( # Step 3 87 | # stats=statistics_gen.outputs.output, # Step 3 88 | # schema=infer_schema.outputs.output) # Step 3 89 | 90 | # Performs transformations and feature engineering in training and serving. 91 | # transform = Transform( # Step 4 92 | # input_data=example_gen.outputs.examples, # Step 4 93 | # schema=infer_schema.outputs.output, # Step 4 94 | # module_file=_taxi_module_file) # Step 4 95 | 96 | # Uses user-provided Python function that implements a model using TF-Learn. 97 | # trainer = Trainer( # Step 5 98 | # module_file=_taxi_module_file, # Step 5 99 | # transformed_examples=transform.outputs.transformed_examples, # Step 5 100 | # schema=infer_schema.outputs.output, # Step 5 101 | # transform_output=transform.outputs.transform_output, # Step 5 102 | # train_args=trainer_pb2.TrainArgs(num_steps=10000), # Step 5 103 | # eval_args=trainer_pb2.EvalArgs(num_steps=5000)) # Step 5 104 | 105 | # Uses TFMA to compute a evaluation statistics over features of a model. 106 | # model_analyzer = Evaluator( # Step 6 107 | # examples=example_gen.outputs.examples, # Step 6 108 | # model_exports=trainer.outputs.output, # Step 6 109 | # feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[ # Step 6 110 | # evaluator_pb2.SingleSlicingSpec( # Step 6 111 | # column_for_slicing=['trip_start_hour']) # Step 6 112 | # ])) # Step 6 113 | 114 | # Performs quality validation of a candidate model (compared to a baseline). 115 | # model_validator = ModelValidator( # Step 7 116 | # examples=example_gen.outputs.examples, # Step 7 117 | # model=trainer.outputs.output) # Step 7 118 | 119 | # Checks whether the model passed the validation steps and pushes the model 120 | # to a file destination if check passed. 
121 | # pusher = Pusher( # Step 7 122 | # model_export=trainer.outputs.output, # Step 7 123 | # model_blessing=model_validator.outputs.blessing, # Step 7 124 | # push_destination=pusher_pb2.PushDestination( # Step 7 125 | # filesystem=pusher_pb2.PushDestination.Filesystem( # Step 7 126 | # base_directory=_serving_model_dir))) # Step 7 127 | 128 | return [ 129 | example_gen, 130 | # statistics_gen, infer_schema, validate_stats, # Step 3 131 | # transform, # Step 4 132 | # trainer, # Step 5 133 | # model_analyzer, # Step 6 134 | # model_validator, pusher # Step 7 135 | ] 136 | 137 | 138 | pipeline = AirflowDAGRunner(_airflow_config).run(_create_pipeline()) 139 | -------------------------------------------------------------------------------- /extra_tfx_example/dags/taxi_utils.py: -------------------------------------------------------------------------------- 1 | """Python source file include taxi pipeline functions and necesasry utils. 2 | 3 | For a TFX pipeline to successfully run, a preprocessing_fn and a 4 | _build_estimator function needs to be provided. This file contains both. 5 | """ 6 | 7 | from __future__ import division, print_function 8 | 9 | import os # pylint: disable=unused-import 10 | 11 | import tensorflow as tf # pylint: disable=unused-import 12 | 13 | # import tensorflow_transform as tft # Step 4 14 | # from tensorflow_transform.beam.tft_beam_io import transform_fn_io # Step 4 15 | # from tensorflow_transform.saved import saved_transform_io # Step 4 16 | # from tensorflow_transform.tf_metadata import metadata_io # Step 4 17 | # from tensorflow_transform.tf_metadata import schema_utils # Step 4 18 | 19 | # import tensorflow_model_analysis as tfma # Step 5 20 | 21 | 22 | # Categorical features are assumed to each have a maximum value in the dataset. 23 | _MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12] 24 | 25 | _CATEGORICAL_FEATURE_KEYS = [ 26 | "trip_start_hour", 27 | "trip_start_day", 28 | "trip_start_month", 29 | "pickup_census_tract", 30 | "dropoff_census_tract", 31 | "pickup_community_area", 32 | "dropoff_community_area", 33 | ] 34 | 35 | _DENSE_FLOAT_FEATURE_KEYS = ["trip_miles", "fare", "trip_seconds"] 36 | 37 | # Number of buckets used by tf.transform for encoding each feature. 38 | _FEATURE_BUCKET_COUNT = 10 39 | 40 | _BUCKET_FEATURE_KEYS = [ 41 | "pickup_latitude", 42 | "pickup_longitude", 43 | "dropoff_latitude", 44 | "dropoff_longitude", 45 | ] 46 | 47 | # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform 48 | _VOCAB_SIZE = 1000 49 | 50 | # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed. 51 | _OOV_SIZE = 10 52 | 53 | _VOCAB_FEATURE_KEYS = ["payment_type", "company"] 54 | 55 | # Keys 56 | _LABEL_KEY = "tips" 57 | _FARE_KEY = "fare" 58 | 59 | # Step 4 START -------------------------- 60 | # def _transformed_name(key): 61 | # return key + '_xf' 62 | 63 | 64 | # def _transformed_names(keys): 65 | # return [_transformed_name(key) for key in keys] 66 | 67 | 68 | # # Tf.Transform considers these features as "raw" 69 | # def _get_raw_feature_spec(schema): 70 | # return schema_utils.schema_as_feature_spec(schema).feature_spec 71 | 72 | 73 | # def _gzip_reader_fn(): 74 | # """Small utility returning a record reader that can read gzip'ed files.""" 75 | # return tf.TFRecordReader( 76 | # options=tf.python_io.TFRecordOptions( 77 | # compression_type=tf.python_io.TFRecordCompressionType.GZIP)) 78 | 79 | 80 | # def _fill_in_missing(x): 81 | # """Replace missing values in a SparseTensor. 
82 | 83 | # Fills in missing values of `x` with '' or 0, and converts to a dense tensor. 84 | 85 | # Args: 86 | # x: A `SparseTensor` of rank 2. Its dense shape should have size at most 1 87 | # in the second dimension. 88 | 89 | # Returns: 90 | # A rank 1 tensor where missing values of `x` have been filled in. 91 | # """ 92 | # default_value = '' if x.dtype == tf.string else 0 93 | # return tf.squeeze( 94 | # tf.sparse_to_dense(x.indices, [x.dense_shape[0], 1], x.values, 95 | # default_value), 96 | # axis=1) 97 | 98 | 99 | # def preprocessing_fn(inputs): 100 | # """tf.transform's callback function for preprocessing inputs. 101 | 102 | # Args: 103 | # inputs: map from feature keys to raw not-yet-transformed features. 104 | 105 | # Returns: 106 | # Map from string feature key to transformed feature operations. 107 | # """ 108 | # outputs = {} 109 | # for key in _DENSE_FLOAT_FEATURE_KEYS: 110 | # # Preserve this feature as a dense float, setting nan's to the mean. 111 | # outputs[_transformed_name(key)] = tft.scale_to_z_score( 112 | # _fill_in_missing(inputs[key])) 113 | 114 | # for key in _VOCAB_FEATURE_KEYS: 115 | # # Build a vocabulary for this feature. 116 | # outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary( 117 | # _fill_in_missing(inputs[key]), 118 | # top_k=_VOCAB_SIZE, 119 | # num_oov_buckets=_OOV_SIZE) 120 | 121 | # for key in _BUCKET_FEATURE_KEYS: 122 | # outputs[_transformed_name(key)] = tft.bucketize( 123 | # _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT) 124 | 125 | # for key in _CATEGORICAL_FEATURE_KEYS: 126 | # outputs[_transformed_name(key)] = _fill_in_missing(inputs[key]) 127 | 128 | # # Was this passenger a big tipper? 129 | # taxi_fare = _fill_in_missing(inputs[_FARE_KEY]) 130 | # tips = _fill_in_missing(inputs[_LABEL_KEY]) 131 | # outputs[_transformed_name(_LABEL_KEY)] = tf.where( 132 | # tf.is_nan(taxi_fare), 133 | # tf.cast(tf.zeros_like(taxi_fare), tf.int64), 134 | # # Test if the tip was > 20% of the fare. 135 | # tf.cast( 136 | # tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), 137 | # tf.int64)) 138 | 139 | # return outputs 140 | # Step 4 END -------------------------- 141 | 142 | # Step 5 START -------------------------- 143 | # def _build_estimator(transform_output, 144 | # config, 145 | # hidden_units=None, 146 | # warm_start_from=None): 147 | # """Build an estimator for predicting the tipping behavior of taxi riders. 148 | 149 | # Args: 150 | # transform_output: directory in which the tf-transform model was written 151 | # during the preprocessing step. 152 | # config: tf.contrib.learn.RunConfig defining the runtime environment for 153 | # the estimator (including model_dir). 154 | # hidden_units: [int], the layer sizes of the DNN (input layer first) 155 | # warm_start_from: Optional directory to warm start from. 156 | 157 | # Returns: 158 | # A dict of the following: 159 | # - estimator: The estimator that will be used for training and eval. 160 | # - train_spec: Spec for training. 161 | # - eval_spec: Spec for eval. 162 | # - eval_input_receiver_fn: Input function for eval. 
163 | # """ 164 | # metadata_dir = os.path.join(transform_output, 165 | # transform_fn_io.TRANSFORMED_METADATA_DIR) 166 | # transformed_metadata = metadata_io.read_metadata(metadata_dir) 167 | # transformed_feature_spec = transformed_metadata.schema.as_feature_spec() 168 | 169 | # transformed_feature_spec.pop(_transformed_name(_LABEL_KEY)) 170 | 171 | # real_valued_columns = [ 172 | # tf.feature_column.numeric_column(key, shape=()) 173 | # for key in _transformed_names(_DENSE_FLOAT_FEATURE_KEYS) 174 | # ] 175 | # categorical_columns = [ 176 | # tf.feature_column.categorical_column_with_identity( 177 | # key, num_buckets=_VOCAB_SIZE + _OOV_SIZE, default_value=0) 178 | # for key in _transformed_names(_VOCAB_FEATURE_KEYS) 179 | # ] 180 | # categorical_columns += [ 181 | # tf.feature_column.categorical_column_with_identity( 182 | # key, num_buckets=_FEATURE_BUCKET_COUNT, default_value=0) 183 | # for key in _transformed_names(_BUCKET_FEATURE_KEYS) 184 | # ] 185 | # categorical_columns += [ 186 | # tf.feature_column.categorical_column_with_identity( # pylint: disable=g-complex-comprehension 187 | # key, 188 | # num_buckets=num_buckets, 189 | # default_value=0) for key, num_buckets in zip( 190 | # _transformed_names(_CATEGORICAL_FEATURE_KEYS), 191 | # _MAX_CATEGORICAL_FEATURE_VALUES) 192 | # ] 193 | # return tf.estimator.DNNLinearCombinedClassifier( 194 | # config=config, 195 | # linear_feature_columns=categorical_columns, 196 | # dnn_feature_columns=real_valued_columns, 197 | # dnn_hidden_units=hidden_units or [100, 70, 50, 25], 198 | # warm_start_from=warm_start_from) 199 | 200 | 201 | # def _example_serving_receiver_fn(transform_output, schema): 202 | # """Build the serving in inputs. 203 | 204 | # Args: 205 | # transform_output: directory in which the tf-transform model was written 206 | # during the preprocessing step. 207 | # schema: the schema of the input data. 208 | 209 | # Returns: 210 | # Tensorflow graph which parses examples, applying tf-transform to them. 211 | # """ 212 | # raw_feature_spec = _get_raw_feature_spec(schema) 213 | # raw_feature_spec.pop(_LABEL_KEY) 214 | 215 | # raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn( 216 | # raw_feature_spec, default_batch_size=None) 217 | # serving_input_receiver = raw_input_fn() 218 | 219 | # _, transformed_features = ( 220 | # saved_transform_io.partially_apply_saved_transform( 221 | # os.path.join(transform_output, transform_fn_io.TRANSFORM_FN_DIR), 222 | # serving_input_receiver.features)) 223 | 224 | # return tf.estimator.export.ServingInputReceiver( 225 | # transformed_features, serving_input_receiver.receiver_tensors) 226 | 227 | 228 | # def _eval_input_receiver_fn(transform_output, schema): 229 | # """Build everything needed for the tf-model-analysis to run the model. 230 | 231 | # Args: 232 | # transform_output: directory in which the tf-transform model was written 233 | # during the preprocessing step. 234 | # schema: the schema of the input data. 235 | 236 | # Returns: 237 | # EvalInputReceiver function, which contains: 238 | # - Tensorflow graph which parses raw untransformed features, applies the 239 | # tf-transform preprocessing operators. 240 | # - Set of raw, untransformed features. 241 | # - Label against which predictions will be compared. 242 | # """ 243 | # # Notice that the inputs are raw features, not transformed features here. 
244 | # raw_feature_spec = _get_raw_feature_spec(schema) 245 | 246 | # serialized_tf_example = tf.placeholder( 247 | # dtype=tf.string, shape=[None], name='input_example_tensor') 248 | 249 | # # Add a parse_example operator to the tensorflow graph, which will parse 250 | # # raw, untransformed, tf examples. 251 | # features = tf.parse_example(serialized_tf_example, raw_feature_spec) 252 | 253 | # # Now that we have our raw examples, process them through the tf-transform 254 | # # function computed during the preprocessing step. 255 | # _, transformed_features = ( 256 | # saved_transform_io.partially_apply_saved_transform( 257 | # os.path.join(transform_output, transform_fn_io.TRANSFORM_FN_DIR), 258 | # features)) 259 | 260 | # # The key name MUST be 'examples'. 261 | # receiver_tensors = {'examples': serialized_tf_example} 262 | 263 | # # NOTE: Model is driven by transformed features (since training works on the 264 | # # materialized output of TFT, but slicing will happen on raw features. 265 | # features.update(transformed_features) 266 | 267 | # return tfma.export.EvalInputReceiver( 268 | # features=features, 269 | # receiver_tensors=receiver_tensors, 270 | # labels=transformed_features[_transformed_name(_LABEL_KEY)]) 271 | 272 | 273 | # def _input_fn(filenames, transform_output, batch_size=200): 274 | # """Generates features and labels for training or evaluation. 275 | 276 | # Args: 277 | # filenames: [str] list of CSV files to read data from. 278 | # transform_output: directory in which the tf-transform model was written 279 | # during the preprocessing step. 280 | # batch_size: int First dimension size of the Tensors returned by input_fn 281 | 282 | # Returns: 283 | # A (features, indices) tuple where features is a dictionary of 284 | # Tensors, and indices is a single Tensor of label indices. 285 | # """ 286 | # metadata_dir = os.path.join(transform_output, 287 | # transform_fn_io.TRANSFORMED_METADATA_DIR) 288 | # transformed_metadata = metadata_io.read_metadata(metadata_dir) 289 | # transformed_feature_spec = transformed_metadata.schema.as_feature_spec() 290 | 291 | # transformed_features = tf.contrib.learn.io.read_batch_features( 292 | # filenames, batch_size, transformed_feature_spec, reader=_gzip_reader_fn) 293 | 294 | # # We pop the label because we do not want to use it as a feature while we're 295 | # # training. 296 | # return transformed_features, transformed_features.pop( 297 | # _transformed_name(_LABEL_KEY)) 298 | 299 | 300 | # # TFX will call this function 301 | # def trainer_fn(hparams, schema): 302 | # """Build the estimator using the high level API. 303 | 304 | # Args: 305 | # hparams: Holds hyperparameters used to train the model as name/value pairs 306 | # schema: Holds the schema of the training examples. 307 | 308 | # Returns: 309 | # A dict of the following: 310 | # - estimator: The estimator that will be used for training and eval. 311 | # - train_spec: Spec for training. 312 | # - eval_spec: Spec for eval. 313 | # - eval_input_receiver_fn: Input function for eval. 
314 | # """ 315 | # # Number of nodes in the first layer of the DNN 316 | # first_dnn_layer_size = 100 317 | # num_dnn_layers = 4 318 | # dnn_decay_factor = 0.7 319 | 320 | # train_batch_size = 40 321 | # eval_batch_size = 40 322 | 323 | # train_input_fn = lambda: _input_fn( # pylint: disable=g-long-lambda 324 | # hparams.train_files, 325 | # hparams.transform_output, 326 | # batch_size=train_batch_size) 327 | 328 | # eval_input_fn = lambda: _input_fn( # pylint: disable=g-long-lambda 329 | # hparams.eval_files, 330 | # hparams.transform_output, 331 | # batch_size=eval_batch_size) 332 | 333 | # train_spec = tf.estimator.TrainSpec( # pylint: disable=g-long-lambda 334 | # train_input_fn, 335 | # max_steps=hparams.train_steps) 336 | 337 | # serving_receiver_fn = lambda: _example_serving_receiver_fn( # pylint: disable=g-long-lambda 338 | # hparams.transform_output, schema) 339 | 340 | # exporter = tf.estimator.FinalExporter('chicago-taxi', serving_receiver_fn) 341 | # eval_spec = tf.estimator.EvalSpec( 342 | # eval_input_fn, 343 | # steps=hparams.eval_steps, 344 | # exporters=[exporter], 345 | # name='chicago-taxi-eval') 346 | 347 | # run_config = tf.estimator.RunConfig( 348 | # save_checkpoints_steps=999, keep_checkpoint_max=1) 349 | 350 | # run_config = run_config.replace(model_dir=hparams.serving_model_dir) 351 | 352 | # estimator = _build_estimator( 353 | # transform_output=hparams.transform_output, 354 | 355 | # # Construct layers sizes with exponetial decay 356 | # hidden_units=[ 357 | # max(2, int(first_dnn_layer_size * dnn_decay_factor**i)) 358 | # for i in range(num_dnn_layers) 359 | # ], 360 | # config=run_config, 361 | # warm_start_from=hparams.warm_start_from) 362 | 363 | # # Create an input receiver for TFMA processing 364 | # receiver_fn = lambda: _eval_input_receiver_fn( # pylint: disable=g-long-lambda 365 | # hparams.transform_output, schema) 366 | 367 | # return { 368 | # 'estimator': estimator, 369 | # 'train_spec': train_spec, 370 | # 'eval_spec': eval_spec, 371 | # 'eval_input_receiver_fn': receiver_fn 372 | # } 373 | # Step 5 END -------------------------- 374 | -------------------------------------------------------------------------------- /extra_tfx_example/setup/chicago_data/taxi_pipeline_simple.py: -------------------------------------------------------------------------------- 1 | """Chicago taxi example using TFX.""" 2 | 3 | from __future__ import absolute_import, division, print_function 4 | 5 | import datetime 6 | import logging 7 | import os 8 | 9 | from tfx.components.evaluator.component import Evaluator 10 | from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen 11 | from tfx.components.example_validator.component import ExampleValidator 12 | from tfx.components.model_validator.component import ModelValidator 13 | from tfx.components.pusher.component import Pusher 14 | from tfx.components.schema_gen.component import SchemaGen 15 | from tfx.components.statistics_gen.component import StatisticsGen 16 | from tfx.components.trainer.component import Trainer 17 | from tfx.components.transform.component import Transform 18 | from tfx.orchestration.airflow.airflow_runner import AirflowDAGRunner 19 | from tfx.orchestration.pipeline import PipelineDecorator 20 | from tfx.proto import evaluator_pb2, pusher_pb2, trainer_pb2 21 | from tfx.utils.dsl_utils import csv_input 22 | 23 | # This example assumes that the taxi data is stored in ~/taxi/data and the 24 | # taxi utility function is in ~/taxi. Feel free to customize this as needed. 
25 | _taxi_root = os.path.join(os.environ["HOME"], "taxi") 26 | _data_root = os.path.join(_taxi_root, "data/simple") 27 | # Python module file to inject customized logic into the TFX components. The 28 | # Transform and Trainer both require user-defined functions to run successfully. 29 | _taxi_module_file = os.path.join(_taxi_root, "taxi_utils.py") 30 | # Path which can be listened to by the model server. Pusher will output the 31 | # trained model here. 32 | _serving_model_dir = os.path.join(_taxi_root, "serving_model/taxi_simple") 33 | 34 | # Directory and data locations. This example assumes all of the chicago taxi 35 | # example code and metadata library is relative to $HOME, but you can store 36 | # these files anywhere on your local filesystem. 37 | _tfx_root = os.path.join(os.environ["HOME"], "tfx") 38 | _pipeline_root = os.path.join(_tfx_root, "pipelines") 39 | _metadata_db_root = os.path.join(_tfx_root, "metadata") 40 | _log_root = os.path.join(_tfx_root, "logs") 41 | 42 | # Airflow-specific configs; these will be passed directly to airflow 43 | _airflow_config = { 44 | "schedule_interval": None, 45 | "start_date": datetime.datetime(2019, 1, 1), 46 | } 47 | 48 | # Logging overrides 49 | logger_overrides = {"log_root": _log_root, "log_level": logging.INFO} 50 | 51 | 52 | # TODO(b/124066911): Centralize tfx related config into one place. 53 | # TODO(zhitaoli): Remove PipelineDecorator after 0.13.0. 54 | @PipelineDecorator( 55 | pipeline_name="chicago_taxi_simple", 56 | enable_cache=True, 57 | metadata_db_root=_metadata_db_root, 58 | additional_pipeline_args={"logger_args": logger_overrides}, 59 | pipeline_root=_pipeline_root, 60 | ) 61 | def _create_pipeline(): 62 | """Implements the chicago taxi pipeline with TFX.""" 63 | examples = csv_input(_data_root) 64 | 65 | # Brings data into the pipeline or otherwise joins/converts training data. 66 | example_gen = CsvExampleGen(input_base=examples) 67 | 68 | # Computes statistics over data for visualization and example validation. 69 | statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples) 70 | 71 | # Generates schema based on statistics files. 72 | infer_schema = SchemaGen(stats=statistics_gen.outputs.output) 73 | 74 | # Performs anomaly detection based on statistics and data schema. 75 | validate_stats = ExampleValidator( 76 | stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output 77 | ) 78 | 79 | # Performs transformations and feature engineering in training and serving. 80 | transform = Transform( 81 | input_data=example_gen.outputs.examples, 82 | schema=infer_schema.outputs.output, 83 | module_file=_taxi_module_file, 84 | ) 85 | 86 | # Uses user-provided Python function that implements a model using TF-Learn. 87 | trainer = Trainer( 88 | module_file=_taxi_module_file, 89 | transformed_examples=transform.outputs.transformed_examples, 90 | schema=infer_schema.outputs.output, 91 | transform_output=transform.outputs.transform_output, 92 | train_args=trainer_pb2.TrainArgs(num_steps=10000), 93 | eval_args=trainer_pb2.EvalArgs(num_steps=5000), 94 | ) 95 | 96 | # Uses TFMA to compute a evaluation statistics over features of a model. 
97 | model_analyzer = Evaluator( 98 | examples=example_gen.outputs.examples, 99 | model_exports=trainer.outputs.output, 100 | feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec( 101 | specs=[ 102 | evaluator_pb2.SingleSlicingSpec(column_for_slicing=["trip_start_hour"]) 103 | ] 104 | ), 105 | ) 106 | 107 | # Performs quality validation of a candidate model (compared to a baseline). 108 | model_validator = ModelValidator( 109 | examples=example_gen.outputs.examples, model=trainer.outputs.output 110 | ) 111 | 112 | # Checks whether the model passed the validation steps and pushes the model 113 | # to a file destination if check passed. 114 | pusher = Pusher( 115 | model_export=trainer.outputs.output, 116 | model_blessing=model_validator.outputs.blessing, 117 | push_destination=pusher_pb2.PushDestination( 118 | filesystem=pusher_pb2.PushDestination.Filesystem( 119 | base_directory=_serving_model_dir 120 | ) 121 | ), 122 | ) 123 | 124 | return [ 125 | example_gen, 126 | statistics_gen, 127 | infer_schema, 128 | validate_stats, 129 | transform, 130 | trainer, 131 | model_analyzer, 132 | model_validator, 133 | pusher, 134 | ] 135 | 136 | 137 | pipeline = AirflowDAGRunner(_airflow_config).run(_create_pipeline()) 138 | -------------------------------------------------------------------------------- /extra_tfx_example/setup/chicago_data/taxi_utils.py: -------------------------------------------------------------------------------- 1 | """Python source file include taxi pipeline functions and necesasry utils. 2 | 3 | For a TFX pipeline to successfully run, a preprocessing_fn and a 4 | _build_estimator function needs to be provided. This file contains both. 5 | 6 | This file is equivalent to examples/chicago_taxi/trainer/model.py and 7 | examples/chicago_taxi/preprocess.py. 8 | """ 9 | 10 | from __future__ import division, print_function 11 | 12 | import os 13 | 14 | import tensorflow as tf 15 | import tensorflow_model_analysis as tfma 16 | import tensorflow_transform as tft 17 | from tensorflow_transform.beam.tft_beam_io import transform_fn_io 18 | from tensorflow_transform.saved import saved_transform_io 19 | from tensorflow_transform.tf_metadata import metadata_io, schema_utils 20 | 21 | # Categorical features are assumed to each have a maximum value in the dataset. 22 | _MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12] 23 | 24 | _CATEGORICAL_FEATURE_KEYS = [ 25 | "trip_start_hour", 26 | "trip_start_day", 27 | "trip_start_month", 28 | "pickup_census_tract", 29 | "dropoff_census_tract", 30 | "pickup_community_area", 31 | "dropoff_community_area", 32 | ] 33 | 34 | _DENSE_FLOAT_FEATURE_KEYS = ["trip_miles", "fare", "trip_seconds"] 35 | 36 | # Number of buckets used by tf.transform for encoding each feature. 37 | _FEATURE_BUCKET_COUNT = 10 38 | 39 | _BUCKET_FEATURE_KEYS = [ 40 | "pickup_latitude", 41 | "pickup_longitude", 42 | "dropoff_latitude", 43 | "dropoff_longitude", 44 | ] 45 | 46 | # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform 47 | _VOCAB_SIZE = 1000 48 | 49 | # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed. 
50 | _OOV_SIZE = 10 51 | 52 | _VOCAB_FEATURE_KEYS = ["payment_type", "company"] 53 | 54 | # Keys 55 | _LABEL_KEY = "tips" 56 | _FARE_KEY = "fare" 57 | 58 | 59 | def _transformed_name(key): 60 | return key + "_xf" 61 | 62 | 63 | def _transformed_names(keys): 64 | return [_transformed_name(key) for key in keys] 65 | 66 | 67 | # Tf.Transform considers these features as "raw" 68 | def _get_raw_feature_spec(schema): 69 | return schema_utils.schema_as_feature_spec(schema).feature_spec 70 | 71 | 72 | def _gzip_reader_fn(): 73 | """Small utility returning a record reader that can read gzip'ed files.""" 74 | return tf.TFRecordReader( 75 | options=tf.python_io.TFRecordOptions( 76 | compression_type=tf.python_io.TFRecordCompressionType.GZIP 77 | ) 78 | ) 79 | 80 | 81 | def _fill_in_missing(x): 82 | """Replace missing values in a SparseTensor. 83 | 84 | Fills in missing values of `x` with '' or 0, and converts to a dense tensor. 85 | 86 | Args: 87 | x: A `SparseTensor` of rank 2. Its dense shape should have size at most 1 88 | in the second dimension. 89 | 90 | Returns: 91 | A rank 1 tensor where missing values of `x` have been filled in. 92 | """ 93 | default_value = "" if x.dtype == tf.string else 0 94 | return tf.squeeze( 95 | tf.sparse.to_dense( 96 | tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]), default_value 97 | ), 98 | axis=1, 99 | ) 100 | 101 | 102 | def preprocessing_fn(inputs): 103 | """tf.transform's callback function for preprocessing inputs. 104 | 105 | Args: 106 | inputs: map from feature keys to raw not-yet-transformed features. 107 | 108 | Returns: 109 | Map from string feature key to transformed feature operations. 110 | """ 111 | outputs = {} 112 | for key in _DENSE_FLOAT_FEATURE_KEYS: 113 | # Preserve this feature as a dense float, setting nan's to the mean. 114 | outputs[_transformed_name(key)] = tft.scale_to_z_score( 115 | _fill_in_missing(inputs[key]) 116 | ) 117 | 118 | for key in _VOCAB_FEATURE_KEYS: 119 | # Build a vocabulary for this feature. 120 | outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary( 121 | _fill_in_missing(inputs[key]), top_k=_VOCAB_SIZE, num_oov_buckets=_OOV_SIZE 122 | ) 123 | 124 | for key in _BUCKET_FEATURE_KEYS: 125 | outputs[_transformed_name(key)] = tft.bucketize( 126 | _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT 127 | ) 128 | 129 | for key in _CATEGORICAL_FEATURE_KEYS: 130 | outputs[_transformed_name(key)] = _fill_in_missing(inputs[key]) 131 | 132 | # Was this passenger a big tipper? 133 | taxi_fare = _fill_in_missing(inputs[_FARE_KEY]) 134 | tips = _fill_in_missing(inputs[_LABEL_KEY]) 135 | outputs[_transformed_name(_LABEL_KEY)] = tf.where( 136 | tf.is_nan(taxi_fare), 137 | tf.cast(tf.zeros_like(taxi_fare), tf.int64), 138 | # Test if the tip was > 20% of the fare. 139 | tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64), 140 | ) 141 | 142 | return outputs 143 | 144 | 145 | def _build_estimator(config, hidden_units=None, warm_start_from=None): 146 | """Build an estimator for predicting the tipping behavior of taxi riders. 147 | 148 | Args: 149 | config: tf.contrib.learn.RunConfig defining the runtime environment for the 150 | estimator (including model_dir). 151 | hidden_units: [int], the layer sizes of the DNN (input layer first) 152 | warm_start_from: Optional directory to warm start from. 153 | 154 | Returns: 155 | A dict of the following: 156 | - estimator: The estimator that will be used for training and eval. 157 | - train_spec: Spec for training. 158 | - eval_spec: Spec for eval. 
159 | - eval_input_receiver_fn: Input function for eval. 160 | """ 161 | real_valued_columns = [ 162 | tf.feature_column.numeric_column(key, shape=()) 163 | for key in _transformed_names(_DENSE_FLOAT_FEATURE_KEYS) 164 | ] 165 | categorical_columns = [ 166 | tf.feature_column.categorical_column_with_identity( 167 | key, num_buckets=_VOCAB_SIZE + _OOV_SIZE, default_value=0 168 | ) 169 | for key in _transformed_names(_VOCAB_FEATURE_KEYS) 170 | ] 171 | categorical_columns += [ 172 | tf.feature_column.categorical_column_with_identity( 173 | key, num_buckets=_FEATURE_BUCKET_COUNT, default_value=0 174 | ) 175 | for key in _transformed_names(_BUCKET_FEATURE_KEYS) 176 | ] 177 | categorical_columns += [ 178 | tf.feature_column.categorical_column_with_identity( # pylint: disable=g-complex-comprehension 179 | key, num_buckets=num_buckets, default_value=0 180 | ) 181 | for key, num_buckets in zip( 182 | _transformed_names(_CATEGORICAL_FEATURE_KEYS), 183 | _MAX_CATEGORICAL_FEATURE_VALUES, 184 | ) 185 | ] 186 | return tf.estimator.DNNLinearCombinedClassifier( 187 | config=config, 188 | linear_feature_columns=categorical_columns, 189 | dnn_feature_columns=real_valued_columns, 190 | dnn_hidden_units=hidden_units or [100, 70, 50, 25], 191 | warm_start_from=warm_start_from, 192 | ) 193 | 194 | 195 | def _example_serving_receiver_fn(transform_output, schema): 196 | """Build the serving in inputs. 197 | 198 | Args: 199 | transform_output: directory in which the tf-transform model was written 200 | during the preprocessing step. 201 | schema: the schema of the input data. 202 | 203 | Returns: 204 | Tensorflow graph which parses examples, applying tf-transform to them. 205 | """ 206 | raw_feature_spec = _get_raw_feature_spec(schema) 207 | raw_feature_spec.pop(_LABEL_KEY) 208 | 209 | raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn( 210 | raw_feature_spec, default_batch_size=None 211 | ) 212 | serving_input_receiver = raw_input_fn() 213 | 214 | _, transformed_features = saved_transform_io.partially_apply_saved_transform( 215 | os.path.join(transform_output, transform_fn_io.TRANSFORM_FN_DIR), 216 | serving_input_receiver.features, 217 | ) 218 | 219 | return tf.estimator.export.ServingInputReceiver( 220 | transformed_features, serving_input_receiver.receiver_tensors 221 | ) 222 | 223 | 224 | def _eval_input_receiver_fn(transform_output, schema): 225 | """Build everything needed for the tf-model-analysis to run the model. 226 | 227 | Args: 228 | transform_output: directory in which the tf-transform model was written 229 | during the preprocessing step. 230 | schema: the schema of the input data. 231 | 232 | Returns: 233 | EvalInputReceiver function, which contains: 234 | - Tensorflow graph which parses raw untransformed features, applies the 235 | tf-transform preprocessing operators. 236 | - Set of raw, untransformed features. 237 | - Label against which predictions will be compared. 238 | """ 239 | # Notice that the inputs are raw features, not transformed features here. 240 | raw_feature_spec = _get_raw_feature_spec(schema) 241 | 242 | serialized_tf_example = tf.placeholder( 243 | dtype=tf.string, shape=[None], name="input_example_tensor" 244 | ) 245 | 246 | # Add a parse_example operator to the tensorflow graph, which will parse 247 | # raw, untransformed, tf examples. 248 | features = tf.parse_example(serialized_tf_example, raw_feature_spec) 249 | 250 | # Now that we have our raw examples, process them through the tf-transform 251 | # function computed during the preprocessing step. 
252 | _, transformed_features = saved_transform_io.partially_apply_saved_transform( 253 | os.path.join(transform_output, transform_fn_io.TRANSFORM_FN_DIR), features 254 | ) 255 | 256 | # The key name MUST be 'examples'. 257 | receiver_tensors = {"examples": serialized_tf_example} 258 | 259 | # NOTE: Model is driven by transformed features (since training works on the 260 | # materialized output of TFT, but slicing will happen on raw features. 261 | features.update(transformed_features) 262 | 263 | return tfma.export.EvalInputReceiver( 264 | features=features, 265 | receiver_tensors=receiver_tensors, 266 | labels=transformed_features[_transformed_name(_LABEL_KEY)], 267 | ) 268 | 269 | 270 | def _input_fn(filenames, transform_output, batch_size=200): 271 | """Generates features and labels for training or evaluation. 272 | 273 | Args: 274 | filenames: [str] list of CSV files to read data from. 275 | transform_output: directory in which the tf-transform model was written 276 | during the preprocessing step. 277 | batch_size: int First dimension size of the Tensors returned by input_fn 278 | 279 | Returns: 280 | A (features, indices) tuple where features is a dictionary of 281 | Tensors, and indices is a single Tensor of label indices. 282 | """ 283 | metadata_dir = os.path.join( 284 | transform_output, transform_fn_io.TRANSFORMED_METADATA_DIR 285 | ) 286 | transformed_metadata = metadata_io.read_metadata(metadata_dir) 287 | transformed_feature_spec = transformed_metadata.schema.as_feature_spec() 288 | 289 | transformed_features = tf.contrib.learn.io.read_batch_features( 290 | filenames, batch_size, transformed_feature_spec, reader=_gzip_reader_fn 291 | ) 292 | 293 | # We pop the label because we do not want to use it as a feature while we're 294 | # training. 295 | return transformed_features, transformed_features.pop(_transformed_name(_LABEL_KEY)) 296 | 297 | 298 | # TFX will call this function 299 | def trainer_fn(hparams, schema): 300 | """Build the estimator using the high level API. 301 | 302 | Args: 303 | hparams: Holds hyperparameters used to train the model as name/value pairs. 304 | schema: Holds the schema of the training examples. 305 | 306 | Returns: 307 | A dict of the following: 308 | - estimator: The estimator that will be used for training and eval. 309 | - train_spec: Spec for training. 310 | - eval_spec: Spec for eval. 311 | - eval_input_receiver_fn: Input function for eval. 
312 | """ 313 | # Number of nodes in the first layer of the DNN 314 | first_dnn_layer_size = 100 315 | num_dnn_layers = 4 316 | dnn_decay_factor = 0.7 317 | 318 | train_batch_size = 40 319 | eval_batch_size = 40 320 | 321 | train_input_fn = lambda: _input_fn( # pylint: disable=g-long-lambda 322 | hparams.train_files, hparams.transform_output, batch_size=train_batch_size 323 | ) 324 | 325 | eval_input_fn = lambda: _input_fn( # pylint: disable=g-long-lambda 326 | hparams.eval_files, hparams.transform_output, batch_size=eval_batch_size 327 | ) 328 | 329 | train_spec = tf.estimator.TrainSpec( # pylint: disable=g-long-lambda 330 | train_input_fn, max_steps=hparams.train_steps 331 | ) 332 | 333 | serving_receiver_fn = lambda: _example_serving_receiver_fn( # pylint: disable=g-long-lambda 334 | hparams.transform_output, schema 335 | ) 336 | 337 | exporter = tf.estimator.FinalExporter("chicago-taxi", serving_receiver_fn) 338 | eval_spec = tf.estimator.EvalSpec( 339 | eval_input_fn, 340 | steps=hparams.eval_steps, 341 | exporters=[exporter], 342 | name="chicago-taxi-eval", 343 | ) 344 | 345 | run_config = tf.estimator.RunConfig( 346 | save_checkpoints_steps=999, keep_checkpoint_max=1 347 | ) 348 | 349 | run_config = run_config.replace(model_dir=hparams.serving_model_dir) 350 | 351 | estimator = _build_estimator( 352 | # Construct layers sizes with exponetial decay 353 | hidden_units=[ 354 | max(2, int(first_dnn_layer_size * dnn_decay_factor ** i)) 355 | for i in range(num_dnn_layers) 356 | ], 357 | config=run_config, 358 | warm_start_from=hparams.warm_start_from, 359 | ) 360 | 361 | # Create an input receiver for TFMA processing 362 | receiver_fn = lambda: _eval_input_receiver_fn( # pylint: disable=g-long-lambda 363 | hparams.transform_output, schema 364 | ) 365 | 366 | return { 367 | "estimator": estimator, 368 | "train_spec": train_spec, 369 | "eval_spec": eval_spec, 370 | "eval_input_receiver_fn": receiver_fn, 371 | } 372 | -------------------------------------------------------------------------------- /extra_tfx_example/setup/reset_env.sh: -------------------------------------------------------------------------------- 1 | # Use this to completely nuke the pypi libraries that TFX requires 2 | # and start with a 'clean' environment. This will uninstall TF/TFX 3 | # libraries and airflow libraries. 4 | # 5 | # It will not delete the Airflow install itself. You'll want to delete 6 | # ~/airflow on your own. 
7 | # 8 | 9 | 10 | GREEN=$(tput setaf 2) 11 | NORMAL=$(tput sgr0) 12 | 13 | printf "${GREEN}Resetting TFX workshop${NORMAL}\n\n" 14 | 15 | pip uninstall tensorflow 16 | pip uninstall tfx 17 | pip uninstall tensorflow-model-analysis 18 | pip uninstall tensorflow-data-validation 19 | pip uninstall tensorflow-metadata 20 | pip uninstall tensorflow-transform 21 | pip uninstall apache-airflow 22 | 23 | printf "\n\n${GREEN}TFX workshop has been reset${NORMAL}\n" 24 | printf "${GREEN}Remember to delete ~/airflow${NORMAL}\n" 25 | 26 | -------------------------------------------------------------------------------- /extra_tfx_example/setup/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set up the environment for the tutorial 4 | 5 | 6 | GREEN=$(tput setaf 2) 7 | NORMAL=$(tput sgr0) 8 | 9 | printf "${GREEN}Installing TFX workshop${NORMAL}\n\n" 10 | 11 | printf "${GREEN}Refreshing setuptools to avoid _NamespacePath issues${NORMAL}\n" 12 | pip uninstall setuptools -y && pip install setuptools 13 | 14 | printf "${GREEN}Installing httplib2 for Beam compatibility${NORMAL}\n" 15 | pip install httplib2==0.12.0 16 | 17 | printf "${GREEN}Installing pendulum to avoid problem with tzlocal${NORMAL}\n" 18 | pip install pendulum==1.4.4 19 | 20 | # TODO: Use range or pin for pip installs. 21 | printf "${GREEN}Installing TensorFlow${NORMAL}\n" 22 | pip install tensorflow==1.14.0 23 | 24 | printf "${GREEN}Installing TFX${NORMAL}\n" 25 | pip install tfx==0.14.0rc1 26 | 27 | printf "${GREEN}Installing Google API Client${NORMAL}\n" 28 | pip install google-api-python-client 29 | 30 | printf "${GREEN}Installing required Jupyter version${NORMAL}\n" 31 | pip install ipykernel 32 | ipython kernel install --user --name=tfx 33 | pip install --upgrade notebook==5.7.8 34 | jupyter nbextension install --py --symlink --sys-prefix tensorflow_model_analysis 35 | jupyter nbextension enable --py --sys-prefix tensorflow_model_analysis 36 | 37 | printf "${GREEN}Installing packages used by the notebooks${NORMAL}\n" 38 | pip install matplotlib 39 | pip install papermill 40 | pip install pandas 41 | pip install networkx 42 | 43 | # # Docker images 44 | printf "${GREEN}Installing docker${NORMAL}\n" 45 | pip install docker 46 | 47 | # Airflow 48 | # Set this to avoid the GPL version; no functionality difference either way 49 | printf "${GREEN}Preparing environment for Airflow${NORMAL}\n" 50 | export SLUGIFY_USES_TEXT_UNIDECODE=yes 51 | printf "${GREEN}Installing Airflow${NORMAL}\n" 52 | 53 | # TODO(b/136777165): Remove pinned version of Flask and Werkzeug 54 | # after newer version of Airflow: see AIRFLOW-4900. 
55 | pip install apache-airflow==1.10.3 Flask==1.0.4 Werkzeug==0.14.1 56 | printf "${GREEN}Initializing Airflow database${NORMAL}\n" 57 | airflow initdb 58 | 59 | # Adjust configuration 60 | printf "${GREEN}Adjusting Airflow config${NORMAL}\n" 61 | sed -i'.orig' 's/dag_dir_list_interval = 300/dag_dir_list_interval = 1/g' ~/airflow/airflow.cfg 62 | sed -i'.orig' 's/job_heartbeat_sec = 5/job_heartbeat_sec = 1/g' ~/airflow/airflow.cfg 63 | sed -i'.orig' 's/scheduler_heartbeat_sec = 5/scheduler_heartbeat_sec = 1/g' ~/airflow/airflow.cfg 64 | sed -i'.orig' 's/dag_default_view = tree/dag_default_view = graph/g' ~/airflow/airflow.cfg 65 | # sed -i'.orig' 's/load_examples = True/load_examples = False/g' ~/airflow/airflow.cfg 66 | sed -i'.orig' 's/max_threads = 2/max_threads = 1/g' ~/airflow/airflow.cfg 67 | 68 | printf "${GREEN}Refreshing Airflow to pick up new config${NORMAL}\n" 69 | airflow resetdb --yes 70 | airflow initdb 71 | 72 | # Copy Dag to ~/airflow/dags 73 | mkdir -p ~/airflow/dags 74 | cp ./dags/taxi_pipeline.py ~/airflow/dags/ 75 | cp ./dags/taxi_utils.py ~/airflow/dags/ 76 | 77 | # Copy the simple pipeline example and adjust for user's environment 78 | cp ./chicago_data/taxi_pipeline_simple.py ~/airflow/dags/taxi_pipeline_solution.py 79 | cp ./chicago_data/taxi_utils.py ~/airflow/dags/taxi_utils_solution.py 80 | sed -i'.orig' 's/os.environ\["HOME"\], "taxi"/os.environ\["HOME"\], "airflow"/g' ~/airflow/dags/taxi_pipeline_solution.py 81 | sed -i'.orig' 's|_taxi_root, "data/simple"|_taxi_root, "data/taxi_data"|g' ~/airflow/dags/taxi_pipeline_solution.py 82 | sed -i'.orig' "s/taxi_utils.py/dags\/taxi_utils_solution.py/g" ~/airflow/dags/taxi_pipeline_solution.py 83 | sed -i'.orig' 's/os.environ\["HOME"\], "tfx"/_taxi_root, "tfx"/g' ~/airflow/dags/taxi_pipeline_solution.py 84 | sed -i'.orig' "s/chicago_taxi_simple/taxi_solution/g" ~/airflow/dags/taxi_pipeline_solution.py 85 | 86 | # Copy data to ~/airflow/data 87 | # TODO(): Combine Chicago Taxi data files 88 | cp -R data ~/airflow 89 | 90 | printf "\n${GREEN}TFX workshop installed${NORMAL}\n" 91 | 92 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | rows 3 | papermill 4 | jupyterlab 5 | apache-airflow 6 | --------------------------------------------------------------------------------