├── tests ├── __init__.py ├── test_duckdb.py └── test_jinja.py ├── dagster_jaffle_shop ├── assets │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ ├── seeds │ │ │ ├── __init__.py │ │ │ ├── raw_orders.py │ │ │ ├── raw_customers.py │ │ │ └── raw_payments.py │ │ ├── staging │ │ │ ├── __init__.py │ │ │ ├── stg_customers.py │ │ │ ├── stg_orders.py │ │ │ └── stg_payments.py │ │ ├── orders.py │ │ └── customers.py │ ├── system │ │ ├── __init__.py │ │ └── duckdb_db.py │ └── include │ │ ├── raw_customers.csv │ │ ├── raw_payments.csv │ │ └── raw_orders.csv ├── jobs │ ├── __init__.py │ ├── models.py │ └── profiling.py ├── __init__.py ├── utils.py ├── resources.py └── io_managers.py ├── img ├── job_run.png ├── asset_graph.png └── catalog_metadata.png ├── Dockerfile ├── pyproject.toml ├── Makefile ├── LICENSE ├── .dockerignore ├── .gitignore └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/jobs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/system/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/models/seeds/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/models/staging/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /img/job_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stkbailey/dagster-jaffle-shop/HEAD/img/job_run.png -------------------------------------------------------------------------------- /img/asset_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stkbailey/dagster-jaffle-shop/HEAD/img/asset_graph.png -------------------------------------------------------------------------------- /img/catalog_metadata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stkbailey/dagster-jaffle-shop/HEAD/img/catalog_metadata.png -------------------------------------------------------------------------------- /dagster_jaffle_shop/jobs/models.py: -------------------------------------------------------------------------------- 1 | from dagster import define_asset_job, load_assets_from_package_module 2 | 3 | from dagster_jaffle_shop import assets 4 | 5 | 6 | asset_list = [x.key.to_user_string() for x in load_assets_from_package_module(assets)] 7 | 8 | materialize_all_job = define_asset_job( 9 | name="materialize_all_assets_job", 10 | description="Materialize all assets in the project.", 11 | selection=asset_list, 12 | ) 13 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/__init__.py: -------------------------------------------------------------------------------- 1 | from dagster import repository, load_assets_from_package_module 2 | 3 | from dagster_jaffle_shop import assets 4 | from dagster_jaffle_shop.jobs.models import materialize_all_job 5 | from dagster_jaffle_shop.jobs.profiling import evaluate_duckdb_tables_job 6 | 7 | 8 | # load all of the assets from the assets folder into a single list 9 | asset_list = load_assets_from_package_module(assets) 10 | job_list = [materialize_all_job, evaluate_duckdb_tables_job] 11 | 12 | 13 | @repository 14 | def jaffle_shop_repo(): 15 | "Repo (like a folder) for Jaffle Shop assets, jobs, sensors, schedules." 16 | return asset_list + job_list 17 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim-buster 2 | 3 | # Update system dependencies 4 | RUN apt-get update && \ 5 | apt-get install -y build-essential openssh-client git gcc 6 | 7 | # Add a dagster.yaml in the working directory 8 | RUN touch /tmp/dagster.yaml 9 | 10 | # copy requirements to cache in docker layer 11 | COPY pyproject.toml poetry.lock ./ 12 | 13 | # Install dependencies via Poetry, for extra stability 14 | RUN pip install poetry==1.2.0 15 | RUN poetry config virtualenvs.create false \ 16 | && poetry install 17 | 18 | # copy all other system files over (ignoring those in .dockerignore) 19 | COPY . . 20 | 21 | CMD ["dagit", "-h", "0.0.0.0", "-p","3000", "-w", "./workspace.yaml"] -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/models/seeds/raw_orders.py: -------------------------------------------------------------------------------- 1 | from dagster import asset, OpExecutionContext 2 | 3 | from dagster_jaffle_shop.assets.system.duckdb_db import duckdb_db 4 | from dagster_jaffle_shop.utils import get_package_root_path 5 | from dagster_jaffle_shop.io_managers import duckdb_io_manager 6 | 7 | 8 | @asset( 9 | group_name="models", 10 | non_argument_deps={duckdb_db.key}, 11 | io_manager_def=duckdb_io_manager, 12 | ) 13 | def raw_orders() -> str: 14 | "Raw orders data loaded from a CSV." 15 | 16 | p = get_package_root_path() / "assets" / "include" / "raw_orders.csv" 17 | query = f"SELECT * FROM read_csv_auto('{p.as_posix()}')" 18 | 19 | return query 20 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "dagster-jaffle-shop" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Stephen Bailey "] 6 | readme = "README.md" 7 | packages = [{include = "dagster_jaffle_shop"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = ">=3.8,<3.11" 11 | dagster = "^1.0.12" 12 | dagit = "^1.0.12" 13 | Jinja2 = "^3.1.2" 14 | duckdb = "^0.5.1" 15 | pandas = "^1.5.0" 16 | pandas-profiling = "^3.4.0" 17 | numpy = "^1.20.0" 18 | 19 | 20 | [tool.poetry.group.dev.dependencies] 21 | pytest = "^7.1.3" 22 | black = "^22.10.0" 23 | ipython = "^8.5.0" 24 | notebook = "^6.5.1" 25 | 26 | [build-system] 27 | requires = ["poetry-core"] 28 | build-backend = "poetry.core.masonry.api" 29 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/models/seeds/raw_customers.py: -------------------------------------------------------------------------------- 1 | from dagster import asset, OpExecutionContext 2 | 3 | from dagster_jaffle_shop.assets.system.duckdb_db import duckdb_db 4 | from dagster_jaffle_shop.utils import get_package_root_path 5 | from dagster_jaffle_shop.io_managers import duckdb_io_manager 6 | 7 | 8 | @asset( 9 | group_name="models", 10 | non_argument_deps={duckdb_db.key}, 11 | io_manager_def=duckdb_io_manager, 12 | ) 13 | def raw_customers() -> str: 14 | "Raw customer data loaded from a CSV." 15 | 16 | p = get_package_root_path() / "assets" / "include" / "raw_customers.csv" 17 | query = f"SELECT * FROM read_csv_auto('{p.as_posix()}')" 18 | 19 | return query 20 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/models/seeds/raw_payments.py: -------------------------------------------------------------------------------- 1 | from dagster import asset, OpExecutionContext 2 | 3 | from dagster_jaffle_shop.assets.system.duckdb_db import duckdb_db 4 | from dagster_jaffle_shop.utils import get_package_root_path 5 | from dagster_jaffle_shop.io_managers import duckdb_io_manager 6 | 7 | 8 | @asset( 9 | group_name="models", 10 | non_argument_deps={duckdb_db.key}, 11 | io_manager_def=duckdb_io_manager, 12 | ) 13 | def raw_payments() -> str: 14 | "Raw payments data loaded from a CSV." 15 | 16 | p = get_package_root_path() / "assets" / "include" / "raw_payments.csv" 17 | query = f"SELECT * FROM read_csv_auto('{p.as_posix()}')" 18 | 19 | return query 20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: docker-build 2 | 3 | # Sets a default image to be built/run 4 | # Override this by running: make IMAGE=foo 5 | PACKAGE_NAME := dagster_jaffle_shop 6 | IMAGE_NAME := dagster-jaffle-shop 7 | 8 | clean: 9 | find $(PACKAGE_NAME) -type d -name '__pycache__' -exec rm -rf {} \; 10 | 11 | dagit: setup 12 | dagit --package-name $(PACKAGE_NAME) 13 | 14 | materialize_job: setup 15 | dagster job execute --package-name $(PACKAGE_NAME) -j materialize_all_assets_job 16 | 17 | evaluate_job: setup 18 | dagster job execute --package-name $(PACKAGE_NAME) -j evaluate_duckdb_tables_job 19 | 20 | format: 21 | black . 22 | 23 | setup: 24 | touch /tmp/dagster.yaml 25 | 26 | test: 27 | pytest tests 28 | 29 | docker-build: 30 | docker build --platform linux/amd64 -t $(IMAGE_NAME) . 31 | 32 | docker-dagit: 33 | docker run $(IMAGE_NAME) dagit --package-name $(PACKAGE_NAME) -p 3000:3000 -h 0.0.0.0 34 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/utils.py: -------------------------------------------------------------------------------- 1 | import jinja2 2 | import pathlib 3 | import warnings 4 | 5 | from dagster import ExperimentalWarning 6 | 7 | # mute dagster warnings 8 | warnings.filterwarnings("ignore", category=ExperimentalWarning) 9 | 10 | 11 | def resolve_duckdb_ref(name): 12 | """ 13 | Function that will be used in `render_jinja_template` to pass to the 14 | `ref` function that is common in dbt-formatted queries. In DuckDB, there 15 | are no database or schema designations, so we simply pass through the name. 16 | """ 17 | return name 18 | 19 | 20 | def render_jinja_template(q: str) -> str: 21 | "Takes a dbt-jinja-formatted query and resolves functions." 22 | template = jinja2.Template(q) 23 | result = template.render(ref=resolve_duckdb_ref) 24 | return result 25 | 26 | 27 | def get_package_root_path() -> pathlib.Path: 28 | "Get the path of the package" 29 | return pathlib.Path(__file__).parent.absolute() 30 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/models/staging/stg_customers.py: -------------------------------------------------------------------------------- 1 | from dagster import asset 2 | 3 | from dagster_jaffle_shop.io_managers import duckdb_io_manager 4 | from dagster_jaffle_shop.resources import DuckDBAssetMetadata 5 | 6 | 7 | @asset(group_name="models", io_manager_def=duckdb_io_manager) 8 | def stg_customers(raw_customers: DuckDBAssetMetadata) -> str: 9 | "An intermediate staging table for customers" 10 | 11 | jinja_query = """ 12 | with source as ( 13 | 14 | {#- 15 | Normally we would select from the table here, but we are using seeds to load 16 | our data in this project 17 | #} 18 | select * from {{ ref('raw_customers') }} 19 | 20 | ), 21 | 22 | renamed as ( 23 | 24 | select 25 | id as customer_id, 26 | first_name, 27 | last_name 28 | 29 | from source 30 | 31 | ) 32 | 33 | select * from renamed 34 | 35 | """ 36 | return jinja_query 37 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/models/staging/stg_orders.py: -------------------------------------------------------------------------------- 1 | from dagster import asset 2 | 3 | from dagster_jaffle_shop.io_managers import duckdb_io_manager 4 | from dagster_jaffle_shop.resources import DuckDBAssetMetadata 5 | 6 | 7 | @asset(group_name="models", io_manager_def=duckdb_io_manager) 8 | def stg_orders(raw_orders: DuckDBAssetMetadata) -> str: 9 | "An intermediate staging table for orders" 10 | 11 | jinja_query = """ 12 | with source as ( 13 | 14 | {#- 15 | Normally we would select from the table here, but we are using seeds to load 16 | our data in this project 17 | #} 18 | select * from {{ ref('raw_orders') }} 19 | 20 | ), 21 | 22 | renamed as ( 23 | 24 | select 25 | id as order_id, 26 | user_id as customer_id, 27 | order_date, 28 | status 29 | 30 | from source 31 | 32 | ) 33 | 34 | select * from renamed 35 | """ 36 | return jinja_query 37 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/models/staging/stg_payments.py: -------------------------------------------------------------------------------- 1 | from dagster import asset 2 | 3 | from dagster_jaffle_shop.io_managers import duckdb_io_manager 4 | from dagster_jaffle_shop.resources import DuckDBAssetMetadata 5 | 6 | 7 | @asset(group_name="models", io_manager_def=duckdb_io_manager) 8 | def stg_payments(raw_payments: DuckDBAssetMetadata) -> str: 9 | "An intermediate staging table for payments" 10 | 11 | jinja_query = """ 12 | with source as ( 13 | 14 | {#- 15 | Normally we would select from the table here, but we are using seeds to load 16 | our data in this project 17 | #} 18 | select * from {{ ref('raw_payments') }} 19 | 20 | ), 21 | 22 | renamed as ( 23 | 24 | select 25 | id as payment_id, 26 | order_id, 27 | payment_method, 28 | 29 | -- `amount` is currently stored in cents, so we convert it to dollars 30 | amount / 100 as amount 31 | 32 | from source 33 | 34 | ) 35 | 36 | select * from renamed 37 | """ 38 | 39 | return jinja_query 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Stephen Bailey 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /tests/test_duckdb.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | import pytest 3 | 4 | DATA_CSV = """\ 5 | id,value 6 | 1,101 7 | 2,202 8 | 3,303""" 9 | 10 | 11 | @pytest.fixture 12 | def duckdb_cursor(tmp_path): 13 | # create duckdb database and csv 14 | cursor = duckdb.connect() 15 | f = tmp_path / "foo.csv" 16 | f.write_text(DATA_CSV) 17 | 18 | # load table 19 | query = f""" 20 | CREATE TABLE new_tbl AS ( 21 | SELECT * FROM read_csv_auto('{f.as_posix()}') 22 | );""" 23 | cursor.execute(query).fetchdf() 24 | 25 | yield cursor 26 | 27 | 28 | def test_duckdb_create(tmp_path): 29 | # given 30 | cursor = duckdb.connect() 31 | f = tmp_path / "foo.csv" 32 | f.write_text(DATA_CSV) 33 | 34 | # when 35 | QUERY = f""" 36 | CREATE TABLE new_tbl AS ( 37 | SELECT * FROM read_csv_auto('{f.as_posix()}') 38 | );""" 39 | df = cursor.execute(QUERY).fetchdf() 40 | 41 | # then 42 | assert df.iloc[0]["Count"] == 3 43 | 44 | 45 | def test_duckdb_select(duckdb_cursor): 46 | # given 47 | QUERY = "select * from new_tbl;" 48 | 49 | # when 50 | df = duckdb_cursor.execute(QUERY).fetchdf() 51 | 52 | # then 53 | print(df) 54 | assert df.shape == (3, 2) 55 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/system/duckdb_db.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | import pathlib 3 | 4 | from dagster import asset, OpExecutionContext 5 | 6 | from dagster_jaffle_shop.resources import duckdb_resource 7 | 8 | 9 | @asset( 10 | group_name="system", 11 | required_resource_keys={"duckdb"}, 12 | resource_defs={"duckdb": duckdb_resource}, 13 | ) 14 | def duckdb_db(context: OpExecutionContext) -> dict: 15 | """ 16 | Creates the local DuckDB database file and directory path. 17 | Not required since the `io_manager` will create these, but included 18 | as an example and a way to log database-level metadata. 19 | """ 20 | 21 | db_file = pathlib.Path(context.resources.duckdb.database) 22 | if not db_file.parent.exists(): 23 | db_file.parent.mkdir(parents=True) 24 | 25 | if not db_file.exists(): 26 | context.log.info("Database file does not exist. Creating...") 27 | con = duckdb.connect(database=db_file.as_posix()) 28 | con.close() 29 | 30 | # list the tables currently available in the database 31 | with duckdb.connect(database=db_file.as_posix()) as con: 32 | table_df = con.execute("select * from pg_tables").fetchdf() 33 | table_list = ", ".join(table_df["tablename"].values.tolist()) 34 | context.add_output_metadata( 35 | {"tables": table_list, "count_tables": table_df.shape[0]} 36 | ) 37 | 38 | return {"path": db_file.as_posix()} 39 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/resources.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | import pandas 3 | import pathlib 4 | 5 | from dagster import resource, get_dagster_logger, InitResourceContext 6 | 7 | DUCKDB_FILE = "/tmp/duckdb/dagster.duckdb" 8 | logger = get_dagster_logger() 9 | 10 | 11 | class DuckDBResource: 12 | """ 13 | Utility class for executing queries against a DuckDB database. 14 | This is mainly useful so that we know connections are made within 15 | the operations actually executed. 16 | """ 17 | 18 | def __init__(self, database: str): 19 | self.database = database 20 | 21 | def execute_query(self, query: str, read_only=True) -> pandas.DataFrame: 22 | with duckdb.connect(database=self.database, read_only=read_only) as conn: 23 | logger.info("Executing query: %s", query) 24 | df = conn.execute(query).fetch_df() 25 | 26 | if not "df" in locals(): 27 | raise Exception("There was an error with the query!") 28 | 29 | return df 30 | 31 | 32 | class DuckDBAssetMetadata: 33 | "This metadata class provides a structured way to yield asset contents." 34 | 35 | def __init__(self, table_name: str): 36 | self.table_name = table_name 37 | 38 | 39 | @resource 40 | def duckdb_resource(context: InitResourceContext) -> duckdb.DuckDBPyConnection: 41 | "The DuckDB resource has helper functions to work with a local database." 42 | 43 | db_file = pathlib.Path(DUCKDB_FILE) 44 | yield DuckDBResource(database=db_file.as_posix()) 45 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Git 2 | .git 3 | .gitignore 4 | 5 | # CI 6 | .codeclimate.yml 7 | .travis.yml 8 | .taskcluster.yml 9 | 10 | # Docker 11 | docker-compose.yml 12 | .docker 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | */__pycache__/ 17 | */*/__pycache__/ 18 | */*/*/__pycache__/ 19 | *.py[cod] 20 | */*.py[cod] 21 | */*/*.py[cod] 22 | */*/*/*.py[cod] 23 | 24 | # C extensions 25 | *.so 26 | 27 | # Distribution / packaging 28 | .Python 29 | env/ 30 | build/ 31 | develop-eggs/ 32 | dist/ 33 | downloads/ 34 | eggs/ 35 | lib/ 36 | lib64/ 37 | parts/ 38 | sdist/ 39 | var/ 40 | *.egg-info/ 41 | .installed.cfg 42 | *.egg 43 | 44 | # PyInstaller 45 | # Usually these files are written by a python script from a template 46 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 47 | *.manifest 48 | *.spec 49 | 50 | # Installer logs 51 | pip-log.txt 52 | pip-delete-this-directory.txt 53 | 54 | # Unit test / coverage reports 55 | htmlcov/ 56 | .tox/ 57 | .coverage 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Virtual environment 76 | .env/ 77 | .venv/ 78 | venv/ 79 | 80 | # PyCharm 81 | .idea 82 | 83 | # Python mode for VIM 84 | .ropeproject 85 | */.ropeproject 86 | */*/.ropeproject 87 | */*/*/.ropeproject 88 | 89 | # Vim swap files 90 | *.swp 91 | */*.swp 92 | */*/*.swp 93 | */*/*/*.swp 94 | 95 | # Meltano 96 | .meltano -------------------------------------------------------------------------------- /tests/test_jinja.py: -------------------------------------------------------------------------------- 1 | import jinja2 2 | 3 | 4 | QUERY = """ 5 | {% set payment_methods = ['credit_card', 'coupon', 'bank_transfer', 'gift_card'] %} 6 | 7 | with orders as ( 8 | 9 | select * from {{ ref('stg_orders') }} 10 | 11 | ), 12 | 13 | payments as ( 14 | 15 | select * from {{ ref('stg_payments') }} 16 | 17 | ), 18 | 19 | order_payments as ( 20 | 21 | select 22 | order_id, 23 | 24 | {% for payment_method in payment_methods -%} 25 | sum(case when payment_method = '{{ payment_method }}' then amount else 0 end) as {{ payment_method }}_amount, 26 | {% endfor -%} 27 | 28 | sum(amount) as total_amount 29 | 30 | from payments 31 | 32 | group by order_id 33 | 34 | ), 35 | 36 | final as ( 37 | 38 | select 39 | orders.order_id, 40 | orders.customer_id, 41 | orders.order_date, 42 | orders.status, 43 | 44 | {% for payment_method in payment_methods -%} 45 | 46 | order_payments.{{ payment_method }}_amount, 47 | 48 | {% endfor -%} 49 | 50 | order_payments.total_amount as amount 51 | 52 | from orders 53 | 54 | 55 | left join order_payments 56 | on orders.order_id = order_payments.order_id 57 | 58 | ) 59 | 60 | select * from final 61 | 62 | """ 63 | 64 | 65 | def test_jinja_interpolation(): 66 | def resolve(s): 67 | return s 68 | 69 | template = jinja2.Template(QUERY) 70 | result = template.render(ref=resolve) 71 | 72 | print(result) 73 | 74 | assert "{" not in result 75 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/models/orders.py: -------------------------------------------------------------------------------- 1 | from dagster import asset 2 | 3 | from dagster_jaffle_shop.io_managers import duckdb_io_manager 4 | from dagster_jaffle_shop.resources import DuckDBAssetMetadata 5 | 6 | 7 | @asset(group_name="models", io_manager_def=duckdb_io_manager) 8 | def orders(stg_orders: DuckDBAssetMetadata, stg_payments: DuckDBAssetMetadata) -> str: 9 | "This table has basic information about orders, as well as some derived facts based on payments" 10 | 11 | jinja_query = """ 12 | {% set payment_methods = ['credit_card', 'coupon', 'bank_transfer', 'gift_card'] %} 13 | 14 | with orders as ( 15 | 16 | select * from {{ ref('stg_orders') }} 17 | 18 | ), 19 | 20 | payments as ( 21 | 22 | select * from {{ ref('stg_payments') }} 23 | 24 | ), 25 | 26 | order_payments as ( 27 | 28 | select 29 | order_id, 30 | 31 | {% for payment_method in payment_methods -%} 32 | sum(case when payment_method = '{{ payment_method }}' then amount else 0 end) as {{ payment_method }}_amount, 33 | {% endfor -%} 34 | 35 | sum(amount) as total_amount 36 | 37 | from payments 38 | 39 | group by order_id 40 | 41 | ), 42 | 43 | final as ( 44 | 45 | select 46 | orders.order_id, 47 | orders.customer_id, 48 | orders.order_date, 49 | orders.status, 50 | 51 | {% for payment_method in payment_methods -%} 52 | 53 | order_payments.{{ payment_method }}_amount, 54 | 55 | {% endfor -%} 56 | 57 | order_payments.total_amount as amount 58 | 59 | from orders 60 | 61 | 62 | left join order_payments 63 | on orders.order_id = order_payments.order_id 64 | 65 | ) 66 | 67 | select * from final 68 | """ 69 | 70 | return jinja_query 71 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/include/raw_customers.csv: -------------------------------------------------------------------------------- 1 | id,first_name,last_name 2 | 1,Michael,P. 3 | 2,Shawn,M. 4 | 3,Kathleen,P. 5 | 4,Jimmy,C. 6 | 5,Katherine,R. 7 | 6,Sarah,R. 8 | 7,Martin,M. 9 | 8,Frank,R. 10 | 9,Jennifer,F. 11 | 10,Henry,W. 12 | 11,Fred,S. 13 | 12,Amy,D. 14 | 13,Kathleen,M. 15 | 14,Steve,F. 16 | 15,Teresa,H. 17 | 16,Amanda,H. 18 | 17,Kimberly,R. 19 | 18,Johnny,K. 20 | 19,Virginia,F. 21 | 20,Anna,A. 22 | 21,Willie,H. 23 | 22,Sean,H. 24 | 23,Mildred,A. 25 | 24,David,G. 26 | 25,Victor,H. 27 | 26,Aaron,R. 28 | 27,Benjamin,B. 29 | 28,Lisa,W. 30 | 29,Benjamin,K. 31 | 30,Christina,W. 32 | 31,Jane,G. 33 | 32,Thomas,O. 34 | 33,Katherine,M. 35 | 34,Jennifer,S. 36 | 35,Sara,T. 37 | 36,Harold,O. 38 | 37,Shirley,J. 39 | 38,Dennis,J. 40 | 39,Louise,W. 41 | 40,Maria,A. 42 | 41,Gloria,C. 43 | 42,Diana,S. 44 | 43,Kelly,N. 45 | 44,Jane,R. 46 | 45,Scott,B. 47 | 46,Norma,C. 48 | 47,Marie,P. 49 | 48,Lillian,C. 50 | 49,Judy,N. 51 | 50,Billy,L. 52 | 51,Howard,R. 53 | 52,Laura,F. 54 | 53,Anne,B. 55 | 54,Rose,M. 56 | 55,Nicholas,R. 57 | 56,Joshua,K. 58 | 57,Paul,W. 59 | 58,Kathryn,K. 60 | 59,Adam,A. 61 | 60,Norma,W. 62 | 61,Timothy,R. 63 | 62,Elizabeth,P. 64 | 63,Edward,G. 65 | 64,David,C. 66 | 65,Brenda,W. 67 | 66,Adam,W. 68 | 67,Michael,H. 69 | 68,Jesse,E. 70 | 69,Janet,P. 71 | 70,Helen,F. 72 | 71,Gerald,C. 73 | 72,Kathryn,O. 74 | 73,Alan,B. 75 | 74,Harry,A. 76 | 75,Andrea,H. 77 | 76,Barbara,W. 78 | 77,Anne,W. 79 | 78,Harry,H. 80 | 79,Jack,R. 81 | 80,Phillip,H. 82 | 81,Shirley,H. 83 | 82,Arthur,D. 84 | 83,Virginia,R. 85 | 84,Christina,R. 86 | 85,Theresa,M. 87 | 86,Jason,C. 88 | 87,Phillip,B. 89 | 88,Adam,T. 90 | 89,Margaret,J. 91 | 90,Paul,P. 92 | 91,Todd,W. 93 | 92,Willie,O. 94 | 93,Frances,R. 95 | 94,Gregory,H. 96 | 95,Lisa,P. 97 | 96,Jacqueline,A. 98 | 97,Shirley,D. 99 | 98,Nicole,M. 100 | 99,Mary,G. 101 | 100,Jean,M. 102 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/models/customers.py: -------------------------------------------------------------------------------- 1 | from dagster import asset 2 | 3 | from dagster_jaffle_shop.io_managers import duckdb_io_manager 4 | from dagster_jaffle_shop.resources import DuckDBAssetMetadata 5 | 6 | 7 | @asset(group_name="models", io_manager_def=duckdb_io_manager) 8 | def customers( 9 | stg_customers: DuckDBAssetMetadata, 10 | stg_orders: DuckDBAssetMetadata, 11 | stg_payments: DuckDBAssetMetadata, 12 | ) -> str: 13 | "This table has basic information about a customer, as well as some derived facts based on a customer's orders" 14 | 15 | jinja_query = """ 16 | with customers as ( 17 | 18 | select * from {{ ref('stg_customers') }} 19 | 20 | ), 21 | 22 | orders as ( 23 | 24 | select * from {{ ref('stg_orders') }} 25 | 26 | ), 27 | 28 | payments as ( 29 | 30 | select * from {{ ref('stg_payments') }} 31 | 32 | ), 33 | 34 | customer_orders as ( 35 | 36 | select 37 | customer_id, 38 | 39 | min(order_date) as first_order, 40 | max(order_date) as most_recent_order, 41 | count(order_id) as number_of_orders 42 | from orders 43 | 44 | group by customer_id 45 | 46 | ), 47 | 48 | customer_payments as ( 49 | 50 | select 51 | orders.customer_id, 52 | sum(amount) as total_amount 53 | 54 | from payments 55 | 56 | left join orders on 57 | payments.order_id = orders.order_id 58 | 59 | group by orders.customer_id 60 | 61 | ), 62 | 63 | final as ( 64 | 65 | select 66 | customers.customer_id, 67 | customers.first_name, 68 | customers.last_name, 69 | customer_orders.first_order, 70 | customer_orders.most_recent_order, 71 | customer_orders.number_of_orders, 72 | customer_payments.total_amount as customer_lifetime_value 73 | 74 | from customers 75 | 76 | left join customer_orders 77 | on customers.customer_id = customer_orders.customer_id 78 | 79 | left join customer_payments 80 | on customers.customer_id = customer_payments.customer_id 81 | 82 | ) 83 | 84 | select * from final 85 | """ 86 | 87 | return jinja_query 88 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | dev/ -------------------------------------------------------------------------------- /dagster_jaffle_shop/io_managers.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | import time 3 | 4 | from dagster import IOManager, io_manager, OutputContext, InputContext 5 | 6 | from dagster_jaffle_shop.resources import ( 7 | DUCKDB_FILE, 8 | DuckDBResource, 9 | DuckDBAssetMetadata, 10 | ) 11 | from dagster_jaffle_shop.utils import render_jinja_template 12 | 13 | 14 | class DuckDBIOManager(IOManager): 15 | "The DuckDB IO Manager takes query text and creates tables based on it." 16 | 17 | def __init__(self, db_file: str): 18 | self.db_file = db_file 19 | self.resource = DuckDBResource(db_file) 20 | 21 | def handle_output(self, context: OutputContext, obj: str): 22 | """ 23 | The handle_output function takes the asset output and does something 24 | with it. In this case, we are going to take a Jinja SQL string, render 25 | it, and then write it into the DuckDB database. 26 | """ 27 | # name is the name given to the Out that we're storing for 28 | table_name = context.asset_key.to_user_string() 29 | rendered_query = render_jinja_template(obj) 30 | query = f"CREATE OR REPLACE TABLE {table_name} AS {rendered_query}" 31 | 32 | # we try multiple times to insert the data 33 | max_attempts = 5 34 | counter = 0 35 | while counter < max_attempts: 36 | try: 37 | result_df = self.resource.execute_query(query, read_only=False) 38 | counter = max_attempts 39 | except duckdb.IOException: 40 | counter += 1 41 | time.sleep(1) 42 | 43 | if "result_df" not in vars(): 44 | raise Exception("Query was unsuccessful.") 45 | 46 | context.add_output_metadata( 47 | { 48 | "query": rendered_query, 49 | "table_name": table_name, 50 | "count_records": int(result_df.iloc[0]["Count"]), 51 | } 52 | ) 53 | 54 | def load_input(self, context: InputContext) -> DuckDBAssetMetadata: 55 | """ 56 | The load_input function determines how this asset will be loaded by any 57 | assets that reference it. In our case, we are not loading a dataframe 58 | directly, we are loading the table name, which can then be referenced 59 | in the SQL query. 60 | """ 61 | # upstream_output.name is the name given to the Out that we're loading for 62 | return DuckDBAssetMetadata(table_name=context.upstream_output.name) 63 | 64 | 65 | @io_manager 66 | def duckdb_io_manager(_): 67 | yield DuckDBIOManager(DUCKDB_FILE) 68 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/include/raw_payments.csv: -------------------------------------------------------------------------------- 1 | id,order_id,payment_method,amount 2 | 1,1,credit_card,1000 3 | 2,2,credit_card,2000 4 | 3,3,coupon,100 5 | 4,4,coupon,2500 6 | 5,5,bank_transfer,1700 7 | 6,6,credit_card,600 8 | 7,7,credit_card,1600 9 | 8,8,credit_card,2300 10 | 9,9,gift_card,2300 11 | 10,9,bank_transfer,0 12 | 11,10,bank_transfer,2600 13 | 12,11,credit_card,2700 14 | 13,12,credit_card,100 15 | 14,13,credit_card,500 16 | 15,13,bank_transfer,1400 17 | 16,14,bank_transfer,300 18 | 17,15,coupon,2200 19 | 18,16,credit_card,1000 20 | 19,17,bank_transfer,200 21 | 20,18,credit_card,500 22 | 21,18,credit_card,800 23 | 22,19,gift_card,600 24 | 23,20,bank_transfer,1500 25 | 24,21,credit_card,1200 26 | 25,22,bank_transfer,800 27 | 26,23,gift_card,2300 28 | 27,24,coupon,2600 29 | 28,25,bank_transfer,2000 30 | 29,25,credit_card,2200 31 | 30,25,coupon,1600 32 | 31,26,credit_card,3000 33 | 32,27,credit_card,2300 34 | 33,28,bank_transfer,1900 35 | 34,29,bank_transfer,1200 36 | 35,30,credit_card,1300 37 | 36,31,credit_card,1200 38 | 37,32,credit_card,300 39 | 38,33,credit_card,2200 40 | 39,34,bank_transfer,1500 41 | 40,35,credit_card,2900 42 | 41,36,bank_transfer,900 43 | 42,37,credit_card,2300 44 | 43,38,credit_card,1500 45 | 44,39,bank_transfer,800 46 | 45,40,credit_card,1400 47 | 46,41,credit_card,1700 48 | 47,42,coupon,1700 49 | 48,43,gift_card,1800 50 | 49,44,gift_card,1100 51 | 50,45,bank_transfer,500 52 | 51,46,bank_transfer,800 53 | 52,47,credit_card,2200 54 | 53,48,bank_transfer,300 55 | 54,49,credit_card,600 56 | 55,49,credit_card,900 57 | 56,50,credit_card,2600 58 | 57,51,credit_card,2900 59 | 58,51,credit_card,100 60 | 59,52,bank_transfer,1500 61 | 60,53,credit_card,300 62 | 61,54,credit_card,1800 63 | 62,54,bank_transfer,1100 64 | 63,55,credit_card,2900 65 | 64,56,credit_card,400 66 | 65,57,bank_transfer,200 67 | 66,58,coupon,1800 68 | 67,58,gift_card,600 69 | 68,59,gift_card,2800 70 | 69,60,credit_card,400 71 | 70,61,bank_transfer,1600 72 | 71,62,gift_card,1400 73 | 72,63,credit_card,2900 74 | 73,64,bank_transfer,2600 75 | 74,65,credit_card,0 76 | 75,66,credit_card,2800 77 | 76,67,bank_transfer,400 78 | 77,67,credit_card,1900 79 | 78,68,credit_card,1600 80 | 79,69,credit_card,1900 81 | 80,70,credit_card,2600 82 | 81,71,credit_card,500 83 | 82,72,credit_card,2900 84 | 83,73,bank_transfer,300 85 | 84,74,credit_card,3000 86 | 85,75,credit_card,1900 87 | 86,76,coupon,200 88 | 87,77,credit_card,0 89 | 88,77,bank_transfer,1900 90 | 89,78,bank_transfer,2600 91 | 90,79,credit_card,1800 92 | 91,79,credit_card,900 93 | 92,80,gift_card,300 94 | 93,81,coupon,200 95 | 94,82,credit_card,800 96 | 95,83,credit_card,100 97 | 96,84,bank_transfer,2500 98 | 97,85,bank_transfer,1700 99 | 98,86,coupon,2300 100 | 99,87,gift_card,3000 101 | 100,87,credit_card,2600 102 | 101,88,credit_card,2900 103 | 102,89,bank_transfer,2200 104 | 103,90,bank_transfer,200 105 | 104,91,credit_card,1900 106 | 105,92,bank_transfer,1500 107 | 106,92,coupon,200 108 | 107,93,gift_card,2600 109 | 108,94,coupon,700 110 | 109,95,coupon,2400 111 | 110,96,gift_card,1700 112 | 111,97,bank_transfer,1400 113 | 112,98,bank_transfer,1000 114 | 113,99,credit_card,2400 115 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/jobs/profiling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Creates a job that is not directly associated with assets, which can 3 | be run to log metadata to those assets, based on their names. 4 | """ 5 | import json 6 | 7 | from dagster import op, job, OpExecutionContext, AssetObservation 8 | from pandas_profiling import ProfileReport 9 | 10 | from dagster_jaffle_shop.resources import duckdb_resource 11 | 12 | 13 | PRIMARY_KEY_DICT = { 14 | "customers": "customer_id", 15 | "stg_customers": "customer_id", 16 | "orders": "order_id", 17 | "stg_orders": "order_id", 18 | } 19 | 20 | 21 | @op(required_resource_keys={"duckdb"}) 22 | def profile_duckdb_tables(context: OpExecutionContext) -> None: 23 | "Generates `pandas_profiling` observation for each table in the db." 24 | 25 | # list all tables in the duckdb database 26 | ddb = context.resources.duckdb 27 | df = ddb.execute_query("select * from pg_tables") 28 | table_list = df["tablename"].values.tolist() 29 | context.log.info("Found %s tables in DuckDB database.", len(table_list)) 30 | 31 | # for each table, read in the data and create profile 32 | for t in table_list: 33 | context.log.info("Creating profile for table %s", t) 34 | df = ddb.execute_query(f"select * from {t}") 35 | profile = ProfileReport(df, minimal=True) 36 | profile_dict = json.loads(profile.to_json()) 37 | metadata = {**profile_dict["analysis"], **profile_dict["table"]} 38 | 39 | # log event 40 | observation = AssetObservation( 41 | asset_key=t, 42 | description="Auto-logged statistics by pandas-profiling job.", 43 | metadata=metadata, 44 | ) 45 | context.log_event(event=observation) 46 | 47 | 48 | @op(required_resource_keys={"duckdb"}) 49 | def test_duckdb_table_primary_keys(context: OpExecutionContext) -> None: 50 | "Tests that there are no null values in a declared primary key col." 51 | 52 | # list all tables in the duckdb database 53 | ddb = context.resources.duckdb 54 | df = ddb.execute_query("select * from pg_tables") 55 | table_list = df["tablename"].values.tolist() 56 | context.log.info("Found %s tables in DuckDB database.", len(table_list)) 57 | 58 | # for each table, read in the data and create profile 59 | for table_name, col_name in PRIMARY_KEY_DICT.items(): 60 | context.log.info("Checking for primary key validity of table %s", table_name) 61 | test = f""" 62 | select count(*) as count_null 63 | from {table_name} 64 | where {col_name} is null 65 | """ 66 | df = ddb.execute_query(test) 67 | success = bool(df.iloc[0]["count_null"] == 0) 68 | 69 | # log event 70 | observation = AssetObservation( 71 | asset_key=table_name, 72 | description="TEST: Primary key is not null", 73 | metadata={"success": success}, 74 | ) 75 | context.log_event(event=observation) 76 | 77 | 78 | @job(resource_defs={"duckdb": duckdb_resource}) 79 | def evaluate_duckdb_tables_job(): 80 | "This job runs both the table profiling and testing steps." 81 | 82 | profile_duckdb_tables() 83 | test_duckdb_table_primary_keys() 84 | -------------------------------------------------------------------------------- /dagster_jaffle_shop/assets/include/raw_orders.csv: -------------------------------------------------------------------------------- 1 | id,user_id,order_date,status 2 | 1,1,2018-01-01,returned 3 | 2,3,2018-01-02,completed 4 | 3,94,2018-01-04,completed 5 | 4,50,2018-01-05,completed 6 | 5,64,2018-01-05,completed 7 | 6,54,2018-01-07,completed 8 | 7,88,2018-01-09,completed 9 | 8,2,2018-01-11,returned 10 | 9,53,2018-01-12,completed 11 | 10,7,2018-01-14,completed 12 | 11,99,2018-01-14,completed 13 | 12,59,2018-01-15,completed 14 | 13,84,2018-01-17,completed 15 | 14,40,2018-01-17,returned 16 | 15,25,2018-01-17,completed 17 | 16,39,2018-01-18,completed 18 | 17,71,2018-01-18,completed 19 | 18,64,2018-01-20,returned 20 | 19,54,2018-01-22,completed 21 | 20,20,2018-01-23,completed 22 | 21,71,2018-01-23,completed 23 | 22,86,2018-01-24,completed 24 | 23,22,2018-01-26,return_pending 25 | 24,3,2018-01-27,completed 26 | 25,51,2018-01-28,completed 27 | 26,32,2018-01-28,completed 28 | 27,94,2018-01-29,completed 29 | 28,8,2018-01-29,completed 30 | 29,57,2018-01-31,completed 31 | 30,69,2018-02-02,completed 32 | 31,16,2018-02-02,completed 33 | 32,28,2018-02-04,completed 34 | 33,42,2018-02-04,completed 35 | 34,38,2018-02-06,completed 36 | 35,80,2018-02-08,completed 37 | 36,85,2018-02-10,completed 38 | 37,1,2018-02-10,completed 39 | 38,51,2018-02-10,completed 40 | 39,26,2018-02-11,completed 41 | 40,33,2018-02-13,completed 42 | 41,99,2018-02-14,completed 43 | 42,92,2018-02-16,completed 44 | 43,31,2018-02-17,completed 45 | 44,66,2018-02-17,completed 46 | 45,22,2018-02-17,completed 47 | 46,6,2018-02-19,completed 48 | 47,50,2018-02-20,completed 49 | 48,27,2018-02-21,completed 50 | 49,35,2018-02-21,completed 51 | 50,51,2018-02-23,completed 52 | 51,71,2018-02-24,completed 53 | 52,54,2018-02-25,return_pending 54 | 53,34,2018-02-26,completed 55 | 54,54,2018-02-26,completed 56 | 55,18,2018-02-27,completed 57 | 56,79,2018-02-28,completed 58 | 57,93,2018-03-01,completed 59 | 58,22,2018-03-01,completed 60 | 59,30,2018-03-02,completed 61 | 60,12,2018-03-03,completed 62 | 61,63,2018-03-03,completed 63 | 62,57,2018-03-05,completed 64 | 63,70,2018-03-06,completed 65 | 64,13,2018-03-07,completed 66 | 65,26,2018-03-08,completed 67 | 66,36,2018-03-10,completed 68 | 67,79,2018-03-11,completed 69 | 68,53,2018-03-11,completed 70 | 69,3,2018-03-11,completed 71 | 70,8,2018-03-12,completed 72 | 71,42,2018-03-12,shipped 73 | 72,30,2018-03-14,shipped 74 | 73,19,2018-03-16,completed 75 | 74,9,2018-03-17,shipped 76 | 75,69,2018-03-18,completed 77 | 76,25,2018-03-20,completed 78 | 77,35,2018-03-21,shipped 79 | 78,90,2018-03-23,shipped 80 | 79,52,2018-03-23,shipped 81 | 80,11,2018-03-23,shipped 82 | 81,76,2018-03-23,shipped 83 | 82,46,2018-03-24,shipped 84 | 83,54,2018-03-24,shipped 85 | 84,70,2018-03-26,placed 86 | 85,47,2018-03-26,shipped 87 | 86,68,2018-03-26,placed 88 | 87,46,2018-03-27,placed 89 | 88,91,2018-03-27,shipped 90 | 89,21,2018-03-28,placed 91 | 90,66,2018-03-30,shipped 92 | 91,47,2018-03-31,placed 93 | 92,84,2018-04-02,placed 94 | 93,66,2018-04-03,placed 95 | 94,63,2018-04-03,placed 96 | 95,27,2018-04-04,placed 97 | 96,90,2018-04-06,placed 98 | 97,89,2018-04-07,placed 99 | 98,41,2018-04-07,placed 100 | 99,85,2018-04-09,placed 101 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dagster's Jaffle Shop 2 | 3 | Dagster's Software Defined Assets technology is very cool. It can also be 4 | extremely hard to understand -- not unlike its common partner, dbt. 5 | 6 | This repo is intended to help dbt users understand what Dagster assets are, 7 | and how Dagster uses other object classes to declaratively create resources, 8 | in much the same way dbt does. 9 | 10 | ## dbt _is_ software-defined assets 11 | 12 | The core insight this package aims to help users understand is that dbt was 13 | the OG software-defined assets platform for the modern data stack. The beauty 14 | is that dbt made the statement: "you know what, 80% of the highest impact 15 | data work you do can be done simply by writing queries. You give us the SQL, 16 | we'll do the rest." 17 | 18 | It's a very neat framework: you just have a directory of SQL files (with Jinja, of 19 | course), and a few "types" of assets like snapshots and seeds and models. 20 | 21 | Dagster's software-defined assets framework does not have the benefit of dbt's 22 | simple worldview. But, one could easily rebuild a simple dbt worldview with the 23 | tools provided. That's what I'm doing here, with DuckDB and Dagster. 24 | 25 | ## Jaffle Shop 26 | 27 | The Jaffle Shop repo is a popular starting point for dbt newbies. It's a great 28 | starting point for Dagster noobs too! 29 | 30 | What we are going to do here is to: 31 | 32 | 1. Convert the queries from the dbt jaffle_shop project into assets. 33 | 2. Define a custom DuckDB IO Manager that can take a query and write it a db. 34 | 3. Write a separate job that can profile the tables (once writen) and log metadata to the asset. 35 | 36 | The output graph will look like this: 37 | 38 | ![Dagster Asset Graph](img/asset_graph.png) 39 | 40 | Here are a few of the opinions this directory adopts: 41 | 42 | - All assets live in the dagster_jaffle_shop/assets folder. 43 | - Only one asset is defined per .py file. 44 | - DuckDB assets return a query string. The DuckDB IO Manager renders the Jinja 45 | and executes the query. An alternative would be to have the IO Manager load a 46 | dataframe for each DuckDB asset, and return a dataframe (which it would then 47 | write). This may be a better approach, but it is less "dbt-like", so I'm 48 | passing around query text and table names. 49 | 50 | ### A typical asset 51 | 52 | Here's a definition for a typical asset in this repo: 53 | 54 | ```{python} 55 | from dagster import asset 56 | 57 | from dagster_jaffle_shop.io_managers import duckdb_io_manager 58 | 59 | 60 | @asset(io_manager_def=duckdb_io_manager) 61 | def stg_foo(some_upstream_asset: str) -> str: 62 | "An example asset for this custom IO manager." 63 | 64 | jinja_query = """ 65 | with source as ( 66 | select * from {{ some_upstream_asset }} 67 | ), 68 | 69 | select * from source 70 | """ 71 | 72 | return jinja_query 73 | ``` 74 | 75 | You can see that it's pretty simple: the decorated function is the Python code. 76 | The thing that's returned -- `jinja_query` -- is going to be handled by the IO 77 | Manager. In this case, that IO Manager is going to execute the query against 78 | a DuckDB. 79 | 80 | ![](img/job_run.png) 81 | 82 | This the magic and complexity of the Dagster asset framework. We could swap out 83 | the IO Manager and have it write this query against Snowflake, BigQuery, S3, or 84 | a Google Document. Essentially, this is what `adapters` are in dbt. 85 | 86 | The hard part is that the developer has to be _really_ mindful of what the Python 87 | function is doing, and how the IO Manager will handle it, _and_ how the upstream 88 | assets are going to be rendered within the function. dbt's SQL world has 89 | it easy! 90 | 91 | ### The asset metadata job 92 | 93 | One of the coolest things about Dagster is that it owns the catalog, the asset 94 | definition, and it has a way to log structured metadata in an ad hoc fashion. 95 | 96 | ![](img/catalog_metadata.png) 97 | 98 | It's not hard to imagine a world where you have some package that will profile 99 | all your Snowflake tables on a regular basis and log the info to the asset catalog. 100 | That is essentially what we do here, as a replacement to dbt's YAML-driven 101 | testing framework. 102 | 103 | We include two operations: 104 | 105 | 1. For each table in DuckDB, run `pandas_profiling` against the table. 106 | 2. For each defined primary key in a dict, check that all values are null. 107 | 108 | The trouble (right now) with Dagster's metadata catalog is that, while you _can_ 109 | use a sensor and the event history to do things like, "If you find any events 110 | where an assets metadata key X looks like Y, then send an alert," it's hardly 111 | straightforward. So -- the application of the metadata is still TBD. But you 112 | can see the fundamentals of something really cool are all there. 113 | 114 | # Running the package 115 | 116 | There are some (hopefully) helpful functions in the Makefile. Docker is giving 117 | me some problems (M1 issues I think) so YMMV if you try to use the Dockerfile 118 | route, but if you have Poetry installed, you should be able to run: 119 | 120 | ```{bash} 121 | poetry install 122 | poetry run make materialize_job # runs the asset materialize job from the CLI 123 | poetry run make dagit # starts up dagit web UI 124 | ``` 125 | 126 | If you don't have Poetry installed, you can use a new virtual environment and 127 | install it there first: 128 | 129 | ```{bash} 130 | python -m venv .venv 131 | source .venv/bin/activate 132 | pip install poetry==1.2.0 133 | ``` 134 | 135 | Hopefully this works for you! 136 | --------------------------------------------------------------------------------