├── .env.local ├── .gitignore ├── README.md ├── dagster.yaml ├── dagster_assets ├── __init__.py ├── assets │ ├── __init__.py │ ├── dbt │ │ └── __init__.py │ └── raw_data │ │ ├── __init__.py │ │ └── duckpond.py └── utils │ └── __init__.py ├── dbt ├── .gitignore ├── dbt_project.yml ├── macros │ ├── metabase.sql │ └── risingwave.sql ├── models │ ├── groups.yml │ ├── models.yml │ ├── multiengine │ │ ├── public │ │ │ ├── metric_action.sql │ │ │ ├── metric_churn.sql │ │ │ ├── metric_queries.sql │ │ │ └── subscription_consumption_daily.sql │ │ └── staging │ │ │ └── staging_churn.sql │ └── sources.yml ├── packages.yml └── tests │ └── .gitkeep ├── kafka └── README.md ├── pyproject.toml ├── requirements.txt ├── risingwave ├── README.md ├── create_sink.sql └── create_source.sql ├── setup.cfg ├── setup.py ├── shadowtraffic ├── README.md ├── generate_historical_data.ipynb └── hello-world.json ├── sources ├── duckdb │ ├── generate_orders.py │ └── utils.py └── shadowtraffic │ └── hello-world.json └── tox.ini /.env.local: -------------------------------------------------------------------------------- 1 | AWS_ACCESS_KEY_ID= 2 | AWS_SECRET_ACCESS_KEY= 3 | BUCKET_NAME= -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | .env 3 | .vscode 4 | dbt/profiles.yml 5 | dbt/*.dbi 6 | **/__pycache__/ 7 | .DS_Store 8 | venv/ 9 | venv2/ 10 | tmp*/ 11 | *.duckdb 12 | **.user.yml 13 | license.env -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Setup of a very simple multi-engine data stack. 2 | 3 | More details in [this](https://juhache.substack.com/p/multi-engine-data-stack-v0) article. 4 | 5 | # Getting Started 6 | 7 | 1- Install the requirements 8 | ``` 9 | pip install -r requirements.txt 10 | ``` 11 | 12 | 13 | 2- Set .env 14 | Rename .env.local to .env and fill in the following variables: 15 | - AWS_ACCESS_KEY_ID 16 | - AWS_SECRET_ACCESS_KEY 17 | - BUCKET_NAME 18 | 19 | 3- Set your dbt/profiles.yml 20 | ``` 21 | duck: 22 | target: duck 23 | outputs: 24 | duck: 25 | type: duckdb 26 | path: dbt.duckdb 27 | extensions: 28 | - httpfs 29 | - parquet 30 | settings: 31 | s3_region: 32 | s3_access_key_id: 33 | s3_secret_access_key: 34 | 35 | supabase: 36 | target: supabase 37 | outputs: 38 | supabase: 39 | type: postgres 40 | host: 41 | user: postgres 42 | password: 43 | port: 5432 44 | dbname: postgres 45 | schema: metric 46 | threads: 1 47 | connect_timeout: 30 48 | 49 | snow: 50 | target: snow 51 | outputs: 52 | snow: 53 | type: snowflake 54 | account: 55 | user: 56 | password: 57 | database: 58 | schema: 59 | ``` 60 | 61 | 4- Run RisingWave 62 | ``` 63 | docker run -it --pull=always -p 4566:4566 -p 5691:5691 risingwavelabs/risingwave:latest playground 64 | ``` 65 | Connect with psql: 66 | ``` 67 | psql -h localhost -p 4566 -d dev -U root 68 | ``` 69 | 70 | 5- Run Kafka 71 | Ref: https://www.conduktor.io/kafka/kafka-topics-cli-tutorial/ 72 | 73 | ``` 74 | cd kafka 75 | git clone https://github.com/conduktor/kafka-stack-docker-compose.git 76 | cd kafka-stack-docker-compose 77 | docker-compose -f zk-single-kafka-single.yml up -d 78 | ``` 79 | 80 | Create the topic 81 | ``` 82 | docker exec -it kafka1 /bin/bash 83 | kafka-topics --bootstrap-server localhost:9092 --list 84 | kafka-topics --topic query-topic --create --partitions 3 --replication-factor 1 --bootstrap-server localhost:9092
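# optional sanity check (not part of the original steps): confirm the topic was created with the expected layout
kafka-topics --bootstrap-server localhost:9092 --describe --topic query-topic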
85 | ``` 86 | 87 | If you need to delete a topic: 88 | ``` 89 | kafka-topics --bootstrap-server localhost:9092 --delete --topic first_topic 90 | ``` 91 | 92 | Interact with kcat: 93 | ``` 94 | # list topics 95 | kcat -b localhost:9092 -L 96 | 97 | # produce messages 98 | kcat -b localhost:9092 -t test-topic -P 99 | 100 | # consume messages 101 | kcat -b localhost:9092 -t test-topic -C 102 | ``` 103 | 104 | 6- Run ShadowTraffic 105 | Create a license file (see the ShadowTraffic website) 106 | ``` 107 | cd shadowtraffic 108 | docker run --env-file license.env -v $(pwd)/hello-world.json:/home/config.json shadowtraffic/shadowtraffic:latest --config /home/config.json --watch --sample 10 109 | ``` 110 | 111 | 7- Run Dagster 112 | ``` 113 | dagster dev 114 | ``` -------------------------------------------------------------------------------- /dagster.yaml: -------------------------------------------------------------------------------- 1 | # storage: 2 | #   postgres: 3 | #     postgres_db: 4 | #       username: postgres 5 | #       password: 6 | #       hostname: 7 | #       db_name: postgres 8 | #       port: 5432 9 | -------------------------------------------------------------------------------- /dagster_assets/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dagster import ( 3 | Definitions, 4 | FilesystemIOManager, 5 | ScheduleDefinition, 6 | define_asset_job, 7 | load_assets_from_package_module, 8 | ) 9 | from .assets.raw_data.duckpond import DuckPondIOManager, DuckDB 10 | from .assets import raw_data 11 | from .assets.dbt import ( 12 | DBT_PROJECT_DIR, 13 | dbt_project_assets_duck, 14 | dbt_project_assets_supabase, 15 | dbt_project_assets_risingwave, 16 | duck_models, 17 | supabase_models, 18 | risingwave_models, 19 | dbt_resource, 20 | ) 21 | 22 | raw_data_assets = load_assets_from_package_module( 23 | raw_data, 24 | group_name="raw_data", 25 | key_prefix=["raw_data"], 26 | ) 27 | 28 | raw_data_update_job = define_asset_job( 29 | "raw_data_update", 30 | selection=raw_data_assets) 31 | 32 | duck_job = define_asset_job( 33 | "duck_job", 34 | selection=duck_models) 35 | 36 | supabase_job = define_asset_job( 37 | "supabase_job", 38 | selection=supabase_models) 39 | 40 | risingwave_job = define_asset_job( 41 | "risingwave_job", 42 | selection=risingwave_models) 43 | 44 | DUCKDB_LOCAL_CONFIG=f""" 45 | set s3_region="us-east-1"; 46 | set s3_access_key_id='{os.environ["AWS_ACCESS_KEY_ID"]}'; 47 | set s3_secret_access_key='{os.environ["AWS_SECRET_ACCESS_KEY"]}'; 48 | """ 49 | 50 | resources = { 51 | # this io_manager persists SQL asset outputs to S3 as parquet and loads them back as SQL references 52 | "io_manager": DuckPondIOManager(os.environ["BUCKET_NAME"], DuckDB(DUCKDB_LOCAL_CONFIG), prefix="data/"), 53 | # this resource is used to execute dbt cli commands 54 | "dbt": dbt_resource, 55 | } 56 | 57 | defs = Definitions( 58 | assets=[dbt_project_assets_duck, dbt_project_assets_supabase, dbt_project_assets_risingwave, *raw_data_assets], 59 | resources=resources, 60 | schedules=[ 61 | ScheduleDefinition(job=raw_data_update_job, cron_schedule="*/2 * * * *"), 62 | ] 63 | ) 64 | 65 | -------------------------------------------------------------------------------- /dagster_assets/assets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hachej/multi-engine-data-stack/5894bf321d2c6370c8b2db39aa3389492755aeeb/dagster_assets/assets/__init__.py --------------------------------------------------------------------------------
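The Dagster definitions above wire in three dbt asset groups, and the dbt assets in the next file run them with `--profile duck`, `--profile supabase`, and `--profile risingwave`. The sample `dbt/profiles.yml` in the README, however, only covers `duck`, `supabase`, and `snow`. Below is a minimal sketch of the missing `risingwave` entry, assuming the playground defaults from the README (`localhost:4566`, user `root`, database `dev`) and Postgres-style connection fields; adjust names and credentials to your setup.

```yaml
risingwave:
  target: risingwave
  outputs:
    risingwave:
      type: risingwave
      host: localhost   # RisingWave playground default
      port: 4566
      user: root
      password: ""
      dbname: dev
      schema: public
      threads: 1
```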
/dagster_assets/assets/dbt/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Mapping 2 | from dagster import MetadataValue 3 | from dagster import AssetExecutionContext, AssetKey, file_relative_path 4 | from dagster_dbt import ( 5 | DagsterDbtTranslator, 6 | DbtCliResource, 7 | dbt_assets, 8 | get_asset_key_for_model, 9 | DbtManifestAssetSelection 10 | ) 11 | from dagster import asset, Config 12 | 13 | DBT_PROJECT_DIR = file_relative_path(__file__, "../../../dbt") 14 | 15 | dbt_resource = DbtCliResource(project_dir=DBT_PROJECT_DIR) 16 | dbt_parse_invocation = dbt_resource.cli(["parse"]).wait() 17 | dbt_manifest_path = dbt_parse_invocation.target_path.joinpath("manifest.json") 18 | 19 | @dbt_assets( 20 | manifest=dbt_manifest_path, 21 | select='tag:duckdb', 22 | ) 23 | def dbt_project_assets_duck(context: AssetExecutionContext, dbt: DbtCliResource): 24 | yield from dbt.cli(["build", "--profile", "duck"], context=context).stream() 25 | 26 | @dbt_assets( 27 | manifest=dbt_manifest_path, 28 | select='tag:supabase' 29 | ) 30 | def dbt_project_assets_supabase(context: AssetExecutionContext, dbt: DbtCliResource): 31 | yield from dbt.cli(["build", "--profile", "supabase"], context=context).stream() 32 | 33 | @dbt_assets( 34 | manifest=dbt_manifest_path, 35 | select='tag:risingwave' 36 | ) 37 | def dbt_project_assets_risingwave(context: AssetExecutionContext, dbt: DbtCliResource): 38 | yield from dbt.cli(["build", "--profile", "risingwave"], context=context).stream() 39 | 40 | duck_models = DbtManifestAssetSelection(manifest=dbt_manifest_path, select="tag:duckdb") 41 | supabase_models = DbtManifestAssetSelection(manifest=dbt_manifest_path, select="tag:supabase") 42 | risingwave_models = DbtManifestAssetSelection(manifest=dbt_manifest_path, select="tag:risingwave") -------------------------------------------------------------------------------- /dagster_assets/assets/raw_data/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from dagster import asset 4 | from dagster_assets.assets.raw_data.duckpond import SQL 5 | from dagster_assets.utils import _random_times 6 | import uuid 7 | import random 8 | import datetime 9 | import lorem 10 | 11 | @asset(compute_kind="random") 12 | def accounts() -> SQL: 13 | """A table containing all orders that have been placed.""" 14 | dataset_size = 1000 15 | 16 | data = { 17 | "account_id": ["c4cefa9e-8a45-45be-ae65-5a1d3972e18e", "dcf134a1-1871-49d5-8fdf-64fd82c907cf"], 18 | "country": [ "FR", "US"], 19 | "age": np.random.randint(2, 50, size=2) , 20 | "dt" : [datetime.datetime.now(), datetime.datetime.now()] 21 | } 22 | 23 | df = pd.DataFrame(data) 24 | # generate duplicates 25 | duplicates = df[0:2].copy() 26 | df = pd.concat([df, duplicates]) 27 | return SQL("select * from $df", df=df) 28 | -------------------------------------------------------------------------------- /dagster_assets/assets/raw_data/duckpond.py: -------------------------------------------------------------------------------- 1 | # source : https://github.com/petehunt/dagster-poor-mans-data-lake/blob/main/jaffle/duckpond.py 2 | 3 | from duckdb import connect 4 | from dagster import IOManager 5 | import pandas as pd 6 | from sqlescapy import sqlescape 7 | from string import Template 8 | from typing import Mapping 9 | from datetime import datetime 10 | 11 | 12 | class SQL: 13 | def __init__(self, sql, **bindings): 14 | self.sql = sql 15 
| self.bindings = bindings 16 | 17 | 18 | def sql_to_string(s: SQL) -> str: 19 | replacements = {} 20 | for key, value in s.bindings.items(): 21 | if isinstance(value, pd.DataFrame): 22 | replacements[key] = f"df_{id(value)}" 23 | elif isinstance(value, SQL): 24 | replacements[key] = f"({sql_to_string(value)})" 25 | elif isinstance(value, str): 26 | replacements[key] = f"'{sqlescape(value)}'" 27 | elif isinstance(value, (int, float, bool)): 28 | replacements[key] = str(value) 29 | elif value is None: 30 | replacements[key] = "null" 31 | else: 32 | raise ValueError(f"Invalid type for {key}") 33 | return Template(s.sql).safe_substitute(replacements) 34 | 35 | 36 | def collect_dataframes(s: SQL) -> Mapping[str, pd.DataFrame]: 37 | dataframes = {} 38 | for key, value in s.bindings.items(): 39 | if isinstance(value, pd.DataFrame): 40 | dataframes[f"df_{id(value)}"] = value 41 | elif isinstance(value, SQL): 42 | dataframes.update(collect_dataframes(value)) 43 | return dataframes 44 | 45 | 46 | class DuckDB: 47 | def __init__(self, options=""): 48 | self.options = options 49 | 50 | def query(self, select_statement: SQL): 51 | db = connect(":memory:") 52 | db.query("install httpfs; load httpfs;") 53 | db.query(self.options) 54 | 55 | dataframes = collect_dataframes(select_statement) 56 | for key, value in dataframes.items(): 57 | db.register(key, value) 58 | 59 | result = db.query(sql_to_string(select_statement)) 60 | if result is None: 61 | return 62 | return result.df() 63 | 64 | 65 | class DuckPondIOManager(IOManager): 66 | def __init__(self, bucket_name: str, duckdb: DuckDB, prefix=""): 67 | self.bucket_name = bucket_name 68 | self.duckdb = duckdb 69 | self.prefix = prefix 70 | 71 | def _get_s3_url(self, context): 72 | if context.has_asset_key: 73 | id = context.get_asset_identifier() 74 | else: 75 | id = context.get_identifier() 76 | timestamp_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") 77 | 78 | return f"s3://{self.bucket_name}/{self.prefix}{'/'.join(id)}/{timestamp_str}.parquet" 79 | 80 | 81 | 82 | 83 | def handle_output(self, context, select_statement: SQL): 84 | if select_statement is None: 85 | return 86 | 87 | if not isinstance(select_statement, SQL): 88 | raise ValueError( 89 | f"Expected asset to return a SQL; got {select_statement!r}" 90 | ) 91 | 92 | url = self._get_s3_url(context) 93 | print(url) 94 | self.duckdb.query( 95 | SQL( 96 | "copy $select_statement to $url (format parquet)", 97 | select_statement=select_statement, 98 | url=url, 99 | ) 100 | ) 101 | 102 | def load_input(self, context) -> SQL: 103 | return SQL("select * from read_parquet($url)", url=self._get_s3_url(context)) -------------------------------------------------------------------------------- /dagster_assets/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import time 3 | import uuid 4 | from typing import Any, Dict 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | 10 | def _random_times(n: int): 11 | """Generate some random times that generally become more frequent as time goes on.""" 12 | time.sleep(0.5) 13 | start = pd.to_datetime("2022-01-01") 14 | end = pd.to_datetime(datetime.datetime.now()) 15 | 16 | start_u = start.value // 10**9 17 | end_u = end.value // 10**9 18 | 19 | dist = np.random.standard_exponential(size=n) / 10 20 | 21 | clipped_flipped_dist = 1 - dist[dist <= 1] 22 | clipped_flipped_dist = clipped_flipped_dist[:-1] 23 | 24 | if len(clipped_flipped_dist) < n: 25 | clipped_flipped_dist = np.append( 26 | 
clipped_flipped_dist, clipped_flipped_dist[: n - len(clipped_flipped_dist)] 27 | ) 28 | 29 | return pd.to_datetime((clipped_flipped_dist * (end_u - start_u)) + start_u, unit="s") 30 | 31 | 32 | def random_data(extra_columns: Dict[str, Any], n: int) -> pd.DataFrame: 33 | # always have user_id and day 34 | data = { 35 | "user_id": np.random.randint(0, 1000, size=n), 36 | "dt": _random_times(n) 37 | } 38 | for name, dtype in extra_columns.items(): 39 | if dtype == str: 40 | data[name] = [uuid.uuid4() for _ in range(n)] 41 | elif dtype == int: 42 | data[name] = np.random.randint(0, 100, size=n) 43 | elif dtype == float: 44 | data[name] = 100 * np.random.random(size=n) 45 | return pd.DataFrame(data) 46 | -------------------------------------------------------------------------------- /dbt/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | dbt_packages/ 3 | logs/ 4 | -------------------------------------------------------------------------------- /dbt/dbt_project.yml: -------------------------------------------------------------------------------- 1 | name: "mutli_engine_stack" 2 | version: "1.0.0" 3 | config-version: 2 4 | 5 | profile: duck 6 | 7 | analysis-paths: ["analyses"] 8 | test-paths: ["tests"] 9 | macro-paths: ["macros"] 10 | snapshot-paths: ["snapshots"] 11 | 12 | target-path: "target" 13 | clean-targets: 14 | - "target" 15 | - "dbt_packages" 16 | -------------------------------------------------------------------------------- /dbt/macros/metabase.sql: -------------------------------------------------------------------------------- 1 | -- {% macro create_external_metabase_table(schema_name, table_name) %} 2 | -- {% if execute %} 3 | -- {% set query %} 4 | -- CREATE SCHEMA IF NOT EXISTS {{schema_name}}; 5 | -- {% endset %} 6 | -- {% do run_query(query) %} 7 | 8 | -- {% set check_table_exists_query %} 9 | -- SELECT * FROM information_schema.tables 10 | -- WHERE table_schema = '{{ schema_name }}' and table_name = '{{ table_name }}' 11 | -- {% endset %} 12 | 13 | -- {% set results = run_query(check_table_exists_query) %} 14 | 15 | -- {% if results|length > 0 %} 16 | -- {{ print("drop external table ") }} 17 | -- {% set query %} 18 | -- DROP FOREIGN TABLE {{ schema_name }}.{{ table_name }}; 19 | -- {% endset %} 20 | -- {% do run_query(query) %} 21 | -- {% endif %} 22 | 23 | -- {{ print("create external table ") }} 24 | -- {% set create_external_table_query %} 25 | -- CREATE FOREIGN TABLE {{ schema_name }}.{{ table_name }} ( 26 | -- account_id varchar, 27 | -- churn_probability float, 28 | -- prediction_date date 29 | -- ) 30 | -- server 31 | -- s3_server 32 | -- options ( 33 | -- uri 's3://newsletter-multiengine-stack/data/staging/churn/churn.parquet', 34 | -- format 'parquet' 35 | -- ); 36 | -- {% endset %} 37 | -- {% print(create_external_table_query) %} 38 | -- {% set results = run_query(create_external_table_query) %} 39 | -- {% endif %} 40 | -- {% endmacro %} -------------------------------------------------------------------------------- /dbt/macros/risingwave.sql: -------------------------------------------------------------------------------- 1 | {% macro create_sink() %} 2 | {% set query %} 3 | CREATE SINK target_count_postgres_sink FROM metric_queries WITH ( 4 | connector = 'jdbc', 5 | jdbc.url = '{{ env_var("SUPABASE_JDBC_CONNECTION_STRING") }}', 6 | table.name = 'query_metrics', 7 | type = 'upsert', 8 | primary_key = 'account_id_window_start' 9 | ); 10 | {% endset %} 11 | {% do run_query(query) %} 12 | 13 | {% endmacro 
%} 14 | 15 | 16 | {% macro create_source() %} 17 | {% set query %} 18 | CREATE SOURCE IF NOT EXISTS source_queries ( 19 | query_id varchar, 20 | account_id varchar, 21 | query_agent varchar, 22 | start_time varchar, 23 | processed_kb float, 24 | compute_size float, 25 | execution_duration_ms float 26 | ) 27 | WITH ( 28 | connector='kafka', 29 | topic='query-topic', 30 | properties.bootstrap.server='host.docker.internal:29092', 31 | scan.startup.mode='latest', 32 | scan.startup.timestamp_millis='140000000' 33 | ) FORMAT PLAIN ENCODE JSON; 34 | {% endset %} 35 | {% do run_query(query) %} 36 | 37 | {% endmacro %} -------------------------------------------------------------------------------- /dbt/models/groups.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: duckdb 3 | owner: 4 | # 'name' or 'email' is required; additional properties allowed 5 | email: finance@jaffleshop.com 6 | slack: finance-data 7 | github: finance-data-team 8 | - name: risingwave 9 | owner: 10 | # 'name' or 'email' is required; additional properties allowed 11 | email: finance@jaffleshop.com 12 | slack: finance-data 13 | github: finance-data-team 14 | - name: supabase 15 | owner: 16 | # 'name' or 'email' is required; additional properties allowed 17 | email: finance@jaffleshop.com 18 | slack: finance-data 19 | github: finance-data-team -------------------------------------------------------------------------------- /dbt/models/models.yml: -------------------------------------------------------------------------------- 1 | # models: 2 | # - name: landing_orders 3 | # config: 4 | # meta: 5 | # dagster: 6 | # freshness_policy: 7 | # maximum_lag_minutes: 5 8 | # - name: staging_orders 9 | # config: 10 | # meta: 11 | # dagster: 12 | # freshness_policy: 13 | # maximum_lag_minutes: 1 14 | 15 | -------------------------------------------------------------------------------- /dbt/models/multiengine/public/metric_action.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized='view', 3 | description="Supabase table", 4 | tags=["supabase"], 5 | group="supabase" 6 | ) 7 | }} 8 | 9 | SELECT 10 | c.account_id, 11 | churn_probability, 12 | prediction_date, 13 | m.query_count, 14 | CASE WHEN churn_probability > 0.5 OR query_count > 10 THEN 1 ELSE 0 END as display_churn_action 15 | FROM {{ ref("metric_churn") }} as c 16 | JOIN {{ source("metric", "query_metrics") }} as m 17 | on c.account_id = m.account_id 18 | and m.query_agent = 'chatbot' 19 | and m.window_start = current_date 20 | -------------------------------------------------------------------------------- /dbt/models/multiengine/public/metric_churn.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized='table', 3 | description="Supabase table", 4 | group="supabase", 5 | tags=["supabase"] 6 | ) 7 | }} 8 | 9 | SELECT 10 | account_id::uuid as account_id, 11 | churn_probability, 12 | prediction_date::date 13 | FROM {{ source('staging', 'account_churn') }} -------------------------------------------------------------------------------- /dbt/models/multiengine/public/metric_queries.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized='materializedview', 3 | pre_hook = "{{ create_source() }}", 4 | post_hook = "{{ create_sink() }}", 5 | tags=["risingwave"], 6 | group="risingwave" 7 | ) 8 | }} 9 | 10 | with queries as ( 11 | SELECT 12 | 
account_id, 13 | query_agent, 14 | start_time::timestamp as query_time, 15 | execution_duration_ms, 16 | processed_kb, 17 | round(processed_kb * execution_duration_ms / 10000000) as credits_consumed 18 | FROM source_queries 19 | ) 20 | SELECT 21 | CONCAT(account_id, '-', query_agent, '-', window_start) as account_id_window_start, 22 | account_id, 23 | query_agent, 24 | window_start, 25 | window_end, 26 | count(*) as query_count, 27 | sum(execution_duration_ms) as sum_execution_duration_ms, 28 | sum(processed_kb) as sum_processed_kb, 29 | sum(credits_consumed) as sum_credits_consumed, 30 | max(query_time) as last_query_timestamp 31 | FROM TUMBLE(queries, query_time, INTERVAL '1 day') 32 | GROUP BY account_id, query_agent, window_start, window_end 33 | -------------------------------------------------------------------------------- /dbt/models/multiengine/public/subscription_consumption_daily.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized='view', 3 | description="Supabase table", 4 | tags=["supabase"], 5 | group="supabase" 6 | ) 7 | }} 8 | with sub as ( 9 | SELECT 10 | account_id, 11 | subscription_id, 12 | subscription_start_date, 13 | credits_purchased 14 | FROM {{ source("metric", "subscriptions") }} 15 | where current_date between subscription_start_date::date and subscription_end_date::date 16 | ) 17 | select 18 | sub.account_id, 19 | sub.subscription_id, 20 | sub.subscription_start_date, 21 | credits_purchased, 22 | credits_purchased / 30 as credits_purchased_daily, 23 | m.window_start as consumption_date, 24 | m.sum_credits_consumed as credits_consumed_daily, 25 | m.sum_processed_kb as sum_processed_kb_daily, 26 | m.sum_execution_duration_ms as sum_execution_duration_ms_daily, 27 | sum(m.sum_credits_consumed) OVER (PARTITION BY sub.account_id, sub.subscription_id 28 | ORDER BY m.window_start) AS credits_consumed_cumulative 29 | 30 | from sub 31 | LEFT JOIN {{ source("metric", "query_metrics") }} as m 32 | ON m.account_id = sub.account_id 33 | and m.query_agent = 'chatbot' 34 | and m.window_start >= sub.subscription_start_date 35 | order by account_id, subscription_id, consumption_date 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /dbt/models/multiengine/staging/staging_churn.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized='external', 3 | location="s3://{{ env_var('BUCKET_NAME') }}/data/staging/churn/churn.parquet", 4 | format="parquet", 5 | description="Duck DB table", 6 | group='duckdb', 7 | tags=["duckdb"] 8 | ) 9 | }} 10 | 11 | SELECT 12 | account_id::BLOB as account_id, 13 | random() as churn_probability, 14 | current_date::VARCHAR::BLOB as prediction_date 15 | FROM {{ ref('landing_accounts') }} -------------------------------------------------------------------------------- /dbt/models/sources.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: raw_data 5 | tables: 6 | - name: accounts 7 | meta: 8 | external_location: "read_parquet('s3://{{ env_var('BUCKET_NAME') }}/data/raw_data/accounts/*.parquet', filename=true)" 9 | columns: 10 | - name: account_id 11 | data_type: varchar 12 | - name: country 13 | data_type: varchar 14 | - name: age 15 | data_type: varchar 16 | 17 | 18 | - name: staging 19 | schema: public 20 | tables: 21 | - name: account_churn 22 | meta: 23 | dagster: 24 | asset_key: 
["staging_churn"] 25 | external_location: "s3://newsletter-multiengine-stack/data/staging/churn/churn.parquet" 26 | columns: 27 | - name: account_id 28 | data_type: varchar 29 | - name: churn_probability 30 | data_type: bigint 31 | - name: prediction_date 32 | data_type: date 33 | 34 | - name: metric 35 | schema: public 36 | tables: 37 | - name: subscriptions 38 | - name: accounts 39 | - name: query_metrics 40 | meta: 41 | dagster: 42 | asset_key: ["metric_queries"] 43 | columns: 44 | - name: account_id 45 | data_type: varchar 46 | - name: key_id 47 | data_type: varchar 48 | - name: query_agent 49 | data_type: varchar 50 | - name: sum_compute_size 51 | data_type: varchar 52 | - name: window_end 53 | data_type: varchar 54 | - name: account_id_window_start 55 | data_type: varchar 56 | - name: query_count 57 | data_type: integer 58 | - name: sum_execution_duration_ms 59 | data_type: float 60 | - name: sum_processed_kb 61 | data_type: float 62 | - name: sum_credits_consumed 63 | data_type: float 64 | - name: window_start 65 | data_type: timestamp -------------------------------------------------------------------------------- /dbt/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.0.0 4 | - package: dbt-labs/dbt_external_tables 5 | version: 0.8.7 -------------------------------------------------------------------------------- /dbt/tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hachej/multi-engine-data-stack/5894bf321d2c6370c8b2db39aa3389492755aeeb/dbt/tests/.gitkeep -------------------------------------------------------------------------------- /kafka/README.md: -------------------------------------------------------------------------------- 1 | https://www.conduktor.io/kafka/kafka-topics-cli-tutorial/ 2 | 3 | 4 | 5 | 6 | # kafak command: https://www.conduktor.io/kafka/kafka-topics-cli-tutorial/ 7 | git clone https://github.com/conduktor/kafka-stack-docker-compose.git 8 | cd kafka-stack-docker-compose 9 | 10 | 11 | docker exec -it kafka1 /bin/bash 12 | kafka-topics --bootstrap-server localhost:9092 --list 13 | kafka-topics --topic query-topic --create --partitions 3 --replication-factor 1 --bootstrap-server localhost:9092 14 | kafka-topics --bootstrap-server localhost:9092 --delete --topic first_topic -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.dagster] 6 | module_name = "dagster_assets" 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dagster 2 | dagster-webserver 3 | dagster-dbt 4 | dbt-duckdb 5 | dbt-snowflake 6 | dbt-postgres 7 | dbt-risingwave 8 | boto3 9 | pandas 10 | sqlescapy 11 | dagster_postgres -------------------------------------------------------------------------------- /risingwave/README.md: -------------------------------------------------------------------------------- 1 | https://github.com/risingwavelabs/risingwave 2 | 3 | brew tap risingwavelabs/risingwave 4 | brew install risingwave 5 | risingwave playground -------------------------------------------------------------------------------- 
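Once the source and sink below have been created (via psql, or through the `create_source()` / `create_sink()` macros that run as hooks around the `metric_queries` model), a few quick checks from the psql session confirm that RisingWave is wired to Kafka and Postgres. A sketch, assuming the playground connection from the README and that the dbt `risingwave` job has already built `metric_queries`:

```sql
-- list the Kafka-backed source and the JDBC sink registered in RisingWave
SHOW SOURCES;
SHOW SINKS;
-- the streaming aggregation should start filling once ShadowTraffic publishes events to query-topic
SELECT * FROM metric_queries LIMIT 5;
```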
/risingwave/create_sink.sql: -------------------------------------------------------------------------------- 1 | CREATE SINK target_count_postgres_sink FROM metric_queries WITH ( 2 | connector = 'jdbc', 3 | jdbc.url = 'jdbc:postgresql://db.iiawsxgmdjqdqzorpwqh.supabase.co:5432/postgres?user=postgres&password=posF_jptkvx4pDC2haj_koFJ', 4 | table.name = 'query_metrics', 5 | type = 'upsert', 6 | primary_key = 'account_id_window_start' 7 | ); -------------------------------------------------------------------------------- /risingwave/create_source.sql: -------------------------------------------------------------------------------- 1 | DROP SOURCE source_queries; 2 | 3 | CREATE SOURCE IF NOT EXISTS source_queries ( 4 | query_id varchar, 5 | account_id varchar, 6 | query_agent varchar, 7 | start_time varchar, 8 | processed_kb float, 9 | compute_size float, 10 | execution_duration_ms float 11 | ) 12 | WITH ( 13 | connector='kafka', 14 | topic='query-topic', 15 | properties.bootstrap.server='host.docker.internal:29092', 16 | scan.startup.mode='latest', 17 | scan.startup.timestamp_millis='140000000' 18 | ) FORMAT PLAIN ENCODE JSON; 19 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = dagster_assets 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | from setuptools import find_packages, setup 4 | 5 | setup( 6 | name="dagster_assets", 7 | packages=find_packages(exclude=["dagster_assets_tests"]), 8 | # package data paths are relative to the package key 9 | package_data={ 10 | "dagster_assets": ["../" + path for path in glob.glob("dbt_project/**", recursive=True)] 11 | }, 12 | install_requires=[ 13 | "dagster", 14 | "dagster-cloud", 15 | "boto3", 16 | "dagster-dbt", 17 | "pandas", 18 | "numpy", 19 | "scipy", 20 | "dbt-core", 21 | "dbt-duckdb", 22 | "dagster-duckdb", 23 | "dagster-duckdb-pandas", 24 | # packaging v22 has build compatibility issues with dbt as of 2022-12-07 25 | "packaging<22.0", 26 | ], 27 | extras_require={"dev": ["dagster-webserver", "pytest"]}, 28 | ) 29 | -------------------------------------------------------------------------------- /shadowtraffic/README.md: -------------------------------------------------------------------------------- 1 | docker run --env-file license.env -v $(pwd)/hello-world.json:/home/config.json shadowtraffic/shadowtraffic:latest --config /home/config.json --watch --sample 10 --stdout 2 | -------------------------------------------------------------------------------- /shadowtraffic/generate_historical_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 18, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "\n", 11 | "from datetime import date, timedelta\n", 12 | "import numpy as np\n", 13 | "\n", 14 | "dates= [date.today() - timedelta(days=x) for x in range(1, 32)]\n", 15 | "window_start = [x.strftime(\"%Y-%m-%d\") + \" 00:00:00\" for x in dates]\n", 16 | "accounts = [\"c4cefa9e-8a45-45be-ae65-5a1d3972e18e\", \"dcf134a1-1871-49d5-8fdf-64fd82c907cf\"]\n", 17 | "account_id = [i for i in accounts for d in window_start]\n", 18 | "size = len(account_id)\n", 19 | "query_count = np.random.randint(2, 50, 
size=size)\n", 20 | "exec_duration = [np.random.randint(1000, 10000) * c for c in query_count]\n", 21 | "processed_kb =[np.random.randint(100, 1000) * c for c in query_count]\n", 22 | "credits = [exec_duration[idx] * processed_kb[idx] / 100000000 for idx,v in enumerate(query_count)]\n", 23 | "\n", 24 | "data = {\n", 25 | " \"account_id\": account_id,\n", 26 | " \"sum_compute_size\": np.random.randint(25555, 502234, size=size) ,\n", 27 | " \"sum_credits_consumed\":credits,\n", 28 | " \"window_start\": dates + dates,\n", 29 | " \"account_id_window_start\":[acc + \"-chatbot-\" + d for acc, d in zip(account_id, window_start + window_start )],\n", 30 | " \"query_agent\": [\"chatbot\" for i in range(size)] ,\n", 31 | " \"query_count\": query_count,\n", 32 | " \"sum_execution_duration_ms\": exec_duration,\n", 33 | " \"sum_processed_kb\": processed_kb,\n", 34 | " \"last_query_timestamp\": [d + timedelta(hours= np.random.randint(0,24)) for d in dates+dates ],\n", 35 | " \"window_end\": [d + timedelta(hours=24) for d in dates+dates ]\n", 36 | "}\n", 37 | "df = pd.DataFrame(data)\n", 38 | "df.to_csv(\"data.csv\", index=False)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 19, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/html": [ 49 | "
\n", 50 | "\n", 63 | "\n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | "
account_idsum_compute_sizesum_credits_consumedwindow_startaccount_id_window_startquery_agentquery_countsum_execution_duration_mssum_processed_kblast_query_timestampwindow_end
0c4cefa9e-8a45-45be-ae65-5a1d3972e18e39361932.1832222023-12-11c4cefa9e-8a45-45be-ae65-5a1d3972e18e-chatbot-2...chatbot24211176152402023-12-112023-12-12
1c4cefa9e-8a45-45be-ae65-5a1d3972e18e13114044.0740082023-12-10c4cefa9e-8a45-45be-ae65-5a1d3972e18e-chatbot-2...chatbot33166947264002023-12-102023-12-11
2c4cefa9e-8a45-45be-ae65-5a1d3972e18e9267916.4366072023-12-09c4cefa9e-8a45-45be-ae65-5a1d3972e18e-chatbot-2...chatbot18136494120422023-12-092023-12-10
3c4cefa9e-8a45-45be-ae65-5a1d3972e18e19294613.8039192023-12-08c4cefa9e-8a45-45be-ae65-5a1d3972e18e-chatbot-2...chatbot28101024136642023-12-082023-12-09
4c4cefa9e-8a45-45be-ae65-5a1d3972e18e23254370.3546842023-12-07c4cefa9e-8a45-45be-ae65-5a1d3972e18e-chatbot-2...chatbot38207328339342023-12-072023-12-08
\n", 153 | "
" 154 | ], 155 | "text/plain": [ 156 | " account_id sum_compute_size \\\n", 157 | "0 c4cefa9e-8a45-45be-ae65-5a1d3972e18e 393619 \n", 158 | "1 c4cefa9e-8a45-45be-ae65-5a1d3972e18e 131140 \n", 159 | "2 c4cefa9e-8a45-45be-ae65-5a1d3972e18e 92679 \n", 160 | "3 c4cefa9e-8a45-45be-ae65-5a1d3972e18e 192946 \n", 161 | "4 c4cefa9e-8a45-45be-ae65-5a1d3972e18e 232543 \n", 162 | "\n", 163 | " sum_credits_consumed window_start \\\n", 164 | "0 32.183222 2023-12-11 \n", 165 | "1 44.074008 2023-12-10 \n", 166 | "2 16.436607 2023-12-09 \n", 167 | "3 13.803919 2023-12-08 \n", 168 | "4 70.354684 2023-12-07 \n", 169 | "\n", 170 | " account_id_window_start query_agent query_count \\\n", 171 | "0 c4cefa9e-8a45-45be-ae65-5a1d3972e18e-chatbot-2... chatbot 24 \n", 172 | "1 c4cefa9e-8a45-45be-ae65-5a1d3972e18e-chatbot-2... chatbot 33 \n", 173 | "2 c4cefa9e-8a45-45be-ae65-5a1d3972e18e-chatbot-2... chatbot 18 \n", 174 | "3 c4cefa9e-8a45-45be-ae65-5a1d3972e18e-chatbot-2... chatbot 28 \n", 175 | "4 c4cefa9e-8a45-45be-ae65-5a1d3972e18e-chatbot-2... chatbot 38 \n", 176 | "\n", 177 | " sum_execution_duration_ms sum_processed_kb last_query_timestamp \\\n", 178 | "0 211176 15240 2023-12-11 \n", 179 | "1 166947 26400 2023-12-10 \n", 180 | "2 136494 12042 2023-12-09 \n", 181 | "3 101024 13664 2023-12-08 \n", 182 | "4 207328 33934 2023-12-07 \n", 183 | "\n", 184 | " window_end \n", 185 | "0 2023-12-12 \n", 186 | "1 2023-12-11 \n", 187 | "2 2023-12-10 \n", 188 | "3 2023-12-09 \n", 189 | "4 2023-12-08 " 190 | ] 191 | }, 192 | "execution_count": 19, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "df.head()" 199 | ] 200 | } 201 | ], 202 | "metadata": { 203 | "kernelspec": { 204 | "display_name": "project_newsletter", 205 | "language": "python", 206 | "name": "python3" 207 | }, 208 | "language_info": { 209 | "codemirror_mode": { 210 | "name": "ipython", 211 | "version": 3 212 | }, 213 | "file_extension": ".py", 214 | "mimetype": "text/x-python", 215 | "name": "python", 216 | "nbconvert_exporter": "python", 217 | "pygments_lexer": "ipython3", 218 | "version": "3.11.2" 219 | } 220 | }, 221 | "nbformat": 4, 222 | "nbformat_minor": 2 223 | } 224 | -------------------------------------------------------------------------------- /shadowtraffic/hello-world.json: -------------------------------------------------------------------------------- 1 | { 2 | "generators": [ 3 | { 4 | "topic": "query-topic", 5 | "value": { 6 | "query_id": { 7 | "_gen": "uuid" 8 | }, 9 | "account_id": { 10 | "_gen": "oneOf", 11 | "choices": [ 12 | "c4cefa9e-8a45-45be-ae65-5a1d3972e18e", 13 | "dcf134a1-1871-49d5-8fdf-64fd82c907cf" 14 | ] 15 | }, 16 | "query_agent": "chatbot", 17 | "start_time": { 18 | "_gen" : "formatDateTime", 19 | "ms" : { 20 | "_gen" : "now" 21 | }, 22 | "format" : "yyyy-MM-dd HH:mm:ss" 23 | }, 24 | "processed_kb": { 25 | "_gen": "uniformDistribution", 26 | "bounds": [ 27 | 100, 28 | 1000 29 | ] 30 | }, 31 | "compute_size": { 32 | "_gen": "normalDistribution", 33 | "mean": 100, 34 | "sd": 20, 35 | "bounds": [ 36 | 0, 37 | 100 38 | ] 39 | }, 40 | "execution_duration_ms": { 41 | "_gen": "uniformDistribution", 42 | "bounds": [ 43 | 1000, 44 | 10000 45 | ] 46 | } 47 | } 48 | } 49 | ], 50 | "connections": { 51 | "dev-kafka": { 52 | "kind": "kafka", 53 | "producerConfigs": { 54 | "bootstrap.servers": "host.docker.internal:29092", 55 | "key.serializer": "io.shadowtraffic.kafka.serdes.JsonSerializer", 56 | "value.serializer": "io.shadowtraffic.kafka.serdes.JsonSerializer" 57 | } 58 | } 59 | } 60 | } 
-------------------------------------------------------------------------------- /sources/duckdb/generate_orders.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import random 3 | import duckdb 4 | from datetime import datetime 5 | from utils import random_data 6 | import os 7 | from dotenv import load_dotenv, find_dotenv 8 | 9 | load_dotenv(find_dotenv()) 10 | 11 | class DataGenerator: 12 | def __init__(self, bucket, prefix) -> None: 13 | self.bucket = bucket 14 | self.prefix = prefix 15 | self.db = duckdb.connect(":memory:") 16 | 17 | self.db.query(f"""install httpfs; 18 | load httpfs; 19 | set s3_region="us-east-1"; 20 | set s3_access_key_id='{os.environ["AWS_ACCESS_KEY_ID"]}'; 21 | set s3_secret_access_key='{os.environ["AWS_SECRET_ACCESS_KEY"]}'; 22 | """) 23 | 24 | def _get_s3_url(self, id): 25 | timestamp_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") 26 | return f"s3://{self.bucket}/{self.prefix}{id}/{timestamp_str}.parquet" 27 | 28 | def generate(self, df: pd.DataFrame, id: str): 29 | url = self._get_s3_url(id) 30 | print(url) 31 | self.db.query(f"COPY (SELECT * FROM df) to '{url}' (FORMAT PARQUET)") 32 | 33 | # connect to an in-memory database 34 | generator = DataGenerator( 35 | bucket=os.environ["BUCKET_NAME"], 36 | prefix="data/" 37 | ) 38 | 39 | # generate random data 40 | df= random_data( 41 | extra_columns={"order_id": str, "quantity": int, "purchase_price": float, "sku": str}, 42 | n = random.randint(10, 11) 43 | ) 44 | # generate duplicates 45 | duplicated_orders = df[0:10].copy() 46 | duplicated_orders["quantity"] = duplicated_orders["quantity"] + 10 47 | df = pd.concat([df, duplicated_orders]) 48 | 49 | generator.generate(df, "raw_data/orders") 50 | -------------------------------------------------------------------------------- /sources/duckdb/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import time 3 | import uuid 4 | from typing import Any, Dict 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | 10 | def _random_times(n: int): 11 | """Generate some random times that generally become more frequent as time goes on.""" 12 | time.sleep(0.5) 13 | start = pd.to_datetime("2022-01-01") 14 | end = pd.to_datetime(datetime.datetime.now()) 15 | 16 | start_u = start.value // 10**9 17 | end_u = end.value // 10**9 18 | 19 | dist = np.random.standard_exponential(size=n) / 10 20 | 21 | clipped_flipped_dist = 1 - dist[dist <= 1] 22 | clipped_flipped_dist = clipped_flipped_dist[:-1] 23 | 24 | if len(clipped_flipped_dist) < n: 25 | clipped_flipped_dist = np.append( 26 | clipped_flipped_dist, clipped_flipped_dist[: n - len(clipped_flipped_dist)] 27 | ) 28 | 29 | return pd.to_datetime((clipped_flipped_dist * (end_u - start_u)) + start_u, unit="s") 30 | 31 | 32 | def random_data(extra_columns: Dict[str, Any], n: int) -> pd.DataFrame: 33 | # always have user_id and day 34 | data = {"user_id": np.random.randint(0, 1000, size=n), "dt": _random_times(n)} 35 | for name, dtype in extra_columns.items(): 36 | if dtype == str: 37 | data[name] = [uuid.uuid4() for _ in range(n)] 38 | elif dtype == int: 39 | data[name] = np.random.randint(0, 100, size=n) 40 | elif dtype == float: 41 | data[name] = 100 * np.random.random(size=n) 42 | return pd.DataFrame(data) 43 | -------------------------------------------------------------------------------- /sources/shadowtraffic/hello-world.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "generators": [ 3 | { 4 | "table": "testTable", 5 | "row": { 6 | "testColumn": { 7 | "_gen": "oneOf", 8 | "choices": [ 9 | "👍", 10 | "🔥", 11 | "❤️", 12 | "dd" 13 | ] 14 | } 15 | } 16 | } 17 | ], 18 | "connections": { 19 | "pg": { 20 | "kind": "postgres", 21 | "connectionConfigs": { 22 | "host": "localhost", 23 | "port": 5432, 24 | "db": "mydb" 25 | } 26 | } 27 | } 28 | } -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | skipsdist = true 3 | 4 | [testenv] 5 | download = true 6 | passenv = CI_* COVERALLS_REPO_TOKEN BUILDKITE* 7 | ; note: "source" does not work at this time due to dagster-cloud source access 8 | deps = 9 | source: -e ../../python_modules/dagster[test] 10 | pypi: dagster[test] 11 | pypi: -r ../temp_pins.txt 12 | source: -e ../../python_modules/dagster-webserver 13 | source: -e ../../python_modules/libraries/dagster-pandas/ 14 | source: -e ../../python_modules/libraries/dagster-dbt/ 15 | source: -e ../../python_modules/libraries/dagster-duckdb/ 16 | source: -e ../../python_modules/libraries/dagster-duckdb-pandas/ 17 | -e . 18 | allowlist_externals = 19 | /bin/bash 20 | commands = 21 | source: /bin/bash -c '! pip list --exclude-editable | grep -e dagster' 22 | pytest -c ../../pyproject.toml -vv 23 | --------------------------------------------------------------------------------
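For a quick end-to-end smoke test without opening the UI, the raw-data asset can also be materialized straight from the Dagster CLI. A sketch, assuming the `.env` variables are loaded and the `raw_data` key prefix set in `dagster_assets/__init__.py`; the output lands under `s3://$BUCKET_NAME/data/raw_data/accounts/` as a timestamped parquet file.

```bash
# materialize the accounts asset through the DuckPond IO manager
dagster asset materialize -m dagster_assets --select "raw_data/accounts"
```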