├── dlt_restack_demo ├── __init__.py ├── restack-app │ ├── __init__.py │ ├── src │ │ ├── __init__.py │ │ ├── functions │ │ │ ├── __init__.py │ │ │ ├── vector_search.py │ │ │ └── dlt_to_weaviate.py │ │ ├── workflows │ │ │ ├── __init__.py │ │ │ └── workflow.py │ │ ├── client.py │ │ └── services.py │ ├── .gitignore │ ├── .dlt │ │ ├── config.toml │ │ └── example.secrets.toml │ ├── .env.Example │ ├── Dockerfile │ ├── schedule_workflow.py │ └── pyproject.toml ├── img │ ├── UI.png │ ├── run.png │ ├── run2.png │ ├── results.png │ ├── results2.png │ └── incremental.png └── README.md ├── iceberg-tabular ├── __init__.py ├── move_data_to_tabular.py ├── requirements.txt ├── .gitignore ├── .dlt │ ├── config.toml │ └── example.secrets.toml ├── github_pipeline.py └── README.md ├── dlt-init-openapi-demo ├── __init__.py ├── stripe_pipeline │ ├── __init__.py │ ├── requirements.txt │ ├── rest_api │ │ ├── requirements.txt │ │ ├── exceptions.py │ │ ├── utils.py │ │ ├── typing.py │ │ ├── README.md │ │ └── __init__.py │ ├── .gitignore │ ├── .dlt │ │ ├── config.toml │ │ └── example.secrets.toml │ └── stripe_pipeline.py └── README.md ├── coinpaprika-to-postgresql ├── requirements.txt ├── example_api_responses │ ├── coin_list.json │ ├── coin_exchanges.json │ ├── coin_ohlc.json │ └── coin_details.json ├── .dlt │ ├── config.toml │ └── example.secrets.toml ├── .gitignore ├── dlt_pipeline.py ├── dlt_pipeline_merged.py └── README.md ├── dlt-dbt-cloud ├── pipeline │ ├── requirements.txt │ ├── pokemon │ │ ├── helpers.py │ │ ├── settings.py │ │ └── __init__.py │ ├── .gitignore │ ├── .dlt │ │ ├── config.toml │ │ └── .sources │ └── pokemon_pipeline.py ├── models │ ├── example │ │ ├── my_first_dbt_model.sql │ │ ├── my_second_dbt_model.sql │ │ └── schema.yml │ └── source.yml ├── dbt_project.yml ├── README.md └── .gitignore ├── scraping-source ├── requirements.txt ├── scraping │ ├── diagram.png │ ├── types.py │ ├── settings.py │ ├── __init__.py │ ├── queue.py │ ├── helpers.py │ └── runner.py ├── .gitignore ├── .dlt │ └── config.toml ├── README.md └── scraping_pipeline.py ├── pyladies-2024-demo ├── test.json ├── getting-started.py ├── load_from_database.py ├── load_from_json.py ├── github_pipeline.py ├── poke_pipeline.py └── README.md ├── dlt-dagster-snowflake ├── pyproject.toml ├── charts │ ├── google_trends_over_time.png │ └── hacker_news_sentiment_counts.png ├── requirements.txt ├── .gitignore ├── .dlt │ ├── config.toml │ └── example.secrets.toml ├── dlt_dagster_snowflake_demo │ ├── resources │ │ └── __init__.py │ ├── __init__.py │ ├── assets │ │ └── __init__.py │ └── dlt │ │ └── __init__.py └── README.md ├── images └── dlt-high-level.png ├── sengled-plug-demo ├── tuya_helpers │ ├── version.py │ ├── __init__.py │ ├── tuya_enums.py │ ├── openlogging.py │ └── openapi.py ├── env.py ├── README.md └── main.py ├── test-data └── invoice_1.pdf ├── secrets-providers-demo ├── .dlt │ ├── example.secrets.toml │ └── config.toml ├── README.md └── dlt_with_google_secrets_pipeline.py ├── .gitignore ├── .dlt └── config.toml ├── README.md ├── LICENSE.txt └── sql_to_weaviate.ipynb /dlt_restack_demo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /iceberg-tabular/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /iceberg-tabular/move_data_to_tabular.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/src/functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/src/workflows/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /iceberg-tabular/requirements.txt: -------------------------------------------------------------------------------- 1 | dlt[athena]<0.4,>=0.3.5 -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/requirements.txt: -------------------------------------------------------------------------------- 1 | dlt[postgres]>=0.4.2 -------------------------------------------------------------------------------- /dlt-dbt-cloud/pipeline/requirements.txt: -------------------------------------------------------------------------------- 1 | dlt[bigquery]<0.4,>=0.3.5 -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/requirements.txt: -------------------------------------------------------------------------------- 1 | dlt>=0.4.12 -------------------------------------------------------------------------------- /scraping-source/requirements.txt: -------------------------------------------------------------------------------- 1 | dlt[duckdb]>=0.4.5 2 | scrapy>=2.11.0 -------------------------------------------------------------------------------- /dlt-dbt-cloud/pipeline/pokemon/helpers.py: -------------------------------------------------------------------------------- 1 | """Pokemon pipeline helpers""" 2 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/rest_api/requirements.txt: -------------------------------------------------------------------------------- 1 | dlt>=0.4.11 -------------------------------------------------------------------------------- /pyladies-2024-demo/test.json: -------------------------------------------------------------------------------- 1 | {"id": 1, "name": "Alice", "children": {"id": 1, "name": "Eve"}} -------------------------------------------------------------------------------- /dlt-dagster-snowflake/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.dagster] 2 | module_name = 
"dlt-dagster-snowflake-demo" 3 | -------------------------------------------------------------------------------- /images/dlt-high-level.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/images/dlt-high-level.png -------------------------------------------------------------------------------- /sengled-plug-demo/tuya_helpers/version.py: -------------------------------------------------------------------------------- 1 | """tuya_iot version.""" 2 | 3 | VERSION = "0.6.6" 4 | 5 | -------------------------------------------------------------------------------- /test-data/invoice_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/test-data/invoice_1.pdf -------------------------------------------------------------------------------- /dlt_restack_demo/img/UI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/dlt_restack_demo/img/UI.png -------------------------------------------------------------------------------- /dlt_restack_demo/img/run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/dlt_restack_demo/img/run.png -------------------------------------------------------------------------------- /dlt_restack_demo/img/run2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/dlt_restack_demo/img/run2.png -------------------------------------------------------------------------------- /dlt_restack_demo/img/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/dlt_restack_demo/img/results.png -------------------------------------------------------------------------------- /dlt_restack_demo/img/results2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/dlt_restack_demo/img/results2.png -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .pytest_cache 3 | venv 4 | .env 5 | .vscode 6 | poetry.lock 7 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/.gitignore: -------------------------------------------------------------------------------- 1 | # local duckdb files 2 | *.duckdb 3 | # pipeline secrets 4 | secrets.toml 5 | -------------------------------------------------------------------------------- /dlt_restack_demo/img/incremental.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/dlt_restack_demo/img/incremental.png -------------------------------------------------------------------------------- /scraping-source/scraping/diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/scraping-source/scraping/diagram.png -------------------------------------------------------------------------------- 
/dlt_restack_demo/restack-app/.dlt/config.toml: -------------------------------------------------------------------------------- 1 | [destination.weaviate] 2 | module_config={text2vec-openai = {}, generative-openai = {}} -------------------------------------------------------------------------------- /dlt-dagster-snowflake/charts/google_trends_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/dlt-dagster-snowflake/charts/google_trends_over_time.png -------------------------------------------------------------------------------- /dlt-dagster-snowflake/charts/hacker_news_sentiment_counts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/dlt-dagster-snowflake/charts/hacker_news_sentiment_counts.png -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/.dlt/example.secrets.toml: -------------------------------------------------------------------------------- 1 | [destination.weaviate.credentials.additional_headers] 2 | X-OpenAI-Api-Key = "..." 3 | 4 | [openai] 5 | api_key = "..." -------------------------------------------------------------------------------- /dlt-dbt-cloud/models/example/my_first_dbt_model.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="table") }} 2 | 3 | 4 | select * 5 | from {{ source("pokemon_data", "pokemon") }} 6 | limit 7 | 1000 8 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/.dlt/config.toml: -------------------------------------------------------------------------------- 1 | [runtime] 2 | log_level="INFO" 3 | 4 | [sources.stripe] 5 | # Base URL for the API 6 | base_url = "https://api.stripe.com/" 7 | 8 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/pipeline/pokemon/settings.py: -------------------------------------------------------------------------------- 1 | """Pokemon Pipeline settings and constants""" 2 | 3 | BERRY_URL = "https://pokeapi.co/api/v2/berry" 4 | POKEMON_URL = "https://pokeapi.co/api/v2/pokemon/" 5 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/models/example/my_second_dbt_model.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="table") }} 2 | 3 | with two_pokemon as (select name from {{ ref('my_first_dbt_model') }} limit 2) 4 | select * 5 | from two_pokemon -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/rest_api/exceptions.py: -------------------------------------------------------------------------------- 1 | from dlt.common.exceptions import DltException 2 | 3 | 4 | class RestApiException(DltException): 5 | pass 6 | 7 | 8 | # class Paginator 9 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/models/source.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: pokemon_data 5 | database: dlt-dev-external 6 | schema: pokemon_data 7 | tables: 8 | - name: pokemon 9 | - name: berries -------------------------------------------------------------------------------- /dlt-dbt-cloud/models/example/schema.yml: 
-------------------------------------------------------------------------------- 1 | 2 | version: 2 3 | 4 | models: 5 | - name: my_first_dbt_model 6 | description: "A starter dbt model" 7 | 8 | - name: my_second_dbt_model 9 | description: "the second dbt model" -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/.dlt/example.secrets.toml: -------------------------------------------------------------------------------- 1 | [sources.stripe] 2 | # secrets for your stripe source 3 | username = "FILL ME OUT" # TODO: fill in your credentials 4 | password = "FILL ME OUT" # TODO: fill in your credentials 5 | -------------------------------------------------------------------------------- /secrets-providers-demo/.dlt/example.secrets.toml: -------------------------------------------------------------------------------- 1 | [google_secrets.credentials] 2 | "project_id" = "" 3 | "private_key" = "-----BEGIN PRIVATE KEY-----\n....\n-----END PRIVATE KEY-----\n" 4 | "client_email" = "....gserviceaccount.com" 5 | -------------------------------------------------------------------------------- /dlt-dagster-snowflake/requirements.txt: -------------------------------------------------------------------------------- 1 | dlt[duckdb]>=0.3.5 2 | dagster 3 | snowflake-connector-python[pandas] 4 | dagster-snowflake-pandas 5 | dagster-snowflake 6 | pandas 7 | matplotlib 8 | pytrends 9 | openai 10 | dagster-webserver 11 | toml -------------------------------------------------------------------------------- /sengled-plug-demo/env.py: -------------------------------------------------------------------------------- 1 | # online 2 | ACCESS_ID = '' 3 | ACCESS_KEY = '' 4 | USERNAME = '' 5 | PASSWORD = '' 6 | DEVICE_ID = '' 7 | ENDPOINT = "https://openapi.tuya.com" 8 | -------------------------------------------------------------------------------- /iceberg-tabular/.gitignore: -------------------------------------------------------------------------------- 1 | # ignore secrets, virtual environments and typical python compilation artifacts 2 | secrets.toml 3 | # ignore basic python artifacts 4 | .env 5 | **/__pycache__/ 6 | **/*.py[cod] 7 | **/*$py.class 8 | # ignore duckdb 9 | *.duckdb 10 | *.wal -------------------------------------------------------------------------------- /scraping-source/.gitignore: -------------------------------------------------------------------------------- 1 | # ignore secrets, virtual environments and typical python compilation artifacts 2 | secrets.toml 3 | # ignore basic python artifacts 4 | .env 5 | **/__pycache__/ 6 | **/*.py[cod] 7 | **/*$py.class 8 | # ignore duckdb 9 | *.duckdb 10 | *.wal -------------------------------------------------------------------------------- /dlt-dagster-snowflake/.gitignore: -------------------------------------------------------------------------------- 1 | # ignore secrets, virtual environments and typical python compilation artifacts 2 | secrets.toml 3 | # ignore basic python artifacts 4 | .env 5 | **/__pycache__/ 6 | **/*.py[cod] 7 | **/*$py.class 8 | # ignore duckdb 9 | *.duckdb 10 | *.wal -------------------------------------------------------------------------------- /dlt-dbt-cloud/pipeline/.gitignore: -------------------------------------------------------------------------------- 1 | # ignore secrets, virtual environments and typical python compilation artifacts 2 | secrets.toml 3 | # ignore basic python artifacts 4 | .env 5 | **/__pycache__/ 6 | **/*.py[cod] 7 | **/*$py.class 8 | # 
ignore duckdb 9 | *.duckdb 10 | *.wal -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/example_api_responses/coin_list.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "btc-bitcoin", 4 | "name": "Bitcoin", 5 | "symbol": "BTC", 6 | "rank": 1, 7 | "is_new": false, 8 | "is_active": true, 9 | "type": "coin" 10 | } 11 | ] -------------------------------------------------------------------------------- /iceberg-tabular/.dlt/config.toml: -------------------------------------------------------------------------------- 1 | # put your configuration values here 2 | 3 | [runtime] 4 | log_level="WARNING" # the system log level of dlt 5 | # use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry 6 | dlthub_telemetry = true 7 | -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/example_api_responses/coin_exchanges.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "binance", 4 | "name": "Binance", 5 | "fiats": [ 6 | { 7 | "name": "US Dollars", 8 | "symbol": "USD" 9 | } 10 | ], 11 | "adjusted_volume_24h_share": 11.26 12 | } 13 | ] -------------------------------------------------------------------------------- /dlt-dagster-snowflake/.dlt/config.toml: -------------------------------------------------------------------------------- 1 | # put your configuration values here 2 | 3 | [runtime] 4 | log_level="WARNING" # the system log level of dlt 5 | # use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry 6 | dlthub_telemetry = true 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore secrets, virtual environments and typical python compilation artifacts 2 | secrets.toml 3 | # ignore basic python artifacts 4 | .env 5 | **/__pycache__/ 6 | **/*.py[cod] 7 | **/*$py.class 8 | # ignore duckdb 9 | *.duckdb 10 | *.wal 11 | .DS_Store 12 | _storage 13 | test-data 14 | data -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/.dlt/config.toml: -------------------------------------------------------------------------------- 1 | # put your configuration values here 2 | 3 | [runtime] 4 | log_level="WARNING" # the system log level of dlt 5 | # use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry 6 | dlthub_telemetry = true 7 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/pipeline/.dlt/config.toml: -------------------------------------------------------------------------------- 1 | # put your configuration values here 2 | 3 | [runtime] 4 | log_level="WARNING" # the system log level of dlt 5 | # use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry 6 | dlthub_telemetry = true 7 | -------------------------------------------------------------------------------- /scraping-source/scraping/types.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | from typing_extensions import ParamSpec 4 | 5 | AnyDict = t.Dict[str, t.Any] 6 | 7 | P = 
ParamSpec("P") 8 | 9 | 10 | class Runnable(t.Protocol): 11 | def run(self, *args: P.args, **kwargs: P.kwargs) -> t.Any: 12 | pass 13 | -------------------------------------------------------------------------------- /secrets-providers-demo/.dlt/config.toml: -------------------------------------------------------------------------------- 1 | # put your configuration values here 2 | 3 | [runtime] 4 | log_level="WARNING" # the system log level of dlt 5 | # use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry 6 | dlthub_telemetry = true 7 | -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/.gitignore: -------------------------------------------------------------------------------- 1 | # ignore secrets, virtual environments and typical python compilation artifacts 2 | #secrets.toml 3 | # ignore basic python artifacts 4 | .env 5 | **/__pycache__/ 6 | **/*.py[cod] 7 | **/*$py.class 8 | # ignore duckdb 9 | *.duckdb 10 | *.wal 11 | test_data.json 12 | secrets.toml -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/.env.Example: -------------------------------------------------------------------------------- 1 | # Restack Cloud (Optional) 2 | 3 | # RESTACK_ENGINE_ID= 4 | # RESTACK_ENGINE_API_KEY= 5 | # RESTACK_ENGINE_API_ADDRESS= 6 | # RESTACK_ENGINE_ADDRESS= 7 | # RESTACK_CLOUD_TOKEN= 8 | -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/example_api_responses/coin_ohlc.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "time_open": "2018-03-01T00:00:00Z", 4 | "time_close": "2018-03-01T23:59:59Z", 5 | "open": 856.012, 6 | "high": 880.302, 7 | "low": 851.92, 8 | "close": 872.2, 9 | "volume": 1868520000, 10 | "market_cap": 83808161204 11 | } 12 | ] -------------------------------------------------------------------------------- /pyladies-2024-demo/getting-started.py: -------------------------------------------------------------------------------- 1 | import dlt 2 | 3 | 4 | data = [ 5 | {'id': 1, 'name': 'Alice'}, 6 | {'id': 2, 'name': 'Bob'} 7 | ] 8 | 9 | pipeline = dlt.pipeline( 10 | pipeline_name='quick_start', 11 | destination='duckdb', 12 | dataset_name='mydata', 13 | dev_mode=True, 14 | ) 15 | load_info = pipeline.run(data, table_name="users") 16 | print(load_info) -------------------------------------------------------------------------------- /sengled-plug-demo/tuya_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | from .openapi import TuyaOpenAPI, TuyaTokenInfo 2 | from .openlogging import TUYA_LOGGER 3 | from .tuya_enums import AuthType, TuyaCloudOpenAPIEndpoint 4 | from .version import VERSION 5 | 6 | __all__ = [ 7 | "TuyaOpenAPI", 8 | "TuyaTokenInfo", 9 | "AuthType", 10 | "TuyaCloudOpenAPIEndpoint", 11 | "TUYA_LOGGER", 12 | ] 13 | __version__ = VERSION 14 | -------------------------------------------------------------------------------- /scraping-source/.dlt/config.toml: -------------------------------------------------------------------------------- 1 | # put your configuration values here 2 | 3 | [runtime] 4 | log_level="WARNING" # the system log level of dlt 5 | # use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry 6 | dlthub_telemetry = true 7 | 8 | [sources.scraping] 9 | 
start_urls = ["https://quotes.toscrape.com/"] 10 | start_urls_file = "start_urls_file" 11 | -------------------------------------------------------------------------------- /pyladies-2024-demo/load_from_database.py: -------------------------------------------------------------------------------- 1 | import dlt 2 | from dlt.sources.sql_database import sql_database 3 | 4 | source = sql_database( 5 | "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam" 6 | ) 7 | 8 | pipeline = dlt.pipeline( 9 | pipeline_name="sql_database_example", 10 | destination="duckdb", 11 | dataset_name="sql_data", 12 | ) 13 | 14 | load_info = pipeline.run(source.family) 15 | print(load_info) -------------------------------------------------------------------------------- /sengled-plug-demo/README.md: -------------------------------------------------------------------------------- 1 | This repo contains code that connects your Sengled Smart Device to Python via the tuya python API, and then loads your device data into a duckdb destination via dlt. 2 | 3 | # How to: 4 | 1. Insert your credentials in `env.py`. 5 | a. All the credentials required can be found on your Tuya Cloud Account after you are registered and have successfully activated a cloud project with an active connected device. 6 | 2. Run `main.py`. 7 | 8 | -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/.dlt/example.secrets.toml: -------------------------------------------------------------------------------- 1 | # put your secret values and credentials here. do not share this file and do not push it to github 2 | api_secret_key = "api_secret_key" # please set me up! 3 | 4 | [destination.postgres.credentials] 5 | database = "demo_data" # please set me up! 6 | password = "password" # please set me up! 7 | username = "loader" # please set me up! 8 | host = "localhost" # please set me up! 
9 | port = 5432 10 | connect_timeout = 15 11 | -------------------------------------------------------------------------------- /.dlt/config.toml: -------------------------------------------------------------------------------- 1 | # put your configuration values here 2 | 3 | [runtime] 4 | log_level="WARNING" # the system log level of dlt 5 | # use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry 6 | dlthub_telemetry = true 7 | 8 | [destination.weaviate.module_config."text2vec-openai"] 9 | model = "ada" 10 | modelVersion = "002" 11 | type = "text" 12 | [destination.weaviate.module_config."generative-openai"] 13 | model = "gpt-3.5-turbo" -------------------------------------------------------------------------------- /pyladies-2024-demo/load_from_json.py: -------------------------------------------------------------------------------- 1 | # load test json to duckdb database 2 | 3 | import json 4 | import dlt 5 | 6 | with open("test.json", 'r') as file: 7 | data = json.load(file) 8 | 9 | pipeline = dlt.pipeline( 10 | pipeline_name='from_json', 11 | destination='duckdb', 12 | dataset_name='mydata', 13 | dev_mode=True, 14 | ) 15 | # dlt works with lists of dicts, so wrap data to the list 16 | load_info = pipeline.run([data], table_name="json_data") 17 | print(load_info) 18 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-slim 2 | 3 | WORKDIR /app 4 | 5 | RUN apt-get update && apt-get install -y 6 | 7 | RUN pip install poetry 8 | 9 | COPY pyproject.toml ./ 10 | 11 | COPY . . 12 | 13 | # Configure poetry to not create virtual environment 14 | RUN poetry config virtualenvs.create false 15 | 16 | # Install dependencies 17 | RUN poetry install --no-interaction --no-ansi 18 | 19 | # Expose port 80 20 | EXPOSE 80 21 | 22 | CMD poetry run python -m src.services 23 | -------------------------------------------------------------------------------- /pyladies-2024-demo/github_pipeline.py: -------------------------------------------------------------------------------- 1 | import dlt 2 | import requests 3 | 4 | # url to request dlt-hub user 5 | url = f"https://api.github.com/users/dlt-hub/followers" 6 | # make the request and return the json 7 | data = requests.get(url).json() 8 | 9 | pipeline = dlt.pipeline( 10 | pipeline_name='from_api', 11 | destination='duckdb', 12 | dataset_name='mydata', 13 | dev_mode=True, 14 | ) 15 | # dlt works with lists of dicts, so wrap data to the list 16 | load_info = pipeline.run([data], table_name="followers") 17 | print(load_info) 18 | -------------------------------------------------------------------------------- /sengled-plug-demo/tuya_helpers/tuya_enums.py: -------------------------------------------------------------------------------- 1 | """Tuya iot enums.""" 2 | 3 | from enum import Enum 4 | 5 | 6 | class AuthType(Enum): 7 | """Tuya Cloud Auth Type.""" 8 | 9 | SMART_HOME = 0 10 | CUSTOM = 1 11 | 12 | 13 | class TuyaCloudOpenAPIEndpoint: 14 | """Tuya Cloud Open API Endpoint.""" 15 | 16 | CHINA = "https://openapi.tuyacn.com" 17 | AMERICA = "https://openapi.tuyaus.com" 18 | AMERICA_AZURE = "https://openapi-ueaz.tuyaus.com" 19 | EUROPE = "https://openapi.tuyaeu.com" 20 | EUROPE_MS = "https://openapi-weaz.tuyaeu.com" 21 | INDIA = "https://openapi.tuyain.com" 22 | 
-------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/stripe_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dlt 3 | 4 | from stripe import stripe_source 5 | 6 | stripe_api_key = os.getenv('STRIPE_API_KEY') 7 | 8 | 9 | if __name__ == "__main__": 10 | pipeline = dlt.pipeline( 11 | pipeline_name="stripe_pipeline", 12 | destination='duckdb', 13 | dataset_name="stripe_data", 14 | progress="log", 15 | export_schema_path="schemas/export" 16 | ) 17 | source = stripe_source(stripe_api_key, password="").with_resources("get_customers", "get_subscriptions") 18 | info = pipeline.run(source) 19 | print(info) -------------------------------------------------------------------------------- /dlt-dagster-snowflake/.dlt/example.secrets.toml: -------------------------------------------------------------------------------- 1 | # put your secret values and credentials here. do not share this file and do not push it to github 2 | [openai] 3 | openai_api_key = "api_key" # please set me up! 4 | 5 | [destination.snowflake.credentials] 6 | database = "DLT_DATA" # please set me up! 7 | password = "your_password" # please set me up! 8 | username = "your_username" # please set me up! 9 | host = "your_host" # please set me up! 10 | warehouse = "COMPUTE_WH" # please set me up! 11 | role = "ACCOUNTADMIN" # please set me up! 12 | account = "your_account_url" # please set me up! 13 | schema = "dagster_snowflake_demo" # please set me up! -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/schedule_workflow.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | 4 | from restack_ai import Restack 5 | 6 | 7 | async def main(): 8 | 9 | client = Restack() 10 | 11 | workflow_id = f"{int(time.time() * 1000)}-AnimePipeline" 12 | run_id = await client.schedule_workflow( 13 | workflow_name="AnimePipeline", 14 | workflow_id=workflow_id, 15 | ) 16 | 17 | await client.get_workflow_result(workflow_id=workflow_id, run_id=run_id) 18 | 19 | exit(0) 20 | 21 | 22 | def run_schedule_workflow(): 23 | asyncio.run(main()) 24 | 25 | 26 | if __name__ == "__main__": 27 | run_schedule_workflow() 28 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/src/client.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dotenv import load_dotenv 4 | from restack_ai import Restack 5 | from restack_ai.restack import CloudConnectionOptions 6 | 7 | # Load environment variables from a .env file 8 | load_dotenv() 9 | 10 | 11 | engine_id = os.getenv("RESTACK_ENGINE_ID") 12 | address = os.getenv("RESTACK_ENGINE_ADDRESS") 13 | api_key = os.getenv("RESTACK_ENGINE_API_KEY") 14 | api_address = os.getenv("RESTACK_ENGINE_API_ADDRESS") 15 | 16 | connection_options = CloudConnectionOptions( 17 | engine_id=engine_id, address=address, api_key=api_key, api_address=api_address 18 | ) 19 | client = Restack(connection_options) 20 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "restack-app" 3 | version = "0.0.1" 4 | description = "A quickstart for Restack with dlt" 5 | authors = [{ name = "Restack Team", email = 
"service@restack.io" }] 6 | requires-python = ">=3.10,<3.13" 7 | dependencies = [ 8 | "pydantic>=2.10.6", 9 | "watchfiles>=1.0.4", 10 | "python-dotenv==1.0.1", 11 | "openai>=1.61.0", 12 | "restack-ai>=0.0.62", 13 | "dlt>=1.5.0", 14 | "weaviate-client==3.22" 15 | ] 16 | 17 | [project.scripts] 18 | dev = "src.services:watch_services" 19 | services = "src.services:run_services" 20 | 21 | [tool.hatch.build.targets.sdist] 22 | include = ["src"] 23 | 24 | [tool.hatch.build.targets.wheel] 25 | include = ["src"] 26 | 27 | [build-system] 28 | requires = ["hatchling"] 29 | build-backend = "hatchling.build" -------------------------------------------------------------------------------- /iceberg-tabular/.dlt/example.secrets.toml: -------------------------------------------------------------------------------- 1 | [destination.filesystem] 2 | bucket_url = "s3://[your_bucket_name]" # replace with your bucket name, 3 | 4 | [destination.filesystem.credentials] 5 | aws_access_key_id = "please set me up!" # copy the access key here 6 | aws_secret_access_key = "please set me up!" # copy the secret access key here 7 | 8 | [destination.athena] 9 | force_iceberg = "True" 10 | query_result_bucket="s3://[results_bucket_name]" # replace with your query results bucket name 11 | 12 | [destination.athena.credentials] 13 | aws_access_key_id="please set me up!" # same as credentials for filesystem 14 | aws_secret_access_key="please set me up!" # same as credentials for filesystem 15 | region_name="please set me up!" # set your aws region, for example "eu-central-1" for frankfurt 16 | database="awsdatacatalog" 17 | 18 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/src/services.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import webbrowser 4 | 5 | from watchfiles import run_process 6 | 7 | from src.client import client 8 | from src.functions.dlt_to_weaviate import anime_pipeline 9 | from src.functions.vector_search import rag_pipeline 10 | from src.workflows.workflow import AnimePipeline, RAGPipeline 11 | 12 | 13 | async def main(): 14 | await client.start_service( 15 | workflows=[AnimePipeline, RAGPipeline], functions=[anime_pipeline, rag_pipeline] 16 | ) 17 | 18 | 19 | def run_services(): 20 | try: 21 | asyncio.run(main()) 22 | except KeyboardInterrupt: 23 | print("Service interrupted by user. Exiting gracefully.") 24 | 25 | 26 | def watch_services(): 27 | watch_path = os.getcwd() 28 | print(f"Watching {watch_path} and its subdirectories for changes...") 29 | webbrowser.open("http://localhost:5233") 30 | run_process(watch_path, recursive=True, target=run_services) 31 | 32 | 33 | if __name__ == "__main__": 34 | run_services() 35 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/rest_api/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Dict, Any, Mapping, Iterable 2 | 3 | from dlt.common import logger 4 | from dlt.extract.source import DltSource 5 | 6 | 7 | def join_url(base_url: str, path: str) -> str: 8 | if not base_url.endswith("/"): 9 | base_url += "/" 10 | return base_url + path.lstrip("/") 11 | 12 | 13 | def exclude_keys(d: Mapping[str, Any], keys: Iterable[str]) -> Dict[str, Any]: 14 | """Removes specified keys from a dictionary and returns a new dictionary. 
15 | 16 | Args: 17 | d (Mapping[str, Any]): The dictionary to remove keys from. 18 | keys (Iterable[str]): The keys to remove. 19 | 20 | Returns: 21 | Dict[str, Any]: A new dictionary with the specified keys removed. 22 | """ 23 | return {k: v for k, v in d.items() if k not in keys} 24 | 25 | 26 | def check_connection( 27 | source: DltSource, 28 | *resource_names: str, 29 | ) -> Tuple[bool, str]: 30 | try: 31 | list(source.with_resources(*resource_names).add_limit(1)) 32 | return (True, "") 33 | except Exception as e: 34 | logger.error(f"Error checking connection: {e}") 35 | return (False, str(e)) 36 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/pipeline/pokemon/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This source provides data extraction from an example source as a starting point for new pipelines. 3 | Available resources: [berries, pokemon] 4 | """ 5 | 6 | import typing as t 7 | from typing import Sequence, Iterable, Dict, Any 8 | import dlt 9 | from dlt.common.typing import TDataItem 10 | from dlt.extract.source import DltResource 11 | from dlt.sources.helpers import requests 12 | from .settings import BERRY_URL, POKEMON_URL 13 | 14 | 15 | @dlt.resource(write_disposition="replace") 16 | def berries() -> Iterable[TDataItem]: 17 | """ 18 | Returns a list of berries. 19 | Yields: 20 | dict: The berries data. 21 | """ 22 | yield requests.get(BERRY_URL).json()["results"] 23 | 24 | 25 | @dlt.resource(write_disposition="replace") 26 | def pokemon() -> Iterable[TDataItem]: 27 | """ 28 | Returns a list of pokemon. 29 | Yields: 30 | dict: The pokemon data. 31 | """ 32 | yield requests.get(POKEMON_URL).json()["results"] 33 | 34 | 35 | @dlt.source 36 | def source() -> Sequence[DltResource]: 37 | """ 38 | The source function that returns all availble resources. 39 | Returns: 40 | Sequence[DltResource]: A sequence of DltResource objects containing the fetched data. 41 | """ 42 | return [berries, pokemon] 43 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/pipeline/pokemon_pipeline.py: -------------------------------------------------------------------------------- 1 | """Very simple pokemon pipeline, to be used as a starting point for new pipelines. 2 | 3 | Available resources: 4 | fruits 5 | vegetables 6 | """ 7 | import dlt 8 | from pokemon import source 9 | from typing import List 10 | 11 | from dlt.helpers.dbt_cloud import run_dbt_cloud_job 12 | 13 | 14 | def load(resources: List[str]) -> None: 15 | """ 16 | Execute a pipeline that will load all the resources for the given endpoints. 17 | Args: 18 | resources (List[str]): A list of resource names to load data from the pokemon source. Available resources include 'pokemon' and 'berries'. 19 | Returns: 20 | None: This function doesn't return any value. It prints the loading information on successful execution. 21 | """ 22 | pipeline = dlt.pipeline( 23 | pipeline_name="pokemon", destination='bigquery', dataset_name="pokemon_data" 24 | ) 25 | load_info = pipeline.run(source().with_resources(*resources)) 26 | print(load_info) 27 | 28 | 29 | if __name__ == "__main__": 30 | """ 31 | Main function to execute the data loading pipeline. 32 | Add your desired resources to the list and call the load function. 
33 | """ 34 | resources = ["pokemon", "berries"] 35 | load(resources) 36 | 37 | run_info = run_dbt_cloud_job() 38 | print(f"Job run status: {run_info['status_humanized']}") 39 | -------------------------------------------------------------------------------- /scraping-source/scraping/settings.py: -------------------------------------------------------------------------------- 1 | from .types import AnyDict 2 | 3 | SOURCE_BATCH_SIZE: int = 10 4 | SOURCE_SCRAPY_QUEUE_SIZE: int = 3000 5 | SOURCE_SCRAPY_QUEUE_RESULT_TIMEOUT: int = 5 6 | SOURCE_SCRAPY_SETTINGS: AnyDict = { 7 | "LOG_LEVEL": "INFO", 8 | # If not set then will keep logging warning in the console 9 | # https://docs.scrapy.org/en/latest/topics/request-response.html#request-fingerprinter-implementation 10 | "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7", 11 | "TELNETCONSOLE_ENABLED": False, 12 | # How many sub pages to scrape 13 | # https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit 14 | "DEPTH_LIMIT": 0, 15 | "SPIDER_MIDDLEWARES": { 16 | "scrapy.spidermiddlewares.depth.DepthMiddleware": 200, 17 | "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 300, 18 | }, 19 | "HTTPERROR_ALLOW_ALL": True, 20 | "FAKEUSERAGENT_PROVIDERS": [ 21 | # this is the first provider we'll try 22 | "scrapy_fake_useragent.providers.FakeUserAgentProvider", 23 | # if FakeUserAgentProvider fails, we'll use faker to generate a user-agent string for us 24 | "scrapy_fake_useragent.providers.FakerProvider", 25 | # fall back to USER_AGENT value 26 | "scrapy_fake_useragent.providers.FixedUserAgentProvider", 27 | ], 28 | "USER_AGENT": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0", 29 | } 30 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'my_new_project' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'default' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `source-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | target-path: "target" # directory which will store compiled SQL files 23 | clean-targets: # directories to be removed by `dbt clean` 24 | - "target" 25 | - "dbt_packages" 26 | 27 | 28 | # Configuring models 29 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 30 | 31 | # In this example config, we tell dbt to build all models in the example/ directory 32 | # as tables. These settings can be overridden in the individual model files 33 | # using the `{{ config(...) }}` macro. 
34 | models: 35 | my_new_project: 36 | # Applies to all files under models/example/ 37 | example: 38 | materialized: view 39 | -------------------------------------------------------------------------------- /sengled-plug-demo/tuya_helpers/openlogging.py: -------------------------------------------------------------------------------- 1 | """Tuya iot logging.""" 2 | from __future__ import annotations 3 | 4 | import copy 5 | import logging 6 | from typing import Any 7 | 8 | logger = logging.getLogger(__package__) 9 | 10 | default_handler = logging.StreamHandler() 11 | default_handler.setFormatter( 12 | logging.Formatter("[%(asctime)s] [tuya-%(module)s] %(message)s") 13 | ) 14 | 15 | logger.addHandler(default_handler) 16 | TUYA_LOGGER = logger 17 | 18 | FILTER_LIST = [ 19 | "access_token", 20 | "client_id", 21 | "ip", 22 | "lat", 23 | "link_id", 24 | "local_key", 25 | "lon", 26 | "password", 27 | "refresh_token", 28 | "uid", 29 | ] 30 | 31 | STAR = "***" 32 | 33 | 34 | def filter_logger(result_info: dict[str, Any]): 35 | """Filter log, hide sensitive info.""" 36 | if result_info is None: 37 | return result_info 38 | filter_info_original = copy.deepcopy(result_info) 39 | if "result" in filter_info_original: 40 | filter_info = filter_info_original["result"] 41 | else: 42 | filter_info = filter_info_original 43 | if isinstance(filter_info, list): 44 | for item in filter_info: 45 | for filter_key in FILTER_LIST: 46 | if filter_key in item: 47 | item[filter_key] = STAR 48 | 49 | elif isinstance(filter_info, dict): 50 | for filter_key in FILTER_LIST: 51 | if filter_key in filter_info: 52 | filter_info[filter_key] = STAR 53 | 54 | return filter_info_original 55 | -------------------------------------------------------------------------------- /sengled-plug-demo/main.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from tuya_helpers import TuyaOpenAPI, TUYA_LOGGER 3 | import dlt 4 | from env import ENDPOINT, ACCESS_ID, ACCESS_KEY, USERNAME, PASSWORD, DEVICE_ID 5 | import streamlit as st 6 | 7 | if 'schema' not in st.session_state: 8 | st.session_state['schema'] = 'value' 9 | 10 | TUYA_LOGGER.setLevel(logging.DEBUG) 11 | 12 | # Init 13 | openapi = TuyaOpenAPI(ENDPOINT, ACCESS_ID, ACCESS_KEY) 14 | openapi.connect(USERNAME, PASSWORD, "86", 'tuyaSmart') 15 | 16 | @dlt.resource(name='status') 17 | def get_status(): 18 | yield openapi.get(f'/v1.0/devices/{DEVICE_ID}') 19 | 20 | @dlt.resource(name='specs') 21 | def get_specs(): 22 | yield openapi.get(f'/v1.0/devices/{DEVICE_ID}/specifications') 23 | 24 | @dlt.resource(name='properties') 25 | def get_properties(): 26 | yield openapi.get(f'/v2.0/cloud/thing/{DEVICE_ID}/shadow/properties') 27 | 28 | 29 | pipeline = dlt.pipeline( 30 | pipeline_name="smart_plug", 31 | destination="duckdb", 32 | dataset_name="smart_plug_data", 33 | ) 34 | 35 | 36 | pipeline.run(get_status()) 37 | pipeline.run(get_specs()) 38 | pipeline.run(get_properties()) 39 | 40 | 41 | info = pipeline.run(get_status()) 42 | print(info) 43 | info = pipeline.run(get_specs()) 44 | print(info) 45 | info = pipeline.run(get_properties()) 46 | print(info) 47 | 48 | 49 | #dashboard.write_data_explorer_page(pipeline) 50 | ''' 51 | # Receive device message 52 | def on_message(msg): 53 | print("on_message: %s" % msg) 54 | 55 | print(openapi) 56 | openapi.token_info.expire_time = 0 57 | 58 | openmq = TuyaOpenMQ(openapi) 59 | openmq.start() 60 | openmq.add_message_listener(on_message) 61 | ''' 62 | 63 | 
-------------------------------------------------------------------------------- /coinpaprika-to-postgresql/dlt_pipeline.py: -------------------------------------------------------------------------------- 1 | import dlt 2 | from dlt.sources.helpers import requests 3 | 4 | # Resource 1: Basic information about cryptocurrencies on coinpaprika.com: 5 | @dlt.resource(name = "coin_list", write_disposition="replace") 6 | def coin_list(): 7 | response = requests.get('https://api.coinpaprika.com/v1/coins') 8 | yield from response.json() 9 | 10 | # Resource 2 - Transformer: Detailed descriptive information about a single coin 11 | @dlt.transformer(data_from = coin_list().add_limit(2)) 12 | def coin_details(coin): 13 | coin_id = coin['id'] 14 | response = requests.get(f'https://api.coinpaprika.com/v1/coins/{coin_id}') 15 | yield response.json() 16 | 17 | # Resource 3 - Transformer: The last 50 timeline tweets from the official Twitter profile for a given coin 18 | @dlt.transformer(data_from = coin_list().add_limit(2)) 19 | def coin_tweets(coin): 20 | coin_id = coin['id'] 21 | response = requests.get(f'https://api.coinpaprika.com/v1/coins/{coin_id}/twitter') 22 | data = response.json() 23 | data_with_id = [{'id': coin_id, **entry} for entry in data] 24 | yield data_with_id 25 | 26 | # Source: Combines the above three resources into a single source 27 | @dlt.source 28 | def crypto_data(name = "crypto_source"): 29 | yield coin_list() 30 | yield coin_details() 31 | yield coin_tweets() 32 | 33 | # Main function to run the pipeline 34 | def load_coin_details() -> None: 35 | pipeline = dlt.pipeline( 36 | pipeline_name="crypto_pipeline", 37 | destination='postgres', 38 | full_refresh=True, 39 | dataset_name="crypto_data" 40 | ) 41 | info = pipeline.run(crypto_data().add_limit(2)) 42 | print(info) 43 | 44 | if __name__ == "__main__": 45 | load_coin_details() 46 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/pipeline/.dlt/.sources: -------------------------------------------------------------------------------- 1 | engine_version: 1 2 | sources: 3 | google_analytics: 4 | is_dirty: false 5 | last_commit_sha: d82fa669817909c54087cb753c20701f1d480a25 6 | last_commit_timestamp: '2023-08-31T20:00:58+02:00' 7 | files: 8 | google_analytics/__init__.py: 9 | commit_sha: d82fa669817909c54087cb753c20701f1d480a25 10 | git_sha: 853225b13342dd9447e167bd6d8ca74b37a76728 11 | sha3_256: a43c90ae7507c61e6768cc543bd276e7de2c8075593269d27c3aae469fa5b7f9 12 | google_analytics/README.md: 13 | commit_sha: d82fa669817909c54087cb753c20701f1d480a25 14 | git_sha: 3b9fb9bda50ed4228ca15798f53f7c17d74216ae 15 | sha3_256: 5408fd3e6320a74298216cf17d2a9157ebadda9dc8be123e109b460c1348653e 16 | google_analytics/setup_script_gcp_oauth.py: 17 | commit_sha: d82fa669817909c54087cb753c20701f1d480a25 18 | git_sha: 1a6025198657b905a49a78789c8166c8de081ea3 19 | sha3_256: 89dc2cac47c053a14bd04cd5584e310b4f38a7cee5f2da9c31dfe138ea26cea8 20 | google_analytics/settings.py: 21 | commit_sha: d82fa669817909c54087cb753c20701f1d480a25 22 | git_sha: f003ce1a4aac37c294b1595d27d0d4a4b0ea79b9 23 | sha3_256: 94cdf6cd852b64c716b865fe40a2f99bfa50908f7806540923a6f87aad87bda4 24 | google_analytics/helpers/__init__.py: 25 | commit_sha: d82fa669817909c54087cb753c20701f1d480a25 26 | git_sha: 0caf864b021d11765a1b54ff0bf1d48ea16a9a77 27 | sha3_256: 7af600fd6a3e895fdf37935993854d0141ad92a58859810df9aa352cdc1e50fd 28 | google_analytics/helpers/data_processing.py: 29 | commit_sha: d82fa669817909c54087cb753c20701f1d480a25 30 | 
git_sha: bd732da7e7f268ddcc3b6784a540dc45df1d7bcf 31 | sha3_256: 7f7bde54e0706ba4fabf2b7c346e7e4592a8378f6265a89811dbf83de209e75f 32 | dlt_version_constraint: '>=0.2.5' 33 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/src/workflows/workflow.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | from pydantic import BaseModel, Field 4 | from restack_ai.workflow import RetryPolicy, import_functions, log, workflow 5 | 6 | with import_functions(): 7 | from src.functions.dlt_to_weaviate import anime_pipeline 8 | from src.functions.vector_search import rag_pipeline 9 | 10 | 11 | class AnimePipelineInput(BaseModel): 12 | pipeline_name: str = Field(default="anime_pipeline") 13 | destination: str = Field(default="weaviate") 14 | add_limit: int = Field(default=2) 15 | dev_mode: bool = Field(default=False) 16 | 17 | 18 | class RAGPipelineInput(BaseModel): 19 | pipeline_name: str = Field(default="anime_pipeline") 20 | question: str = Field( 21 | default="What is the story about Ye Bufan and his medical skills?" 22 | ) 23 | 24 | 25 | @workflow.defn() 26 | class AnimePipeline: 27 | @workflow.run 28 | async def run(self, input: AnimePipelineInput): 29 | log.info("PokePipeline started") 30 | result = await workflow.step( 31 | anime_pipeline, 32 | input=input, 33 | start_to_close_timeout=timedelta(seconds=300), 34 | retry_policy=RetryPolicy(maximum_attempts=1), 35 | ) 36 | log.info("PokePipeline completed", result=result) 37 | return result 38 | 39 | 40 | @workflow.defn() 41 | class RAGPipeline: 42 | @workflow.run 43 | async def run(self, input: RAGPipelineInput): 44 | log.info("RAGPipeline started") 45 | result = await workflow.step( 46 | rag_pipeline, 47 | input=input, 48 | start_to_close_timeout=timedelta(seconds=300), 49 | retry_policy=RetryPolicy(maximum_attempts=1), 50 | ) 51 | log.info("RAGPipeline completed", result=result) 52 | return result 53 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/src/functions/vector_search.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | from typing import Optional, Any 3 | 4 | import dlt 5 | from openai import OpenAI 6 | from pydantic import BaseModel 7 | from restack_ai.function import function, log 8 | 9 | 10 | class RAGInput(BaseModel): 11 | pipeline_name: str 12 | question: str 13 | 14 | 15 | openai_client = OpenAI(api_key=dlt.secrets["openai.api_key"]) 16 | 17 | 18 | @function.defn() 19 | async def rag_pipeline(input: RAGInput) -> str: 20 | try: 21 | pipeline = dlt.pipeline( 22 | pipeline_name=input.pipeline_name, 23 | destination="weaviate", 24 | progress="log", 25 | dev_mode=False, 26 | ) 27 | with pipeline.destination_client() as client: 28 | return rag_query(client.db_client, str(input)) 29 | 30 | except Exception as e: 31 | log.error("Something went wrong!", error=e) 32 | log.error(traceback.format_exc()) 33 | raise e 34 | 35 | 36 | def rag_query(weaviate_client: Any, prompt: str) -> Optional[str]: 37 | response = ( 38 | weaviate_client.query.get("Anime", ["title", "synopsis"]) 39 | .with_near_text({"concepts": [prompt]}) 40 | .with_limit(3) 41 | .do() 42 | ) 43 | 44 | if "data" in response and "Get" in response["data"]: 45 | docs = response["data"]["Get"]["Anime"] 46 | print("Retrieved documents:") 47 | for doc in docs: 48 | print(f"- {doc['title']}: {doc['synopsis']}\n") 49 | else: 50 | 
print("No results found.") 51 | return None 52 | 53 | context = " ".join([doc["synopsis"] for doc in docs]) 54 | 55 | completion = openai_client.completions.create( 56 | model="gpt-3.5-turbo-instruct", 57 | prompt=f"{context}\n\nQuestion: {prompt}\nAnswer:", 58 | max_tokens=150, 59 | ) 60 | 61 | log.info("Generated Answer:") 62 | answer = completion.choices[0].text.strip() 63 | log.info(answer) 64 | return answer 65 | -------------------------------------------------------------------------------- /secrets-providers-demo/README.md: -------------------------------------------------------------------------------- 1 | # Use `dlt` with Cloud Secrets Vaults 2 | 3 | ## Google Cloud Secret Manager 4 | To retrieve secrets from Google Cloud Secret Manager using Python, and convert them into a dictionary format, you'll need to follow these steps. First, ensure that you have the necessary permissions to access the secrets on Google Cloud, and have the `google-cloud-secret-manager` library installed. If not, you can install it using pip: 5 | 6 | ```bash 7 | pip install google-cloud-secret-manager 8 | ``` 9 | [Google Docs](https://cloud.google.com/secret-manager/docs/reference/libraries) 10 | 11 | Here's how you can retrieve secrets and convert them into a dictionary: 12 | 13 | 1. **Set up the Secret Manager client**: Create a client that will interact with the Secret Manager API. 14 | 2. **Access the secret**: Use the client to access the secret's latest version. 15 | 3. **Convert to a dictionary**: If the secret is stored in a structured format (like JSON), parse it into a Python dictionary. 16 | 17 | Assume we store secrets in JSON format: 18 | ```json 19 | {"api_token": "ghp_Kskdgf98dugjf98ghd...."} 20 | ``` 21 | 22 | In the script `dlt_with_google_secrets_pipeline.py` you can find an example how to use Google Secrets in `dlt` pipelines. 23 | 24 | ### Points to Note: 25 | 26 | - **Permissions**: Ensure the service account or user credentials you are using have the necessary permissions to access the Secret Manager and the specific secrets. 27 | - **Secret Format**: This example assumes that the secret is stored in a JSON string format. If your secret is in a different format, you will need to adjust the parsing method accordingly. 28 | - **Google Cloud Authentication**: Make sure your environment is authenticated with Google Cloud. This can typically be done by setting credentials in `.dlt/secrets.toml` or setting the `GOOGLE_SECRETS__CREDENTIALS` environment variable to the path of your service account key file or the dict of credentials as a string. 29 | 30 | With this setup, you can effectively retrieve secrets stored in Google Cloud Secret Manager and use them in your `dlt` pipelines as dictionaries. 
-------------------------------------------------------------------------------- /coinpaprika-to-postgresql/dlt_pipeline_merged.py: -------------------------------------------------------------------------------- 1 | import dlt 2 | from dlt.sources.helpers import requests 3 | 4 | 5 | # Resource 1: Retrieves a basic list of cryptocurrencies from coinpaprika.com 6 | @dlt.resource(name = "coin_list", write_disposition="replace") 7 | def coin_list(): 8 | response = requests.get('https://api.coinpaprika.com/v1/coins') 9 | yield from response.json() 10 | 11 | 12 | # Resource 2 - Transformer: Extracts detailed information for each coin 13 | @dlt.transformer(data_from = coin_list().add_limit(10)) # The limit is added to avoid exceeding the API's request quota 14 | def coin_information(coin): 15 | coin_id = coin['id'] 16 | # Fetching detailed information including the list of team members, tags, and links for each coin 17 | details = requests.get(f'https://api.coinpaprika.com/v1/coins/{coin_id}') 18 | # Fetching the latest OHLCV (Open, High, Low, Close, Volume) data as a list with a single dictionary 19 | ohlc = requests.get(f'https://api.coinpaprika.com/v1/coins/{coin_id}/ohlcv/latest') 20 | # Fetching exchanges where the coin is traded as a nested list 21 | exchanges = requests.get(f'https://api.coinpaprika.com/v1/coins/{coin_id}/exchanges') 22 | # Merging details, OHLCV, and exchanges data and yielding as one record 23 | yield details.json() | ohlc.json()[0] | {"exchanges": exchanges.json()} 24 | 25 | 26 | # Source: Aggregates the coin list and detailed coin information into a single source 27 | @dlt.source 28 | def crypto_data(name = "crypto_source"): 29 | yield coin_list() 30 | yield coin_information() 31 | 32 | 33 | # Main function to execute the data loading pipeline 34 | def load_coin_details() -> None: 35 | # Setting up the pipeline with PostgreSQL as the destination 36 | pipeline = dlt.pipeline( 37 | pipeline_name="crypto_pipeline", 38 | destination='postgres', 39 | full_refresh=True, 40 | dataset_name="crypto_data", 41 | ) 42 | # Running the pipeline and printing execution details 43 | info = pipeline.run(crypto_data()) 44 | print(info) 45 | 46 | 47 | if __name__ == "__main__": 48 | load_coin_details() 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Loading Demos 2 | 3 | This repository contains Jupyter notebooks and more extensive projects that illustrate various methods for loading data into different destinations (e.g. Weaviate database) 4 | using the [dlt](https://github.com/dlt-hub/dlt) library. 5 | 6 | ## Repository Contents 7 | 8 | ### Prerequisites 9 | 10 | To run the notebooks, you will need credentials for the tools being used. They are added to the `.dlt` folder. For instance, if you're working on a Weaviate notebook, you will have to add Weaviate credentials. Refer to the notebooks to find out which credentials are needed. 11 | 12 | ### Common demos 13 | 14 | - [schema_evolution.ipynb](schema_evolution.ipynb): shows how you can alert schema changes to slack. 15 | - [spotlight_demo.ipynb](spotlight_demo.ipynb): shows how to get data from APIs, files, Python objects and move it into a local or remote database. 16 | Demo was created for a [Data Talks Club: Open-Source Spotlight](https://youtube.com/playlist?list=PL3MmuxUbc_hJ5t5nnjzC0F2zan76Dpsz0&feature=shared) project. 
17 | - [Pyladies 12.11.2024](pyladies-2024-demo): Similar to Spotlight demo, shows dlt basics, demonstrates how to get data from APIs (GitHub and PokeAPI), database, JSON, and move it into a Duckdb. 18 | 19 | ### Weaviate demos 20 | 21 | - [pdf_to_weaviate.ipynb](pdf_to_weaviate.ipynb): shows how to load data from PDF files, specifically invoices, into Weaviate. 22 | - [sql_to_weaviate.ipynb](sql_to_weaviate.ipynb): shows how to import data from a public MySQL database into Weaviate. 23 | - [zendesk_to_weaviate.ipynb](zendesk_to_weaviate.ipynb): loads data from a [Zendesk dlt source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/zendesk) into Weaviate. 24 | 25 | 26 | ### Personio demos 27 | 28 | - [personio_demo.ipynb](personio_demo.ipynb): shows how to load data from Personio to duckDB. 29 | 30 | ### Project demos 31 | 32 | Project demos are more extensive compared to the notebook ones and have their own README files. Refer to each project for more details. 33 | 34 | 35 | ## License 36 | 37 | This repository is licensed under the [Apache License 2.0](LICENSE.txt). Please refer to the `LICENSE.txt` file for more details. 38 | 39 | Happy coding and data loading! 🚀📊 40 | -------------------------------------------------------------------------------- /iceberg-tabular/github_pipeline.py: -------------------------------------------------------------------------------- 1 | import click 2 | import dlt 3 | from dlt.sources.helpers import requests 4 | 5 | BASE_URL = "https://api.github.com/repos" 6 | 7 | 8 | @dlt.resource( 9 | table_name="issues", 10 | write_disposition="merge", 11 | primary_key="id", 12 | ) 13 | def get_issues( 14 | organisation_name: str, 15 | repo_name: str, 16 | updated_at=dlt.sources.incremental( 17 | "updated_at", initial_value="1970-01-01T00:00:00Z" 18 | ), 19 | ): 20 | # NOTE: we read only open issues to minimize the number of calls to the API. 
21 | # There's a limit of ~50 calls for unauthenticated GitHub users 22 | url = f"{BASE_URL}/{organisation_name}/{repo_name}/issues" 23 | 24 | while True: 25 | response = requests.get( 26 | url, 27 | params={ 28 | "since": updated_at.last_value, 29 | "per_page": 100, 30 | "sort": "updated", 31 | "direction": "desc", 32 | "state": "open", 33 | }, 34 | ) 35 | response.raise_for_status() 36 | yield response.json() 37 | 38 | # get next page 39 | if "next" not in response.links: 40 | break 41 | url = response.links["next"]["url"] 42 | 43 | 44 | @click.command() 45 | @click.option("--organisation-name", required=True, help="GitHub organisation name.") 46 | @click.option("--repo-name", required=True, help="GitHub repository name.") 47 | @click.option("--pipeline-name", default="github_pipeline", help="Name of the DLT pipeline.") 48 | @click.option("--dataset-name", default="issues", help="Name of the dataset.") 49 | def github_pipeline(organisation_name, repo_name, pipeline_name, dataset_name): 50 | pipeline = dlt.pipeline( 51 | pipeline_name=pipeline_name, 52 | destination="athena", 53 | dataset_name=dataset_name, 54 | ) 55 | source_data = get_issues(organisation_name=organisation_name, repo_name=repo_name) 56 | load_info = pipeline.run(source_data) 57 | row_counts = pipeline.last_trace.last_normalize_info 58 | 59 | print(row_counts) 60 | print("------") 61 | print(load_info) 62 | 63 | 64 | if __name__ == "__main__": 65 | github_pipeline() 66 | -------------------------------------------------------------------------------- /dlt-dagster-snowflake/dlt_dagster_snowflake_demo/resources/__init__.py: -------------------------------------------------------------------------------- 1 | from dagster import ConfigurableResource 2 | import dlt 3 | import os 4 | 5 | 6 | # Define a Dagster resource for managing dlt pipelines 7 | class DltPipeline(ConfigurableResource): 8 | # Initialize resource with pipeline details 9 | pipeline_name: str 10 | dataset_name: str 11 | destination: str 12 | 13 | def create_pipeline(self, resource_data, table_name): 14 | """ 15 | Creates and runs a dlt pipeline with specified data and table name. 16 | 17 | Args: 18 | resource_data: The data to be processed by the pipeline. 19 | table_name: The name of the table where data will be loaded. 20 | 21 | Returns: 22 | The result of the pipeline execution. 23 | """ 24 | 25 | # Configure the dlt pipeline with your destination details 26 | pipeline = dlt.pipeline( 27 | pipeline_name=self.pipeline_name, 28 | destination=self.destination, 29 | dataset_name=self.dataset_name 30 | ) 31 | 32 | # Run the pipeline with your parameters 33 | load_info = pipeline.run(resource_data, table_name=table_name) 34 | return load_info 35 | 36 | 37 | # Define a Dagster resource for managing local file storage 38 | class LocalFileStorage(ConfigurableResource): 39 | dir: str 40 | 41 | def setup_for_execution(self, context) -> None: 42 | """ 43 | Prepares the local directory for file storage, creating it if it doesn't exist. 44 | 45 | Args: 46 | context: The Dagster execution context (not used here). 47 | """ 48 | 49 | # Ensure the storage directory exists 50 | os.makedirs(self.dir, exist_ok=True) 51 | 52 | def write(self, filename, data): 53 | """ 54 | Writes data to a file within the local storage directory. 55 | 56 | Args: 57 | filename: The name of the file to write to. 58 | data: The data to be written to the file.
59 | """ 60 | 61 | # Create the directory path for the file if it does not exist 62 | dir_path = f"{self.dir}/{os.path.dirname(filename)}" 63 | os.makedirs(dir_path, exist_ok=True) 64 | 65 | # Write data to the file in binary mode 66 | with open(f"{self.dir}/{filename}", "wb") as f: 67 | f.write(data.read()) 68 | -------------------------------------------------------------------------------- /dlt-dagster-snowflake/dlt_dagster_snowflake_demo/__init__.py: -------------------------------------------------------------------------------- 1 | from dagster import Definitions, load_assets_from_modules, define_asset_job 2 | from dagster_snowflake_pandas import SnowflakePandasIOManager 3 | from dagster_snowflake import SnowflakeResource 4 | from . import assets 5 | from . import resources 6 | import toml 7 | import os 8 | 9 | 10 | # Load your secrets from the secrets.toml file accessed by dlt 11 | with open(os.getcwd() + '/.dlt/secrets.toml', 'r') as secrets_file: 12 | secrets = toml.load(secrets_file) 13 | 14 | 15 | # Set your secret values 16 | snowflake_user = secrets["destination"]["snowflake"]["credentials"]["username"] 17 | snowflake_password = secrets["destination"]["snowflake"]["credentials"]["password"] 18 | snowflake_warehouse = secrets["destination"]["snowflake"]["credentials"]["warehouse"] 19 | snowflake_database = secrets["destination"]["snowflake"]["credentials"]["database"] 20 | snowflake_account = secrets["destination"]["snowflake"]["credentials"]["host"] 21 | snowflake_schema = secrets["destination"]["snowflake"]["credentials"]["schema"] 22 | 23 | 24 | # Set your dlt pipelines as Dagster jobs 25 | dlt_pipelines = define_asset_job(name = "dlt_pipelines", selection= ['google_trends_asset', 'hacker_news_full_asset']) 26 | 27 | 28 | # Set your Dagster definition 29 | defs = Definitions( 30 | assets = load_assets_from_modules([assets]), 31 | jobs = [dlt_pipelines], 32 | resources = { 33 | "pipeline": resources.DltPipeline( 34 | pipeline_name = "dagster_pipeline", 35 | dataset_name = "dagster_snoflake_demo", 36 | destination = "snowflake", 37 | description = "" 38 | ), 39 | "io_manager": SnowflakePandasIOManager( 40 | account = snowflake_account, 41 | user = snowflake_user, 42 | password = snowflake_password, 43 | warehouse = snowflake_warehouse, 44 | database = snowflake_database, 45 | schema = snowflake_schema, 46 | # role = snowflake_role # Optional 47 | ), 48 | "image_storage": resources.LocalFileStorage( 49 | dir = "charts" 50 | ), 51 | "snowflake": SnowflakeResource( 52 | account = snowflake_account, 53 | user = snowflake_user, 54 | password = snowflake_password, 55 | database = snowflake_database, 56 | schema = snowflake_schema, 57 | ) 58 | } 59 | ) -------------------------------------------------------------------------------- /scraping-source/scraping/__init__.py: -------------------------------------------------------------------------------- 1 | """Scraping source 2 | 3 | Integrates Dlt and Scrapy to facilitate scraping pipelines. 
4 | """ 5 | import inspect 6 | import typing as t 7 | 8 | import dlt 9 | 10 | from dlt.sources import DltResource 11 | from dlt.common.source import _SOURCES, SourceInfo 12 | 13 | from scrapy import Spider # type: ignore 14 | 15 | from .helpers import ScrapingConfig, create_pipeline_runner 16 | from .types import P, AnyDict 17 | 18 | 19 | def run_pipeline( # type: ignore[valid-type] 20 | pipeline: dlt.Pipeline, 21 | spider: t.Type[Spider], 22 | *args: P.args, 23 | on_before_start: t.Callable[[DltResource], None] = None, 24 | scrapy_settings: t.Optional[AnyDict] = None, 25 | batch_size: t.Optional[int] = None, 26 | queue_size: t.Optional[int] = None, 27 | queue_result_timeout: t.Optional[float] = None, 28 | **kwargs: P.kwargs, 29 | ) -> None: 30 | """Simple runner for the scraping pipeline 31 | 32 | You can pass all parameters via kwargs to `dlt.pipeline.run(....)` 33 | 34 | ``` 35 | destination: TDestinationReferenceArg = None, 36 | staging: TDestinationReferenceArg = None, 37 | dataset_name: str = None, 38 | credentials: Any = None, 39 | table_name: str = None, 40 | write_disposition: TWriteDisposition = None, 41 | columns: TAnySchemaColumns = None, 42 | primary_key: TColumnNames = None, 43 | schema: Schema = None, 44 | loader_file_format: TLoaderFileFormat = None 45 | ``` 46 | """ 47 | options: AnyDict = {} 48 | if scrapy_settings: 49 | options["scrapy_settings"] = scrapy_settings 50 | 51 | if batch_size: 52 | options["batch_size"] = batch_size 53 | 54 | if queue_size: 55 | options["queue_size"] = queue_size 56 | 57 | if queue_result_timeout: 58 | options["queue_result_timeout"] = queue_result_timeout 59 | 60 | scraping_host = create_pipeline_runner(pipeline, spider, **options) 61 | 62 | if on_before_start: 63 | on_before_start(scraping_host.pipeline_runner.scraping_resource) 64 | 65 | scraping_host.run(*args, **kwargs) 66 | 67 | 68 | # This way we allow dlt init to detect scraping source it is indeed hacky 69 | # and the core team is working to provide a better alternative. 
70 | _SOURCES[run_pipeline.__qualname__] = SourceInfo( 71 | ScrapingConfig, 72 | run_pipeline, 73 | inspect.getmodule(run_pipeline), 74 | ) 75 | -------------------------------------------------------------------------------- /scraping-source/scraping/queue.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from queue import Empty, Queue 3 | 4 | from dlt.common import logger 5 | 6 | 7 | # Please read more at https://mypy.readthedocs.io/en/stable/runtime_troubles.html#not-generic-runtime 8 | T = t.TypeVar("T") 9 | 10 | if t.TYPE_CHECKING: 11 | 12 | class _Queue(Queue[T]): 13 | pass 14 | 15 | else: 16 | 17 | class _Queue(Queue, t.Generic[T]): 18 | pass 19 | 20 | 21 | class QueueClosedError(Exception): 22 | pass 23 | 24 | 25 | class ScrapingQueue(_Queue[T]): 26 | def __init__( 27 | self, 28 | maxsize: int = 0, 29 | batch_size: int = 10, 30 | read_timeout: float = 1.0, 31 | ) -> None: 32 | super().__init__(maxsize) 33 | self.batch_size = batch_size 34 | self.read_timeout = read_timeout 35 | self._is_closed = False 36 | 37 | def get_batches(self) -> t.Iterator[t.Any]: 38 | """Batching helper can be wrapped as a dlt.resource 39 | 40 | Returns: 41 | Iterator[Any]: yields scraped items one by one 42 | """ 43 | batch: t.List[T] = [] 44 | while True: 45 | if len(batch) == self.batch_size: 46 | yield batch 47 | batch = [] 48 | 49 | try: 50 | if self.is_closed: 51 | raise QueueClosedError("Queue is closed") 52 | 53 | item = self.get(timeout=self.read_timeout) 54 | batch.append(item) 55 | 56 | # Mark task as completed 57 | self.task_done() 58 | except Empty: 59 | if batch: 60 | yield batch 61 | batch = [] 62 | except QueueClosedError: 63 | logger.info("Queue is closed, stopping...") 64 | 65 | # Return the last batch before exiting 66 | if batch: 67 | yield batch 68 | 69 | break 70 | 71 | def stream(self) -> t.Iterator[t.Any]: 72 | """Streaming generator, wraps get_batches 73 | and handles `GeneratorExit` if dlt closes it. 74 | 75 | Returns: 76 | t.Iterator[t.Any]: returns batches of scraped content 77 | """ 78 | try: 79 | yield from self.get_batches() 80 | except GeneratorExit: 81 | self.close() 82 | 83 | def close(self) -> None: 84 | """Marks queue as closed""" 85 | self._is_closed = True 86 | 87 | @property 88 | def is_closed(self) -> bool: 89 | return self._is_closed 90 | -------------------------------------------------------------------------------- /pyladies-2024-demo/poke_pipeline.py: -------------------------------------------------------------------------------- 1 | import dlt 2 | from dlt.sources.helpers import requests 3 | 4 | 5 | @dlt.source(max_table_nesting=2) 6 | def source(pokemon_api_url: str): 7 | # note that we deselect `pokemon_list` - we do not want it to be loaded 8 | @dlt.resource(write_disposition="replace", selected=False) 9 | def pokemon_list(): 10 | """Retrieve a first page of Pokemons and yield it. 
We do not retrieve all the pages in this example""" 11 | yield requests.get(pokemon_api_url).json()["results"] 12 | 13 | # transformer that retrieves a list of objects in parallel 14 | @dlt.transformer 15 | def pokemon(pokemons): 16 | """Yields details for a list of `pokemons`""" 17 | 18 | # @dlt.defer marks a function to be executed in parallel 19 | # in a thread pool 20 | @dlt.defer 21 | def _get_pokemon(_pokemon): 22 | return requests.get(_pokemon["url"]).json() 23 | 24 | # call and yield the function result normally, the @dlt.defer takes care of parallelism 25 | for _pokemon in pokemons: 26 | yield _get_pokemon(_pokemon) 27 | 28 | # a special case where just one item is retrieved in transformer 29 | # a whole transformer may be marked for parallel execution 30 | @dlt.transformer(parallelized=True) 31 | def species(pokemon_details): 32 | """Yields species details for a pokemon""" 33 | species_data = requests.get(pokemon_details["species"]["url"]).json() 34 | # link back to pokemon so we have a relation in loaded data 35 | species_data["pokemon_id"] = pokemon_details["id"] 36 | # You can return the result instead of yield since the transformer only generates one result 37 | return species_data 38 | 39 | # create two simple pipelines with | operator 40 | # 1. send list of pokemons into `pokemon` transformer to get pokemon details 41 | # 2. send pokemon details into `species` transformer to get species details 42 | # NOTE: dlt is smart enough to get data from pokemon_list and pokemon details once 43 | 44 | return (pokemon_list | pokemon, pokemon_list | pokemon | species) 45 | 46 | 47 | if __name__ == "__main__": 48 | # build duck db pipeline 49 | pipeline = dlt.pipeline( 50 | pipeline_name="pokemon", 51 | destination="duckdb", 52 | dataset_name="pokemon_data", 53 | dev_mode=True 54 | ) 55 | 56 | # the pokemon_list resource does not need to be loaded 57 | load_info = pipeline.run(source("https://pokeapi.co/api/v2/pokemon")) 58 | print(pipeline.last_trace.last_normalize_info) 59 | print(load_info) 60 | -------------------------------------------------------------------------------- /secrets-providers-demo/dlt_with_google_secrets_pipeline.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import dlt 4 | import requests 5 | from dlt.common.configuration.inject import with_config 6 | from dlt.common.configuration.specs import GcpServiceAccountCredentials 7 | from google.cloud import secretmanager 8 | 9 | 10 | @with_config(sections=("google_secrets",)) 11 | def get_secret_dict( 12 | secret_id, credentials: GcpServiceAccountCredentials = dlt.secrets.value 13 | ): 14 | """ 15 | Retrieve a secret from Google Cloud Secret Manager and convert to a dictionary. 16 | 17 | Args: 18 | secret_id (str): ID of the secret to retrieve. 19 | credentials (GcpServiceAccountCredentials): Credentials for accessing the secret manager. 20 | 21 | Returns: 22 | dict: The secret data as a dictionary. 
23 | """ 24 | # Create the Secret Manager client with provided credentials 25 | client = secretmanager.SecretManagerServiceClient( 26 | credentials=credentials.to_native_credentials() 27 | ) 28 | # Build the resource name of the secret version 29 | name = f"projects/{credentials.project_id}/secrets/{secret_id}/versions/latest" 30 | 31 | # Access the secret version 32 | response = client.access_secret_version(request={"name": name}) 33 | # Decode the payload to a string and convert it to a dictionary 34 | secret_string = response.payload.data.decode("UTF-8") 35 | secret_dict = json.loads(secret_string) 36 | 37 | return secret_dict 38 | 39 | 40 | @dlt.resource() 41 | def get_repositories( 42 | api_token: str = dlt.secrets.value, organization: str = dlt.secrets.value 43 | ): 44 | """ 45 | Retrieve repositories of a specified organization from GitHub. 46 | 47 | Args: 48 | api_token (str): GitHub API token for authentication. 49 | organization (str): The GitHub organization from which to retrieve repositories. 50 | 51 | Yields: 52 | list: A list of repositories for the specified organization. 53 | """ 54 | BASE_URL = "https://api.github.com" 55 | url = f"{BASE_URL}/orgs/{organization}/repos" 56 | headers = { 57 | "Authorization": f"token {api_token}", 58 | "Accept": "application/vnd.github+json", 59 | } 60 | 61 | response = requests.get(url, headers=headers) 62 | response.raise_for_status() # Ensure that a HTTP error is raised for bad responses 63 | yield response.json() 64 | 65 | 66 | if __name__ == "__main__": 67 | secret_data = get_secret_dict("temp-secret") 68 | data = get_repositories(api_token=secret_data["api_token"], organization="dlt-hub") 69 | 70 | pipeline = dlt.pipeline( 71 | pipeline_name="quick_start", destination="duckdb", dataset_name="mydata" 72 | ) 73 | load_info = pipeline.run(data, table_name="repos") 74 | 75 | print(load_info) 76 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/README.md: -------------------------------------------------------------------------------- 1 | # dlt_dbt_cloud 2 | Repository with demos of using DLT and DBT Cloud 3 | 4 | ## Installation 5 | 6 | ```sh 7 | pip install dlt[bigquery] 8 | ``` 9 | 10 | ## Set up the pokemon pipeline 11 | 12 | To get started with this data pipeline, follow these steps: 13 | 14 | ### Init the pipeline 15 | 16 | Enter the following command: 17 | 18 | ```sh 19 | dlt init pokemon bigquery 20 | ``` 21 | 22 | For more information, read the 23 | [Add a verified source.](https://dlthub.com/docs/walkthroughs/add-a-verified-source) 24 | 25 | ### Add credentials 26 | 27 | 1. In the `.dlt` folder, there's a file called `secrets.toml`. It's where you store sensitive 28 | information securely, like access tokens. Keep this file safe. 29 | 30 | Use the following format for service account authentication: 31 | 32 | ```toml 33 | [sources.source_name] 34 | secret = "Please set me up!" 35 | ``` 36 | 37 | [Pokemon verified source](https://github.com/dlt-hub/verified-sources/tree/master/sources/pokemon) 38 | doesn't require authentication, so we don't need to provide credentials. 39 | 40 | 2. Enter credentials for the BigQuery destination as per the [docs](https://dlthub.com/docs/dlt-ecosystem/destinations/bigquery): 41 | ```toml 42 | [destination.bigquery] 43 | location = "US" 44 | 45 | [destination.bigquery.credentials] 46 | project_id = "project_id" # please set me up! 47 | private_key = "private_key" # please set me up! 48 | client_email = "client_email" # please set me up! 
49 | ``` 50 | 51 | 52 | For more information, read the [General Usage: Credentials.](https://dlthub.com/docs/general-usage/credentials) 53 | 54 | ## Set up the dbt Cloud 55 | 56 | ### Sign in dbt Cloud 57 | Go through this [Quickstart for dbt Cloud and BigQuery](https://docs.getdbt.com/quickstarts/bigquery?step=1). 58 | 59 | ### Create the dbt model 60 | Create the model for your data with the tutorial: [How to build SQL models](https://docs.getdbt.com/docs/build/sql-models). 61 | 62 | ### Update pipeline script 63 | 64 | Add the following code into your pipeline script (`pipelines/pokemon_pipeline.py`): 65 | 66 | ```python 67 | from dlt.helpers.dbt_cloud import run_dbt_cloud_job 68 | 69 | run_info = run_dbt_cloud_job() 70 | print(f"Job run status: {run_info['status_humanized']}") 71 | ``` 72 | 73 | ### Credentials 74 | 75 | Use the following format for dbt Cloud API authentication in `.dlt/secrets.toml`: 76 | 77 | ```toml 78 | [dbt_cloud] 79 | api_token = "set me up!" # required for authentication 80 | account_id = "set me up!" # required for both helpers function 81 | job_id = "set me up!" # optional only for run_dbt_cloud_job function (you can pass this explicitly as an argument to the function) 82 | ``` 83 | 84 | More information about dbt cloud helpers in [DBT Cloud Client and Helper Functions](https://dlthub.com/docs/dlt-ecosystem/transformations/dbt/dbt_cloud). 85 | 86 | ## Run the pipeline 87 | 88 | Now you are ready to run the pipeline! To get started, run the following command: 89 | 90 | ```bash 91 | python pokemon_pipeline.py 92 | ``` 93 | -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/example_api_responses/coin_details.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "btc-bitcoin", 3 | "name": "Bitcoin", 4 | "symbol": "BTC", 5 | "parent": { 6 | "id": "eth-ethereum", 7 | "name": "Ethereum", 8 | "symbol": "ETH" 9 | }, 10 | "rank": 1, 11 | "is_new": false, 12 | "is_active": true, 13 | "type": "coin", 14 | "logo": "https://static.coinpaprika.com/coin/bnb-binance-coin/logo.png", 15 | "tags": [ 16 | { 17 | "id": "blockchain-service", 18 | "name": "Blockchain Service", 19 | "coin_counter": 160, 20 | "ico_counter": 80 21 | } 22 | ], 23 | "team": [ 24 | { 25 | "id": "vitalik-buterin", 26 | "name": "Vitalik Buterin", 27 | "position": "Author" 28 | } 29 | ], 30 | "description": "Bitcoin is a cryptocurrency and worldwide payment system. 
It is the first decentralized digital currency, as the system works without a central bank or single administrator.", 31 | "message": "string", 32 | "open_source": true, 33 | "hardware_wallet": true, 34 | "started_at": "2009-01-03T00:00:00Z", 35 | "development_status": "Working product", 36 | "proof_type": "Proof of work", 37 | "org_structure": "Decentralized", 38 | "hash_algorithm": "SHA256", 39 | "contract": "string", 40 | "platform": "string", 41 | "contracts": [ 42 | { 43 | "contract": "string", 44 | "platform": "string", 45 | "type": "string" 46 | } 47 | ], 48 | "links": { 49 | "explorer": [ 50 | "http://blockchain.com/explorer", 51 | "https://blockchair.com/bitcoin/blocks", 52 | "https://blockexplorer.com/", 53 | "https://live.blockcypher.com/btc/" 54 | ], 55 | "facebook": [ 56 | "https://www.facebook.com/bitcoins/" 57 | ], 58 | "reddit": [ 59 | "https://www.reddit.com/r/bitcoin" 60 | ], 61 | "source_code": [ 62 | "https://github.com/bitcoin/bitcoin" 63 | ], 64 | "website": [ 65 | "https://bitcoin.org/" 66 | ], 67 | "youtube": [ 68 | "https://www.youtube.com/watch?v=Um63OQz3bjo" 69 | ], 70 | "medium": null 71 | }, 72 | "links_extended": [ 73 | { 74 | "url": "http://blockchain.com/explorer", 75 | "type": "explorer" 76 | }, 77 | { 78 | "url": "https://www.reddit.com/r/bitcoin", 79 | "type": "reddit", 80 | "stats": { 81 | "subscribers": 1009135 82 | } 83 | }, 84 | { 85 | "url": "https://github.com/bitcoin/bitcoin", 86 | "type": "source_code", 87 | "stats": { 88 | "contributors": 730, 89 | "stars": 36613 90 | } 91 | }, 92 | { 93 | "url": "https://bitcoin.org/", 94 | "type": "website" 95 | } 96 | ], 97 | "whitepaper": { 98 | "link": "https://static.coinpaprika.com/storage/cdn/whitepapers/215.pdf", 99 | "thumbnail": "https://static.coinpaprika.com/storage/cdn/whitepapers/217.jpg" 100 | }, 101 | "first_data_at": "2018-10-03T11:48:19Z", 102 | "last_data_at": "2019-05-03T11:00:00" 103 | } -------------------------------------------------------------------------------- /scraping-source/README.md: -------------------------------------------------------------------------------- 1 | # Scraping with dlt 2 | 3 | Scraping source allows you to scrape content from web and uses [Scrapy](https://doc.scrapy.org/en/latest/) 4 | to enable this capability. 5 | 6 | It is possible to access and manipulate a scraping resource via (please see `scraping_pipeline.py`) 7 | 8 | 1. `on_before_start` callback which will receive a `DltResource` as the only argument, 9 | 2. The advanced scraping pipeline builder `scraping.helpers.create_pipeline_runner` 10 | 11 | 12 | ## 🎲 Configuration 13 | 14 | It is possible to provide configuration via `.dlt/config.toml` below you can see an example 15 | 16 | ```toml 17 | [sources.scraping] 18 | start_urls = [ 19 | "https://quotes.toscrape.com/page/1/" 20 | ] 21 | start_urls_file="/path/to/urls.txt" 22 | ``` 23 | 24 | When both `start_urls` and `start_urls_file` they will be merged and deduplicated so Scrapy 25 | gets a unique set of `start_urls`. 26 | 27 | ## 🏎️ Running the pipeline 28 | 29 | Install requirements and run the pipeline 30 | 31 | ```sh 32 | pip install -r requirements.txt 33 | python scraping_pipeline.py 34 | ``` 35 | 36 | ## Implementing a spider 37 | 38 | It is your responsibility to implement the spider and data extraction logic from the responses 39 | because our runner expects spider class, please see as a reference an example of spider in `scraping_pipeline.py`. 
40 | For more information about spider implementation please also see [Scrapy docs](https://docs.scrapy.org/en/latest/topics/spiders.html). 41 | 42 | ## Configuring Scrapy 43 | 44 | You can pass scrapy settings via 45 | 46 | 1. `run_pipeline(..., scrapy_settings={...})`, 47 | 2. `create_pipeline_runner(..., scrapy_settings={...})`, 48 | 3. Overriding defaults in `settings.py`. 49 | 50 | Example: 51 | ```py 52 | run_pipeline( 53 | pipeline, 54 | MySpider, 55 | scrapy_settings={ 56 | # How many sub pages to scrape 57 | # https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit 58 | "DEPTH_LIMIT": 0, 59 | "SPIDER_MIDDLEWARES": { 60 | "scrapy.spidermiddlewares.depth.DepthMiddleware": 200, 61 | "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 300, 62 | }, 63 | "HTTPERROR_ALLOW_ALL": True, 64 | }, 65 | ) 66 | ``` 67 | 68 | Note: this is just a shallow merge. 69 | Also log level is automatically set in sync with the one 70 | dlt provides so providing it via `scrapy_settings` as `"LOG_LEVEL": "DEBUG"` will not work, 71 | please see [logging documentation](https://dlthub.com/docs/running-in-production/running#set-the-log-level-and-format) for dlt. 72 | 73 | ## 🧐 Introspection using streamlit 74 | 75 | NOTE: you might need to set up `streamlit`, `pip install streamlit` 76 | 77 | ```sh 78 | dlt pipeline show 79 | ``` 80 | 81 | ## 🧠 How it works? 82 | 83 | Under the hood we run DLT [pipeline](https://dlthub.com/docs/api_reference/pipeline) in a separate thread while scrapy is running in the main thread. 84 | 85 | Communication between the two is done via the queue, where 86 | 87 | - Spider is responsible to put the results in the queue, 88 | - DLT resource collects and batches results from the queue. 89 | 90 | ![simple diagram](./diagram.png) 91 | 92 |
Enjoy it! 93 | 94 | ✨ 🚀 ✨
95 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/src/functions/dlt_to_weaviate.py: -------------------------------------------------------------------------------- 1 | import time 2 | import traceback 3 | 4 | import dlt 5 | import pendulum 6 | import requests 7 | from dlt.destinations.adapters import weaviate_adapter 8 | from pydantic import BaseModel 9 | from restack_ai.function import function, log 10 | 11 | 12 | class PipelineInput(BaseModel): 13 | pipeline_name: str 14 | destination: str 15 | add_limit: int 16 | dev_mode: bool 17 | 18 | 19 | @function.defn() 20 | async def anime_pipeline(input: PipelineInput) -> str: 21 | @dlt.resource(table_name="anime", write_disposition="merge", primary_key="mal_id") 22 | def get_anime( 23 | aired_from=dlt.sources.incremental( 24 | "aired.from", initial_value="2024-07-01T00:00:00+00:00" 25 | ) 26 | ): 27 | yield pagination( 28 | "anime", 29 | params={ 30 | "order_by": "start_date", 31 | "start_date": remove_hms(aired_from.last_value), 32 | "status": "airing", 33 | }, 34 | base_url="https://api.jikan.moe/v4/", 35 | ) 36 | 37 | try: 38 | # Set pipeline name, destination, and dataset name 39 | pipeline = dlt.pipeline( 40 | pipeline_name=input.pipeline_name, 41 | destination=input.destination, 42 | progress="log", 43 | dev_mode=input.dev_mode, 44 | ) 45 | 46 | data = get_anime().add_limit(input.add_limit) 47 | 48 | if input.destination == "weaviate": 49 | data = weaviate_adapter( 50 | data, 51 | vectorize=["title", "synopsis"], 52 | ) 53 | 54 | # Run the pipeline using the defined resource 55 | pipeline.run(data) 56 | return str(pipeline.last_trace) 57 | except Exception as e: 58 | log.error("Something went wrong!", error=e) 59 | log.error(traceback.format_exc()) 60 | raise e 61 | 62 | 63 | def remove_hms(date: str) -> str: 64 | date_obj = pendulum.parse(date) 65 | return date_obj.to_date_string() 66 | 67 | 68 | def pagination(endpoint: str, params: dict, base_url: str): 69 | has_next_page = True 70 | page = 1 71 | retries = 0 72 | 73 | while has_next_page: 74 | params.update({"page": page}) 75 | 76 | try: 77 | response = requests.get(f"{base_url}{endpoint}", params) 78 | if response.status_code == 429: # Rate limit error 79 | wait_time = min(2 ** retries, 60) # Exponential backoff, max 60s 80 | print(f"Rate limit hit. Retrying in {wait_time} seconds...") 81 | time.sleep(wait_time) 82 | retries += 1 83 | continue # Retry the same request 84 | 85 | response.raise_for_status() # Raise an error for 4xx or 5xx responses 86 | retries = 0 # Reset retries on a successful request 87 | data = response.json() 88 | 89 | if "data" in data: 90 | yield data["data"] 91 | 92 | has_next_page = data.get("pagination", {}).get("has_next_page", False) 93 | page += 1 94 | 95 | except requests.RequestException as e: 96 | print(f"Request failed for {endpoint}: {e}") 97 | break 98 | 99 | time.sleep(0.5) -------------------------------------------------------------------------------- /iceberg-tabular/README.md: -------------------------------------------------------------------------------- 1 | # Demo: GitHub Issues data pipeline 2 | 3 | This Python script utilizes the `dlt` library to create a data pipeline for extracting and loading 4 | GitHub issues data into [Athena/Glue Catalog](https://dlthub.com/docs/dlt-ecosystem/destinations/athena). 
5 | The pipeline focuses on fetching open issues from a specified GitHub 6 | repository, storing data as parquet files in s3 buckets and creating external tables in AWS Glue Catalog. 7 | You can then query those tables with Athena SQL commands which 8 | will then scan the whole folder of parquet files and return the results. 9 | 10 | In this demo we load data in Iceberg format (`force_iceberg = "True"`). 11 | We can use AWS Glue as a Data Catalog, or we can load the Iceberg data into Tabular.io. 12 | 13 | ## Prerequisites 14 | 15 | Before using the script, ensure you have the following prerequisites installed: 16 | 17 | - Python 18 | - `dlt` library with Athena dependencies (`pip install dlt[athena]`) 19 | 20 | 21 | ## Usage 22 | 23 | * **Clone the Repository:** 24 | 25 | ```bash 26 | git clone https://github.com/dlt-hub/dlt_demos.git 27 | cd dlt_demos/iceberg-tabular 28 | ``` 29 | * **Install Dependencies:** 30 | 31 | ```bash 32 | pip install -r requirements.txt 33 | ``` 34 | * **Credentials Configuration:** 35 | 36 | Copy `secrets.toml`: 37 | ```bash 38 | cp .dlt/example.secrets.toml .dlt/secrets.toml 39 | ``` 40 | Ensure you set up the necessary credentials for the filesystem (S3) and Athena destinations in your 41 | `secrets.toml` file. Replace the placeholders with your actual credentials. 42 | 43 | ```toml 44 | [destination.filesystem] 45 | bucket_url = "s3://[your_bucket_name]" # replace with your bucket name, 46 | 47 | [destination.filesystem.credentials] 48 | aws_access_key_id = "please set me up!" # copy the access key here 49 | aws_secret_access_key = "please set me up!" # copy the secret access key here 50 | 51 | [destination.athena] 52 | force_iceberg = "True" # load data in the iceberg format 53 | query_result_bucket="s3://[results_bucket_name]" # replace with your query results bucket name 54 | 55 | [destination.athena.credentials] 56 | aws_access_key_id="please set me up!" # same as credentials for filesystem 57 | aws_secret_access_key="please set me up!" # same as credentials for filesystem 58 | region_name="please set me up!" # set your aws region, for example "eu-central-1" for Frankfurt 59 | database="awsdatacatalog" 60 | ``` 61 | * **Run the Script:** 62 | 63 | ```bash 64 | python github_pipeline.py --organisation-name=dlt-hub --repo-name=dlt 65 | ``` 66 | CLI Options: 67 | * `--organisation-name`: GitHub organization name (required). 68 | * `--repo-name`: GitHub repository name (required). 69 | * `--pipeline-name`: Name of the dlt pipeline. 70 | * `--dataset-name`: Name of the dataset. 71 | 72 | 73 | ## Notes 74 | 75 | - The script reads only open issues to minimize the number of API calls, considering the limit for 76 | non-authenticated GitHub users. 77 | - The `updated_at` parameter ensures that only issues updated since the last execution are fetched. 78 | - The pipeline utilizes Athena as the destination for storing the GitHub issues data. 79 | 80 | Feel free to customize the script and pipeline configuration according to your requirements. 
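If you prefer to check the loaded tables from Python instead of the Athena console, the pipeline's SQL client can issue the query for you. A minimal sketch, assuming the default `--pipeline-name` and `--dataset-name` values used above (adjust them if you passed different options):

```python
import dlt

# Attach to the pipeline created by github_pipeline.py (default names assumed)
pipeline = dlt.pipeline(
    pipeline_name="github_pipeline",
    destination="athena",
    dataset_name="issues",
)

# The query is executed by Athena over the Iceberg-backed `issues` table
with pipeline.sql_client() as client:
    rows = client.execute_sql(
        "SELECT state, count(*) AS issue_count FROM issues GROUP BY state"
    )
    for row in rows:
        print(row)
```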
81 | -------------------------------------------------------------------------------- /scraping-source/scraping/helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | 4 | import dlt 5 | from dlt.common.configuration.inject import with_config 6 | from dlt.common.configuration.specs.base_configuration import ( 7 | configspec, 8 | BaseConfiguration, 9 | ) 10 | 11 | from scrapy import Spider # type: ignore 12 | 13 | from .queue import ScrapingQueue 14 | from .settings import SOURCE_SCRAPY_QUEUE_SIZE, SOURCE_SCRAPY_SETTINGS 15 | from .runner import ScrapingHost, PipelineRunner, ScrapyRunner, Signals 16 | from .types import AnyDict 17 | 18 | 19 | @configspec 20 | class ScrapingConfig(BaseConfiguration): 21 | # Batch size for scraped items 22 | batch_size: int = 100 23 | 24 | # maxsize for queue 25 | queue_size: t.Optional[int] = SOURCE_SCRAPY_QUEUE_SIZE 26 | 27 | # result wait timeout for our queue 28 | queue_result_timeout: t.Optional[float] = 1.0 29 | 30 | # List of start urls 31 | start_urls: t.List[str] = None 32 | start_urls_file: str = None 33 | 34 | 35 | @with_config(sections=("sources", "scraping"), spec=ScrapingConfig) 36 | def resolve_start_urls( 37 | start_urls: t.Optional[t.List[str]] = dlt.config.value, 38 | start_urls_file: t.Optional[str] = dlt.config.value, 39 | ) -> t.List[str]: 40 | """Merges start urls 41 | If both `start_urls` and `start_urls_file` given, we will merge them 42 | and return deduplicated list of `start_urls` for scrapy spider. 43 | """ 44 | urls = set() 45 | if os.path.exists(start_urls_file): 46 | with open(start_urls_file, encoding="utf-8") as fp: 47 | urls = {line for line in fp.readlines() if str(line).strip()} 48 | 49 | if start_urls: 50 | for url in start_urls: 51 | urls.add(url) 52 | 53 | return list(set(urls)) 54 | 55 | 56 | @with_config(sections=("sources", "scraping"), spec=ScrapingConfig) 57 | def create_pipeline_runner( 58 | pipeline: dlt.Pipeline, 59 | spider: t.Type[Spider], 60 | batch_size: int = dlt.config.value, 61 | queue_size: int = dlt.config.value, 62 | queue_result_timeout: float = dlt.config.value, 63 | scrapy_settings: t.Optional[AnyDict] = None, 64 | ) -> ScrapingHost: 65 | """Creates scraping host instance 66 | This helper only creates pipeline host, so running and controlling 67 | scrapy runner and pipeline is completely delegated to advanced users 68 | """ 69 | queue = ScrapingQueue( # type: ignore 70 | maxsize=queue_size, 71 | batch_size=batch_size, 72 | read_timeout=queue_result_timeout, 73 | ) 74 | 75 | signals = Signals( 76 | pipeline_name=pipeline.pipeline_name, 77 | queue=queue, 78 | ) 79 | 80 | # Just to simple merge 81 | settings = {**SOURCE_SCRAPY_SETTINGS} 82 | if scrapy_settings: 83 | settings = {**scrapy_settings} 84 | 85 | scrapy_runner = ScrapyRunner( 86 | spider=spider, 87 | start_urls=resolve_start_urls(), 88 | signals=signals, 89 | settings=settings, 90 | ) 91 | 92 | pipeline_runner = PipelineRunner( 93 | pipeline=pipeline, 94 | queue=queue, 95 | ) 96 | 97 | scraping_host = ScrapingHost( 98 | queue, 99 | scrapy_runner, 100 | pipeline_runner, 101 | ) 102 | 103 | return scraping_host 104 | -------------------------------------------------------------------------------- /pyladies-2024-demo/README.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | `dlt` is an open-source library that you can add to your Python scripts to load data from various and often messy data sources into well 
structured, live datasets. Below we give you a preview of how you can get data from APIs, files, Python objects or pandas dataframes and move it into a local or remote database, data lake or a vector data store. 4 | 5 | Let's get started! 6 | 7 | ## Installation 8 | 9 | Official releases of dlt can be installed from [PyPI](https://pypi.org/project/dlt/): 10 | 11 | ```shell 12 | pip install dlt 13 | ``` 14 | 15 | The command above installs just the library core; in the example below we use `duckdb` as a [destination](https://dlthub.com/docs/dlt-ecosystem/destinations), so let's add it: 16 | 17 | ```shell 18 | pip install -q "dlt[duckdb]" 19 | ``` 20 | 21 | > Use a clean virtual environment for your experiments! Here are [detailed instructions](https://dlthub.com/docs/reference/installation). 22 | 23 | ## Quick start 24 | 25 | Let's load a list of Python objects (dicts) into a `duckdb` database and inspect the created dataset. 26 | 27 | > We are going to use `dev_mode` for our test examples. If you create a new pipeline script, you will be experimenting a lot. 28 | > If you want the pipeline to reset its state and load data into a new dataset on each run, set the `dev_mode` argument of the `dlt.pipeline` method to True. 29 | > Each time the pipeline is created, dlt adds a datetime-based suffix to the dataset name. 30 | 31 | Run the command: 32 | ```shell 33 | python getting-started.py 34 | ``` 35 | 36 | ### Now explore your data! 37 | 38 | #### If you run it locally 39 | 40 | To see the schema of your created database, run the Streamlit command `dlt pipeline <pipeline_name> show`. 41 | 42 | To use `streamlit`, install it first. 43 | 44 | ```shell 45 | pip install streamlit 46 | ``` 47 | 48 | For the example above, the pipeline name is “quick_start”, so run: 49 | 50 | ```shell 51 | dlt pipeline quick_start show 52 | ``` 53 | [This command](https://dlthub.com/docs/reference/command-line-interface#show-tables-and-data-in-the-destination) generates and launches a simple Streamlit app that you can use to inspect the schemas and data in the destination. 54 | 55 | ## Load data from a variety of sources 56 | 57 | Use dlt to load practically any data you deal with in your Python script into a dataset. 58 | 59 | The library will create/update tables, infer data types and deal with nested data automatically: 60 | - list of dicts 61 | - json 62 | - csv/parquet 63 | - API 64 | - database 65 | - etc. 66 | 67 | ### from JSON 68 | 69 | When creating a schema during normalization, dlt recursively unpacks the nested JSON structure into relational tables, creating and linking [children and parent tables](https://dlthub.com/docs/general-usage/destination-tables#nested-tables). 70 | 71 | ```shell 72 | python load_from_json.py 73 | ``` 74 | 75 | ### from API 76 | 77 | Below we load the 100 most recent followers from our [own dlt-hub organisation](https://github.com/dlt-hub/dlt) into the "followers" table. 78 | 79 | ```shell 80 | python github_pipeline.py 81 | ``` 82 | 83 | ### from Database 84 | 85 | Use the SQL source to extract data from databases like PostgreSQL, MySQL, SQLite, Oracle, and more. 86 | 87 | ```shell 88 | pip install pymysql 89 | ``` 90 | 91 | ## Real-life example 92 | 93 | For this example, we will be loading Pokemon data from the PokeAPI with the help of transformers to load Pokemon details in parallel.
94 | 95 | ```shell 96 | python poke_pipeline.py 97 | ``` 98 | -------------------------------------------------------------------------------- /scraping-source/scraping_pipeline.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import dlt 4 | from dlt.sources import DltResource 5 | from scrapy import Spider # type: ignore 6 | from scrapy.http import Response # type: ignore 7 | 8 | from scraping import run_pipeline 9 | from scraping.helpers import create_pipeline_runner 10 | 11 | 12 | class MySpider(Spider): 13 | def parse(self, response: Response, **kwargs: Any) -> Any: 14 | for next_page in response.css("li.next a::attr(href)"): 15 | if next_page: 16 | yield response.follow(next_page, self.parse) 17 | 18 | for quote in response.css("div.quote"): 19 | result = { 20 | "quote": { 21 | "text": quote.css("span.text::text").get(), 22 | "author": quote.css("small.author::text").get(), 23 | "tags": quote.css("div.tags a.tag::text").getall(), 24 | }, 25 | } 26 | yield result 27 | 28 | 29 | def scrape_quotes() -> None: 30 | pipeline = dlt.pipeline( 31 | pipeline_name="scraping", 32 | destination='duckdb', 33 | dataset_name="quotes", 34 | ) 35 | 36 | run_pipeline( 37 | pipeline, 38 | MySpider, 39 | # you can pass scrapy settings overrides here 40 | scrapy_settings={ 41 | "DEPTH_LIMIT": 10, 42 | }, 43 | write_disposition="append", 44 | ) 45 | 46 | 47 | def scrape_quotes_scrapy_configs() -> None: 48 | pipeline = dlt.pipeline( 49 | pipeline_name="scraping_custom_scrapy_configs", 50 | destination='duckdb', 51 | dataset_name="quotes", 52 | ) 53 | 54 | run_pipeline( 55 | pipeline, 56 | MySpider, 57 | # you can pass scrapy settings overrides here 58 | scrapy_settings={ 59 | # How many sub pages to scrape 60 | # https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit 61 | "DEPTH_LIMIT": 100, 62 | "SPIDER_MIDDLEWARES": { 63 | "scrapy.spidermiddlewares.depth.DepthMiddleware": 200, 64 | "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 300, 65 | }, 66 | "HTTPERROR_ALLOW_ALL": False, 67 | }, 68 | write_disposition="append", 69 | ) 70 | 71 | 72 | def scrape_quotes_callback_access_resource() -> None: 73 | pipeline = dlt.pipeline( 74 | pipeline_name="scraping_resource_callback", 75 | destination='duckdb', 76 | dataset_name="quotes", 77 | ) 78 | 79 | def on_before_start(res: DltResource) -> None: 80 | res.add_limit(2) 81 | 82 | run_pipeline( 83 | pipeline, 84 | MySpider, 85 | batch_size=10, 86 | scrapy_settings={}, 87 | on_before_start=on_before_start, 88 | write_disposition="append", 89 | ) 90 | 91 | 92 | def scrape_quotes_advanced_runner() -> None: 93 | pipeline = dlt.pipeline( 94 | pipeline_name="scraping_advanced_direct", 95 | destination='duckdb', 96 | ) 97 | scraping_host = create_pipeline_runner(pipeline, MySpider, batch_size=10) 98 | scraping_host.pipeline_runner.scraping_resource.add_limit(2) 99 | scraping_host.run(dataset_name="quotes", write_disposition="append") 100 | 101 | 102 | if __name__ == "__main__": 103 | scrape_quotes() 104 | # scrape_quotes_scrapy_configs() 105 | # scrape_quotes_callback_access_resource() 106 | # scrape_quotes_advanced_runner() 107 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | 
build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # Default .gitignore content added by dbt Cloud 163 | target/ 164 | dbt_packages/ 165 | logs/ 166 | # end dbt Cloud content 167 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/README.md: -------------------------------------------------------------------------------- 1 | # `dlt-init-openapi`, REST API Clients and `dlt` 2 | 3 | A REST API (Representational State Transfer Application Programming Interface) is a set of rules and conventions for building and interacting with web services. It allows different systems to communicate over the internet using standard HTTP methods. 4 | 5 | Generating a REST API client in Python can be done in several ways. Two popular methods are: 6 | 7 | - Manually creating the client using the `requests` library. 8 | - Automatically generating the client using an OpenAPI spec. 9 | 10 | Another method is using the [dlt rest_api source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api). 11 | 12 | `dlt` is an open-source library that you can add to your Python scripts to load data from various and often messy data sources into well-structured, live datasets. 13 | 14 | The `rest_api` source in `dlt` is a versatile and generic tool designed to help you extract data from any REST API. By using a declarative configuration, you can define API endpoints, their relationships, pagination handling, and authentication methods effortlessly. 15 | 16 | > See [dlt Rest API helpers tutorial](https://colab.research.google.com/drive/1qnzIM2N4iUL8AOX1oBUypzwoM3Hj5hhG?usp=sharing) for details. 17 | 18 | `dlt` went ahead and created a REST API client generator based on the `rest_api` source and an OpenAPI spec -- [`dlt-init-openapi`](https://pypi.org/project/dlt-init-openapi/). 19 | 20 | ## Installation 21 | 22 | ```sh 23 | pip install dlt-init-openapi 24 | ``` 25 | 26 | ## Initialize the source with Stripe OpenAPI Spec 27 | 28 | This will take a while; you have time to make a coffee... 29 | 30 | ```sh 31 | dlt-init-openapi stripe --url "https://raw.githubusercontent.com/stripe/openapi/master/openapi/spec3.json" 32 | ``` 33 | 34 | The pipeline was generated and 247 endpoints were found, but **we do not believe in magic** and we need to make sure that the source was generated correctly. 35 | For example, we need to provide the `base_url`, secrets, query parameters, etc.
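To make the kind of adjustment concrete, here is a hedged sketch of a manually tuned `rest_api` configuration for Stripe. It is not the code `dlt-init-openapi` generates verbatim: the secret location and the `customers`/`subscriptions` resource names are illustrative, and the auth and paginator choices are explained in the sections below.

```python
import dlt
from dlt.sources.helpers.rest_client.auth import HttpBasicAuth

# the `rest_api` package vendored in this demo's pipeline folder
from rest_api import rest_api_source

stripe_source = rest_api_source(
    {
        "client": {
            "base_url": "https://api.stripe.com/v1/",
            # Stripe expects the API key as the Basic Auth username with an empty password
            "auth": HttpBasicAuth(dlt.secrets["sources.stripe.api_key"], ""),
            # cursor-based pagination via `starting_after`, see the Pagination section
            "paginator": {
                "type": "cursor",
                "cursor_path": "id",
                "cursor_param": "starting_after",
            },
        },
        "resources": ["customers", "subscriptions"],
    }
)
```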
36 | 37 | Stripe is well known for its high-quality API and documentation, so you will find 38 | [here](https://docs.stripe.com/api) all required information: 39 | - base url; 40 | - authentication type; 41 | - pagination type; 42 | - available query parameters; 43 | - child endpoints. 44 | 45 | 46 | Walk through this [dlt REST API tutorial](https://colab.research.google.com/drive/1qnzIM2N4iUL8AOX1oBUypzwoM3Hj5hhG?usp=sharing) to learn how to investigate API documentation and avoid struggling with building a REST API client. 47 | 48 | We're gonna explore a few endpoints: [customers list](https://docs.stripe.com/api/customers/list), [subscriptions list](https://docs.stripe.com/api/subscriptions/list). 49 | 50 | ## Authentication 51 | 52 | As you know, to gain access to the API you will need a secret token. [Here is a guide](https://docs.stripe.com/keys) how to get the key. 53 | 54 | 55 | Let's explore the [Stripe API Authentication methods.](https://docs.stripe.com/stripe-apps/api-authentication) 56 | 57 | It says here: 58 | 59 | >Authentication to the API is performed via HTTP Basic Auth. Provide your API key as the basic auth username value. You do not need to provide a password. 60 | > 61 | 62 | 63 | ## Pagination 64 | 65 | Well, let's take a look at how the tool coped with pagination. 66 | First, we will find out what type of pagination the Stripe API has: 67 | 68 | >Stripe’s list API methods use **cursor-based pagination** through the `starting_after` and `ending_before` parameters. 69 | > Both parameters accept an existing object `ID` value (see below) and return objects in reverse chronological order. 70 | > 71 | 72 | We can easily fix it, go to [the rest_api documentation](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api#pagination) and find correct pagination type: 73 | 74 | >**JSONResponseCursorPaginator** handles pagination based on a cursor in the JSON response. \ 75 | *Parameters*: \ 76 | `cursor_path`: A JSONPath expression pointing to the cursor in the JSON response. This cursor is used to fetch subsequent pages. Defaults to "cursors.next".\ 77 | `cursor_param`: The query parameter used to send the cursor value in the next request. Defaults to "after". 78 | 79 | 80 | ``` 81 | "paginator": { 82 | "type": "cursor", 83 | "cursor_path": "id", 84 | "cursor_param": "starting_after", 85 | }, 86 | ``` 87 | 88 | 89 | ## Run the pipeline 90 | 91 | ```shell 92 | python stripe_pipeline.py 93 | ``` -------------------------------------------------------------------------------- /dlt_restack_demo/README.md: -------------------------------------------------------------------------------- 1 | # Anime data pipeline with RAG using dlt, Restack and Weaviate 2 | 3 | This project demonstrates how to: 4 | 5 | - Extract and store anime data from the Jikan API into Weaviate using dlt. 6 | - Implement a Retrieval-Augmented Generation (RAG) workflow to answer anime-related questions using OpenAI's GPT model and Weaviate's vector search. 
7 | 8 | ## Start Restack 9 | 10 | To start Restack, use the following Docker command: 11 | 12 | ```bash 13 | docker run -d --pull always --name restack -p 5233:5233 -p 6233:6233 -p 7233:7233 ghcr.io/restackio/restack:main 14 | ``` 15 | 16 | ## Run Weaviate 17 | 18 | ```shell 19 | docker run -p 8080:8080 -p 50051:50051 -e ENABLE_MODULES=text2vec-openai,generative-openai cr.weaviate.io/semitechnologies/weaviate:1.28.4 20 | ``` 21 | 22 | ## Add environment variables 23 | 24 | Copy `example.secrets.toml` from the `.dlt` folder, rename it to `secrets.toml`, and add your OpenAI key there. 25 | 26 | ``` 27 | [destination.weaviate.credentials.additional_headers] 28 | X-OpenAI-Api-Key = "..." 29 | 30 | [openai] 31 | api_key = "..." 32 | ``` 33 | 34 | ## Set up a Python environment 35 | 36 | ``` 37 | cd restack-app 38 | ``` 39 | 40 | If using uv: 41 | 42 | ```bash 43 | uv venv && source .venv/bin/activate 44 | ``` 45 | 46 | If using pip: 47 | 48 | ```bash 49 | python -m venv .venv && source .venv/bin/activate 50 | ``` 51 | 52 | ## Install dependencies 53 | 54 | If using uv: 55 | 56 | ```bash 57 | uv sync 58 | uv run dev 59 | ``` 60 | 61 | If using pip: 62 | 63 | ```bash 64 | pip install -e . 65 | python -c "from src.services import watch_services; watch_services()" 66 | ``` 67 | 68 | ## Usage 69 | 70 | ### Data ingestion pipeline 71 | 72 | #### How it works 73 | 74 | - Fetches incremental anime data from the Jikan API. 75 | - Handles rate limits with exponential backoff. 76 | - Stores structured and vectorized data in Weaviate. 77 | 78 | #### Run workflows from UI 79 | 80 | You can run workflows from the UI by clicking the "Run" button. 81 | 82 | ![Run workflows from UI](img/UI.png) 83 | 84 | You should provide input in JSON format: 85 | 86 | | Parameter | Type | Description | 87 | |-----------------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------| 88 | | `pipeline_name` | `str` | The name of the pipeline. It is used to restore its state and data schemas on subsequent runs. | 89 | | `destination` | `str` | The destination where the anime data will be stored. Defaults to `"weaviate"`. You can use `duckdb` for development and debugging. | 90 | | `add_limit` | `int` | The maximum number of API pages to fetch and process in a single run. Defaults to 2. Set to `-1` if you want to fetch all data. | 91 | | `dev_mode` | `bool` | When `dev_mode` is enabled, `dlt` creates a versioned dataset each time you run the pipeline. This means the data is loaded into a new dataset (a new database schema) with each run. | 92 | 93 | ![Pass input](img/run.png) 94 | 95 | #### Pipeline execution example 96 | 97 | The pipeline finalizes the execution and logs the results, ensuring the data is successfully ingested into the destination. 98 | 99 | ![See results](img/results.png) 100 | 101 | #### Incremental loading 102 | 103 | Incremental loading ensures that only new or updated data is fetched and processed when the pipeline runs again. 104 | Instead of reloading all data, the pipeline tracks the last processed 105 | timestamp (e.g., `aired.from`) and retrieves only new entries since that point. 106 | 107 | If you rerun the pipeline without new data being available, 108 | it will detect that no new records exist and avoid unnecessary processing. 109 | This helps improve efficiency, reduce API requests, and minimize redundant storage. 110 | 111 | ![Incremental loading](img/incremental.png) 112 | 113 | 114 | 1.
**First Run** (14s 600ms Execution) 115 | 116 | - Extracted, normalized, and loaded anime data into Weaviate. 117 | - Successfully processed multiple rows and stored them in the database. 118 | 119 | 2. **Second Run** (1s Execution) 120 | 121 | - The second execution was much faster (~1 second) because no new data was available. 122 | - The output log confirms that no new records were found, and 0 load packages were sent to the destination. 123 | - This is a result of incremental loading, preventing redundant data processing. 124 | 125 | By using incremental loading, the pipeline optimizes data ingestion by only processing new entries, making subsequent runs more efficient. 126 | 127 | 128 | ### RAG query pipeline 129 | This pipeline allows **semantic search** on the stored anime data using Weaviate and OpenAI. 130 | 131 | #### How it works 132 | 1. A user asks a **question** related to anime. 133 | 2. Weaviate retrieves relevant anime descriptions using **vector search**. 134 | 3. OpenAI generates a **natural language answer** based on the retrieved data. 135 | 136 | #### Run workflows from UI 137 | 138 | You can run workflows from the UI by clicking the "Run" button. 139 | 140 | ![Run workflows from UI](img/run2.png) 141 | 142 | You should provide input in JSON format: 143 | 144 | | Parameter | Type | Description | 145 | |---------------|--------|--------------| 146 | | `pipeline_name` | `str` | Name of the pipeline, Defaults to `anime_pipeline`. | 147 | | `question` | `str` | User's query for retrieving anime information. | 148 | 149 | 150 | #### Pipeline execution example 151 | 152 | The pipeline finalizes the execution and logs the results, ensuring the answer was successfully generated. 153 | 154 | ![See results](img/results2.png) -------------------------------------------------------------------------------- /dlt-dagster-snowflake/dlt_dagster_snowflake_demo/assets/__init__.py: -------------------------------------------------------------------------------- 1 | from dagster import asset, get_dagster_logger, AssetExecutionContext, MetadataValue 2 | from dagster_snowflake import SnowflakeResource 3 | from ..dlt import hacker_news, comments, google_trends, hacker_news_full 4 | from ..resources import LocalFileStorage, DltPipeline 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | from io import BytesIO 8 | 9 | 10 | orchestration_tools = ("Kestra", "Dagster", "Airflow", "Luigi", "MageAi", "Keboola") 11 | 12 | 13 | def run_dlt_pipeline_and_generate_md(pipeline: DltPipeline, resource_data, table_name: str): 14 | """ 15 | Executes a dlt pipeline and generates a Markdown report. 16 | 17 | Args: 18 | pipeline: A DltResource object representing the dlt pipeline to be executed. 19 | resource_data: The data that will be used by the pipeline. 20 | table_name: The name of the table to be updated by the pipeline. 21 | 22 | Returns: 23 | A string containing the Markdown formatted summary of the schema updates performed by the pipeline. 
24 | """ 25 | 26 | logger = get_dagster_logger() 27 | 28 | # Create the pipeline and log the resulting load information 29 | load_info = pipeline.create_pipeline( 30 | resource_data = resource_data, 31 | table_name = table_name 32 | ) 33 | logger.info(load_info) 34 | 35 | md_content = "" 36 | # Iterate through the load packages to update the Markdown content 37 | for package in load_info.load_packages: 38 | for table_name, table in package.schema_update.items(): 39 | for column_name, column in table["columns"].items(): 40 | md_content += f"\tTable updated: {table_name}: Column changed: {column_name}: {column['data_type']}\n" 41 | 42 | return md_content 43 | 44 | 45 | @asset(group_name = "google_trends_data") 46 | def google_trends_asset(context: AssetExecutionContext, pipeline: DltPipeline) -> None: 47 | """ 48 | A Dagster asset that loads Google Trends data from the "google_trends" dlt resource to Snowflake using a dlt pipeline and documents the updates. 49 | """ 50 | 51 | dlt_resource = google_trends(orchestration_tools) 52 | md_content = run_dlt_pipeline_and_generate_md(pipeline, resource_data = dlt_resource, table_name = "google_trends_asset") 53 | 54 | context.add_output_metadata(metadata = {"Updates": MetadataValue.md(md_content)}) 55 | 56 | 57 | @asset(group_name = "google_trends_data", deps = [google_trends_asset]) 58 | def google_trends_chart(snowflake: SnowflakeResource, image_storage: LocalFileStorage) -> None: 59 | """ 60 | A Dagster asset that generates a line chart visualizing Google Trends data over time and saves it to the local storage. 61 | """ 62 | 63 | with snowflake.get_connection() as conn: 64 | google_trends = conn.cursor().execute( 65 | f"SELECT * FROM {google_trends_asset.name}" 66 | ).fetch_pandas_all() 67 | 68 | # Plot the data 69 | google_trends["DATE"] = pd.to_datetime(google_trends['DATE']) 70 | pivot_df = google_trends.pivot(index='DATE', columns='TOOL', values='HITS') 71 | plt.figure(figsize=(10, 6)) 72 | pivot_df.plot(kind='line', ax=plt.gca(), linewidth=2) 73 | plt.title('Google Trends Over Time') 74 | plt.xlabel('Date') 75 | plt.ylabel('Number of Hits') 76 | plt.legend(title='Tool') 77 | plt.grid(True) 78 | plt.xticks(rotation = 45) 79 | plt.ylim(-50, 200) # Setting y-axis range from -50 to 200 for better visbility 80 | 81 | # Save the chart as an image to local storage 82 | buffer = BytesIO() 83 | plt.savefig(buffer, format='png') 84 | buffer.seek(0) 85 | plt.close() 86 | filename = "google_trends_over_time.png" 87 | image_storage.write(filename, buffer) 88 | 89 | 90 | @asset(group_name = "hacker_news_data") 91 | def hacker_news_full_asset(context: AssetExecutionContext, pipeline: DltPipeline) -> None: 92 | """ 93 | A Dagster asset that loads Hackernews data from the "hacker_news_full" dlt source to Snowflake using a dlt pipeline and documents the updates. 94 | """ 95 | 96 | dlt_source = hacker_news_full(orchestration_tools) 97 | md_content = run_dlt_pipeline_and_generate_md(pipeline, resource_data = dlt_source, table_name = "hacke_news_full_asset") 98 | 99 | context.add_output_metadata(metadata={"Updates": MetadataValue.md(md_content)}) 100 | 101 | 102 | @asset(group_name = "hacker_news_data", deps = [hacker_news_full_asset]) 103 | def hacker_news_chart(snowflake: SnowflakeResource, image_storage: LocalFileStorage) -> None: 104 | """ 105 | A Dagster asset that generates a line chart visualizing the sentiment of comments for each tool and saves it to the local storage. 
106 | """ 107 | 108 | with snowflake.get_connection() as conn: 109 | data = conn.cursor().execute( 110 | f"SELECT TOOL_NAME, SENTIMENT, COUNT(*) AS COUNT FROM HACKER_NEWS_FULL_ASSET WHERE SENTIMENT IN ('Neutral', 'Positive', 'Negative') GROUP BY TOOL_NAME, SENTIMENT" 111 | ).fetch_pandas_all() 112 | 113 | # Plot the data 114 | pivot_df = data.pivot(index='TOOL_NAME', columns='SENTIMENT', values='COUNT').fillna(0) 115 | plt.figure(figsize=(10, 6)) 116 | pivot_df.plot(kind='bar', width=0.8) 117 | plt.title('Sentiment Counts for Each Tool on Hacker News') 118 | plt.xlabel('Tool Name') 119 | plt.ylabel('Count') 120 | plt.legend(title='Sentiment') 121 | plt.xticks(rotation=45) 122 | 123 | # Save the chart as an image to local storage 124 | buffer = BytesIO() 125 | plt.savefig(buffer, format='png', bbox_inches='tight') 126 | buffer.seek(0) 127 | plt.close() 128 | filename = "hacker_news_sentiment_counts.png" 129 | image_storage.write(filename, buffer) 130 | 131 | 132 | ''' 133 | @asset(group_name = "hacker_news_data") 134 | def hacker_news_asset(context: AssetExecutionContext, pipeline: DltResource) -> None: 135 | """ 136 | A Dagster asset that separately loads Hackernews stories from the "hacker_news" dlt resource to Snowflake using a dlt pipeline and documents the updates. 137 | """ 138 | 139 | dlt_resource = hacker_news(orchestration_tools) 140 | md_content = run_dlt_pipeline_and_generate_md(pipeline, resource_data = dlt_resource, table_name = "hacker_news_asset") 141 | 142 | context.add_output_metadata(metadata={"Updates": MetadataValue.md(md_content)}) 143 | ''' -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/README.md: -------------------------------------------------------------------------------- 1 | # Loading Nested Data from an API into a PostgreSQL Database with dlt 2 | 3 | ## Overview 4 | 5 | This demo project demonstrates how to load nested data from separate API endpoints, where multiple endpoints rely on the response of one endpoint. It demonstrates how to set up `dlt` (Data Loading Tool) resources, including transformer resources and a source that merges them into a single dataset. Additionally, it includes a pipeline that handles the data ingestion process. PostgreSQL is used as the storage destination, and data is sourced from the Coinpaprika API. 6 | 7 | ![Pipeline overview](https://storage.googleapis.com/dlt-blog-images/belgrade_demo_overview.jpg) 8 | 9 | 10 | ## Prerequisites 11 | 12 | 1. Docker Desktop 13 | 14 | > Download [Docker Desktop](https://www.docker.com/products/docker-desktop/) to download. 15 | 16 | 2. DBeaver or another database administration tool of your choice 17 | 18 | > Download [DBeaver](https://dbeaver.io/download/). 19 | 20 | Alternatively, use [DuckDB as destination](https://dlthub.com/docs/getting-started) for a simpler setup. 21 | 22 | ## Setup Guide 23 | 24 | 1. Clone this repository. 25 | 26 | 2. Install the necessary dependencies for PostgreSQL: 27 | 28 | ```bash 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | 3. Setup PostgreSQL using the public image: 33 | 34 | ```bash 35 | $ docker pull postgres 36 | ``` 37 | 38 | 4. 
Run the Docker container using the postgres:latest image with the command below: 39 | 40 | ```bash 41 | $ docker run -itd -e POSTGRES_USER=loader -e POSTGRES_PASSWORD=password -p 5432:5432 -v /data:/var/lib/postgresql/data --name postgresql postgres 42 | ``` 43 | 44 | > Replace `/data` with the absolute path to your local directory that you want to map to `/var/lib/postgresql/data` inside the container. 45 | 46 | 5. Connect to the database: 47 | 48 | ```bash 49 | PGPASSWORD=password psql -h localhost -p 5432 -U loader 50 | ``` 51 | 52 | 6. Create a new database: 53 | 54 | ```bash 55 | CREATE DATABASE demo_data; 56 | ``` 57 | 58 | 7. Create an empty `secrets.toml` in the `.dlt` directory and enter your credentials: 59 | 60 | ```env 61 | [destination.postgres.credentials] 62 | 63 | database = "demo_data" 64 | username = "loader" 65 | password = "password" # replace with your password 66 | host = "localhost" # or the IP address location of your database 67 | port = 5432 68 | connect_timeout = 15 69 | ``` 70 | 71 | ## Your `dlt` Pipeline 72 | 73 | 1. Understand your resources and sources. 74 | 75 | In the context of `dlt`, a source is a location that holds data with a certain structure, organized into one or more resources. It can also refer to the software component (i.e., a Python function) that extracts data from the source location using one or more resource components. For example, if the source is an API, then a resource is an endpoint in that API. If the source is a database, then a resource is a table in that database. 76 | 77 | The demo has two resources: 78 | 79 | - `coin_list()` yields a list of cryptocurrencies from coinpaprika.com: 80 | 81 | ```python 82 | @dlt.resource(name = "coin_list", write_disposition="replace") 83 | def coin_list(): 84 | response = requests.get('https://api.coinpaprika.com/v1/coins') 85 | yield from response.json() 86 | ``` 87 | - `coin_information(coin)` is a transformer resource that fetches comprehensive details from three distinct API endpoints for each cryptocurrency provided by `coin_list()`. The responses are then merged into one object for loading into a single database table: 88 | 89 | ```python 90 | @dlt.transformer(data_from = coin_list().add_limit(2)) 91 | def coin_information(coin): 92 | coin_id = coin['id'] 93 | details = requests.get(f'https://api.coinpaprika.com/v1/coins/{coin_id}') 94 | ohlc = requests.get(f'https://api.coinpaprika.com/v1/coins/{coin_id}/ohlcv/latest') 95 | exchanges = requests.get(f'https://api.coinpaprika.com/v1/coins/{coin_id}/exchanges') 96 | yield details.json() | ohlc.json()[0] | {"exchanges": exchanges.json()} 97 | ``` 98 | These resources are combined into the `crypto_data()` source,which corresponds to a dataset with tables generated from the two resources: 99 | 100 | ```python 101 | @dlt.source 102 | def crypto_data(name = "crypto_source"): 103 | yield coin_list() 104 | yield coin_information() 105 | ``` 106 | 107 | The `@dlt.resource` and `@dlt.source` decorators declare a function as a resource/source in `dlt`, offering flexibility with essential functionalities. 108 | 109 | > Note that the decorators use `yield` to produce data on-the-fly, instead of loading all data into memory at once. 110 | 111 | 2. Understand your pipeline. 
112 | 113 | We define a pipeline named `crypto_pipeline` with PostgreSQL as the destination: 114 | 115 | ```python 116 | def load_coin_details() -> None: 117 | pipeline = dlt.pipeline( 118 | pipeline_name="crypto_pipeline", 119 | destination='postgres', 120 | full_refresh=True, 121 | dataset_name="crypto_data", 122 | ) 123 | info = pipeline.run(crypto_data()) 124 | ``` 125 | `full_refresh` is set to `True`, creating new dataset instances each time the pipeline runs. If set to `False`, the pipeline will update the existing dataset instead of creating new ones. 126 | 127 | The default `write_disposition` in `pipeline.run()` is set to `append`, meaning new data will be added to the existing data in the destination. Other options include: 128 | 129 | - `replace`: This replaces the existing data in the destination with the new data. 130 | - `merge`: This option merges the new data into the destination using a `merge_key`. It can also deduplicate or upsert new data using a `primary_key`. 131 | 132 | 133 | 3. Run your pipeline: 134 | 135 | ```bash 136 | $ python3 dlt_pipeline_merged.py 137 | ``` 138 | 139 | ## Viewing Your Data in DBeaver 140 | 141 | 1. Connect DBeaver to your database. 142 | 143 | - Click `New Database Connection` in the top left corner. 144 | - Choose PostgreSQL. 145 | - Enter `demo_data` as the database. 146 | - Enter `loader` as the username. 147 | - Enter `password` as the password. 148 | - Test the connection. 149 | 150 | 2. Once connected, you can view your data. It should look like this: 151 | 152 | ![DBeaver view of demo_data](https://storage.googleapis.com/dlt-blog-images/belgrade_demo_DBeaver.png) 153 | 154 | > To get a better understanding of how the nested data was normalized with `dlt`, view the example responses returned by the API endpoints in `example_api_responses`. 155 | 156 | ## Contact / Support 157 | For guidance on running custom pipelines with `dlt`, consider joining our [Slack community](https://dlthub-community.slack.com). 158 | 159 | Visit our [documentation page](https://dlthub.com/docs/intro) for more detailed information. -------------------------------------------------------------------------------- /scraping-source/scraping/runner.py: -------------------------------------------------------------------------------- 1 | """This module contains abstractions to facilitate the scraping and loading process""" 2 | import threading 3 | import typing as t 4 | import dlt 5 | 6 | from dlt.common import logger 7 | from pydispatch import dispatcher # type: ignore 8 | from typing_extensions import Self 9 | 10 | from scrapy import signals, Item, Spider # type: ignore 11 | from scrapy.crawler import CrawlerProcess # type: ignore 12 | 13 | from .types import AnyDict, Runnable, P 14 | from .queue import ScrapingQueue 15 | 16 | T = t.TypeVar("T") 17 | 18 | 19 | class Signals: 20 | """Signals context wrapper 21 | 22 | This wrapper is also a callable which accepts a `CrawlerProcess` instance; 23 | this is required to stop the scraping process as soon as the queue closes.
24 | """ 25 | 26 | def __init__(self, pipeline_name: str, queue: ScrapingQueue[T]) -> None: 27 | self.stopping = False 28 | self.queue = queue 29 | self.pipeline_name = pipeline_name 30 | 31 | def on_item_scraped(self, item: Item) -> None: 32 | if not self.queue.is_closed: 33 | self.queue.put(item) 34 | else: 35 | logger.info( 36 | "Queue is closed, stopping", 37 | extra={"pipeline_name": self.pipeline_name}, 38 | ) 39 | if not self.stopping: 40 | self.on_engine_stopped() 41 | 42 | def on_engine_stopped(self) -> None: 43 | logger.info(f"Crawling engine stopped for pipeline={self.pipeline_name}") 44 | self.stopping = True 45 | self.crawler.stop() 46 | self.queue.close() 47 | self.queue.join() 48 | 49 | def __call__(self, crawler: CrawlerProcess) -> Self: 50 | self.crawler = crawler 51 | return self 52 | 53 | def __enter__(self) -> None: 54 | # We want to receive on_item_scraped callback from 55 | # outside so we don't have to know about any queue instance. 56 | dispatcher.connect(self.on_item_scraped, signals.item_scraped) 57 | 58 | # Once crawling engine stops we would like to know about it as well. 59 | dispatcher.connect(self.on_engine_stopped, signals.engine_stopped) 60 | 61 | def __exit__(self, exc_type: t.Any, exc_val: t.Any, exc_tb: t.Any) -> None: 62 | dispatcher.disconnect(self.on_item_scraped, signals.item_scraped) 63 | dispatcher.disconnect(self.on_engine_stopped, signals.engine_stopped) 64 | 65 | 66 | class ScrapyRunner(Runnable): 67 | """Scrapy runner handles setup and teardown of scrapy crawling""" 68 | 69 | def __init__( 70 | self, 71 | spider: t.Type[Spider], 72 | start_urls: t.List[str], 73 | settings: AnyDict, 74 | signals: Signals, 75 | ) -> None: 76 | self.spider = spider 77 | self.start_urls = start_urls 78 | self.crawler = CrawlerProcess(settings=settings) 79 | self.signals = signals 80 | 81 | def run(self, *args: P.args, **kwargs: P.kwargs) -> None: 82 | """Runs scrapy crawler process 83 | 84 | All `kwargs` are forwarded to `crawler.crawl(**kwargs)`. 85 | Also manages relevant signal handling in proper way. 86 | """ 87 | self.crawler.crawl( 88 | self.spider, 89 | name="scraping_spider", 90 | start_urls=self.start_urls, 91 | **kwargs, 92 | ) 93 | 94 | try: 95 | logger.info("Starting the crawler") 96 | with self.signals(self.crawler): 97 | self.crawler.start() 98 | except Exception: 99 | logger.error("Was unable to start crawling process") 100 | raise 101 | finally: 102 | self.signals.on_engine_stopped() 103 | logger.info("Scraping stopped") 104 | 105 | 106 | class PipelineRunner(Runnable): 107 | """Pipeline runner runs dlt pipeline in a separate thread 108 | Since scrapy wants to run in the main thread it is the only available 109 | option to host pipeline in a thread and communicate via the queue. 110 | """ 111 | 112 | def __init__(self, pipeline: dlt.Pipeline, queue: ScrapingQueue[T]) -> None: 113 | self.pipeline = pipeline 114 | self.queue = queue 115 | 116 | if pipeline.dataset_name and not self.is_default_dataset_name(pipeline): 117 | resource_name = pipeline.dataset_name 118 | else: 119 | resource_name = f"{pipeline.pipeline_name}_results" 120 | 121 | logger.info(f"Resource name: {resource_name}") 122 | 123 | self.scraping_resource = dlt.resource( 124 | # Queue get_batches is a generator so we can 125 | # pass it to pipeline.run and dlt will handle the rest. 
126 | self.queue.stream(), 127 | name=resource_name, 128 | ) 129 | 130 | def is_default_dataset_name(self, pipeline: dlt.Pipeline) -> bool: 131 | default_name = pipeline.pipeline_name + pipeline.DEFAULT_DATASET_SUFFIX 132 | return pipeline.dataset_name == default_name 133 | 134 | def run( 135 | self, 136 | *args: P.args, 137 | **kwargs: P.kwargs, 138 | ) -> threading.Thread: 139 | """You can use all regular dlt.pipeline.run() arguments 140 | 141 | ``` 142 | destination: TDestinationReferenceArg = None, 143 | staging: TDestinationReferenceArg = None, 144 | dataset_name: str = None, 145 | credentials: Any = None, 146 | table_name: str = None, 147 | write_disposition: TWriteDisposition = None, 148 | columns: TAnySchemaColumns = None, 149 | primary_key: TColumnNames = None, 150 | schema: Schema = None, 151 | loader_file_format: TLoaderFileFormat = None 152 | ``` 153 | """ 154 | 155 | def run() -> None: 156 | try: 157 | self.pipeline.run(self.scraping_resource, **kwargs) # type: ignore[arg-type] 158 | except Exception: 159 | logger.error("Error during pipeline.run call, closing the queue") 160 | raise 161 | finally: 162 | self.queue.close() 163 | 164 | thread_runner = threading.Thread(target=run) 165 | thread_runner.start() 166 | return thread_runner 167 | 168 | 169 | class ScrapingHost: 170 | """Scraping host runs the pipeline and scrapy""" 171 | 172 | def __init__( 173 | self, 174 | queue: ScrapingQueue[T], 175 | scrapy_runner: ScrapyRunner, 176 | pipeline_runner: PipelineRunner, 177 | ) -> None: 178 | self.queue = queue 179 | self.scrapy_runner = scrapy_runner 180 | self.pipeline_runner = pipeline_runner 181 | 182 | def run( 183 | self, 184 | *args: P.args, 185 | **kwargs: P.kwargs, 186 | ) -> None: 187 | """You can pass kwargs which are passed to `pipeline.run`""" 188 | logger.info("Starting pipeline") 189 | pipeline_worker = self.pipeline_runner.run(*args, **kwargs) 190 | 191 | logger.info("Starting scrapy crawler") 192 | self.scrapy_runner.run() 193 | 194 | # Wait to for pipeline finish it's job 195 | pipeline_worker.join() 196 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/rest_api/typing.py: -------------------------------------------------------------------------------- 1 | from typing import ( 2 | Any, 3 | Dict, 4 | List, 5 | Literal, 6 | Optional, 7 | TypedDict, 8 | Union, 9 | ) 10 | from dataclasses import dataclass, field 11 | 12 | from dlt.common import jsonpath 13 | from dlt.common.typing import TSortOrder 14 | from dlt.common.schema.typing import ( 15 | TColumnNames, 16 | TTableFormat, 17 | TAnySchemaColumns, 18 | TWriteDispositionConfig, 19 | TSchemaContract, 20 | ) 21 | 22 | from dlt.extract.items import TTableHintTemplate 23 | from dlt.extract.incremental.typing import LastValueFunc 24 | 25 | from dlt.sources.helpers.rest_client.paginators import BasePaginator 26 | from dlt.sources.helpers.rest_client.typing import HTTPMethodBasic 27 | from dlt.sources.helpers.rest_client.auth import AuthConfigBase, TApiKeyLocation 28 | 29 | from dlt.sources.helpers.rest_client.paginators import ( 30 | SinglePagePaginator, 31 | HeaderLinkPaginator, 32 | JSONResponsePaginator, 33 | JSONResponseCursorPaginator, 34 | OffsetPaginator, 35 | PageNumberPaginator, 36 | ) 37 | from dlt.sources.helpers.rest_client.exceptions import IgnoreResponseException 38 | from dlt.sources.helpers.rest_client.auth import ( 39 | AuthConfigBase, 40 | HttpBasicAuth, 41 | BearerTokenAuth, 42 | APIKeyAuth, 43 | OAuthJWTAuth, 44 | ) 45 | 46 | 
PaginatorType = Literal[ 47 | "json_response", 48 | "header_link", 49 | "auto", 50 | "single_page", 51 | "cursor", 52 | "offset", 53 | "page_number", 54 | ] 55 | 56 | 57 | class PaginatorTypeConfig(TypedDict, total=True): 58 | type: PaginatorType # noqa 59 | 60 | 61 | class PageNumberPaginatorConfig(PaginatorTypeConfig, total=False): 62 | """A paginator that uses page number-based pagination strategy.""" 63 | 64 | initial_page: Optional[int] 65 | page_param: Optional[str] 66 | total_path: Optional[jsonpath.TJsonPath] 67 | maximum_page: Optional[int] 68 | 69 | 70 | class OffsetPaginatorConfig(PaginatorTypeConfig, total=False): 71 | """A paginator that uses offset-based pagination strategy.""" 72 | 73 | limit: int 74 | offset: Optional[int] 75 | offset_param: Optional[str] 76 | limit_param: Optional[str] 77 | total_path: Optional[jsonpath.TJsonPath] 78 | maximum_offset: Optional[int] 79 | 80 | 81 | class HeaderLinkPaginatorConfig(PaginatorTypeConfig, total=False): 82 | """A paginator that uses the 'Link' header in HTTP responses 83 | for pagination.""" 84 | 85 | links_next_key: Optional[str] 86 | 87 | 88 | class JSONResponsePaginatorConfig(PaginatorTypeConfig, total=False): 89 | """Locates the next page URL within the JSON response body. The key 90 | containing the URL can be specified using a JSON path.""" 91 | 92 | next_url_path: Optional[jsonpath.TJsonPath] 93 | 94 | 95 | class JSONResponseCursorPaginatorConfig(PaginatorTypeConfig, total=False): 96 | """Uses a cursor parameter for pagination, with the cursor value found in 97 | the JSON response body.""" 98 | 99 | cursor_path: Optional[jsonpath.TJsonPath] 100 | cursor_param: Optional[str] 101 | 102 | 103 | PaginatorConfig = Union[ 104 | PaginatorType, 105 | PageNumberPaginatorConfig, 106 | OffsetPaginatorConfig, 107 | HeaderLinkPaginatorConfig, 108 | JSONResponsePaginatorConfig, 109 | JSONResponseCursorPaginatorConfig, 110 | BasePaginator, 111 | SinglePagePaginator, 112 | HeaderLinkPaginator, 113 | JSONResponsePaginator, 114 | JSONResponseCursorPaginator, 115 | OffsetPaginator, 116 | PageNumberPaginator, 117 | ] 118 | 119 | 120 | AuthType = Literal["bearer", "api_key", "http_basic"] 121 | 122 | 123 | class AuthTypeConfig(TypedDict, total=True): 124 | type: AuthType # noqa 125 | 126 | 127 | class BearerTokenAuthConfig(TypedDict, total=False): 128 | """Uses `token` for Bearer authentication in "Authorization" header.""" 129 | 130 | # we allow for a shorthand form of bearer auth, without a type 131 | type: Optional[AuthType] # noqa 132 | token: str 133 | 134 | 135 | class ApiKeyAuthConfig(AuthTypeConfig, total=False): 136 | """Uses provided `api_key` to create authorization data in the specified `location` (query, param, header, cookie) under specified `name`""" 137 | 138 | name: Optional[str] 139 | api_key: str 140 | location: Optional[TApiKeyLocation] 141 | 142 | 143 | class HttpBasicAuthConfig(AuthTypeConfig, total=True): 144 | """Uses HTTP basic authentication""" 145 | 146 | username: str 147 | password: str 148 | 149 | 150 | # TODO: add later 151 | # class OAuthJWTAuthConfig(AuthTypeConfig, total=True): 152 | 153 | 154 | AuthConfig = Union[ 155 | AuthConfigBase, 156 | AuthType, 157 | BearerTokenAuthConfig, 158 | ApiKeyAuthConfig, 159 | HttpBasicAuthConfig, 160 | BearerTokenAuth, 161 | APIKeyAuth, 162 | HttpBasicAuth, 163 | ] 164 | 165 | 166 | class ClientConfig(TypedDict, total=False): 167 | base_url: str 168 | headers: Optional[Dict[str, str]] 169 | auth: Optional[AuthConfig] 170 | paginator: Optional[PaginatorConfig] 171 | 172 | 173 | 
class IncrementalArgs(TypedDict, total=False): 174 | cursor_path: str 175 | initial_value: Optional[str] 176 | last_value_func: Optional[LastValueFunc[str]] 177 | primary_key: Optional[TTableHintTemplate[TColumnNames]] 178 | end_value: Optional[str] 179 | row_order: Optional[TSortOrder] 180 | 181 | 182 | class IncrementalConfig(IncrementalArgs, total=False): 183 | start_param: str 184 | end_param: Optional[str] 185 | 186 | 187 | ParamBindType = Literal["resolve", "incremental"] 188 | 189 | 190 | class ParamBindConfig(TypedDict): 191 | type: ParamBindType # noqa 192 | 193 | 194 | class ResolveParamConfig(ParamBindConfig): 195 | resource: str 196 | field: str 197 | 198 | 199 | class IncrementalParamConfig(ParamBindConfig, IncrementalArgs): 200 | pass 201 | # TODO: implement param type to bind incremental to 202 | # param_type: Optional[Literal["start_param", "end_param"]] 203 | 204 | 205 | @dataclass 206 | class ResolvedParam: 207 | param_name: str 208 | resolve_config: ResolveParamConfig 209 | field_path: jsonpath.TJsonPath = field(init=False) 210 | 211 | def __post_init__(self) -> None: 212 | self.field_path = jsonpath.compile_path(self.resolve_config["field"]) 213 | 214 | 215 | class ResponseAction(TypedDict, total=False): 216 | status_code: Optional[Union[int, str]] 217 | content: Optional[str] 218 | action: str 219 | 220 | 221 | class Endpoint(TypedDict, total=False): 222 | path: Optional[str] 223 | method: Optional[HTTPMethodBasic] 224 | params: Optional[Dict[str, Union[ResolveParamConfig, IncrementalParamConfig, Any]]] 225 | json: Optional[Dict[str, Any]] 226 | paginator: Optional[PaginatorConfig] 227 | data_selector: Optional[jsonpath.TJsonPath] 228 | response_actions: Optional[List[ResponseAction]] 229 | incremental: Optional[IncrementalConfig] 230 | 231 | 232 | class ResourceBase(TypedDict, total=False): 233 | """Defines hints that may be passed to `dlt.resource` decorator""" 234 | 235 | table_name: Optional[TTableHintTemplate[str]] 236 | max_table_nesting: Optional[int] 237 | write_disposition: Optional[TTableHintTemplate[TWriteDispositionConfig]] 238 | parent: Optional[TTableHintTemplate[str]] 239 | columns: Optional[TTableHintTemplate[TAnySchemaColumns]] 240 | primary_key: Optional[TTableHintTemplate[TColumnNames]] 241 | merge_key: Optional[TTableHintTemplate[TColumnNames]] 242 | schema_contract: Optional[TTableHintTemplate[TSchemaContract]] 243 | table_format: Optional[TTableHintTemplate[TTableFormat]] 244 | selected: Optional[bool] 245 | parallelized: Optional[bool] 246 | 247 | 248 | class EndpointResourceBase(ResourceBase, total=False): 249 | endpoint: Optional[Union[str, Endpoint]] 250 | include_from_parent: Optional[List[str]] 251 | 252 | 253 | class EndpointResource(EndpointResourceBase, total=False): 254 | name: TTableHintTemplate[str] 255 | 256 | 257 | class RESTAPIConfig(TypedDict): 258 | client: ClientConfig 259 | resource_defaults: Optional[EndpointResourceBase] 260 | resources: List[Union[str, EndpointResource]] 261 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/rest_api/README.md: -------------------------------------------------------------------------------- 1 | # REST API Generic Source 2 | A declarative way to define dlt sources for REST APIs. 3 | 4 | ## What is this? 5 | > Happy APIs are all alike 6 | > 7 | > \- E. T. Lev Tolstoy, Senior Data Engineer 8 | 9 | This is a generic source that you can use to create a dlt source from a REST API using a declarative configuration. 
The majority of REST APIs behave in a similar way; this dlt source attempts to provide a declarative way to define a dlt source for those APIs. 10 | 11 | ## How to use it 12 | Let's see how a source for the [Pokemon API](https://pokeapi.co/) would look like: 13 | 14 | ```python 15 | pokemon_config = { 16 | "client": { 17 | "base_url": "https://pokeapi.co/api/v2/", 18 | }, 19 | "resources": [ 20 | "berry", 21 | "location", 22 | { 23 | "name": "pokemon_list", 24 | "endpoint": "pokemon", 25 | }, 26 | { 27 | "name": "pokemon", 28 | "endpoint": { 29 | "path": "pokemon/{name}", 30 | "params": { 31 | "name": { 32 | "type": "resolve", 33 | "resource": "pokemon_list", 34 | "field": "name", 35 | }, 36 | }, 37 | }, 38 | }, 39 | ], 40 | } 41 | 42 | pokemon_source = rest_api_source(pokemon_config) 43 | ``` 44 | Here's a short summary: 45 | - The `client` node contains the base URL of the endpoints that we want to collect. 46 | - The `resources` correspond to the API endpoints. 47 | 48 | We have a couple of simple resources (`berry` and `location`). For them, the API endpoint is also the name of the dlt resource and the name of the destination table. They don't need additional configuration. 49 | 50 | The next resource leverages some additional configuration. The endpoint `pokemon/` returns a list of pokemons, but it can also be used as `pokemon/{id or name}` to return a single pokemon. In this case, we want the list, so we decided to rename the resource to `pokemon_list`, while the endpoint stays `pokemon/`. We do not specify the name of the destination table, so it will match the resource name. 51 | 52 | And now the `pokemon` one. This is actually a child endpoint of the `pokemon_list`: for each pokemon, we want to get further details. So we need to make this resource a bit more smart; the endpoint `path` needs to be explicit, and we have to specify how the value of `name` will be resolved from another resource; this is actually telling the generic source that `pokemon` needs to be queried for each pokemon in `pokemon_list`. 53 | 54 | ## Anatomy of the config object 55 | 56 | > **_TIP:_** Import `RESTAPIConfig` from the `rest_api` module to have convenient tips. 57 | 58 | The config object passed to the REST API Generic Source has three main elements: 59 | 60 | ```python 61 | my_config: RESTAPIConfig = { 62 | "client": { 63 | ... 64 | }, 65 | "resource_defaults": { 66 | ... 67 | }, 68 | "resources": { 69 | ... 70 | }, 71 | } 72 | ``` 73 | 74 | `client` contains the configuration to connect to the API's endpoints (e.g., base URL, authentication method, default behavior for the paginator, and more). 75 | 76 | `resource_defaults` contains the default values to configure the dlt resources returned by this source. 77 | 78 | `resources` object contains the configuration for each resource. 79 | 80 | The configuration with a smaller scope will overwrite the one with the wider one: 81 | 82 | Resource Configuration > Resource Defaults Configuration > Client Configuration 83 | 84 | ## Reference 85 | 86 | ### `client` 87 | 88 | #### `auth` [optional] 89 | Use the auth property to pass a token or a `HTTPBasicAuth` object for more complex authentication methods. Here are some practical examples: 90 | 91 | 1. Simple token (read from the `.dlt/secrets.toml` file): 92 | ```python 93 | my_api_config: RESTAPIConfig = { 94 | "client": { 95 | "base_url": "https://my_api.com/api/v1/", 96 | "auth": { 97 | "token": dlt.secrets["sources.my_api.access_token"], 98 | }, 99 | }, 100 | ... 101 | } 102 | ``` 103 | 104 | 2. 
105 | ```python 106 | from requests.auth import HTTPBasicAuth 107 | 108 | basic_auth = HTTPBasicAuth(dlt.secrets["sources.my_api.api_key"], dlt.secrets["sources.my_api.api_secret"]) 109 | 110 | my_api_config: RESTAPIConfig = { 111 | "client": { 112 | "base_url": "https://my_api.com/api/v1/", 113 | "auth": basic_auth, 114 | }, 115 | ... 116 | } 117 | ``` 118 | 119 | #### `base_url` 120 | The base URL that will be prepended to the endpoints specified in the `resources` objects. Example: 121 | 122 | ```python 123 | "base_url": "https://my_api.com/api/v1/", 124 | ``` 125 | 126 | #### `paginator` [optional] 127 | The paginator property specifies the default paginator to be used for the endpoint responses. 128 | 129 | Possible paginators are: 130 | | Paginator | String Alias | Note | 131 | | --------- | ------------ | ---- | 132 | | BasePaginator | | | 133 | | HeaderLinkPaginator | `header_links` | | 134 | | JSONResponsePaginator | `json_links` | The pagination metainformation is in a node of the JSON response (see example below) | 135 | | SinglePagePaginator | `single_page` | The response will be interpreted as a single-page response, ignoring possible pagination metadata | 136 | 137 | Usage example of the `JSONResponsePaginator`, for a response with the URL of the next page located at `paging.next`: 138 | ```python 139 | "paginator": JSONResponsePaginator( 140 | next_key=["paging", "next"] 141 | ) 142 | ``` 143 | 144 | 145 | #### `session` [optional] 146 | 147 | This property allows you to pass a custom `Session` object. 148 | 149 | 150 | ### `resource_defaults` 151 | This property allows you to pass default properties and behavior to the dlt resources created by the REST API Generic Source. Besides the properties mentioned in this documentation, a resource accepts all the arguments that usually are passed to a [dlt resource](https://dlthub.com/docs/general-usage/resource). 152 | 153 | #### `endpoint` 154 | A string indicating the endpoint or an `endpoint` object (see [below](#endpoint-1)). 155 | 156 | #### `include_from_parent` [optional] 157 | A list of fields, from the parent resource, which will be included in the resource output. 158 | 159 | #### `name` 160 | The name of the dlt `resource` and the name of the associated table that will be created. 161 | 162 | #### `params` 163 | The query parameters for the endpoint URL. 164 | 165 | For child resources, you can use values from the parent resource for params. The syntax is the following: 166 | 167 | ```python 168 | "PARAM_NAME": { 169 | "type": "resolve", 170 | "resource": "PARENT_RESOURCE_NAME", 171 | "field": "PARENT_RESOURCE_FIELD", 172 | }, 173 | ``` 174 | 175 | An example of use: 176 | ```python 177 | "endpoint": { 178 | "path": "pokemon/{name}", 179 | "params": { 180 | "name": { 181 | "type": "resolve", 182 | "resource": "pokemon_list", 183 | "field": "name", 184 | }, 185 | }, 186 | }, 187 | ``` 188 | 189 | #### `path` 190 | The URL of the endpoint. If you need to include URL parameters, they can be included using `{}`, for example: 191 | ```python 192 | "path": "pokemon/{name}", 193 | ``` 194 | In case you need to include query parameters, use the [params](#params) property. 195 | 196 | 197 | ### `resources` 198 | An array of resources. Each resource is a string or a resource object. 199 | 200 | Simple resources with their name corresponding to the endpoint can be simple strings. 
For example: 201 | ```python 202 | "resources": [ 203 | "berry", 204 | "location", 205 | ] 206 | ``` 207 | Resources with the name different from the endpoint string will be: 208 | ```python 209 | "resources": [ 210 | { 211 | "name": "pokemon_list", 212 | "endpoint": "pokemon", 213 | }, 214 | ] 215 | ``` 216 | In case you need to have a resource with a name different from the table created, you can pass the property `table_name` too. 217 | 218 | For the other properties, see the [resource_defaults](#resource_defaults) above. -------------------------------------------------------------------------------- /dlt-dagster-snowflake/dlt_dagster_snowflake_demo/dlt/__init__.py: -------------------------------------------------------------------------------- 1 | from dlt.sources.helpers import requests 2 | import dlt 3 | from openai import OpenAI 4 | from pytrends.request import TrendReq 5 | from datetime import datetime 6 | import logging 7 | import toml 8 | import time 9 | import os 10 | 11 | 12 | def openai_sentiment(context: str): 13 | """ 14 | Analyzes the sentiment of a given text using OpenAI. 15 | 16 | Args: 17 | context: The text string for which the sentiment is to be analyzed. 18 | 19 | Returns: 20 | A string indicating the sentiment of the text, which could be 'positive', 'negative', or 'neutral'. 21 | If an error occurs, it returns a string describing the error. 22 | """ 23 | 24 | # Load your OpenAI API key from the secrets.toml file accessed by dlt 25 | with open(os.getcwd() + '/.dlt/secrets.toml', 'r') as secrets_file: 26 | secrets = toml.load(secrets_file) 27 | openai_key = secrets["openai"]["openai_api_key"] 28 | 29 | # Initialize the OpenAI client 30 | client = OpenAI(api_key = openai_key) 31 | 32 | # Set up the prompt 33 | messages = [ 34 | {"role": "system", "content": "You will be given a comment text. Give the sentiment of the comment in one word. It should be either negative, positive, or neutral."}, 35 | {"role": "assistant", "content": f"{context}"} 36 | ] 37 | 38 | # Try to get the sentiment 39 | try: 40 | response = client.chat.completions.create(model="gpt-3.5-turbo", messages=messages) 41 | output = response.choices[0].message.content 42 | return output 43 | except Exception as e: 44 | # Return the error message if an exception occurs 45 | return f"An error occurred: {e}" 46 | 47 | 48 | # The 'write_disposition' parameter determines how the data returned by this function is handled. 49 | # 'append' means that the data will be added to the end of the existing data. 50 | # Other possible values for 'write_disposition' are 'replace' (which replaces the existing data with the new data) 51 | # and 'merge' (which merges the new data with the existing data, updating any existing records that have the same primary key). 52 | @dlt.resource(write_disposition = "append") 53 | def hacker_news(orchestration_tools: tuple[str] = ("Airflow", )): 54 | """ 55 | This function fetches stories related to specified orchestration tools from Hackernews. 56 | For each tool, it retrieves the top 5 stories that have at least one comment. 57 | The stories are then appended to the existing data. 58 | 59 | Args: 60 | orchestration_tools: A tuple containing the names of orchestration tools for which stories are to be fetched. 61 | 62 | Yields: 63 | A generator that yields dictionaries. Each dictionary represents a story and contains the tool name along with the story details returned by the API request. 
64 | """ 65 | 66 | for tool in orchestration_tools: 67 | response = requests.get(f'http://hn.algolia.com/api/v1/search?query={tool}&tags=story&numericFilters=num_comments>=1&hitsPerPage=5') 68 | data = response.json() 69 | # Add the tool name to each story 70 | data["hits"] = [{"tool_name": tool, **item} for item in data["hits"]] 71 | # Yield each story one by one 72 | yield from data["hits"] 73 | 74 | 75 | @dlt.transformer(data_from = hacker_news, write_disposition = "append") 76 | def comments(story): 77 | """ 78 | This function fetches comments for each story yielded by the 'hacker_news' function. 79 | It calculates the number of pages of comments based on the number of comments each story has, 80 | and fetches comments page by page. The comments are then appended to the existing data. 81 | 82 | Args: 83 | story: A dictionary representing a story, yielded by the 'hacker_news' function. 84 | 85 | Yields: 86 | A generator that yields lists of dictionaries. Each list represents a page of comments, 87 | and each dictionary within the list represents a comment and contains the tool name, story title, 88 | story URL, sentiment of the comment, and the comment details returned by the API request. 89 | """ 90 | 91 | tool_name = story["tool_name"] 92 | story_title = story["title"] 93 | story_id = story["story_id"] 94 | url = story.get("url") 95 | num_comments = story["num_comments"] 96 | 97 | num_pages = int(num_comments/20) # The API returns 20 comments per page 98 | if num_pages != num_comments/20: 99 | num_pages += 1 100 | 101 | for page in range(num_pages): 102 | response = requests.get(f'http://hn.algolia.com/api/v1/search?tags=comment,story_{story_id}&page={page}') 103 | data = response.json() 104 | # Add the tool name, story title, story URL, and sentiment to each comment 105 | data["hits"] = [{"tool_name": tool_name, "story_title": story_title, "story_url": url, "sentiment": openai_sentiment(item["comment_text"]), **item} for item in data["hits"]] 106 | #data["hits"] = [{"tool_name": tool_name, "story_title": story_title, "story_url": url, **item} for item in data["hits"]] # Without sentiment_analysis 107 | # Yield each page of comments 108 | yield data["hits"] 109 | 110 | 111 | @dlt.source() 112 | def hacker_news_full(orchestration_tools:tuple[str] = ("Airflow", )): 113 | """ 114 | This function is a dlt source that groups together the resources and transformers needed to fetch 115 | Hackernews stories and their comments for specified orchestration tools. 116 | 117 | Args: 118 | orchestration_tools: A tuple containing the names of orchestration tools for which Hacker News stories and comments are to be fetched. 119 | 120 | Yields: 121 | A generator that yields the results of the 'hacker_news' resource piped into the 'comments' transformer. 122 | """ 123 | 124 | # The 'hacker_news' resource fetches stories for the specified orchestration tools 125 | # The 'comments' transformer fetches comments for each story yielded by the 'hacker_news' resource 126 | yield hacker_news(orchestration_tools = orchestration_tools) | comments 127 | 128 | 129 | @dlt.resource(write_disposition = "append") 130 | def google_trends(orchestration_tools: tuple[str] = ("Airflow",), start_date='2023-01-01', geo=''): 131 | """ 132 | This function fetches Google Trends data for specified orchestration tools. 133 | It attempts to retrieve the data multiple times in case of failures or empty responses. 134 | The retrieved data is then appended to the existing data. 
135 | 136 | Args: 137 | orchestration_tools: A tuple containing the names of orchestration tools for which Google Trends data is to be fetched. 138 | start_date: The start date for the Google Trends data. Defaults to '2023-01-01'. 139 | geo: The geographic area for the Google Trends data. Defaults to an empty string, which means worldwide. 140 | 141 | Yields: 142 | A generator that yields lists of dictionaries. Each list represents the Google Trends data for a tool, 143 | and each dictionary within the list contains the tool name and the Google Trends data. 144 | """ 145 | 146 | pytrend = TrendReq() # Initialize the pytrends client 147 | for tool in orchestration_tools: 148 | attempts = 0 149 | max_attempts = 5 # Set a maximum number of attempts to avoid infinite loops 150 | while attempts < max_attempts: 151 | try: 152 | end_date = datetime.now().strftime('%Y-%m-%d') 153 | timeframe = f'{start_date} {end_date}' 154 | pytrend.build_payload(kw_list = [tool], timeframe = timeframe, geo = geo) 155 | data_df = pytrend.interest_over_time() 156 | 157 | if not data_df.empty: 158 | data_df.reset_index(inplace = True) 159 | data_df.rename(columns = {tool: 'Hits'}, inplace=True) 160 | data = data_df.to_dict('records') 161 | data = [{"tool": tool, **item} for item in data] 162 | print(data) 163 | yield data 164 | break # Successfully fetched data, exit the retry loop 165 | else: 166 | logging.warning(f"No data for {tool}. Retrying...") 167 | attempts += 1 168 | time.sleep(60) # Wait before retrying 169 | except Exception as e: 170 | logging.warning(f"Encountered an error fetching data for {tool}: {e}. Attempt {attempts+1}/{max_attempts}. Retrying...") 171 | attempts += 1 172 | time.sleep(100) # Wait before retrying 173 | 174 | if attempts >= max_attempts: 175 | logging.error(f"Max retries reached for {tool}. Moving to the next tool.") -------------------------------------------------------------------------------- /dlt-dagster-snowflake/README.md: -------------------------------------------------------------------------------- 1 | # Loading Nested API Data into Snowflake using `dlt` in Dagster 2 | 3 | ## Overview 4 | 5 | This is a demo project that shows how to load nested data into Snowflake using `dlt` (Data Load Tool) in Dagster. It demonstrates the process of defining a `dlt` pipeline as a Dagster resource and implementing the pipeline with `dlt` resources and sources in Dagster assets. Additionally, it integrates AI analysis for sentiment assessment of textual data. 6 | 7 | ![Pipeline overview](https://storage.googleapis.com/dlt-blog-images/dlt_dagster_snowflake_demo_overview.png) 8 | 9 | The diagram above represents the workflow overview of the project, encompassing the following steps: 10 | 11 | 1. Data loading from Google Trends and Hacker News to Snowflake using `dlt`, with an added step for OpenAI sentiment analysis specifically for Hacker News before the loading process. 12 | 2. Data reporting from the destination to a local directory in the form of image files. 13 | 14 | ## Prerequisites 15 | 16 | 1. Snowflake credentials 17 | - username 18 | - password 19 | - account and host 20 | 21 | >The host refers to your account identifier. For instance, if your account is `https://kgiotue-wn98412.snowflakecomputing.com`, your host would be `kgiotue-wn98412`. 22 | 23 | 2. OpenAI API key 24 | 25 | >If you're new to [OpenAI](https://platform.openai.com/), they offer $5 in free credits usable during your first 3 months. 26 | 27 | ## Setup Guide 28 | 29 | 1.
**Clone this repository**: Follow the instructions [here](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository). 30 | 31 | 2. **Create a virtual environment and activate it**: This step is advised to maintain a clean workspace and prevent dependency conflicts, although this is not mandatory. 32 | 33 | ```bash 34 | python -m venv myenv 35 | source myenv/bin/activate 36 | ``` 37 | 3. **Create a `secrets.toml` file in the `.dlt` folder and enter the missing values**: Use the `example.secrets.toml` file for reference. 38 | 39 | > Default values for the role and warehouse are "ACCOUNTADMIN" and "COMPUTE_WH", respectively. 40 | 41 | 4. **Install dependencies**: Run the following command from the project folder. 42 | 43 | ```bash 44 | pip install -r requirements.txt 45 | ``` 46 | 47 | 5. **Start Dagster server**: Run the following command. 48 | ```bash 49 | dagster dev 50 | ``` 51 | 6. **Access Dagster UI**: Launch http://127.0.0.1:3000. 52 | 53 | >If you want to run Dagster in the cloud, or customize your project, consult the official [documentation](https://docs.dagster.io/getting-started). 54 | 55 | ## Understand Your Project 56 | 57 | This project is very minimal, including just what's needed to run Dagster locally with `dlt`. Here's a quick breakdown of its structure: 58 | 59 | 1. `.dlt`: Utilized by the `dlt` library for storing configuration and sensitive information. The Dagster project is set up to fetch secret values from this directory as well. 60 | 61 | 2. `charts`: Used to store chart images generated by assets. 62 | 63 | 3. `dlt_dagster_snowflake_demo`: Your Dagster package, comprising Dagster assets, `dlt` resources, Dagster resources, and general project configurations. 64 | 65 | ### Dagster Resources Explained 66 | 67 | In the `resources` folder, the following two Dagster resources are defined as classes: 68 | 69 | 1. `DltPipeline`: This is our `dlt` object defined as a Dagster `ConfigurableResource` that creates and runs a `dlt` pipeline with the specified data and table name. It will later be used in our Dagster assets to load data into Snowflake. 70 | 71 | ```python 72 | class DltPipeline(ConfigurableResource): 73 | # Initialize resource with pipeline details 74 | pipeline_name: str 75 | dataset_name: str 76 | destination: str 77 | 78 | def create_pipeline(self, resource_data, table_name): 79 | """ 80 | Creates and runs a dlt pipeline with specified data and table name. 81 | 82 | Args: 83 | resource_data: The data to be processed by the pipeline. 84 | table_name: The name of the table where data will be loaded. 85 | 86 | Returns: 87 | The result of the pipeline execution. 88 | """ 89 | 90 | # Configure the dlt pipeline with your destination details 91 | pipeline = dlt.pipeline( 92 | pipeline_name=self.pipeline_name, 93 | destination=self.destination, 94 | dataset_name=self.dataset_name 95 | ) 96 | 97 | # Run the pipeline with your parameters 98 | load_info = pipeline.run(resource_data, table_name=table_name) 99 | return load_info 100 | ``` 101 | 102 | 2. `LocalFileStorage`: Manages the local file storage, ensuring the storage directory exists and allowing data to be written to files within it. It will later be used in our Dagster assets to save images into the `charts` folder. 103 | 104 | ### `dlt` Explained 105 | 106 | In the `dlt` folder within `dlt_dagster_snowflake_demo`, necessary `dlt` resources and sources are defined.
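For orientation, here is a condensed sketch of the resource / transformer / source pattern used in that module (simplified from the actual code in `dlt_dagster_snowflake_demo/dlt/__init__.py`: the OpenAI sentiment scoring, comment pagination, and error handling are omitted):

```python
import dlt
from dlt.sources.helpers import requests


@dlt.resource(write_disposition="append")
def hacker_news(orchestration_tools: tuple[str] = ("Airflow",)):
    # Yield the top commented stories for each tool from the Hacker News search API
    for tool in orchestration_tools:
        response = requests.get(
            f"http://hn.algolia.com/api/v1/search?query={tool}&tags=story&numericFilters=num_comments>=1&hitsPerPage=5"
        )
        yield from ({"tool_name": tool, **hit} for hit in response.json()["hits"])


@dlt.transformer(data_from=hacker_news, write_disposition="append")
def comments(story):
    # Fetch the first page of comments for each story yielded by hacker_news
    response = requests.get(
        f"http://hn.algolia.com/api/v1/search?tags=comment,story_{story['story_id']}"
    )
    yield response.json()["hits"]


@dlt.source()
def hacker_news_full(orchestration_tools: tuple[str] = ("Airflow",)):
    # Pipe the stories resource into the comments transformer
    yield hacker_news(orchestration_tools=orchestration_tools) | comments
```

The full module additionally scores each comment with OpenAI and iterates over all comment pages; the numbered descriptions below cover each piece in turn.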
Below is a visual representation illustrating the functionality of `dlt`: 107 | 108 | ![dlt explained](https://storage.googleapis.com/dlt-blog-images/dlt_dagster_snowflake_demo_dlt.png) 109 | 110 | 1. `hacker_news`: A `dlt` resource that yields stories related to specified orchestration tools from Hackernews. For each tool, it retrieves the top 5 stories that have at least one comment. The stories are then appended to the existing data. 111 | 112 | Note that the `write_disposition` can also be set to `merge` or `replace`: 113 | - The merge write disposition merges the new data from the resource with the existing data at the destination. It requires a primary_key to be specified for the resource. More details can be found here. 114 | - The replace write disposition replaces the data in the destination with the data from the resource. It deletes all the classes and objects and recreates the schema before loading the data. 115 | 116 | More details can be found [here](https://dlthub.com/docs/general-usage/resource). 117 | 118 | 2. `comments`: A `dlt` transformer - a resource that receives data from another resource. It fetches comments for each story yielded by the `hacker_news` function. 119 | 120 | 3. `hacker_news_full`: A `dlt` source that extracts data from the source location using one or more resource components, such as `hacker_news` and `comments`. To illustrate, if the source is a database, a resource corresponds to a table within that database. 121 | 122 | 4. `google_trends`: A `dlt` resource that fetches Google Trends data for specified orchestration tools. It attempts to retrieve the data multiple times in case of failures or empty responses. The retrieved data is then appended to the existing data. 123 | 124 | As you may have noticed, the `dlt` library is designed to handle the unnesting of data internally. When you retrieve data from APIs like Hackernews or Google Trends, `dlt` automatically unpacks the nested structures into relational tables, creating and linking child and parent tables. This is achieved through unique identifiers (`_dlt_id` and `_dlt_parent_id`) that link child tables to specific rows in the parent table. However, it's important to note that you have control over [how this unnesting is done](https://dlthub.com/docs/general-usage/destination-tables). 125 | 126 | ### Dagster Assets Explained 127 | 128 | > If you're new to Dagster, start by understanding the [concept of an asset](https://docs.dagster.io/concepts). 129 | 130 | The assets defined in this project are essentially combinations of `dlt` resources paired with pipeline runs. When materialized, `dlt` objects are initialized, and the pipeline is executed to load data into Snowflake. This project includes the following assets: 131 | 132 | 1. `google_trends_asset`: Loads Google Trends data from the "google_trends" `dlt` resource to Snowflake using a `dlt` pipeline. 133 | 134 | 2. `google_trends_chart`: Generates a line chart visualizing Google Trends data over time and saves it to the local storage. This asset is dependent on `google_trends_asset`, since it uses data that's loaded by the latter. 135 | 136 | 3. `hacker_news_full_asset`: Loads Hackernews data from the "hacker_news_full" `dlt` source to Snowflake using a `dlt` pipeline. 137 | 138 | 4. `hacker_news_chart`: Generates a line chart visualizing the sentiment of comments for each tool and saves it to the local storage. 139 | 140 | 4. 
### Dagster Assets Explained

> If you're new to Dagster, start by understanding the [concept of an asset](https://docs.dagster.io/concepts).

The assets defined in this project are essentially combinations of `dlt` resources paired with pipeline runs. When materialized, the `dlt` objects are initialized and the pipeline is executed to load data into Snowflake. This project includes the following assets:

1. `google_trends_asset`: Loads Google Trends data from the "google_trends" `dlt` resource to Snowflake using a `dlt` pipeline.

2. `google_trends_chart`: Generates a line chart visualizing Google Trends data over time and saves it to local storage. This asset depends on `google_trends_asset`, since it uses the data loaded by the latter.

3. `hacker_news_full_asset`: Loads Hackernews data from the "hacker_news_full" `dlt` source to Snowflake using a `dlt` pipeline.

4. `hacker_news_chart`: Generates a line chart visualizing the sentiment of comments for each tool and saves it to local storage.

5. `hacker_news_asset`: Separately loads Hackernews stories from the "hacker_news" `dlt` resource to Snowflake using a `dlt` pipeline.

## Materialize Your Assets

Once you launch Dagster locally, you can materialize your assets and view the resulting charts in the corresponding folder. Feel free to explore and experiment with this project.

## View Your Data

Your data is now stored in Snowflake. It should look something like this:

![Snowflake view](https://storage.googleapis.com/dlt-blog-images/dlt_dagster_snowflake_demo_view.png)

## Contact / Support
For insights on executing custom pipelines using `dlt` or orchestrating workflows in Dagster, join their Slack communities:

- [dltHub](https://dlthub-community.slack.com)
- [Dagster](https://dagster.io/slack)

For more information on Snowflake, refer to the official [documentation](https://docs.snowflake.com/en/).
-------------------------------------------------------------------------------- /sengled-plug-demo/tuya_helpers/openapi.py: -------------------------------------------------------------------------------- 1 | """Tuya Open API.""" 2 | from __future__ import annotations 3 | 4 | import hashlib 5 | import hmac 6 | import json 7 | import time 8 | from typing import Any 9 | 10 | import requests 11 | 12 | from .openlogging import filter_logger, logger 13 | from .tuya_enums import AuthType 14 | from .version import VERSION 15 | 16 | TUYA_ERROR_CODE_TOKEN_INVALID = 1010 17 | 18 | TO_C_CUSTOM_REFRESH_TOKEN_API = "/v1.0/iot-03/users/token/" 19 | TO_C_SMART_HOME_REFRESH_TOKEN_API = "/v1.0/token/" 20 | 21 | TO_C_CUSTOM_TOKEN_API = "/v1.0/iot-03/users/login" 22 | TO_C_SMART_HOME_TOKEN_API = "/v1.0/iot-01/associated-users/actions/authorized-login" 23 | 24 | 25 | class TuyaTokenInfo: 26 | """Tuya token info. 27 | 28 | Attributes: 29 | access_token: Access token. 30 | expire_time: Valid period in seconds. 31 | refresh_token: Refresh token. 32 | uid: Tuya user ID. 33 | platform_url: user region platform url 34 | """ 35 | 36 | def __init__(self, token_response: dict[str, Any] = None): 37 | """Init TuyaTokenInfo.""" 38 | result = token_response.get("result", {}) 39 | 40 | self.expire_time = ( 41 | token_response.get("t", 0) 42 | + result.get("expire", result.get("expire_time", 0)) * 1000 43 | ) 44 | self.access_token = result.get("access_token", "") 45 | self.refresh_token = result.get("refresh_token", "") 46 | self.uid = result.get("uid", "") 47 | self.platform_url = result.get("platform_url", "") 48 | 49 | 50 | class TuyaOpenAPI: 51 | """Open Api.
52 | 53 | Typical usage example: 54 | 55 | openapi = TuyaOpenAPI(ENDPOINT, ACCESS_ID, ACCESS_KEY) 56 | """ 57 | 58 | def __init__( 59 | self, 60 | endpoint: str, 61 | access_id: str, 62 | access_secret: str, 63 | auth_type: AuthType = AuthType.SMART_HOME, 64 | lang: str = "en", 65 | ) -> None: 66 | """Init TuyaOpenAPI.""" 67 | self.session = requests.session() 68 | 69 | self.endpoint = endpoint 70 | self.access_id = access_id 71 | self.access_secret = access_secret 72 | self.lang = lang 73 | 74 | self.auth_type = auth_type 75 | if self.auth_type == AuthType.CUSTOM: 76 | self.__login_path = TO_C_CUSTOM_TOKEN_API 77 | else: 78 | self.__login_path = TO_C_SMART_HOME_TOKEN_API 79 | 80 | self.token_info: TuyaTokenInfo = None 81 | 82 | self.dev_channel: str = "" 83 | 84 | self.__username = "" 85 | self.__password = "" 86 | self.__country_code = "" 87 | self.__schema = "" 88 | 89 | # https://developer.tuya.com/docs/iot/open-api/api-reference/singnature?id=Ka43a5mtx1gsc 90 | def _calculate_sign( 91 | self, 92 | method: str, 93 | path: str, 94 | params: dict[str, Any] | None = None, 95 | body: dict[str, Any] | None = None, 96 | ) -> tuple[str, int]: 97 | 98 | # HTTPMethod 99 | str_to_sign = method 100 | str_to_sign += "\n" 101 | 102 | # Content-SHA256 103 | content_to_sha256 = ( 104 | "" if body is None or len(body.keys()) == 0 else json.dumps(body) 105 | ) 106 | 107 | str_to_sign += ( 108 | hashlib.sha256(content_to_sha256.encode("utf8")).hexdigest().lower() 109 | ) 110 | str_to_sign += "\n" 111 | 112 | # Header 113 | str_to_sign += "\n" 114 | 115 | # URL 116 | str_to_sign += path 117 | 118 | if params is not None and len(params.keys()) > 0: 119 | str_to_sign += "?" 120 | 121 | params_keys = sorted(params.keys()) 122 | query_builder = "".join(f"{key}={params[key]}&" for key in params_keys) 123 | str_to_sign += query_builder[:-1] 124 | 125 | # Sign 126 | t = int(time.time() * 1000) 127 | 128 | message = self.access_id 129 | if self.token_info is not None: 130 | message += self.token_info.access_token 131 | message += str(t) + str_to_sign 132 | sign = ( 133 | hmac.new( 134 | self.access_secret.encode("utf8"), 135 | msg=message.encode("utf8"), 136 | digestmod=hashlib.sha256, 137 | ) 138 | .hexdigest() 139 | .upper() 140 | ) 141 | return sign, t 142 | 143 | def __refresh_access_token_if_need(self, path: str): 144 | if self.is_connect() is False: 145 | return 146 | 147 | if path.startswith(self.__login_path): 148 | return 149 | 150 | # should use refresh token? 151 | now = int(time.time() * 1000) 152 | expired_time = self.token_info.expire_time 153 | 154 | if expired_time - 60 * 1000 > now: # 1min 155 | return 156 | 157 | self.token_info.access_token = "" 158 | 159 | if self.auth_type == AuthType.CUSTOM: 160 | response = self.post( 161 | TO_C_CUSTOM_REFRESH_TOKEN_API + self.token_info.refresh_token 162 | ) 163 | else: 164 | response = self.get( 165 | TO_C_SMART_HOME_REFRESH_TOKEN_API + self.token_info.refresh_token 166 | ) 167 | 168 | self.token_info = TuyaTokenInfo(response) 169 | 170 | def set_dev_channel(self, dev_channel: str): 171 | """Set dev channel.""" 172 | self.dev_channel = dev_channel 173 | 174 | def connect( 175 | self, 176 | username: str = "", 177 | password: str = "", 178 | country_code: str = "", 179 | schema: str = "", 180 | ) -> dict[str, Any]: 181 | """Connect to Tuya Cloud. 
182 | 183 | Args: 184 | username (str): user name in to C 185 | password (str): user password in to C 186 | country_code (str): country code in SMART_HOME 187 | schema (str): app schema in SMART_HOME 188 | 189 | Returns: 190 | response: connect response 191 | """ 192 | self.__username = username 193 | self.__password = password 194 | self.__country_code = country_code 195 | self.__schema = schema 196 | 197 | if self.auth_type == AuthType.CUSTOM: 198 | response = self.post( 199 | TO_C_CUSTOM_TOKEN_API, 200 | { 201 | "username": username, 202 | "password": hashlib.sha256(password.encode("utf8")) 203 | .hexdigest() 204 | .lower(), 205 | }, 206 | ) 207 | else: 208 | response = self.post( 209 | TO_C_SMART_HOME_TOKEN_API, 210 | { 211 | "username": username, 212 | "password": hashlib.md5(password.encode("utf8")).hexdigest(), 213 | "country_code": country_code, 214 | "schema": schema, 215 | }, 216 | ) 217 | 218 | if not response["success"]: 219 | return response 220 | 221 | # Cache token info. 222 | self.token_info = TuyaTokenInfo(response) 223 | 224 | return response 225 | 226 | def is_connect(self) -> bool: 227 | """Is connect to tuya cloud.""" 228 | return self.token_info is not None and len(self.token_info.access_token) > 0 229 | 230 | def __request( 231 | self, 232 | method: str, 233 | path: str, 234 | params: dict[str, Any] | None = None, 235 | body: dict[str, Any] | None = None, 236 | ) -> dict[str, Any]: 237 | 238 | self.__refresh_access_token_if_need(path) 239 | 240 | access_token = self.token_info.access_token if self.token_info else "" 241 | sign, t = self._calculate_sign(method, path, params, body) 242 | headers = { 243 | "client_id": self.access_id, 244 | "sign": sign, 245 | "sign_method": "HMAC-SHA256", 246 | "access_token": access_token, 247 | "t": str(t), 248 | "lang": self.lang, 249 | } 250 | 251 | if path == self.__login_path or \ 252 | path.startswith(TO_C_CUSTOM_REFRESH_TOKEN_API) or\ 253 | path.startswith(TO_C_SMART_HOME_REFRESH_TOKEN_API): 254 | headers["dev_lang"] = "python" 255 | headers["dev_version"] = VERSION 256 | headers["dev_channel"] = self.dev_channel 257 | 258 | logger.debug( 259 | f"Request: method = {method}, \ 260 | url = {self.endpoint + path},\ 261 | params = {params},\ 262 | body = {filter_logger(body)},\ 263 | t = {int(time.time()*1000)}" 264 | ) 265 | 266 | response = self.session.request( 267 | method, self.endpoint + path, params=params, json=body, headers=headers 268 | ) 269 | 270 | if response.ok is False: 271 | logger.error( 272 | f"Response error: code={response.status_code}, body={response.body}" 273 | ) 274 | return None 275 | 276 | result = response.json() 277 | 278 | logger.debug( 279 | f"Response: {json.dumps(filter_logger(result), ensure_ascii=False, indent=2)}" 280 | ) 281 | 282 | if result.get("code", -1) == TUYA_ERROR_CODE_TOKEN_INVALID: 283 | self.token_info = None 284 | self.connect( 285 | self.__username, self.__password, self.__country_code, self.__schema 286 | ) 287 | 288 | return result 289 | 290 | def get(self, path: str, params: dict[str, Any] | None = None) -> dict[str, Any]: 291 | """Http Get. 292 | 293 | Requests the server to return specified resources. 294 | 295 | Args: 296 | path (str): api path 297 | params (map): request parameter 298 | 299 | Returns: 300 | response: response body 301 | """ 302 | return self.__request("GET", path, params, None) 303 | 304 | def post(self, path: str, body: dict[str, Any] | None = None) -> dict[str, Any]: 305 | """Http Post. 306 | 307 | Requests the server to update specified resources. 
308 | 309 | Args: 310 | path (str): api path 311 | body (map): request body 312 | 313 | Returns: 314 | response: response body 315 | """ 316 | return self.__request("POST", path, None, body) 317 | 318 | def put(self, path: str, body: dict[str, Any] | None = None) -> dict[str, Any]: 319 | """Http Put. 320 | 321 | Requires the server to perform specified operations. 322 | 323 | Args: 324 | path (str): api path 325 | body (map): request body 326 | 327 | Returns: 328 | response: response body 329 | """ 330 | return self.__request("PUT", path, None, body) 331 | 332 | def delete(self, path: str, params: dict[str, Any] | None = None) -> dict[str, Any]: 333 | """Http Delete. 334 | 335 | Requires the server to delete specified resources. 336 | 337 | Args: 338 | path (str): api path 339 | params (map): request param 340 | 341 | Returns: 342 | response: response body 343 | """ 344 | return self.__request("DELETE", path, params, None) 345 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 
49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. 
You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. (Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright 2022 ScaleVector 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 
204 | -------------------------------------------------------------------------------- /sql_to_weaviate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# SQL database to Weaviate using dlt library\n", 7 | "Example for [Public MySQL Database.](https://docs.rfam.org/en/latest/database.html)" 8 | ], 9 | "metadata": { 10 | "collapsed": false 11 | }, 12 | "id": "b34fe934cb4f242c" 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "outputs": [], 18 | "source": [ 19 | "!pip install -q \"dlt[weaviate]\"" 20 | ], 21 | "metadata": { 22 | "collapsed": false, 23 | "ExecuteTime": { 24 | "start_time": "2023-09-07T16:15:32.762937679Z" 25 | } 26 | }, 27 | "id": "2841cf5886b5df42" 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "source": [ 32 | "Let's init [verified source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/sql_database) `sql_database` with dlt cli command:" 33 | ], 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "id": "f4b452c86009840e" 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "Looking up the init scripts in \u001B[1mhttps://github.com/dlt-hub/verified-sources.git\u001B[0m...\r\n", 48 | "Cloning and configuring a verified source \u001B[1msql_database\u001B[0m (Source that loads tables form any SQLAlchemy supported database, supports batching requests and incremental loads.)\r\n", 49 | "\r\n", 50 | "Verified source \u001B[1msql_database\u001B[0m was added to your project!\r\n", 51 | "* See the usage examples and code snippets to copy from \u001B[1msql_database_pipeline.py\u001B[0m\r\n", 52 | "* Add credentials for \u001B[1mweaviate\u001B[0m and other secrets in \u001B[1m./.dlt/secrets.toml\u001B[0m\r\n", 53 | "* Add the required dependencies to \u001B[1mpyproject.toml\u001B[0m:\r\n", 54 | " \u001B[1msqlalchemy>=1.4\u001B[0m\r\n", 55 | " \u001B[1mdlt[weaviate]<0.4,>=0.3.5\u001B[0m\r\n", 56 | " If the dlt dependency is already added, make sure you install the extra for \u001B[1mweaviate\u001B[0m to it\r\n", 57 | " If you are using poetry you may issue the following command:\r\n", 58 | "\u001B[1m poetry add dlt -E weaviate\u001B[0m\r\n", 59 | "\r\n", 60 | "* Read \u001B[1mhttps://dlthub.com/docs/walkthroughs/create-a-pipeline\u001B[0m for more information\r\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "!dlt --non-interactive init sql_database weaviate " 66 | ], 67 | "metadata": { 68 | "collapsed": false, 69 | "ExecuteTime": { 70 | "end_time": "2023-09-07T16:15:39.112449485Z", 71 | "start_time": "2023-09-07T16:15:35.652268880Z" 72 | } 73 | }, 74 | "id": "82b557c903459372" 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 15, 79 | "outputs": [], 80 | "source": [ 81 | "!pip install -q sqlalchemy" 82 | ], 83 | "metadata": { 84 | "collapsed": false, 85 | "ExecuteTime": { 86 | "start_time": "2023-09-07T16:43:27.653554239Z" 87 | } 88 | }, 89 | "id": "998f9710b6661067" 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 12, 94 | "outputs": [], 95 | "source": [ 96 | "import os\n", 97 | "import weaviate\n", 98 | "\n", 99 | "\n", 100 | "def show_data(class_name, properties):\n", 101 | " client = weaviate.Client(\n", 102 | " url=os.getenv(\"WEAVIATE_URL\"),\n", 103 | " auth_client_secret=weaviate.AuthApiKey(\n", 104 | " api_key=os.getenv(\"WEAVIATE_API_KEY\")\n", 105 | " ),\n", 106 | " 
additional_headers={\n", 107 | " \"X-OpenAI-Api-Key\": os.getenv(\"WEAVIATE_OPENAI_KEY\")\n", 108 | " }\n", 109 | " )\n", 110 | "\n", 111 | " response = (\n", 112 | " client.query\n", 113 | " .get(class_name, properties)\n", 114 | " .do()\n", 115 | " )\n", 116 | " return response" 117 | ], 118 | "metadata": { 119 | "collapsed": false, 120 | "ExecuteTime": { 121 | "end_time": "2023-09-07T16:39:30.455113374Z", 122 | "start_time": "2023-09-07T16:39:30.414665860Z" 123 | } 124 | }, 125 | "id": "f96b7b53f2250ad" 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "source": [ 130 | "Put credentials into `.dlt/secrets.toml` file like this:\n", 131 | "\n", 132 | "```\n", 133 | "[sources.sql_database.credentials]\n", 134 | "drivername = \"mysql+pymysql\" # driver name for the database\n", 135 | "database = \"Rfam\" # database name\n", 136 | "username = \"rfamro\" # username associated with the database\n", 137 | "host = \"mysql-rfam-public.ebi.ac.uk\" # host address\n", 138 | "port = \"4497\" # port required for connection\n", 139 | "```\n", 140 | "\n", 141 | "[More info about credentials.](https://dlthub.com/docs/general-usage/credentials)" 142 | ], 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "id": "b71060e1b9377e70" 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 4, 151 | "outputs": [ 152 | { 153 | "name": "stderr", 154 | "output_type": "stream", 155 | "text": [ 156 | "/home/alenaastrakhantseva/.cache/pypoetry/virtualenvs/weaviate-demo-9BqQS6RD-py3.10/lib/python3.10/site-packages/weaviate/warnings.py:130: DeprecationWarning: Dep006: You are using the `client.batch()` method, which will be removed in the next major release.\n", 157 | " Please instead use the `client.batch.configure()` method to configure your batch and `client.batch` to enter the context manager.\n", 158 | " See https://weaviate.io/developers/weaviate/client-libraries/python for details.\n", 159 | " warnings.warn(\n" 160 | ] 161 | }, 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "Normalized data for the following tables:\n", 167 | "- DltPipelineState: 1 row(s)\n", 168 | "- Family: 4108 row(s)\n", 169 | "\n", 170 | "------\n", 171 | "Pipeline rfam completed in 1 minute and 37.41 seconds\n", 172 | "1 load package(s) were loaded to destination weaviate and into dataset Rfam\n", 173 | "The weaviate destination used https://demo-1-wvxjul5s.weaviate.network location to store data\n", 174 | "Load package 1694104408.537511 is LOADED and contains no failed jobs\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "import dlt\n", 180 | "from dlt.destinations.weaviate import weaviate_adapter\n", 181 | "\n", 182 | "from sql_database import sql_table\n", 183 | "\n", 184 | "pipeline = dlt.pipeline(\n", 185 | " pipeline_name=\"rfam\", destination='weaviate', dataset_name=\"rfam\"\n", 186 | ")\n", 187 | "\n", 188 | "load_source = sql_table(table=\"family\",)\n", 189 | "load_info = pipeline.run(weaviate_adapter(load_source, vectorize=\"description\", tokenization={\"description\": \"word\"}))\n", 190 | "\n", 191 | "# pretty print the information on data that was loaded\n", 192 | "row_counts = pipeline.last_trace.last_normalize_info\n", 193 | "print(row_counts)\n", 194 | "print(\"------\")\n", 195 | "print(load_info)" 196 | ], 197 | "metadata": { 198 | "collapsed": false, 199 | "ExecuteTime": { 200 | "end_time": "2023-09-07T16:35:03.547036147Z", 201 | "start_time": "2023-09-07T16:33:25.356346391Z" 202 | } 203 | }, 204 | "id": "3972d77c699a500a" 205 | }, 206 | { 207 | 
"cell_type": "code", 208 | "execution_count": 14, 209 | "outputs": [ 210 | { 211 | "data": { 212 | "text/plain": "{'data': {'Get': {'Rfam_Family': [{'description': 'CDKN2B antisense RNA 1 intronic convserved region'},\n {'description': 'microRNA mir-605'},\n {'description': 'mir-974 microRNA precursor family'},\n {'description': 'microRNA mir-633'},\n {'description': 'microRNA mir-569'},\n {'description': 'mir-6715 microRNA precursor family'},\n {'description': 'Small nucleolar RNA Z103'},\n {'description': 'Small nucleolar RNA SNORD70'},\n {'description': 'mir-5856 microRNA precursor family'},\n {'description': 'ctRNA'},\n {'description': 'MIR4245 microRNA precursor family'},\n {'description': 'mir-2068 microRNA precursor family'},\n {'description': 'mir-5890 microRNA precursor family'},\n {'description': 'Leptospira sRNA 30_255'},\n {'description': 'Pospiviroid RY motif stem loop'},\n {'description': 'TeloSII non coding RNA 45'},\n {'description': 'MIR2871 microRNA precursor family'},\n {'description': 'mir-1017 microRNA precursor family'},\n {'description': 'Rickettsia rpsL leader'},\n {'description': 'microRNA mir-70'},\n {'description': 'MIR1882 microRNA precursor family'},\n {'description': 'L31-Coriobacteria ribosomal protein leader'},\n {'description': 'Small nucleolar RNA snoR60'},\n {'description': 'Brucella sRNA 1350'},\n {'description': 'mir-3121 microRNA precursor family'},\n {'description': 'microRNA MIR1122'},\n {'description': 'Gag/pro ribosomal frameshift site'},\n {'description': 'RT-3 RNA'},\n {'description': 'mir-2513 microRNA precursor family'},\n {'description': 'Rhodo-rpoB RNA'},\n {'description': 'Antisense to pHK01_099'},\n {'description': 'SPRY4-IT1 conserved region 2'},\n {'description': 'Small nucleolar RNA snR44'},\n {'description': 'Fluoride riboswitch (crcB)'},\n {'description': 'mir-156 microRNA precursor'},\n {'description': 'mir-4773 microRNA precursor family'},\n {'description': 'mir-1397 microRNA precursor family'},\n {'description': 'mir-3047 microRNA precursor family'},\n {'description': 'Non-coding RNA BC040587'},\n {'description': 'TeloSII non coding RNA 30'},\n {'description': 'microRNA mir-636'},\n {'description': 'microRNA mir-214'},\n {'description': \"Flavivirus 5' UTR\"},\n {'description': 'mir-561 microRNA precursor family'},\n {'description': 'Burkholderia sRNA 37'},\n {'description': 'RAGATH-6 RNA'},\n {'description': 'Small nucleolar RNA ZL1'},\n {'description': 'osmY RNA'},\n {'description': \"AilA 5' UTR thermometer\"},\n {'description': 'SMAD5 antisense RNA 1 conserved region 3'},\n {'description': 'Fst antitoxin sRNA'},\n {'description': 'DUF3800-XI RNA'},\n {'description': 'Caenorhabditis snoRNA ceN46'},\n {'description': 'FTX transcript, XIST regulator conserved region 4'},\n {'description': 'Streptococcus sRNA SpF11'},\n {'description': 'mir-15_2 microRNA precursor family'},\n {'description': 'microRNA mir-2518'},\n {'description': 'mir-3347 microRNA precursor family'},\n {'description': \"Insect-specific Flavivirus 3' UTR cis-acting replication element (CRE)\"},\n {'description': 'Burkholderia sRNA Bp1_Cand612_SIPHT'},\n {'description': 'mir-2235 microRNA precursor family'},\n {'description': 'Listeria sRNA rliB'},\n {'description': 'mir-3618 microRNA precursor family'},\n {'description': 'Salmonella enterica conserved region STnc30'},\n {'description': 'mir-4526 microRNA precursor family'},\n {'description': 'S. 
pyogenes small RNA 1186876'},\n {'description': 'mir-3064 microRNA precursor family'},\n {'description': 'Fungal small nucleolar RNA U3'},\n {'description': 'mir-3204 microRNA precursor family'},\n {'description': 'mir-5928 microRNA precursor family'},\n {'description': 'Small nucleolar RNA U18'},\n {'description': 'Small nucleolar RNA sR48'},\n {'description': 'RAGATH-15 RNA'},\n {'description': 'Streptomyces sRNA 4677'},\n {'description': 'mir-4079 microRNA precursor family'},\n {'description': 'microRNA mir-58'},\n {'description': 'Small nucleolar RNA SNORD60'},\n {'description': 'Listeria sRNA rli43'},\n {'description': 'RAGATH-13 RNA'},\n {'description': 'Plasmodium RNA of unkown function RU6-F3'},\n {'description': 'mir-2059 microRNA precursor family'},\n {'description': 'Small nucleolar RNA ACA59'},\n {'description': 'microRNA mir-328'},\n {'description': 'mir-2278 microRNA precursor family'},\n {'description': 'microRNA mir-330'},\n {'description': 'Pseudomonas sRNA P34'},\n {'description': 'Small nucleolar RNA Z39'},\n {'description': 'Small nucleolar RNA SNORA15'},\n {'description': 'Small nucleolar RNA snoR144'},\n {'description': 'Vibrio alginolyticus sRNA 907'},\n {'description': 'Deleted in lymphocytic leukemia 2 conserved region 3'},\n {'description': 'mir-1822 microRNA precursor family'},\n {'description': 'MIR7725 microRNA precursor family'},\n {'description': 'mir-6129 microRNA precursor family'},\n {'description': 'mir-2767 microRNA precursor family'},\n {'description': 'CRISPR RNA direct repeat element'},\n {'description': 'S15-Flavobacteria ribosomal protein leader'},\n {'description': 'Actinomyces-1 RNA'},\n {'description': 'Small Cajal body specific RNA ncR21'},\n {'description': 'Z30 small nucleolar RNA'}]}}}" 213 | }, 214 | "execution_count": 14, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "# class_name: table name \"Family\" + dataset name \"Rfam\"\n", 221 | "show_data(\"Rfam_Family\", [\"description\"])" 222 | ], 223 | "metadata": { 224 | "collapsed": false, 225 | "ExecuteTime": { 226 | "end_time": "2023-09-07T16:39:43.839817411Z", 227 | "start_time": "2023-09-07T16:39:42.788097606Z" 228 | } 229 | }, 230 | "id": "942793b835309409" 231 | } 232 | ], 233 | "metadata": { 234 | "kernelspec": { 235 | "display_name": "Python 3", 236 | "language": "python", 237 | "name": "python3" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 2 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython2", 249 | "version": "2.7.6" 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 5 254 | } 255 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/rest_api/__init__.py: -------------------------------------------------------------------------------- 1 | """Generic API Source""" 2 | 3 | from typing import ( 4 | Type, 5 | Any, 6 | Dict, 7 | List, 8 | Optional, 9 | Generator, 10 | Callable, 11 | cast, 12 | ) 13 | import graphlib # type: ignore[import,unused-ignore] 14 | 15 | import dlt 16 | from dlt.common.validation import validate_dict 17 | from dlt.common import jsonpath 18 | from dlt.common.schema.schema import Schema 19 | from dlt.common.schema.typing import TSchemaContract 20 | from dlt.common.configuration.specs import BaseConfiguration 21 | 22 | from dlt.extract.incremental import 
Incremental 23 | from dlt.extract.source import DltResource, DltSource 24 | 25 | from dlt.sources.helpers.rest_client import RESTClient 26 | from dlt.sources.helpers.rest_client.paginators import BasePaginator 27 | from dlt.sources.helpers.rest_client.typing import HTTPMethodBasic 28 | from .typing import ( 29 | ClientConfig, 30 | ResolvedParam, 31 | Endpoint, 32 | EndpointResource, 33 | RESTAPIConfig, 34 | ) 35 | from .config_setup import ( 36 | IncrementalParam, 37 | create_auth, 38 | create_paginator, 39 | build_resource_dependency_graph, 40 | process_parent_data_item, 41 | setup_incremental_object, 42 | create_response_hooks, 43 | ) 44 | from .utils import check_connection, exclude_keys # noqa: F401 45 | 46 | 47 | def rest_api_source( 48 | config: RESTAPIConfig, 49 | name: str = None, 50 | section: str = None, 51 | max_table_nesting: int = None, 52 | root_key: bool = False, 53 | schema: Schema = None, 54 | schema_contract: TSchemaContract = None, 55 | spec: Type[BaseConfiguration] = None, 56 | ) -> DltSource: 57 | """Creates and configures a REST API source for data extraction. 58 | 59 | Args: 60 | config (RESTAPIConfig): Configuration for the REST API source. 61 | name (str, optional): Name of the source. 62 | section (str, optional): Section of the configuration file. 63 | max_table_nesting (int, optional): Maximum depth of nested table above which 64 | the remaining nodes are loaded as structs or JSON. 65 | root_key (bool, optional): Enables merging on all resources by propagating 66 | root foreign key to child tables. This option is most useful if you 67 | plan to change write disposition of a resource to disable/enable merge. 68 | Defaults to False. 69 | schema (Schema, optional): An explicit `Schema` instance to be associated 70 | with the source. If not present, `dlt` creates a new `Schema` object 71 | with provided `name`. If such `Schema` already exists in the same 72 | folder as the module containing the decorated function, such schema 73 | will be loaded from file. 74 | schema_contract (TSchemaContract, optional): Schema contract settings 75 | that will be applied to this resource. 76 | spec (Type[BaseConfiguration], optional): A specification of configuration 77 | and secret values required by the source. 78 | 79 | Returns: 80 | DltSource: A configured dlt source. 81 | 82 | Example: 83 | pokemon_source = rest_api_source({ 84 | "client": { 85 | "base_url": "https://pokeapi.co/api/v2/", 86 | "paginator": "json_response", 87 | }, 88 | "endpoints": { 89 | "pokemon": { 90 | "params": { 91 | "limit": 100, # Default page size is 20 92 | }, 93 | "resource": { 94 | "primary_key": "id", 95 | } 96 | }, 97 | }, 98 | }) 99 | """ 100 | decorated = dlt.source( 101 | rest_api_resources, 102 | name, 103 | section, 104 | max_table_nesting, 105 | root_key, 106 | schema, 107 | schema_contract, 108 | spec, 109 | ) 110 | 111 | return decorated(config) 112 | 113 | 114 | def rest_api_resources(config: RESTAPIConfig) -> List[DltResource]: 115 | """Creates a list of resources from a REST API configuration. 116 | 117 | Args: 118 | config (RESTAPIConfig): Configuration for the REST API source. 119 | 120 | Returns: 121 | List[DltResource]: List of dlt resources. 
122 | 123 | Example: 124 | github_source = rest_api_resources({ 125 | "client": { 126 | "base_url": "https://api.github.com/repos/dlt-hub/dlt/", 127 | "auth": { 128 | "token": dlt.secrets["token"], 129 | }, 130 | }, 131 | "resource_defaults": { 132 | "primary_key": "id", 133 | "write_disposition": "merge", 134 | "endpoint": { 135 | "params": { 136 | "per_page": 100, 137 | }, 138 | }, 139 | }, 140 | "resources": [ 141 | { 142 | "name": "issues", 143 | "endpoint": { 144 | "path": "issues", 145 | "params": { 146 | "sort": "updated", 147 | "direction": "desc", 148 | "state": "open", 149 | "since": { 150 | "type": "incremental", 151 | "cursor_path": "updated_at", 152 | "initial_value": "2024-01-25T11:21:28Z", 153 | }, 154 | }, 155 | }, 156 | }, 157 | { 158 | "name": "issue_comments", 159 | "endpoint": { 160 | "path": "issues/{issue_number}/comments", 161 | "params": { 162 | "issue_number": { 163 | "type": "resolve", 164 | "resource": "issues", 165 | "field": "number", 166 | } 167 | }, 168 | }, 169 | }, 170 | ], 171 | }) 172 | """ 173 | 174 | validate_dict(RESTAPIConfig, config, path="") 175 | 176 | client_config = config["client"] 177 | resource_defaults = config.get("resource_defaults", {}) 178 | resource_list = config["resources"] 179 | 180 | ( 181 | dependency_graph, 182 | endpoint_resource_map, 183 | resolved_param_map, 184 | ) = build_resource_dependency_graph( 185 | resource_defaults, 186 | resource_list, 187 | ) 188 | 189 | resources = create_resources( 190 | client_config, 191 | dependency_graph, 192 | endpoint_resource_map, 193 | resolved_param_map, 194 | ) 195 | 196 | return list(resources.values()) 197 | 198 | 199 | def create_resources( 200 | client_config: ClientConfig, 201 | dependency_graph: graphlib.TopologicalSorter, 202 | endpoint_resource_map: Dict[str, EndpointResource], 203 | resolved_param_map: Dict[str, Optional[ResolvedParam]], 204 | ) -> Dict[str, DltResource]: 205 | resources = {} 206 | 207 | for resource_name in dependency_graph.static_order(): 208 | resource_name = cast(str, resource_name) 209 | endpoint_resource = endpoint_resource_map[resource_name] 210 | endpoint_config = cast(Endpoint, endpoint_resource["endpoint"]) 211 | request_params = endpoint_config.get("params", {}) 212 | request_json = endpoint_config.get("json", None) 213 | paginator = create_paginator(endpoint_config.get("paginator")) 214 | 215 | resolved_param: ResolvedParam = resolved_param_map[resource_name] 216 | 217 | include_from_parent: List[str] = endpoint_resource.get( 218 | "include_from_parent", [] 219 | ) 220 | if not resolved_param and include_from_parent: 221 | raise ValueError( 222 | f"Resource {resource_name} has include_from_parent but is not " 223 | "dependent on another resource" 224 | ) 225 | 226 | ( 227 | incremental_object, 228 | incremental_param, 229 | ) = setup_incremental_object(request_params, endpoint_config.get("incremental")) 230 | 231 | client = RESTClient( 232 | base_url=client_config["base_url"], 233 | headers=client_config.get("headers"), 234 | auth=create_auth(client_config.get("auth")), 235 | paginator=create_paginator(client_config.get("paginator")), 236 | ) 237 | 238 | hooks = create_response_hooks(endpoint_config.get("response_actions")) 239 | 240 | resource_kwargs = exclude_keys( 241 | endpoint_resource, {"endpoint", "include_from_parent"} 242 | ) 243 | 244 | if resolved_param is None: 245 | 246 | def paginate_resource( 247 | method: HTTPMethodBasic, 248 | path: str, 249 | params: Dict[str, Any], 250 | json: Optional[Dict[str, Any]], 251 | paginator: 
Optional[BasePaginator], 252 | data_selector: Optional[jsonpath.TJsonPath], 253 | hooks: Optional[Dict[str, Any]], 254 | client: RESTClient = client, 255 | incremental_object: Optional[Incremental[Any]] = incremental_object, 256 | incremental_param: IncrementalParam = incremental_param, 257 | ) -> Generator[Any, None, None]: 258 | if incremental_object: 259 | params[incremental_param.start] = incremental_object.last_value 260 | if incremental_param.end: 261 | params[incremental_param.end] = incremental_object.end_value 262 | 263 | yield from client.paginate( 264 | method=method, 265 | path=path, 266 | params=params, 267 | json=json, 268 | paginator=paginator, 269 | data_selector=data_selector, 270 | hooks=hooks, 271 | ) 272 | 273 | resources[resource_name] = dlt.resource( 274 | paginate_resource, 275 | **resource_kwargs, # TODO: implement typing.Unpack 276 | )( 277 | method=endpoint_config.get("method", "get"), 278 | path=endpoint_config.get("path"), 279 | params=request_params, 280 | json=request_json, 281 | paginator=paginator, 282 | data_selector=endpoint_config.get("data_selector"), 283 | hooks=hooks, 284 | ) 285 | 286 | else: 287 | predecessor = resources[resolved_param.resolve_config["resource"]] 288 | 289 | base_params = exclude_keys(request_params, {resolved_param.param_name}) 290 | 291 | def paginate_dependent_resource( 292 | items: List[Dict[str, Any]], 293 | method: HTTPMethodBasic, 294 | path: str, 295 | params: Dict[str, Any], 296 | paginator: Optional[BasePaginator], 297 | data_selector: Optional[jsonpath.TJsonPath], 298 | hooks: Optional[Dict[str, Any]], 299 | client: RESTClient = client, 300 | resolved_param: ResolvedParam = resolved_param, 301 | include_from_parent: List[str] = include_from_parent, 302 | incremental_object: Optional[Incremental[Any]] = incremental_object, 303 | incremental_param: IncrementalParam = incremental_param, 304 | ) -> Generator[Any, None, None]: 305 | if incremental_object: 306 | params[incremental_param.start] = incremental_object.last_value 307 | if incremental_param.end: 308 | params[incremental_param.end] = incremental_object.end_value 309 | 310 | for item in items: 311 | formatted_path, parent_record = process_parent_data_item( 312 | path, item, resolved_param, include_from_parent 313 | ) 314 | 315 | for child_page in client.paginate( 316 | method=method, 317 | path=formatted_path, 318 | params=params, 319 | paginator=paginator, 320 | data_selector=data_selector, 321 | hooks=hooks, 322 | ): 323 | if parent_record: 324 | for child_record in child_page: 325 | child_record.update(parent_record) 326 | yield child_page 327 | 328 | resources[resource_name] = dlt.resource( # type: ignore[call-overload] 329 | paginate_dependent_resource, 330 | data_from=predecessor, 331 | **resource_kwargs, # TODO: implement typing.Unpack 332 | )( 333 | method=endpoint_config.get("method", "get"), 334 | path=endpoint_config.get("path"), 335 | params=base_params, 336 | paginator=paginator, 337 | data_selector=endpoint_config.get("data_selector"), 338 | hooks=hooks, 339 | ) 340 | 341 | return resources 342 | 343 | 344 | # XXX: This is a workaround pass test_dlt_init.py 345 | # since the source uses dlt.source as a function 346 | def _register_source(source_func: Callable[..., DltSource]) -> None: 347 | import inspect 348 | from dlt.common.configuration import get_fun_spec 349 | from dlt.common.source import _SOURCES, SourceInfo 350 | 351 | spec = get_fun_spec(source_func) 352 | func_module = inspect.getmodule(source_func) 353 | _SOURCES[source_func.__name__] = 
SourceInfo( 354 | SPEC=spec, 355 | f=source_func, 356 | module=func_module, 357 | ) 358 | 359 | 360 | _register_source(rest_api_source) 361 | --------------------------------------------------------------------------------