├── dlt_restack_demo ├── __init__.py ├── restack-app │ ├── __init__.py │ ├── src │ │ ├── __init__.py │ │ ├── functions │ │ │ ├── __init__.py │ │ │ ├── vector_search.py │ │ │ └── dlt_to_weaviate.py │ │ ├── workflows │ │ │ ├── __init__.py │ │ │ └── workflow.py │ │ ├── client.py │ │ └── services.py │ ├── .gitignore │ ├── .dlt │ │ ├── config.toml │ │ └── example.secrets.toml │ ├── .env.Example │ ├── Dockerfile │ ├── schedule_workflow.py │ └── pyproject.toml ├── img │ ├── UI.png │ ├── run.png │ ├── run2.png │ ├── results.png │ ├── results2.png │ └── incremental.png └── README.md ├── iceberg-tabular ├── __init__.py ├── move_data_to_tabular.py ├── requirements.txt ├── .gitignore ├── .dlt │ ├── config.toml │ └── example.secrets.toml ├── github_pipeline.py └── README.md ├── dlt-init-openapi-demo ├── __init__.py ├── stripe_pipeline │ ├── __init__.py │ ├── requirements.txt │ ├── rest_api │ │ ├── requirements.txt │ │ ├── exceptions.py │ │ ├── utils.py │ │ ├── typing.py │ │ ├── README.md │ │ └── __init__.py │ ├── .gitignore │ ├── .dlt │ │ ├── config.toml │ │ └── example.secrets.toml │ └── stripe_pipeline.py └── README.md ├── coinpaprika-to-postgresql ├── requirements.txt ├── example_api_responses │ ├── coin_list.json │ ├── coin_exchanges.json │ ├── coin_ohlc.json │ └── coin_details.json ├── .dlt │ ├── config.toml │ └── example.secrets.toml ├── .gitignore ├── dlt_pipeline.py ├── dlt_pipeline_merged.py └── README.md ├── dlt-dbt-cloud ├── pipeline │ ├── requirements.txt │ ├── pokemon │ │ ├── helpers.py │ │ ├── settings.py │ │ └── __init__.py │ ├── .gitignore │ ├── .dlt │ │ ├── config.toml │ │ └── .sources │ └── pokemon_pipeline.py ├── models │ ├── example │ │ ├── my_first_dbt_model.sql │ │ ├── my_second_dbt_model.sql │ │ └── schema.yml │ └── source.yml ├── dbt_project.yml ├── README.md └── .gitignore ├── scraping-source ├── requirements.txt ├── scraping │ ├── diagram.png │ ├── types.py │ ├── settings.py │ ├── __init__.py │ ├── queue.py │ ├── helpers.py │ └── runner.py ├── .gitignore ├── .dlt │ └── config.toml ├── README.md └── scraping_pipeline.py ├── pyladies-2024-demo ├── test.json ├── getting-started.py ├── load_from_database.py ├── load_from_json.py ├── github_pipeline.py ├── poke_pipeline.py └── README.md ├── dlt-dagster-snowflake ├── pyproject.toml ├── charts │ ├── google_trends_over_time.png │ └── hacker_news_sentiment_counts.png ├── requirements.txt ├── .gitignore ├── .dlt │ ├── config.toml │ └── example.secrets.toml ├── dlt_dagster_snowflake_demo │ ├── resources │ │ └── __init__.py │ ├── __init__.py │ ├── assets │ │ └── __init__.py │ └── dlt │ │ └── __init__.py └── README.md ├── images └── dlt-high-level.png ├── sengled-plug-demo ├── tuya_helpers │ ├── version.py │ ├── __init__.py │ ├── tuya_enums.py │ ├── openlogging.py │ └── openapi.py ├── env.py ├── README.md └── main.py ├── test-data └── invoice_1.pdf ├── secrets-providers-demo ├── .dlt │ ├── example.secrets.toml │ └── config.toml ├── README.md └── dlt_with_google_secrets_pipeline.py ├── .gitignore ├── .dlt └── config.toml ├── README.md ├── LICENSE.txt └── sql_to_weaviate.ipynb /dlt_restack_demo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /iceberg-tabular/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /iceberg-tabular/move_data_to_tabular.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/src/functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/src/workflows/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /iceberg-tabular/requirements.txt: -------------------------------------------------------------------------------- 1 | dlt[athena]<0.4,>=0.3.5 -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/requirements.txt: -------------------------------------------------------------------------------- 1 | dlt[postgres]>=0.4.2 -------------------------------------------------------------------------------- /dlt-dbt-cloud/pipeline/requirements.txt: -------------------------------------------------------------------------------- 1 | dlt[bigquery]<0.4,>=0.3.5 -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/requirements.txt: -------------------------------------------------------------------------------- 1 | dlt>=0.4.12 -------------------------------------------------------------------------------- /scraping-source/requirements.txt: -------------------------------------------------------------------------------- 1 | dlt[duckdb]>=0.4.5 2 | scrapy>=2.11.0 -------------------------------------------------------------------------------- /dlt-dbt-cloud/pipeline/pokemon/helpers.py: -------------------------------------------------------------------------------- 1 | """Pokemon pipeline helpers""" 2 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/rest_api/requirements.txt: -------------------------------------------------------------------------------- 1 | dlt>=0.4.11 -------------------------------------------------------------------------------- /pyladies-2024-demo/test.json: -------------------------------------------------------------------------------- 1 | {"id": 1, "name": "Alice", "children": {"id": 1, "name": "Eve"}} -------------------------------------------------------------------------------- /dlt-dagster-snowflake/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.dagster] 2 | module_name = 
"dlt-dagster-snowflake-demo" 3 | -------------------------------------------------------------------------------- /images/dlt-high-level.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/images/dlt-high-level.png -------------------------------------------------------------------------------- /sengled-plug-demo/tuya_helpers/version.py: -------------------------------------------------------------------------------- 1 | """tuya_iot version.""" 2 | 3 | VERSION = "0.6.6" 4 | 5 | -------------------------------------------------------------------------------- /test-data/invoice_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/test-data/invoice_1.pdf -------------------------------------------------------------------------------- /dlt_restack_demo/img/UI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/dlt_restack_demo/img/UI.png -------------------------------------------------------------------------------- /dlt_restack_demo/img/run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/dlt_restack_demo/img/run.png -------------------------------------------------------------------------------- /dlt_restack_demo/img/run2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/dlt_restack_demo/img/run2.png -------------------------------------------------------------------------------- /dlt_restack_demo/img/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/dlt_restack_demo/img/results.png -------------------------------------------------------------------------------- /dlt_restack_demo/img/results2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/dlt_restack_demo/img/results2.png -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .pytest_cache 3 | venv 4 | .env 5 | .vscode 6 | poetry.lock 7 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/.gitignore: -------------------------------------------------------------------------------- 1 | # local duckdb files 2 | *.duckdb 3 | # pipeline secrets 4 | secrets.toml 5 | -------------------------------------------------------------------------------- /dlt_restack_demo/img/incremental.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/dlt_restack_demo/img/incremental.png -------------------------------------------------------------------------------- /scraping-source/scraping/diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/scraping-source/scraping/diagram.png -------------------------------------------------------------------------------- 
/dlt_restack_demo/restack-app/.dlt/config.toml: -------------------------------------------------------------------------------- 1 | [destination.weaviate] 2 | module_config={text2vec-openai = {}, generative-openai = {}} -------------------------------------------------------------------------------- /dlt-dagster-snowflake/charts/google_trends_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/dlt-dagster-snowflake/charts/google_trends_over_time.png -------------------------------------------------------------------------------- /dlt-dagster-snowflake/charts/hacker_news_sentiment_counts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlt-hub/dlt_demos/HEAD/dlt-dagster-snowflake/charts/hacker_news_sentiment_counts.png -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/.dlt/example.secrets.toml: -------------------------------------------------------------------------------- 1 | [destination.weaviate.credentials.additional_headers] 2 | X-OpenAI-Api-Key = "..." 3 | 4 | [openai] 5 | api_key = "..." -------------------------------------------------------------------------------- /dlt-dbt-cloud/models/example/my_first_dbt_model.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="table") }} 2 | 3 | 4 | select * 5 | from {{ source("pokemon_data", "pokemon") }} 6 | limit 7 | 1000 8 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/.dlt/config.toml: -------------------------------------------------------------------------------- 1 | [runtime] 2 | log_level="INFO" 3 | 4 | [sources.stripe] 5 | # Base URL for the API 6 | base_url = "https://api.stripe.com/" 7 | 8 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/pipeline/pokemon/settings.py: -------------------------------------------------------------------------------- 1 | """Pokemon Pipeline settings and constants""" 2 | 3 | BERRY_URL = "https://pokeapi.co/api/v2/berry" 4 | POKEMON_URL = "https://pokeapi.co/api/v2/pokemon/" 5 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/models/example/my_second_dbt_model.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="table") }} 2 | 3 | with two_pokemon as (select name from {{ ref('my_first_dbt_model') }} limit 2) 4 | select * 5 | from two_pokemon -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/rest_api/exceptions.py: -------------------------------------------------------------------------------- 1 | from dlt.common.exceptions import DltException 2 | 3 | 4 | class RestApiException(DltException): 5 | pass 6 | 7 | 8 | # class Paginator 9 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/models/source.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: pokemon_data 5 | database: dlt-dev-external 6 | schema: pokemon_data 7 | tables: 8 | - name: pokemon 9 | - name: berries -------------------------------------------------------------------------------- /dlt-dbt-cloud/models/example/schema.yml: 
-------------------------------------------------------------------------------- 1 | 2 | version: 2 3 | 4 | models: 5 | - name: my_first_dbt_model 6 | description: "A starter dbt model" 7 | 8 | - name: my_second_dbt_model 9 | description: "the second dbt model" -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/.dlt/example.secrets.toml: -------------------------------------------------------------------------------- 1 | [sources.stripe] 2 | # secrets for your stripe source 3 | username = "FILL ME OUT" # TODO: fill in your credentials 4 | password = "FILL ME OUT" # TODO: fill in your credentials 5 | -------------------------------------------------------------------------------- /secrets-providers-demo/.dlt/example.secrets.toml: -------------------------------------------------------------------------------- 1 | [google_secrets.credentials] 2 | "project_id" = "" 3 | "private_key" = "-----BEGIN PRIVATE KEY-----\n....\n-----END PRIVATE KEY-----\n" 4 | "client_email" = "....gserviceaccount.com" 5 | -------------------------------------------------------------------------------- /dlt-dagster-snowflake/requirements.txt: -------------------------------------------------------------------------------- 1 | dlt[duckdb]>=0.3.5 2 | dagster 3 | snowflake-connector-python[pandas] 4 | dagster-snowflake-pandas 5 | dagster-snowflake 6 | pandas 7 | matplotlib 8 | pytrends 9 | openai 10 | dagster-webserver 11 | toml -------------------------------------------------------------------------------- /sengled-plug-demo/env.py: -------------------------------------------------------------------------------- 1 | # online 2 | ACCESS_ID = '' 3 | ACCESS_KEY = '' 4 | USERNAME = '' 5 | PASSWORD = '' 6 | DEVICE_ID = '' 7 | ENDPOINT = "https://openapi.tuya.com" 8 | -------------------------------------------------------------------------------- /iceberg-tabular/.gitignore: -------------------------------------------------------------------------------- 1 | # ignore secrets, virtual environments and typical python compilation artifacts 2 | secrets.toml 3 | # ignore basic python artifacts 4 | .env 5 | **/__pycache__/ 6 | **/*.py[cod] 7 | **/*$py.class 8 | # ignore duckdb 9 | *.duckdb 10 | *.wal -------------------------------------------------------------------------------- /scraping-source/.gitignore: -------------------------------------------------------------------------------- 1 | # ignore secrets, virtual environments and typical python compilation artifacts 2 | secrets.toml 3 | # ignore basic python artifacts 4 | .env 5 | **/__pycache__/ 6 | **/*.py[cod] 7 | **/*$py.class 8 | # ignore duckdb 9 | *.duckdb 10 | *.wal -------------------------------------------------------------------------------- /dlt-dagster-snowflake/.gitignore: -------------------------------------------------------------------------------- 1 | # ignore secrets, virtual environments and typical python compilation artifacts 2 | secrets.toml 3 | # ignore basic python artifacts 4 | .env 5 | **/__pycache__/ 6 | **/*.py[cod] 7 | **/*$py.class 8 | # ignore duckdb 9 | *.duckdb 10 | *.wal -------------------------------------------------------------------------------- /dlt-dbt-cloud/pipeline/.gitignore: -------------------------------------------------------------------------------- 1 | # ignore secrets, virtual environments and typical python compilation artifacts 2 | secrets.toml 3 | # ignore basic python artifacts 4 | .env 5 | **/__pycache__/ 6 | **/*.py[cod] 7 | **/*$py.class 8 | # 
ignore duckdb 9 | *.duckdb 10 | *.wal -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/example_api_responses/coin_list.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "btc-bitcoin", 4 | "name": "Bitcoin", 5 | "symbol": "BTC", 6 | "rank": 1, 7 | "is_new": false, 8 | "is_active": true, 9 | "type": "coin" 10 | } 11 | ] -------------------------------------------------------------------------------- /iceberg-tabular/.dlt/config.toml: -------------------------------------------------------------------------------- 1 | # put your configuration values here 2 | 3 | [runtime] 4 | log_level="WARNING" # the system log level of dlt 5 | # use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry 6 | dlthub_telemetry = true 7 | -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/example_api_responses/coin_exchanges.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "binance", 4 | "name": "Binance", 5 | "fiats": [ 6 | { 7 | "name": "US Dollars", 8 | "symbol": "USD" 9 | } 10 | ], 11 | "adjusted_volume_24h_share": 11.26 12 | } 13 | ] -------------------------------------------------------------------------------- /dlt-dagster-snowflake/.dlt/config.toml: -------------------------------------------------------------------------------- 1 | # put your configuration values here 2 | 3 | [runtime] 4 | log_level="WARNING" # the system log level of dlt 5 | # use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry 6 | dlthub_telemetry = true 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore secrets, virtual environments and typical python compilation artifacts 2 | secrets.toml 3 | # ignore basic python artifacts 4 | .env 5 | **/__pycache__/ 6 | **/*.py[cod] 7 | **/*$py.class 8 | # ignore duckdb 9 | *.duckdb 10 | *.wal 11 | .DS_Store 12 | _storage 13 | test-data 14 | data -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/.dlt/config.toml: -------------------------------------------------------------------------------- 1 | # put your configuration values here 2 | 3 | [runtime] 4 | log_level="WARNING" # the system log level of dlt 5 | # use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry 6 | dlthub_telemetry = true 7 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/pipeline/.dlt/config.toml: -------------------------------------------------------------------------------- 1 | # put your configuration values here 2 | 3 | [runtime] 4 | log_level="WARNING" # the system log level of dlt 5 | # use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry 6 | dlthub_telemetry = true 7 | -------------------------------------------------------------------------------- /scraping-source/scraping/types.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | from typing_extensions import ParamSpec 4 | 5 | AnyDict = t.Dict[str, t.Any] 6 | 7 | P = 
ParamSpec("P") 8 | 9 | 10 | class Runnable(t.Protocol): 11 | def run(self, *args: P.args, **kwargs: P.kwargs) -> t.Any: 12 | pass 13 | -------------------------------------------------------------------------------- /secrets-providers-demo/.dlt/config.toml: -------------------------------------------------------------------------------- 1 | # put your configuration values here 2 | 3 | [runtime] 4 | log_level="WARNING" # the system log level of dlt 5 | # use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry 6 | dlthub_telemetry = true 7 | -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/.gitignore: -------------------------------------------------------------------------------- 1 | # ignore secrets, virtual environments and typical python compilation artifacts 2 | #secrets.toml 3 | # ignore basic python artifacts 4 | .env 5 | **/__pycache__/ 6 | **/*.py[cod] 7 | **/*$py.class 8 | # ignore duckdb 9 | *.duckdb 10 | *.wal 11 | test_data.json 12 | secrets.toml -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/.env.Example: -------------------------------------------------------------------------------- 1 | # Restack Cloud (Optional) 2 | 3 | # RESTACK_ENGINE_ID= 4 | # RESTACK_ENGINE_API_KEY= 5 | # RESTACK_ENGINE_API_ADDRESS= 6 | # RESTACK_ENGINE_ADDRESS= 7 | # RESTACK_CLOUD_TOKEN= 8 | -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/example_api_responses/coin_ohlc.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "time_open": "2018-03-01T00:00:00Z", 4 | "time_close": "2018-03-01T23:59:59Z", 5 | "open": 856.012, 6 | "high": 880.302, 7 | "low": 851.92, 8 | "close": 872.2, 9 | "volume": 1868520000, 10 | "market_cap": 83808161204 11 | } 12 | ] -------------------------------------------------------------------------------- /pyladies-2024-demo/getting-started.py: -------------------------------------------------------------------------------- 1 | import dlt 2 | 3 | 4 | data = [ 5 | {'id': 1, 'name': 'Alice'}, 6 | {'id': 2, 'name': 'Bob'} 7 | ] 8 | 9 | pipeline = dlt.pipeline( 10 | pipeline_name='quick_start', 11 | destination='duckdb', 12 | dataset_name='mydata', 13 | dev_mode=True, 14 | ) 15 | load_info = pipeline.run(data, table_name="users") 16 | print(load_info) -------------------------------------------------------------------------------- /sengled-plug-demo/tuya_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | from .openapi import TuyaOpenAPI, TuyaTokenInfo 2 | from .openlogging import TUYA_LOGGER 3 | from .tuya_enums import AuthType, TuyaCloudOpenAPIEndpoint 4 | from .version import VERSION 5 | 6 | __all__ = [ 7 | "TuyaOpenAPI", 8 | "TuyaTokenInfo", 9 | "AuthType", 10 | "TuyaCloudOpenAPIEndpoint", 11 | "TUYA_LOGGER", 12 | ] 13 | __version__ = VERSION 14 | -------------------------------------------------------------------------------- /scraping-source/.dlt/config.toml: -------------------------------------------------------------------------------- 1 | # put your configuration values here 2 | 3 | [runtime] 4 | log_level="WARNING" # the system log level of dlt 5 | # use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry 6 | dlthub_telemetry = true 7 | 8 | [sources.scraping] 9 | 
start_urls = ["https://quotes.toscrape.com/"] 10 | start_urls_file = "start_urls_file" 11 | -------------------------------------------------------------------------------- /pyladies-2024-demo/load_from_database.py: -------------------------------------------------------------------------------- 1 | import dlt 2 | from dlt.sources.sql_database import sql_database 3 | 4 | source = sql_database( 5 | "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam" 6 | ) 7 | 8 | pipeline = dlt.pipeline( 9 | pipeline_name="sql_database_example", 10 | destination="duckdb", 11 | dataset_name="sql_data", 12 | ) 13 | 14 | load_info = pipeline.run(source.family) 15 | print(load_info) -------------------------------------------------------------------------------- /sengled-plug-demo/README.md: -------------------------------------------------------------------------------- 1 | This repo contains code that connects your Sengled Smart Device to Python via the tuya python API, and then loads your device data into a duckdb destination via dlt. 2 | 3 | # How to: 4 | 1. Insert your credentials in `env.py`. 5 | a. All the credentials required can be found on your Tuya Cloud Account after you are registered and have successfully activated a cloud project with an active connected device. 6 | 2. Run `main.py`. 7 | 8 | -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/.dlt/example.secrets.toml: -------------------------------------------------------------------------------- 1 | # put your secret values and credentials here. do not share this file and do not push it to github 2 | api_secret_key = "api_secret_key" # please set me up! 3 | 4 | [destination.postgres.credentials] 5 | database = "demo_data" # please set me up! 6 | password = "password" # please set me up! 7 | username = "loader" # please set me up! 8 | host = "localhost" # please set me up! 
9 | port = 5432 10 | connect_timeout = 15 11 | -------------------------------------------------------------------------------- /.dlt/config.toml: -------------------------------------------------------------------------------- 1 | # put your configuration values here 2 | 3 | [runtime] 4 | log_level="WARNING" # the system log level of dlt 5 | # use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry 6 | dlthub_telemetry = true 7 | 8 | [destination.weaviate.module_config."text2vec-openai"] 9 | model = "ada" 10 | modelVersion = "002" 11 | type = "text" 12 | [destination.weaviate.module_config."generative-openai"] 13 | model = "gpt-3.5-turbo" -------------------------------------------------------------------------------- /pyladies-2024-demo/load_from_json.py: -------------------------------------------------------------------------------- 1 | # load test json to duckdb database 2 | 3 | import json 4 | import dlt 5 | 6 | with open("test.json", 'r') as file: 7 | data = json.load(file) 8 | 9 | pipeline = dlt.pipeline( 10 | pipeline_name='from_json', 11 | destination='duckdb', 12 | dataset_name='mydata', 13 | dev_mode=True, 14 | ) 15 | # dlt works with lists of dicts, so wrap data to the list 16 | load_info = pipeline.run([data], table_name="json_data") 17 | print(load_info) 18 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-slim 2 | 3 | WORKDIR /app 4 | 5 | RUN apt-get update && apt-get install -y 6 | 7 | RUN pip install poetry 8 | 9 | COPY pyproject.toml ./ 10 | 11 | COPY . . 12 | 13 | # Configure poetry to not create virtual environment 14 | RUN poetry config virtualenvs.create false 15 | 16 | # Install dependencies 17 | RUN poetry install --no-interaction --no-ansi 18 | 19 | # Expose port 80 20 | EXPOSE 80 21 | 22 | CMD poetry run python -m src.services 23 | -------------------------------------------------------------------------------- /pyladies-2024-demo/github_pipeline.py: -------------------------------------------------------------------------------- 1 | import dlt 2 | import requests 3 | 4 | # url to request dlt-hub user 5 | url = f"https://api.github.com/users/dlt-hub/followers" 6 | # make the request and return the json 7 | data = requests.get(url).json() 8 | 9 | pipeline = dlt.pipeline( 10 | pipeline_name='from_api', 11 | destination='duckdb', 12 | dataset_name='mydata', 13 | dev_mode=True, 14 | ) 15 | # dlt works with lists of dicts, so wrap data to the list 16 | load_info = pipeline.run([data], table_name="followers") 17 | print(load_info) 18 | -------------------------------------------------------------------------------- /sengled-plug-demo/tuya_helpers/tuya_enums.py: -------------------------------------------------------------------------------- 1 | """Tuya iot enums.""" 2 | 3 | from enum import Enum 4 | 5 | 6 | class AuthType(Enum): 7 | """Tuya Cloud Auth Type.""" 8 | 9 | SMART_HOME = 0 10 | CUSTOM = 1 11 | 12 | 13 | class TuyaCloudOpenAPIEndpoint: 14 | """Tuya Cloud Open API Endpoint.""" 15 | 16 | CHINA = "https://openapi.tuyacn.com" 17 | AMERICA = "https://openapi.tuyaus.com" 18 | AMERICA_AZURE = "https://openapi-ueaz.tuyaus.com" 19 | EUROPE = "https://openapi.tuyaeu.com" 20 | EUROPE_MS = "https://openapi-weaz.tuyaeu.com" 21 | INDIA = "https://openapi.tuyain.com" 22 | 
-------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/stripe_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dlt 3 | 4 | from stripe import stripe_source 5 | 6 | stripe_api_key = os.getenv('STRIPE_API_KEY') 7 | 8 | 9 | if __name__ == "__main__": 10 | pipeline = dlt.pipeline( 11 | pipeline_name="stripe_pipeline", 12 | destination='duckdb', 13 | dataset_name="stripe_data", 14 | progress="log", 15 | export_schema_path="schemas/export" 16 | ) 17 | source = stripe_source(stripe_api_key, password="").with_resources("get_customers", "get_subscriptions") 18 | info = pipeline.run(source) 19 | print(info) -------------------------------------------------------------------------------- /dlt-dagster-snowflake/.dlt/example.secrets.toml: -------------------------------------------------------------------------------- 1 | # put your secret values and credentials here. do not share this file and do not push it to github 2 | [openai] 3 | openai_api_key = "api_key" # please set me up! 4 | 5 | [destination.snowflake.credentials] 6 | database = "DLT_DATA" # please set me up! 7 | password = "your_password" # please set me up! 8 | username = "your_username" # please set me up! 9 | host = "your_host" # please set me up! 10 | warehouse = "COMPUTE_WH" # please set me up! 11 | role = "ACCOUNTADMIN" # please set me up! 12 | account = "your_account_url" # please set me up! 13 | schema = "dagster_snowflake_demo" # please set me up! -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/schedule_workflow.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | 4 | from restack_ai import Restack 5 | 6 | 7 | async def main(): 8 | 9 | client = Restack() 10 | 11 | workflow_id = f"{int(time.time() * 1000)}-AnimePipeline" 12 | run_id = await client.schedule_workflow( 13 | workflow_name="AnimePipeline", 14 | workflow_id=workflow_id, 15 | ) 16 | 17 | await client.get_workflow_result(workflow_id=workflow_id, run_id=run_id) 18 | 19 | exit(0) 20 | 21 | 22 | def run_schedule_workflow(): 23 | asyncio.run(main()) 24 | 25 | 26 | if __name__ == "__main__": 27 | run_schedule_workflow() 28 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/src/client.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dotenv import load_dotenv 4 | from restack_ai import Restack 5 | from restack_ai.restack import CloudConnectionOptions 6 | 7 | # Load environment variables from a .env file 8 | load_dotenv() 9 | 10 | 11 | engine_id = os.getenv("RESTACK_ENGINE_ID") 12 | address = os.getenv("RESTACK_ENGINE_ADDRESS") 13 | api_key = os.getenv("RESTACK_ENGINE_API_KEY") 14 | api_address = os.getenv("RESTACK_ENGINE_API_ADDRESS") 15 | 16 | connection_options = CloudConnectionOptions( 17 | engine_id=engine_id, address=address, api_key=api_key, api_address=api_address 18 | ) 19 | client = Restack(connection_options) 20 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "restack-app" 3 | version = "0.0.1" 4 | description = "A quickstart for Restack with dlt" 5 | authors = [{ name = "Restack Team", email = 
"service@restack.io" }] 6 | requires-python = ">=3.10,<3.13" 7 | dependencies = [ 8 | "pydantic>=2.10.6", 9 | "watchfiles>=1.0.4", 10 | "python-dotenv==1.0.1", 11 | "openai>=1.61.0", 12 | "restack-ai>=0.0.62", 13 | "dlt>=1.5.0", 14 | "weaviate-client==3.22" 15 | ] 16 | 17 | [project.scripts] 18 | dev = "src.services:watch_services" 19 | services = "src.services:run_services" 20 | 21 | [tool.hatch.build.targets.sdist] 22 | include = ["src"] 23 | 24 | [tool.hatch.build.targets.wheel] 25 | include = ["src"] 26 | 27 | [build-system] 28 | requires = ["hatchling"] 29 | build-backend = "hatchling.build" -------------------------------------------------------------------------------- /iceberg-tabular/.dlt/example.secrets.toml: -------------------------------------------------------------------------------- 1 | [destination.filesystem] 2 | bucket_url = "s3://[your_bucket_name]" # replace with your bucket name, 3 | 4 | [destination.filesystem.credentials] 5 | aws_access_key_id = "please set me up!" # copy the access key here 6 | aws_secret_access_key = "please set me up!" # copy the secret access key here 7 | 8 | [destination.athena] 9 | force_iceberg = "True" 10 | query_result_bucket="s3://[results_bucket_name]" # replace with your query results bucket name 11 | 12 | [destination.athena.credentials] 13 | aws_access_key_id="please set me up!" # same as credentials for filesystem 14 | aws_secret_access_key="please set me up!" # same as credentials for filesystem 15 | region_name="please set me up!" # set your aws region, for example "eu-central-1" for frankfurt 16 | database="awsdatacatalog" 17 | 18 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/src/services.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import webbrowser 4 | 5 | from watchfiles import run_process 6 | 7 | from src.client import client 8 | from src.functions.dlt_to_weaviate import anime_pipeline 9 | from src.functions.vector_search import rag_pipeline 10 | from src.workflows.workflow import AnimePipeline, RAGPipeline 11 | 12 | 13 | async def main(): 14 | await client.start_service( 15 | workflows=[AnimePipeline, RAGPipeline], functions=[anime_pipeline, rag_pipeline] 16 | ) 17 | 18 | 19 | def run_services(): 20 | try: 21 | asyncio.run(main()) 22 | except KeyboardInterrupt: 23 | print("Service interrupted by user. Exiting gracefully.") 24 | 25 | 26 | def watch_services(): 27 | watch_path = os.getcwd() 28 | print(f"Watching {watch_path} and its subdirectories for changes...") 29 | webbrowser.open("http://localhost:5233") 30 | run_process(watch_path, recursive=True, target=run_services) 31 | 32 | 33 | if __name__ == "__main__": 34 | run_services() 35 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/rest_api/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Dict, Any, Mapping, Iterable 2 | 3 | from dlt.common import logger 4 | from dlt.extract.source import DltSource 5 | 6 | 7 | def join_url(base_url: str, path: str) -> str: 8 | if not base_url.endswith("/"): 9 | base_url += "/" 10 | return base_url + path.lstrip("/") 11 | 12 | 13 | def exclude_keys(d: Mapping[str, Any], keys: Iterable[str]) -> Dict[str, Any]: 14 | """Removes specified keys from a dictionary and returns a new dictionary. 
15 | 16 | Args: 17 | d (Mapping[str, Any]): The dictionary to remove keys from. 18 | keys (Iterable[str]): The keys to remove. 19 | 20 | Returns: 21 | Dict[str, Any]: A new dictionary with the specified keys removed. 22 | """ 23 | return {k: v for k, v in d.items() if k not in keys} 24 | 25 | 26 | def check_connection( 27 | source: DltSource, 28 | *resource_names: str, 29 | ) -> Tuple[bool, str]: 30 | try: 31 | list(source.with_resources(*resource_names).add_limit(1)) 32 | return (True, "") 33 | except Exception as e: 34 | logger.error(f"Error checking connection: {e}") 35 | return (False, str(e)) 36 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/pipeline/pokemon/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This source provides data extraction from an example source as a starting point for new pipelines. 3 | Available resources: [berries, pokemon] 4 | """ 5 | 6 | import typing as t 7 | from typing import Sequence, Iterable, Dict, Any 8 | import dlt 9 | from dlt.common.typing import TDataItem 10 | from dlt.extract.source import DltResource 11 | from dlt.sources.helpers import requests 12 | from .settings import BERRY_URL, POKEMON_URL 13 | 14 | 15 | @dlt.resource(write_disposition="replace") 16 | def berries() -> Iterable[TDataItem]: 17 | """ 18 | Returns a list of berries. 19 | Yields: 20 | dict: The berries data. 21 | """ 22 | yield requests.get(BERRY_URL).json()["results"] 23 | 24 | 25 | @dlt.resource(write_disposition="replace") 26 | def pokemon() -> Iterable[TDataItem]: 27 | """ 28 | Returns a list of pokemon. 29 | Yields: 30 | dict: The pokemon data. 31 | """ 32 | yield requests.get(POKEMON_URL).json()["results"] 33 | 34 | 35 | @dlt.source 36 | def source() -> Sequence[DltResource]: 37 | """ 38 | The source function that returns all availble resources. 39 | Returns: 40 | Sequence[DltResource]: A sequence of DltResource objects containing the fetched data. 41 | """ 42 | return [berries, pokemon] 43 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/pipeline/pokemon_pipeline.py: -------------------------------------------------------------------------------- 1 | """Very simple pokemon pipeline, to be used as a starting point for new pipelines. 2 | 3 | Available resources: 4 | fruits 5 | vegetables 6 | """ 7 | import dlt 8 | from pokemon import source 9 | from typing import List 10 | 11 | from dlt.helpers.dbt_cloud import run_dbt_cloud_job 12 | 13 | 14 | def load(resources: List[str]) -> None: 15 | """ 16 | Execute a pipeline that will load all the resources for the given endpoints. 17 | Args: 18 | resources (List[str]): A list of resource names to load data from the pokemon source. Available resources include 'pokemon' and 'berries'. 19 | Returns: 20 | None: This function doesn't return any value. It prints the loading information on successful execution. 21 | """ 22 | pipeline = dlt.pipeline( 23 | pipeline_name="pokemon", destination='bigquery', dataset_name="pokemon_data" 24 | ) 25 | load_info = pipeline.run(source().with_resources(*resources)) 26 | print(load_info) 27 | 28 | 29 | if __name__ == "__main__": 30 | """ 31 | Main function to execute the data loading pipeline. 32 | Add your desired resources to the list and call the load function. 
33 | """ 34 | resources = ["pokemon", "berries"] 35 | load(resources) 36 | 37 | run_info = run_dbt_cloud_job() 38 | print(f"Job run status: {run_info['status_humanized']}") 39 | -------------------------------------------------------------------------------- /scraping-source/scraping/settings.py: -------------------------------------------------------------------------------- 1 | from .types import AnyDict 2 | 3 | SOURCE_BATCH_SIZE: int = 10 4 | SOURCE_SCRAPY_QUEUE_SIZE: int = 3000 5 | SOURCE_SCRAPY_QUEUE_RESULT_TIMEOUT: int = 5 6 | SOURCE_SCRAPY_SETTINGS: AnyDict = { 7 | "LOG_LEVEL": "INFO", 8 | # If not set then will keep logging warning in the console 9 | # https://docs.scrapy.org/en/latest/topics/request-response.html#request-fingerprinter-implementation 10 | "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7", 11 | "TELNETCONSOLE_ENABLED": False, 12 | # How many sub pages to scrape 13 | # https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit 14 | "DEPTH_LIMIT": 0, 15 | "SPIDER_MIDDLEWARES": { 16 | "scrapy.spidermiddlewares.depth.DepthMiddleware": 200, 17 | "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 300, 18 | }, 19 | "HTTPERROR_ALLOW_ALL": True, 20 | "FAKEUSERAGENT_PROVIDERS": [ 21 | # this is the first provider we'll try 22 | "scrapy_fake_useragent.providers.FakeUserAgentProvider", 23 | # if FakeUserAgentProvider fails, we'll use faker to generate a user-agent string for us 24 | "scrapy_fake_useragent.providers.FakerProvider", 25 | # fall back to USER_AGENT value 26 | "scrapy_fake_useragent.providers.FixedUserAgentProvider", 27 | ], 28 | "USER_AGENT": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0", 29 | } 30 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'my_new_project' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'default' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `source-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | target-path: "target" # directory which will store compiled SQL files 23 | clean-targets: # directories to be removed by `dbt clean` 24 | - "target" 25 | - "dbt_packages" 26 | 27 | 28 | # Configuring models 29 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 30 | 31 | # In this example config, we tell dbt to build all models in the example/ directory 32 | # as tables. These settings can be overridden in the individual model files 33 | # using the `{{ config(...) }}` macro. 
34 | models: 35 | my_new_project: 36 | # Applies to all files under models/example/ 37 | example: 38 | materialized: view 39 | -------------------------------------------------------------------------------- /sengled-plug-demo/tuya_helpers/openlogging.py: -------------------------------------------------------------------------------- 1 | """Tuya iot logging.""" 2 | from __future__ import annotations 3 | 4 | import copy 5 | import logging 6 | from typing import Any 7 | 8 | logger = logging.getLogger(__package__) 9 | 10 | default_handler = logging.StreamHandler() 11 | default_handler.setFormatter( 12 | logging.Formatter("[%(asctime)s] [tuya-%(module)s] %(message)s") 13 | ) 14 | 15 | logger.addHandler(default_handler) 16 | TUYA_LOGGER = logger 17 | 18 | FILTER_LIST = [ 19 | "access_token", 20 | "client_id", 21 | "ip", 22 | "lat", 23 | "link_id", 24 | "local_key", 25 | "lon", 26 | "password", 27 | "refresh_token", 28 | "uid", 29 | ] 30 | 31 | STAR = "***" 32 | 33 | 34 | def filter_logger(result_info: dict[str, Any]): 35 | """Filter log, hide sensitive info.""" 36 | if result_info is None: 37 | return result_info 38 | filter_info_original = copy.deepcopy(result_info) 39 | if "result" in filter_info_original: 40 | filter_info = filter_info_original["result"] 41 | else: 42 | filter_info = filter_info_original 43 | if isinstance(filter_info, list): 44 | for item in filter_info: 45 | for filter_key in FILTER_LIST: 46 | if filter_key in item: 47 | item[filter_key] = STAR 48 | 49 | elif isinstance(filter_info, dict): 50 | for filter_key in FILTER_LIST: 51 | if filter_key in filter_info: 52 | filter_info[filter_key] = STAR 53 | 54 | return filter_info_original 55 | -------------------------------------------------------------------------------- /sengled-plug-demo/main.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from tuya_helpers import TuyaOpenAPI, TUYA_LOGGER 3 | import dlt 4 | from env import ENDPOINT, ACCESS_ID, ACCESS_KEY, USERNAME, PASSWORD, DEVICE_ID 5 | import streamlit as st 6 | 7 | if 'schema' not in st.session_state: 8 | st.session_state['schema'] = 'value' 9 | 10 | TUYA_LOGGER.setLevel(logging.DEBUG) 11 | 12 | # Init 13 | openapi = TuyaOpenAPI(ENDPOINT, ACCESS_ID, ACCESS_KEY) 14 | openapi.connect(USERNAME, PASSWORD, "86", 'tuyaSmart') 15 | 16 | @dlt.resource(name='status') 17 | def get_status(): 18 | yield openapi.get(f'/v1.0/devices/{DEVICE_ID}') 19 | 20 | @dlt.resource(name='specs') 21 | def get_specs(): 22 | yield openapi.get(f'/v1.0/devices/{DEVICE_ID}/specifications') 23 | 24 | @dlt.resource(name='properties') 25 | def get_properties(): 26 | yield openapi.get(f'/v2.0/cloud/thing/{DEVICE_ID}/shadow/properties') 27 | 28 | 29 | pipeline = dlt.pipeline( 30 | pipeline_name="smart_plug", 31 | destination="duckdb", 32 | dataset_name="smart_plug_data", 33 | ) 34 | 35 | 36 | pipeline.run(get_status()) 37 | pipeline.run(get_specs()) 38 | pipeline.run(get_properties()) 39 | 40 | 41 | info = pipeline.run(get_status()) 42 | print(info) 43 | info = pipeline.run(get_specs()) 44 | print(info) 45 | info = pipeline.run(get_properties()) 46 | print(info) 47 | 48 | 49 | #dashboard.write_data_explorer_page(pipeline) 50 | ''' 51 | # Receive device message 52 | def on_message(msg): 53 | print("on_message: %s" % msg) 54 | 55 | print(openapi) 56 | openapi.token_info.expire_time = 0 57 | 58 | openmq = TuyaOpenMQ(openapi) 59 | openmq.start() 60 | openmq.add_message_listener(on_message) 61 | ''' 62 | 63 | 
-------------------------------------------------------------------------------- /coinpaprika-to-postgresql/dlt_pipeline.py: -------------------------------------------------------------------------------- 1 | import dlt 2 | from dlt.sources.helpers import requests 3 | 4 | # Resource 1: Basic information about cryptocurrencies on coinpaprika.com: 5 | @dlt.resource(name = "coin_list", write_disposition="replace") 6 | def coin_list(): 7 | response = requests.get('https://api.coinpaprika.com/v1/coins') 8 | yield from response.json() 9 | 10 | # Resource 2 - Transformer: Detailed descriptive information about a single coin 11 | @dlt.transformer(data_from = coin_list().add_limit(2)) 12 | def coin_details(coin): 13 | coin_id = coin['id'] 14 | response = requests.get(f'https://api.coinpaprika.com/v1/coins/{coin_id}') 15 | yield response.json() 16 | 17 | # Resource 3 - Transformer: The last 50 timeline tweets from the official Twitter profile for a given coin 18 | @dlt.transformer(data_from = coin_list().add_limit(2)) 19 | def coin_tweets(coin): 20 | coin_id = coin['id'] 21 | response = requests.get(f'https://api.coinpaprika.com/v1/coins/{coin_id}/twitter') 22 | data = response.json() 23 | data_with_id = [{'id': coin_id, **entry} for entry in data] 24 | yield data_with_id 25 | 26 | # Source: Combines the above three resources into a single source 27 | @dlt.source 28 | def crypto_data(name = "crypto_source"): 29 | yield coin_list() 30 | yield coin_details() 31 | yield coin_tweets() 32 | 33 | # Main function to run the pipeline 34 | def load_coin_details() -> None: 35 | pipeline = dlt.pipeline( 36 | pipeline_name="crypto_pipeline", 37 | destination='postgres', 38 | full_refresh=True, 39 | dataset_name="crypto_data" 40 | ) 41 | info = pipeline.run(crypto_data().add_limit(2)) 42 | print(info) 43 | 44 | if __name__ == "__main__": 45 | load_coin_details() 46 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/pipeline/.dlt/.sources: -------------------------------------------------------------------------------- 1 | engine_version: 1 2 | sources: 3 | google_analytics: 4 | is_dirty: false 5 | last_commit_sha: d82fa669817909c54087cb753c20701f1d480a25 6 | last_commit_timestamp: '2023-08-31T20:00:58+02:00' 7 | files: 8 | google_analytics/__init__.py: 9 | commit_sha: d82fa669817909c54087cb753c20701f1d480a25 10 | git_sha: 853225b13342dd9447e167bd6d8ca74b37a76728 11 | sha3_256: a43c90ae7507c61e6768cc543bd276e7de2c8075593269d27c3aae469fa5b7f9 12 | google_analytics/README.md: 13 | commit_sha: d82fa669817909c54087cb753c20701f1d480a25 14 | git_sha: 3b9fb9bda50ed4228ca15798f53f7c17d74216ae 15 | sha3_256: 5408fd3e6320a74298216cf17d2a9157ebadda9dc8be123e109b460c1348653e 16 | google_analytics/setup_script_gcp_oauth.py: 17 | commit_sha: d82fa669817909c54087cb753c20701f1d480a25 18 | git_sha: 1a6025198657b905a49a78789c8166c8de081ea3 19 | sha3_256: 89dc2cac47c053a14bd04cd5584e310b4f38a7cee5f2da9c31dfe138ea26cea8 20 | google_analytics/settings.py: 21 | commit_sha: d82fa669817909c54087cb753c20701f1d480a25 22 | git_sha: f003ce1a4aac37c294b1595d27d0d4a4b0ea79b9 23 | sha3_256: 94cdf6cd852b64c716b865fe40a2f99bfa50908f7806540923a6f87aad87bda4 24 | google_analytics/helpers/__init__.py: 25 | commit_sha: d82fa669817909c54087cb753c20701f1d480a25 26 | git_sha: 0caf864b021d11765a1b54ff0bf1d48ea16a9a77 27 | sha3_256: 7af600fd6a3e895fdf37935993854d0141ad92a58859810df9aa352cdc1e50fd 28 | google_analytics/helpers/data_processing.py: 29 | commit_sha: d82fa669817909c54087cb753c20701f1d480a25 30 | 
git_sha: bd732da7e7f268ddcc3b6784a540dc45df1d7bcf 31 | sha3_256: 7f7bde54e0706ba4fabf2b7c346e7e4592a8378f6265a89811dbf83de209e75f 32 | dlt_version_constraint: '>=0.2.5' 33 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/src/workflows/workflow.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | from pydantic import BaseModel, Field 4 | from restack_ai.workflow import RetryPolicy, import_functions, log, workflow 5 | 6 | with import_functions(): 7 | from src.functions.dlt_to_weaviate import anime_pipeline 8 | from src.functions.vector_search import rag_pipeline 9 | 10 | 11 | class AnimePipelineInput(BaseModel): 12 | pipeline_name: str = Field(default="anime_pipeline") 13 | destination: str = Field(default="weaviate") 14 | add_limit: int = Field(default=2) 15 | dev_mode: bool = Field(default=False) 16 | 17 | 18 | class RAGPipelineInput(BaseModel): 19 | pipeline_name: str = Field(default="anime_pipeline") 20 | question: str = Field( 21 | default="What is the story about Ye Bufan and his medical skills?" 22 | ) 23 | 24 | 25 | @workflow.defn() 26 | class AnimePipeline: 27 | @workflow.run 28 | async def run(self, input: AnimePipelineInput): 29 | log.info("PokePipeline started") 30 | result = await workflow.step( 31 | anime_pipeline, 32 | input=input, 33 | start_to_close_timeout=timedelta(seconds=300), 34 | retry_policy=RetryPolicy(maximum_attempts=1), 35 | ) 36 | log.info("PokePipeline completed", result=result) 37 | return result 38 | 39 | 40 | @workflow.defn() 41 | class RAGPipeline: 42 | @workflow.run 43 | async def run(self, input: RAGPipelineInput): 44 | log.info("RAGPipeline started") 45 | result = await workflow.step( 46 | rag_pipeline, 47 | input=input, 48 | start_to_close_timeout=timedelta(seconds=300), 49 | retry_policy=RetryPolicy(maximum_attempts=1), 50 | ) 51 | log.info("RAGPipeline completed", result=result) 52 | return result 53 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/src/functions/vector_search.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | from typing import Optional, Any 3 | 4 | import dlt 5 | from openai import OpenAI 6 | from pydantic import BaseModel 7 | from restack_ai.function import function, log 8 | 9 | 10 | class RAGInput(BaseModel): 11 | pipeline_name: str 12 | question: str 13 | 14 | 15 | openai_client = OpenAI(api_key=dlt.secrets["openai.api_key"]) 16 | 17 | 18 | @function.defn() 19 | async def rag_pipeline(input: RAGInput) -> str: 20 | try: 21 | pipeline = dlt.pipeline( 22 | pipeline_name=input.pipeline_name, 23 | destination="weaviate", 24 | progress="log", 25 | dev_mode=False, 26 | ) 27 | with pipeline.destination_client() as client: 28 | return rag_query(client.db_client, str(input)) 29 | 30 | except Exception as e: 31 | log.error("Something went wrong!", error=e) 32 | log.error(traceback.format_exc()) 33 | raise e 34 | 35 | 36 | def rag_query(weaviate_client: Any, prompt: str) -> Optional[str]: 37 | response = ( 38 | weaviate_client.query.get("Anime", ["title", "synopsis"]) 39 | .with_near_text({"concepts": [prompt]}) 40 | .with_limit(3) 41 | .do() 42 | ) 43 | 44 | if "data" in response and "Get" in response["data"]: 45 | docs = response["data"]["Get"]["Anime"] 46 | print("Retrieved documents:") 47 | for doc in docs: 48 | print(f"- {doc['title']}: {doc['synopsis']}\n") 49 | else: 50 | 
print("No results found.") 51 | return None 52 | 53 | context = " ".join([doc["synopsis"] for doc in docs]) 54 | 55 | completion = openai_client.completions.create( 56 | model="gpt-3.5-turbo-instruct", 57 | prompt=f"{context}\n\nQuestion: {prompt}\nAnswer:", 58 | max_tokens=150, 59 | ) 60 | 61 | log.info("Generated Answer:") 62 | answer = completion.choices[0].text.strip() 63 | log.info(answer) 64 | return answer 65 | -------------------------------------------------------------------------------- /secrets-providers-demo/README.md: -------------------------------------------------------------------------------- 1 | # Use `dlt` with Cloud Secrets Vaults 2 | 3 | ## Google Cloud Secret Manager 4 | To retrieve secrets from Google Cloud Secret Manager using Python, and convert them into a dictionary format, you'll need to follow these steps. First, ensure that you have the necessary permissions to access the secrets on Google Cloud, and have the `google-cloud-secret-manager` library installed. If not, you can install it using pip: 5 | 6 | ```bash 7 | pip install google-cloud-secret-manager 8 | ``` 9 | [Google Docs](https://cloud.google.com/secret-manager/docs/reference/libraries) 10 | 11 | Here's how you can retrieve secrets and convert them into a dictionary: 12 | 13 | 1. **Set up the Secret Manager client**: Create a client that will interact with the Secret Manager API. 14 | 2. **Access the secret**: Use the client to access the secret's latest version. 15 | 3. **Convert to a dictionary**: If the secret is stored in a structured format (like JSON), parse it into a Python dictionary. 16 | 17 | Assume we store secrets in JSON format: 18 | ```json 19 | {"api_token": "ghp_Kskdgf98dugjf98ghd...."} 20 | ``` 21 | 22 | In the script `dlt_with_google_secrets_pipeline.py` you can find an example how to use Google Secrets in `dlt` pipelines. 23 | 24 | ### Points to Note: 25 | 26 | - **Permissions**: Ensure the service account or user credentials you are using have the necessary permissions to access the Secret Manager and the specific secrets. 27 | - **Secret Format**: This example assumes that the secret is stored in a JSON string format. If your secret is in a different format, you will need to adjust the parsing method accordingly. 28 | - **Google Cloud Authentication**: Make sure your environment is authenticated with Google Cloud. This can typically be done by setting credentials in `.dlt/secrets.toml` or setting the `GOOGLE_SECRETS__CREDENTIALS` environment variable to the path of your service account key file or the dict of credentials as a string. 29 | 30 | With this setup, you can effectively retrieve secrets stored in Google Cloud Secret Manager and use them in your `dlt` pipelines as dictionaries. 
-------------------------------------------------------------------------------- /coinpaprika-to-postgresql/dlt_pipeline_merged.py: -------------------------------------------------------------------------------- 1 | import dlt 2 | from dlt.sources.helpers import requests 3 | 4 | 5 | # Resource 1: Retrieves a basic list of cryptocurrencies from coinpaprika.com 6 | @dlt.resource(name = "coin_list", write_disposition="replace") 7 | def coin_list(): 8 | response = requests.get('https://api.coinpaprika.com/v1/coins') 9 | yield from response.json() 10 | 11 | 12 | # Resource 2 - Transformer: Extracts detailed information for each coin 13 | @dlt.transformer(data_from = coin_list().add_limit(10)) # The limit is added to avoid exceeding the API's request quota 14 | def coin_information(coin): 15 | coin_id = coin['id'] 16 | # Fetching detailed information including the list of team members, tags, and links for each coin 17 | details = requests.get(f'https://api.coinpaprika.com/v1/coins/{coin_id}') 18 | # Fetching the latest OHLCV (Open, High, Low, Close, Volume) data as a list with a single dictionary 19 | ohlc = requests.get(f'https://api.coinpaprika.com/v1/coins/{coin_id}/ohlcv/latest') 20 | # Fetching exchanges where the coin is traded as a nested list 21 | exchanges = requests.get(f'https://api.coinpaprika.com/v1/coins/{coin_id}/exchanges') 22 | # Merging details, OHLCV, and exchanges data and yielding as one record 23 | yield details.json() | ohlc.json()[0] | {"exchanges": exchanges.json()} 24 | 25 | 26 | # Source: Aggregates the coin list and detailed coin information into a single source 27 | @dlt.source 28 | def crypto_data(name = "crypto_source"): 29 | yield coin_list() 30 | yield coin_information() 31 | 32 | 33 | # Main function to execute the data loading pipeline 34 | def load_coin_details() -> None: 35 | # Setting up the pipeline with PostgreSQL as the destination 36 | pipeline = dlt.pipeline( 37 | pipeline_name="crypto_pipeline", 38 | destination='postgres', 39 | full_refresh=True, 40 | dataset_name="crypto_data", 41 | ) 42 | # Running the pipeline and printing execution details 43 | info = pipeline.run(crypto_data()) 44 | print(info) 45 | 46 | 47 | if __name__ == "__main__": 48 | load_coin_details() 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Loading Demos 2 | 3 | This repository contains Jupyter notebooks and more extensive projects that illustrate various methods for loading data into different destinations (e.g. Weaviate database) 4 | using the [dlt](https://github.com/dlt-hub/dlt) library. 5 | 6 | ## Repository Contents 7 | 8 | ### Prerequisites 9 | 10 | To run the notebooks, you will need credentials for the tools being used. They are added to the `.dlt` folder. For instance, if you're working on a Weaviate notebook, you will have to add Weaviate credentials. Refer to the notebooks to find out which credentials are needed. 11 | 12 | ### Common demos 13 | 14 | - [schema_evolution.ipynb](schema_evolution.ipynb): shows how you can alert schema changes to slack. 15 | - [spotlight_demo.ipynb](spotlight_demo.ipynb): shows how to get data from APIs, files, Python objects and move it into a local or remote database. 16 | Demo was created for a [Data Talks Club: Open-Source Spotlight](https://youtube.com/playlist?list=PL3MmuxUbc_hJ5t5nnjzC0F2zan76Dpsz0&feature=shared) project. 
17 | - [Pyladies 12.11.2024](pyladies-2024-demo): Similar to Spotlight demo, shows dlt basics, demonstrates how to get data from APIs (GitHub and PokeAPI), database, JSON, and move it into a Duckdb. 18 | 19 | ### Weaviate demos 20 | 21 | - [pdf_to_weaviate.ipynb](pdf_to_weaviate.ipynb): shows how to load data from PDF files, specifically invoices, into Weaviate. 22 | - [sql_to_weaviate.ipynb](sql_to_weaviate.ipynb): shows how to import data from a public MySQL database into Weaviate. 23 | - [zendesk_to_weaviate.ipynb](zendesk_to_weaviate.ipynb): loads data from a [Zendesk dlt source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/zendesk) into Weaviate. 24 | 25 | 26 | ### Personio demos 27 | 28 | - [personio_demo.ipynb](personio_demo.ipynb): shows how to load data from Personio to duckDB. 29 | 30 | ### Project demos 31 | 32 | Project demos are more extensive compared to the notebook ones and have their own README files. Refer to each project for more details. 33 | 34 | 35 | ## License 36 | 37 | This repository is licensed under the [Apache License 2.0](LICENSE.txt). Please refer to the `LICENSE.txt` file for more details. 38 | 39 | Happy coding and data loading! 🚀📊 40 | -------------------------------------------------------------------------------- /iceberg-tabular/github_pipeline.py: -------------------------------------------------------------------------------- 1 | import click 2 | import dlt 3 | from dlt.sources.helpers import requests 4 | 5 | BASE_URL = "https://api.github.com/repos" 6 | 7 | 8 | @dlt.resource( 9 | table_name="issues", 10 | write_disposition="merge", 11 | primary_key="id", 12 | ) 13 | def get_issues( 14 | organisation_name: str, 15 | repo_name: str, 16 | updated_at=dlt.sources.incremental( 17 | "updated_at", initial_value="1970-01-01T00:00:00Z" 18 | ), 19 | ): 20 | # NOTE: we read only open issues to minimize the number of calls to the API. 
21 | # There's a limit of ~50 calls for unauthenticated GitHub users 22 | url = f"{BASE_URL}/{organisation_name}/{repo_name}/issues" 23 | 24 | while True: 25 | response = requests.get( 26 | url, 27 | params={ 28 | "since": updated_at.last_value, 29 | "per_page": 100, 30 | "sort": "updated", 31 | "direction": "desc", 32 | "state": "open", 33 | }, 34 | ) 35 | response.raise_for_status() 36 | yield response.json() 37 | 38 | # get next page 39 | if "next" not in response.links: 40 | break 41 | url = response.links["next"]["url"] 42 | 43 | 44 | @click.command() 45 | @click.option("--organisation-name", required=True, help="GitHub organisation name.") 46 | @click.option("--repo-name", required=True, help="GitHub repository name.") 47 | @click.option("--pipeline-name", default="github_pipeline", help="Name of the DLT pipeline.") 48 | @click.option("--dataset-name", default="issues", help="Name of the dataset.") 49 | def github_pipeline(organisation_name, repo_name, pipeline_name, dataset_name): 50 | pipeline = dlt.pipeline( 51 | pipeline_name=pipeline_name, 52 | destination="athena", 53 | dataset_name=dataset_name, 54 | ) 55 | source_data = get_issues(organisation_name=organisation_name, repo_name=repo_name) 56 | load_info = pipeline.run(source_data) 57 | row_counts = pipeline.last_trace.last_normalize_info 58 | 59 | print(row_counts) 60 | print("------") 61 | print(load_info) 62 | 63 | 64 | if __name__ == "__main__": 65 | github_pipeline() 66 | -------------------------------------------------------------------------------- /dlt-dagster-snowflake/dlt_dagster_snowflake_demo/resources/__init__.py: -------------------------------------------------------------------------------- 1 | from dagster import ConfigurableResource 2 | import dlt 3 | import os 4 | 5 | 6 | # Define a Dagster resource for managing dlt pipelines 7 | class DltPipeline(ConfigurableResource): 8 | # Initialize resource with pipeline details 9 | pipeline_name: str 10 | dataset_name: str 11 | destination: str 12 | 13 | def create_pipeline(self, resource_data, table_name): 14 | """ 15 | Creates and runs a dlt pipeline with specified data and table name. 16 | 17 | Args: 18 | resource_data: The data to be processed by the pipeline. 19 | table_name: The name of the table where data will be loaded. 20 | 21 | Returns: 22 | The result of the pipeline execution. 23 | """ 24 | 25 | # Configure the dlt pipeline with your destination details 26 | pipeline = dlt.pipeline( 27 | pipeline_name=self.pipeline_name, 28 | destination=self.destination, 29 | dataset_name=self.dataset_name 30 | ) 31 | 32 | # Run the pipeline with your parameters 33 | load_info = pipeline.run(resource_data, table_name=table_name) 34 | return load_info 35 | 36 | 37 | # Define a Dagster resource for managing local file storage 38 | class LocalFileStorage(ConfigurableResource): 39 | dir: str 40 | 41 | def setup_for_execution(self, context) -> None: 42 | """ 43 | Prepares the local directory for file storage, creating it if it doesn't exist. 44 | 45 | Args: 46 | context: The Dagster execution context (not used here). 47 | """ 48 | 49 | # Ensure the storage directory exists 50 | os.makedirs(self.dir, exist_ok=True) 51 | 52 | def write(self, filename, data): 53 | """ 54 | Writes data to a file within the local storage directory. 55 | 56 | Args: 57 | filename: The name of the file to write to. 58 | data: The data to be written to the file.
59 | """ 60 | 61 | # Create the directory path for the file if it does not exist 62 | dir_path = f"{self.dir}/{os.path.dirname(filename)}" 63 | os.makedirs(dir_path, exist_ok=True) 64 | 65 | # Write data to the file in binary mode 66 | with open(f"{self.dir}/{filename}", "wb") as f: 67 | f.write(data.read()) 68 | -------------------------------------------------------------------------------- /dlt-dagster-snowflake/dlt_dagster_snowflake_demo/__init__.py: -------------------------------------------------------------------------------- 1 | from dagster import Definitions, load_assets_from_modules, define_asset_job 2 | from dagster_snowflake_pandas import SnowflakePandasIOManager 3 | from dagster_snowflake import SnowflakeResource 4 | from . import assets 5 | from . import resources 6 | import toml 7 | import os 8 | 9 | 10 | # Load your secrets from the secrets.toml file accessed by dlt 11 | with open(os.getcwd() + '/.dlt/secrets.toml', 'r') as secrets_file: 12 | secrets = toml.load(secrets_file) 13 | 14 | 15 | # Set your secret values 16 | snowflake_user = secrets["destination"]["snowflake"]["credentials"]["username"] 17 | snowflake_password = secrets["destination"]["snowflake"]["credentials"]["password"] 18 | snowflake_warehouse = secrets["destination"]["snowflake"]["credentials"]["warehouse"] 19 | snowflake_database = secrets["destination"]["snowflake"]["credentials"]["database"] 20 | snowflake_account = secrets["destination"]["snowflake"]["credentials"]["host"] 21 | snowflake_schema = secrets["destination"]["snowflake"]["credentials"]["schema"] 22 | 23 | 24 | # Set your dlt pipelines as Dagster jobs 25 | dlt_pipelines = define_asset_job(name = "dlt_pipelines", selection= ['google_trends_asset', 'hacker_news_full_asset']) 26 | 27 | 28 | # Set your Dagster definition 29 | defs = Definitions( 30 | assets = load_assets_from_modules([assets]), 31 | jobs = [dlt_pipelines], 32 | resources = { 33 | "pipeline": resources.DltPipeline( 34 | pipeline_name = "dagster_pipeline", 35 | dataset_name = "dagster_snoflake_demo", 36 | destination = "snowflake", 37 | description = "" 38 | ), 39 | "io_manager": SnowflakePandasIOManager( 40 | account = snowflake_account, 41 | user = snowflake_user, 42 | password = snowflake_password, 43 | warehouse = snowflake_warehouse, 44 | database = snowflake_database, 45 | schema = snowflake_schema, 46 | # role = snowflake_role # Optional 47 | ), 48 | "image_storage": resources.LocalFileStorage( 49 | dir = "charts" 50 | ), 51 | "snowflake": SnowflakeResource( 52 | account = snowflake_account, 53 | user = snowflake_user, 54 | password = snowflake_password, 55 | database = snowflake_database, 56 | schema = snowflake_schema, 57 | ) 58 | } 59 | ) -------------------------------------------------------------------------------- /scraping-source/scraping/__init__.py: -------------------------------------------------------------------------------- 1 | """Scraping source 2 | 3 | Integrates Dlt and Scrapy to facilitate scraping pipelines. 
4 | """ 5 | import inspect 6 | import typing as t 7 | 8 | import dlt 9 | 10 | from dlt.sources import DltResource 11 | from dlt.common.source import _SOURCES, SourceInfo 12 | 13 | from scrapy import Spider # type: ignore 14 | 15 | from .helpers import ScrapingConfig, create_pipeline_runner 16 | from .types import P, AnyDict 17 | 18 | 19 | def run_pipeline( # type: ignore[valid-type] 20 | pipeline: dlt.Pipeline, 21 | spider: t.Type[Spider], 22 | *args: P.args, 23 | on_before_start: t.Callable[[DltResource], None] = None, 24 | scrapy_settings: t.Optional[AnyDict] = None, 25 | batch_size: t.Optional[int] = None, 26 | queue_size: t.Optional[int] = None, 27 | queue_result_timeout: t.Optional[float] = None, 28 | **kwargs: P.kwargs, 29 | ) -> None: 30 | """Simple runner for the scraping pipeline 31 | 32 | You can pass all parameters via kwargs to `dlt.pipeline.run(....)` 33 | 34 | ``` 35 | destination: TDestinationReferenceArg = None, 36 | staging: TDestinationReferenceArg = None, 37 | dataset_name: str = None, 38 | credentials: Any = None, 39 | table_name: str = None, 40 | write_disposition: TWriteDisposition = None, 41 | columns: TAnySchemaColumns = None, 42 | primary_key: TColumnNames = None, 43 | schema: Schema = None, 44 | loader_file_format: TLoaderFileFormat = None 45 | ``` 46 | """ 47 | options: AnyDict = {} 48 | if scrapy_settings: 49 | options["scrapy_settings"] = scrapy_settings 50 | 51 | if batch_size: 52 | options["batch_size"] = batch_size 53 | 54 | if queue_size: 55 | options["queue_size"] = queue_size 56 | 57 | if queue_result_timeout: 58 | options["queue_result_timeout"] = queue_result_timeout 59 | 60 | scraping_host = create_pipeline_runner(pipeline, spider, **options) 61 | 62 | if on_before_start: 63 | on_before_start(scraping_host.pipeline_runner.scraping_resource) 64 | 65 | scraping_host.run(*args, **kwargs) 66 | 67 | 68 | # This way we allow dlt init to detect scraping source it is indeed hacky 69 | # and the core team is working to provide a better alternative. 
70 | _SOURCES[run_pipeline.__qualname__] = SourceInfo( 71 | ScrapingConfig, 72 | run_pipeline, 73 | inspect.getmodule(run_pipeline), 74 | ) 75 | -------------------------------------------------------------------------------- /scraping-source/scraping/queue.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from queue import Empty, Queue 3 | 4 | from dlt.common import logger 5 | 6 | 7 | # Please read more at https://mypy.readthedocs.io/en/stable/runtime_troubles.html#not-generic-runtime 8 | T = t.TypeVar("T") 9 | 10 | if t.TYPE_CHECKING: 11 | 12 | class _Queue(Queue[T]): 13 | pass 14 | 15 | else: 16 | 17 | class _Queue(Queue, t.Generic[T]): 18 | pass 19 | 20 | 21 | class QueueClosedError(Exception): 22 | pass 23 | 24 | 25 | class ScrapingQueue(_Queue[T]): 26 | def __init__( 27 | self, 28 | maxsize: int = 0, 29 | batch_size: int = 10, 30 | read_timeout: float = 1.0, 31 | ) -> None: 32 | super().__init__(maxsize) 33 | self.batch_size = batch_size 34 | self.read_timeout = read_timeout 35 | self._is_closed = False 36 | 37 | def get_batches(self) -> t.Iterator[t.Any]: 38 | """Batching helper can be wrapped as a dlt.resource 39 | 40 | Returns: 41 | Iterator[Any]: yields scraped items one by one 42 | """ 43 | batch: t.List[T] = [] 44 | while True: 45 | if len(batch) == self.batch_size: 46 | yield batch 47 | batch = [] 48 | 49 | try: 50 | if self.is_closed: 51 | raise QueueClosedError("Queue is closed") 52 | 53 | item = self.get(timeout=self.read_timeout) 54 | batch.append(item) 55 | 56 | # Mark task as completed 57 | self.task_done() 58 | except Empty: 59 | if batch: 60 | yield batch 61 | batch = [] 62 | except QueueClosedError: 63 | logger.info("Queue is closed, stopping...") 64 | 65 | # Return the last batch before exiting 66 | if batch: 67 | yield batch 68 | 69 | break 70 | 71 | def stream(self) -> t.Iterator[t.Any]: 72 | """Streaming generator, wraps get_batches 73 | and handles `GeneratorExit` if dlt closes it. 74 | 75 | Returns: 76 | t.Iterator[t.Any]: returns batches of scraped content 77 | """ 78 | try: 79 | yield from self.get_batches() 80 | except GeneratorExit: 81 | self.close() 82 | 83 | def close(self) -> None: 84 | """Marks queue as closed""" 85 | self._is_closed = True 86 | 87 | @property 88 | def is_closed(self) -> bool: 89 | return self._is_closed 90 | -------------------------------------------------------------------------------- /pyladies-2024-demo/poke_pipeline.py: -------------------------------------------------------------------------------- 1 | import dlt 2 | from dlt.sources.helpers import requests 3 | 4 | 5 | @dlt.source(max_table_nesting=2) 6 | def source(pokemon_api_url: str): 7 | # note that we deselect `pokemon_list` - we do not want it to be loaded 8 | @dlt.resource(write_disposition="replace", selected=False) 9 | def pokemon_list(): 10 | """Retrieve a first page of Pokemons and yield it. 
We do not retrieve all the pages in this example""" 11 | yield requests.get(pokemon_api_url).json()["results"] 12 | 13 | # transformer that retrieves a list of objects in parallel 14 | @dlt.transformer 15 | def pokemon(pokemons): 16 | """Yields details for a list of `pokemons`""" 17 | 18 | # @dlt.defer marks a function to be executed in parallel 19 | # in a thread pool 20 | @dlt.defer 21 | def _get_pokemon(_pokemon): 22 | return requests.get(_pokemon["url"]).json() 23 | 24 | # call and yield the function result normally, the @dlt.defer takes care of parallelism 25 | for _pokemon in pokemons: 26 | yield _get_pokemon(_pokemon) 27 | 28 | # a special case where just one item is retrieved in transformer 29 | # a whole transformer may be marked for parallel execution 30 | @dlt.transformer(parallelized=True) 31 | def species(pokemon_details): 32 | """Yields species details for a pokemon""" 33 | species_data = requests.get(pokemon_details["species"]["url"]).json() 34 | # link back to pokemon so we have a relation in loaded data 35 | species_data["pokemon_id"] = pokemon_details["id"] 36 | # You can return the result instead of yield since the transformer only generates one result 37 | return species_data 38 | 39 | # create two simple pipelines with | operator 40 | # 1. send list of pokemons into `pokemon` transformer to get pokemon details 41 | # 2. send pokemon details into `species` transformer to get species details 42 | # NOTE: dlt is smart enough to get data from pokemon_list and pokemon details once 43 | 44 | return (pokemon_list | pokemon, pokemon_list | pokemon | species) 45 | 46 | 47 | if __name__ == "__main__": 48 | # build duck db pipeline 49 | pipeline = dlt.pipeline( 50 | pipeline_name="pokemon", 51 | destination="duckdb", 52 | dataset_name="pokemon_data", 53 | dev_mode=True 54 | ) 55 | 56 | # the pokemon_list resource does not need to be loaded 57 | load_info = pipeline.run(source("https://pokeapi.co/api/v2/pokemon")) 58 | print(pipeline.last_trace.last_normalize_info) 59 | print(load_info) 60 | -------------------------------------------------------------------------------- /secrets-providers-demo/dlt_with_google_secrets_pipeline.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import dlt 4 | import requests 5 | from dlt.common.configuration.inject import with_config 6 | from dlt.common.configuration.specs import GcpServiceAccountCredentials 7 | from google.cloud import secretmanager 8 | 9 | 10 | @with_config(sections=("google_secrets",)) 11 | def get_secret_dict( 12 | secret_id, credentials: GcpServiceAccountCredentials = dlt.secrets.value 13 | ): 14 | """ 15 | Retrieve a secret from Google Cloud Secret Manager and convert to a dictionary. 16 | 17 | Args: 18 | secret_id (str): ID of the secret to retrieve. 19 | credentials (GcpServiceAccountCredentials): Credentials for accessing the secret manager. 20 | 21 | Returns: 22 | dict: The secret data as a dictionary. 
23 | """ 24 | # Create the Secret Manager client with provided credentials 25 | client = secretmanager.SecretManagerServiceClient( 26 | credentials=credentials.to_native_credentials() 27 | ) 28 | # Build the resource name of the secret version 29 | name = f"projects/{credentials.project_id}/secrets/{secret_id}/versions/latest" 30 | 31 | # Access the secret version 32 | response = client.access_secret_version(request={"name": name}) 33 | # Decode the payload to a string and convert it to a dictionary 34 | secret_string = response.payload.data.decode("UTF-8") 35 | secret_dict = json.loads(secret_string) 36 | 37 | return secret_dict 38 | 39 | 40 | @dlt.resource() 41 | def get_repositories( 42 | api_token: str = dlt.secrets.value, organization: str = dlt.secrets.value 43 | ): 44 | """ 45 | Retrieve repositories of a specified organization from GitHub. 46 | 47 | Args: 48 | api_token (str): GitHub API token for authentication. 49 | organization (str): The GitHub organization from which to retrieve repositories. 50 | 51 | Yields: 52 | list: A list of repositories for the specified organization. 53 | """ 54 | BASE_URL = "https://api.github.com" 55 | url = f"{BASE_URL}/orgs/{organization}/repos" 56 | headers = { 57 | "Authorization": f"token {api_token}", 58 | "Accept": "application/vnd.github+json", 59 | } 60 | 61 | response = requests.get(url, headers=headers) 62 | response.raise_for_status() # Ensure that a HTTP error is raised for bad responses 63 | yield response.json() 64 | 65 | 66 | if __name__ == "__main__": 67 | secret_data = get_secret_dict("temp-secret") 68 | data = get_repositories(api_token=secret_data["api_token"], organization="dlt-hub") 69 | 70 | pipeline = dlt.pipeline( 71 | pipeline_name="quick_start", destination="duckdb", dataset_name="mydata" 72 | ) 73 | load_info = pipeline.run(data, table_name="repos") 74 | 75 | print(load_info) 76 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/README.md: -------------------------------------------------------------------------------- 1 | # dlt_dbt_cloud 2 | Repository with demos of using DLT and DBT Cloud 3 | 4 | ## Installation 5 | 6 | ```sh 7 | pip install dlt[bigquery] 8 | ``` 9 | 10 | ## Set up the pokemon pipeline 11 | 12 | To get started with this data pipeline, follow these steps: 13 | 14 | ### Init the pipeline 15 | 16 | Enter the following command: 17 | 18 | ```sh 19 | dlt init pokemon bigquery 20 | ``` 21 | 22 | For more information, read the 23 | [Add a verified source.](https://dlthub.com/docs/walkthroughs/add-a-verified-source) 24 | 25 | ### Add credentials 26 | 27 | 1. In the `.dlt` folder, there's a file called `secrets.toml`. It's where you store sensitive 28 | information securely, like access tokens. Keep this file safe. 29 | 30 | Use the following format for service account authentication: 31 | 32 | ```toml 33 | [sources.source_name] 34 | secret = "Please set me up!" 35 | ``` 36 | 37 | [Pokemon verified source](https://github.com/dlt-hub/verified-sources/tree/master/sources/pokemon) 38 | doesn't require authentication, so we don't need to provide credentials. 39 | 40 | 2. Enter credentials for the BigQuery destination as per the [docs](https://dlthub.com/docs/dlt-ecosystem/destinations/bigquery): 41 | ```toml 42 | [destination.bigquery] 43 | location = "US" 44 | 45 | [destination.bigquery.credentials] 46 | project_id = "project_id" # please set me up! 47 | private_key = "private_key" # please set me up! 48 | client_email = "client_email" # please set me up! 
49 | ``` 50 | 51 | 52 | For more information, read the [General Usage: Credentials.](https://dlthub.com/docs/general-usage/credentials) 53 | 54 | ## Set up the dbt Cloud 55 | 56 | ### Sign in dbt Cloud 57 | Go through this [Quickstart for dbt Cloud and BigQuery](https://docs.getdbt.com/quickstarts/bigquery?step=1). 58 | 59 | ### Create the dbt model 60 | Create the model for your data with the tutorial: [How to build SQL models](https://docs.getdbt.com/docs/build/sql-models). 61 | 62 | ### Update pipeline script 63 | 64 | Add the following code into your pipeline script (`pipelines/pokemon_pipeline.py`): 65 | 66 | ```python 67 | from dlt.helpers.dbt_cloud import run_dbt_cloud_job 68 | 69 | run_info = run_dbt_cloud_job() 70 | print(f"Job run status: {run_info['status_humanized']}") 71 | ``` 72 | 73 | ### Credentials 74 | 75 | Use the following format for dbt Cloud API authentication in `.dlt/secrets.toml`: 76 | 77 | ```toml 78 | [dbt_cloud] 79 | api_token = "set me up!" # required for authentication 80 | account_id = "set me up!" # required for both helpers function 81 | job_id = "set me up!" # optional only for run_dbt_cloud_job function (you can pass this explicitly as an argument to the function) 82 | ``` 83 | 84 | More information about dbt cloud helpers in [DBT Cloud Client and Helper Functions](https://dlthub.com/docs/dlt-ecosystem/transformations/dbt/dbt_cloud). 85 | 86 | ## Run the pipeline 87 | 88 | Now you are ready to run the pipeline! To get started, run the following command: 89 | 90 | ```bash 91 | python pokemon_pipeline.py 92 | ``` 93 | -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/example_api_responses/coin_details.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "btc-bitcoin", 3 | "name": "Bitcoin", 4 | "symbol": "BTC", 5 | "parent": { 6 | "id": "eth-ethereum", 7 | "name": "Ethereum", 8 | "symbol": "ETH" 9 | }, 10 | "rank": 1, 11 | "is_new": false, 12 | "is_active": true, 13 | "type": "coin", 14 | "logo": "https://static.coinpaprika.com/coin/bnb-binance-coin/logo.png", 15 | "tags": [ 16 | { 17 | "id": "blockchain-service", 18 | "name": "Blockchain Service", 19 | "coin_counter": 160, 20 | "ico_counter": 80 21 | } 22 | ], 23 | "team": [ 24 | { 25 | "id": "vitalik-buterin", 26 | "name": "Vitalik Buterin", 27 | "position": "Author" 28 | } 29 | ], 30 | "description": "Bitcoin is a cryptocurrency and worldwide payment system. 
It is the first decentralized digital currency, as the system works without a central bank or single administrator.", 31 | "message": "string", 32 | "open_source": true, 33 | "hardware_wallet": true, 34 | "started_at": "2009-01-03T00:00:00Z", 35 | "development_status": "Working product", 36 | "proof_type": "Proof of work", 37 | "org_structure": "Decentralized", 38 | "hash_algorithm": "SHA256", 39 | "contract": "string", 40 | "platform": "string", 41 | "contracts": [ 42 | { 43 | "contract": "string", 44 | "platform": "string", 45 | "type": "string" 46 | } 47 | ], 48 | "links": { 49 | "explorer": [ 50 | "http://blockchain.com/explorer", 51 | "https://blockchair.com/bitcoin/blocks", 52 | "https://blockexplorer.com/", 53 | "https://live.blockcypher.com/btc/" 54 | ], 55 | "facebook": [ 56 | "https://www.facebook.com/bitcoins/" 57 | ], 58 | "reddit": [ 59 | "https://www.reddit.com/r/bitcoin" 60 | ], 61 | "source_code": [ 62 | "https://github.com/bitcoin/bitcoin" 63 | ], 64 | "website": [ 65 | "https://bitcoin.org/" 66 | ], 67 | "youtube": [ 68 | "https://www.youtube.com/watch?v=Um63OQz3bjo" 69 | ], 70 | "medium": null 71 | }, 72 | "links_extended": [ 73 | { 74 | "url": "http://blockchain.com/explorer", 75 | "type": "explorer" 76 | }, 77 | { 78 | "url": "https://www.reddit.com/r/bitcoin", 79 | "type": "reddit", 80 | "stats": { 81 | "subscribers": 1009135 82 | } 83 | }, 84 | { 85 | "url": "https://github.com/bitcoin/bitcoin", 86 | "type": "source_code", 87 | "stats": { 88 | "contributors": 730, 89 | "stars": 36613 90 | } 91 | }, 92 | { 93 | "url": "https://bitcoin.org/", 94 | "type": "website" 95 | } 96 | ], 97 | "whitepaper": { 98 | "link": "https://static.coinpaprika.com/storage/cdn/whitepapers/215.pdf", 99 | "thumbnail": "https://static.coinpaprika.com/storage/cdn/whitepapers/217.jpg" 100 | }, 101 | "first_data_at": "2018-10-03T11:48:19Z", 102 | "last_data_at": "2019-05-03T11:00:00" 103 | } -------------------------------------------------------------------------------- /scraping-source/README.md: -------------------------------------------------------------------------------- 1 | # Scraping with dlt 2 | 3 | Scraping source allows you to scrape content from web and uses [Scrapy](https://doc.scrapy.org/en/latest/) 4 | to enable this capability. 5 | 6 | It is possible to access and manipulate a scraping resource via (please see `scraping_pipeline.py`) 7 | 8 | 1. `on_before_start` callback which will receive a `DltResource` as the only argument, 9 | 2. The advanced scraping pipeline builder `scraping.helpers.create_pipeline_runner` 10 | 11 | 12 | ## 🎲 Configuration 13 | 14 | It is possible to provide configuration via `.dlt/config.toml` below you can see an example 15 | 16 | ```toml 17 | [sources.scraping] 18 | start_urls = [ 19 | "https://quotes.toscrape.com/page/1/" 20 | ] 21 | start_urls_file="/path/to/urls.txt" 22 | ``` 23 | 24 | When both `start_urls` and `start_urls_file` they will be merged and deduplicated so Scrapy 25 | gets a unique set of `start_urls`. 26 | 27 | ## 🏎️ Running the pipeline 28 | 29 | Install requirements and run the pipeline 30 | 31 | ```sh 32 | pip install -r requirements.txt 33 | python scraping_pipeline.py 34 | ``` 35 | 36 | ## Implementing a spider 37 | 38 | It is your responsibility to implement the spider and data extraction logic from the responses 39 | because our runner expects spider class, please see as a reference an example of spider in `scraping_pipeline.py`. 
40 | For more information about spider implementation please also see [Scrapy docs](https://docs.scrapy.org/en/latest/topics/spiders.html). 41 | 42 | ## Configuring Scrapy 43 | 44 | You can pass scrapy settings via 45 | 46 | 1. `run_pipeline(..., scrapy_settings={...})`, 47 | 2. `create_pipeline_runner(..., scrapy_settings={...})`, 48 | 3. Overriding defaults in `settings.py`. 49 | 50 | Example: 51 | ```py 52 | run_pipeline( 53 | pipeline, 54 | MySpider, 55 | scrapy_settings={ 56 | # How many sub pages to scrape 57 | # https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit 58 | "DEPTH_LIMIT": 0, 59 | "SPIDER_MIDDLEWARES": { 60 | "scrapy.spidermiddlewares.depth.DepthMiddleware": 200, 61 | "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 300, 62 | }, 63 | "HTTPERROR_ALLOW_ALL": True, 64 | }, 65 | ) 66 | ``` 67 | 68 | Note: this is just a shallow merge. 69 | Also log level is automatically set in sync with the one 70 | dlt provides so providing it via `scrapy_settings` as `"LOG_LEVEL": "DEBUG"` will not work, 71 | please see [logging documentation](https://dlthub.com/docs/running-in-production/running#set-the-log-level-and-format) for dlt. 72 | 73 | ## 🧐 Introspection using streamlit 74 | 75 | NOTE: you might need to set up `streamlit`, `pip install streamlit` 76 | 77 | ```sh 78 | dlt pipeline show 79 | ``` 80 | 81 | ## 🧠 How it works? 82 | 83 | Under the hood we run DLT [pipeline](https://dlthub.com/docs/api_reference/pipeline) in a separate thread while scrapy is running in the main thread. 84 | 85 | Communication between the two is done via the queue, where 86 | 87 | - Spider is responsible to put the results in the queue, 88 | - DLT resource collects and batches results from the queue. 89 | 90 | ![simple diagram](./diagram.png) 91 | 92 |
Enjoy it! 93 | 94 | ✨ 🚀 ✨
95 | -------------------------------------------------------------------------------- /dlt_restack_demo/restack-app/src/functions/dlt_to_weaviate.py: -------------------------------------------------------------------------------- 1 | import time 2 | import traceback 3 | 4 | import dlt 5 | import pendulum 6 | import requests 7 | from dlt.destinations.adapters import weaviate_adapter 8 | from pydantic import BaseModel 9 | from restack_ai.function import function, log 10 | 11 | 12 | class PipelineInput(BaseModel): 13 | pipeline_name: str 14 | destination: str 15 | add_limit: int 16 | dev_mode: bool 17 | 18 | 19 | @function.defn() 20 | async def anime_pipeline(input: PipelineInput) -> str: 21 | @dlt.resource(table_name="anime", write_disposition="merge", primary_key="mal_id") 22 | def get_anime( 23 | aired_from=dlt.sources.incremental( 24 | "aired.from", initial_value="2024-07-01T00:00:00+00:00" 25 | ) 26 | ): 27 | yield pagination( 28 | "anime", 29 | params={ 30 | "order_by": "start_date", 31 | "start_date": remove_hms(aired_from.last_value), 32 | "status": "airing", 33 | }, 34 | base_url="https://api.jikan.moe/v4/", 35 | ) 36 | 37 | try: 38 | # Set pipeline name, destination, and dataset name 39 | pipeline = dlt.pipeline( 40 | pipeline_name=input.pipeline_name, 41 | destination=input.destination, 42 | progress="log", 43 | dev_mode=input.dev_mode, 44 | ) 45 | 46 | data = get_anime().add_limit(input.add_limit) 47 | 48 | if input.destination == "weaviate": 49 | data = weaviate_adapter( 50 | data, 51 | vectorize=["title", "synopsis"], 52 | ) 53 | 54 | # Run the pipeline using the defined resource 55 | pipeline.run(data) 56 | return str(pipeline.last_trace) 57 | except Exception as e: 58 | log.error("Something went wrong!", error=e) 59 | log.error(traceback.format_exc()) 60 | raise e 61 | 62 | 63 | def remove_hms(date: str) -> str: 64 | date_obj = pendulum.parse(date) 65 | return date_obj.to_date_string() 66 | 67 | 68 | def pagination(endpoint: str, params: dict, base_url: str): 69 | has_next_page = True 70 | page = 1 71 | retries = 0 72 | 73 | while has_next_page: 74 | params.update({"page": page}) 75 | 76 | try: 77 | response = requests.get(f"{base_url}{endpoint}", params) 78 | if response.status_code == 429: # Rate limit error 79 | wait_time = min(2 ** retries, 60) # Exponential backoff, max 60s 80 | print(f"Rate limit hit. Retrying in {wait_time} seconds...") 81 | time.sleep(wait_time) 82 | retries += 1 83 | continue # Retry the same request 84 | 85 | response.raise_for_status() # Raise an error for 4xx or 5xx responses 86 | retries = 0 # Reset retries on a successful request 87 | data = response.json() 88 | 89 | if "data" in data: 90 | yield data["data"] 91 | 92 | has_next_page = data.get("pagination", {}).get("has_next_page", False) 93 | page += 1 94 | 95 | except requests.RequestException as e: 96 | print(f"Request failed for {endpoint}: {e}") 97 | break 98 | 99 | time.sleep(0.5) -------------------------------------------------------------------------------- /iceberg-tabular/README.md: -------------------------------------------------------------------------------- 1 | # Demo: GitHub Issues data pipeline 2 | 3 | This Python script utilizes the `dlt` library to create a data pipeline for extracting and loading 4 | GitHub issues data into [Athena/Glue Catalog](https://dlthub.com/docs/dlt-ecosystem/destinations/athena). 
5 | The pipeline focuses on fetching open issues from a specified GitHub 6 | repository, storing data as parquet files in s3 buckets and creating external tables in AWS Glue Catalog. 7 | You can then query those tables with Athena SQL commands which 8 | will then scan the whole folder of parquet files and return the results. 9 | 10 | In this demo we load data in Iceberg format (`force_iceberg = "True"`). 11 | We can use AWS Glue as a Data Catalog, or we can load the Iceberg data into Tabular.io. 12 | 13 | ## Prerequisites 14 | 15 | Before using the script, ensure you have the following prerequisites installed: 16 | 17 | - Python 18 | - `dlt` library with Athena dependencies (`pip install dlt[athena]`) 19 | 20 | 21 | ## Usage 22 | 23 | * **Clone the Repository:** 24 | 25 | ```bash 26 | git clone https://github.com/dlt-hub/dlt_demos.git 27 | cd dlt_demos/iceberg-tabular 28 | ``` 29 | * **Install Dependencies:** 30 | 31 | ```bash 32 | pip install -r requirements.txt 33 | ``` 34 | * **Credentials Configuration:** 35 | 36 | Copy `secrets.toml`: 37 | ```bash 38 | cp .dlt/example.secrets.toml .dlt/secrets.toml 39 | ``` 40 | Ensure you set up the necessary credentials for the filesystem (S3) and Athena destinations in your 41 | `secrets.toml` file. Replace the placeholders with your actual credentials. 42 | 43 | ```toml 44 | [destination.filesystem] 45 | bucket_url = "s3://[your_bucket_name]" # replace with your bucket name, 46 | 47 | [destination.filesystem.credentials] 48 | aws_access_key_id = "please set me up!" # copy the access key here 49 | aws_secret_access_key = "please set me up!" # copy the secret access key here 50 | 51 | [destination.athena] 52 | force_iceberg = "True" # load data in the iceberg format 53 | query_result_bucket="s3://[results_bucket_name]" # replace with your query results bucket name 54 | 55 | [destination.athena.credentials] 56 | aws_access_key_id="please set me up!" # same as credentials for filesystem 57 | aws_secret_access_key="please set me up!" # same as credentials for filesystem 58 | region_name="please set me up!" # set your aws region, for example "eu-central-1" for Frankfurt 59 | database="awsdatacatalog" 60 | ``` 61 | * **Run the Script:** 62 | 63 | ```bash 64 | python github_pipeline.py --organisation-name=dlt-hub --repo-name=dlt 65 | ``` 66 | CLI Options: 67 | * `--organisation-name`: GitHub organization name (required). 68 | * `--repo-name`: GitHub repository name (required). 69 | * `--pipeline-name`: Name of the dlt pipeline. 70 | * `--dataset-name`: Name of the dataset. 71 | 72 | 73 | ## Notes 74 | 75 | - The script reads only open issues to minimize the number of API calls, considering the limit for 76 | non-authenticated GitHub users. 77 | - The `updated_at` parameter ensures that only issues updated since the last execution are fetched. 78 | - The pipeline utilizes Athena as the destination for storing the GitHub issues data. 79 | 80 | Feel free to customize the script and pipeline configuration according to your requirements. 
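If you prefer to check the loaded tables from Python instead of the Athena console, the pipeline's SQL client can issue the query for you. A minimal sketch, assuming the default `--pipeline-name` and `--dataset-name` values used above (adjust them if you passed different options):

```python
import dlt

# Attach to the pipeline created by github_pipeline.py (default names assumed)
pipeline = dlt.pipeline(
    pipeline_name="github_pipeline",
    destination="athena",
    dataset_name="issues",
)

# The query is executed by Athena over the Iceberg-backed `issues` table
with pipeline.sql_client() as client:
    rows = client.execute_sql(
        "SELECT state, count(*) AS issue_count FROM issues GROUP BY state"
    )
    for row in rows:
        print(row)
```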
81 | -------------------------------------------------------------------------------- /scraping-source/scraping/helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | 4 | import dlt 5 | from dlt.common.configuration.inject import with_config 6 | from dlt.common.configuration.specs.base_configuration import ( 7 | configspec, 8 | BaseConfiguration, 9 | ) 10 | 11 | from scrapy import Spider # type: ignore 12 | 13 | from .queue import ScrapingQueue 14 | from .settings import SOURCE_SCRAPY_QUEUE_SIZE, SOURCE_SCRAPY_SETTINGS 15 | from .runner import ScrapingHost, PipelineRunner, ScrapyRunner, Signals 16 | from .types import AnyDict 17 | 18 | 19 | @configspec 20 | class ScrapingConfig(BaseConfiguration): 21 | # Batch size for scraped items 22 | batch_size: int = 100 23 | 24 | # maxsize for queue 25 | queue_size: t.Optional[int] = SOURCE_SCRAPY_QUEUE_SIZE 26 | 27 | # result wait timeout for our queue 28 | queue_result_timeout: t.Optional[float] = 1.0 29 | 30 | # List of start urls 31 | start_urls: t.List[str] = None 32 | start_urls_file: str = None 33 | 34 | 35 | @with_config(sections=("sources", "scraping"), spec=ScrapingConfig) 36 | def resolve_start_urls( 37 | start_urls: t.Optional[t.List[str]] = dlt.config.value, 38 | start_urls_file: t.Optional[str] = dlt.config.value, 39 | ) -> t.List[str]: 40 | """Merges start urls 41 | If both `start_urls` and `start_urls_file` given, we will merge them 42 | and return deduplicated list of `start_urls` for scrapy spider. 43 | """ 44 | urls = set() 45 | if os.path.exists(start_urls_file): 46 | with open(start_urls_file, encoding="utf-8") as fp: 47 | urls = {line for line in fp.readlines() if str(line).strip()} 48 | 49 | if start_urls: 50 | for url in start_urls: 51 | urls.add(url) 52 | 53 | return list(set(urls)) 54 | 55 | 56 | @with_config(sections=("sources", "scraping"), spec=ScrapingConfig) 57 | def create_pipeline_runner( 58 | pipeline: dlt.Pipeline, 59 | spider: t.Type[Spider], 60 | batch_size: int = dlt.config.value, 61 | queue_size: int = dlt.config.value, 62 | queue_result_timeout: float = dlt.config.value, 63 | scrapy_settings: t.Optional[AnyDict] = None, 64 | ) -> ScrapingHost: 65 | """Creates scraping host instance 66 | This helper only creates pipeline host, so running and controlling 67 | scrapy runner and pipeline is completely delegated to advanced users 68 | """ 69 | queue = ScrapingQueue( # type: ignore 70 | maxsize=queue_size, 71 | batch_size=batch_size, 72 | read_timeout=queue_result_timeout, 73 | ) 74 | 75 | signals = Signals( 76 | pipeline_name=pipeline.pipeline_name, 77 | queue=queue, 78 | ) 79 | 80 | # Just to simple merge 81 | settings = {**SOURCE_SCRAPY_SETTINGS} 82 | if scrapy_settings: 83 | settings = {**scrapy_settings} 84 | 85 | scrapy_runner = ScrapyRunner( 86 | spider=spider, 87 | start_urls=resolve_start_urls(), 88 | signals=signals, 89 | settings=settings, 90 | ) 91 | 92 | pipeline_runner = PipelineRunner( 93 | pipeline=pipeline, 94 | queue=queue, 95 | ) 96 | 97 | scraping_host = ScrapingHost( 98 | queue, 99 | scrapy_runner, 100 | pipeline_runner, 101 | ) 102 | 103 | return scraping_host 104 | -------------------------------------------------------------------------------- /pyladies-2024-demo/README.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | `dlt` is an open-source library that you can add to your Python scripts to load data from various and often messy data sources into well 
structured, live datasets. Below we give you a preview of how you can get data from APIs, files, Python objects or pandas dataframes and move it into a local or remote database, data lake or a vector data store. 4 | 5 | Let's get started! 6 | 7 | ## Installation 8 | 9 | Official releases of dlt can be installed from [PyPI](https://pypi.org/project/dlt/): 10 | 11 | ```shell 12 | pip install dlt 13 | ``` 14 | 15 | The command above installs just the library core; in the example below we use `duckdb` as a [destination](https://dlthub.com/docs/dlt-ecosystem/destinations), so let's add it: 16 | 17 | ```shell 18 | pip install -q "dlt[duckdb]" 19 | ``` 20 | 21 | > Use a clean virtual environment for your experiments! Here are [detailed instructions](https://dlthub.com/docs/reference/installation). 22 | 23 | ## Quick start 24 | 25 | Let's load a list of Python objects (dicts) into a `duckdb` database and inspect the created dataset. 26 | 27 | > We are going to use `dev_mode` for our test examples. If you create a new pipeline script, you will be experimenting a lot. 28 | > If you want the pipeline to reset its state and load data into a new dataset on each run, set the `dev_mode` argument of the `dlt.pipeline` method to True. 29 | > Each time the pipeline is created, dlt adds a datetime-based suffix to the dataset name. 30 | 31 | Run the command: 32 | ```shell 33 | python getting-started.py 34 | ``` 35 | 36 | ### Now explore your data! 37 | 38 | #### If you run it locally 39 | 40 | To see the schema of your created database, run the Streamlit command `dlt pipeline <pipeline_name> show`. 41 | 42 | To use `streamlit`, install it first. 43 | 44 | ```shell 45 | pip install streamlit 46 | ``` 47 | 48 | For the example above, the pipeline name is “quick_start”, so run: 49 | 50 | ```shell 51 | dlt pipeline quick_start show 52 | ``` 53 | [This command](https://dlthub.com/docs/reference/command-line-interface#show-tables-and-data-in-the-destination) generates and launches a simple Streamlit app that you can use to inspect the schemas and data in the destination. 54 | 55 | ## Load data from a variety of sources 56 | 57 | Use dlt to load practically any data you deal with in your Python script into a dataset. 58 | 59 | The library will create/update tables, infer data types and deal with nested data automatically: 60 | - list of dicts 61 | - json 62 | - csv/parquet 63 | - API 64 | - database 65 | - etc. 66 | 67 | ### from JSON 68 | 69 | When creating a schema during normalization, dlt recursively unpacks the nested JSON structure into relational tables, creating and linking [children and parent tables](https://dlthub.com/docs/general-usage/destination-tables#nested-tables). 70 | 71 | ```shell 72 | python load_from_json.py 73 | ``` 74 | 75 | ### from API 76 | 77 | Below we load the 100 most recent followers from our [own dlt-hub organisation](https://github.com/dlt-hub/dlt) into the "followers" table. 78 | 79 | ```shell 80 | python github_pipeline.py 81 | ``` 82 | 83 | ### from Database 84 | 85 | Use the SQL source to extract data from databases like PostgreSQL, MySQL, SQLite, Oracle, and more. 86 | 87 | ```shell 88 | pip install pymysql 89 | ``` 90 | 91 | ## Real-life example 92 | 93 | For this example, we will be loading Pokemon data from the PokeAPI with the help of transformers to load Pokemon details in parallel.
94 | 95 | ```shell 96 | python poke_pipeline.py 97 | ``` 98 | -------------------------------------------------------------------------------- /scraping-source/scraping_pipeline.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import dlt 4 | from dlt.sources import DltResource 5 | from scrapy import Spider # type: ignore 6 | from scrapy.http import Response # type: ignore 7 | 8 | from scraping import run_pipeline 9 | from scraping.helpers import create_pipeline_runner 10 | 11 | 12 | class MySpider(Spider): 13 | def parse(self, response: Response, **kwargs: Any) -> Any: 14 | for next_page in response.css("li.next a::attr(href)"): 15 | if next_page: 16 | yield response.follow(next_page, self.parse) 17 | 18 | for quote in response.css("div.quote"): 19 | result = { 20 | "quote": { 21 | "text": quote.css("span.text::text").get(), 22 | "author": quote.css("small.author::text").get(), 23 | "tags": quote.css("div.tags a.tag::text").getall(), 24 | }, 25 | } 26 | yield result 27 | 28 | 29 | def scrape_quotes() -> None: 30 | pipeline = dlt.pipeline( 31 | pipeline_name="scraping", 32 | destination='duckdb', 33 | dataset_name="quotes", 34 | ) 35 | 36 | run_pipeline( 37 | pipeline, 38 | MySpider, 39 | # you can pass scrapy settings overrides here 40 | scrapy_settings={ 41 | "DEPTH_LIMIT": 10, 42 | }, 43 | write_disposition="append", 44 | ) 45 | 46 | 47 | def scrape_quotes_scrapy_configs() -> None: 48 | pipeline = dlt.pipeline( 49 | pipeline_name="scraping_custom_scrapy_configs", 50 | destination='duckdb', 51 | dataset_name="quotes", 52 | ) 53 | 54 | run_pipeline( 55 | pipeline, 56 | MySpider, 57 | # you can pass scrapy settings overrides here 58 | scrapy_settings={ 59 | # How many sub pages to scrape 60 | # https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit 61 | "DEPTH_LIMIT": 100, 62 | "SPIDER_MIDDLEWARES": { 63 | "scrapy.spidermiddlewares.depth.DepthMiddleware": 200, 64 | "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 300, 65 | }, 66 | "HTTPERROR_ALLOW_ALL": False, 67 | }, 68 | write_disposition="append", 69 | ) 70 | 71 | 72 | def scrape_quotes_callback_access_resource() -> None: 73 | pipeline = dlt.pipeline( 74 | pipeline_name="scraping_resource_callback", 75 | destination='duckdb', 76 | dataset_name="quotes", 77 | ) 78 | 79 | def on_before_start(res: DltResource) -> None: 80 | res.add_limit(2) 81 | 82 | run_pipeline( 83 | pipeline, 84 | MySpider, 85 | batch_size=10, 86 | scrapy_settings={}, 87 | on_before_start=on_before_start, 88 | write_disposition="append", 89 | ) 90 | 91 | 92 | def scrape_quotes_advanced_runner() -> None: 93 | pipeline = dlt.pipeline( 94 | pipeline_name="scraping_advanced_direct", 95 | destination='duckdb', 96 | ) 97 | scraping_host = create_pipeline_runner(pipeline, MySpider, batch_size=10) 98 | scraping_host.pipeline_runner.scraping_resource.add_limit(2) 99 | scraping_host.run(dataset_name="quotes", write_disposition="append") 100 | 101 | 102 | if __name__ == "__main__": 103 | scrape_quotes() 104 | # scrape_quotes_scrapy_configs() 105 | # scrape_quotes_callback_access_resource() 106 | # scrape_quotes_advanced_runner() 107 | -------------------------------------------------------------------------------- /dlt-dbt-cloud/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | 
build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # Default .gitignore content added by dbt Cloud 163 | target/ 164 | dbt_packages/ 165 | logs/ 166 | # end dbt Cloud content 167 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/README.md: -------------------------------------------------------------------------------- 1 | # `dlt-init-openapi`, REST API Clients and `dlt` 2 | 3 | A REST API (Representational State Transfer Application Programming Interface) is a set of rules and conventions for building and interacting with web services. It allows different systems to communicate over the internet using standard HTTP methods. 4 | 5 | Generating a REST API client in Python can be done in several ways. Two popular methods are: 6 | 7 | - Manually creating the client using the `requests` library. 8 | - Automatically generating the client using an OpenAPI spec. 9 | 10 | Another method is using the [dlt rest_api source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api). 11 | 12 | `dlt` is an open-source library that you can add to your Python scripts to load data from various and often messy data sources into well-structured, live datasets. 13 | 14 | The `rest_api` source in `dlt` is a versatile and generic tool designed to help you extract data from any REST API. By using a declarative configuration, you can define API endpoints, their relationships, pagination handling, and authentication methods effortlessly. 15 | 16 | > See [dlt Rest API helpers tutorial](https://colab.research.google.com/drive/1qnzIM2N4iUL8AOX1oBUypzwoM3Hj5hhG?usp=sharing) for details. 17 | 18 | `dlt` went ahead and created a REST API client generator based on the `rest_api` source and an OpenAPI spec -- [`dlt-init-openapi`](https://pypi.org/project/dlt-init-openapi/). 19 | 20 | ## Installation 21 | 22 | ```sh 23 | pip install dlt-init-openapi 24 | ``` 25 | 26 | ## Initialize the source with Stripe OpenAPI Spec 27 | 28 | This will take a while; you have time to make a coffee... 29 | 30 | ```sh 31 | dlt-init-openapi stripe --url "https://raw.githubusercontent.com/stripe/openapi/master/openapi/spec3.json" 32 | ``` 33 | 34 | The pipeline was generated and 247 endpoints were found, but **we do not believe in magic** and we need to make sure that the source was generated correctly. 35 | For example, we need to provide the `base_url`, secrets, query parameters, etc.
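To make the kind of adjustment concrete, here is a hedged sketch of a manually tuned `rest_api` configuration for Stripe. It is not the code `dlt-init-openapi` generates verbatim: the secret location and the `customers`/`subscriptions` resource names are illustrative, and the auth and paginator choices are explained in the sections below.

```python
import dlt
from dlt.sources.helpers.rest_client.auth import HttpBasicAuth

# the `rest_api` package vendored in this demo's pipeline folder
from rest_api import rest_api_source

stripe_source = rest_api_source(
    {
        "client": {
            "base_url": "https://api.stripe.com/v1/",
            # Stripe expects the API key as the Basic Auth username with an empty password
            "auth": HttpBasicAuth(dlt.secrets["sources.stripe.api_key"], ""),
            # cursor-based pagination via `starting_after`, see the Pagination section
            "paginator": {
                "type": "cursor",
                "cursor_path": "id",
                "cursor_param": "starting_after",
            },
        },
        "resources": ["customers", "subscriptions"],
    }
)
```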
36 | 37 | Stripe is well known for its high-quality API and documentation, so you will find 38 | [here](https://docs.stripe.com/api) all required information: 39 | - base url; 40 | - authentication type; 41 | - pagination type; 42 | - available query parameters; 43 | - child endpoints. 44 | 45 | 46 | Walk through this [dlt REST API tutorial](https://colab.research.google.com/drive/1qnzIM2N4iUL8AOX1oBUypzwoM3Hj5hhG?usp=sharing) to learn how to investigate API documentation and avoid struggling with building a REST API client. 47 | 48 | We're gonna explore a few endpoints: [customers list](https://docs.stripe.com/api/customers/list), [subscriptions list](https://docs.stripe.com/api/subscriptions/list). 49 | 50 | ## Authentication 51 | 52 | As you know, to gain access to the API you will need a secret token. [Here is a guide](https://docs.stripe.com/keys) how to get the key. 53 | 54 | 55 | Let's explore the [Stripe API Authentication methods.](https://docs.stripe.com/stripe-apps/api-authentication) 56 | 57 | It says here: 58 | 59 | >Authentication to the API is performed via HTTP Basic Auth. Provide your API key as the basic auth username value. You do not need to provide a password. 60 | > 61 | 62 | 63 | ## Pagination 64 | 65 | Well, let's take a look at how the tool coped with pagination. 66 | First, we will find out what type of pagination the Stripe API has: 67 | 68 | >Stripe’s list API methods use **cursor-based pagination** through the `starting_after` and `ending_before` parameters. 69 | > Both parameters accept an existing object `ID` value (see below) and return objects in reverse chronological order. 70 | > 71 | 72 | We can easily fix it, go to [the rest_api documentation](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api#pagination) and find correct pagination type: 73 | 74 | >**JSONResponseCursorPaginator** handles pagination based on a cursor in the JSON response. \ 75 | *Parameters*: \ 76 | `cursor_path`: A JSONPath expression pointing to the cursor in the JSON response. This cursor is used to fetch subsequent pages. Defaults to "cursors.next".\ 77 | `cursor_param`: The query parameter used to send the cursor value in the next request. Defaults to "after". 78 | 79 | 80 | ``` 81 | "paginator": { 82 | "type": "cursor", 83 | "cursor_path": "id", 84 | "cursor_param": "starting_after", 85 | }, 86 | ``` 87 | 88 | 89 | ## Run the pipeline 90 | 91 | ```shell 92 | python stripe_pipeline.py 93 | ``` -------------------------------------------------------------------------------- /dlt_restack_demo/README.md: -------------------------------------------------------------------------------- 1 | # Anime data pipeline with RAG using dlt, Restack and Weaviate 2 | 3 | This project demonstrates how to: 4 | 5 | - Extract and store anime data from the Jikan API into Weaviate using dlt. 6 | - Implement a Retrieval-Augmented Generation (RAG) workflow to answer anime-related questions using OpenAI's GPT model and Weaviate's vector search. 
7 | 8 | ## Start Restack 9 | 10 | To start Restack, use the following Docker command: 11 | 12 | ```bash 13 | docker run -d --pull always --name restack -p 5233:5233 -p 6233:6233 -p 7233:7233 ghcr.io/restackio/restack:main 14 | ``` 15 | 16 | ## Run Weaviate 17 | 18 | ```shell 19 | docker run -p 8080:8080 -p 50051:50051 -e ENABLE_MODULES=text2vec-openai,generative-openai cr.weaviate.io/semitechnologies/weaviate:1.28.4 20 | ``` 21 | 22 | ## Add environment variables 23 | 24 | Copy `example.secrets.toml` from the `.dlt` folder, rename it to `secrets.toml`, and add your OpenAI key there. 25 | 26 | ``` 27 | [destination.weaviate.credentials.additional_headers] 28 | X-OpenAI-Api-Key = "..." 29 | 30 | [openai] 31 | api_key = "..." 32 | ``` 33 | 34 | ## Set up a Python environment 35 | 36 | ``` 37 | cd restack-app 38 | ``` 39 | 40 | If using uv: 41 | 42 | ```bash 43 | uv venv && source .venv/bin/activate 44 | ``` 45 | 46 | If using pip: 47 | 48 | ```bash 49 | python -m venv .venv && source .venv/bin/activate 50 | ``` 51 | 52 | ## Install dependencies 53 | 54 | If using uv: 55 | 56 | ```bash 57 | uv sync 58 | uv run dev 59 | ``` 60 | 61 | If using pip: 62 | 63 | ```bash 64 | pip install -e . 65 | python -c "from src.services import watch_services; watch_services()" 66 | ``` 67 | 68 | ## Usage 69 | 70 | ### Data ingestion pipeline 71 | 72 | #### How it works 73 | 74 | - Fetches incremental anime data from the Jikan API. 75 | - Handles rate limits with exponential backoff. 76 | - Stores structured and vectorized data in Weaviate. 77 | 78 | #### Run workflows from UI 79 | 80 | You can run workflows from the UI by clicking the "Run" button. 81 | 82 | ![Run workflows from UI](img/UI.png) 83 | 84 | You should provide input in JSON format: 85 | 86 | | Parameter | Type | Description | 87 | |-----------------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------| 88 | | `pipeline_name` | `str` | The name of the pipeline. It is used to restore its state and data schemas on subsequent runs. | 89 | | `destination` | `str` | The destination where the anime data will be stored. Defaults to `"weaviate"`. You can use `duckdb` for development and debugging. | 90 | | `add_limit` | `int` | The maximum number of API pages to fetch and process in a single run. Defaults to 2. Set to `-1` if you want to fetch all data. | 91 | | `dev_mode` | `bool` | When `dev_mode` is enabled, `dlt` creates a versioned dataset each time you run the pipeline. This means the data is loaded into a new dataset (a new database schema) with each run. | 92 | 93 | ![Pass input](img/run.png) 94 | 95 | #### Pipeline execution example 96 | 97 | The pipeline finalizes the execution and logs the results, ensuring the data is successfully ingested into the destination. 98 | 99 | ![See results](img/results.png) 100 | 101 | #### Incremental loading 102 | 103 | Incremental loading ensures that only new or updated data is fetched and processed when the pipeline runs again. 104 | Instead of reloading all data, the pipeline tracks the last processed 105 | timestamp (e.g., `aired.from`) and retrieves only new entries since that point. 106 | 107 | If you rerun the pipeline without new data being available, 108 | it will detect that no new records exist and avoid unnecessary processing. 109 | This helps improve efficiency, reduce API requests, and minimize redundant storage. 110 | 111 | ![Incremental loading](img/incremental.png) 112 | 113 | 114 | 1.
**First Run** (14s 600ms Execution) 115 | 116 | - Extracted, normalized, and loaded anime data into Weaviate. 117 | - Successfully processed multiple rows and stored them in the database. 118 | 119 | 2. **Second Run** (1s Execution) 120 | 121 | - The second execution was much faster (~1 second) because no new data was available. 122 | - The output log confirms that no new records were found, and 0 load packages were sent to the destination. 123 | - This is a result of incremental loading, preventing redundant data processing. 124 | 125 | By using incremental loading, the pipeline optimizes data ingestion by only processing new entries, making subsequent runs more efficient. 126 | 127 | 128 | ### RAG query pipeline 129 | This pipeline allows **semantic search** on the stored anime data using Weaviate and OpenAI. 130 | 131 | #### How it works 132 | 1. A user asks a **question** related to anime. 133 | 2. Weaviate retrieves relevant anime descriptions using **vector search**. 134 | 3. OpenAI generates a **natural language answer** based on the retrieved data. 135 | 136 | #### Run workflows from UI 137 | 138 | You can run workflows from the UI by clicking the "Run" button. 139 | 140 | ![Run workflows from UI](img/run2.png) 141 | 142 | You should provide input in JSON format: 143 | 144 | | Parameter | Type | Description | 145 | |---------------|--------|--------------| 146 | | `pipeline_name` | `str` | Name of the pipeline, Defaults to `anime_pipeline`. | 147 | | `question` | `str` | User's query for retrieving anime information. | 148 | 149 | 150 | #### Pipeline execution example 151 | 152 | The pipeline finalizes the execution and logs the results, ensuring the answer was successfully generated. 153 | 154 | ![See results](img/results2.png) -------------------------------------------------------------------------------- /dlt-dagster-snowflake/dlt_dagster_snowflake_demo/assets/__init__.py: -------------------------------------------------------------------------------- 1 | from dagster import asset, get_dagster_logger, AssetExecutionContext, MetadataValue 2 | from dagster_snowflake import SnowflakeResource 3 | from ..dlt import hacker_news, comments, google_trends, hacker_news_full 4 | from ..resources import LocalFileStorage, DltPipeline 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | from io import BytesIO 8 | 9 | 10 | orchestration_tools = ("Kestra", "Dagster", "Airflow", "Luigi", "MageAi", "Keboola") 11 | 12 | 13 | def run_dlt_pipeline_and_generate_md(pipeline: DltPipeline, resource_data, table_name: str): 14 | """ 15 | Executes a dlt pipeline and generates a Markdown report. 16 | 17 | Args: 18 | pipeline: A DltResource object representing the dlt pipeline to be executed. 19 | resource_data: The data that will be used by the pipeline. 20 | table_name: The name of the table to be updated by the pipeline. 21 | 22 | Returns: 23 | A string containing the Markdown formatted summary of the schema updates performed by the pipeline. 
24 | """ 25 | 26 | logger = get_dagster_logger() 27 | 28 | # Create the pipeline and log the resulting load information 29 | load_info = pipeline.create_pipeline( 30 | resource_data = resource_data, 31 | table_name = table_name 32 | ) 33 | logger.info(load_info) 34 | 35 | md_content = "" 36 | # Iterate through the load packages to update the Markdown content 37 | for package in load_info.load_packages: 38 | for table_name, table in package.schema_update.items(): 39 | for column_name, column in table["columns"].items(): 40 | md_content += f"\tTable updated: {table_name}: Column changed: {column_name}: {column['data_type']}\n" 41 | 42 | return md_content 43 | 44 | 45 | @asset(group_name = "google_trends_data") 46 | def google_trends_asset(context: AssetExecutionContext, pipeline: DltPipeline) -> None: 47 | """ 48 | A Dagster asset that loads Google Trends data from the "google_trends" dlt resource to Snowflake using a dlt pipeline and documents the updates. 49 | """ 50 | 51 | dlt_resource = google_trends(orchestration_tools) 52 | md_content = run_dlt_pipeline_and_generate_md(pipeline, resource_data = dlt_resource, table_name = "google_trends_asset") 53 | 54 | context.add_output_metadata(metadata = {"Updates": MetadataValue.md(md_content)}) 55 | 56 | 57 | @asset(group_name = "google_trends_data", deps = [google_trends_asset]) 58 | def google_trends_chart(snowflake: SnowflakeResource, image_storage: LocalFileStorage) -> None: 59 | """ 60 | A Dagster asset that generates a line chart visualizing Google Trends data over time and saves it to the local storage. 61 | """ 62 | 63 | with snowflake.get_connection() as conn: 64 | google_trends = conn.cursor().execute( 65 | f"SELECT * FROM {google_trends_asset.name}" 66 | ).fetch_pandas_all() 67 | 68 | # Plot the data 69 | google_trends["DATE"] = pd.to_datetime(google_trends['DATE']) 70 | pivot_df = google_trends.pivot(index='DATE', columns='TOOL', values='HITS') 71 | plt.figure(figsize=(10, 6)) 72 | pivot_df.plot(kind='line', ax=plt.gca(), linewidth=2) 73 | plt.title('Google Trends Over Time') 74 | plt.xlabel('Date') 75 | plt.ylabel('Number of Hits') 76 | plt.legend(title='Tool') 77 | plt.grid(True) 78 | plt.xticks(rotation = 45) 79 | plt.ylim(-50, 200) # Setting y-axis range from -50 to 200 for better visbility 80 | 81 | # Save the chart as an image to local storage 82 | buffer = BytesIO() 83 | plt.savefig(buffer, format='png') 84 | buffer.seek(0) 85 | plt.close() 86 | filename = "google_trends_over_time.png" 87 | image_storage.write(filename, buffer) 88 | 89 | 90 | @asset(group_name = "hacker_news_data") 91 | def hacker_news_full_asset(context: AssetExecutionContext, pipeline: DltPipeline) -> None: 92 | """ 93 | A Dagster asset that loads Hackernews data from the "hacker_news_full" dlt source to Snowflake using a dlt pipeline and documents the updates. 94 | """ 95 | 96 | dlt_source = hacker_news_full(orchestration_tools) 97 | md_content = run_dlt_pipeline_and_generate_md(pipeline, resource_data = dlt_source, table_name = "hacke_news_full_asset") 98 | 99 | context.add_output_metadata(metadata={"Updates": MetadataValue.md(md_content)}) 100 | 101 | 102 | @asset(group_name = "hacker_news_data", deps = [hacker_news_full_asset]) 103 | def hacker_news_chart(snowflake: SnowflakeResource, image_storage: LocalFileStorage) -> None: 104 | """ 105 | A Dagster asset that generates a line chart visualizing the sentiment of comments for each tool and saves it to the local storage. 
106 | """ 107 | 108 | with snowflake.get_connection() as conn: 109 | data = conn.cursor().execute( 110 | f"SELECT TOOL_NAME, SENTIMENT, COUNT(*) AS COUNT FROM HACKER_NEWS_FULL_ASSET WHERE SENTIMENT IN ('Neutral', 'Positive', 'Negative') GROUP BY TOOL_NAME, SENTIMENT" 111 | ).fetch_pandas_all() 112 | 113 | # Plot the data 114 | pivot_df = data.pivot(index='TOOL_NAME', columns='SENTIMENT', values='COUNT').fillna(0) 115 | plt.figure(figsize=(10, 6)) 116 | pivot_df.plot(kind='bar', width=0.8) 117 | plt.title('Sentiment Counts for Each Tool on Hacker News') 118 | plt.xlabel('Tool Name') 119 | plt.ylabel('Count') 120 | plt.legend(title='Sentiment') 121 | plt.xticks(rotation=45) 122 | 123 | # Save the chart as an image to local storage 124 | buffer = BytesIO() 125 | plt.savefig(buffer, format='png', bbox_inches='tight') 126 | buffer.seek(0) 127 | plt.close() 128 | filename = "hacker_news_sentiment_counts.png" 129 | image_storage.write(filename, buffer) 130 | 131 | 132 | ''' 133 | @asset(group_name = "hacker_news_data") 134 | def hacker_news_asset(context: AssetExecutionContext, pipeline: DltResource) -> None: 135 | """ 136 | A Dagster asset that separately loads Hackernews stories from the "hacker_news" dlt resource to Snowflake using a dlt pipeline and documents the updates. 137 | """ 138 | 139 | dlt_resource = hacker_news(orchestration_tools) 140 | md_content = run_dlt_pipeline_and_generate_md(pipeline, resource_data = dlt_resource, table_name = "hacker_news_asset") 141 | 142 | context.add_output_metadata(metadata={"Updates": MetadataValue.md(md_content)}) 143 | ''' -------------------------------------------------------------------------------- /coinpaprika-to-postgresql/README.md: -------------------------------------------------------------------------------- 1 | # Loading Nested Data from an API into a PostgreSQL Database with dlt 2 | 3 | ## Overview 4 | 5 | This demo project demonstrates how to load nested data from separate API endpoints, where multiple endpoints rely on the response of one endpoint. It demonstrates how to set up `dlt` (Data Loading Tool) resources, including transformer resources and a source that merges them into a single dataset. Additionally, it includes a pipeline that handles the data ingestion process. PostgreSQL is used as the storage destination, and data is sourced from the Coinpaprika API. 6 | 7 | ![Pipeline overview](https://storage.googleapis.com/dlt-blog-images/belgrade_demo_overview.jpg) 8 | 9 | 10 | ## Prerequisites 11 | 12 | 1. Docker Desktop 13 | 14 | > Download [Docker Desktop](https://www.docker.com/products/docker-desktop/) to download. 15 | 16 | 2. DBeaver or another database administration tool of your choice 17 | 18 | > Download [DBeaver](https://dbeaver.io/download/). 19 | 20 | Alternatively, use [DuckDB as destination](https://dlthub.com/docs/getting-started) for a simpler setup. 21 | 22 | ## Setup Guide 23 | 24 | 1. Clone this repository. 25 | 26 | 2. Install the necessary dependencies for PostgreSQL: 27 | 28 | ```bash 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | 3. Setup PostgreSQL using the public image: 33 | 34 | ```bash 35 | $ docker pull postgres 36 | ``` 37 | 38 | 4. 
Run the Docker container using the postgres:latest image with the command below: 39 | 40 | ```bash 41 | $ docker run -itd -e POSTGRES_USER=loader -e POSTGRES_PASSWORD=password -p 5432:5432 -v /data:/var/lib/postgresql/data --name postgresql postgres 42 | ``` 43 | 44 | > Replace `/data` with the absolute path to your local directory that you want to map to `/var/lib/postgresql/data` inside the container. 45 | 46 | 5. Connect to the database: 47 | 48 | ```bash 49 | PGPASSWORD=password psql -h localhost -p 5432 -U loader 50 | ``` 51 | 52 | 6. Create a new database: 53 | 54 | ```bash 55 | CREATE DATABASE demo_data; 56 | ``` 57 | 58 | 7. Create an empty `secrets.toml` in the `.dlt` directory and enter your credentials: 59 | 60 | ```env 61 | [destination.postgres.credentials] 62 | 63 | database = "demo_data" 64 | username = "loader" 65 | password = "password" # replace with your password 66 | host = "localhost" # or the IP address location of your database 67 | port = 5432 68 | connect_timeout = 15 69 | ``` 70 | 71 | ## Your `dlt` Pipeline 72 | 73 | 1. Understand your resources and sources. 74 | 75 | In the context of `dlt`, a source is a location that holds data with a certain structure, organized into one or more resources. It can also refer to the software component (i.e., a Python function) that extracts data from the source location using one or more resource components. For example, if the source is an API, then a resource is an endpoint in that API. If the source is a database, then a resource is a table in that database. 76 | 77 | The demo has two resources: 78 | 79 | - `coin_list()` yields a list of cryptocurrencies from coinpaprika.com: 80 | 81 | ```python 82 | @dlt.resource(name = "coin_list", write_disposition="replace") 83 | def coin_list(): 84 | response = requests.get('https://api.coinpaprika.com/v1/coins') 85 | yield from response.json() 86 | ``` 87 | - `coin_information(coin)` is a transformer resource that fetches comprehensive details from three distinct API endpoints for each cryptocurrency provided by `coin_list()`. The responses are then merged into one object for loading into a single database table: 88 | 89 | ```python 90 | @dlt.transformer(data_from = coin_list().add_limit(2)) 91 | def coin_information(coin): 92 | coin_id = coin['id'] 93 | details = requests.get(f'https://api.coinpaprika.com/v1/coins/{coin_id}') 94 | ohlc = requests.get(f'https://api.coinpaprika.com/v1/coins/{coin_id}/ohlcv/latest') 95 | exchanges = requests.get(f'https://api.coinpaprika.com/v1/coins/{coin_id}/exchanges') 96 | yield details.json() | ohlc.json()[0] | {"exchanges": exchanges.json()} 97 | ``` 98 | These resources are combined into the `crypto_data()` source,which corresponds to a dataset with tables generated from the two resources: 99 | 100 | ```python 101 | @dlt.source 102 | def crypto_data(name = "crypto_source"): 103 | yield coin_list() 104 | yield coin_information() 105 | ``` 106 | 107 | The `@dlt.resource` and `@dlt.source` decorators declare a function as a resource/source in `dlt`, offering flexibility with essential functionalities. 108 | 109 | > Note that the decorators use `yield` to produce data on-the-fly, instead of loading all data into memory at once. 110 | 111 | 2. Understand your pipeline. 
112 | 113 | We define a pipeline named `crypto_pipeline` with PostgreSQL as the destination: 114 | 115 | ```python 116 | def load_coin_details() -> None: 117 | pipeline = dlt.pipeline( 118 | pipeline_name="crypto_pipeline", 119 | destination='postgres', 120 | full_refresh=True, 121 | dataset_name="crypto_data", 122 | ) 123 | info = pipeline.run(crypto_data()) 124 | ``` 125 | `full_refresh` is set to `True`, creating new dataset instances each time the pipeline runs. If set to `False`, the pipeline will update the existing dataset instead of creating new ones. 126 | 127 | The default `write_disposition` in `pipeline.run()` is set to `append`, meaning new data will be added to the existing data in the destination. Other options include: 128 | 129 | - `replace`: This replaces the existing data in the destination with the new data. 130 | - `merge`: This option merges the new data into the destination using a `merge_key`. It can also deduplicate or upsert new data using a `primary_key`. 131 | 132 | 133 | 3. Run your pipeline: 134 | 135 | ```bash 136 | $ python3 dlt_pipeline_merged.py 137 | ``` 138 | 139 | ## Viewing Your Data in DBeaver 140 | 141 | 1. Connect DBeaver to your database. 142 | 143 | - Click `New Database Connection` in the top left corner. 144 | - Choose PostgreSQL. 145 | - Enter `demo_data` as the database. 146 | - Enter `loader` as the username. 147 | - Enter `password` as the password. 148 | - Test the connection. 149 | 150 | 2. Once connected, you can view your data. It should look like this: 151 | 152 | ![DBeaver view of demo_data](https://storage.googleapis.com/dlt-blog-images/belgrade_demo_DBeaver.png) 153 | 154 | > To get a better understanding of how the nested data was normalized with `dlt`, view the example responses returned by the API endpoints in `example_api_responses`. 155 | 156 | ## Contact / Support 157 | For guidance on running custom pipelines with `dlt`, consider joining our [Slack community](https://dlthub-community.slack.com). 158 | 159 | Visit our [documentation page](https://dlthub.com/docs/intro) for more detailed information. -------------------------------------------------------------------------------- /scraping-source/scraping/runner.py: -------------------------------------------------------------------------------- 1 | """This module contains abstractions to facilitate the scraping and loading process""" 2 | import threading 3 | import typing as t 4 | import dlt 5 | 6 | from dlt.common import logger 7 | from pydispatch import dispatcher # type: ignore 8 | from typing_extensions import Self 9 | 10 | from scrapy import signals, Item, Spider # type: ignore 11 | from scrapy.crawler import CrawlerProcess # type: ignore 12 | 13 | from .types import AnyDict, Runnable, P 14 | from .queue import ScrapingQueue 15 | 16 | T = t.TypeVar("T") 17 | 18 | 19 | class Signals: 20 | """Signals context wrapper 21 | 22 | This wrapper is also a callable which accepts a `CrawlerProcess` instance; 23 | this is required to stop the scraping process as soon as the queue closes.
24 | """ 25 | 26 | def __init__(self, pipeline_name: str, queue: ScrapingQueue[T]) -> None: 27 | self.stopping = False 28 | self.queue = queue 29 | self.pipeline_name = pipeline_name 30 | 31 | def on_item_scraped(self, item: Item) -> None: 32 | if not self.queue.is_closed: 33 | self.queue.put(item) 34 | else: 35 | logger.info( 36 | "Queue is closed, stopping", 37 | extra={"pipeline_name": self.pipeline_name}, 38 | ) 39 | if not self.stopping: 40 | self.on_engine_stopped() 41 | 42 | def on_engine_stopped(self) -> None: 43 | logger.info(f"Crawling engine stopped for pipeline={self.pipeline_name}") 44 | self.stopping = True 45 | self.crawler.stop() 46 | self.queue.close() 47 | self.queue.join() 48 | 49 | def __call__(self, crawler: CrawlerProcess) -> Self: 50 | self.crawler = crawler 51 | return self 52 | 53 | def __enter__(self) -> None: 54 | # We want to receive on_item_scraped callback from 55 | # outside so we don't have to know about any queue instance. 56 | dispatcher.connect(self.on_item_scraped, signals.item_scraped) 57 | 58 | # Once crawling engine stops we would like to know about it as well. 59 | dispatcher.connect(self.on_engine_stopped, signals.engine_stopped) 60 | 61 | def __exit__(self, exc_type: t.Any, exc_val: t.Any, exc_tb: t.Any) -> None: 62 | dispatcher.disconnect(self.on_item_scraped, signals.item_scraped) 63 | dispatcher.disconnect(self.on_engine_stopped, signals.engine_stopped) 64 | 65 | 66 | class ScrapyRunner(Runnable): 67 | """Scrapy runner handles setup and teardown of scrapy crawling""" 68 | 69 | def __init__( 70 | self, 71 | spider: t.Type[Spider], 72 | start_urls: t.List[str], 73 | settings: AnyDict, 74 | signals: Signals, 75 | ) -> None: 76 | self.spider = spider 77 | self.start_urls = start_urls 78 | self.crawler = CrawlerProcess(settings=settings) 79 | self.signals = signals 80 | 81 | def run(self, *args: P.args, **kwargs: P.kwargs) -> None: 82 | """Runs scrapy crawler process 83 | 84 | All `kwargs` are forwarded to `crawler.crawl(**kwargs)`. 85 | Also manages relevant signal handling in proper way. 86 | """ 87 | self.crawler.crawl( 88 | self.spider, 89 | name="scraping_spider", 90 | start_urls=self.start_urls, 91 | **kwargs, 92 | ) 93 | 94 | try: 95 | logger.info("Starting the crawler") 96 | with self.signals(self.crawler): 97 | self.crawler.start() 98 | except Exception: 99 | logger.error("Was unable to start crawling process") 100 | raise 101 | finally: 102 | self.signals.on_engine_stopped() 103 | logger.info("Scraping stopped") 104 | 105 | 106 | class PipelineRunner(Runnable): 107 | """Pipeline runner runs dlt pipeline in a separate thread 108 | Since scrapy wants to run in the main thread it is the only available 109 | option to host pipeline in a thread and communicate via the queue. 110 | """ 111 | 112 | def __init__(self, pipeline: dlt.Pipeline, queue: ScrapingQueue[T]) -> None: 113 | self.pipeline = pipeline 114 | self.queue = queue 115 | 116 | if pipeline.dataset_name and not self.is_default_dataset_name(pipeline): 117 | resource_name = pipeline.dataset_name 118 | else: 119 | resource_name = f"{pipeline.pipeline_name}_results" 120 | 121 | logger.info(f"Resource name: {resource_name}") 122 | 123 | self.scraping_resource = dlt.resource( 124 | # Queue get_batches is a generator so we can 125 | # pass it to pipeline.run and dlt will handle the rest. 
126 | self.queue.stream(), 127 | name=resource_name, 128 | ) 129 | 130 | def is_default_dataset_name(self, pipeline: dlt.Pipeline) -> bool: 131 | default_name = pipeline.pipeline_name + pipeline.DEFAULT_DATASET_SUFFIX 132 | return pipeline.dataset_name == default_name 133 | 134 | def run( 135 | self, 136 | *args: P.args, 137 | **kwargs: P.kwargs, 138 | ) -> threading.Thread: 139 | """You can use all regular dlt.pipeline.run() arguments 140 | 141 | ``` 142 | destination: TDestinationReferenceArg = None, 143 | staging: TDestinationReferenceArg = None, 144 | dataset_name: str = None, 145 | credentials: Any = None, 146 | table_name: str = None, 147 | write_disposition: TWriteDisposition = None, 148 | columns: TAnySchemaColumns = None, 149 | primary_key: TColumnNames = None, 150 | schema: Schema = None, 151 | loader_file_format: TLoaderFileFormat = None 152 | ``` 153 | """ 154 | 155 | def run() -> None: 156 | try: 157 | self.pipeline.run(self.scraping_resource, **kwargs) # type: ignore[arg-type] 158 | except Exception: 159 | logger.error("Error during pipeline.run call, closing the queue") 160 | raise 161 | finally: 162 | self.queue.close() 163 | 164 | thread_runner = threading.Thread(target=run) 165 | thread_runner.start() 166 | return thread_runner 167 | 168 | 169 | class ScrapingHost: 170 | """Scraping host runs the pipeline and scrapy""" 171 | 172 | def __init__( 173 | self, 174 | queue: ScrapingQueue[T], 175 | scrapy_runner: ScrapyRunner, 176 | pipeline_runner: PipelineRunner, 177 | ) -> None: 178 | self.queue = queue 179 | self.scrapy_runner = scrapy_runner 180 | self.pipeline_runner = pipeline_runner 181 | 182 | def run( 183 | self, 184 | *args: P.args, 185 | **kwargs: P.kwargs, 186 | ) -> None: 187 | """You can pass kwargs which are passed to `pipeline.run`""" 188 | logger.info("Starting pipeline") 189 | pipeline_worker = self.pipeline_runner.run(*args, **kwargs) 190 | 191 | logger.info("Starting scrapy crawler") 192 | self.scrapy_runner.run() 193 | 194 | # Wait to for pipeline finish it's job 195 | pipeline_worker.join() 196 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/rest_api/typing.py: -------------------------------------------------------------------------------- 1 | from typing import ( 2 | Any, 3 | Dict, 4 | List, 5 | Literal, 6 | Optional, 7 | TypedDict, 8 | Union, 9 | ) 10 | from dataclasses import dataclass, field 11 | 12 | from dlt.common import jsonpath 13 | from dlt.common.typing import TSortOrder 14 | from dlt.common.schema.typing import ( 15 | TColumnNames, 16 | TTableFormat, 17 | TAnySchemaColumns, 18 | TWriteDispositionConfig, 19 | TSchemaContract, 20 | ) 21 | 22 | from dlt.extract.items import TTableHintTemplate 23 | from dlt.extract.incremental.typing import LastValueFunc 24 | 25 | from dlt.sources.helpers.rest_client.paginators import BasePaginator 26 | from dlt.sources.helpers.rest_client.typing import HTTPMethodBasic 27 | from dlt.sources.helpers.rest_client.auth import AuthConfigBase, TApiKeyLocation 28 | 29 | from dlt.sources.helpers.rest_client.paginators import ( 30 | SinglePagePaginator, 31 | HeaderLinkPaginator, 32 | JSONResponsePaginator, 33 | JSONResponseCursorPaginator, 34 | OffsetPaginator, 35 | PageNumberPaginator, 36 | ) 37 | from dlt.sources.helpers.rest_client.exceptions import IgnoreResponseException 38 | from dlt.sources.helpers.rest_client.auth import ( 39 | AuthConfigBase, 40 | HttpBasicAuth, 41 | BearerTokenAuth, 42 | APIKeyAuth, 43 | OAuthJWTAuth, 44 | ) 45 | 46 | 
PaginatorType = Literal[ 47 | "json_response", 48 | "header_link", 49 | "auto", 50 | "single_page", 51 | "cursor", 52 | "offset", 53 | "page_number", 54 | ] 55 | 56 | 57 | class PaginatorTypeConfig(TypedDict, total=True): 58 | type: PaginatorType # noqa 59 | 60 | 61 | class PageNumberPaginatorConfig(PaginatorTypeConfig, total=False): 62 | """A paginator that uses page number-based pagination strategy.""" 63 | 64 | initial_page: Optional[int] 65 | page_param: Optional[str] 66 | total_path: Optional[jsonpath.TJsonPath] 67 | maximum_page: Optional[int] 68 | 69 | 70 | class OffsetPaginatorConfig(PaginatorTypeConfig, total=False): 71 | """A paginator that uses offset-based pagination strategy.""" 72 | 73 | limit: int 74 | offset: Optional[int] 75 | offset_param: Optional[str] 76 | limit_param: Optional[str] 77 | total_path: Optional[jsonpath.TJsonPath] 78 | maximum_offset: Optional[int] 79 | 80 | 81 | class HeaderLinkPaginatorConfig(PaginatorTypeConfig, total=False): 82 | """A paginator that uses the 'Link' header in HTTP responses 83 | for pagination.""" 84 | 85 | links_next_key: Optional[str] 86 | 87 | 88 | class JSONResponsePaginatorConfig(PaginatorTypeConfig, total=False): 89 | """Locates the next page URL within the JSON response body. The key 90 | containing the URL can be specified using a JSON path.""" 91 | 92 | next_url_path: Optional[jsonpath.TJsonPath] 93 | 94 | 95 | class JSONResponseCursorPaginatorConfig(PaginatorTypeConfig, total=False): 96 | """Uses a cursor parameter for pagination, with the cursor value found in 97 | the JSON response body.""" 98 | 99 | cursor_path: Optional[jsonpath.TJsonPath] 100 | cursor_param: Optional[str] 101 | 102 | 103 | PaginatorConfig = Union[ 104 | PaginatorType, 105 | PageNumberPaginatorConfig, 106 | OffsetPaginatorConfig, 107 | HeaderLinkPaginatorConfig, 108 | JSONResponsePaginatorConfig, 109 | JSONResponseCursorPaginatorConfig, 110 | BasePaginator, 111 | SinglePagePaginator, 112 | HeaderLinkPaginator, 113 | JSONResponsePaginator, 114 | JSONResponseCursorPaginator, 115 | OffsetPaginator, 116 | PageNumberPaginator, 117 | ] 118 | 119 | 120 | AuthType = Literal["bearer", "api_key", "http_basic"] 121 | 122 | 123 | class AuthTypeConfig(TypedDict, total=True): 124 | type: AuthType # noqa 125 | 126 | 127 | class BearerTokenAuthConfig(TypedDict, total=False): 128 | """Uses `token` for Bearer authentication in "Authorization" header.""" 129 | 130 | # we allow for a shorthand form of bearer auth, without a type 131 | type: Optional[AuthType] # noqa 132 | token: str 133 | 134 | 135 | class ApiKeyAuthConfig(AuthTypeConfig, total=False): 136 | """Uses provided `api_key` to create authorization data in the specified `location` (query, param, header, cookie) under specified `name`""" 137 | 138 | name: Optional[str] 139 | api_key: str 140 | location: Optional[TApiKeyLocation] 141 | 142 | 143 | class HttpBasicAuthConfig(AuthTypeConfig, total=True): 144 | """Uses HTTP basic authentication""" 145 | 146 | username: str 147 | password: str 148 | 149 | 150 | # TODO: add later 151 | # class OAuthJWTAuthConfig(AuthTypeConfig, total=True): 152 | 153 | 154 | AuthConfig = Union[ 155 | AuthConfigBase, 156 | AuthType, 157 | BearerTokenAuthConfig, 158 | ApiKeyAuthConfig, 159 | HttpBasicAuthConfig, 160 | BearerTokenAuth, 161 | APIKeyAuth, 162 | HttpBasicAuth, 163 | ] 164 | 165 | 166 | class ClientConfig(TypedDict, total=False): 167 | base_url: str 168 | headers: Optional[Dict[str, str]] 169 | auth: Optional[AuthConfig] 170 | paginator: Optional[PaginatorConfig] 171 | 172 | 173 | 
class IncrementalArgs(TypedDict, total=False): 174 | cursor_path: str 175 | initial_value: Optional[str] 176 | last_value_func: Optional[LastValueFunc[str]] 177 | primary_key: Optional[TTableHintTemplate[TColumnNames]] 178 | end_value: Optional[str] 179 | row_order: Optional[TSortOrder] 180 | 181 | 182 | class IncrementalConfig(IncrementalArgs, total=False): 183 | start_param: str 184 | end_param: Optional[str] 185 | 186 | 187 | ParamBindType = Literal["resolve", "incremental"] 188 | 189 | 190 | class ParamBindConfig(TypedDict): 191 | type: ParamBindType # noqa 192 | 193 | 194 | class ResolveParamConfig(ParamBindConfig): 195 | resource: str 196 | field: str 197 | 198 | 199 | class IncrementalParamConfig(ParamBindConfig, IncrementalArgs): 200 | pass 201 | # TODO: implement param type to bind incremental to 202 | # param_type: Optional[Literal["start_param", "end_param"]] 203 | 204 | 205 | @dataclass 206 | class ResolvedParam: 207 | param_name: str 208 | resolve_config: ResolveParamConfig 209 | field_path: jsonpath.TJsonPath = field(init=False) 210 | 211 | def __post_init__(self) -> None: 212 | self.field_path = jsonpath.compile_path(self.resolve_config["field"]) 213 | 214 | 215 | class ResponseAction(TypedDict, total=False): 216 | status_code: Optional[Union[int, str]] 217 | content: Optional[str] 218 | action: str 219 | 220 | 221 | class Endpoint(TypedDict, total=False): 222 | path: Optional[str] 223 | method: Optional[HTTPMethodBasic] 224 | params: Optional[Dict[str, Union[ResolveParamConfig, IncrementalParamConfig, Any]]] 225 | json: Optional[Dict[str, Any]] 226 | paginator: Optional[PaginatorConfig] 227 | data_selector: Optional[jsonpath.TJsonPath] 228 | response_actions: Optional[List[ResponseAction]] 229 | incremental: Optional[IncrementalConfig] 230 | 231 | 232 | class ResourceBase(TypedDict, total=False): 233 | """Defines hints that may be passed to `dlt.resource` decorator""" 234 | 235 | table_name: Optional[TTableHintTemplate[str]] 236 | max_table_nesting: Optional[int] 237 | write_disposition: Optional[TTableHintTemplate[TWriteDispositionConfig]] 238 | parent: Optional[TTableHintTemplate[str]] 239 | columns: Optional[TTableHintTemplate[TAnySchemaColumns]] 240 | primary_key: Optional[TTableHintTemplate[TColumnNames]] 241 | merge_key: Optional[TTableHintTemplate[TColumnNames]] 242 | schema_contract: Optional[TTableHintTemplate[TSchemaContract]] 243 | table_format: Optional[TTableHintTemplate[TTableFormat]] 244 | selected: Optional[bool] 245 | parallelized: Optional[bool] 246 | 247 | 248 | class EndpointResourceBase(ResourceBase, total=False): 249 | endpoint: Optional[Union[str, Endpoint]] 250 | include_from_parent: Optional[List[str]] 251 | 252 | 253 | class EndpointResource(EndpointResourceBase, total=False): 254 | name: TTableHintTemplate[str] 255 | 256 | 257 | class RESTAPIConfig(TypedDict): 258 | client: ClientConfig 259 | resource_defaults: Optional[EndpointResourceBase] 260 | resources: List[Union[str, EndpointResource]] 261 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/rest_api/README.md: -------------------------------------------------------------------------------- 1 | # REST API Generic Source 2 | A declarative way to define dlt sources for REST APIs. 3 | 4 | ## What is this? 5 | > Happy APIs are all alike 6 | > 7 | > \- E. T. Lev Tolstoy, Senior Data Engineer 8 | 9 | This is a generic source that you can use to create a dlt source from a REST API using a declarative configuration. 
The majority of REST APIs behave in a similar way; this dlt source attempts to provide a declarative way to define a dlt source for those APIs. 10 | 11 | ## How to use it 12 | Let's see how a source for the [Pokemon API](https://pokeapi.co/) would look like: 13 | 14 | ```python 15 | pokemon_config = { 16 | "client": { 17 | "base_url": "https://pokeapi.co/api/v2/", 18 | }, 19 | "resources": [ 20 | "berry", 21 | "location", 22 | { 23 | "name": "pokemon_list", 24 | "endpoint": "pokemon", 25 | }, 26 | { 27 | "name": "pokemon", 28 | "endpoint": { 29 | "path": "pokemon/{name}", 30 | "params": { 31 | "name": { 32 | "type": "resolve", 33 | "resource": "pokemon_list", 34 | "field": "name", 35 | }, 36 | }, 37 | }, 38 | }, 39 | ], 40 | } 41 | 42 | pokemon_source = rest_api_source(pokemon_config) 43 | ``` 44 | Here's a short summary: 45 | - The `client` node contains the base URL of the endpoints that we want to collect. 46 | - The `resources` correspond to the API endpoints. 47 | 48 | We have a couple of simple resources (`berry` and `location`). For them, the API endpoint is also the name of the dlt resource and the name of the destination table. They don't need additional configuration. 49 | 50 | The next resource leverages some additional configuration. The endpoint `pokemon/` returns a list of pokemons, but it can also be used as `pokemon/{id or name}` to return a single pokemon. In this case, we want the list, so we decided to rename the resource to `pokemon_list`, while the endpoint stays `pokemon/`. We do not specify the name of the destination table, so it will match the resource name. 51 | 52 | And now the `pokemon` one. This is actually a child endpoint of the `pokemon_list`: for each pokemon, we want to get further details. So we need to make this resource a bit more smart; the endpoint `path` needs to be explicit, and we have to specify how the value of `name` will be resolved from another resource; this is actually telling the generic source that `pokemon` needs to be queried for each pokemon in `pokemon_list`. 53 | 54 | ## Anatomy of the config object 55 | 56 | > **_TIP:_** Import `RESTAPIConfig` from the `rest_api` module to have convenient tips. 57 | 58 | The config object passed to the REST API Generic Source has three main elements: 59 | 60 | ```python 61 | my_config: RESTAPIConfig = { 62 | "client": { 63 | ... 64 | }, 65 | "resource_defaults": { 66 | ... 67 | }, 68 | "resources": { 69 | ... 70 | }, 71 | } 72 | ``` 73 | 74 | `client` contains the configuration to connect to the API's endpoints (e.g., base URL, authentication method, default behavior for the paginator, and more). 75 | 76 | `resource_defaults` contains the default values to configure the dlt resources returned by this source. 77 | 78 | `resources` object contains the configuration for each resource. 79 | 80 | The configuration with a smaller scope will overwrite the one with the wider one: 81 | 82 | Resource Configuration > Resource Defaults Configuration > Client Configuration 83 | 84 | ## Reference 85 | 86 | ### `client` 87 | 88 | #### `auth` [optional] 89 | Use the auth property to pass a token or a `HTTPBasicAuth` object for more complex authentication methods. Here are some practical examples: 90 | 91 | 1. Simple token (read from the `.dlt/secrets.toml` file): 92 | ```python 93 | my_api_config: RESTAPIConfig = { 94 | "client": { 95 | "base_url": "https://my_api.com/api/v1/", 96 | "auth": { 97 | "token": dlt.secrets["sources.my_api.access_token"], 98 | }, 99 | }, 100 | ... 101 | } 102 | ``` 103 | 104 | 2. 
105 | ```python 106 | from requests.auth import HTTPBasicAuth 107 | 108 | basic_auth = HTTPBasicAuth(dlt.secrets["sources.my_api.api_key"], dlt.secrets["sources.my_api.api_secret"]) 109 | 110 | my_api_config: RESTAPIConfig = { 111 | "client": { 112 | "base_url": "https://my_api.com/api/v1/", 113 | "auth": basic_auth, 114 | }, 115 | ... 116 | } 117 | ``` 118 | 119 | #### `base_url` 120 | The base URL that will be prepended to the endpoints specified in the `resources` objects. Example: 121 | 122 | ```python 123 | "base_url": "https://my_api.com/api/v1/", 124 | ``` 125 | 126 | #### `paginator` [optional] 127 | The paginator property specifies the default paginator to be used for the endpoint responses. 128 | 129 | Possible paginators are: 130 | | Paginator | String Alias | Note | 131 | | --------- | ------------ | ---- | 132 | | BasePaginator | | | 133 | | HeaderLinkPaginator | `header_links` | | 134 | | JSONResponsePaginator | `json_links` | The pagination metainformation is in a node of the JSON response (see example below) | 135 | | SinglePagePaginator | `single_page` | The response will be interpreted as a single-page response, ignoring possible pagination metadata | 136 | 137 | Usage example of the `JSONResponsePaginator`, for a response with the URL of the next page located at `paging.next`: 138 | ```python 139 | "paginator": JSONResponsePaginator( 140 | next_key=["paging", "next"] 141 | ) 142 | ``` 143 | 144 | 145 | #### `session` [optional] 146 | 147 | This property allows you to pass a custom `Session` object. 148 | 149 | 150 | ### `resource_defaults` 151 | This property allows you to pass default properties and behavior to the dlt resources created by the REST API Generic Source. Besides the properties mentioned in this documentation, a resource accepts all the arguments that usually are passed to a [dlt resource](https://dlthub.com/docs/general-usage/resource). 152 | 153 | #### `endpoint` 154 | A string indicating the endpoint or an `endpoint` object (see [below](#endpoint-1)). 155 | 156 | #### `include_from_parent` [optional] 157 | A list of fields, from the parent resource, which will be included in the resource output. 158 | 159 | #### `name` 160 | The name of the dlt `resource` and the name of the associated table that will be created. 161 | 162 | #### `params` 163 | The query parameters for the endpoint URL. 164 | 165 | For child resources, you can use values from the parent resource for params. The syntax is the following: 166 | 167 | ```python 168 | "PARAM_NAME": { 169 | "type": "resolve", 170 | "resource": "PARENT_RESOURCE_NAME", 171 | "field": "PARENT_RESOURCE_FIELD", 172 | }, 173 | ``` 174 | 175 | An example of use: 176 | ```python 177 | "endpoint": { 178 | "path": "pokemon/{name}", 179 | "params": { 180 | "name": { 181 | "type": "resolve", 182 | "resource": "pokemon_list", 183 | "field": "name", 184 | }, 185 | }, 186 | }, 187 | ``` 188 | 189 | #### `path` 190 | The URL of the endpoint. If you need to include URL parameters, they can be included using `{}`, for example: 191 | ```python 192 | "path": "pokemon/{name}", 193 | ``` 194 | In case you need to include query parameters, use the [params](#params) property. 195 | 196 | 197 | ### `resources` 198 | An array of resources. Each resource is a string or a resource object. 199 | 200 | Simple resources with their name corresponding to the endpoint can be simple strings. 
For example: 201 | ```python 202 | "resources": [ 203 | "berry", 204 | "location", 205 | ] 206 | ``` 207 | Resources with the name different from the endpoint string will be: 208 | ```python 209 | "resources": [ 210 | { 211 | "name": "pokemon_list", 212 | "endpoint": "pokemon", 213 | }, 214 | ] 215 | ``` 216 | In case you need to have a resource with a name different from the table created, you can pass the property `table_name` too. 217 | 218 | For the other properties, see the [resource_defaults](#resource_defaults) above. -------------------------------------------------------------------------------- /dlt-dagster-snowflake/dlt_dagster_snowflake_demo/dlt/__init__.py: -------------------------------------------------------------------------------- 1 | from dlt.sources.helpers import requests 2 | import dlt 3 | from openai import OpenAI 4 | from pytrends.request import TrendReq 5 | from datetime import datetime 6 | import logging 7 | import toml 8 | import time 9 | import os 10 | 11 | 12 | def openai_sentiment(context: str): 13 | """ 14 | Analyzes the sentiment of a given text using OpenAI. 15 | 16 | Args: 17 | context: The text string for which the sentiment is to be analyzed. 18 | 19 | Returns: 20 | A string indicating the sentiment of the text, which could be 'positive', 'negative', or 'neutral'. 21 | If an error occurs, it returns a string describing the error. 22 | """ 23 | 24 | # Load your OpenAI API key from the secrets.toml file accessed by dlt 25 | with open(os.getcwd() + '/.dlt/secrets.toml', 'r') as secrets_file: 26 | secrets = toml.load(secrets_file) 27 | openai_key = secrets["openai"]["openai_api_key"] 28 | 29 | # Initialize the OpenAI client 30 | client = OpenAI(api_key = openai_key) 31 | 32 | # Set up the prompt 33 | messages = [ 34 | {"role": "system", "content": "You will be given a comment text. Give the sentiment of the comment in one word. It should be either negative, positive, or neutral."}, 35 | {"role": "assistant", "content": f"{context}"} 36 | ] 37 | 38 | # Try to get the sentiment 39 | try: 40 | response = client.chat.completions.create(model="gpt-3.5-turbo", messages=messages) 41 | output = response.choices[0].message.content 42 | return output 43 | except Exception as e: 44 | # Return the error message if an exception occurs 45 | return f"An error occurred: {e}" 46 | 47 | 48 | # The 'write_disposition' parameter determines how the data returned by this function is handled. 49 | # 'append' means that the data will be added to the end of the existing data. 50 | # Other possible values for 'write_disposition' are 'replace' (which replaces the existing data with the new data) 51 | # and 'merge' (which merges the new data with the existing data, updating any existing records that have the same primary key). 52 | @dlt.resource(write_disposition = "append") 53 | def hacker_news(orchestration_tools: tuple[str] = ("Airflow", )): 54 | """ 55 | This function fetches stories related to specified orchestration tools from Hackernews. 56 | For each tool, it retrieves the top 5 stories that have at least one comment. 57 | The stories are then appended to the existing data. 58 | 59 | Args: 60 | orchestration_tools: A tuple containing the names of orchestration tools for which stories are to be fetched. 61 | 62 | Yields: 63 | A generator that yields dictionaries. Each dictionary represents a story and contains the tool name along with the story details returned by the API request. 
64 | """ 65 | 66 | for tool in orchestration_tools: 67 | response = requests.get(f'http://hn.algolia.com/api/v1/search?query={tool}&tags=story&numericFilters=num_comments>=1&hitsPerPage=5') 68 | data = response.json() 69 | # Add the tool name to each story 70 | data["hits"] = [{"tool_name": tool, **item} for item in data["hits"]] 71 | # Yield each story one by one 72 | yield from data["hits"] 73 | 74 | 75 | @dlt.transformer(data_from = hacker_news, write_disposition = "append") 76 | def comments(story): 77 | """ 78 | This function fetches comments for each story yielded by the 'hacker_news' function. 79 | It calculates the number of pages of comments based on the number of comments each story has, 80 | and fetches comments page by page. The comments are then appended to the existing data. 81 | 82 | Args: 83 | story: A dictionary representing a story, yielded by the 'hacker_news' function. 84 | 85 | Yields: 86 | A generator that yields lists of dictionaries. Each list represents a page of comments, 87 | and each dictionary within the list represents a comment and contains the tool name, story title, 88 | story URL, sentiment of the comment, and the comment details returned by the API request. 89 | """ 90 | 91 | tool_name = story["tool_name"] 92 | story_title = story["title"] 93 | story_id = story["story_id"] 94 | url = story.get("url") 95 | num_comments = story["num_comments"] 96 | 97 | num_pages = int(num_comments/20) # The API returns 20 comments per page 98 | if num_pages != num_comments/20: 99 | num_pages += 1 100 | 101 | for page in range(num_pages): 102 | response = requests.get(f'http://hn.algolia.com/api/v1/search?tags=comment,story_{story_id}&page={page}') 103 | data = response.json() 104 | # Add the tool name, story title, story URL, and sentiment to each comment 105 | data["hits"] = [{"tool_name": tool_name, "story_title": story_title, "story_url": url, "sentiment": openai_sentiment(item["comment_text"]), **item} for item in data["hits"]] 106 | #data["hits"] = [{"tool_name": tool_name, "story_title": story_title, "story_url": url, **item} for item in data["hits"]] # Without sentiment_analysis 107 | # Yield each page of comments 108 | yield data["hits"] 109 | 110 | 111 | @dlt.source() 112 | def hacker_news_full(orchestration_tools:tuple[str] = ("Airflow", )): 113 | """ 114 | This function is a dlt source that groups together the resources and transformers needed to fetch 115 | Hackernews stories and their comments for specified orchestration tools. 116 | 117 | Args: 118 | orchestration_tools: A tuple containing the names of orchestration tools for which Hacker News stories and comments are to be fetched. 119 | 120 | Yields: 121 | A generator that yields the results of the 'hacker_news' resource piped into the 'comments' transformer. 122 | """ 123 | 124 | # The 'hacker_news' resource fetches stories for the specified orchestration tools 125 | # The 'comments' transformer fetches comments for each story yielded by the 'hacker_news' resource 126 | yield hacker_news(orchestration_tools = orchestration_tools) | comments 127 | 128 | 129 | @dlt.resource(write_disposition = "append") 130 | def google_trends(orchestration_tools: tuple[str] = ("Airflow",), start_date='2023-01-01', geo=''): 131 | """ 132 | This function fetches Google Trends data for specified orchestration tools. 133 | It attempts to retrieve the data multiple times in case of failures or empty responses. 134 | The retrieved data is then appended to the existing data. 
135 | 136 | Args: 137 | orchestration_tools: A tuple containing the names of orchestration tools for which Google Trends data is to be fetched. 138 | start_date: The start date for the Google Trends data. Defaults to '2023-01-01'. 139 | geo: The geographic area for the Google Trends data. Defaults to an empty string, which means worldwide. 140 | 141 | Yields: 142 | A generator that yields lists of dictionaries. Each list represents the Google Trends data for a tool, 143 | and each dictionary within the list contains the tool name and the Google Trends data. 144 | """ 145 | 146 | pytrend = TrendReq() # Initialize the pytrends client 147 | for tool in orchestration_tools: 148 | attempts = 0 149 | max_attempts = 5 # Set a maximum number of attempts to avoid infinite loops 150 | while attempts < max_attempts: 151 | try: 152 | end_date = datetime.now().strftime('%Y-%m-%d') 153 | timeframe = f'{start_date} {end_date}' 154 | pytrend.build_payload(kw_list = [tool], timeframe = timeframe, geo = geo) 155 | data_df = pytrend.interest_over_time() 156 | 157 | if not data_df.empty: 158 | data_df.reset_index(inplace = True) 159 | data_df.rename(columns = {tool: 'Hits'}, inplace=True) 160 | data = data_df.to_dict('records') 161 | data = [{"tool": tool, **item} for item in data] 162 | print(data) 163 | yield data 164 | break # Successfully fetched data, exit the retry loop 165 | else: 166 | logging.warning(f"No data for {tool}. Retrying...") 167 | attempts += 1 168 | time.sleep(60) # Wait before retrying 169 | except Exception as e: 170 | logging.warning(f"Encountered an error fetching data for {tool}: {e}. Attempt {attempts+1}/{max_attempts}. Retrying...") 171 | attempts += 1 172 | time.sleep(100) # Wait before retrying 173 | 174 | if attempts >= max_attempts: 175 | logging.error(f"Max retries reached for {tool}. Moving to the next tool.") -------------------------------------------------------------------------------- /dlt-dagster-snowflake/README.md: -------------------------------------------------------------------------------- 1 | # Loading Nested API Data into Snowflake using `dlt` in Dagster 2 | 3 | ## Overview 4 | 5 | This is a demo project that shows how to load nested data into Snowflake using `dlt` (Data Load Tool) in Dagster. It demonstrates the process of defining a `dlt` pipeline as a Dagster resource and implementing the pipeline with `dlt` resources and sources in Dagster assets. Additionally, it integrates AI analysis for sentiment assessment of textual data. 6 | 7 | ![Pipeline overview](https://storage.googleapis.com/dlt-blog-images/dlt_dagster_snowflake_demo_overview.png) 8 | 9 | The diagram above represents the workflow overview of the project, encompassing the following steps: 10 | 11 | 1. Data loading from Google Trends and Hacker News to Snowflake using `dlt`, with an added step for OpenAI sentiment analysis specifically for Hacker News before the loading process. 12 | 2. Data reporting from the destination to a local directory in the form of image files. 13 | 14 | ## Prerequisites 15 | 16 | 1. Snowflake credentials 17 | - username 18 | - password 19 | - account and host 20 | 21 | >The host refers to your account identifier. For instance, if your account is `https://kgiotue-wn98412.snowflakecomputing.com`, your host would be `kgiotue-wn98412`. 22 | 23 | 2. OpenAI API key 24 | 25 | >If you're new to [OpenAI](https://platform.openai.com/), they offer $5 in free credits usable during your first 3 months. 26 | 27 | ## Setup Guide 28 | 29 | 1.
**Clone this repository**: Follow the instructions [here](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository). 30 | 31 | 2. **Create a virtual environment and activate it**: This step is advised to maintain a clean workspace and prevent dependency conflicts, although this is not mandatory. 32 | 33 | ```bash 34 | python -m venv myenv 35 | source myenv/bin/activate 36 | ``` 37 | 3. **Create a `secrets.toml` file in the `.dlt` folder and enter the missing values**: Use the `example.secrets.toml` file for reference. 38 | 39 | > Default values for the role and warehouse are "ACCOUNTADMIN" and "COMPUTE_WH", respectively. 40 | 41 | 4. **Install dependencies**: Run the following command from the project folder. 42 | 43 | ```bash 44 | pip install -r requirements.txt 45 | ``` 46 | 47 | 5. **Start Dagster server**: Run the following command. 48 | ```bash 49 | dagster dev 50 | ``` 51 | 6. **Access Dagster UI**: Launch http://127.0.0.1:3000. 52 | 53 | >If you want to run Dagster in the cloud, or customize your project, consult the official [documentation](https://docs.dagster.io/getting-started). 54 | 55 | ## Understand Your Project 56 | 57 | This project is very minimal, including just what's needed to run Dagster locally with `dlt`. Here's a quick breakdown of its structure: 58 | 59 | 1. `.dlt`: Utilized by the `dlt` library for storing configuration and sensitive information. The Dagster project is set up to fetch secret values from this directory as well. 60 | 61 | 2. `charts`: Used to store chart images generated by assets. 62 | 63 | 3. `dlt_dagster_snowflake_demo`: Your Dagster package, comprising Dagster assets, `dlt` resources, Dagster resources, and general project configurations. 64 | 65 | ### Dagster Resources Explained 66 | 67 | In the `resources` folder, the following two Dagster resources are defined as classes: 68 | 69 | 1. `DltPipeline`: This is our `dlt` object defined as a Dagster `ConfigurableResource` that creates and runs a `dlt` pipeline with the specified data and table name. It will later be used in our Dagster assets to load data into Snowflake. 70 | 71 | ```python 72 | class DltPipeline(ConfigurableResource): 73 | # Initialize resource with pipeline details 74 | pipeline_name: str 75 | dataset_name: str 76 | destination: str 77 | 78 | def create_pipeline(self, resource_data, table_name): 79 | """ 80 | Creates and runs a dlt pipeline with specified data and table name. 81 | 82 | Args: 83 | resource_data: The data to be processed by the pipeline. 84 | table_name: The name of the table where data will be loaded. 85 | 86 | Returns: 87 | The result of the pipeline execution. 88 | """ 89 | 90 | # Configure the dlt pipeline with your destination details 91 | pipeline = dlt.pipeline( 92 | pipeline_name=self.pipeline_name, 93 | destination=self.destination, 94 | dataset_name=self.dataset_name 95 | ) 96 | 97 | # Run the pipeline with your parameters 98 | load_info = pipeline.run(resource_data, table_name=table_name) 99 | return load_info 100 | ``` 101 | 102 | 2. `LocalFileStorage`: Manages the local file storage, ensuring the storage directory exists and allowing data to be written to files within it. It will later be used in our Dagster assets to save images into the `charts` folder. 103 | 104 | ### `dlt` Explained 105 | 106 | In the `dlt` folder within `dlt_dagster_snowflake_demo`, necessary `dlt` resources and sources are defined.
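For orientation, here is a condensed sketch of the resource / transformer / source pattern used in that module (simplified from the actual code in `dlt_dagster_snowflake_demo/dlt/__init__.py`: the OpenAI sentiment scoring, comment pagination, and error handling are omitted):

```python
import dlt
from dlt.sources.helpers import requests


@dlt.resource(write_disposition="append")
def hacker_news(orchestration_tools: tuple[str] = ("Airflow",)):
    # Yield the top commented stories for each tool from the Hacker News search API
    for tool in orchestration_tools:
        response = requests.get(
            f"http://hn.algolia.com/api/v1/search?query={tool}&tags=story&numericFilters=num_comments>=1&hitsPerPage=5"
        )
        yield from ({"tool_name": tool, **hit} for hit in response.json()["hits"])


@dlt.transformer(data_from=hacker_news, write_disposition="append")
def comments(story):
    # Fetch the first page of comments for each story yielded by hacker_news
    response = requests.get(
        f"http://hn.algolia.com/api/v1/search?tags=comment,story_{story['story_id']}"
    )
    yield response.json()["hits"]


@dlt.source()
def hacker_news_full(orchestration_tools: tuple[str] = ("Airflow",)):
    # Pipe the stories resource into the comments transformer
    yield hacker_news(orchestration_tools=orchestration_tools) | comments
```

The full module additionally scores each comment with OpenAI and iterates over all comment pages; the numbered descriptions below cover each piece in turn.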
Below is a visual representation illustrating the functionality of `dlt`: 107 | 108 | ![dlt explained](https://storage.googleapis.com/dlt-blog-images/dlt_dagster_snowflake_demo_dlt.png) 109 | 110 | 1. `hacker_news`: A `dlt` resource that yields stories related to specified orchestration tools from Hackernews. For each tool, it retrieves the top 5 stories that have at least one comment. The stories are then appended to the existing data. 111 | 112 | Note that the `write_disposition` can also be set to `merge` or `replace`: 113 | - The merge write disposition merges the new data from the resource with the existing data at the destination. It requires a primary_key to be specified for the resource. More details can be found here. 114 | - The replace write disposition replaces the data in the destination with the data from the resource. It deletes all the classes and objects and recreates the schema before loading the data. 115 | 116 | More details can be found [here](https://dlthub.com/docs/general-usage/resource). 117 | 118 | 2. `comments`: A `dlt` transformer - a resource that receives data from another resource. It fetches comments for each story yielded by the `hacker_news` function. 119 | 120 | 3. `hacker_news_full`: A `dlt` source that extracts data from the source location using one or more resource components, such as `hacker_news` and `comments`. To illustrate, if the source is a database, a resource corresponds to a table within that database. 121 | 122 | 4. `google_trends`: A `dlt` resource that fetches Google Trends data for specified orchestration tools. It attempts to retrieve the data multiple times in case of failures or empty responses. The retrieved data is then appended to the existing data. 123 | 124 | As you may have noticed, the `dlt` library is designed to handle the unnesting of data internally. When you retrieve data from APIs like Hackernews or Google Trends, `dlt` automatically unpacks the nested structures into relational tables, creating and linking child and parent tables. This is achieved through unique identifiers (`_dlt_id` and `_dlt_parent_id`) that link child tables to specific rows in the parent table. However, it's important to note that you have control over [how this unnesting is done](https://dlthub.com/docs/general-usage/destination-tables). 125 | 126 | ### Dagster Assets Explained 127 | 128 | > If you're new to Dagster, start by understanding the [concept of an asset](https://docs.dagster.io/concepts). 129 | 130 | The assets defined in this project are essentially combinations of `dlt` resources paired with pipeline runs. When materialized, `dlt` objects are initialized, and the pipeline is executed to load data into Snowflake. This project includes the following assets: 131 | 132 | 1. `google_trends_asset`: Loads Google Trends data from the "google_trends" `dlt` resource to Snowflake using a `dlt` pipeline. 133 | 134 | 2. `google_trends_chart`: Generates a line chart visualizing Google Trends data over time and saves it to the local storage. This asset is dependent on `google_trends_asset`, since it uses data that's loaded by the latter. 135 | 136 | 3. `hacker_news_full_asset`: Loads Hackernews data from the "hacker_news_full" `dlt` source to Snowflake using a `dlt` pipeline. 137 | 138 | 4. `hacker_news_chart`: Generates a line chart visualizing the sentiment of comments for each tool and saves it to the local storage. 139 | 140 | 4. 
### Dagster Assets Explained

> If you're new to Dagster, start by understanding the [concept of an asset](https://docs.dagster.io/concepts).

The assets defined in this project are essentially combinations of `dlt` resources paired with pipeline runs. When materialized, the `dlt` objects are initialized and the pipeline is executed to load data into Snowflake. This project includes the following assets:

1. `google_trends_asset`: Loads Google Trends data from the "google_trends" `dlt` resource to Snowflake using a `dlt` pipeline.

2. `google_trends_chart`: Generates a line chart visualizing Google Trends data over time and saves it to local storage. This asset depends on `google_trends_asset`, since it uses the data loaded by the latter.

3. `hacker_news_full_asset`: Loads Hackernews data from the "hacker_news_full" `dlt` source to Snowflake using a `dlt` pipeline.

4. `hacker_news_chart`: Generates a line chart visualizing the sentiment of comments for each tool and saves it to local storage.

5. `hacker_news_asset`: Separately loads Hackernews stories from the "hacker_news" `dlt` resource to Snowflake using a `dlt` pipeline.

## Materialize Your Assets

Once you launch Dagster locally, you can materialize your assets and view the resulting charts in the corresponding folder. Feel free to explore and experiment with this project.

## View Your Data

Your data is now stored in Snowflake. It should look something like this:

![Snowflake view](https://storage.googleapis.com/dlt-blog-images/dlt_dagster_snowflake_demo_view.png)

## Contact / Support
For insights on executing custom pipelines using `dlt` or orchestrating workflows in Dagster, join their Slack communities:

- [dltHub](https://dlthub-community.slack.com)
- [Dagster](https://dagster.io/slack)

For more information on Snowflake, refer to the official [documentation](https://docs.snowflake.com/en/).
-------------------------------------------------------------------------------- /sengled-plug-demo/tuya_helpers/openapi.py: -------------------------------------------------------------------------------- 1 | """Tuya Open API.""" 2 | from __future__ import annotations 3 | 4 | import hashlib 5 | import hmac 6 | import json 7 | import time 8 | from typing import Any 9 | 10 | import requests 11 | 12 | from .openlogging import filter_logger, logger 13 | from .tuya_enums import AuthType 14 | from .version import VERSION 15 | 16 | TUYA_ERROR_CODE_TOKEN_INVALID = 1010 17 | 18 | TO_C_CUSTOM_REFRESH_TOKEN_API = "/v1.0/iot-03/users/token/" 19 | TO_C_SMART_HOME_REFRESH_TOKEN_API = "/v1.0/token/" 20 | 21 | TO_C_CUSTOM_TOKEN_API = "/v1.0/iot-03/users/login" 22 | TO_C_SMART_HOME_TOKEN_API = "/v1.0/iot-01/associated-users/actions/authorized-login" 23 | 24 | 25 | class TuyaTokenInfo: 26 | """Tuya token info. 27 | 28 | Attributes: 29 | access_token: Access token. 30 | expire_time: Valid period in seconds. 31 | refresh_token: Refresh token. 32 | uid: Tuya user ID. 33 | platform_url: user region platform url 34 | """ 35 | 36 | def __init__(self, token_response: dict[str, Any] = None): 37 | """Init TuyaTokenInfo.""" 38 | result = token_response.get("result", {}) 39 | 40 | self.expire_time = ( 41 | token_response.get("t", 0) 42 | + result.get("expire", result.get("expire_time", 0)) * 1000 43 | ) 44 | self.access_token = result.get("access_token", "") 45 | self.refresh_token = result.get("refresh_token", "") 46 | self.uid = result.get("uid", "") 47 | self.platform_url = result.get("platform_url", "") 48 | 49 | 50 | class TuyaOpenAPI: 51 | """Open Api.
52 | 53 | Typical usage example: 54 | 55 | openapi = TuyaOpenAPI(ENDPOINT, ACCESS_ID, ACCESS_KEY) 56 | """ 57 | 58 | def __init__( 59 | self, 60 | endpoint: str, 61 | access_id: str, 62 | access_secret: str, 63 | auth_type: AuthType = AuthType.SMART_HOME, 64 | lang: str = "en", 65 | ) -> None: 66 | """Init TuyaOpenAPI.""" 67 | self.session = requests.session() 68 | 69 | self.endpoint = endpoint 70 | self.access_id = access_id 71 | self.access_secret = access_secret 72 | self.lang = lang 73 | 74 | self.auth_type = auth_type 75 | if self.auth_type == AuthType.CUSTOM: 76 | self.__login_path = TO_C_CUSTOM_TOKEN_API 77 | else: 78 | self.__login_path = TO_C_SMART_HOME_TOKEN_API 79 | 80 | self.token_info: TuyaTokenInfo = None 81 | 82 | self.dev_channel: str = "" 83 | 84 | self.__username = "" 85 | self.__password = "" 86 | self.__country_code = "" 87 | self.__schema = "" 88 | 89 | # https://developer.tuya.com/docs/iot/open-api/api-reference/singnature?id=Ka43a5mtx1gsc 90 | def _calculate_sign( 91 | self, 92 | method: str, 93 | path: str, 94 | params: dict[str, Any] | None = None, 95 | body: dict[str, Any] | None = None, 96 | ) -> tuple[str, int]: 97 | 98 | # HTTPMethod 99 | str_to_sign = method 100 | str_to_sign += "\n" 101 | 102 | # Content-SHA256 103 | content_to_sha256 = ( 104 | "" if body is None or len(body.keys()) == 0 else json.dumps(body) 105 | ) 106 | 107 | str_to_sign += ( 108 | hashlib.sha256(content_to_sha256.encode("utf8")).hexdigest().lower() 109 | ) 110 | str_to_sign += "\n" 111 | 112 | # Header 113 | str_to_sign += "\n" 114 | 115 | # URL 116 | str_to_sign += path 117 | 118 | if params is not None and len(params.keys()) > 0: 119 | str_to_sign += "?" 120 | 121 | params_keys = sorted(params.keys()) 122 | query_builder = "".join(f"{key}={params[key]}&" for key in params_keys) 123 | str_to_sign += query_builder[:-1] 124 | 125 | # Sign 126 | t = int(time.time() * 1000) 127 | 128 | message = self.access_id 129 | if self.token_info is not None: 130 | message += self.token_info.access_token 131 | message += str(t) + str_to_sign 132 | sign = ( 133 | hmac.new( 134 | self.access_secret.encode("utf8"), 135 | msg=message.encode("utf8"), 136 | digestmod=hashlib.sha256, 137 | ) 138 | .hexdigest() 139 | .upper() 140 | ) 141 | return sign, t 142 | 143 | def __refresh_access_token_if_need(self, path: str): 144 | if self.is_connect() is False: 145 | return 146 | 147 | if path.startswith(self.__login_path): 148 | return 149 | 150 | # should use refresh token? 151 | now = int(time.time() * 1000) 152 | expired_time = self.token_info.expire_time 153 | 154 | if expired_time - 60 * 1000 > now: # 1min 155 | return 156 | 157 | self.token_info.access_token = "" 158 | 159 | if self.auth_type == AuthType.CUSTOM: 160 | response = self.post( 161 | TO_C_CUSTOM_REFRESH_TOKEN_API + self.token_info.refresh_token 162 | ) 163 | else: 164 | response = self.get( 165 | TO_C_SMART_HOME_REFRESH_TOKEN_API + self.token_info.refresh_token 166 | ) 167 | 168 | self.token_info = TuyaTokenInfo(response) 169 | 170 | def set_dev_channel(self, dev_channel: str): 171 | """Set dev channel.""" 172 | self.dev_channel = dev_channel 173 | 174 | def connect( 175 | self, 176 | username: str = "", 177 | password: str = "", 178 | country_code: str = "", 179 | schema: str = "", 180 | ) -> dict[str, Any]: 181 | """Connect to Tuya Cloud. 
182 | 183 | Args: 184 | username (str): user name in to C 185 | password (str): user password in to C 186 | country_code (str): country code in SMART_HOME 187 | schema (str): app schema in SMART_HOME 188 | 189 | Returns: 190 | response: connect response 191 | """ 192 | self.__username = username 193 | self.__password = password 194 | self.__country_code = country_code 195 | self.__schema = schema 196 | 197 | if self.auth_type == AuthType.CUSTOM: 198 | response = self.post( 199 | TO_C_CUSTOM_TOKEN_API, 200 | { 201 | "username": username, 202 | "password": hashlib.sha256(password.encode("utf8")) 203 | .hexdigest() 204 | .lower(), 205 | }, 206 | ) 207 | else: 208 | response = self.post( 209 | TO_C_SMART_HOME_TOKEN_API, 210 | { 211 | "username": username, 212 | "password": hashlib.md5(password.encode("utf8")).hexdigest(), 213 | "country_code": country_code, 214 | "schema": schema, 215 | }, 216 | ) 217 | 218 | if not response["success"]: 219 | return response 220 | 221 | # Cache token info. 222 | self.token_info = TuyaTokenInfo(response) 223 | 224 | return response 225 | 226 | def is_connect(self) -> bool: 227 | """Is connect to tuya cloud.""" 228 | return self.token_info is not None and len(self.token_info.access_token) > 0 229 | 230 | def __request( 231 | self, 232 | method: str, 233 | path: str, 234 | params: dict[str, Any] | None = None, 235 | body: dict[str, Any] | None = None, 236 | ) -> dict[str, Any]: 237 | 238 | self.__refresh_access_token_if_need(path) 239 | 240 | access_token = self.token_info.access_token if self.token_info else "" 241 | sign, t = self._calculate_sign(method, path, params, body) 242 | headers = { 243 | "client_id": self.access_id, 244 | "sign": sign, 245 | "sign_method": "HMAC-SHA256", 246 | "access_token": access_token, 247 | "t": str(t), 248 | "lang": self.lang, 249 | } 250 | 251 | if path == self.__login_path or \ 252 | path.startswith(TO_C_CUSTOM_REFRESH_TOKEN_API) or\ 253 | path.startswith(TO_C_SMART_HOME_REFRESH_TOKEN_API): 254 | headers["dev_lang"] = "python" 255 | headers["dev_version"] = VERSION 256 | headers["dev_channel"] = self.dev_channel 257 | 258 | logger.debug( 259 | f"Request: method = {method}, \ 260 | url = {self.endpoint + path},\ 261 | params = {params},\ 262 | body = {filter_logger(body)},\ 263 | t = {int(time.time()*1000)}" 264 | ) 265 | 266 | response = self.session.request( 267 | method, self.endpoint + path, params=params, json=body, headers=headers 268 | ) 269 | 270 | if response.ok is False: 271 | logger.error( 272 | f"Response error: code={response.status_code}, body={response.body}" 273 | ) 274 | return None 275 | 276 | result = response.json() 277 | 278 | logger.debug( 279 | f"Response: {json.dumps(filter_logger(result), ensure_ascii=False, indent=2)}" 280 | ) 281 | 282 | if result.get("code", -1) == TUYA_ERROR_CODE_TOKEN_INVALID: 283 | self.token_info = None 284 | self.connect( 285 | self.__username, self.__password, self.__country_code, self.__schema 286 | ) 287 | 288 | return result 289 | 290 | def get(self, path: str, params: dict[str, Any] | None = None) -> dict[str, Any]: 291 | """Http Get. 292 | 293 | Requests the server to return specified resources. 294 | 295 | Args: 296 | path (str): api path 297 | params (map): request parameter 298 | 299 | Returns: 300 | response: response body 301 | """ 302 | return self.__request("GET", path, params, None) 303 | 304 | def post(self, path: str, body: dict[str, Any] | None = None) -> dict[str, Any]: 305 | """Http Post. 306 | 307 | Requests the server to update specified resources. 
308 | 309 | Args: 310 | path (str): api path 311 | body (map): request body 312 | 313 | Returns: 314 | response: response body 315 | """ 316 | return self.__request("POST", path, None, body) 317 | 318 | def put(self, path: str, body: dict[str, Any] | None = None) -> dict[str, Any]: 319 | """Http Put. 320 | 321 | Requires the server to perform specified operations. 322 | 323 | Args: 324 | path (str): api path 325 | body (map): request body 326 | 327 | Returns: 328 | response: response body 329 | """ 330 | return self.__request("PUT", path, None, body) 331 | 332 | def delete(self, path: str, params: dict[str, Any] | None = None) -> dict[str, Any]: 333 | """Http Delete. 334 | 335 | Requires the server to delete specified resources. 336 | 337 | Args: 338 | path (str): api path 339 | params (map): request param 340 | 341 | Returns: 342 | response: response body 343 | """ 344 | return self.__request("DELETE", path, params, None) 345 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 
49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. 
You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. (Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright 2022 ScaleVector 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 
204 | -------------------------------------------------------------------------------- /sql_to_weaviate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# SQL database to Weaviate using dlt library\n", 7 | "Example for [Public MySQL Database.](https://docs.rfam.org/en/latest/database.html)" 8 | ], 9 | "metadata": { 10 | "collapsed": false 11 | }, 12 | "id": "b34fe934cb4f242c" 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "outputs": [], 18 | "source": [ 19 | "!pip install -q \"dlt[weaviate]\"" 20 | ], 21 | "metadata": { 22 | "collapsed": false, 23 | "ExecuteTime": { 24 | "start_time": "2023-09-07T16:15:32.762937679Z" 25 | } 26 | }, 27 | "id": "2841cf5886b5df42" 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "source": [ 32 | "Let's init [verified source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/sql_database) `sql_database` with dlt cli command:" 33 | ], 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "id": "f4b452c86009840e" 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "Looking up the init scripts in \u001B[1mhttps://github.com/dlt-hub/verified-sources.git\u001B[0m...\r\n", 48 | "Cloning and configuring a verified source \u001B[1msql_database\u001B[0m (Source that loads tables form any SQLAlchemy supported database, supports batching requests and incremental loads.)\r\n", 49 | "\r\n", 50 | "Verified source \u001B[1msql_database\u001B[0m was added to your project!\r\n", 51 | "* See the usage examples and code snippets to copy from \u001B[1msql_database_pipeline.py\u001B[0m\r\n", 52 | "* Add credentials for \u001B[1mweaviate\u001B[0m and other secrets in \u001B[1m./.dlt/secrets.toml\u001B[0m\r\n", 53 | "* Add the required dependencies to \u001B[1mpyproject.toml\u001B[0m:\r\n", 54 | " \u001B[1msqlalchemy>=1.4\u001B[0m\r\n", 55 | " \u001B[1mdlt[weaviate]<0.4,>=0.3.5\u001B[0m\r\n", 56 | " If the dlt dependency is already added, make sure you install the extra for \u001B[1mweaviate\u001B[0m to it\r\n", 57 | " If you are using poetry you may issue the following command:\r\n", 58 | "\u001B[1m poetry add dlt -E weaviate\u001B[0m\r\n", 59 | "\r\n", 60 | "* Read \u001B[1mhttps://dlthub.com/docs/walkthroughs/create-a-pipeline\u001B[0m for more information\r\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "!dlt --non-interactive init sql_database weaviate " 66 | ], 67 | "metadata": { 68 | "collapsed": false, 69 | "ExecuteTime": { 70 | "end_time": "2023-09-07T16:15:39.112449485Z", 71 | "start_time": "2023-09-07T16:15:35.652268880Z" 72 | } 73 | }, 74 | "id": "82b557c903459372" 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 15, 79 | "outputs": [], 80 | "source": [ 81 | "!pip install -q sqlalchemy" 82 | ], 83 | "metadata": { 84 | "collapsed": false, 85 | "ExecuteTime": { 86 | "start_time": "2023-09-07T16:43:27.653554239Z" 87 | } 88 | }, 89 | "id": "998f9710b6661067" 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 12, 94 | "outputs": [], 95 | "source": [ 96 | "import os\n", 97 | "import weaviate\n", 98 | "\n", 99 | "\n", 100 | "def show_data(class_name, properties):\n", 101 | " client = weaviate.Client(\n", 102 | " url=os.getenv(\"WEAVIATE_URL\"),\n", 103 | " auth_client_secret=weaviate.AuthApiKey(\n", 104 | " api_key=os.getenv(\"WEAVIATE_API_KEY\")\n", 105 | " ),\n", 106 | " 
additional_headers={\n", 107 | " \"X-OpenAI-Api-Key\": os.getenv(\"WEAVIATE_OPENAI_KEY\")\n", 108 | " }\n", 109 | " )\n", 110 | "\n", 111 | " response = (\n", 112 | " client.query\n", 113 | " .get(class_name, properties)\n", 114 | " .do()\n", 115 | " )\n", 116 | " return response" 117 | ], 118 | "metadata": { 119 | "collapsed": false, 120 | "ExecuteTime": { 121 | "end_time": "2023-09-07T16:39:30.455113374Z", 122 | "start_time": "2023-09-07T16:39:30.414665860Z" 123 | } 124 | }, 125 | "id": "f96b7b53f2250ad" 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "source": [ 130 | "Put credentials into `.dlt/secrets.toml` file like this:\n", 131 | "\n", 132 | "```\n", 133 | "[sources.sql_database.credentials]\n", 134 | "drivername = \"mysql+pymysql\" # driver name for the database\n", 135 | "database = \"Rfam\" # database name\n", 136 | "username = \"rfamro\" # username associated with the database\n", 137 | "host = \"mysql-rfam-public.ebi.ac.uk\" # host address\n", 138 | "port = \"4497\" # port required for connection\n", 139 | "```\n", 140 | "\n", 141 | "[More info about credentials.](https://dlthub.com/docs/general-usage/credentials)" 142 | ], 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "id": "b71060e1b9377e70" 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 4, 151 | "outputs": [ 152 | { 153 | "name": "stderr", 154 | "output_type": "stream", 155 | "text": [ 156 | "/home/alenaastrakhantseva/.cache/pypoetry/virtualenvs/weaviate-demo-9BqQS6RD-py3.10/lib/python3.10/site-packages/weaviate/warnings.py:130: DeprecationWarning: Dep006: You are using the `client.batch()` method, which will be removed in the next major release.\n", 157 | " Please instead use the `client.batch.configure()` method to configure your batch and `client.batch` to enter the context manager.\n", 158 | " See https://weaviate.io/developers/weaviate/client-libraries/python for details.\n", 159 | " warnings.warn(\n" 160 | ] 161 | }, 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "Normalized data for the following tables:\n", 167 | "- DltPipelineState: 1 row(s)\n", 168 | "- Family: 4108 row(s)\n", 169 | "\n", 170 | "------\n", 171 | "Pipeline rfam completed in 1 minute and 37.41 seconds\n", 172 | "1 load package(s) were loaded to destination weaviate and into dataset Rfam\n", 173 | "The weaviate destination used https://demo-1-wvxjul5s.weaviate.network location to store data\n", 174 | "Load package 1694104408.537511 is LOADED and contains no failed jobs\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "import dlt\n", 180 | "from dlt.destinations.weaviate import weaviate_adapter\n", 181 | "\n", 182 | "from sql_database import sql_table\n", 183 | "\n", 184 | "pipeline = dlt.pipeline(\n", 185 | " pipeline_name=\"rfam\", destination='weaviate', dataset_name=\"rfam\"\n", 186 | ")\n", 187 | "\n", 188 | "load_source = sql_table(table=\"family\",)\n", 189 | "load_info = pipeline.run(weaviate_adapter(load_source, vectorize=\"description\", tokenization={\"description\": \"word\"}))\n", 190 | "\n", 191 | "# pretty print the information on data that was loaded\n", 192 | "row_counts = pipeline.last_trace.last_normalize_info\n", 193 | "print(row_counts)\n", 194 | "print(\"------\")\n", 195 | "print(load_info)" 196 | ], 197 | "metadata": { 198 | "collapsed": false, 199 | "ExecuteTime": { 200 | "end_time": "2023-09-07T16:35:03.547036147Z", 201 | "start_time": "2023-09-07T16:33:25.356346391Z" 202 | } 203 | }, 204 | "id": "3972d77c699a500a" 205 | }, 206 | { 207 | 
"cell_type": "code", 208 | "execution_count": 14, 209 | "outputs": [ 210 | { 211 | "data": { 212 | "text/plain": "{'data': {'Get': {'Rfam_Family': [{'description': 'CDKN2B antisense RNA 1 intronic convserved region'},\n {'description': 'microRNA mir-605'},\n {'description': 'mir-974 microRNA precursor family'},\n {'description': 'microRNA mir-633'},\n {'description': 'microRNA mir-569'},\n {'description': 'mir-6715 microRNA precursor family'},\n {'description': 'Small nucleolar RNA Z103'},\n {'description': 'Small nucleolar RNA SNORD70'},\n {'description': 'mir-5856 microRNA precursor family'},\n {'description': 'ctRNA'},\n {'description': 'MIR4245 microRNA precursor family'},\n {'description': 'mir-2068 microRNA precursor family'},\n {'description': 'mir-5890 microRNA precursor family'},\n {'description': 'Leptospira sRNA 30_255'},\n {'description': 'Pospiviroid RY motif stem loop'},\n {'description': 'TeloSII non coding RNA 45'},\n {'description': 'MIR2871 microRNA precursor family'},\n {'description': 'mir-1017 microRNA precursor family'},\n {'description': 'Rickettsia rpsL leader'},\n {'description': 'microRNA mir-70'},\n {'description': 'MIR1882 microRNA precursor family'},\n {'description': 'L31-Coriobacteria ribosomal protein leader'},\n {'description': 'Small nucleolar RNA snoR60'},\n {'description': 'Brucella sRNA 1350'},\n {'description': 'mir-3121 microRNA precursor family'},\n {'description': 'microRNA MIR1122'},\n {'description': 'Gag/pro ribosomal frameshift site'},\n {'description': 'RT-3 RNA'},\n {'description': 'mir-2513 microRNA precursor family'},\n {'description': 'Rhodo-rpoB RNA'},\n {'description': 'Antisense to pHK01_099'},\n {'description': 'SPRY4-IT1 conserved region 2'},\n {'description': 'Small nucleolar RNA snR44'},\n {'description': 'Fluoride riboswitch (crcB)'},\n {'description': 'mir-156 microRNA precursor'},\n {'description': 'mir-4773 microRNA precursor family'},\n {'description': 'mir-1397 microRNA precursor family'},\n {'description': 'mir-3047 microRNA precursor family'},\n {'description': 'Non-coding RNA BC040587'},\n {'description': 'TeloSII non coding RNA 30'},\n {'description': 'microRNA mir-636'},\n {'description': 'microRNA mir-214'},\n {'description': \"Flavivirus 5' UTR\"},\n {'description': 'mir-561 microRNA precursor family'},\n {'description': 'Burkholderia sRNA 37'},\n {'description': 'RAGATH-6 RNA'},\n {'description': 'Small nucleolar RNA ZL1'},\n {'description': 'osmY RNA'},\n {'description': \"AilA 5' UTR thermometer\"},\n {'description': 'SMAD5 antisense RNA 1 conserved region 3'},\n {'description': 'Fst antitoxin sRNA'},\n {'description': 'DUF3800-XI RNA'},\n {'description': 'Caenorhabditis snoRNA ceN46'},\n {'description': 'FTX transcript, XIST regulator conserved region 4'},\n {'description': 'Streptococcus sRNA SpF11'},\n {'description': 'mir-15_2 microRNA precursor family'},\n {'description': 'microRNA mir-2518'},\n {'description': 'mir-3347 microRNA precursor family'},\n {'description': \"Insect-specific Flavivirus 3' UTR cis-acting replication element (CRE)\"},\n {'description': 'Burkholderia sRNA Bp1_Cand612_SIPHT'},\n {'description': 'mir-2235 microRNA precursor family'},\n {'description': 'Listeria sRNA rliB'},\n {'description': 'mir-3618 microRNA precursor family'},\n {'description': 'Salmonella enterica conserved region STnc30'},\n {'description': 'mir-4526 microRNA precursor family'},\n {'description': 'S. 
pyogenes small RNA 1186876'},\n {'description': 'mir-3064 microRNA precursor family'},\n {'description': 'Fungal small nucleolar RNA U3'},\n {'description': 'mir-3204 microRNA precursor family'},\n {'description': 'mir-5928 microRNA precursor family'},\n {'description': 'Small nucleolar RNA U18'},\n {'description': 'Small nucleolar RNA sR48'},\n {'description': 'RAGATH-15 RNA'},\n {'description': 'Streptomyces sRNA 4677'},\n {'description': 'mir-4079 microRNA precursor family'},\n {'description': 'microRNA mir-58'},\n {'description': 'Small nucleolar RNA SNORD60'},\n {'description': 'Listeria sRNA rli43'},\n {'description': 'RAGATH-13 RNA'},\n {'description': 'Plasmodium RNA of unkown function RU6-F3'},\n {'description': 'mir-2059 microRNA precursor family'},\n {'description': 'Small nucleolar RNA ACA59'},\n {'description': 'microRNA mir-328'},\n {'description': 'mir-2278 microRNA precursor family'},\n {'description': 'microRNA mir-330'},\n {'description': 'Pseudomonas sRNA P34'},\n {'description': 'Small nucleolar RNA Z39'},\n {'description': 'Small nucleolar RNA SNORA15'},\n {'description': 'Small nucleolar RNA snoR144'},\n {'description': 'Vibrio alginolyticus sRNA 907'},\n {'description': 'Deleted in lymphocytic leukemia 2 conserved region 3'},\n {'description': 'mir-1822 microRNA precursor family'},\n {'description': 'MIR7725 microRNA precursor family'},\n {'description': 'mir-6129 microRNA precursor family'},\n {'description': 'mir-2767 microRNA precursor family'},\n {'description': 'CRISPR RNA direct repeat element'},\n {'description': 'S15-Flavobacteria ribosomal protein leader'},\n {'description': 'Actinomyces-1 RNA'},\n {'description': 'Small Cajal body specific RNA ncR21'},\n {'description': 'Z30 small nucleolar RNA'}]}}}" 213 | }, 214 | "execution_count": 14, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "# class_name: table name \"Family\" + dataset name \"Rfam\"\n", 221 | "show_data(\"Rfam_Family\", [\"description\"])" 222 | ], 223 | "metadata": { 224 | "collapsed": false, 225 | "ExecuteTime": { 226 | "end_time": "2023-09-07T16:39:43.839817411Z", 227 | "start_time": "2023-09-07T16:39:42.788097606Z" 228 | } 229 | }, 230 | "id": "942793b835309409" 231 | } 232 | ], 233 | "metadata": { 234 | "kernelspec": { 235 | "display_name": "Python 3", 236 | "language": "python", 237 | "name": "python3" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 2 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython2", 249 | "version": "2.7.6" 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 5 254 | } 255 | -------------------------------------------------------------------------------- /dlt-init-openapi-demo/stripe_pipeline/rest_api/__init__.py: -------------------------------------------------------------------------------- 1 | """Generic API Source""" 2 | 3 | from typing import ( 4 | Type, 5 | Any, 6 | Dict, 7 | List, 8 | Optional, 9 | Generator, 10 | Callable, 11 | cast, 12 | ) 13 | import graphlib # type: ignore[import,unused-ignore] 14 | 15 | import dlt 16 | from dlt.common.validation import validate_dict 17 | from dlt.common import jsonpath 18 | from dlt.common.schema.schema import Schema 19 | from dlt.common.schema.typing import TSchemaContract 20 | from dlt.common.configuration.specs import BaseConfiguration 21 | 22 | from dlt.extract.incremental import 
Incremental 23 | from dlt.extract.source import DltResource, DltSource 24 | 25 | from dlt.sources.helpers.rest_client import RESTClient 26 | from dlt.sources.helpers.rest_client.paginators import BasePaginator 27 | from dlt.sources.helpers.rest_client.typing import HTTPMethodBasic 28 | from .typing import ( 29 | ClientConfig, 30 | ResolvedParam, 31 | Endpoint, 32 | EndpointResource, 33 | RESTAPIConfig, 34 | ) 35 | from .config_setup import ( 36 | IncrementalParam, 37 | create_auth, 38 | create_paginator, 39 | build_resource_dependency_graph, 40 | process_parent_data_item, 41 | setup_incremental_object, 42 | create_response_hooks, 43 | ) 44 | from .utils import check_connection, exclude_keys # noqa: F401 45 | 46 | 47 | def rest_api_source( 48 | config: RESTAPIConfig, 49 | name: str = None, 50 | section: str = None, 51 | max_table_nesting: int = None, 52 | root_key: bool = False, 53 | schema: Schema = None, 54 | schema_contract: TSchemaContract = None, 55 | spec: Type[BaseConfiguration] = None, 56 | ) -> DltSource: 57 | """Creates and configures a REST API source for data extraction. 58 | 59 | Args: 60 | config (RESTAPIConfig): Configuration for the REST API source. 61 | name (str, optional): Name of the source. 62 | section (str, optional): Section of the configuration file. 63 | max_table_nesting (int, optional): Maximum depth of nested table above which 64 | the remaining nodes are loaded as structs or JSON. 65 | root_key (bool, optional): Enables merging on all resources by propagating 66 | root foreign key to child tables. This option is most useful if you 67 | plan to change write disposition of a resource to disable/enable merge. 68 | Defaults to False. 69 | schema (Schema, optional): An explicit `Schema` instance to be associated 70 | with the source. If not present, `dlt` creates a new `Schema` object 71 | with provided `name`. If such `Schema` already exists in the same 72 | folder as the module containing the decorated function, such schema 73 | will be loaded from file. 74 | schema_contract (TSchemaContract, optional): Schema contract settings 75 | that will be applied to this resource. 76 | spec (Type[BaseConfiguration], optional): A specification of configuration 77 | and secret values required by the source. 78 | 79 | Returns: 80 | DltSource: A configured dlt source. 81 | 82 | Example: 83 | pokemon_source = rest_api_source({ 84 | "client": { 85 | "base_url": "https://pokeapi.co/api/v2/", 86 | "paginator": "json_response", 87 | }, 88 | "endpoints": { 89 | "pokemon": { 90 | "params": { 91 | "limit": 100, # Default page size is 20 92 | }, 93 | "resource": { 94 | "primary_key": "id", 95 | } 96 | }, 97 | }, 98 | }) 99 | """ 100 | decorated = dlt.source( 101 | rest_api_resources, 102 | name, 103 | section, 104 | max_table_nesting, 105 | root_key, 106 | schema, 107 | schema_contract, 108 | spec, 109 | ) 110 | 111 | return decorated(config) 112 | 113 | 114 | def rest_api_resources(config: RESTAPIConfig) -> List[DltResource]: 115 | """Creates a list of resources from a REST API configuration. 116 | 117 | Args: 118 | config (RESTAPIConfig): Configuration for the REST API source. 119 | 120 | Returns: 121 | List[DltResource]: List of dlt resources. 
122 | 123 | Example: 124 | github_source = rest_api_resources({ 125 | "client": { 126 | "base_url": "https://api.github.com/repos/dlt-hub/dlt/", 127 | "auth": { 128 | "token": dlt.secrets["token"], 129 | }, 130 | }, 131 | "resource_defaults": { 132 | "primary_key": "id", 133 | "write_disposition": "merge", 134 | "endpoint": { 135 | "params": { 136 | "per_page": 100, 137 | }, 138 | }, 139 | }, 140 | "resources": [ 141 | { 142 | "name": "issues", 143 | "endpoint": { 144 | "path": "issues", 145 | "params": { 146 | "sort": "updated", 147 | "direction": "desc", 148 | "state": "open", 149 | "since": { 150 | "type": "incremental", 151 | "cursor_path": "updated_at", 152 | "initial_value": "2024-01-25T11:21:28Z", 153 | }, 154 | }, 155 | }, 156 | }, 157 | { 158 | "name": "issue_comments", 159 | "endpoint": { 160 | "path": "issues/{issue_number}/comments", 161 | "params": { 162 | "issue_number": { 163 | "type": "resolve", 164 | "resource": "issues", 165 | "field": "number", 166 | } 167 | }, 168 | }, 169 | }, 170 | ], 171 | }) 172 | """ 173 | 174 | validate_dict(RESTAPIConfig, config, path="") 175 | 176 | client_config = config["client"] 177 | resource_defaults = config.get("resource_defaults", {}) 178 | resource_list = config["resources"] 179 | 180 | ( 181 | dependency_graph, 182 | endpoint_resource_map, 183 | resolved_param_map, 184 | ) = build_resource_dependency_graph( 185 | resource_defaults, 186 | resource_list, 187 | ) 188 | 189 | resources = create_resources( 190 | client_config, 191 | dependency_graph, 192 | endpoint_resource_map, 193 | resolved_param_map, 194 | ) 195 | 196 | return list(resources.values()) 197 | 198 | 199 | def create_resources( 200 | client_config: ClientConfig, 201 | dependency_graph: graphlib.TopologicalSorter, 202 | endpoint_resource_map: Dict[str, EndpointResource], 203 | resolved_param_map: Dict[str, Optional[ResolvedParam]], 204 | ) -> Dict[str, DltResource]: 205 | resources = {} 206 | 207 | for resource_name in dependency_graph.static_order(): 208 | resource_name = cast(str, resource_name) 209 | endpoint_resource = endpoint_resource_map[resource_name] 210 | endpoint_config = cast(Endpoint, endpoint_resource["endpoint"]) 211 | request_params = endpoint_config.get("params", {}) 212 | request_json = endpoint_config.get("json", None) 213 | paginator = create_paginator(endpoint_config.get("paginator")) 214 | 215 | resolved_param: ResolvedParam = resolved_param_map[resource_name] 216 | 217 | include_from_parent: List[str] = endpoint_resource.get( 218 | "include_from_parent", [] 219 | ) 220 | if not resolved_param and include_from_parent: 221 | raise ValueError( 222 | f"Resource {resource_name} has include_from_parent but is not " 223 | "dependent on another resource" 224 | ) 225 | 226 | ( 227 | incremental_object, 228 | incremental_param, 229 | ) = setup_incremental_object(request_params, endpoint_config.get("incremental")) 230 | 231 | client = RESTClient( 232 | base_url=client_config["base_url"], 233 | headers=client_config.get("headers"), 234 | auth=create_auth(client_config.get("auth")), 235 | paginator=create_paginator(client_config.get("paginator")), 236 | ) 237 | 238 | hooks = create_response_hooks(endpoint_config.get("response_actions")) 239 | 240 | resource_kwargs = exclude_keys( 241 | endpoint_resource, {"endpoint", "include_from_parent"} 242 | ) 243 | 244 | if resolved_param is None: 245 | 246 | def paginate_resource( 247 | method: HTTPMethodBasic, 248 | path: str, 249 | params: Dict[str, Any], 250 | json: Optional[Dict[str, Any]], 251 | paginator: 
Optional[BasePaginator], 252 | data_selector: Optional[jsonpath.TJsonPath], 253 | hooks: Optional[Dict[str, Any]], 254 | client: RESTClient = client, 255 | incremental_object: Optional[Incremental[Any]] = incremental_object, 256 | incremental_param: IncrementalParam = incremental_param, 257 | ) -> Generator[Any, None, None]: 258 | if incremental_object: 259 | params[incremental_param.start] = incremental_object.last_value 260 | if incremental_param.end: 261 | params[incremental_param.end] = incremental_object.end_value 262 | 263 | yield from client.paginate( 264 | method=method, 265 | path=path, 266 | params=params, 267 | json=json, 268 | paginator=paginator, 269 | data_selector=data_selector, 270 | hooks=hooks, 271 | ) 272 | 273 | resources[resource_name] = dlt.resource( 274 | paginate_resource, 275 | **resource_kwargs, # TODO: implement typing.Unpack 276 | )( 277 | method=endpoint_config.get("method", "get"), 278 | path=endpoint_config.get("path"), 279 | params=request_params, 280 | json=request_json, 281 | paginator=paginator, 282 | data_selector=endpoint_config.get("data_selector"), 283 | hooks=hooks, 284 | ) 285 | 286 | else: 287 | predecessor = resources[resolved_param.resolve_config["resource"]] 288 | 289 | base_params = exclude_keys(request_params, {resolved_param.param_name}) 290 | 291 | def paginate_dependent_resource( 292 | items: List[Dict[str, Any]], 293 | method: HTTPMethodBasic, 294 | path: str, 295 | params: Dict[str, Any], 296 | paginator: Optional[BasePaginator], 297 | data_selector: Optional[jsonpath.TJsonPath], 298 | hooks: Optional[Dict[str, Any]], 299 | client: RESTClient = client, 300 | resolved_param: ResolvedParam = resolved_param, 301 | include_from_parent: List[str] = include_from_parent, 302 | incremental_object: Optional[Incremental[Any]] = incremental_object, 303 | incremental_param: IncrementalParam = incremental_param, 304 | ) -> Generator[Any, None, None]: 305 | if incremental_object: 306 | params[incremental_param.start] = incremental_object.last_value 307 | if incremental_param.end: 308 | params[incremental_param.end] = incremental_object.end_value 309 | 310 | for item in items: 311 | formatted_path, parent_record = process_parent_data_item( 312 | path, item, resolved_param, include_from_parent 313 | ) 314 | 315 | for child_page in client.paginate( 316 | method=method, 317 | path=formatted_path, 318 | params=params, 319 | paginator=paginator, 320 | data_selector=data_selector, 321 | hooks=hooks, 322 | ): 323 | if parent_record: 324 | for child_record in child_page: 325 | child_record.update(parent_record) 326 | yield child_page 327 | 328 | resources[resource_name] = dlt.resource( # type: ignore[call-overload] 329 | paginate_dependent_resource, 330 | data_from=predecessor, 331 | **resource_kwargs, # TODO: implement typing.Unpack 332 | )( 333 | method=endpoint_config.get("method", "get"), 334 | path=endpoint_config.get("path"), 335 | params=base_params, 336 | paginator=paginator, 337 | data_selector=endpoint_config.get("data_selector"), 338 | hooks=hooks, 339 | ) 340 | 341 | return resources 342 | 343 | 344 | # XXX: This is a workaround pass test_dlt_init.py 345 | # since the source uses dlt.source as a function 346 | def _register_source(source_func: Callable[..., DltSource]) -> None: 347 | import inspect 348 | from dlt.common.configuration import get_fun_spec 349 | from dlt.common.source import _SOURCES, SourceInfo 350 | 351 | spec = get_fun_spec(source_func) 352 | func_module = inspect.getmodule(source_func) 353 | _SOURCES[source_func.__name__] = 
SourceInfo( 354 | SPEC=spec, 355 | f=source_func, 356 | module=func_module, 357 | ) 358 | 359 | 360 | _register_source(rest_api_source) 361 | --------------------------------------------------------------------------------