├── tests ├── __init__.py ├── integration │ ├── __init__.py │ ├── project │ │ ├── .env.local │ │ ├── .gitignore │ │ ├── endpoints │ │ │ ├── simple_kv.pipe │ │ │ └── simple_pipe.pipe │ │ ├── datasources │ │ │ └── simple.datasource │ │ ├── CLAUDE.md │ │ └── .cursorrules │ ├── test_client.py │ ├── test_tokens.py │ ├── test_pipe.py │ ├── test_datasource.py │ ├── test_events.py │ ├── conftest.py │ ├── test_variables.py │ ├── test_query.py │ ├── test_datasources.py │ └── test_pipes.py ├── test_utils.py ├── test_pipe.py ├── utils.py ├── test_query.py ├── test_datasource.py └── test_worker.py ├── verdin ├── test │ ├── __init__.py │ ├── container.py │ └── cli.py ├── config.py ├── __init__.py ├── api │ ├── __init__.py │ ├── apis.py │ ├── base.py │ ├── events.py │ ├── tokens.py │ ├── variables.py │ ├── query.py │ ├── pipes.py │ └── datasources.py ├── tinybird.py ├── client.py ├── query.py ├── pipe.py ├── datasource.py └── worker.py ├── Makefile ├── .github └── workflows │ └── build.yml ├── pyproject.toml ├── .gitignore ├── README.md └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /verdin/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration/project/.env.local: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration/project/.gitignore: -------------------------------------------------------------------------------- 1 | .tinyb 2 | .terraform 3 | -------------------------------------------------------------------------------- /verdin/config.py: -------------------------------------------------------------------------------- 1 | API_URL: str = "https://api.tinybird.co" 2 | -------------------------------------------------------------------------------- /verdin/__init__.py: -------------------------------------------------------------------------------- 1 | name = "verdin" 2 | 3 | __version__ = "0.5.1" 4 | -------------------------------------------------------------------------------- /verdin/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import ApiError, ApiResponse 2 | 3 | __all__ = [ 4 | "ApiError", 5 | "ApiResponse", 6 | ] 7 | -------------------------------------------------------------------------------- /tests/integration/test_client.py: -------------------------------------------------------------------------------- 1 | def test_client_has_token(client): 2 | """Makes sure the client fixture loaded the admin token correctly""" 3 | assert client.token.startswith("p.e") 4 | -------------------------------------------------------------------------------- /tests/integration/project/endpoints/simple_kv.pipe: -------------------------------------------------------------------------------- 1 | VERSION 0 2 | 3 | DESCRIPTION > 4 | Endpoint to select unique key/value pairs from simple 5 | 6 | NODE endpoint 7 | SQL > 8 | % 9 | SELECT key, value 10 | FROM simple 11 | ORDER BY key, timestamp desc 12 | LIMIT 1 by key 13 | 14 | TYPE 
ENDPOINT 15 | -------------------------------------------------------------------------------- /tests/integration/project/endpoints/simple_pipe.pipe: -------------------------------------------------------------------------------- 1 | VERSION 0 2 | 3 | DESCRIPTION > 4 | Endpoint to select specific keys from the table 5 | 6 | NODE endpoint 7 | SQL > 8 | % 9 | SELECT * 10 | FROM simple 11 | WHERE 1=1 12 | {% if defined(key) %} AND key == {{ String(key) }} {% end %} 13 | 14 | TYPE ENDPOINT 15 | -------------------------------------------------------------------------------- /tests/integration/project/datasources/simple.datasource: -------------------------------------------------------------------------------- 1 | DESCRIPTION > 2 | Simple Key-Value Data Source 3 | 4 | SCHEMA > 5 | id UUID `json:$.Id`, 6 | timestamp DateTime64(6) `json:$.Timestamp`, 7 | key String `json:$.Key`, 8 | value String `json:$.Value` 9 | 10 | ENGINE "MergeTree" 11 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tests.utils import retry 4 | 5 | 6 | def test_retry(): 7 | assert retry(lambda: "foo") == "foo" 8 | assert retry(lambda x: f"foo: {x}", kwargs={"x": "bar"}) == "foo: bar" 9 | assert retry(lambda x: f"foo: {x}", args=("bar",)) == "foo: bar" 10 | 11 | 12 | def test_retry_error(): 13 | def _raise_error(): 14 | raise ValueError("oh noes") 15 | 16 | with pytest.raises(TimeoutError) as e: 17 | retry(_raise_error, retries=2, interval=0.1) 18 | 19 | assert e.match("oh noes") 20 | -------------------------------------------------------------------------------- /verdin/tinybird.py: -------------------------------------------------------------------------------- 1 | from .client import Client 2 | from .datasource import Datasource, Record 3 | from .pipe import Pipe, PipeError, PipeJsonData, PipeJsonResponse, PipeMetadata, PipePageIterator 4 | from .query import OutputFormat, QueryError, QueryJsonResult, SqlQuery 5 | 6 | __all__ = [ 7 | "Client", 8 | "Datasource", 9 | "Record", 10 | "Pipe", 11 | "PipeError", 12 | "PipeMetadata", 13 | "PipeJsonData", 14 | "PipeJsonResponse", 15 | "PipePageIterator", 16 | "SqlQuery", 17 | "QueryError", 18 | "OutputFormat", 19 | "QueryJsonResult", 20 | ] 21 | -------------------------------------------------------------------------------- /tests/test_pipe.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | from verdin.pipe import PagedPipeQuery, PipeMetadata 4 | 5 | 6 | class MockPipeJsonResponse: 7 | def __init__(self, empty: bool, data: Optional[Dict], meta: PipeMetadata): 8 | self.empty = empty 9 | self.data = data 10 | self.meta = meta 11 | 12 | 13 | class TestPagedPipeQuery: 14 | def test(self): 15 | queries = list() 16 | 17 | class MockPipe: 18 | def sql(self, query): 19 | queries.append(query) 20 | 21 | if len(queries) == 2: 22 | return MockPipeJsonResponse(empty=True, data=None, meta=[]) 23 | 24 | return MockPipeJsonResponse(empty=False, data={}, meta=[]) 25 | 26 | for page in PagedPipeQuery(pipe=MockPipe(), page_size=10, start_at=0): 27 | assert page.empty is False 28 | 29 | assert len(queries) == 2 30 | assert queries[0] == "SELECT * FROM _ LIMIT 10 OFFSET 0" 31 | assert queries[1] == "SELECT * FROM _ LIMIT 10 OFFSET 10" 32 | -------------------------------------------------------------------------------- /Makefile: 
-------------------------------------------------------------------------------- 1 | VENV_BIN = python3 -m venv 2 | VENV_DIR ?= .venv 3 | VENV_ACTIVATE = $(VENV_DIR)/bin/activate 4 | VENV_RUN = . $(VENV_ACTIVATE) 5 | 6 | 7 | venv: $(VENV_ACTIVATE) 8 | 9 | $(VENV_ACTIVATE): pyproject.toml 10 | test -d $(VENV_DIR) || $(VENV_BIN) $(VENV_DIR) 11 | $(VENV_RUN); pip install -e ".[dev]" 12 | touch $(VENV_DIR)/bin/activate 13 | 14 | clean: 15 | rm -rf build/ 16 | rm -rf .eggs/ 17 | rm -rf *.egg-info/ 18 | rm -rf .venv 19 | 20 | clean-dist: clean 21 | rm -rf dist/ 22 | 23 | lint: venv 24 | $(VENV_RUN); python -m ruff check . 25 | 26 | format: venv 27 | $(VENV_RUN); python -m ruff format . && python -m ruff check . --fix 28 | 29 | test: venv 30 | $(VENV_RUN); python -m pytest 31 | 32 | test-coverage: venv 33 | $(VENV_RUN); coverage run --source=verdin -m pytest tests && coverage lcov -o .coverage.lcov 34 | 35 | dist: venv 36 | $(VENV_RUN); python -m build 37 | 38 | install: venv 39 | $(VENV_RUN); pip install -e . 40 | 41 | upload: venv 42 | $(VENV_RUN); pip install --upgrade twine; twine upload dist/* 43 | 44 | .PHONY: clean clean-dist format test test-coverage upload 45 | -------------------------------------------------------------------------------- /tests/integration/test_tokens.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from verdin.api import ApiError 4 | 5 | 6 | class TestTokensApi: 7 | def test_list(self, client): 8 | api = client.api.tokens 9 | tokens = api.list().tokens 10 | assert tokens 11 | assert "admin local_testing@tinybird.co" in [token["name"] for token in tokens] 12 | 13 | def test_get_information(self, client): 14 | api = client.api.tokens 15 | 16 | token = api.get_information("admin local_testing@tinybird.co").info 17 | assert token["name"] == "admin local_testing@tinybird.co" 18 | assert token["token"].startswith("p.e") 19 | 20 | # make sure it also works with the id 21 | token = api.get_information(token["id"]).info 22 | assert token["name"] == "admin local_testing@tinybird.co" 23 | 24 | def test_get_information_on_non_existing_token(self, client): 25 | api = client.api.tokens 26 | 27 | with pytest.raises(ApiError) as e: 28 | api.get_information("NON EXISTING TOKEN") 29 | 30 | assert e.match("Token has not enough permissions to get information about this token") 31 | assert e.value.status_code == 403 32 | -------------------------------------------------------------------------------- /verdin/api/apis.py: -------------------------------------------------------------------------------- 1 | from .datasources import DataSourcesApi 2 | from .events import EventsApi 3 | from .pipes import PipesApi 4 | from .query import QueryApi 5 | from .tokens import TokensApi 6 | from .variables import VariablesApi 7 | 8 | 9 | class Apis: 10 | """ 11 | Factory for Api objects. 
12 | """ 13 | 14 | _token: str 15 | _host: str | None 16 | 17 | def __init__(self, token: str, host: str = None): 18 | self._token = token 19 | self._host = host 20 | 21 | @property 22 | def datasources(self) -> DataSourcesApi: 23 | return DataSourcesApi(self._token, self._host) 24 | 25 | @property 26 | def events(self) -> EventsApi: 27 | return EventsApi(self._token, self._host) 28 | 29 | @property 30 | def pipes(self) -> PipesApi: 31 | return PipesApi(self._token, self._host) 32 | 33 | @property 34 | def query(self) -> QueryApi: 35 | return QueryApi(self._token, self._host) 36 | 37 | @property 38 | def tokens(self) -> TokensApi: 39 | return TokensApi(self._token, self._host) 40 | 41 | @property 42 | def variables(self) -> VariablesApi: 43 | return VariablesApi(self._token, self._host) 44 | -------------------------------------------------------------------------------- /tests/integration/test_pipe.py: -------------------------------------------------------------------------------- 1 | class TestPipe: 2 | def test_pipe_query(self, client): 3 | ds = client.datasource("simple") 4 | ds.truncate() 5 | 6 | ds.append_ndjson( 7 | [ 8 | { 9 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 10 | "Timestamp": "2024-01-23T10:30:00.123456", 11 | "Key": "foo", 12 | "Value": "bar", 13 | }, 14 | { 15 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 16 | "Timestamp": "2024-02-23T11:45:00.234567", 17 | "Key": "baz", 18 | "Value": "ed", 19 | }, 20 | { 21 | "Id": "fc71d4d5-7e0c-492a-9e3f-8f1cde9bcfaf", 22 | "Timestamp": "2024-03-23T11:45:00.234567", 23 | "Key": "foo", 24 | "Value": "bar2", 25 | }, 26 | ] 27 | ) 28 | 29 | pipe = client.pipe("simple_kv") 30 | 31 | response = pipe.query() 32 | assert response.data == [ 33 | {"key": "baz", "value": "ed"}, 34 | {"key": "foo", "value": "bar2"}, 35 | ] 36 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - 'README.md' 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | test: 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | matrix: 18 | os: [ ubuntu-latest ] 19 | python-version: [ '3.10', '3.11', '3.12', '3.13' ] 20 | 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v2 24 | 25 | - name: Set up Python 26 | uses: actions/setup-python@v2 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | 30 | - name: Run linting 31 | run: | 32 | make lint 33 | 34 | - name: Run tests 35 | run: | 36 | make test-coverage 37 | 38 | - name: Coveralls Parallel 39 | uses: coverallsapp/github-action@master 40 | with: 41 | github-token: ${{ secrets.GITHUB_TOKEN }} 42 | flag-name: run-${{ matrix.os }}-${{ matrix.python_version }} 43 | path-to-lcov: ./.coverage.lcov 44 | parallel: true 45 | 46 | report: 47 | needs: test 48 | runs-on: ubuntu-latest 49 | steps: 50 | - name: Report coveralls 51 | uses: coverallsapp/github-action@master 52 | with: 53 | github-token: ${{ secrets.GITHUB_TOKEN }} 54 | parallel-finished: true 55 | -------------------------------------------------------------------------------- /verdin/client.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from . 
import config 4 | from .api.apis import Apis 5 | from .datasource import Datasource 6 | from .pipe import Pipe 7 | from .query import OutputFormat, SqlQuery 8 | 9 | 10 | class Client: 11 | """ 12 | Tinybird HTTP client that holds the access token and provides factory methods for resources. 13 | """ 14 | 15 | def __init__(self, token: str, api: str = None): 16 | self.host = (api or config.API_URL).lstrip("/") 17 | self.token = token 18 | self._api = Apis(self.token, self.host) 19 | 20 | @property 21 | def api(self) -> Apis: 22 | """ 23 | Returns an ``Apis`` object that gives you access to the tinybird API objects. 24 | :return: An ``Apis`` object 25 | """ 26 | return self._api 27 | 28 | def pipe(self, name: str, version: int = None) -> Pipe: 29 | """ 30 | Create an object representing a pipe with the given name, e.g., 31 | "localstack_dashboard_events.json" 32 | """ 33 | return Pipe(name, token=self.token, version=version, api=self.host) 34 | 35 | def datasource(self, name: str, version: int = None) -> Datasource: 36 | """ 37 | Create an object representing a datasource with a given name. 38 | """ 39 | return Datasource(name, token=self.token, version=version, api=self.host) 40 | 41 | def sql(self, sql: str, format: Optional[OutputFormat] = None) -> SqlQuery: 42 | return SqlQuery(sql, format=format, token=self.token, api=self.host) 43 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Verdin project configuration 2 | [build-system] 3 | requires = ['hatchling'] 4 | build-backend = "hatchling.build" 5 | 6 | [project] 7 | name = "verdin" 8 | authors = [ 9 | { name = "Thomas Rausch", email = "info@localstack.cloud" } 10 | ] 11 | description = "A Python SDK for Tinybird" 12 | readme = "README.md" 13 | license = "Apache-2.0" 14 | classifiers = [ 15 | "Development Status :: 4 - Beta", 16 | "Operating System :: OS Independent", 17 | "Programming Language :: Python :: 3", 18 | "Programming Language :: Python :: 3.10", 19 | "Programming Language :: Python :: 3.11", 20 | "Programming Language :: Python :: 3.12", 21 | "Programming Language :: Python :: 3.13", 22 | "Topic :: System :: Networking", 23 | "Topic :: Software Development :: Libraries", 24 | "Topic :: Utilities" 25 | ] 26 | requires-python = ">=3.10" 27 | dynamic = ["version"] 28 | 29 | [project.urls] 30 | Repository = "https://github.com/localstack/verdin" 31 | 32 | [project.optional-dependencies] 33 | dev = [ 34 | "pytest>=6.2.4", 35 | "ruff==0.9.1", 36 | "pytest_httpserver>=1.0.1", 37 | "coverage[toml]>=5.0", 38 | "pytest-cov>=2.7.1", 39 | "coveralls", 40 | "tinybird", 41 | ] 42 | 43 | [tool.hatch.version] 44 | path = "verdin/__init__.py" 45 | 46 | [tool.ruff] 47 | line-length = 100 48 | target-version = "py310" 49 | 50 | [tool.coverage.run] 51 | relative_files = true 52 | source = [ 53 | "verdin/" 54 | ] 55 | 56 | [tool.coverage.report] 57 | exclude_lines = [ 58 | "if __name__ == .__main__.:", 59 | "raise NotImplementedError", 60 | "return NotImplemented", 61 | "def __repr__", 62 | ] 63 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import uuid 3 | from typing import Callable 4 | 5 | 6 | def retry( 7 | fn: Callable, 8 | args: tuple = None, 9 | kwargs: dict = None, 10 | retries: int = 3, 11 | interval: float = 1, 12 | ): 13 | """ 14 | Retries the 
execution of a function ``fn`` for a specified number of attempts (``retries``) with a delay 15 | between attempts (``interval``). If all attempts fail, a ``TimeoutError`` is raised indicating the final 16 | error encountered. 17 | 18 | :param fn: The callable function to be executed. 19 | :param args: A tuple of positional arguments to pass to the ``fn``. Defaults to an empty tuple if not provided. 20 | :param kwargs: A dictionary of keyword arguments to pass to the ``fn``. Defaults to an empty dictionary if not 21 | provided. 22 | :param retries: The number of retry attempts before raising a ``TimeoutError``. Defaults to 3. 23 | :param interval: The time (in seconds) to wait between each retry attempt. Defaults to 1.0 seconds. 24 | :return: The result returned by successfully calling the ``fn`` with the specified ``args`` and ``kwargs``. 25 | Returns `None` only if no successful result is obtained after all retry attempts. 26 | """ 27 | args = args or () 28 | kwargs = kwargs or {} 29 | 30 | for i in range(retries): 31 | try: 32 | return fn(*args, **kwargs) 33 | except Exception as e: 34 | if i == retries - 1: 35 | raise TimeoutError(f"Gave up after {retries} retries, final error: {e}") from e 36 | else: 37 | time.sleep(interval) 38 | continue 39 | 40 | return None 41 | 42 | 43 | def short_id() -> str: 44 | return str(uuid.uuid4())[-8:] 45 | -------------------------------------------------------------------------------- /tests/integration/test_datasource.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | LOG = logging.getLogger(__name__) 5 | 6 | 7 | class TestDatasource: 8 | def test_append_ndjson_query_truncate(self, client): 9 | ds = client.datasource("simple") 10 | ds.truncate() 11 | 12 | ds.append_ndjson( 13 | [ 14 | { 15 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 16 | "Timestamp": "2024-01-23T10:30:00.123456", 17 | "Key": "foo", 18 | "Value": "bar", 19 | }, 20 | { 21 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 22 | "Timestamp": "2024-02-23T11:45:00.234567", 23 | "Key": "baz", 24 | "Value": "ed", 25 | }, 26 | ] 27 | ) 28 | 29 | query = client.sql("SELECT * FROM simple") 30 | response = query.json() 31 | assert response.data == [ 32 | { 33 | "id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 34 | "timestamp": "2024-01-23 10:30:00.123456", 35 | "key": "foo", 36 | "value": "bar", 37 | }, 38 | { 39 | "id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 40 | "timestamp": "2024-02-23 11:45:00.234567", 41 | "key": "baz", 42 | "value": "ed", 43 | }, 44 | ] 45 | 46 | query = client.sql("SELECT count(*) as cnt FROM simple") 47 | response = query.json() 48 | assert response.data == [{"cnt": 2}] 49 | 50 | # remove all records from the table 51 | ds.truncate() 52 | 53 | # check that the table is empty 54 | query = client.sql("SELECT count(*) as cnt FROM simple") 55 | response = query.json() 56 | assert response.data == [{"cnt": 0}] 57 | 58 | query = client.sql("SELECT * FROM simple") 59 | response = query.json() 60 | assert response.data == [] 61 | -------------------------------------------------------------------------------- /tests/test_query.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pytest_httpserver import HTTPServer 3 | from werkzeug import Response 4 | 5 | from verdin.query import QueryError, SqlQuery 6 | 7 | _mock_json_response = """{ 8 | "meta": [ 9 | { 10 | "name": "VendorID", 11 | "type": "Int32" 12 | }, 13 | { 14 | "name": "tpep_pickup_datetime", 15 | 
"type": "DateTime" 16 | }, 17 | { 18 | "name": "passenger_count", 19 | "type": "Int32" 20 | } 21 | ], 22 | "data": [ 23 | { 24 | "VendorID": 2, 25 | "tpep_pickup_datetime": "2001-01-05 11:45:23", 26 | "passenger_count": 5 27 | }, 28 | { 29 | "VendorID": 2, 30 | "tpep_pickup_datetime": "2002-12-31 23:01:55", 31 | "passenger_count": 3 32 | } 33 | ], 34 | "rows": 2, 35 | "rows_before_limit_at_least": 4, 36 | "statistics": 37 | { 38 | "elapsed": 0.00091042, 39 | "rows_read": 4, 40 | "bytes_read": 296 41 | } 42 | }""" 43 | 44 | 45 | def test_json(httpserver: HTTPServer): 46 | def handler(request): 47 | return Response(_mock_json_response, 200) 48 | 49 | httpserver.expect_request( 50 | "/v0/sql", query_string={"q": "select * from mytable FORMAT JSON"} 51 | ).respond_with_handler(handler) 52 | 53 | query = SqlQuery("select * from mytable", token="12345", api=httpserver.url_for("/")) 54 | 55 | response = query.json() 56 | 57 | assert response.meta[0] == {"name": "VendorID", "type": "Int32"} 58 | assert len(response.data) == 2 59 | 60 | 61 | def test_json_error(httpserver: HTTPServer): 62 | def handler(request): 63 | return Response('{"error": "invalid datasource"}', 403) 64 | 65 | httpserver.expect_request( 66 | "/v0/sql", query_string={"q": "select * from mytable FORMAT JSON"} 67 | ).respond_with_handler(handler) 68 | 69 | query = SqlQuery("select * from mytable", token="12345", api=httpserver.url_for("/")) 70 | 71 | with pytest.raises(QueryError) as e: 72 | query.json() 73 | e.match("403") 74 | e.match("invalid datasource") 75 | -------------------------------------------------------------------------------- /tests/integration/test_events.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tests.utils import retry 4 | 5 | 6 | class TestEventsApi: 7 | @pytest.mark.parametrize("compress", [True, False]) 8 | def test_events(self, client, compress): 9 | events = client.api.events 10 | 11 | records = [ 12 | { 13 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 14 | "Timestamp": "2024-01-23T10:30:00.123456", 15 | "Key": "foo", 16 | "Value": "bar", 17 | }, 18 | { 19 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 20 | "Timestamp": "2024-02-23T11:45:00.234567", 21 | "Key": "baz", 22 | "Value": "ed", 23 | }, 24 | ] 25 | 26 | response = events.send("simple", records, compress=compress) 27 | 28 | assert response.successful_rows == 2 29 | assert response.quarantined_rows == 0 30 | 31 | def _wait_for_count(cnt: int): 32 | query = client.sql("SELECT count(*) as cnt FROM simple") 33 | assert query.json().data == [{"cnt": cnt}] 34 | 35 | retry(_wait_for_count, args=(2,)) 36 | 37 | def test_events_wait(self, client): 38 | events = client.api.events 39 | 40 | records = [ 41 | { 42 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 43 | "Timestamp": "2024-01-23T10:30:00.123456", 44 | "Key": "foo", 45 | "Value": "bar", 46 | }, 47 | { 48 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 49 | "Timestamp": "2024-02-23T11:45:00.234567", 50 | "Key": "baz", 51 | "Value": "ed", 52 | }, 53 | ] 54 | 55 | response = events.send("simple", records, wait=True) 56 | 57 | assert response.successful_rows == 2 58 | assert response.quarantined_rows == 0 59 | 60 | query = client.sql("SELECT count(*) as cnt FROM simple") 61 | assert query.json().data == [{"cnt": 2}] 62 | -------------------------------------------------------------------------------- /tests/test_datasource.py: -------------------------------------------------------------------------------- 1 | from 
pytest_httpserver import HTTPServer 2 | from werkzeug import Response 3 | 4 | from verdin.datasource import Datasource, FileDatasource 5 | 6 | 7 | class TestDatasource: 8 | def test_to_csv(self): 9 | records = [["a", "1", "{}"], ["b", "2", '{"foo":"bar","baz":"ed"}']] 10 | 11 | csv = Datasource.to_csv(records) 12 | 13 | assert csv == """a,1,{}\nb,2,"{""foo"":""bar"",""baz"":""ed""}"\n""" 14 | 15 | def test_to_csv_with_delimiter(self): 16 | records = [["a", "1", "{}"], ["b", "2", '{"foo":"bar","baz":"ed"}']] 17 | 18 | csv = Datasource.to_csv(records, delimiter=";") 19 | 20 | assert csv == """a;1;{}\nb;2;"{""foo"":""bar"",""baz"":""ed""}"\n""" 21 | 22 | def test_append(self, httpserver: HTTPServer): 23 | ds = Datasource("mydatasource", "123456", api=httpserver.url_for("/")) 24 | 25 | expected_data = '''a,1,{}\nb,2,"{""foo"":""bar"",""baz"":""ed""}"''' 26 | 27 | def handler(request): 28 | actual_data = request.data.decode() 29 | assert expected_data in actual_data 30 | return Response("", 200) 31 | 32 | httpserver.expect_request( 33 | "/v0/datasources", 34 | query_string={ 35 | "name": "mydatasource", 36 | "mode": "append", 37 | "dialect_delimiter": ",", 38 | "format": "csv", 39 | }, 40 | ).respond_with_handler(handler) 41 | 42 | response = ds.append([["a", "1", "{}"], ["b", "2", '{"foo":"bar","baz":"ed"}']]) 43 | httpserver.check() 44 | assert response.ok 45 | 46 | 47 | class TestFileDatasource: 48 | def test_append(self, tmp_path): 49 | file_path = tmp_path / "myfile.csv" 50 | ds = FileDatasource(str(file_path)) 51 | 52 | records = [["a", "1", "{}"], ["b", "2", '{"foo":"bar","baz":"ed"}']] 53 | ds.append(records) 54 | 55 | records = [["c", "3", "{}"]] 56 | ds.append(records) 57 | 58 | expected = """a,1,{}\nb,2,"{""foo"":""bar"",""baz"":""ed""}"\nc,3,{}\n""" 59 | actual = file_path.read_text() 60 | 61 | assert actual == expected 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.iml 3 | *~ 4 | 5 | # General 6 | .DS_Store 7 | .AppleDouble 8 | .LSOverride 9 | 10 | # Icon must end with two \r 11 | Icon 12 | 13 | # Thumbnails 14 | ._* 15 | 16 | # Files that might appear in the root of a volume 17 | .DocumentRevisions-V100 18 | .fseventsd 19 | .Spotlight-V100 20 | .TemporaryItems 21 | .Trashes 22 | .VolumeIcon.icns 23 | .com.apple.timemachine.donotpresent 24 | 25 | # Directories potentially created on remote AFP share 26 | .AppleDB 27 | .AppleDesktop 28 | Network Trash Folder 29 | Temporary Items 30 | .apdisk 31 | 32 | # Byte-compiled / optimized / DLL files 33 | __pycache__/ 34 | *.py[cod] 35 | *$py.class 36 | 37 | # C extensions 38 | *.so 39 | 40 | # Distribution / packaging 41 | .Python 42 | build/ 43 | develop-eggs/ 44 | dist/ 45 | downloads/ 46 | eggs/ 47 | .eggs/ 48 | lib/ 49 | lib64/ 50 | parts/ 51 | sdist/ 52 | var/ 53 | wheels/ 54 | *.egg-info/ 55 | .installed.cfg 56 | *.egg 57 | MANIFEST 58 | 59 | # PyInstaller 60 | # Usually these files are written by a python script from a template 61 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
62 | *.manifest 63 | *.spec 64 | 65 | # Installer logs 66 | pip-log.txt 67 | pip-delete-this-directory.txt 68 | 69 | # Unit test / coverage reports 70 | htmlcov/ 71 | .tox/ 72 | .coverage 73 | .coverage.* 74 | .cache 75 | nosetests.xml 76 | coverage.xml 77 | *.cover 78 | .hypothesis/ 79 | 80 | # Translations 81 | *.mo 82 | *.pot 83 | 84 | # Django stuff: 85 | *.log 86 | .static_storage/ 87 | .media/ 88 | local_settings.py 89 | 90 | # Flask stuff: 91 | instance/ 92 | .webassets-cache 93 | 94 | # Scrapy stuff: 95 | .scrapy 96 | 97 | # Sphinx documentation 98 | docs/_build/ 99 | 100 | # PyBuilder 101 | target/ 102 | 103 | # Jupyter Notebook 104 | .ipynb_checkpoints 105 | 106 | # pyenv 107 | .python-version 108 | 109 | # celery beat schedule file 110 | celerybeat-schedule 111 | 112 | # SageMath parsed files 113 | *.sage.py 114 | 115 | # Environments 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | -------------------------------------------------------------------------------- /tests/integration/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import pytest 5 | 6 | from verdin.api import ApiError 7 | from verdin.client import Client 8 | from verdin.test.cli import TinybirdCli 9 | from verdin.test.container import TinybirdLocalContainer 10 | 11 | # os.environ["SKIP_TINYBIRD_LOCAL_START"] = "1" 12 | 13 | 14 | def _is_skip_tinybird_local_start() -> bool: 15 | """ 16 | Set SKIP_TINYBIRD_LOCAL_START=1 if you have a tb local container running already with the project deployed. This 17 | allows faster iterations. 18 | """ 19 | return os.environ.get("SKIP_TINYBIRD_LOCAL_START") in ["1", "true", "True", True] 20 | 21 | 22 | @pytest.fixture(scope="session") 23 | def client(tinybird_local_container) -> Client: 24 | return tinybird_local_container.client() 25 | 26 | 27 | @pytest.fixture(scope="session") 28 | def cli(tinybird_local_container) -> TinybirdCli: 29 | project_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "project")) 30 | 31 | return TinybirdCli( 32 | host=tinybird_local_container.url, 33 | local=True, 34 | cwd=project_dir, 35 | ) 36 | 37 | 38 | @pytest.fixture(scope="session", autouse=True) 39 | def tinybird_local_container(): 40 | """ 41 | Starts a tinybird local container in the background and waits until it becomes available. 
42 | """ 43 | project_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "project")) 44 | 45 | container = TinybirdLocalContainer(cwd=project_dir) 46 | 47 | if not _is_skip_tinybird_local_start(): 48 | container.start() 49 | 50 | container.wait_is_up() 51 | 52 | yield container 53 | 54 | # cleanup 55 | if not _is_skip_tinybird_local_start(): 56 | container.stop() 57 | 58 | 59 | @pytest.fixture(scope="session", autouse=True) 60 | def deployed_project(cli): 61 | if _is_skip_tinybird_local_start(): 62 | yield 63 | return 64 | 65 | time.sleep(5) 66 | cli.deploy(wait=True, auto=True) 67 | yield 68 | 69 | 70 | @pytest.fixture(autouse=True) 71 | def _truncate_datasource(client): 72 | # make sure to truncate "simple" datasource and its quarantine table before and after each test 73 | 74 | client.api.datasources.truncate("simple") 75 | try: 76 | # also truncate the quarantine table if it exists 77 | client.api.datasources.truncate("simple_quarantine") 78 | except ApiError: 79 | pass 80 | 81 | yield 82 | client.api.datasources.truncate("simple") 83 | 84 | try: 85 | client.api.datasources.truncate("simple_quarantine") 86 | except ApiError: 87 | pass 88 | -------------------------------------------------------------------------------- /verdin/api/base.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | class Api: 5 | DEFAULT_HOST = "https://api.tinybird.co" 6 | 7 | host: str 8 | token: str 9 | 10 | def __init__(self, token: str, host: str = None): 11 | self.token = token 12 | self.host = host or Api.DEFAULT_HOST 13 | 14 | 15 | class ApiResponse: 16 | _response: requests.Response 17 | 18 | def __init__(self, response: requests.Response): 19 | self._response = response 20 | self._json: dict | None = None # cache for json response 21 | 22 | @property 23 | def text(self) -> str: 24 | """ 25 | Returns the body of the HTTP response as a string. 26 | 27 | :return: The response body as a string. 28 | """ 29 | return self._response.text 30 | 31 | @property 32 | def content(self) -> bytes: 33 | """ 34 | Returns the body of the HTTP response as bytes. 35 | 36 | :return: The response body as a bytes. 37 | """ 38 | return self._response.content 39 | 40 | @property 41 | def json(self) -> dict: 42 | """ 43 | Parses the JSON response and returns a dictionary. It caches the result so that later calls to this method 44 | do not parse the response every time. 45 | 46 | :return: The parsed JSON response. 47 | """ 48 | if self._json: 49 | return self._json 50 | 51 | self._json = self._response.json() 52 | return self._json 53 | 54 | 55 | class ApiError(Exception): 56 | """ 57 | Exception that represents a non-200 HTTP response from the API. 58 | """ 59 | 60 | _response: requests.Response 61 | 62 | def __init__(self, response: requests.Response): 63 | self._response = response 64 | super().__init__(self._render_message(response)) 65 | 66 | def _render_message(self, response: requests.Response) -> str: 67 | error = None 68 | documentation = None 69 | 70 | if response.headers.get("Content-Type").startswith("application/json"): 71 | doc = response.json() 72 | error = doc.get("error") 73 | documentation = doc.get("documentation") 74 | 75 | if not error: 76 | error = response.text 77 | 78 | error.rstrip(".") 79 | 80 | message = f"API Error ({response.status_code}): {error}." 
81 | 82 | if documentation: 83 | message += f" Documentation: {documentation}" 84 | 85 | return message 86 | 87 | @property 88 | def status_code(self) -> int: 89 | return self._response.status_code 90 | 91 | @property 92 | def text(self) -> str: 93 | return self._response.text 94 | 95 | @property 96 | def json(self) -> dict: 97 | return self._response.json() 98 | -------------------------------------------------------------------------------- /verdin/test/container.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | 4 | import requests 5 | 6 | from verdin.test.cli import TinybirdCli 7 | from verdin.client import Client 8 | 9 | 10 | class TinybirdLocalContainer: 11 | def __init__(self, cwd: str = None): 12 | """ 13 | Creates a new TinybirdLocalContainer instance. 14 | 15 | :param cwd: The current working directory to use for the tinybird local container. 16 | """ 17 | self.cwd = cwd 18 | self.url = "http://localhost:7181" 19 | self.proc: None | subprocess.Popen = None 20 | 21 | def start(self): 22 | """ 23 | Start the tinybird local container in a background process. 24 | """ 25 | cli = TinybirdCli(cwd=self.cwd, local=True) 26 | self.proc = cli.local_start(daemon=True, skip_new_version=True) 27 | 28 | def client(self) -> Client: 29 | """ 30 | Returns a tinybird Client that connects to this container with admin privileges. 31 | 32 | :return: Tinybird Client 33 | """ 34 | cli = TinybirdCli(host=self.url, cwd=self.cwd, local=True) 35 | 36 | cli_tokens = cli.token_ls() 37 | 38 | # I'm not really sure why this is needed, but when we use a token returned by the /tokens api, the 39 | # client cannot find datasources created through ``tb deploy``. 40 | token_to_use = None 41 | for token in cli_tokens: 42 | if token.name == "admin local_testing@tinybird.co": 43 | token_to_use = token.token 44 | break 45 | 46 | return Client( 47 | token=token_to_use, 48 | api=self.url, 49 | ) 50 | 51 | def wait_is_up(self, timeout: int = 120): 52 | """ 53 | Wait for the container to become available by querying the tokens endpoint. 54 | 55 | :param timeout: Timeout in seconds 56 | :raises TimeoutError: If the container does not appear within the timeout 57 | """ 58 | # Wait for the service to become available 59 | start_time = time.time() 60 | while time.time() - start_time < timeout: 61 | try: 62 | response = requests.get(f"{self.url}/tokens") 63 | if response.status_code == 200: 64 | break 65 | except requests.RequestException: 66 | pass 67 | time.sleep(1) 68 | else: 69 | raise TimeoutError("Tinybird container failed to start within timeout") 70 | 71 | def stop(self): 72 | """ 73 | Stops and removes the tinybird local container. 
74 | """ 75 | cli = TinybirdCli(cwd=self.cwd, local=True) 76 | cli.local_stop() 77 | 78 | if self.proc: 79 | self.proc.kill() 80 | self.proc = None 81 | 82 | cli.local_remove() 83 | -------------------------------------------------------------------------------- /verdin/api/events.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import logging 4 | 5 | import requests 6 | 7 | from .base import Api, ApiError, ApiResponse 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | 12 | class EventsResponse(ApiResponse): 13 | @property 14 | def successful_rows(self) -> int: 15 | return self.json.get("successful_rows") 16 | 17 | @property 18 | def quarantined_rows(self) -> int: 19 | return self.json.get("quarantined_rows") 20 | 21 | 22 | class EventsApi(Api): 23 | endpoint: str = "/v0/events" 24 | 25 | session: requests.Session 26 | 27 | def __init__(self, token: str, host: str = None): 28 | super().__init__(token, host) 29 | 30 | self.session = requests.Session() 31 | if self.token: 32 | self.session.headers.update({"Authorization": f"Bearer {self.token}"}) 33 | 34 | def send( 35 | self, 36 | name: str, 37 | records: list[dict], 38 | wait: bool = False, 39 | json_encoder: type = None, 40 | compress: bool = False, 41 | ) -> EventsResponse: 42 | """ 43 | Makes a POST request to ``/v0/events?name=`` with NDJSON encoded data. 44 | 45 | :param name: Name or ID of the target Data Source to append data to it 46 | :param records: List of JSON records to append. Records will be converted to NDJSON using ``json.dumps`` 47 | :param wait: 'false' by default. Set to 'true' to wait until the write is acknowledged by the database. 48 | Enabling this flag makes it possible to retry on database errors, but it introduces additional latency. 49 | It's recommended to enable it in use cases in which data loss avoidance is critical. Disable it otherwise. 50 | :param json_encoder: The JSON Encoder class passed to ``json.dumps``. Defaults to ``json.JSONEncoder``. 51 | :param compress: Whether to compress the data using gzip. Defaults to False. 
52 | 53 | :return: The EventsResponse 54 | :raises ApiError: If the request failed 55 | """ 56 | url = f"{self.host}{self.endpoint}" 57 | 58 | docs = [json.dumps(doc, cls=json_encoder) for doc in records] 59 | data = "\n".join(docs) 60 | 61 | params = {"name": name} 62 | if wait: 63 | params["wait"] = "true" 64 | 65 | LOG.debug("sending %d ndjson records to %s via %s", len(records), name, url) 66 | 67 | headers = {"Content-Type": "application/x-ndjson"} 68 | 69 | if compress: 70 | headers["Content-Encoding"] = "gzip" 71 | data = gzip.compress(data.encode("utf-8")) 72 | 73 | response = self.session.post( 74 | url=url, 75 | params=params, 76 | headers=headers, 77 | data=data, 78 | ) 79 | 80 | if not response.ok: 81 | raise ApiError(response) 82 | 83 | return EventsResponse(response) 84 | -------------------------------------------------------------------------------- /verdin/api/tokens.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict 2 | 3 | import requests 4 | 5 | from verdin.api import ApiResponse 6 | from verdin.api.base import Api, ApiError 7 | 8 | 9 | class TokenNotFoundError(ApiError): 10 | """Specific ApiError representing a 404 Not Found when token names are given.""" 11 | 12 | 13 | class Scope(TypedDict): 14 | type: str 15 | resource: str | None 16 | filter: str | None 17 | 18 | 19 | class TokenInfo(TypedDict): 20 | id: str 21 | token: str 22 | scopes: list[Scope] 23 | name: str 24 | description: str | None 25 | origin: dict | None 26 | host: str 27 | is_internal: bool 28 | 29 | 30 | class ListTokensResponse(ApiResponse): 31 | @property 32 | def tokens(self) -> list[TokenInfo]: 33 | return self.json.get("tokens", []) 34 | 35 | 36 | class GetTokenInfoResponse(ApiResponse): 37 | @property 38 | def info(self) -> TokenInfo: 39 | return self.json 40 | 41 | 42 | class TokensApi(Api): 43 | """ 44 | Tokens API client. 45 | 46 | TODO: The following APIs are not yet implemented (note that some workspaces only allow resource modification 47 | through deployments anyway) 48 | - Create a new Token: Static or JWT (POST /v0/tokens) 49 | - Refresh a static token (POST /v0/tokens/:name/refresh) 50 | - Delete a Token (DELETE /v0/tokens/:name) 51 | - Modify a Token (PUT /v0/tokens/:name) 52 | """ 53 | 54 | endpoint: str = "/v0/tokens" 55 | 56 | session: requests.Session 57 | 58 | def __init__(self, token: str, host: str = None): 59 | super().__init__(token, host) 60 | 61 | self.session = requests.Session() 62 | if self.token: 63 | self.session.headers.update({"Authorization": f"Bearer {self.token}"}) 64 | 65 | def get_information(self, token: str): 66 | """ 67 | Fetches information about a particular Static Token. Makes a GET request to ``/v0/tokens/:name``. If the token 68 | doesn't exist, a 403 may be returned ("Not enough permissions to get information about this token"). 69 | 70 | :param token: The token identifier. 71 | :return: A ``GetTokenInfoResponse`` object. 72 | """ 73 | response = self.session.request( 74 | method="GET", 75 | url=f"{self.host}{self.endpoint}/{token}", 76 | ) 77 | 78 | if not response.ok: 79 | raise ApiError(response) 80 | 81 | return GetTokenInfoResponse(response) 82 | 83 | def list(self) -> ListTokensResponse: 84 | """ 85 | Retrieves all workspace Static Tokens. Makes a GET request to ``/v0/tokens``. 86 | 87 | :return: A ``ListTokensResponse`` object. 
88 | """ 89 | response = self.session.request( 90 | method="GET", 91 | url=f"{self.host}{self.endpoint}", 92 | ) 93 | 94 | if not response.ok: 95 | raise ApiError(response) 96 | 97 | return ListTokensResponse(response) 98 | -------------------------------------------------------------------------------- /tests/integration/test_variables.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import time 3 | 4 | import pytest 5 | 6 | from tests.utils import short_id 7 | from verdin.api.variables import VariableNotFoundError 8 | 9 | 10 | class TestVariables: 11 | def test_integration(self, client): 12 | # E2E test for variables API 13 | variable_name = f"test_variable_{short_id()}" 14 | variable_value = "test_value" 15 | 16 | # List variables and make sure the variable_name is not in the list 17 | response = client.api.variables.list() 18 | variable_names = [var["name"] for var in response.variables] 19 | assert variable_name not in variable_names 20 | 21 | # Make sure an API Error with 404 is raised when getting a non-existent variable 22 | with pytest.raises(VariableNotFoundError) as e: 23 | client.api.variables.get(variable_name) 24 | assert e.value.status_code == 404 25 | 26 | # Create the variable 27 | create_response = client.api.variables.create(name=variable_name, value=variable_value) 28 | assert create_response.variable["name"] == variable_name 29 | assert create_response.variable["type"] == "secret" 30 | 31 | # List again and check that it's there 32 | list_response = client.api.variables.list() 33 | variable_names = [var["name"] for var in list_response.variables] 34 | assert variable_name in variable_names 35 | 36 | # Get the variable and assert the response 37 | get_response = client.api.variables.get(variable_name) 38 | assert get_response.variable["name"] == variable_name 39 | assert get_response.variable["type"] == "secret" 40 | 41 | # delete the variable and check again 42 | response = client.api.variables.delete(variable_name) 43 | assert response.ok 44 | 45 | with pytest.raises(VariableNotFoundError) as e: 46 | client.api.variables.get(variable_name) 47 | 48 | def test_get_non_existing_variable(self, client): 49 | with pytest.raises(VariableNotFoundError) as e: 50 | client.api.variables.get("non_existing_variable") 51 | 52 | assert e.match("Not found") 53 | assert e.value.status_code == 404 54 | 55 | def test_delete_non_existing_variable(self, client): 56 | with pytest.raises(VariableNotFoundError) as e: 57 | client.api.variables.delete("non_existing_variable") 58 | 59 | assert e.match("Variable not found") 60 | assert e.value.status_code == 404 61 | 62 | def test_update_non_existing_variable(self, client): 63 | with pytest.raises(VariableNotFoundError) as e: 64 | client.api.variables.update("non_existing_variable", value="foo") 65 | 66 | assert e.match("Variable not found") 67 | assert e.value.status_code == 404 68 | 69 | def test_update_variable(self, client): 70 | variable_name = f"test_variable_{short_id()}" 71 | variable_value = "test_value" 72 | 73 | response = client.api.variables.create(name=variable_name, value=variable_value) 74 | 75 | assert datetime.fromisoformat(response.variable["created_at"]).replace( 76 | microsecond=0 77 | ) == datetime.fromisoformat(response.variable["updated_at"]).replace(microsecond=0) 78 | 79 | time.sleep(1) 80 | 81 | response = client.api.variables.update(name=variable_name, value=variable_value + "1") 82 | assert response.variable["created_at"] != 
response.variable["updated_at"] 83 | 84 | client.api.variables.delete(variable_name) 85 | -------------------------------------------------------------------------------- /tests/test_worker.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import time 3 | from queue import Queue 4 | 5 | import requests 6 | 7 | from verdin.datasource import Datasource 8 | from verdin.worker import QueuingDatasourceAppender 9 | 10 | 11 | class QueueingDatasource(Datasource): 12 | def __init__(self, name, queue=None): 13 | super().__init__(name, None) 14 | self.queue = queue or Queue() 15 | 16 | def append(self, records) -> requests.Response: 17 | if records: 18 | self.queue.put(records) 19 | 20 | response = requests.Response() 21 | response.status_code = 200 22 | return response 23 | 24 | 25 | class TestQueuingDatasourceAppender: 26 | def test_batching(self): 27 | source = Queue() 28 | destination = QueueingDatasource("datasource") 29 | 30 | appender = QueuingDatasourceAppender(source, destination) 31 | appender.min_interval = 0 32 | 33 | source.put(("a", 1)) 34 | source.put(("b", 2)) 35 | source.put(("c", 3)) 36 | 37 | thread = threading.Thread(target=appender.run) 38 | thread.start() 39 | 40 | batch = destination.queue.get(timeout=1) 41 | assert len(batch) == 3 42 | assert batch[0] == ("a", 1) 43 | assert batch[1] == ("b", 2) 44 | assert batch[2] == ("c", 3) 45 | 46 | source.put(("d", 4)) 47 | 48 | batch = destination.queue.get(timeout=1) 49 | assert len(batch) == 1 50 | assert batch[0] == ("d", 4) 51 | 52 | appender.close() 53 | thread.join(timeout=2) 54 | assert appender.stopped.is_set() 55 | 56 | def test_stop_while_running(self): 57 | # instrument the queue 58 | source = Queue() 59 | destination = QueueingDatasource("datasource") 60 | appender = QueuingDatasourceAppender(source, destination) 61 | appender.min_interval = 0 62 | 63 | thread = threading.Thread(target=appender.run) 64 | thread.start() 65 | time.sleep(0.2) 66 | 67 | appender.close() 68 | thread.join(timeout=2) 69 | assert appender.stopped.is_set() 70 | 71 | def test_retry(self): 72 | class MockQueueingDatasource(QueueingDatasource): 73 | first_call = True 74 | 75 | def append(self, records) -> requests.Response: 76 | if self.first_call: 77 | self.first_call = False 78 | 79 | response = requests.Response() 80 | response.status_code = 429 81 | response.headers["Retry-After"] = "1" 82 | return response 83 | 84 | return super().append(records) 85 | 86 | source = Queue() 87 | destination = MockQueueingDatasource("datasource") 88 | appender = QueuingDatasourceAppender(source, destination) 89 | appender.min_interval = 0 90 | appender.wait_after_rate_limit = 0.5 91 | 92 | source.put(("a", 1)) 93 | source.put(("b", 2)) 94 | 95 | thread = threading.Thread(target=appender.run) 96 | thread.start() 97 | time.sleep(0.5) 98 | 99 | # should not be batched because we're still retrying with the previous batch 100 | source.put(("c", 3)) 101 | 102 | batch = destination.queue.get(timeout=5) 103 | assert len(batch) == 2 104 | 105 | batch = destination.queue.get(timeout=5) 106 | assert len(batch) == 1 107 | assert batch[0] == ("c", 3) 108 | 109 | appender.close() 110 | thread.join(timeout=5) 111 | assert appender.stopped.is_set() 112 | -------------------------------------------------------------------------------- /verdin/query.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import logging 3 | from typing import Any, Optional, TypedDict 4 | 5 | 
import requests 6 | 7 | from . import config 8 | from .api import ApiError 9 | from .api.query import QueryApi 10 | 11 | LOG = logging.getLogger(__name__) 12 | 13 | 14 | class OutputFormat(enum.Enum): 15 | # https://docs.tinybird.co/api-reference/query-api.html#id6 16 | CSV = "CSV" 17 | CSVWithNames = "CSVWithNames" 18 | JSON = "JSON" 19 | TSV = "TSV" 20 | TSVWithNames = "TSVWithNames" 21 | PrettyCompact = "PrettyCompact" 22 | JSONEachRow = "JSONEachRow" 23 | 24 | 25 | class QueryMetadata(TypedDict): 26 | name: str 27 | type: str 28 | 29 | 30 | class Statistics(TypedDict): 31 | elapsed: float 32 | rows_read: int 33 | bytes_read: int 34 | 35 | 36 | JsonData = dict[str, Any] 37 | QueryJsonData = list[dict[str, Any]] 38 | 39 | 40 | class JsonResult(TypedDict): 41 | meta: list[QueryMetadata] 42 | data: QueryJsonData 43 | rows: int 44 | statistics: Statistics 45 | 46 | 47 | class QueryJsonResult: 48 | response: requests.Response 49 | result: JsonResult 50 | 51 | def __init__(self, response: requests.Response): 52 | self.response = response 53 | self.result = response.json() 54 | 55 | @property 56 | def empty(self): 57 | """ 58 | A property to check if the data in the result is empty. 59 | 60 | This property evaluates whether the "data" field within the "result" 61 | attribute is empty. 62 | 63 | :return: Returns True if the "data" field in "result" is missing or empty, 64 | otherwise returns False. 65 | """ 66 | return not self.result.get("data") 67 | 68 | @property 69 | def meta(self) -> list[QueryMetadata]: 70 | """ 71 | Returns the QueryMetadata from the query, which includes attributes and their types. 72 | 73 | :return: The QueryMetadata 74 | """ 75 | return self.result.get("meta") 76 | 77 | @property 78 | def data(self) -> QueryJsonData: 79 | """ 80 | Returns the data from the query, which is a list of dictionaries representing the rows of the query result. 81 | 82 | :return: The QueryJsonData 83 | """ 84 | return self.result.get("data") 85 | 86 | 87 | class QueryError(Exception): 88 | def __init__(self, response: requests.Response) -> None: 89 | self.response = response 90 | msg = response.text 91 | try: 92 | doc = response.json() 93 | if doc["error"]: 94 | msg = doc["error"] 95 | except Exception: 96 | pass 97 | super().__init__(f"{response.status_code}: {msg}") 98 | 99 | 100 | class SqlQuery: 101 | """ 102 | Tinybird SQL Query. https://docs.tinybird.co/api-reference/query-api.html#get--v0-sql 103 | """ 104 | 105 | endpoint: str = "/v0/sql" 106 | 107 | sql: str 108 | format: Optional[OutputFormat] 109 | 110 | def __init__( 111 | self, sql: str, token, format: Optional[OutputFormat] = None, api: str = None 112 | ) -> None: 113 | self.sql = sql 114 | self.format = format or OutputFormat.JSON 115 | self.token = token 116 | host = (api or config.API_URL).rstrip("/") 117 | self.api = host + self.endpoint 118 | self._query_api = QueryApi(token=token, host=host) 119 | 120 | def get(self, format: Optional[OutputFormat] = None) -> requests.Response: 121 | """ 122 | Runs the query and returns the response. 123 | 124 | TODO: replicate tinybird API concepts instead of returning Response 125 | 126 | :param format: Overwrite the default output format set in the constructor. 
127 | :return: the HTTP response 128 | """ 129 | 130 | LOG.debug( 131 | "querying %s with query: %s", 132 | self.api, 133 | self.sql, 134 | ) 135 | 136 | try: 137 | response = self._query_api.query( 138 | self.sql, 139 | format=(format or self.format).value, 140 | ) 141 | return response._response 142 | except ApiError as e: 143 | raise QueryError(response=e._response) from e 144 | 145 | def json(self) -> QueryJsonResult: 146 | """ 147 | Runs the query and returns the result in JSON output format. 148 | 149 | :return: A QueryJsonResult containing the result of the query. 150 | """ 151 | response = self.get(OutputFormat.JSON) 152 | 153 | return QueryJsonResult(response) 154 | -------------------------------------------------------------------------------- /tests/integration/test_query.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | class TestQueryApi: 5 | @pytest.fixture(autouse=True) 6 | def _put_records(self, client): 7 | client.api.events.send( 8 | "simple", 9 | wait=True, 10 | records=[ 11 | { 12 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 13 | "Timestamp": "2024-01-23T10:30:00.123456", 14 | "Key": "foo", 15 | "Value": "bar", 16 | }, 17 | { 18 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 19 | "Timestamp": "2024-02-23T11:45:00.234567", 20 | "Key": "baz", 21 | "Value": "ed", 22 | }, 23 | ], 24 | ) 25 | 26 | def test_query_datasource_json(self, client): 27 | response = client.api.query.query("SELECT key, value FROM simple ORDER BY `key` ASC") 28 | 29 | assert response.data == [{"key": "baz", "value": "ed"}, {"key": "foo", "value": "bar"}] 30 | assert response.meta == [ 31 | {"name": "key", "type": "String"}, 32 | {"name": "value", "type": "String"}, 33 | ] 34 | assert response.rows == 2 35 | assert response.statistics["rows_read"] == 2 36 | 37 | def test_query_pipe(self, client): 38 | response = client.api.query.query("SELECT * FROM simple_kv ORDER BY `key` ASC") 39 | 40 | assert response.data == [{"key": "baz", "value": "ed"}, {"key": "foo", "value": "bar"}] 41 | assert response.meta == [ 42 | {"name": "key", "type": "String"}, 43 | {"name": "value", "type": "String"}, 44 | ] 45 | assert response.rows == 2 46 | assert response.statistics["rows_read"] == 2 47 | 48 | def test_query_pipe_parameters(self, client): 49 | response = client.api.query.query( 50 | "SELECT key, value FROM simple_pipe", parameters={"key": "foo"} 51 | ) 52 | 53 | assert response.data == [{"key": "foo", "value": "bar"}] 54 | assert response.meta == [ 55 | {"name": "key", "type": "String"}, 56 | {"name": "value", "type": "String"}, 57 | ] 58 | assert response.rows == 1 59 | assert response.statistics["rows_read"] == 2 60 | 61 | def test_query_pipeline_json(self, client): 62 | response = client.api.query.query( 63 | "SELECT * FROM _ ORDER BY `key` ASC", pipeline="simple_kv" 64 | ) 65 | 66 | assert response.data == [{"key": "baz", "value": "ed"}, {"key": "foo", "value": "bar"}] 67 | assert response.meta == [ 68 | {"name": "key", "type": "String"}, 69 | {"name": "value", "type": "String"}, 70 | ] 71 | assert response.rows == 2 72 | assert response.statistics["rows_read"] == 2 73 | 74 | def test_query_csv(self, client): 75 | response = client.api.query.query( 76 | "SELECT key, value FROM simple ORDER BY `key` ASC", format="CSV" 77 | ) 78 | 79 | assert response.text == '"baz","ed"\n"foo","bar"\n' 80 | 81 | def test_query_csv_with_names(self, client): 82 | response = client.api.query.query( 83 | "SELECT key, value FROM simple ORDER BY `key` ASC", 
format="CSVWithNames" 84 | ) 85 | 86 | assert ( 87 | response.text 88 | == '"key","value"\n"baz","ed"\n"foo","bar"\n' 89 | != '"baz","ed"\n"foo","bar"\n' 90 | ) 91 | # CSV with names can be parsed as data! 92 | assert response.data == [{"key": "baz", "value": "ed"}, {"key": "foo", "value": "bar"}] 93 | 94 | def test_query_tsv(self, client): 95 | response = client.api.query.query( 96 | "SELECT key, value FROM simple ORDER BY `key` ASC", format="TSV" 97 | ) 98 | 99 | assert response.text == "baz\ted\nfoo\tbar\n" 100 | 101 | def test_query_tsv_with_names(self, client): 102 | response = client.api.query.query( 103 | "SELECT key, value FROM simple ORDER BY `key` ASC", format="TSVWithNames" 104 | ) 105 | 106 | assert response.text == "key\tvalue\nbaz\ted\nfoo\tbar\n" 107 | assert response.data == [{"key": "baz", "value": "ed"}, {"key": "foo", "value": "bar"}] 108 | 109 | def test_query_ndjson(self, client): 110 | response = client.api.query.query( 111 | "SELECT key, value FROM simple ORDER BY `key` ASC", format="JSONEachRow" 112 | ) 113 | 114 | assert ( 115 | response.text 116 | == '{"key":"baz","value":"ed"}\n{"key":"foo","value":"bar"}\n' 117 | != '"key","value"\n"baz","ed"\n"foo","bar"\n' 118 | ) 119 | # CSV with names can be parsed as data! 120 | assert response.data == [{"key": "baz", "value": "ed"}, {"key": "foo", "value": "bar"}] 121 | -------------------------------------------------------------------------------- /verdin/test/cli.py: -------------------------------------------------------------------------------- 1 | """Wrapper around the Tinybird CLI to make available the main commands programmatically.""" 2 | 3 | import dataclasses 4 | import logging 5 | import os 6 | import re 7 | import subprocess 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | 12 | @dataclasses.dataclass 13 | class Token: 14 | id: str 15 | name: str 16 | token: str 17 | 18 | 19 | class CliError(Exception): 20 | def __init__(self, output: str, orig: subprocess.SubprocessError) -> None: 21 | super().__init__(output) 22 | self.orig = orig 23 | 24 | 25 | class TinybirdCli: 26 | """Interface around the Tinybird CLI""" 27 | 28 | def __init__(self, host: str = None, token: str = None, cwd: str = None, local: bool = False): 29 | self.host = host 30 | self.token = token 31 | self.cwd = cwd 32 | self.local = local 33 | 34 | def _env(self) -> dict: 35 | """ 36 | Returns a dictionary of environment variables to be used when calling tb CLI commands. 37 | """ 38 | _env = dict(os.environ) 39 | 40 | if self.host: 41 | _env["TB_HOST"] = self.host 42 | if self.token: 43 | _env["TB_TOKEN"] = self.token 44 | 45 | return _env 46 | 47 | def _get_base_args(self) -> list[str]: 48 | args = ["tb"] 49 | if self.local: 50 | args.append("--local") 51 | return args 52 | 53 | def token_ls(self) -> list[Token]: 54 | """ 55 | List all tokens. 
56 | 57 | :return: List of Token instances 58 | """ 59 | args = [*self._get_base_args(), "token", "ls"] 60 | 61 | output = subprocess.check_output( 62 | args, 63 | encoding="utf-8", 64 | cwd=self.cwd, 65 | env=self._env(), 66 | ) 67 | """ 68 | output looks like this (unfortunately --output=json doesn't work) 69 | 70 | ** Tokens: 71 | -------------------------------------------------------------------------------- 72 | id: 63678691-7e28-4f2d-8ef7-243ab19ad7cb 73 | name: workspace admin token 74 | token: p.eyJ1IjogIjU2ZThhYmMzLWRjNmYtNDcyYi05Yzg1LTdkZjFiZmUyNjU5YyIsICJpZCI6ICI2MzY3ODY5MS03ZTI4LTRmMmQtOGVmNy0yNDNhYjE5YWQ3Y2IiLCAiaG9zdCI6ICJsb2NhbCJ9.4gzsbiG1cnrIDUfHTxfQd0ZN57YkiOKEIyvuTlnLiaM 75 | -------------------------------------------------------------------------------- 76 | id: 489c8ca1-195b-4383-a388-d84068ff1b2c 77 | name: admin local_testing@tinybird.co 78 | token: p.eyJ1IjogIjU2ZThhYmMzLWRjNmYtNDcyYi05Yzg1LTdkZjFiZmUyNjU5YyIsICJpZCI6ICI0ODljOGNhMS0xOTViLTQzODMtYTM4OC1kODQwNjhmZjFiMmMiLCAiaG9zdCI6ICJsb2NhbCJ9.MmcBjRTCg6dX53sWsZAv6QzHRHKxwu-pEWkqx8opLHA 79 | -------------------------------------------------------------------------------- 80 | """ 81 | tokens = [] 82 | current_token = {} 83 | 84 | for line in output.splitlines(): 85 | # remove color codes 86 | line = re.sub(r"\x1b\[[0-9;]*m", "", line) 87 | line = line.strip() 88 | if line.startswith("id: "): 89 | current_token = {} 90 | current_token["id"] = line[4:] 91 | elif line.startswith("name: "): 92 | current_token["name"] = line[6:] 93 | elif line.startswith("token: "): 94 | current_token["token"] = line[7:] 95 | tokens.append(Token(**current_token)) 96 | 97 | return tokens 98 | 99 | def local_start( 100 | self, daemon: bool = False, skip_new_version: bool = False, volumes_path: str = None 101 | ) -> subprocess.Popen: 102 | """ 103 | Run ``tb local start`` and return the subprocess. 104 | """ 105 | args = ["tb", "local", "start"] 106 | if daemon: 107 | args.append("-d") 108 | if skip_new_version: 109 | args.append("--skip-new-version") 110 | if volumes_path: 111 | args.append("--volumes-path") 112 | args.append(volumes_path) 113 | 114 | return subprocess.Popen(args, cwd=self.cwd, env=self._env()) 115 | 116 | def local_stop(self): 117 | """ 118 | Run ``tb local stop``. 119 | """ 120 | subprocess.check_output(["tb", "local", "stop"]) 121 | 122 | def local_remove(self): 123 | """ 124 | Run ``tb local remove``. 125 | """ 126 | subprocess.check_output( 127 | ["tb", "local", "remove"], 128 | input=b"y\n", 129 | ) 130 | 131 | def deploy( 132 | self, wait: bool = False, auto: bool = False, allow_destructive_operations: bool = False 133 | ): 134 | args = [*self._get_base_args(), "deploy"] 135 | 136 | if wait: 137 | args.append("--wait") 138 | if auto: 139 | args.append("--auto") 140 | if allow_destructive_operations: 141 | args.append("--allow-destructive-operations") 142 | 143 | try: 144 | output = subprocess.check_output( 145 | args, 146 | encoding="utf-8", 147 | cwd=self.cwd, 148 | env=self._env(), 149 | ) 150 | except subprocess.CalledProcessError as e: 151 | raise CliError(f"Failed to deploy project:\n{e.output}", e) from e 152 | 153 | return output 154 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Verdin 2 | ====== 3 | 4 |

5 | PyPI Version 6 | CI Status 7 | Coverage Status 8 | PyPI License 9 | Code style: black 10 |

11 | 12 | Verdin is a [tiny bird](https://en.wikipedia.org/wiki/Verdin), and also a [Tinybird](https://tinybird.co) SDK for Python. 13 | 14 | Install 15 | ------- 16 | 17 | pip install verdin 18 | 19 | Requirements 20 | ------------ 21 | 22 | Python 3.10+ 23 | 24 | Usage 25 | ----- 26 | 27 | ### Run an SQL Query 28 | 29 | ```python 30 | # the tinybird module exposes all important tinybird concepts 31 | from verdin import tinybird 32 | 33 | client = tinybird.Client("p.mytoken") 34 | query = client.sql("select * from my_datasource__v0") 35 | 36 | # run the query with `FORMAT JSON` and receive a QueryJsonResult 37 | response: tinybird.QueryJsonResult = query.json() 38 | 39 | # print records returned from the pipe 40 | print(response.data) 41 | ``` 42 | 43 | You can also run, e.g., `query.get(format=OutputFormat.CSV)` to get the raw response with CSV data. 44 | 45 | ### Query a Pipe 46 | 47 | ```python 48 | from verdin import tinybird 49 | 50 | client = tinybird.Client("p.mytoken") 51 | pipe = client.pipe("my_pipe") 52 | 53 | # query the pipe using dynamic parameters 54 | response: tinybird.PipeJsonResponse = pipe.query({"key": "val"}) 55 | 56 | # print records returned from the pipe 57 | print(response.data) 58 | ``` 59 | 60 | ### Append to a data source 61 | 62 | ```python 63 | from verdin import tinybird 64 | 65 | client = tinybird.Client("p.mytoken") 66 | 67 | # will access my_datasource__v0 68 | datasource = client.datasource("my_datasource", version=0) 69 | 70 | # query the pipe using dynamic parameters 71 | datasource.append([ 72 | ("col1-row1", "col2-row1"), 73 | ("col1-row2", "col2-row2"), 74 | ]) 75 | ``` 76 | 77 | ### Append to a data source using high-frequency ingest 78 | 79 | The `DataSource` object also gives you access to `/v0/events`, which is the high-frequency ingest, to append data. 80 | Use the `send_events` method and pass JSON serializable documents to it. 81 | 82 | ```python 83 | datasource.send_events(records=[ 84 | {"key": "val1"}, 85 | {"key": "val2"}, 86 | ... 87 | ]) 88 | ``` 89 | 90 | ### Queue and batch records into a DataSource 91 | 92 | Verdin provides a way to queue and batch data continuously: 93 | 94 | ```python 95 | from queue import Queue 96 | from threading import Thread 97 | 98 | from verdin import tinybird 99 | from verdin.worker import QueuingDatasourceAppender 100 | 101 | client = tinybird.Client("p.mytoken") 102 | 103 | records = Queue() 104 | 105 | appender = QueuingDatasourceAppender(records, client.datasource("my_datasource")) 106 | Thread(target=appender.run).start() 107 | 108 | # appender will regularly read batches of data from the queue and append them 109 | # to the datasource. the appender respects rate limiting. 110 | 111 | records.put(("col1-row1", "col2-row1")) 112 | records.put(("col1-row2", "col2-row2")) 113 | ``` 114 | 115 | ### API access 116 | 117 | The DataSource and Pipes objects presented so far are high-level abstractions that provide a convenience Python API 118 | to deal with the most common use cases. Verdin also provides more low-level access to APIs via `client.api`. 
119 | The following APIs are available: 120 | 121 | * `/v0/datasources`: `client.api.datasources` 122 | * `/v0/events`: `client.api.events` 123 | * `/v0/pipes`: `client.api.pipes` 124 | * `/v0/sql`: `client.api.query` 125 | * `/v0/tokens`: `client.api.tokens` 126 | * `/v0/variables`: `client.api.variables` 127 | 128 | Note that for some (datasources, pipes, tokens), manipulation operations are not implemented as they are typically done 129 | through tb deployments and not through the API. 130 | 131 | Also note that API clients do not take care of retries or rate limiting. The caller is expected to handle fault 132 | tolerance. 133 | 134 | #### Example (Querying a pipe) 135 | 136 | You can query a pipe through the pipes API as follows: 137 | 138 | ```python 139 | from verdin import tinybird 140 | 141 | client = tinybird.Client(...) 142 | 143 | response = client.api.pipes.query( 144 | "my_pipe", 145 | parameters={"my_param": "..."}, 146 | query="SELECT * FROM _ LIMIT 10", 147 | ) 148 | 149 | for record in response.data: 150 | # each record is a dictionary 151 | ... 152 | ``` 153 | 154 | #### Example (High-frequency ingest) 155 | 156 | You can use the HFI endpoint `/v0/events` through the `events` api. As records, you can pass a list of JSON serializable 157 | documents. 158 | 159 | ```python 160 | from verdin import tinybird 161 | 162 | client = tinybird.Client(...) 163 | 164 | response = client.api.events.send("my_datasource", records=[ 165 | {"id": "...", "value": "..."}, 166 | ... 167 | ]) 168 | assert response.quarantined_rows == 0 169 | ``` 170 | 171 | Develop 172 | ------- 173 | 174 | Create the virtual environment, install dependencies, and run tests 175 | 176 | make venv 177 | make test 178 | 179 | Run the code formatter 180 | 181 | make format 182 | 183 | Upload the pypi package using twine 184 | 185 | make upload 186 | -------------------------------------------------------------------------------- /verdin/api/variables.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict, Literal 2 | 3 | import requests 4 | 5 | from verdin.api.base import Api, ApiError, ApiResponse 6 | 7 | 8 | class VariableNotFoundError(ApiError): 9 | """ 10 | Specific ApiError representing a 404 Not Found when variable names are given. 11 | """ 12 | 13 | 14 | class VariableInfo(TypedDict): 15 | """ 16 | A variable info object. Example:: 17 | 18 | { 19 | "name": "test_password", 20 | "type": "secret", 21 | "created_at": "2024-06-21T10:27:57", 22 | "updated_at": "2024-06-21T10:27:57", 23 | "edited_by": "token: 'admin token'" 24 | } 25 | """ 26 | 27 | name: str 28 | type: str 29 | created_at: str 30 | updated_at: str 31 | edited_by: str 32 | 33 | 34 | class CreateVariableResponse(ApiResponse): 35 | @property 36 | def variable(self) -> VariableInfo: 37 | """ 38 | Returns the variable information. 39 | """ 40 | return self.json 41 | 42 | 43 | class UpdateVariableResponse(ApiResponse): 44 | @property 45 | def variable(self) -> VariableInfo: 46 | """ 47 | Returns the variable information. 48 | """ 49 | return self.json 50 | 51 | 52 | class DeleteVariableResponse(ApiResponse): 53 | @property 54 | def ok(self) -> bool: 55 | """ 56 | Returns whether the operation was successful. 57 | """ 58 | return self.json.get("ok", False) 59 | 60 | 61 | class ListVariablesResponse(ApiResponse): 62 | @property 63 | def variables(self) -> list[VariableInfo]: 64 | """ 65 | Returns the list of variables. 
66 | """ 67 | return self.json.get("variables", []) 68 | 69 | 70 | class GetVariableResponse(ApiResponse): 71 | @property 72 | def variable(self) -> VariableInfo: 73 | """ 74 | Returns the variable information. 75 | """ 76 | return self.json 77 | 78 | 79 | class VariablesApi(Api): 80 | """ 81 | ``/v0/variables`` API client. 82 | 83 | This API allows you to create, update, delete, and list environment variables 84 | that can be used in Pipes in a Workspace. 85 | """ 86 | 87 | endpoint: str = "/v0/variables" 88 | 89 | session: requests.Session 90 | 91 | def __init__(self, token: str, host: str = None): 92 | super().__init__(token, host) 93 | 94 | self.session = requests.Session() 95 | if self.token: 96 | self.session.headers.update({"Authorization": f"Bearer {self.token}"}) 97 | 98 | def create( 99 | self, 100 | name: str, 101 | value: str, 102 | type: Literal["secret"] = "secret", 103 | ) -> CreateVariableResponse: 104 | """ 105 | Creates a new environment variable. 106 | 107 | :param name: The name of the variable. 108 | :param value: The value of the variable. 109 | :param type: The type of the variable. Defaults to 'secret'. 110 | :return: A ``CreateVariableResponse`` object. 111 | """ 112 | data = { 113 | "name": name, 114 | "value": value, 115 | "type": type, 116 | } 117 | 118 | response = self.session.request( 119 | method="POST", 120 | url=f"{self.host}{self.endpoint}", 121 | data=data, 122 | ) 123 | 124 | if not response.ok: 125 | raise ApiError(response) 126 | 127 | return CreateVariableResponse(response) 128 | 129 | def delete(self, name: str) -> DeleteVariableResponse: 130 | """ 131 | Deletes an environment variable. 132 | 133 | :param name: The name of the variable to delete. 134 | :return: A ``DeleteVariableResponse`` object. 135 | """ 136 | response = self.session.request( 137 | method="DELETE", 138 | url=f"{self.host}{self.endpoint}/{name}", 139 | ) 140 | 141 | if response.status_code == 404: 142 | raise VariableNotFoundError(response) 143 | 144 | if not response.ok: 145 | raise ApiError(response) 146 | 147 | return DeleteVariableResponse(response) 148 | 149 | def update( 150 | self, 151 | name: str, 152 | value: str, 153 | ) -> UpdateVariableResponse: 154 | """ 155 | Updates an environment variable. 156 | 157 | :param name: The name of the variable to update. 158 | :param value: The new value of the variable. 159 | :return: A ``UpdateVariableResponse`` object. 160 | """ 161 | data = { 162 | "value": value, 163 | } 164 | 165 | response = self.session.request( 166 | method="PUT", 167 | url=f"{self.host}{self.endpoint}/{name}", 168 | data=data, 169 | ) 170 | 171 | if response.status_code == 404: 172 | raise VariableNotFoundError(response) 173 | 174 | if not response.ok: 175 | raise ApiError(response) 176 | 177 | return UpdateVariableResponse(response) 178 | 179 | def list(self) -> ListVariablesResponse: 180 | """ 181 | Lists all environment variables. 182 | 183 | :return: A ``ListVariablesResponse`` object. 184 | """ 185 | response = self.session.request( 186 | method="GET", 187 | url=f"{self.host}{self.endpoint}", 188 | ) 189 | 190 | if not response.ok: 191 | raise ApiError(response) 192 | 193 | return ListVariablesResponse(response) 194 | 195 | def get(self, name: str) -> GetVariableResponse: 196 | """ 197 | Gets information about a specific environment variable. 198 | 199 | :param name: The name of the variable to get. 200 | :return: A ``GetVariableResponse`` object. 
201 | """ 202 | response = self.session.request( 203 | method="GET", 204 | url=f"{self.host}{self.endpoint}/{name}", 205 | ) 206 | 207 | if response.status_code == 404: 208 | raise VariableNotFoundError(response) 209 | 210 | if not response.ok: 211 | raise ApiError(response) 212 | 213 | return GetVariableResponse(response) 214 | -------------------------------------------------------------------------------- /tests/integration/test_datasources.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from verdin.api import ApiError 6 | from verdin.api.datasources import DataSourceNotFoundError 7 | from tests.utils import retry 8 | 9 | 10 | class TestDataSourcesApi: 11 | def test_list(self, client): 12 | response = client.api.datasources.list() 13 | 14 | assert len(response.datasources) >= 1 15 | 16 | # find "simple" datasource in the list of data sources 17 | ds = None 18 | for datasource in response.datasources: 19 | if datasource["name"] == "simple": 20 | ds = datasource 21 | break 22 | 23 | assert ds 24 | 25 | # smoke tests some attributes 26 | assert ds["engine"]["engine"] == "MergeTree" 27 | assert "simple_kv" in [x["name"] for x in ds["used_by"]] 28 | 29 | def test_get_information(self, client): 30 | response = client.api.datasources.get_information("simple") 31 | 32 | # smoke tests some attributes 33 | assert response.info["name"] == "simple" 34 | assert response.info["engine"]["engine"] == "MergeTree" 35 | assert "simple_kv" in [x["name"] for x in response.info["used_by"]] 36 | 37 | def test_get_information_on_non_existing_datasource(self, client): 38 | with pytest.raises(DataSourceNotFoundError) as e: 39 | client.api.datasources.get_information("non_existing_datasource") 40 | 41 | e.match('Data Source "non_existing_datasource" does not exist') 42 | assert e.value.status_code == 404 43 | 44 | def test_truncate(self, client): 45 | ds = client.datasource("simple") 46 | ds.append_ndjson( 47 | [ 48 | { 49 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 50 | "Timestamp": "2024-01-23T10:30:00.123456", 51 | "Key": "foo", 52 | "Value": "bar", 53 | }, 54 | { 55 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 56 | "Timestamp": "2024-02-23T11:45:00.234567", 57 | "Key": "baz", 58 | "Value": "ed", 59 | }, 60 | ] 61 | ) 62 | 63 | def _wait_for_count(cnt: int): 64 | query = client.sql("SELECT count(*) as cnt FROM simple") 65 | assert query.json().data == [{"cnt": cnt}] 66 | 67 | retry(_wait_for_count, args=(2,)) 68 | 69 | client.api.datasources.truncate("simple") 70 | 71 | retry(_wait_for_count, args=(0,)) 72 | 73 | def test_append_to_non_existing_data_source(self, client): 74 | with pytest.raises(ApiError) as e: 75 | client.api.datasources.append("non_existing_datasource", "foo,bar\n") 76 | 77 | # this is odd behavior, but currently, this raises a 403, with the error 78 | # "Adding or modifying data sources to this workspace can only be done via deployments" 79 | # due to the way tinybird behaves (apparently it doesn't check mode=append) 80 | 81 | assert e.value.status_code == 403 82 | 83 | def test_append_csv(self, client): 84 | ds = client.api.datasources 85 | 86 | data = "5b6859d2-e060-40a4-949a-7e7fab8e3207,2024-01-23T10:30:00.123456,foo,bar\n" 87 | data += "af49ffce-559c-426e-9787-ddb08628b547,2024-02-23T11:45:00.234567,baz,ed" 88 | 89 | response = ds.append("simple", data) 90 | assert not response.error 91 | assert response.quarantine_rows == 0 92 | assert response.invalid_lines == 0 93 | assert 
response.datasource["name"] == "simple" 94 | 95 | assert client.sql("SELECT * FROM simple").json().data == [ 96 | { 97 | "id": "5b6859d2-e060-40a4-949a-7e7fab8e3207", 98 | "timestamp": "2024-01-23 10:30:00.123456", 99 | "key": "foo", 100 | "value": "bar", 101 | }, 102 | { 103 | "id": "af49ffce-559c-426e-9787-ddb08628b547", 104 | "timestamp": "2024-02-23 11:45:00.234567", 105 | "key": "baz", 106 | "value": "ed", 107 | }, 108 | ] 109 | 110 | def test_append_csv_with_invalid_data(self, client): 111 | ds = client.api.datasources 112 | 113 | data = "5b6859d2-e060-40a4-949a-7e7fab8e3207,2024-01-23T10:30:00.123456,foo,bar\n" 114 | data += "af49ffce-559c-426e-9787-ddb08628b5472024-02-23T11:45:00.234567,baz,ed" # error in this line 115 | 116 | response = ds.append("simple", data) 117 | assert ( 118 | response.error 119 | == "There was an error with file contents: 1 row in quarantine and 1 invalid line." 120 | ) 121 | assert response.invalid_lines == 1 122 | assert response.quarantine_rows == 1 123 | 124 | def test_append_ndjson(self, client): 125 | ds = client.api.datasources 126 | ds.truncate("simple") 127 | 128 | records = [ 129 | { 130 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 131 | "Timestamp": "2024-01-23T10:30:00.123456", 132 | "Key": "foo", 133 | "Value": "bar", 134 | }, 135 | { 136 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 137 | "Timestamp": "2024-02-23T11:45:00.234567", 138 | "Key": "baz", 139 | "Value": "ed", 140 | }, 141 | ] 142 | 143 | def _data(): 144 | for r in records: 145 | yield json.dumps(r) + "\n" 146 | 147 | response = ds.append("simple", _data(), format="ndjson") 148 | assert not response.error 149 | assert response.quarantine_rows == 0 150 | assert response.invalid_lines == 0 151 | assert response.datasource["name"] == "simple" 152 | 153 | assert client.sql("SELECT * FROM simple").json().data == [ 154 | { 155 | "id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 156 | "timestamp": "2024-01-23 10:30:00.123456", 157 | "key": "foo", 158 | "value": "bar", 159 | }, 160 | { 161 | "id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 162 | "timestamp": "2024-02-23 11:45:00.234567", 163 | "key": "baz", 164 | "value": "ed", 165 | }, 166 | ] 167 | -------------------------------------------------------------------------------- /verdin/pipe.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, Iterator, Optional 3 | 4 | import requests 5 | 6 | from . import config 7 | from .api import ApiError 8 | from .api.pipes import PipesApi 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | PipeMetadata = list[tuple[str, str]] 13 | PipeJsonData = list[dict[str, Any]] 14 | 15 | 16 | class PipeError(Exception): 17 | """ 18 | Wrapper of the HTTP response returned by a Pipe query if the HTTP response is not a 2XX code. 19 | """ 20 | 21 | response: requests.Response 22 | 23 | def __init__(self, response: requests.Response) -> None: 24 | self.response = response 25 | self.json: dict = response.json() 26 | super().__init__(self.description) 27 | 28 | @property 29 | def description(self) -> str: 30 | return self.json.get("error") 31 | 32 | 33 | class PipeJsonResponse: 34 | """ 35 | Wrapper of the HTTP response returned by a Pipe query. 
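    A typical access pattern (sketch; assumes ``response`` was obtained from
    ``Pipe.query`` or ``Pipe.sql``)::

        if not response.empty:
            column_names = [name for name, _type in response.meta]
            for row in response.data:  # each row is a dict keyed by column name
                ...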
36 | """ 37 | 38 | response: requests.Response 39 | result: dict 40 | 41 | def __init__(self, response: requests.Response): 42 | self.response = response 43 | self.result = response.json() 44 | 45 | @property 46 | def empty(self) -> bool: 47 | """ 48 | A property to check if the data in the result is empty. 49 | 50 | This property evaluates whether the "data" field within the "result" 51 | attribute is empty. 52 | 53 | :return: Returns True if the "data" field in "result" is missing or empty, 54 | otherwise returns False. 55 | """ 56 | return not self.result.get("data") 57 | 58 | @property 59 | def meta(self) -> PipeMetadata: 60 | """ 61 | Returns the PipeMetadata from the query, which includes attributes and their types. 62 | 63 | :return: The PipeMetadata 64 | """ 65 | return [(t["name"], t["type"]) for t in self.result.get("meta", [])] 66 | 67 | @property 68 | def data(self) -> PipeJsonData: 69 | """ 70 | Returns the data from the query, which is a list of dictionaries representing the rows of the query result. 71 | 72 | :return: The PipeJsonData 73 | """ 74 | return self.result.get("data") 75 | 76 | 77 | PipePageIterator = Iterator[PipeJsonResponse] 78 | 79 | 80 | class PagedPipeQuery(PipePageIterator): 81 | # TODO: allow passing of custom parameters 82 | 83 | pipe: "Pipe" 84 | 85 | def __init__(self, pipe: "Pipe", page_size: int = 50, start_at: int = 0): 86 | self.pipe = pipe 87 | self.limit = page_size 88 | self.offset = start_at 89 | 90 | def __iter__(self): 91 | return self 92 | 93 | def __next__(self): 94 | sql = f"SELECT * FROM _ LIMIT {self.limit} OFFSET {self.offset}" 95 | response = self.pipe.sql(sql) 96 | if response.empty: 97 | raise StopIteration() 98 | self.offset += self.limit 99 | return response 100 | 101 | 102 | class Pipe: 103 | """ 104 | Model abstraction of a tinybird Pipe. 105 | 106 | TODO: implement csv mode 107 | """ 108 | 109 | endpoint: str = "/v0/pipes" 110 | 111 | name: str 112 | version: Optional[int] 113 | resource: str 114 | 115 | def __init__(self, name, token, version: int = None, api: str = None) -> None: 116 | super().__init__() 117 | self.name = name 118 | self.token = token 119 | self.version = version 120 | self.resource = (api or config.API_URL).rstrip("/") + self.endpoint 121 | 122 | self._pipes_api = PipesApi(token, host=(api or config.API_URL).rstrip("/")) 123 | 124 | @property 125 | def canonical_name(self) -> str: 126 | """ 127 | Returns the name of the pipe that can be queried. If a version is specified, the name will be suffixed with 128 | ``__v``. Otherwise, this just returns the name. Note that versions are discouraged in the current 129 | tinybird workflows. 130 | 131 | :return: The canonical name of the pipe that can be used in queries 132 | """ 133 | if self.version is not None: 134 | return f"{self.name}__v{self.version}" 135 | else: 136 | return self.name 137 | 138 | @property 139 | def pipe_url(self) -> str: 140 | """ 141 | Returns the API URL of this pipe. It's something like ``https://api.tinybird.co/v0/pipes/my_pipe.json``. 142 | 143 | :return: The Pipe API URL 144 | """ 145 | return self.resource + "/" + self.canonical_name + ".json" 146 | 147 | def query(self, params: dict[str, str] = None) -> PipeJsonResponse: 148 | """ 149 | Query the pipe endpoint using the given dynamic parameters. Note that the pipe needs to be exposed as an 150 | endpoint. 
151 | 152 | See: https://www.tinybird.co/docs/forward/work-with-data/query-parameters#use-pipes-api-endpoints-with-dynamic-parameters 153 | 154 | :param params: The dynamic parameters of the pipe and the values for your query 155 | :return: a PipeJsonResponse containing the result of the query 156 | """ 157 | try: 158 | response = self._pipes_api.query( 159 | self.canonical_name, 160 | parameters=params, 161 | format="json", 162 | ) 163 | return PipeJsonResponse(response._response) 164 | except ApiError as e: 165 | raise PipeError(e._response) 166 | 167 | def pages(self, page_size: int = 50, start_at: int = 0) -> PipePageIterator: 168 | """ 169 | Returns an iterator over the pipe's data pages. Each page contains ``page_size`` records. 170 | 171 | TODO: currently we don't support dynamic parameters with paged queries 172 | 173 | :param page_size: The size of each page (default 50) 174 | :param start_at: The offset at which to start (default 0) 175 | :return: 176 | """ 177 | return PagedPipeQuery(pipe=self, page_size=page_size, start_at=start_at) 178 | 179 | def sql(self, query: str) -> PipeJsonResponse: 180 | """ 181 | Run an SQL query against the pipe. For example: 182 | 183 | pipe.sql("select count() from _") 184 | 185 | See https://docs.tinybird.co/api-reference/query-api.html 186 | 187 | :param query: The SQL query to run 188 | :return: The result of the query 189 | """ 190 | try: 191 | response = self._pipes_api.query(self.canonical_name, query=query, format="json") 192 | return PipeJsonResponse(response._response) 193 | except ApiError as e: 194 | raise PipeError(e._response) 195 | 196 | def __str__(self): 197 | return f"Pipe({self.canonical_name})" 198 | 199 | def __repr__(self): 200 | return self.__str__() 201 | -------------------------------------------------------------------------------- /verdin/datasource.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import io 3 | import json 4 | import logging 5 | import os 6 | from typing import Any, Dict, List, Optional, Tuple, Union, TYPE_CHECKING 7 | 8 | import requests 9 | 10 | from . import config 11 | from .api.datasources import DataSourcesApi 12 | from .api.events import EventsApi, EventsResponse 13 | 14 | if TYPE_CHECKING: 15 | from _typeshed import SupportsWrite 16 | 17 | LOG = logging.getLogger(__name__) 18 | 19 | Record = Union[Tuple, List[Any]] 20 | Records = List[Record] 21 | 22 | 23 | def to_csv(records: Records, **kwargs) -> str: 24 | """ 25 | Convert the given records to CSV using a CSV writer, and return them as a single string. 26 | 27 | :param records: The records to convert to CSV. 28 | :param kwargs: Args to be passed to ``csv.writer``. 29 | :return: A string representing the CSV 30 | """ 31 | output = io.StringIO() 32 | write_csv(output, records, **kwargs) 33 | return output.getvalue() 34 | 35 | 36 | def write_csv(file: "SupportsWrite[str]", records: Records, **kwargs): 37 | """ 38 | Converts the given records to CSV and writes them to the given file. 39 | 40 | :param file: The file passed to the CSV writer. 41 | :param records: The records to convert to CSV. 42 | :param kwargs: Args to be passed to ``csv.writer``. 
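    For example, writing two records into an in-memory buffer (values are illustrative)::

        buf = io.StringIO()
        write_csv(buf, [("col1-row1", "col2-row1"), ("col1-row2", "col2-row2")])
        assert buf.getvalue() == "col1-row1,col2-row1\ncol1-row2,col2-row2\n"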
43 | """ 44 | # TODO: do proper type conversion here to optimize for CSV input 45 | # see: https://guides.tinybird.co/guide/fine-tuning-csvs-for-fast-ingestion 46 | 47 | if "delimiter" in kwargs: 48 | if kwargs["delimiter"] is None: 49 | del kwargs["delimiter"] 50 | 51 | writer = csv.writer(file, quoting=csv.QUOTE_MINIMAL, lineterminator="\n", **kwargs) 52 | 53 | for record in records: 54 | writer.writerow(record) 55 | 56 | 57 | class Datasource: 58 | """ 59 | Abstract tinybird datasource. 60 | """ 61 | 62 | endpoint: str = "/v0/datasources" 63 | 64 | name: str 65 | version: Optional[int] 66 | 67 | def __init__(self, name, token, version: int = None, api: str = None) -> None: 68 | self.name = name 69 | self.token = token 70 | self.version = version 71 | self._host = (api or config.API_URL).rstrip("/") 72 | self.api = self._host + self.endpoint 73 | 74 | # API clients used to make the actual API calls 75 | self._events_api = EventsApi(token, self._host) 76 | self._datasources_api = DataSourcesApi(token, self._host) 77 | 78 | @property 79 | def canonical_name(self) -> str: 80 | """ 81 | Returns the name of the table that can be queried. If a version is specified, the name will be suffixed with 82 | ``__v``. Otherwise, this just returns the name. Note that versions are discouraged in the current 83 | tinybird workflows. 84 | 85 | :return: The canonical name of the table that can be used in queries 86 | """ 87 | if self.version is not None: 88 | return f"{self.name}__v{self.version}" 89 | else: 90 | return self.name 91 | 92 | def send_events( 93 | self, records: list[dict], wait: bool = False, json_encoder: type = None 94 | ) -> EventsResponse: 95 | """ 96 | Uses the ``/v0/events`` API endpoint to send JSON data to the datasource. 97 | 98 | :param records: List of JSON records to append. Records will be converted to NDJSON using ``json.dumps`` 99 | :param wait: 'false' by default. Set to 'true' to wait until the write is acknowledged by the database. 100 | Enabling this flag makes it possible to retry on database errors, but it introduces additional latency. 101 | It's recommended to enable it in use cases in which data loss avoidance is critical. Disable it otherwise. 102 | :param json_encoder: The JSON Encoder class passed to ``json.dumps``. Defaults to ``json.JSONEncoder``. 103 | :return: The EventsResponse from the ``EventsApi``. 104 | :raises ApiError: If the request failed 105 | """ 106 | return self._events_api.send( 107 | self.canonical_name, records=records, wait=wait, json_encoder=json_encoder 108 | ) 109 | 110 | def append(self, records: Records, *args, **kwargs) -> requests.Response: 111 | """Calls ``append_csv``.""" 112 | # TODO: replicate tinybird API concepts instead of returning Response 113 | return self.append_csv(records, *args, **kwargs) 114 | 115 | def append_csv(self, records: Records, delimiter: str = ",") -> requests.Response: 116 | """ 117 | Makes a POST request to the datasource using mode=append with CSV data. This appends data to the table. 118 | 119 | :param records: List of records to append. They will be converted to CSV using the provided delimiter. 
120 | :param delimiter: Optional delimiter (defaults to ",") 121 | :return: The HTTP response 122 | """ 123 | 124 | data = self.to_csv(records, delimiter=delimiter) 125 | 126 | LOG.debug( 127 | "appending %d csv records to %s via %s", 128 | len(records), 129 | self, 130 | self.api, 131 | ) 132 | 133 | response = self._datasources_api.append( 134 | name=self.canonical_name, 135 | dialect_delimiter=delimiter, 136 | format="csv", 137 | data=data, 138 | ) 139 | 140 | return response._response 141 | 142 | def append_ndjson(self, records: List[Dict]) -> requests.Response: 143 | """ 144 | Makes a POST request to the datasource using mode=append with ndjson data. This appends data to the table. 145 | 146 | :param records: List of JSON records to append. They will be converted to NDJSON using ``json.dumps`` 147 | :return: The HTTP response 148 | """ 149 | 150 | def _ndjson_iterator(): 151 | for record in records: 152 | yield json.dumps(record) + "\n" 153 | 154 | LOG.debug( 155 | "appending %d ndjson records to %s via %s", 156 | len(records), 157 | self, 158 | self.api, 159 | ) 160 | response = self._datasources_api.append( 161 | name=self.canonical_name, 162 | format="ndjson", 163 | data=_ndjson_iterator(), 164 | ) 165 | 166 | return response._response 167 | 168 | def truncate(self): 169 | """ 170 | Truncate the datasource which removes all records in the table. 171 | """ 172 | self._datasources_api.truncate(name=self.canonical_name) 173 | 174 | @staticmethod 175 | def to_csv(records: List[List[Any]], **kwargs): 176 | return to_csv(records, **kwargs) 177 | 178 | def __str__(self): 179 | return f"Datasource({self.canonical_name})" 180 | 181 | def __repr__(self): 182 | return self.__str__() 183 | 184 | 185 | class FileDatasource(Datasource): 186 | """ 187 | Datasource that writes into a file, used for testing and development purposes. 
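    Example (sketch; the path is arbitrary)::

        ds = FileDatasource("/tmp/records.csv")
        ds.append([("col1-row1", "col2-row1")])
        print(ds.readlines())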
188 | """ 189 | 190 | def __init__(self, path: str): 191 | name = os.path.basename(path) 192 | super().__init__(name, None) 193 | self.path = path 194 | 195 | def append_csv(self, records: Records, *args, **kwargs) -> requests.Response: 196 | if records: 197 | with open(self.path, "a") as fd: 198 | write_csv(fd, records) 199 | 200 | response = requests.Response() 201 | response.status_code = 200 202 | return response 203 | 204 | def append_ndjson(self, records: List[Dict]) -> requests.Response: 205 | raise NotImplementedError 206 | 207 | def readlines(self) -> List[str]: 208 | with open(self.path, "r") as fd: 209 | return fd.readlines() 210 | 211 | def truncate(self): 212 | raise NotImplementedError 213 | -------------------------------------------------------------------------------- /tests/integration/test_pipes.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from verdin.api.pipes import PipeNotFoundError 4 | 5 | 6 | class TestPipesApi: 7 | def test_list(self, client): 8 | response = client.api.pipes.list( 9 | attrs=["id", "name"], 10 | node_attrs=[], 11 | ) 12 | 13 | for pipe in response.pipes: 14 | assert {"id", "name", "url"} == set(pipe.keys()) 15 | assert pipe["id"] is not None 16 | assert pipe["name"] is not None 17 | 18 | assert "simple_kv" in [pipe["name"] for pipe in response.pipes] 19 | assert "simple_pipe" in [pipe["name"] for pipe in response.pipes] 20 | 21 | def test_query_json(self, client): 22 | client.api.events.send( 23 | "simple", 24 | wait=True, 25 | records=[ 26 | { 27 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 28 | "Timestamp": "2024-01-23T10:30:00.123456", 29 | "Key": "foo", 30 | "Value": "bar", 31 | }, 32 | { 33 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 34 | "Timestamp": "2024-02-23T11:45:00.234567", 35 | "Key": "baz", 36 | "Value": "ed", 37 | }, 38 | ], 39 | ) 40 | 41 | response = client.api.pipes.query("simple_kv", format="json") 42 | assert response.data == [{"key": "baz", "value": "ed"}, {"key": "foo", "value": "bar"}] 43 | assert response.meta == [ 44 | {"name": "key", "type": "String"}, 45 | {"name": "value", "type": "String"}, 46 | ] 47 | assert response.rows == 2 48 | assert response.statistics["rows_read"] == 2 49 | 50 | @pytest.mark.parametrize("format", ["csv", "ndjson", "json"]) 51 | def test_query_formats(self, client, format): 52 | client.api.events.send( 53 | "simple", 54 | wait=True, 55 | records=[ 56 | { 57 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 58 | "Timestamp": "2024-01-23T10:30:00.123456", 59 | "Key": "foo", 60 | "Value": "bar", 61 | }, 62 | { 63 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 64 | "Timestamp": "2024-02-23T11:45:00.234567", 65 | "Key": "baz", 66 | "Value": "ed", 67 | }, 68 | { 69 | "Id": "2b84e03e-dbcf-4141-9656-94ff8ac8c036", 70 | "Timestamp": "2024-03-23T11:45:00.345678", 71 | "Key": "format", 72 | "Value": format, 73 | }, 74 | ], 75 | ) 76 | 77 | response = client.api.pipes.query("simple_kv", format=format) 78 | assert response.data == [ 79 | {"key": "baz", "value": "ed"}, 80 | {"key": "foo", "value": "bar"}, 81 | {"key": "format", "value": format}, 82 | ] 83 | 84 | def test_query_with_params(self, client): 85 | client.api.events.send( 86 | "simple", 87 | wait=True, 88 | records=[ 89 | { 90 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 91 | "Timestamp": "2024-01-23T10:30:00.123456", 92 | "Key": "foo", 93 | "Value": "bar", 94 | }, 95 | { 96 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 97 | "Timestamp": "2024-02-23T11:45:00.234567", 98 | 
"Key": "baz", 99 | "Value": "ed", 100 | }, 101 | { 102 | "Id": "2b84e03e-dbcf-4141-9656-94ff8ac8c036", 103 | "Timestamp": "2024-03-23T11:45:00.345678", 104 | "Key": "foo", 105 | "Value": "bar2", 106 | }, 107 | ], 108 | ) 109 | 110 | response = client.api.pipes.query("simple_pipe", parameters={"key": "foo"}) 111 | assert response.data == [ 112 | { 113 | "id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 114 | "key": "foo", 115 | "timestamp": "2024-01-23 10:30:00.123456", 116 | "value": "bar", 117 | }, 118 | { 119 | "id": "2b84e03e-dbcf-4141-9656-94ff8ac8c036", 120 | "key": "foo", 121 | "timestamp": "2024-03-23 11:45:00.345678", 122 | "value": "bar2", 123 | }, 124 | ] 125 | 126 | response = client.api.pipes.query("simple_pipe", parameters={"key": "does not exist"}) 127 | assert response.data == [] 128 | 129 | def test_query_with_sql(self, client): 130 | client.api.events.send( 131 | "simple", 132 | wait=True, 133 | records=[ 134 | { 135 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 136 | "Timestamp": "2024-01-23T10:30:00.123456", 137 | "Key": "foo", 138 | "Value": "bar", 139 | }, 140 | { 141 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 142 | "Timestamp": "2024-02-23T11:45:00.234567", 143 | "Key": "baz", 144 | "Value": "ed", 145 | }, 146 | { 147 | "Id": "2b84e03e-dbcf-4141-9656-94ff8ac8c036", 148 | "Timestamp": "2024-03-23T11:45:00.345678", 149 | "Key": "foo", 150 | "Value": "bar2", 151 | }, 152 | ], 153 | ) 154 | 155 | response = client.api.pipes.query( 156 | "simple_pipe", 157 | query="SELECT timestamp,key,value FROM _ ORDER BY `value` DESC", 158 | parameters={"key": "foo"}, 159 | ) 160 | 161 | assert response.data == [ 162 | { 163 | "timestamp": "2024-03-23 11:45:00.345678", 164 | "key": "foo", 165 | "value": "bar2", 166 | }, 167 | { 168 | "timestamp": "2024-01-23 10:30:00.123456", 169 | "key": "foo", 170 | "value": "bar", 171 | }, 172 | ] 173 | 174 | def test_query_with_sql_too_long(self, client): 175 | chars = "a" * 6000 176 | # ``chars`` ends up in both the query and parameters, making the total body ~12k, but the query <8k, which is a 177 | # requirement. 
178 | 179 | response = client.api.pipes.query( 180 | "simple_pipe", 181 | query=f"SELECT * FROM _ WHERE `value` = '{chars}'", 182 | parameters={"key": chars}, 183 | ) 184 | 185 | # ofcourse the query is nonsense and returns nothing 186 | assert response.data == [] 187 | 188 | def test_query_with_non_existing_pipe(self, client): 189 | with pytest.raises(PipeNotFoundError) as e: 190 | client.api.pipes.query("non_existent_pipe") 191 | 192 | assert e.match("The pipe 'non_existent_pipe' does not exist") 193 | assert e.value.status_code == 404 194 | 195 | def test_get_information(self, client): 196 | response = client.api.pipes.get_information("simple_kv") 197 | assert response.info["name"] == "simple_kv" 198 | assert response.info["type"] == "endpoint" 199 | 200 | # check that it also works with the pipe's ID 201 | response = client.api.pipes.get_information(response.info["id"]) 202 | assert response.info["name"] == "simple_kv" 203 | assert response.info["type"] == "endpoint" 204 | 205 | def test_get_information_non_existing_pipe(self, client): 206 | with pytest.raises(PipeNotFoundError) as e: 207 | client.api.pipes.get_information("non_existent_pipe") 208 | 209 | assert e.match("Pipe 'non_existent_pipe' not found") 210 | assert e.value.status_code == 404 211 | -------------------------------------------------------------------------------- /verdin/worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains a worker that reads batches of records from a Queue and appends them to a Datasource. It provides 3 | an opinionated way to ingest data into tinybird from a python process. Note this worker does not use the ``/v0/events`` 4 | API, but instead uses the datasource's append functionality, which has higher rate limits. 5 | """ 6 | 7 | import logging 8 | import multiprocessing 9 | import time 10 | from queue import Empty, Queue 11 | from typing import Optional, Tuple 12 | 13 | import requests 14 | 15 | from .datasource import Datasource, Records 16 | 17 | LOG = logging.getLogger(__name__) 18 | 19 | 20 | class StopWorker(Exception): 21 | """ 22 | An exception that indicates to stop the QueueingDatasourceAppender worker. 23 | """ 24 | 25 | marker = "__STOP__" 26 | 27 | batch: Optional[Records] 28 | 29 | def __init__(self, batch: Records = None): 30 | self.batch = batch 31 | 32 | 33 | class QueuingDatasourceAppender: 34 | """ 35 | A QueuingDatasourceAppender reads batches of records from a source Queue and appends the batches to a data 36 | source. Once rate limited, it waits for the instructed amount of time (or if that is not specified, 37 | whatever default_retry_after is set to), before appending again. 38 | 39 | See https://docs.tinybird.co/api-reference/api-reference.html#limits-title 40 | 41 | TODO: synchronize multiple appenders through a RateLimiter concurrency structure: data source share rate limiting 42 | across account or workspace (not sure), so running multiple separate queuing datasource appenders will lead to 43 | excessive rate limiting. 
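    A typical wiring (sketch; ``client`` is a ``tinybird.Client`` created elsewhere,
    and the datasource name is a placeholder)::

        from queue import Queue
        from threading import Thread

        records = Queue()
        appender = QueuingDatasourceAppender(records, client.datasource("my_datasource"))
        Thread(target=appender.run).start()

        records.put(("col1-row1", "col2-row1"))
        appender.close()  # enqueues the stop marker so run() flushes pending records and exits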
44 | """ 45 | 46 | default_retry_after: float = 12 47 | wait_after_rate_limit: float = 12 48 | 49 | source: Queue 50 | destination: Datasource 51 | min_interval: float 52 | 53 | def __init__(self, source: Queue, destination: Datasource, min_interval: float = 5) -> None: 54 | """ 55 | :param source: a queue that buffers records to be appended to the datasource 56 | :param destination: the datasource to append to 57 | :param min_interval: the minimal time to wait between batches 58 | """ 59 | super().__init__() 60 | self.source = source 61 | self.destination = destination 62 | self.stopped = multiprocessing.Event() 63 | self.min_interval = min_interval 64 | 65 | def close(self): 66 | if self.stopped.is_set(): 67 | return 68 | self.stopped.set() 69 | self.source.put_nowait(StopWorker.marker) 70 | 71 | def run(self): 72 | try: 73 | while not self.stopped.is_set(): 74 | try: 75 | then = time.time() 76 | batch, error = self._do_next_batch() 77 | 78 | if error is not None: 79 | # TODO: make sure the batch is not dropped on error. however, if the batch is 80 | # not appendable (for example because of errors in the data), then we need to 81 | # make sure the batch is either dropped, or we try to find the record that 82 | # causes the error. 83 | 84 | raise error 85 | 86 | duration = time.time() - then 87 | LOG.debug("processing batch took %.2f", duration) 88 | if self.min_interval: 89 | self.stopped.wait(self.min_interval) 90 | 91 | except StopWorker as e: 92 | LOG.info("indicated worker shutdown, trying to flush batch") 93 | if e.batch: 94 | self._retry_batch(e.batch, max_retries=2) 95 | return 96 | 97 | except Exception: 98 | LOG.exception("exception while processing batch, events will be dropped") 99 | finally: 100 | LOG.info( 101 | "shutting down DatasourceQueueWorker, %d elements remaining", 102 | self.source.qsize(), 103 | ) 104 | 105 | def _get_batch(self, n=None) -> Records: 106 | """ 107 | Reads the next batch from the queue. 108 | 109 | :param n: the maximum number of items to batch (default is entire queue) 110 | :return: the items from the queue as Batch 111 | :raises StopWorker if the StopWorker.marker was retrieved from the queue 112 | """ 113 | q = self.source 114 | item = q.get() 115 | 116 | if item == StopWorker.marker: 117 | raise StopWorker() 118 | 119 | result = [item] # block until we have at least one item 120 | 121 | if not n: 122 | n = q.qsize() 123 | 124 | try: 125 | while len(result) <= n: 126 | item = q.get(block=False) 127 | 128 | if item == StopWorker.marker: 129 | raise StopWorker(result) 130 | 131 | result.append(item) 132 | except Empty: 133 | pass 134 | 135 | return result 136 | 137 | def _parse_retry_seconds(self, response: requests.Response) -> float: 138 | retry = response.headers.get("Retry-After") 139 | if retry: 140 | try: 141 | return float(retry) + 0.5 142 | except ValueError as e: 143 | LOG.error("error while parsing Retry-After value '%s': %s", retry, e) 144 | 145 | return self.default_retry_after 146 | 147 | def _retry_batch(self, batch, max_retries=10) -> Tuple[requests.Response, bool]: 148 | """ 149 | Tries to append the given batch to the datasource for max_retries amount of times. It 150 | only retries if the request was rate limited, and waits for a certain amount of time 151 | afterwards. 
152 | 153 | :param batch: a list of records to append to the datasource 154 | :param max_retries: max number of retries (defaults to 10) 155 | :return: a tuple with the last response and a boolean flag indicating whether the request 156 | was rate-limited 157 | """ 158 | limited = False 159 | response = None 160 | 161 | for _ in range(max_retries): 162 | response = self.destination.append(batch) 163 | 164 | if response.ok: 165 | return response, limited 166 | 167 | if response.status_code == 429: 168 | wait = self._parse_retry_seconds(response) 169 | limited = True 170 | LOG.debug( 171 | "rate limited by API, keeping %d records safe for %d seconds: %s", 172 | len(batch), 173 | wait, 174 | response.text, 175 | ) 176 | time.sleep(wait) 177 | continue 178 | 179 | LOG.warning( 180 | "unhandled error %d: %s while appending to datasource, dropping batch", 181 | response.status_code, 182 | response.text, 183 | ) 184 | return response, limited 185 | 186 | return response, limited 187 | 188 | def _do_next_batch(self) -> Tuple[Records, Optional[Exception]]: 189 | batch = self._get_batch() 190 | 191 | try: 192 | LOG.debug( 193 | "processing batch of size %d into datasource %s", 194 | len(batch), 195 | self.destination.name, 196 | ) 197 | 198 | response, limited = self._retry_batch(batch) 199 | 200 | if limited: 201 | # if the request was rate-limited, we'll try again after X-Ratelimit-Reset, or the 202 | # wait_after_rate_limit value if it is set 203 | try: 204 | if self.wait_after_rate_limit: 205 | wait = self.wait_after_rate_limit 206 | else: 207 | wait = float(response.headers.get("X-Ratelimit-Reset", 0)) 208 | 209 | LOG.info( 210 | "waiting %d second until rate-limit window resets before batching again", 211 | wait, 212 | ) 213 | time.sleep(wait) 214 | except ValueError: 215 | LOG.exception("error while parsing X-Ratelimit-Reset value") 216 | 217 | except Exception as e: 218 | return batch, e 219 | 220 | return batch, None 221 | -------------------------------------------------------------------------------- /verdin/api/query.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | from typing import Literal, Optional, TypedDict, Any 4 | 5 | import requests 6 | 7 | from verdin.api import ApiResponse 8 | from verdin.api.base import Api, ApiError 9 | 10 | QueryOutputFormat = Literal[ 11 | "CSV", 12 | "CSVWithNames", 13 | "JSON", 14 | "TSV", 15 | "TSVWithNames", 16 | "PrettyCompact", 17 | "JSONEachRow", 18 | "Parquet", 19 | "Prometheus", 20 | ] 21 | """See https://www.tinybird.co/docs/api-reference/query-api#id10 22 | 23 | +---------------+--------------------------------------------------+ 24 | | Format | Description | 25 | +===============|==================================================+ 26 | | CSV | CSV without header | 27 | +---------------+--------------------------------------------------+ 28 | | CSVWithNames | CSV with header | 29 | +---------------+--------------------------------------------------+ 30 | | JSON | JSON including data, statistics and schema info | 31 | +---------------+--------------------------------------------------+ 32 | | TSV | TSV without header | 33 | +---------------+--------------------------------------------------+ 34 | | TSVWithNames | TSV with header | 35 | +---------------+--------------------------------------------------+ 36 | | PrettyCompact | Formatted table | 37 | +---------------+--------------------------------------------------+ 38 | | JSONEachRow | Newline-delimited JSON values (NDJSON) | 
39 | +---------------+--------------------------------------------------+ 40 | | Parquet | Apache Parquet | 41 | +---------------+--------------------------------------------------+ 42 | | Prometheus | Prometheus text-based format | 43 | +---------------+--------------------------------------------------+ 44 | """ 45 | 46 | 47 | class QueryStatistics(TypedDict): 48 | bytes_read: int 49 | elapsed: float 50 | rows_read: int 51 | 52 | 53 | class QueryMetadata(TypedDict): 54 | name: str 55 | type: str 56 | 57 | 58 | QueryData = list[dict[str, Any]] 59 | 60 | 61 | class QueryResponse(ApiResponse): 62 | @property 63 | def data(self) -> QueryData: 64 | raise NotImplementedError 65 | 66 | 67 | class QueryJsonResponse(QueryResponse): 68 | @property 69 | def data(self) -> QueryData: 70 | """ 71 | Returns the data returned by the query, which is a list of dictionaries representing the records in rows. 72 | 73 | :return: List of records. 74 | """ 75 | return self.json.get("data", []) 76 | 77 | @property 78 | def meta(self) -> list[QueryMetadata]: 79 | """ 80 | Returns the QueryMetadata from the query, which includes attributes and their types. 81 | 82 | :return: The QueryMetadata 83 | """ 84 | return self.json.get("meta", []) 85 | 86 | @property 87 | def rows(self) -> int: 88 | """ 89 | Returns the number of rows returned by the query. 90 | 91 | :return: The number of rows returned by the query. 92 | """ 93 | return self.json.get("rows") 94 | 95 | @property 96 | def statistics(self) -> QueryStatistics: 97 | """ 98 | Returns the query statistics, which include the number of bytes read, the number of rows read, and the elapsed. 99 | :return: The QueryStatistics objects. 100 | """ 101 | return self.json.get("statistics", {}) 102 | 103 | @property 104 | def empty(self) -> bool: 105 | """ 106 | A property to check if the data in the result is empty. 107 | 108 | This property evaluates whether the "data" field within the "result" 109 | attribute is empty. 110 | 111 | :return: Returns True if the "data" field in "result" is missing or empty, 112 | otherwise returns False. 113 | """ 114 | return not self.json.get("data") 115 | 116 | 117 | class QueryNdjsonResponse(QueryResponse): 118 | @property 119 | def data(self) -> list[dict]: 120 | """Parses the CSV response body into a list of dictionaries.""" 121 | for line in self.text.splitlines(): 122 | json.loads(line) 123 | return [json.loads(line) for line in self.text.strip().splitlines()] 124 | 125 | 126 | class QueryCsvResponse(QueryResponse): 127 | def __init__(self, response: requests.Response, delimiter: str = ","): 128 | super().__init__(response) 129 | self.delimiter = delimiter 130 | 131 | @property 132 | def data(self) -> list[dict]: 133 | """Parses the CSV response body into a list of dictionaries.""" 134 | reader = csv.DictReader( 135 | self.text.splitlines(), 136 | delimiter=self.delimiter, 137 | ) 138 | return list(reader) 139 | 140 | 141 | class QueryApi(Api): 142 | """ 143 | The Query API allows you to query your Pipes and Data Sources inside Tinybird as if you were running SQL statements 144 | against a standard database. 145 | 146 | See https://www.tinybird.co/docs/api-reference/query-api. 
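    A short sketch (token, datasource name, and queries are placeholders)::

        api = QueryApi(token="p.mytoken")

        result = api.query("SELECT count() AS cnt FROM my_datasource")
        print(result.rows, result.data)  # JSON output format by default

        csv_result = api.query("SELECT * FROM my_datasource", format="CSVWithNames")
        print(csv_result.data)  # parsed into dicts via csv.DictReader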
147 | """ 148 | 149 | endpoint: str = "/v0/sql" 150 | 151 | session: requests.Session 152 | 153 | def __init__(self, token: str, host: str = None): 154 | super().__init__(token, host) 155 | 156 | self.session = requests.Session() 157 | if self.token: 158 | self.session.headers.update({"Authorization": f"Bearer {self.token}"}) 159 | 160 | def query( 161 | self, 162 | query: str, 163 | pipeline: str = None, 164 | parameters: dict[str, str] = None, 165 | output_format_json_quote_64bit_integers: bool = False, 166 | output_format_json_quote_denormals: bool = False, 167 | output_format_parquet_string_as_string: bool = False, 168 | format: QueryOutputFormat = "JSON", 169 | ) -> QueryResponse | QueryJsonResponse | QueryNdjsonResponse | QueryCsvResponse: 170 | """ 171 | Executes a SQL query using the engine. As a response, it gives you the query metadata, the resulting data and 172 | some performance statistics. 173 | 174 | The return type will depend on the desired ``format``. For the following formats, we return special response 175 | objects that contain the parsed data: 176 | * ``JSON``: ``QueryJsonResponse`` (default) 177 | * ``CSVWithNames``: QueryCsvResponse 178 | * ``TSVWithNames``: QueryCsvResponse 179 | * ``JSONEachRow``: ``QueryNdjsonResponse`` 180 | 181 | For all other formats, we return a generic ``QueryResponse`` object, that allows you to access the raw response 182 | body via ``response.text`` (str) or ``response.content`` (bytes). 183 | 184 | :param query: The SQL query 185 | :param pipeline: (Optional) The name of the pipe. It allows writing a query like 'SELECT * FROM _' where '_' is 186 | a placeholder for the 'pipeline' parameter 187 | :param parameters: Additional query parameters 188 | :param output_format_json_quote_64bit_integers: (Optional) Controls quoting of 64-bit or bigger integers (like 189 | UInt64 or Int128) when they are output in a JSON format. Such integers are enclosed in quotes by default. 190 | This behavior is compatible with most JavaScript implementations. Possible values: False — Integers are 191 | output without quotes. True — Integers are enclosed in quotes. Default value is False 192 | :param output_format_json_quote_denormals: (Optional) Controls representation of inf and nan on the UI instead 193 | of null e.g when dividing by 0 - inf and when there is no representation of a number in Javascript - nan. 194 | Default value is false 195 | :param output_format_parquet_string_as_string: (Optional) Use Parquet String type instead of Binary for String 196 | columns. Possible values: False - disabled, True - enabled. Default value is False 197 | :param format: Output format of the query results (defaults to JSON) 198 | :return: QueryResponse object containing the query results 199 | """ 200 | 201 | query = _sql_with_format(query, format) 202 | 203 | data: dict[str, str | int] = dict(parameters) if parameters else {} 204 | if query: 205 | data["q"] = query 206 | if pipeline: 207 | data["pipeline"] = pipeline 208 | if output_format_json_quote_64bit_integers: 209 | data["output_format_json_quote_64bit_integers"] = 1 210 | if output_format_json_quote_denormals: 211 | data["output_format_json_quote_denormals"] = 1 212 | if output_format_parquet_string_as_string: 213 | data["output_format_parquet_string_as_string"] = 1 214 | 215 | # if the query is too large, the web server (nginx) will respond with "414 Request-URI Too Large". it seems 216 | # this limit is around 8kb, so if it's too large, we use a POST request instead. 217 | qsize = 1 # include the "?" 
character 218 | for k, v in data.items(): 219 | qsize += len(k) + len(v) + 2 # include the ``&`` and ``=`` character 220 | 221 | if qsize > 8192 or parameters: 222 | response = self.session.request( 223 | method="POST", 224 | url=f"{self.host}{self.endpoint}", 225 | data=data, 226 | ) 227 | else: 228 | response = self.session.request( 229 | method="GET", 230 | url=f"{self.host}{self.endpoint}", 231 | params=data, 232 | ) 233 | 234 | if not response.ok: 235 | raise ApiError(response) 236 | 237 | # format-specific response objects 238 | if format == "JSON": 239 | return QueryJsonResponse(response) 240 | if format == "CSVWithNames": 241 | return QueryCsvResponse(response) 242 | if format == "TSVWithNames": 243 | return QueryCsvResponse(response, delimiter="\t") 244 | if format == "JSONEachRow": 245 | return QueryNdjsonResponse(response) 246 | 247 | return QueryResponse(response) 248 | 249 | 250 | def _sql_with_format(sql, output_format: Optional[QueryOutputFormat] = None) -> str: 251 | """ 252 | Returns a formatted SQL query with the given output format. If no output format is specified, the query is 253 | returned as is. 254 | 255 | :param output_format: The output format to use (suffixes ``FORMAT `` to the query) 256 | :return: An SQL string 257 | """ 258 | # TODO: handle potentially already existing FORMAT string 259 | if not output_format: 260 | return sql 261 | return sql + f" FORMAT {output_format}" 262 | -------------------------------------------------------------------------------- /verdin/api/pipes.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | from typing import TypedDict, Literal 4 | 5 | import requests 6 | 7 | from .base import Api, ApiResponse, ApiError 8 | 9 | PipeOutputFormat = Literal["csv", "json", "ndjson", "parquet", "prometheus"] 10 | 11 | 12 | class PipeNotFoundError(ApiError): 13 | """ 14 | Specific ApiError representing a 404 Not Found when pipe names are given. 15 | """ 16 | 17 | 18 | class PipeNode(TypedDict): 19 | id: str 20 | name: str 21 | sql: str 22 | deployment_suffix: str | None 23 | description: str | None 24 | materialized: bool | None 25 | cluster: str | None 26 | tags: dict 27 | created_at: str 28 | updated_at: str 29 | version: int 30 | project: str | None 31 | result: str | None 32 | ignore_sql_errors: bool 33 | node_type: str 34 | dependencies: list[str] | None 35 | params: list | None 36 | 37 | 38 | class PipeListInfo(TypedDict): 39 | id: str 40 | name: str 41 | description: str 42 | endpoint: str 43 | created_at: str 44 | updated_at: str 45 | parent: str | None 46 | nodes: list[PipeNode] 47 | url: str 48 | 49 | 50 | class PipeInfo(TypedDict): 51 | """ 52 | A document returned by the pipe information endpoint. 
53 | 
54 |         {
55 |             "content": "VERSION 0\n\nDESCRIPTION >\n    Endpoint to select unique ...",
56 |             "created_at": "2025-12-17 13:18:09.799374",
57 |             "description": "Endpoint to select unique key/value pairs from simple",
58 |             "edited_by": null,
59 |             "endpoint": "t_54dffae578ef47238fd51e9849f79a1f",
60 |             "id": "t_c50152ced57b46b99acf14930b9c6906",
61 |             "last_commit": {
62 |                 "content_sha": "",
63 |                 "path": "",
64 |                 "status": "None"
65 |             },
66 |             "name": "simple_kv",
67 |             "nodes": [
68 |                 {
69 |                     "cluster": null,
70 |                     "created_at": "2025-12-17 13:18:09.799385",
71 |                     "dependencies": [
72 |                         "simple"
73 |                     ],
74 |                     "deployment_suffix": "",
75 |                     "description": null,
76 |                     "id": "t_54dffae578ef47238fd51e9849f79a1f",
77 |                     "ignore_sql_errors": false,
78 |                     "materialized": null,
79 |                     "name": "endpoint",
80 |                     "node_type": "endpoint",
81 |                     "params": [],
82 |                     "project": null,
83 |                     "result": null,
84 |                     "sql": "%\n    SELECT key, value\n    FROM simple\n    ...",
85 |                     "tags": {},
86 |                     "updated_at": "2025-12-17 13:18:09.799385",
87 |                     "version": 0
88 |                 }
89 |             ],
90 |             "parent": null,
91 |             "path": "endpoints/simple_kv.pipe",
92 |             "type": "endpoint",
93 |             "updated_at": "2025-12-17 13:18:09.799394",
94 |             "url": "http://localhost:8001/v0/pipes/simple_kv.json",
95 |             "workspace_id": "2244743a-d384-478f-a9f5-ea4848c56427"
96 |         }
97 |     """
98 | 
99 |     content: str
100 |     created_at: str
101 |     description: str
102 |     edited_by: str | None
103 |     endpoint: str
104 |     id: str
105 |     last_commit: dict
106 |     name: str
107 |     nodes: list[PipeNode]
108 |     parent: str | None
109 |     path: str
110 |     type: str
111 |     updated_at: str
112 |     url: str
113 |     workspace_id: str
114 | 
115 | 
116 | class ListPipesResponse(ApiResponse):
117 |     @property
118 |     def pipes(self) -> list[PipeListInfo]:
119 |         return self.json.get("pipes", [])
120 | 
121 | 
122 | class GetPipeInformationResponse(ApiResponse):
123 |     @property
124 |     def info(self) -> PipeInfo:
125 |         return self.json
126 | 
127 | 
128 | class QueryPipeResponse(ApiResponse):
129 |     @property
130 |     def data(self) -> list[dict]:
131 |         raise NotImplementedError
132 | 
133 | 
134 | class QueryPipeJsonResponse(QueryPipeResponse):
135 |     @property
136 |     def data(self) -> list[dict]:
137 |         return self.json.get("data", [])
138 | 
139 |     @property
140 |     def meta(self) -> list[dict]:
141 |         return self.json.get("meta", [])
142 | 
143 |     @property
144 |     def rows(self) -> int:
145 |         return self.json.get("rows")
146 | 
147 |     @property
148 |     def statistics(self) -> dict:
149 |         return self.json.get("statistics", {})
150 | 
151 | 
152 | class QueryPipeNdjsonResponse(QueryPipeResponse):
153 |     @property
154 |     def data(self) -> list[dict]:
155 |         """Parses the NDJSON response body into a list of dictionaries."""
156 |         return [
157 |             json.loads(line)
158 |             for line in self.text.strip().splitlines()
159 |         ]
160 | 
161 | 
162 | class QueryPipeCsvResponse(QueryPipeResponse):
163 |     @property
164 |     def data(self) -> list[dict]:
165 |         """Parses the CSV response body into a list of dictionaries."""
166 |         reader = csv.DictReader(self.text.splitlines())
167 |         return list(reader)
168 | 
169 | 
170 | class PipesApi(Api):
171 |     """
172 |     Pipes API.
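    A short usage sketch (illustrative; the token is a placeholder and ``simple_kv`` is the sample pipe from the
    integration-test project)::

        pipes = PipesApi(token="p.ey...")
        result = pipes.query("simple_kv")  # defaults to format="json"
        rows = result.data  # parsed list of dict records
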
See https://www.tinybird.co/docs/api-reference/pipe-api 173 | 174 | TODO: missing APIs to implement 175 | * Creating a new pipe (POST /v0/pipes) 176 | * Append a node to a pipe (POST /v0/pipes/:name/nodes) 177 | * Delete a node from a pipe (DELETE /v0/pipes/:name/nodes/:node_id) 178 | * Update a node in a pipe (PUT /v0/pipes/:name/nodes/:node_id) 179 | * Delete a pipe (DELETE /v0/pipes/:name) 180 | * Change a pipe's metadata (PUT /v0/pipes/:name) 181 | * Explain a pipe (GET /v0/pipes/:name/explain) 182 | """ 183 | 184 | endpoint: str = "/v0/pipes" 185 | 186 | session: requests.Session 187 | 188 | def __init__(self, token: str, host: str = None): 189 | super().__init__(token, host) 190 | 191 | self.session = requests.Session() 192 | if self.token: 193 | self.session.headers.update({"Authorization": f"Bearer {self.token}"}) 194 | 195 | def list( 196 | self, 197 | dependencies: bool = False, 198 | attrs: list[str] = None, 199 | node_attrs: list[str] = None, 200 | ) -> ListPipesResponse: 201 | """ 202 | Get a list of pipes in your account. Makes a GET request to ``/v0/pipes`` endpoint, which returns a list of 203 | pipes. 204 | 205 | :param dependencies: Include dependent data sources and pipes, default is false 206 | :param attrs: List of pipe attributes to return (e.g. '["name","description"]') 207 | :param node_attrs: List of node attributes to return (e.g. '["id","name"]') 208 | :return: A ``ListPipesResponse`` object 209 | """ 210 | params = {} 211 | if dependencies: 212 | params["dependencies"] = "true" 213 | if attrs: 214 | params["attrs"] = ",".join(attrs) 215 | if node_attrs: 216 | params["node_attrs"] = ",".join(node_attrs) 217 | 218 | response = self.session.request( 219 | method="GET", 220 | url=f"{self.host}{self.endpoint}", 221 | params=params, 222 | ) 223 | 224 | if not response.ok: 225 | raise ApiError(response) 226 | 227 | return ListPipesResponse(response) 228 | 229 | def query( 230 | self, 231 | name: str, 232 | query: str = None, 233 | parameters: dict[str, str] = None, 234 | format: PipeOutputFormat = "json", 235 | ) -> QueryPipeResponse | QueryPipeJsonResponse | QueryPipeNdjsonResponse | QueryPipeCsvResponse: 236 | """ 237 | Query the Pipe. Makes a GET request to ``/v0/pipes/.`` endpoint, which returns the query result. 238 | The return value depends on the format parameter. Currently, parquet and prometheus formats are only supported 239 | as raw outputs. For all others you can call ``response.data`` and receive a list of dictionary records. 240 | 241 | When using an additional SQL query (through the ``query`` parameter) for the Pipe, you can use the 242 | ``_`` shortcut, which refers to your Pipe name. You can pass both ``parameters`` and ``query``. 243 | 244 | :param name: The name of the pipe to query. 245 | :param query: Optional query to execute against the pipe. 246 | :param parameters: The dynamic parameters passed to the pipe. 247 | :param format: The output format (default: json). 248 | :return: A ``QueryPipeResponse`` object that is specific to the output format. 249 | """ 250 | 251 | params = dict(parameters) if parameters else {} 252 | if query: 253 | params["q"] = query 254 | 255 | # if the query is too large, the web server (nginx) will respond with "414 Request-URI Too Large". it seems 256 | # this limit is around 8kb, so if it's too large, we use a POST request instead. 257 | qsize = 1 # include the "?" 
character 258 | for k, v in params.items(): 259 | qsize += len(k) + len(v) + 2 # include the ``&`` and ``=`` character 260 | 261 | if qsize > 8192: 262 | response = self.session.request( 263 | method="POST", 264 | url=f"{self.host}{self.endpoint}/{name}.{format}", 265 | data=params, 266 | ) 267 | else: 268 | response = self.session.request( 269 | method="GET", 270 | url=f"{self.host}{self.endpoint}/{name}.{format}", 271 | params=params, 272 | ) 273 | 274 | if response.status_code == 404: 275 | raise PipeNotFoundError(response) 276 | 277 | if not response.ok: 278 | raise ApiError(response) 279 | 280 | # format-specific response objects 281 | if format == "json": 282 | return QueryPipeJsonResponse(response) 283 | if format == "ndjson": 284 | return QueryPipeNdjsonResponse(response) 285 | if format == "csv": 286 | return QueryPipeCsvResponse(response) 287 | 288 | # prometheus and parquet formats are currently only supported as raw outputs 289 | 290 | return QueryPipeResponse(response) 291 | 292 | def get_information(self, name: str) -> GetPipeInformationResponse: 293 | """ 294 | Makes a GET request to ``/v0/pipes/`` endpoint, which returns the pipe information. 295 | See: https://www.tinybird.co/docs/api-reference/pipe-api#get--v0-pipes-(.+\.pipe) 296 | 297 | :param name: The name or ID of the pipe. 298 | :return: A ``GetPipeInformationResponse`` object 299 | """ 300 | response = self.session.request( 301 | method="GET", 302 | url=f"{self.host}{self.endpoint}/{name}", 303 | ) 304 | 305 | if response.status_code == 404: 306 | raise PipeNotFoundError(response) 307 | 308 | if not response.ok: 309 | raise ApiError(response) 310 | 311 | return GetPipeInformationResponse(response) 312 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /verdin/api/datasources.py: -------------------------------------------------------------------------------- 1 | from typing import Literal, Iterable, TypedDict 2 | 3 | import requests 4 | 5 | from .base import Api, ApiError, ApiResponse 6 | 7 | 8 | class DataSourceNotFoundError(ApiError): 9 | """ 10 | Specific ApiError representing a 404 Not Found when database names are given. 11 | """ 12 | 13 | 14 | class DataSourceInfo(TypedDict): 15 | """ 16 | A data source info object. 
Example:: 17 | 18 | { 19 | "cluster": "tinybird", 20 | "created_at": "2025-12-17 13:18:09.799040", 21 | "description": "Simple Key-Value Data Source", 22 | "engine": { 23 | "engine": "MergeTree", 24 | "engine_full": "MergeTree ORDER BY tuple()", 25 | "sorting_key": "tuple()" 26 | }, 27 | "errors_discarded_at": null, 28 | "headers": {}, 29 | "id": "t_e1ea6e1e32004989af509b034b0987c1", 30 | "indexes": [], 31 | "last_commit": { 32 | "content_sha": "", 33 | "path": "", 34 | "status": "ok" 35 | }, 36 | "name": "simple", 37 | "new_columns_detected": false, 38 | "project": null, 39 | "replicated": false, 40 | "schema": { 41 | "columns": [ 42 | { 43 | "auto": false, 44 | "codec": null, 45 | "default_value": null, 46 | "jsonpath": "$.Id", 47 | "name": "id", 48 | "normalized_name": "id", 49 | "nullable": false, 50 | "type": "UUID" 51 | }, 52 | { 53 | "auto": false, 54 | "codec": null, 55 | "default_value": null, 56 | "jsonpath": "$.Timestamp", 57 | "name": "timestamp", 58 | "normalized_name": "timestamp", 59 | "nullable": false, 60 | "type": "DateTime64(6)" 61 | }, 62 | { 63 | "auto": false, 64 | "codec": null, 65 | "default_value": null, 66 | "jsonpath": "$.Key", 67 | "name": "key", 68 | "normalized_name": "key", 69 | "nullable": false, 70 | "type": "String" 71 | }, 72 | { 73 | "auto": false, 74 | "codec": null, 75 | "default_value": null, 76 | "jsonpath": "$.Value", 77 | "name": "value", 78 | "normalized_name": "value", 79 | "nullable": false, 80 | "type": "String" 81 | } 82 | ], 83 | "sql_schema": "`id` UUID `json:$.Id`, `timestamp` DateTime64(6) `json:$.Timestamp`, `key` String `json:$.Key`, `value` String `json:$.Value`" 84 | }, 85 | "shared_with": [], 86 | "statistics": { 87 | "bytes": 0, 88 | "row_count": 0 89 | }, 90 | "tags": {}, 91 | "type": "ndjson", 92 | "updated_at": "2025-12-17 13:18:09.799040", 93 | "used_by": [ 94 | { 95 | "id": "t_c50152ced57b46b99acf14930b9c6906", 96 | "name": "simple_kv" 97 | } 98 | ], 99 | "version": 0 100 | } 101 | """ 102 | 103 | cluster: str 104 | created_at: str 105 | description: str 106 | engine: dict 107 | errors_discarded_at: str | None 108 | headers: dict 109 | id: str 110 | indexes: list 111 | last_commit: dict 112 | name: str 113 | new_columns_detected: bool 114 | project: str | None 115 | replicated: bool 116 | schema: dict 117 | shared_with: list 118 | statistics: dict 119 | tags: dict 120 | type: str 121 | updated_at: str 122 | used_by: list[dict] 123 | version: int 124 | 125 | 126 | class DataSourceAppendInfo(TypedDict): 127 | """Information about a data source returned when appending to the data source.""" 128 | 129 | cluster: str 130 | created_at: str 131 | description: str 132 | engine: dict # TODO: {'engine': 'MergeTree', 'sorting_key': 'tuple()'} 133 | errors_discarded_at: str | None 134 | headers: dict 135 | id: str 136 | last_commit: dict # TODO: {'content_sha': '', 'path': '', 'status': 'ok'} 137 | name: str 138 | project: str | None 139 | replicated: bool 140 | shared_with: list # TODO 141 | tags: dict 142 | type: str 143 | updated_at: str 144 | used_by: list # TODO 145 | version: int 146 | 147 | 148 | class ListDataSourcesResponse(ApiResponse): 149 | @property 150 | def datasources(self) -> list[DataSourceInfo]: 151 | return self.json.get("datasources", []) 152 | 153 | 154 | class AppendDataResponse(ApiResponse): 155 | @property 156 | def datasource(self) -> DataSourceAppendInfo: 157 | return self.json.get("datasource", {}) 158 | 159 | @property 160 | def import_id(self) -> str: 161 | return self.json.get("import_id") 162 | 163 | 
@property 164 | def invalid_lines(self) -> int: 165 | return self.json.get("invalid_lines") 166 | 167 | @property 168 | def quarantine_rows(self) -> int: 169 | return self.json.get("quarantine_rows") 170 | 171 | @property 172 | def error(self) -> str | None: 173 | error = self.json.get("error") 174 | if not error: 175 | return None 176 | return error 177 | 178 | 179 | class GetDataSourceInformationResponse(ApiResponse): 180 | @property 181 | def info(self) -> DataSourceInfo: 182 | """ 183 | Returns the data source information. 184 | 185 | Example:: 186 | 187 | { 188 | "id": "t_bd1c62b5e67142bd9bf9a7f113a2b6ea", 189 | "name": "datasource_name", 190 | "statistics": { 191 | "bytes": 430833, 192 | "row_count": 3980 193 | }, 194 | "used_by": [{ 195 | "id": "t_efdc62b5e67142bd9bf9a7f113a34353", 196 | "name": "pipe_using_datasource_name" 197 | }] 198 | "updated_at": "2018-09-07 23:50:32.322461", 199 | "created_at": "2018-11-28 23:50:32.322461", 200 | "type": "csv" 201 | } 202 | 203 | """ 204 | return self.json 205 | 206 | 207 | class DataSourcesApi(Api): 208 | """ 209 | ``/v0/datasources`` API client. 210 | 211 | TODO: missing APIs: 212 | * Creating data sources (POST /v0/datasources with mode=create) 213 | * Replacing data sources (POST /v0/datasources with mode=replace) 214 | * Alter data source (POST /v0/datasources/:name/alter) 215 | * Delete data (POST /v0/datasources/:name/delete) 216 | * Drop data source (DELETE /v0/datasources/:name) 217 | * Update data source attributes (PUT /v0/datasources/:name) 218 | """ 219 | 220 | endpoint: str = "/v0/datasources" 221 | 222 | session: requests.Session 223 | 224 | def __init__(self, token: str, host: str = None): 225 | super().__init__(token, host) 226 | 227 | self.session = requests.Session() 228 | if self.token: 229 | self.session.headers.update({"Authorization": f"Bearer {self.token}"}) 230 | 231 | def append( 232 | self, 233 | name: str, 234 | data: str | bytes | Iterable[bytes] | Iterable[str], 235 | dialect_delimiter: str = None, 236 | dialect_new_line: str = None, 237 | dialect_escapechar: str = None, 238 | progress: bool = False, 239 | format: Literal["csv", "ndjson", "parquet"] = None, 240 | ) -> AppendDataResponse: 241 | """ 242 | Makes a POST request to ``/v0/datasources`` endpoint with mode=append, which appends data to the datasource. 243 | 244 | The data is expected to already be encoded in the format specified by the format parameter. You can pass 245 | generators or other iterables as data. For example:: 246 | 247 | records = [...] # some list of dicts 248 | 249 | def _data(): 250 | # creates an NDJSON stream 251 | for r in records: 252 | yield json.dumps(r) + "\\n" 253 | 254 | response = ds.append("my_table", _data(), format="ndjson") 255 | 256 | :param name: Name of the data source to append data to. 257 | :param data: Data to append. 258 | :param dialect_delimiter: The one-character string separating the fields. We try to guess the delimiter based 259 | on the CSV contents using some statistics, but sometimes we fail to identify the correct one. If you know 260 | your CSV’s field delimiter, you can use this parameter to explicitly define it. 261 | :param dialect_new_line: The one- or two-character string separating the records. We try to guess the delimiter 262 | based on the CSV contents using some statistics, but sometimes we fail to identify the correct one. If you 263 | know your CSV’s record delimiter, you can use this parameter to explicitly define it. 
264 | :param dialect_escapechar: The escapechar removes any special meaning from the following character. This is 265 | useful if the CSV does not use double quotes to encapsulate a column but uses double quotes in the content 266 | of a column and it is escaped with, e.g. a backslash. 267 | :param progress: When using true and sending the data in the request body, Tinybird will return block status 268 | while loading using Line-delimited JSON. TODO: currently not supported 269 | :param format: Default: csv. Indicates the format of the data to be ingested in the Data Source. By default is 270 | csv and you should specify format=ndjson for NDJSON format, and format=parquet for Parquet files. 271 | :return: A ``AppendDataResponse`` object. 272 | """ 273 | if progress: 274 | raise NotImplementedError 275 | 276 | params = { 277 | "mode": "append", 278 | "name": name, 279 | } 280 | 281 | if dialect_delimiter: 282 | params["dialect_delimiter"] = dialect_delimiter 283 | if dialect_new_line: 284 | params["dialect_new_line"] = dialect_new_line 285 | if dialect_escapechar: 286 | params["dialect_escapechar"] = dialect_escapechar 287 | if format: 288 | params["format"] = format 289 | 290 | headers = {} 291 | if format == "csv": 292 | headers["Content-Type"] = "text/html; charset=utf-8" 293 | if format == "ndjson": 294 | headers["Content-Type"] = "application/x-ndjson; charset=utf-8" 295 | 296 | response = self.session.request( 297 | method="POST", 298 | url=f"{self.host}{self.endpoint}", 299 | params=params, 300 | headers=headers, 301 | data=data, 302 | ) 303 | 304 | if not response.ok: 305 | raise ApiError(response) 306 | 307 | return AppendDataResponse(response) 308 | 309 | def list(self) -> ListDataSourcesResponse: 310 | """ 311 | Makes a GET request to ``/v0/datasources`` endpoint, which returns a list of datasources. 312 | 313 | :return: A ``ListDataSourcesResponse`` object 314 | """ 315 | response = self.session.request( 316 | method="GET", 317 | url=f"{self.host}{self.endpoint}", 318 | ) 319 | 320 | if not response.ok: 321 | raise ApiError(response) 322 | 323 | return ListDataSourcesResponse(response) 324 | 325 | def get_information(self, name: str) -> GetDataSourceInformationResponse: 326 | """ 327 | Makes a GET request to ``/v0/datasources/`` endpoint, which returns information about the datasource. 328 | 329 | :param name: The name of the datasource to get information about. 330 | :return: A ``GetDataSourceInformationResponse`` 331 | """ 332 | response = self.session.request( 333 | method="GET", 334 | url=f"{self.host}{self.endpoint}/{name}", 335 | ) 336 | 337 | if response.status_code == 404: 338 | raise DataSourceNotFoundError(response) 339 | 340 | if not response.ok: 341 | raise ApiError(response) 342 | 343 | return GetDataSourceInformationResponse(response) 344 | 345 | def truncate(self, name: str): 346 | """ 347 | Makes a POST request to ``/v0/datasources/:name/truncate``, which truncates the datasource. 348 | 349 | :param name: The name of the datasource to truncate. 
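
        Example (illustrative; the token is a placeholder and ``simple`` is the sample datasource from the
        integration-test project)::

            ds = DataSourcesApi(token="p.ey...")
            ds.truncate("simple")  # removes all rows; raises DataSourceNotFoundError if the datasource does not exist
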
350 | """ 351 | response = self.session.request( 352 | method="POST", 353 | url=f"{self.host}{self.endpoint}/{name}/truncate", 354 | ) 355 | 356 | if response.status_code == 404: 357 | raise DataSourceNotFoundError(response) 358 | 359 | if not response.ok: 360 | raise ApiError(response) 361 | -------------------------------------------------------------------------------- /tests/integration/project/CLAUDE.md: -------------------------------------------------------------------------------- 1 | 2 | # Tinybird CLI rules 3 | 4 | ## Commands 5 | You have commands at your disposal to develop a tinybird project: 6 | - tb build: to build the project locally and check it works. 7 | - tb deployment create --wait --auto: to create a deployment and promote it automatically 8 | - tb test run: to run existing tests 9 | - tb endpoint url : to get the url of an endpoint, token included. 10 | - tb endpoint data : to get the data of an endpoint. You can pass parameters to the endpoint like this: tb endpoint data --param1 value1 --param2 value2 11 | - tb token ls: to list all the tokens 12 | There are other commands that you can use, but these are the most common ones. Run `tb -h` to see all the commands if needed. 13 | When you need to work with resources or data in cloud, add always the --cloud flag before the command. Example: tb --cloud datasource ls 14 | 15 | ## Development instructions 16 | - When asking to create a tinybird data project, if the needed folders are not already created, use the following structure: 17 | ├── connections 18 | ├── copies 19 | ├── sinks 20 | ├── datasources 21 | ├── endpoints 22 | ├── fixtures 23 | ├── materializations 24 | ├── pipes 25 | └── tests 26 | - The local development server will be available at http://localhost:7181. Even if some response uses another base url, use always http://localhost:7181. 27 | - After every change in your .datasource, .pipe or .ndjson files, run `tb build` to build the project locally. 28 | - When you need to ingest data locally in a datasource, create a .ndjson file with the same name of the datasource and the data you want and run `tb build` so the data is ingested. 29 | - The format of the generated api endpoint urls is: http://localhost:7181/v0/pipe/.json?token= 30 | - Before running the tests, remember to have the project built with `tb build` with the latest changes. 31 | 32 | When asking for ingesting data, adding data or appending data do the following depending on the environment you want to work with: 33 | 34 | ## Ingestion instructions 35 | - When building locally, create a .ndjson file with the data you want to ingest and do `tb build` to ingest the data in the build env. 36 | - We call `cloud` the production environment. 37 | - When appending data in cloud, use `tb --cloud datasource append ` 38 | - When you have a response that says “there are rows in quarantine”, do `tb [--cloud] datasource data _quarantine` to understand what is the problem. 39 | 40 | ## .datasource file instructions 41 | Follow these instructions when creating or updating .datasource files: 42 | 43 | 44 | - Content cannot be empty. 45 | - The datasource names must be unique. 46 | - No indentation is allowed for property names: DESCRIPTION, SCHEMA, ENGINE, ENGINE_PARTITION_KEY, ENGINE_SORTING_KEY, etc. 47 | - Use MergeTree engine by default. 48 | - Use AggregatingMergeTree engine when the datasource is the target of a materialized pipe. 49 | - Use always json paths to define the schema. 
Example: `user_id` String `json:$.user_id`, 50 | - Array columns are supported with a special syntax. Example: `items` Array(String) `json:$.items[:]` 51 | - If the datasource is using an S3 or GCS connection, they need to set IMPORT_CONNECTION_NAME, IMPORT_BUCKET_URI and IMPORT_SCHEDULE (GCS @on-demand only, S3 supports @auto too) 52 | - If the datasource is using a Kafka connection, they need to set KAFKA_CONNECTION_NAME as the name of the .connection file, KAFKA_TOPIC topic_name and KAFKA_GROUP_ID as the group id for the datasource 53 | - Unless the user asks for them, do not include ENGINE_PARTITION_KEY and ENGINE_PRIMARY_KEY. 54 | - DateTime64 type without precision is not supported. Use DateTime64(3) instead. 55 | 56 | 57 | 58 | ## .pipe file instructions 59 | Follow these instructions when creating or updating .pipe files: 60 | 61 | Follow these instructions when creating or updating any type of .pipe file: 62 | 63 | - The pipe names must be unique. 64 | - Nodes do NOT use the same name as the Pipe they belong to. So if the pipe name is "my_pipe", the nodes must be named different like "my_pipe_node_1", "my_pipe_node_2", etc. 65 | - Node names MUST be different from the resource names in the project. 66 | - No indentation is allowed for property names: DESCRIPTION, NODE, SQL, TYPE, etc. 67 | - Allowed TYPE values are: endpoint, copy, materialized, sink. 68 | - Add always the output node in the TYPE section or in the last node of the pipe. 69 | 70 | 71 | 72 | 73 | - The SQL query must be a valid ClickHouse SQL query that mixes ClickHouse syntax and Tinybird templating syntax (Tornado templating language under the hood). 74 | - SQL queries with parameters must start with "%" character and a newline on top of every query to be able to use the parameters. Examples: 75 | 76 | SELECT * FROM events WHERE session_id={{String(my_param, "default_value")}} 77 | 78 | 79 | % 80 | SELECT * FROM events WHERE session_id={{String(my_param, "default_value")}} 81 | 82 | - The Parameter functions like this one {{String(my_param_name,default_value)}} can be one of the following: String, DateTime, Date, Float32, Float64, Int, Integer, UInt8, UInt16, UInt32, UInt64, UInt128, UInt256, Int8, Int16, Int32, Int64, Int128, Int256 83 | - Parameter names must be different from column names. Pass always the param name and a default value to the function. 84 | - Use ALWAYS hardcoded values for default values for parameters. 85 | - Code inside the template {{template_expression}} follows the rules of Tornado templating language so no module is allowed to be imported. So for example you can't use now() as default value for a DateTime parameter. You need an if else block like this: 86 | 87 | AND timestamp BETWEEN {DateTime(start_date, now() - interval 30 day)} AND {DateTime(end_date, now())} 88 | 89 | 90 | {%if not defined(start_date)%} 91 | timestamp BETWEEN now() - interval 30 day 92 | {%else%} 93 | timestamp BETWEEN {{DateTime(start_date)}} 94 | {%end%} 95 | {%if not defined(end_date)%} 96 | AND now() 97 | {%else%} 98 | AND {{DateTime(end_date)}} 99 | {%end%} 100 | 101 | - Parameters must not be quoted. 102 | - When you use defined function with a paremeter inside, do NOT add quotes around the parameter: 103 | {% if defined('my_param') %} 104 | {% if defined(my_param) %} 105 | - Use datasource names as table names when doing SELECT statements. 106 | - Do not use pipe names as table names. 
107 | - The available datasource names to use in the SQL are the ones present in the existing_resources section or the ones you will create. 108 | - Use node names as table names only when nodes are present in the same file. 109 | - Do not reference the current node name in the SQL. 110 | - SQL queries only accept SELECT statements with conditions, aggregations, joins, etc. 111 | - Do NOT use CREATE TABLE, INSERT INTO, CREATE DATABASE, etc. 112 | - Use ONLY SELECT statements in the SQL section. 113 | - INSERT INTO is not supported in SQL section. 114 | - ClickHouse functions supported are: 115 | - General functions supported are: ['BLAKE3', 'CAST', 'CHARACTER_LENGTH', 'CHAR_LENGTH', 'CRC32', 'CRC32IEEE', 'CRC64', 'DATABASE', 'DATE', 'DATE_DIFF', 'DATE_FORMAT', 'DATE_TRUNC', 'DAY', 'DAYOFMONTH', 'DAYOFWEEK', 'DAYOFYEAR', 'FORMAT_BYTES', 'FQDN', 'FROM_BASE64', 'FROM_DAYS', 'FROM_UNIXTIME', 'HOUR', 'INET6_ATON', 'INET6_NTOA', 'INET_ATON', 'INET_NTOA', 'IPv4CIDRToRange', 'IPv4NumToString', 'IPv4NumToStringClassC', 'IPv4StringToNum', 'IPv4StringToNumOrDefault', 'IPv4StringToNumOrNull', 'IPv4ToIPv6', 'IPv6CIDRToRange', 'IPv6NumToString', 'IPv6StringToNum', 'IPv6StringToNumOrDefault', 'IPv6StringToNumOrNull', 'JSONArrayLength', 'JSONExtract', 'JSONExtractArrayRaw', 'JSONExtractBool', 'JSONExtractFloat', 'JSONExtractInt', 'JSONExtractKeys', 'JSONExtractKeysAndValues', 'JSONExtractKeysAndValuesRaw', 'JSONExtractRaw', 'JSONExtractString', 'JSONExtractUInt', 'JSONHas', 'JSONKey', 'JSONLength', 'JSONRemoveDynamoDBAnnotations', 'JSONType', 'JSON_ARRAY_LENGTH', 'JSON_EXISTS', 'JSON_QUERY', 'JSON_VALUE', 'L1Distance', 'L1Norm', 'L1Normalize', 'L2Distance', 'L2Norm', 'L2Normalize', 'L2SquaredDistance', 'L2SquaredNorm', 'LAST_DAY', 'LinfDistance', 'LinfNorm', 'LinfNormalize', 'LpDistance', 'LpNorm', 'LpNormalize', 'MACNumToString', 'MACStringToNum', 'MACStringToOUI', 'MAP_FROM_ARRAYS', 'MD4', 'MD5', 'MILLISECOND', 'MINUTE', 'MONTH', 'OCTET_LENGTH', 'QUARTER', 'REGEXP_EXTRACT', 'REGEXP_MATCHES', 'REGEXP_REPLACE', 'SCHEMA', 'SECOND', 'SHA1', 'SHA224', 'SHA256', 'SHA384', 'SHA512', 'SHA512_256', 'SUBSTRING_INDEX', 'SVG', 'TIMESTAMP_DIFF', 'TO_BASE64', 'TO_DAYS', 'TO_UNIXTIME', 'ULIDStringToDateTime', 'URLHash', 'URLHierarchy', 'URLPathHierarchy', 'UTCTimestamp', 'UTC_timestamp', 'UUIDNumToString', 'UUIDStringToNum', 'UUIDToNum', 'UUIDv7ToDateTime', 'YEAR', 'YYYYMMDDToDate', 'YYYYMMDDToDate32', 'YYYYMMDDhhmmssToDateTime', 'YYYYMMDDhhmmssToDateTime64'] 116 | - Character insensitive functions supported are: ['cast', 'character_length', 'char_length', 'crc32', 'crc32ieee', 'crc64', 'database', 'date', 'date_format', 'date_trunc', 'day', 'dayofmonth', 'dayofweek', 'dayofyear', 'format_bytes', 'fqdn', 'from_base64', 'from_days', 'from_unixtime', 'hour', 'inet6_aton', 'inet6_ntoa', 'inet_aton', 'inet_ntoa', 'json_array_length', 'last_day', 'millisecond', 'minute', 'month', 'octet_length', 'quarter', 'regexp_extract', 'regexp_matches', 'regexp_replace', 'schema', 'second', 'substring_index', 'to_base64', 'to_days', 'to_unixtime', 'utctimestamp', 'utc_timestamp', 'year'] 117 | - Aggregate functions supported are: ['BIT_AND', 'BIT_OR', 'BIT_XOR', 'COVAR_POP', 'COVAR_SAMP', 'STD', 'STDDEV_POP', 'STDDEV_SAMP', 'VAR_POP', 'VAR_SAMP', 'aggThrow', 'analysisOfVariance', 'anova', 'any', 'anyHeavy', 'anyLast', 'anyLast_respect_nulls', 'any_respect_nulls', 'any_value', 'any_value_respect_nulls', 'approx_top_count', 'approx_top_k', 'approx_top_sum', 'argMax', 'argMin', 'array_agg', 'array_concat_agg', 'avg', 'avgWeighted', 
'boundingRatio', 'categoricalInformationValue', 'contingency', 'corr', 'corrMatrix', 'corrStable', 'count', 'covarPop', 'covarPopMatrix', 'covarPopStable', 'covarSamp', 'covarSampMatrix', 'covarSampStable', 'cramersV', 'cramersVBiasCorrected', 'deltaSum', 'deltaSumTimestamp', 'dense_rank', 'entropy', 'exponentialMovingAverage', 'exponentialTimeDecayedAvg', 'exponentialTimeDecayedCount', 'exponentialTimeDecayedMax', 'exponentialTimeDecayedSum', 'first_value', 'first_value_respect_nulls', 'flameGraph', 'groupArray', 'groupArrayInsertAt', 'groupArrayIntersect', 'groupArrayLast', 'groupArrayMovingAvg', 'groupArrayMovingSum', 'groupArraySample', 'groupArraySorted', 'groupBitAnd', 'groupBitOr', 'groupBitXor', 'groupBitmap', 'groupBitmapAnd', 'groupBitmapOr', 'groupBitmapXor', 'groupUniqArray', 'histogram', 'intervalLengthSum', 'kolmogorovSmirnovTest', 'kurtPop', 'kurtSamp', 'lagInFrame', 'largestTriangleThreeBuckets', 'last_value', 'last_value_respect_nulls', 'leadInFrame', 'lttb', 'mannWhitneyUTest', 'max', 'maxIntersections', 'maxIntersectionsPosition', 'maxMappedArrays', 'meanZTest', 'median', 'medianBFloat16', 'medianBFloat16Weighted', 'medianDD', 'medianDeterministic', 'medianExact', 'medianExactHigh', 'medianExactLow', 'medianExactWeighted', 'medianGK', 'medianInterpolatedWeighted', 'medianTDigest', 'medianTDigestWeighted', 'medianTiming', 'medianTimingWeighted', 'min', 'minMappedArrays', 'nonNegativeDerivative', 'nothing', 'nothingNull', 'nothingUInt64', 'nth_value', 'ntile', 'quantile', 'quantileBFloat16', 'quantileBFloat16Weighted', 'quantileDD', 'quantileDeterministic', 'quantileExact', 'quantileExactExclusive', 'quantileExactHigh', 'quantileExactInclusive', 'quantileExactLow', 'quantileExactWeighted', 'quantileGK', 'quantileInterpolatedWeighted', 'quantileTDigest', 'quantileTDigestWeighted', 'quantileTiming', 'quantileTimingWeighted', 'quantiles', 'quantilesBFloat16', 'quantilesBFloat16Weighted', 'quantilesDD', 'quantilesDeterministic', 'quantilesExact', 'quantilesExactExclusive', 'quantilesExactHigh', 'quantilesExactInclusive', 'quantilesExactLow', 'quantilesExactWeighted', 'quantilesGK', 'quantilesInterpolatedWeighted', 'quantilesTDigest', 'quantilesTDigestWeighted', 'quantilesTiming', 'quantilesTimingWeighted', 'rank', 'rankCorr', 'retention', 'row_number', 'sequenceCount', 'sequenceMatch', 'sequenceNextNode', 'simpleLinearRegression', 'singleValueOrNull', 'skewPop', 'skewSamp', 'sparkBar', 'sparkbar', 'stddevPop', 'stddevPopStable', 'stddevSamp', 'stddevSampStable', 'stochasticLinearRegression', 'stochasticLogisticRegression', 'studentTTest', 'sum', 'sumCount', 'sumKahan', 'sumMapFiltered', 'sumMapFilteredWithOverflow', 'sumMapWithOverflow', 'sumMappedArrays', 'sumWithOverflow', 'theilsU', 'topK', 'topKWeighted', 'uniq', 'uniqCombined', 'uniqCombined64', 'uniqExact', 'uniqHLL12', 'uniqTheta', 'uniqUpTo', 'varPop', 'varPopStable', 'varSamp', 'varSampStable', 'welchTTest', 'windowFunnel'] 118 | - How to use ClickHouse supported functions: 119 | - When using functions try always ClickHouse functions first, then SQL functions. 120 | - Do not use any ClickHouse function that is not present in the list of general functions, character insensitive functions and aggregate functions. 121 | - If the function is not present in the list, the sql query will fail, so avoid at all costs to use any function that is not present in the list. 122 | - When aliasing a column, use first the column name and then the alias. 123 | - General functions and aggregate functions are case sensitive. 
124 | - Character insensitive functions are case insensitive. 125 | - Parameters are never quoted in any case. 126 | - Use the following syntax in the SQL section for the iceberg table function: iceberg('s3://bucket/path/to/table', {{tb_secret('aws_access_key_id')}}, {{tb_secret('aws_secret_access_key')}}) 127 | - Use the following syntax in the SQL section for the postgres table function: postgresql('host:port', 'database', 'table', {{tb_secret('db_username')}}, {{tb_secret('db_password')}}), 'schema') 128 | 129 | 130 | 131 | 132 | DESCRIPTION > 133 | Some meaningful description of the datasource 134 | 135 | SCHEMA > 136 | `column_name_1` clickhouse_tinybird_compatible_data_type `json:$.column_name_1`, 137 | `column_name_2` clickhouse_tinybird_compatible_data_type `json:$.column_name_2`, 138 | ... 139 | `column_name_n` clickhouse_tinybird_compatible_data_type `json:$.column_name_n` 140 | 141 | ENGINE "MergeTree" 142 | ENGINE_PARTITION_KEY "partition_key" 143 | ENGINE_SORTING_KEY "sorting_key_1, sorting_key_2, ..." 144 | 145 | 146 | 147 | 148 | DESCRIPTION > 149 | Some meaningful description of the pipe 150 | 151 | NODE node_1 152 | SQL > 153 | [sql query using clickhouse syntax and tinybird templating syntax and starting always with SELECT or % 154 | SELECT] 155 | TYPE endpoint 156 | 157 | 158 | 159 | 160 | 161 | - Do not create copy pipes by default, unless the user asks for it. 162 | - Copy pipes should be created in the /copies folder. 163 | - In a .pipe file you can define how to export the result of a Pipe to a Data Source, optionally with a schedule. 164 | - Do not include COPY_SCHEDULE in the .pipe file unless is specifically requested by the user. 165 | - COPY_SCHEDULE is a cron expression that defines the schedule of the copy pipe. 166 | - COPY_SCHEDULE is optional and if not provided, the copy pipe will be executed only once. 167 | - TARGET_DATASOURCE is the name of the Data Source to export the result to. 168 | - TYPE COPY is the type of the pipe and it is mandatory for copy pipes. 169 | - If the copy pipe uses parameters, you must include the % character and a newline on top of every query to be able to use the parameters. 170 | - The content of the .pipe file must follow this format: 171 | DESCRIPTION Copy Pipe to export sales hour every hour to the sales_hour_copy Data Source 172 | 173 | NODE daily_sales 174 | SQL > 175 | % 176 | SELECT toStartOfDay(starting_date) day, country, sum(sales) as total_sales 177 | FROM teams 178 | WHERE 179 | day BETWEEN toStartOfDay(now()) - interval 1 day AND toStartOfDay(now()) 180 | and country = {{ String(country, 'US')}} 181 | GROUP BY day, country 182 | 183 | TYPE COPY 184 | TARGET_DATASOURCE sales_hour_copy 185 | COPY_SCHEDULE 0 * * * * 186 | 187 | 188 | 189 | 190 | - Do not create materialized pipes by default, unless the user asks for it. 191 | - Materialized pipes should be created in the /materializations folder. 192 | - In a .pipe file you can define how to materialize each row ingested in the earliest Data Source in the Pipe query to a materialized Data Source. Materialization happens at ingest. 193 | - DATASOURCE: Required when TYPE is MATERIALIZED. Sets the target Data Source for materialized nodes. 194 | - TYPE MATERIALIZED is the type of the pipe and it is mandatory for materialized pipes. 195 | - The content of the .pipe file must follow the materialized_pipe_content format. 196 | - Use State modifier for the aggregated columns in the pipe. 
197 | 198 | 199 | NODE daily_sales 200 | SQL > 201 | SELECT toStartOfDay(starting_date) day, country, sumState(sales) as total_sales 202 | FROM teams 203 | GROUP BY day, country 204 | 205 | TYPE MATERIALIZED 206 | DATASOURCE sales_by_hour 207 | 208 | 209 | - The target datasource of a materialized pipe must have an AggregatingMergeTree engine. 210 | - Use AggregateFunction for the aggregated columns in the pipe. 211 | - Pipes using a materialized data source must use the Merge modifier in the SQL query for the aggregated columns. Example: sumMerge(total_sales) 212 | - Put all dimensions in the ENGINE_SORTING_KEY, sorted from least to most cardinality. 213 | 214 | 215 | SCHEMA > 216 | `total_sales` AggregateFunction(sum, Float64), 217 | `sales_count` AggregateFunction(count, UInt64), 218 | `column_name_2` AggregateFunction(avg, Float64), 219 | `dimension_1` String, 220 | `dimension_2` String, 221 | ... 222 | `date` DateTime 223 | 224 | ENGINE "AggregatingMergeTree" 225 | ENGINE_PARTITION_KEY "toYYYYMM(date)" 226 | ENGINE_SORTING_KEY "date, dimension_1, dimension_2, ..." 227 | 228 | 229 | 230 | 231 | - Do not create sink pipes by default, unless the user asks for it. 232 | - Sink pipes should be created in the /sinks folder. 233 | - In a .pipe file you can define how to export the result of a Pipe to an external system, optionally with a schedule. 234 | - Valid external systems are Kafka, S3, GCS. 235 | - Sink pipes depend on a connection, if no connection is provided, search for an existing connection that suits the request. If none, create a new connection. 236 | - Do not include EXPORT_SCHEDULE in the .pipe file unless is specifically requested by the user. 237 | - EXPORT_SCHEDULE is a cron expression that defines the schedule of the sink pipe. 238 | - EXPORT_SCHEDULE is optional and if not provided, the sink pipe will be executed only once. 239 | - EXPORT_CONNECTION_NAME is the name of the connection used to export. 240 | - TYPE SINK is the type of the pipe and it is mandatory for sink pipes. 241 | - If the sink pipe uses parameters, you must include the % character and a newline on top of every query to be able to use the parameters. 242 | - The content of the .pipe file must follow this format: 243 | DESCRIPTION Sink Pipe to export sales hour every hour using my_connection 244 | 245 | NODE daily_sales 246 | SQL > 247 | % 248 | SELECT toStartOfDay(starting_date) day, country, sum(sales) as total_sales 249 | FROM teams 250 | WHERE 251 | day BETWEEN toStartOfDay(now()) - interval 1 day AND toStartOfDay(now()) 252 | and country = {{ String(country, 'US')}} 253 | GROUP BY day, country 254 | 255 | TYPE sink 256 | EXPORT_CONNECTION_NAME "my_connection" 257 | EXPORT_BUCKET_URI "s3://tinybird-sinks" 258 | EXPORT_FILE_TEMPLATE "daily_prices" 259 | EXPORT_SCHEDULE "*/5 * * * *" 260 | EXPORT_FORMAT "csv" 261 | EXPORT_COMPRESSION "gz" 262 | EXPORT_STRATEGY "truncate" 263 | 264 | 265 | 266 | 267 | - Content cannot be empty. 268 | - The connection names must be unique. 
269 | - No indentation is allowed for property names 270 | - We support kafka, gcs and s3 connections for now 271 | 272 | 273 | 274 | 275 | TYPE kafka 276 | KAFKA_BOOTSTRAP_SERVERS {{ tb_secret("PRODUCTION_KAFKA_SERVERS", "localhost:9092") }} 277 | KAFKA_SECURITY_PROTOCOL SASL_SSL 278 | KAFKA_SASL_MECHANISM PLAIN 279 | KAFKA_KEY {{ tb_secret("PRODUCTION_KAFKA_USERNAME", "") }} 280 | KAFKA_SECRET {{ tb_secret("PRODUCTION_KAFKA_PASSWORD", "") }} 281 | 282 | 283 | 284 | 285 | TYPE gcs 286 | GCS_SERVICE_ACCOUNT_CREDENTIALS_JSON {{ tb_secret("PRODUCTION_GCS_SERVICE_ACCOUNT_CREDENTIALS_JSON", "") }} 287 | 288 | 289 | 290 | 291 | TYPE gcs 292 | GCS_HMAC_ACCESS_ID {{ tb_secret("gcs_hmac_access_id") }} 293 | GCS_HMAC_SECRET {{ tb_secret("gcs_hmac_secret") }} 294 | 295 | 296 | 297 | 298 | TYPE s3 299 | S3_REGION {{ tb_secret("PRODUCTION_S3_REGION", "") }} 300 | S3_ARN {{ tb_secret("PRODUCTION_S3_ARN", "") }} 301 | 302 | 303 | 304 | ## .test file instructions 305 | Follow these instructions when creating or updating .yaml files for tests: 306 | 307 | - The test file name must match the name of the pipe it is testing. 308 | - Every scenario name must be unique inside the test file. 309 | - When looking for the parameters available, you will find them in the pipes in the following format: {{{{String(my_param_name, default_value)}}}}. 310 | - If there are no parameters, you can omit parameters and generate a single test. 311 | - The format of the parameters is the following: param1=value1¶m2=value2¶m3=value3 312 | - If some parameters are provided by the user and you need to use them, preserve in the same format as they were provided, like case sensitive 313 | - Test as many scenarios as possible. 314 | - The format of the test file is the following: 315 | 316 | - name: kpis_single_day 317 | description: Test hourly granularity for a single day 318 | parameters: date_from=2024-01-01&date_to=2024-01-01 319 | expected_result: | 320 | {"date":"2024-01-01 00:00:00","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 321 | {"date":"2024-01-01 01:00:00","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 322 | 323 | - name: kpis_date_range 324 | description: Test daily granularity for a date range 325 | parameters: date_from=2024-01-01&date_to=2024-01-31 326 | expected_result: | 327 | {"date":"2024-01-01","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 328 | {"date":"2024-01-02","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 329 | 330 | - name: kpis_default_range 331 | description: Test default behavior without date parameters (last 7 days) 332 | parameters: '' 333 | expected_result: | 334 | {"date":"2025-01-10","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 335 | {"date":"2025-01-11","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 336 | 337 | - name: kpis_fixed_time 338 | description: Test with fixed timestamp for consistent testing 339 | parameters: fixed_time=2024-01-15T12:00:00 340 | expected_result: '' 341 | 342 | - name: kpis_single_day 343 | description: Test single day with hourly granularity 344 | parameters: date_from=2024-01-01&date_to=2024-01-01 345 | expected_result: | 346 | {"date":"2024-01-01 00:00:00","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 347 | {"date":"2024-01-01 01:00:00","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 348 | 349 | 350 | 351 | 352 | ## Deployment instructions 353 | Follow these instructions when evolving a datasource schema: 354 | 355 | - 
When you make schema changes that are incompatible with the old schema, you must use a forward query in your data source. Forward queries are necessary when introducing breaking changes. Otherwise, your deployment will fail due to a schema mismatch. 356 | - Forward queries translate the old schema to a new one that you define in the .datasource file. This helps you evolve your schema while continuing to ingest data. 357 | Follow these steps to evolve your schema using a forward query: 358 | - Edit the .datasource file to add a forward query. 359 | - Run tb deploy --check to validate the deployment before creating it. 360 | - Deploy and promote your changes in Tinybird Cloud using {base_command} --cloud deploy. 361 | 362 | SCHEMA > 363 | `timestamp` DateTime `json:$.timestamp`, 364 | `session_id` UUID `json:$.session_id`, 365 | `action` String `json:$.action`, 366 | `version` String `json:$.version`, 367 | `payload` String `json:$.payload` 368 | 369 | FORWARD_QUERY > 370 | select timestamp, toUUID(session_id) as session_id, action, version, payload 371 | 372 | 373 | 374 | -------------------------------------------------------------------------------- /tests/integration/project/.cursorrules: -------------------------------------------------------------------------------- 1 | 2 | You are an expert in SQL and Tinybird. Follow these instructions when working with .datasource and .pipe files: 3 | 4 | 5 | You have commands at your disposal to develop a tinybird project: 6 | - tb build: to build the project locally and check it works. 7 | - tb deployment create --wait --auto: to create a deployment and promote it automatically 8 | - tb test run: to run existing tests 9 | - tb endpoint url : to get the url of an endpoint, token included. 10 | - tb endpoint data : to get the data of an endpoint. You can pass parameters to the endpoint like this: tb endpoint data --param1 value1 --param2 value2 11 | - tb token ls: to list all the tokens 12 | There are other commands that you can use, but these are the most common ones. Run `tb -h` to see all the commands if needed. 13 | When you need to work with resources or data in cloud, add always the --cloud flag before the command. Example: tb --cloud datasource ls 14 | 15 | 16 | - When asking to create a tinybird data project, if the needed folders are not already created, use the following structure: 17 | ├── connections 18 | ├── copies 19 | ├── sinks 20 | ├── datasources 21 | ├── endpoints 22 | ├── fixtures 23 | ├── materializations 24 | ├── pipes 25 | └── tests 26 | - The local development server will be available at http://localhost:7181. Even if some response uses another base url, use always http://localhost:7181. 27 | - After every change in your .datasource, .pipe or .ndjson files, run `tb build` to build the project locally. 28 | - When you need to ingest data locally in a datasource, create a .ndjson file with the same name of the datasource and the data you want and run `tb build` so the data is ingested. 29 | - The format of the generated api endpoint urls is: http://localhost:7181/v0/pipe/.json?token= 30 | - Before running the tests, remember to have the project built with `tb build` with the latest changes. 31 | 32 | When asking for ingesting data, adding data or appending data do the following depending on the environment you want to work with: 33 | 34 | - When building locally, create a .ndjson file with the data you want to ingest and do `tb build` to ingest the data in the build env. 35 | - We call `cloud` the production environment. 
36 | - When appending data in cloud, use `tb --cloud datasource append <datasource_name> <file_or_url>` 37 | - When you have a response that says “there are rows in quarantine”, run `tb [--cloud] datasource data <datasource_name>_quarantine` to understand what the problem is. 38 | 39 | 40 | Follow these instructions when creating or updating .datasource files: 41 | 42 | 43 | - Content cannot be empty. 44 | - The datasource names must be unique. 45 | - No indentation is allowed for property names: DESCRIPTION, SCHEMA, ENGINE, ENGINE_PARTITION_KEY, ENGINE_SORTING_KEY, etc. 46 | - Use MergeTree engine by default. 47 | - Use AggregatingMergeTree engine when the datasource is the target of a materialized pipe. 48 | - Always use JSON paths to define the schema. Example: `user_id` String `json:$.user_id`, 49 | - Array columns are supported with a special syntax. Example: `items` Array(String) `json:$.items[:]` 50 | - If the datasource is using an S3 or GCS connection, you need to set IMPORT_CONNECTION_NAME, IMPORT_BUCKET_URI and IMPORT_SCHEDULE (GCS @on-demand only, S3 supports @auto too) 51 | - If the datasource is using a Kafka connection, you need to set KAFKA_CONNECTION_NAME as the name of the .connection file, KAFKA_TOPIC as the topic name and KAFKA_GROUP_ID as the group id for the datasource 52 | - Unless the user asks for them, do not include ENGINE_PARTITION_KEY and ENGINE_PRIMARY_KEY. 53 | - DateTime64 type without precision is not supported. Use DateTime64(3) instead. 54 | 55 | 56 | 57 | 58 | 59 | Follow these instructions when creating or updating .pipe files: 60 | 61 | Follow these instructions when creating or updating any type of .pipe file: 62 | 63 | - The pipe names must be unique. 64 | - Nodes do NOT use the same name as the Pipe they belong to. So if the pipe name is "my_pipe", the nodes must be named differently, e.g. "my_pipe_node_1", "my_pipe_node_2", etc. 65 | - Node names MUST be different from the resource names in the project. 66 | - No indentation is allowed for property names: DESCRIPTION, NODE, SQL, TYPE, etc. 67 | - Allowed TYPE values are: endpoint, copy, materialized, sink. 68 | - Always add the output node in the TYPE section or in the last node of the pipe. 69 | 70 | 71 | 72 | 73 | - The SQL query must be a valid ClickHouse SQL query that mixes ClickHouse syntax and Tinybird templating syntax (Tornado templating language under the hood). 74 | - SQL queries with parameters must start with the "%" character and a newline on top of every query to be able to use the parameters. Examples: 75 | Incorrect (missing the "%" line): 76 | SELECT * FROM events WHERE session_id={{String(my_param, "default_value")}} 77 | 78 | Correct: 79 | % 80 | SELECT * FROM events WHERE session_id={{String(my_param, "default_value")}} 81 | 82 | - The Parameter functions like this one {{String(my_param_name,default_value)}} can be one of the following: String, DateTime, Date, Float32, Float64, Int, Integer, UInt8, UInt16, UInt32, UInt64, UInt128, UInt256, Int8, Int16, Int32, Int64, Int128, Int256 83 | - Parameter names must be different from column names. Always pass the param name and a default value to the function. 84 | - ALWAYS use hardcoded values for parameter default values. 85 | - Code inside the template {{template_expression}} follows the rules of the Tornado templating language, so no module is allowed to be imported. For example, you can't use now() as the default value for a DateTime parameter.
You need an if/else block like this: 86 | Incorrect (now() used as a default value): 87 | AND timestamp BETWEEN {{DateTime(start_date, now() - interval 30 day)}} AND {{DateTime(end_date, now())}} 88 | 89 | Correct: 90 | {%if not defined(start_date)%} 91 | timestamp BETWEEN now() - interval 30 day 92 | {%else%} 93 | timestamp BETWEEN {{DateTime(start_date)}} 94 | {%end%} 95 | {%if not defined(end_date)%} 96 | AND now() 97 | {%else%} 98 | AND {{DateTime(end_date)}} 99 | {%end%} 100 | 101 | - Parameters must not be quoted. 102 | - When you use the defined function with a parameter inside, do NOT add quotes around the parameter: 103 | Incorrect: {% if defined('my_param') %} 104 | Correct: {% if defined(my_param) %} 105 | - Use datasource names as table names when doing SELECT statements. 106 | - Do not use pipe names as table names. 107 | - The available datasource names to use in the SQL are the ones present in the existing_resources section or the ones you will create. 108 | - Use node names as table names only when nodes are present in the same file. 109 | - Do not reference the current node name in the SQL. 110 | - SQL queries only accept SELECT statements with conditions, aggregations, joins, etc. 111 | - Do NOT use CREATE TABLE, INSERT INTO, CREATE DATABASE, etc. 112 | - Use ONLY SELECT statements in the SQL section. 113 | - INSERT INTO is not supported in the SQL section. 114 | - ClickHouse functions supported are: 115 | - General functions supported are: ['BLAKE3', 'CAST', 'CHARACTER_LENGTH', 'CHAR_LENGTH', 'CRC32', 'CRC32IEEE', 'CRC64', 'DATABASE', 'DATE', 'DATE_DIFF', 'DATE_FORMAT', 'DATE_TRUNC', 'DAY', 'DAYOFMONTH', 'DAYOFWEEK', 'DAYOFYEAR', 'FORMAT_BYTES', 'FQDN', 'FROM_BASE64', 'FROM_DAYS', 'FROM_UNIXTIME', 'HOUR', 'INET6_ATON', 'INET6_NTOA', 'INET_ATON', 'INET_NTOA', 'IPv4CIDRToRange', 'IPv4NumToString', 'IPv4NumToStringClassC', 'IPv4StringToNum', 'IPv4StringToNumOrDefault', 'IPv4StringToNumOrNull', 'IPv4ToIPv6', 'IPv6CIDRToRange', 'IPv6NumToString', 'IPv6StringToNum', 'IPv6StringToNumOrDefault', 'IPv6StringToNumOrNull', 'JSONArrayLength', 'JSONExtract', 'JSONExtractArrayRaw', 'JSONExtractBool', 'JSONExtractFloat', 'JSONExtractInt', 'JSONExtractKeys', 'JSONExtractKeysAndValues', 'JSONExtractKeysAndValuesRaw', 'JSONExtractRaw', 'JSONExtractString', 'JSONExtractUInt', 'JSONHas', 'JSONKey', 'JSONLength', 'JSONRemoveDynamoDBAnnotations', 'JSONType', 'JSON_ARRAY_LENGTH', 'JSON_EXISTS', 'JSON_QUERY', 'JSON_VALUE', 'L1Distance', 'L1Norm', 'L1Normalize', 'L2Distance', 'L2Norm', 'L2Normalize', 'L2SquaredDistance', 'L2SquaredNorm', 'LAST_DAY', 'LinfDistance', 'LinfNorm', 'LinfNormalize', 'LpDistance', 'LpNorm', 'LpNormalize', 'MACNumToString', 'MACStringToNum', 'MACStringToOUI', 'MAP_FROM_ARRAYS', 'MD4', 'MD5', 'MILLISECOND', 'MINUTE', 'MONTH', 'OCTET_LENGTH', 'QUARTER', 'REGEXP_EXTRACT', 'REGEXP_MATCHES', 'REGEXP_REPLACE', 'SCHEMA', 'SECOND', 'SHA1', 'SHA224', 'SHA256', 'SHA384', 'SHA512', 'SHA512_256', 'SUBSTRING_INDEX', 'SVG', 'TIMESTAMP_DIFF', 'TO_BASE64', 'TO_DAYS', 'TO_UNIXTIME', 'ULIDStringToDateTime', 'URLHash', 'URLHierarchy', 'URLPathHierarchy', 'UTCTimestamp', 'UTC_timestamp', 'UUIDNumToString', 'UUIDStringToNum', 'UUIDToNum', 'UUIDv7ToDateTime', 'YEAR', 'YYYYMMDDToDate', 'YYYYMMDDToDate32', 'YYYYMMDDhhmmssToDateTime', 'YYYYMMDDhhmmssToDateTime64'] 116 | - Character insensitive functions supported are: ['cast', 'character_length', 'char_length', 'crc32', 'crc32ieee', 'crc64', 'database', 'date', 'date_format', 'date_trunc', 'day', 'dayofmonth', 'dayofweek', 'dayofyear', 'format_bytes', 'fqdn', 'from_base64', 'from_days', 'from_unixtime', 'hour', 'inet6_aton',
'inet6_ntoa', 'inet_aton', 'inet_ntoa', 'json_array_length', 'last_day', 'millisecond', 'minute', 'month', 'octet_length', 'quarter', 'regexp_extract', 'regexp_matches', 'regexp_replace', 'schema', 'second', 'substring_index', 'to_base64', 'to_days', 'to_unixtime', 'utctimestamp', 'utc_timestamp', 'year'] 117 | - Aggregate functions supported are: ['BIT_AND', 'BIT_OR', 'BIT_XOR', 'COVAR_POP', 'COVAR_SAMP', 'STD', 'STDDEV_POP', 'STDDEV_SAMP', 'VAR_POP', 'VAR_SAMP', 'aggThrow', 'analysisOfVariance', 'anova', 'any', 'anyHeavy', 'anyLast', 'anyLast_respect_nulls', 'any_respect_nulls', 'any_value', 'any_value_respect_nulls', 'approx_top_count', 'approx_top_k', 'approx_top_sum', 'argMax', 'argMin', 'array_agg', 'array_concat_agg', 'avg', 'avgWeighted', 'boundingRatio', 'categoricalInformationValue', 'contingency', 'corr', 'corrMatrix', 'corrStable', 'count', 'covarPop', 'covarPopMatrix', 'covarPopStable', 'covarSamp', 'covarSampMatrix', 'covarSampStable', 'cramersV', 'cramersVBiasCorrected', 'deltaSum', 'deltaSumTimestamp', 'dense_rank', 'entropy', 'exponentialMovingAverage', 'exponentialTimeDecayedAvg', 'exponentialTimeDecayedCount', 'exponentialTimeDecayedMax', 'exponentialTimeDecayedSum', 'first_value', 'first_value_respect_nulls', 'flameGraph', 'groupArray', 'groupArrayInsertAt', 'groupArrayIntersect', 'groupArrayLast', 'groupArrayMovingAvg', 'groupArrayMovingSum', 'groupArraySample', 'groupArraySorted', 'groupBitAnd', 'groupBitOr', 'groupBitXor', 'groupBitmap', 'groupBitmapAnd', 'groupBitmapOr', 'groupBitmapXor', 'groupUniqArray', 'histogram', 'intervalLengthSum', 'kolmogorovSmirnovTest', 'kurtPop', 'kurtSamp', 'lagInFrame', 'largestTriangleThreeBuckets', 'last_value', 'last_value_respect_nulls', 'leadInFrame', 'lttb', 'mannWhitneyUTest', 'max', 'maxIntersections', 'maxIntersectionsPosition', 'maxMappedArrays', 'meanZTest', 'median', 'medianBFloat16', 'medianBFloat16Weighted', 'medianDD', 'medianDeterministic', 'medianExact', 'medianExactHigh', 'medianExactLow', 'medianExactWeighted', 'medianGK', 'medianInterpolatedWeighted', 'medianTDigest', 'medianTDigestWeighted', 'medianTiming', 'medianTimingWeighted', 'min', 'minMappedArrays', 'nonNegativeDerivative', 'nothing', 'nothingNull', 'nothingUInt64', 'nth_value', 'ntile', 'quantile', 'quantileBFloat16', 'quantileBFloat16Weighted', 'quantileDD', 'quantileDeterministic', 'quantileExact', 'quantileExactExclusive', 'quantileExactHigh', 'quantileExactInclusive', 'quantileExactLow', 'quantileExactWeighted', 'quantileGK', 'quantileInterpolatedWeighted', 'quantileTDigest', 'quantileTDigestWeighted', 'quantileTiming', 'quantileTimingWeighted', 'quantiles', 'quantilesBFloat16', 'quantilesBFloat16Weighted', 'quantilesDD', 'quantilesDeterministic', 'quantilesExact', 'quantilesExactExclusive', 'quantilesExactHigh', 'quantilesExactInclusive', 'quantilesExactLow', 'quantilesExactWeighted', 'quantilesGK', 'quantilesInterpolatedWeighted', 'quantilesTDigest', 'quantilesTDigestWeighted', 'quantilesTiming', 'quantilesTimingWeighted', 'rank', 'rankCorr', 'retention', 'row_number', 'sequenceCount', 'sequenceMatch', 'sequenceNextNode', 'simpleLinearRegression', 'singleValueOrNull', 'skewPop', 'skewSamp', 'sparkBar', 'sparkbar', 'stddevPop', 'stddevPopStable', 'stddevSamp', 'stddevSampStable', 'stochasticLinearRegression', 'stochasticLogisticRegression', 'studentTTest', 'sum', 'sumCount', 'sumKahan', 'sumMapFiltered', 'sumMapFilteredWithOverflow', 'sumMapWithOverflow', 'sumMappedArrays', 'sumWithOverflow', 'theilsU', 'topK', 'topKWeighted', 'uniq', 'uniqCombined', 
'uniqCombined64', 'uniqExact', 'uniqHLL12', 'uniqTheta', 'uniqUpTo', 'varPop', 'varPopStable', 'varSamp', 'varSampStable', 'welchTTest', 'windowFunnel'] 118 | - How to use ClickHouse supported functions: 119 | - When using functions, always try ClickHouse functions first, then SQL functions. 120 | - Do not use any ClickHouse function that is not present in the list of general functions, character insensitive functions and aggregate functions. 121 | - If the function is not present in the list, the SQL query will fail, so avoid at all costs using any function that is not present in the list. 122 | - When aliasing a column, put the column name first and then the alias. 123 | - General functions and aggregate functions are case sensitive. 124 | - Character insensitive functions are case insensitive. 125 | - Parameters are never quoted in any case. 126 | - Use the following syntax in the SQL section for the iceberg table function: iceberg('s3://bucket/path/to/table', {{tb_secret('aws_access_key_id')}}, {{tb_secret('aws_secret_access_key')}}) 127 | - Use the following syntax in the SQL section for the postgres table function: postgresql('host:port', 'database', 'table', {{tb_secret('db_username')}}, {{tb_secret('db_password')}}, 'schema') 128 | 129 | 130 | 131 | 132 | DESCRIPTION > 133 | Some meaningful description of the datasource 134 | 135 | SCHEMA > 136 | `column_name_1` clickhouse_tinybird_compatible_data_type `json:$.column_name_1`, 137 | `column_name_2` clickhouse_tinybird_compatible_data_type `json:$.column_name_2`, 138 | ... 139 | `column_name_n` clickhouse_tinybird_compatible_data_type `json:$.column_name_n` 140 | 141 | ENGINE "MergeTree" 142 | ENGINE_PARTITION_KEY "partition_key" 143 | ENGINE_SORTING_KEY "sorting_key_1, sorting_key_2, ..." 144 | 145 | 146 | 147 | 148 | DESCRIPTION > 149 | Some meaningful description of the pipe 150 | 151 | NODE node_1 152 | SQL > 153 | [sql query using clickhouse syntax and tinybird templating syntax, always starting with SELECT or with % 154 | SELECT] 155 | TYPE endpoint 156 | 157 | 158 | 159 | 160 | 161 | - Do not create copy pipes by default, unless the user asks for it. 162 | - Copy pipes should be created in the /copies folder. 163 | - In a .pipe file you can define how to export the result of a Pipe to a Data Source, optionally with a schedule. 164 | - Do not include COPY_SCHEDULE in the .pipe file unless it is specifically requested by the user. 165 | - COPY_SCHEDULE is a cron expression that defines the schedule of the copy pipe. 166 | - COPY_SCHEDULE is optional and, if not provided, the copy pipe will be executed only once. 167 | - TARGET_DATASOURCE is the name of the Data Source to export the result to. 168 | - TYPE COPY is the type of the pipe and it is mandatory for copy pipes. 169 | - If the copy pipe uses parameters, you must include the % character and a newline on top of every query to be able to use the parameters, as sketched below.
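As a minimal sketch of that parameter rule (node, column and parameter names here are hypothetical; the full required format follows next), the SQL section of a copy node simply gains the % line on top:

NODE filtered_events_copy_node
SQL >
    %
    SELECT timestamp, session_id, action
    FROM events
    WHERE action = {{String(action_filter, 'page_view')}}

The TYPE COPY, TARGET_DATASOURCE and optional COPY_SCHEDULE properties are unaffected by the % line.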
170 | - The content of the .pipe file must follow this format: 171 | DESCRIPTION Copy Pipe to export hourly sales every hour to the sales_hour_copy Data Source 172 | 173 | NODE daily_sales 174 | SQL > 175 | % 176 | SELECT toStartOfDay(starting_date) day, country, sum(sales) as total_sales 177 | FROM teams 178 | WHERE 179 | day BETWEEN toStartOfDay(now()) - interval 1 day AND toStartOfDay(now()) 180 | and country = {{ String(country, 'US')}} 181 | GROUP BY day, country 182 | 183 | TYPE COPY 184 | TARGET_DATASOURCE sales_hour_copy 185 | COPY_SCHEDULE 0 * * * * 186 | 187 | 188 | 189 | 190 | - Do not create materialized pipes by default, unless the user asks for it. 191 | - Materialized pipes should be created in the /materializations folder. 192 | - In a .pipe file you can define how each row ingested into the earliest Data Source of the Pipe's query is materialized into a target Data Source. Materialization happens at ingest time. 193 | - DATASOURCE: Required when TYPE is MATERIALIZED. Sets the target Data Source for materialized nodes. 194 | - TYPE MATERIALIZED is the type of the pipe and it is mandatory for materialized pipes. 195 | - The content of the .pipe file must follow the materialized_pipe_content format. 196 | - Use the State modifier for the aggregated columns in the pipe. 197 | 198 | 199 | NODE daily_sales 200 | SQL > 201 | SELECT toStartOfDay(starting_date) day, country, sumState(sales) as total_sales 202 | FROM teams 203 | GROUP BY day, country 204 | 205 | TYPE MATERIALIZED 206 | DATASOURCE sales_by_hour 207 | 208 | 209 | - The target datasource of a materialized pipe must have an AggregatingMergeTree engine. 210 | - Use AggregateFunction for the aggregated columns in the pipe. 211 | - Pipes using a materialized data source must use the Merge modifier in the SQL query for the aggregated columns. Example: sumMerge(total_sales) (a fuller endpoint sketch appears a bit further below). 212 | - Put all dimensions in the ENGINE_SORTING_KEY, sorted from lowest to highest cardinality. 213 | 214 | 215 | SCHEMA > 216 | `total_sales` AggregateFunction(sum, Float64), 217 | `sales_count` AggregateFunction(count, UInt64), 218 | `column_name_2` AggregateFunction(avg, Float64), 219 | `dimension_1` String, 220 | `dimension_2` String, 221 | ... 222 | `date` DateTime 223 | 224 | ENGINE "AggregatingMergeTree" 225 | ENGINE_PARTITION_KEY "toYYYYMM(date)" 226 | ENGINE_SORTING_KEY "date, dimension_1, dimension_2, ..." 227 | 228 | 229 | 230 | 231 | - Do not create sink pipes by default, unless the user asks for it. 232 | - Sink pipes should be created in the /sinks folder. 233 | - In a .pipe file you can define how to export the result of a Pipe to an external system, optionally with a schedule. 234 | - Valid external systems are Kafka, S3, GCS. 235 | - Sink pipes depend on a connection; if no connection is provided, search for an existing connection that suits the request. If none exists, create a new connection. 236 | - Do not include EXPORT_SCHEDULE in the .pipe file unless it is specifically requested by the user. 237 | - EXPORT_SCHEDULE is a cron expression that defines the schedule of the sink pipe. 238 | - EXPORT_SCHEDULE is optional and, if not provided, the sink pipe will be executed only once. 239 | - EXPORT_CONNECTION_NAME is the name of the connection used to export. 240 | - TYPE SINK is the type of the pipe and it is mandatory for sink pipes. 241 | - If the sink pipe uses parameters, you must include the % character and a newline on top of every query to be able to use the parameters.
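Complementing the Merge rule from the materialized pipe instructions above, a minimal, hypothetical endpoint sketch reading from the sales_by_hour target (column names reuse the materialized example; adjust to your schema) could look like this:

DESCRIPTION >
    Hypothetical endpoint reading a materialized target with the -Merge combinator

NODE sales_report_node
SQL >
    SELECT day, country, sumMerge(total_sales) as total_sales
    FROM sales_by_hour
    GROUP BY day, country

TYPE endpoint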
242 | - The content of the .pipe file must follow this format: 243 | DESCRIPTION Sink Pipe to export hourly sales every hour using my_connection 244 | 245 | NODE daily_sales 246 | SQL > 247 | % 248 | SELECT toStartOfDay(starting_date) day, country, sum(sales) as total_sales 249 | FROM teams 250 | WHERE 251 | day BETWEEN toStartOfDay(now()) - interval 1 day AND toStartOfDay(now()) 252 | and country = {{ String(country, 'US')}} 253 | GROUP BY day, country 254 | 255 | TYPE sink 256 | EXPORT_CONNECTION_NAME "my_connection" 257 | EXPORT_BUCKET_URI "s3://tinybird-sinks" 258 | EXPORT_FILE_TEMPLATE "daily_prices" 259 | EXPORT_SCHEDULE "*/5 * * * *" 260 | EXPORT_FORMAT "csv" 261 | EXPORT_COMPRESSION "gz" 262 | EXPORT_STRATEGY "truncate" 263 | 264 | 265 | 266 | 267 | - Content cannot be empty. 268 | - The connection names must be unique. 269 | - No indentation is allowed for property names 270 | - We support kafka, gcs and s3 connections for now 271 | 272 | 273 | 274 | 275 | TYPE kafka 276 | KAFKA_BOOTSTRAP_SERVERS {{ tb_secret("PRODUCTION_KAFKA_SERVERS", "localhost:9092") }} 277 | KAFKA_SECURITY_PROTOCOL SASL_SSL 278 | KAFKA_SASL_MECHANISM PLAIN 279 | KAFKA_KEY {{ tb_secret("PRODUCTION_KAFKA_USERNAME", "") }} 280 | KAFKA_SECRET {{ tb_secret("PRODUCTION_KAFKA_PASSWORD", "") }} 281 | 282 | 283 | 284 | 285 | TYPE gcs 286 | GCS_SERVICE_ACCOUNT_CREDENTIALS_JSON {{ tb_secret("PRODUCTION_GCS_SERVICE_ACCOUNT_CREDENTIALS_JSON", "") }} 287 | 288 | 289 | 290 | 291 | TYPE gcs 292 | GCS_HMAC_ACCESS_ID {{ tb_secret("gcs_hmac_access_id") }} 293 | GCS_HMAC_SECRET {{ tb_secret("gcs_hmac_secret") }} 294 | 295 | 296 | 297 | 298 | TYPE s3 299 | S3_REGION {{ tb_secret("PRODUCTION_S3_REGION", "") }} 300 | S3_ARN {{ tb_secret("PRODUCTION_S3_ARN", "") }} 301 | 302 | 303 | 304 | 305 | Follow these instructions when creating or updating .yaml files for tests: 306 | 307 | - The test file name must match the name of the pipe it is testing. 308 | - Every scenario name must be unique inside the test file. 309 | - When looking for the available parameters, you will find them in the pipes in the following format: {{String(my_param_name, default_value)}}. 310 | - If there are no parameters, you can omit parameters and generate a single test. 311 | - The format of the parameters is the following: param1=value1&param2=value2&param3=value3 (see the scenario sketch below). 312 | - If some parameters are provided by the user and you need to use them, preserve them in the same format as they were provided, including case. 313 | - Test as many scenarios as possible.
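For example, a hypothetical mapping from pipe parameters to a scenario: if a pipe defines {{String(action_filter, 'page_view')}} and {{Date(date_from, '2024-01-01')}}, one scenario entry could be:

- name: events_by_action_click
  description: Test filtering by the click action from a given date
  parameters: action_filter=click&date_from=2024-01-02
  expected_result: ''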
314 | - The format of the test file is the following: 315 | 316 | - name: kpis_single_day 317 | description: Test hourly granularity for a single day 318 | parameters: date_from=2024-01-01&date_to=2024-01-01 319 | expected_result: | 320 | {"date":"2024-01-01 00:00:00","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 321 | {"date":"2024-01-01 01:00:00","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 322 | 323 | - name: kpis_date_range 324 | description: Test daily granularity for a date range 325 | parameters: date_from=2024-01-01&date_to=2024-01-31 326 | expected_result: | 327 | {"date":"2024-01-01","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 328 | {"date":"2024-01-02","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 329 | 330 | - name: kpis_default_range 331 | description: Test default behavior without date parameters (last 7 days) 332 | parameters: '' 333 | expected_result: | 334 | {"date":"2025-01-10","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 335 | {"date":"2025-01-11","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 336 | 337 | - name: kpis_fixed_time 338 | description: Test with fixed timestamp for consistent testing 339 | parameters: fixed_time=2024-01-15T12:00:00 340 | expected_result: '' 341 | 342 | - name: kpis_single_day_hourly 343 | description: Test single day with hourly granularity 344 | parameters: date_from=2024-01-01&date_to=2024-01-01 345 | expected_result: | 346 | {"date":"2024-01-01 00:00:00","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 347 | {"date":"2024-01-01 01:00:00","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 348 | 349 | 350 | 351 | 352 | 353 | Follow these instructions when evolving a datasource schema: 354 | 355 | - When you make schema changes that are incompatible with the old schema, you must use a forward query in your data source. Forward queries are necessary when introducing breaking changes. Otherwise, your deployment will fail due to a schema mismatch. 356 | - Forward queries translate the old schema to a new one that you define in the .datasource file. This helps you evolve your schema while continuing to ingest data. 357 | Follow these steps to evolve your schema using a forward query: 358 | - Edit the .datasource file to add a forward query. 359 | - Run tb deploy --check to validate the deployment before creating it. 360 | - Deploy and promote your changes in Tinybird Cloud using tb --cloud deploy. 361 | 362 | SCHEMA > 363 | `timestamp` DateTime `json:$.timestamp`, 364 | `session_id` UUID `json:$.session_id`, 365 | `action` String `json:$.action`, 366 | `version` String `json:$.version`, 367 | `payload` String `json:$.payload` 368 | 369 | FORWARD_QUERY > 370 | select timestamp, toUUID(session_id) as session_id, action, version, payload 371 | 372 | 373 | 374 | 375 | --------------------------------------------------------------------------------