├── tests ├── __init__.py ├── integration │ ├── __init__.py │ ├── project │ │ ├── .env.local │ │ ├── .gitignore │ │ ├── endpoints │ │ │ ├── simple_kv.pipe │ │ │ └── simple_pipe.pipe │ │ ├── datasources │ │ │ └── simple.datasource │ │ ├── CLAUDE.md │ │ └── .cursorrules │ ├── test_client.py │ ├── test_tokens.py │ ├── test_pipe.py │ ├── test_datasource.py │ ├── test_events.py │ ├── conftest.py │ ├── test_variables.py │ ├── test_query.py │ ├── test_datasources.py │ └── test_pipes.py ├── test_utils.py ├── test_pipe.py ├── utils.py ├── test_query.py ├── test_datasource.py └── test_worker.py ├── verdin ├── test │ ├── __init__.py │ ├── container.py │ └── cli.py ├── config.py ├── __init__.py ├── api │ ├── __init__.py │ ├── apis.py │ ├── base.py │ ├── events.py │ ├── tokens.py │ ├── variables.py │ ├── query.py │ ├── pipes.py │ └── datasources.py ├── tinybird.py ├── client.py ├── query.py ├── pipe.py ├── datasource.py └── worker.py ├── Makefile ├── .github └── workflows │ └── build.yml ├── pyproject.toml ├── .gitignore ├── README.md └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /verdin/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration/project/.env.local: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration/project/.gitignore: -------------------------------------------------------------------------------- 1 | .tinyb 2 | .terraform 3 | -------------------------------------------------------------------------------- /verdin/config.py: -------------------------------------------------------------------------------- 1 | API_URL: str = "https://api.tinybird.co" 2 | -------------------------------------------------------------------------------- /verdin/__init__.py: -------------------------------------------------------------------------------- 1 | name = "verdin" 2 | 3 | __version__ = "0.5.1" 4 | -------------------------------------------------------------------------------- /verdin/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import ApiError, ApiResponse 2 | 3 | __all__ = [ 4 | "ApiError", 5 | "ApiResponse", 6 | ] 7 | -------------------------------------------------------------------------------- /tests/integration/test_client.py: -------------------------------------------------------------------------------- 1 | def test_client_has_token(client): 2 | """Makes sure the client fixture loaded the admin token correctly""" 3 | assert client.token.startswith("p.e") 4 | -------------------------------------------------------------------------------- /tests/integration/project/endpoints/simple_kv.pipe: -------------------------------------------------------------------------------- 1 | VERSION 0 2 | 3 | DESCRIPTION > 4 | Endpoint to select unique key/value pairs from simple 5 | 6 | NODE endpoint 7 | SQL > 8 | % 9 | SELECT key, value 10 | FROM simple 11 | ORDER BY key, timestamp desc 12 | LIMIT 1 by key 13 | 14 | TYPE 
ENDPOINT 15 | -------------------------------------------------------------------------------- /tests/integration/project/endpoints/simple_pipe.pipe: -------------------------------------------------------------------------------- 1 | VERSION 0 2 | 3 | DESCRIPTION > 4 | Endpoint to select specific keys from the table 5 | 6 | NODE endpoint 7 | SQL > 8 | % 9 | SELECT * 10 | FROM simple 11 | WHERE 1=1 12 | {% if defined(key) %} AND key == {{ String(key) }} {% end %} 13 | 14 | TYPE ENDPOINT 15 | -------------------------------------------------------------------------------- /tests/integration/project/datasources/simple.datasource: -------------------------------------------------------------------------------- 1 | DESCRIPTION > 2 | Simple Key-Value Data Source 3 | 4 | SCHEMA > 5 | id UUID `json:$.Id`, 6 | timestamp DateTime64(6) `json:$.Timestamp`, 7 | key String `json:$.Key`, 8 | value String `json:$.Value` 9 | 10 | ENGINE "MergeTree" 11 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tests.utils import retry 4 | 5 | 6 | def test_retry(): 7 | assert retry(lambda: "foo") == "foo" 8 | assert retry(lambda x: f"foo: {x}", kwargs={"x": "bar"}) == "foo: bar" 9 | assert retry(lambda x: f"foo: {x}", args=("bar",)) == "foo: bar" 10 | 11 | 12 | def test_retry_error(): 13 | def _raise_error(): 14 | raise ValueError("oh noes") 15 | 16 | with pytest.raises(TimeoutError) as e: 17 | retry(_raise_error, retries=2, interval=0.1) 18 | 19 | assert e.match("oh noes") 20 | -------------------------------------------------------------------------------- /verdin/tinybird.py: -------------------------------------------------------------------------------- 1 | from .client import Client 2 | from .datasource import Datasource, Record 3 | from .pipe import Pipe, PipeError, PipeJsonData, PipeJsonResponse, PipeMetadata, PipePageIterator 4 | from .query import OutputFormat, QueryError, QueryJsonResult, SqlQuery 5 | 6 | __all__ = [ 7 | "Client", 8 | "Datasource", 9 | "Record", 10 | "Pipe", 11 | "PipeError", 12 | "PipeMetadata", 13 | "PipeJsonData", 14 | "PipeJsonResponse", 15 | "PipePageIterator", 16 | "SqlQuery", 17 | "QueryError", 18 | "OutputFormat", 19 | "QueryJsonResult", 20 | ] 21 | -------------------------------------------------------------------------------- /tests/test_pipe.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | from verdin.pipe import PagedPipeQuery, PipeMetadata 4 | 5 | 6 | class MockPipeJsonResponse: 7 | def __init__(self, empty: bool, data: Optional[Dict], meta: PipeMetadata): 8 | self.empty = empty 9 | self.data = data 10 | self.meta = meta 11 | 12 | 13 | class TestPagedPipeQuery: 14 | def test(self): 15 | queries = list() 16 | 17 | class MockPipe: 18 | def sql(self, query): 19 | queries.append(query) 20 | 21 | if len(queries) == 2: 22 | return MockPipeJsonResponse(empty=True, data=None, meta=[]) 23 | 24 | return MockPipeJsonResponse(empty=False, data={}, meta=[]) 25 | 26 | for page in PagedPipeQuery(pipe=MockPipe(), page_size=10, start_at=0): 27 | assert page.empty is False 28 | 29 | assert len(queries) == 2 30 | assert queries[0] == "SELECT * FROM _ LIMIT 10 OFFSET 0" 31 | assert queries[1] == "SELECT * FROM _ LIMIT 10 OFFSET 10" 32 | -------------------------------------------------------------------------------- /Makefile: 
-------------------------------------------------------------------------------- 1 | VENV_BIN = python3 -m venv 2 | VENV_DIR ?= .venv 3 | VENV_ACTIVATE = $(VENV_DIR)/bin/activate 4 | VENV_RUN = . $(VENV_ACTIVATE) 5 | 6 | 7 | venv: $(VENV_ACTIVATE) 8 | 9 | $(VENV_ACTIVATE): pyproject.toml 10 | test -d $(VENV_DIR) || $(VENV_BIN) $(VENV_DIR) 11 | $(VENV_RUN); pip install -e ".[dev]" 12 | touch $(VENV_DIR)/bin/activate 13 | 14 | clean: 15 | rm -rf build/ 16 | rm -rf .eggs/ 17 | rm -rf *.egg-info/ 18 | rm -rf .venv 19 | 20 | clean-dist: clean 21 | rm -rf dist/ 22 | 23 | lint: venv 24 | $(VENV_RUN); python -m ruff check . 25 | 26 | format: venv 27 | $(VENV_RUN); python -m ruff format . && python -m ruff check . --fix 28 | 29 | test: venv 30 | $(VENV_RUN); python -m pytest 31 | 32 | test-coverage: venv 33 | $(VENV_RUN); coverage run --source=verdin -m pytest tests && coverage lcov -o .coverage.lcov 34 | 35 | dist: venv 36 | $(VENV_RUN); python -m build 37 | 38 | install: venv 39 | $(VENV_RUN); pip install -e . 40 | 41 | upload: venv 42 | $(VENV_RUN); pip install --upgrade twine; twine upload dist/* 43 | 44 | .PHONY: clean clean-dist format test test-coverage upload 45 | -------------------------------------------------------------------------------- /tests/integration/test_tokens.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from verdin.api import ApiError 4 | 5 | 6 | class TestTokensApi: 7 | def test_list(self, client): 8 | api = client.api.tokens 9 | tokens = api.list().tokens 10 | assert tokens 11 | assert "admin local_testing@tinybird.co" in [token["name"] for token in tokens] 12 | 13 | def test_get_information(self, client): 14 | api = client.api.tokens 15 | 16 | token = api.get_information("admin local_testing@tinybird.co").info 17 | assert token["name"] == "admin local_testing@tinybird.co" 18 | assert token["token"].startswith("p.e") 19 | 20 | # make sure it also works with the id 21 | token = api.get_information(token["id"]).info 22 | assert token["name"] == "admin local_testing@tinybird.co" 23 | 24 | def test_get_information_on_non_existing_token(self, client): 25 | api = client.api.tokens 26 | 27 | with pytest.raises(ApiError) as e: 28 | api.get_information("NON EXISTING TOKEN") 29 | 30 | assert e.match("Token has not enough permissions to get information about this token") 31 | assert e.value.status_code == 403 32 | -------------------------------------------------------------------------------- /verdin/api/apis.py: -------------------------------------------------------------------------------- 1 | from .datasources import DataSourcesApi 2 | from .events import EventsApi 3 | from .pipes import PipesApi 4 | from .query import QueryApi 5 | from .tokens import TokensApi 6 | from .variables import VariablesApi 7 | 8 | 9 | class Apis: 10 | """ 11 | Factory for Api objects. 
12 | """ 13 | 14 | _token: str 15 | _host: str | None 16 | 17 | def __init__(self, token: str, host: str = None): 18 | self._token = token 19 | self._host = host 20 | 21 | @property 22 | def datasources(self) -> DataSourcesApi: 23 | return DataSourcesApi(self._token, self._host) 24 | 25 | @property 26 | def events(self) -> EventsApi: 27 | return EventsApi(self._token, self._host) 28 | 29 | @property 30 | def pipes(self) -> PipesApi: 31 | return PipesApi(self._token, self._host) 32 | 33 | @property 34 | def query(self) -> QueryApi: 35 | return QueryApi(self._token, self._host) 36 | 37 | @property 38 | def tokens(self) -> TokensApi: 39 | return TokensApi(self._token, self._host) 40 | 41 | @property 42 | def variables(self) -> VariablesApi: 43 | return VariablesApi(self._token, self._host) 44 | -------------------------------------------------------------------------------- /tests/integration/test_pipe.py: -------------------------------------------------------------------------------- 1 | class TestPipe: 2 | def test_pipe_query(self, client): 3 | ds = client.datasource("simple") 4 | ds.truncate() 5 | 6 | ds.append_ndjson( 7 | [ 8 | { 9 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 10 | "Timestamp": "2024-01-23T10:30:00.123456", 11 | "Key": "foo", 12 | "Value": "bar", 13 | }, 14 | { 15 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 16 | "Timestamp": "2024-02-23T11:45:00.234567", 17 | "Key": "baz", 18 | "Value": "ed", 19 | }, 20 | { 21 | "Id": "fc71d4d5-7e0c-492a-9e3f-8f1cde9bcfaf", 22 | "Timestamp": "2024-03-23T11:45:00.234567", 23 | "Key": "foo", 24 | "Value": "bar2", 25 | }, 26 | ] 27 | ) 28 | 29 | pipe = client.pipe("simple_kv") 30 | 31 | response = pipe.query() 32 | assert response.data == [ 33 | {"key": "baz", "value": "ed"}, 34 | {"key": "foo", "value": "bar2"}, 35 | ] 36 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - 'README.md' 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | test: 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | matrix: 18 | os: [ ubuntu-latest ] 19 | python-version: [ '3.10', '3.11', '3.12', '3.13' ] 20 | 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v2 24 | 25 | - name: Set up Python 26 | uses: actions/setup-python@v2 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | 30 | - name: Run linting 31 | run: | 32 | make lint 33 | 34 | - name: Run tests 35 | run: | 36 | make test-coverage 37 | 38 | - name: Coveralls Parallel 39 | uses: coverallsapp/github-action@master 40 | with: 41 | github-token: ${{ secrets.GITHUB_TOKEN }} 42 | flag-name: run-${{ matrix.os }}-${{ matrix.python_version }} 43 | path-to-lcov: ./.coverage.lcov 44 | parallel: true 45 | 46 | report: 47 | needs: test 48 | runs-on: ubuntu-latest 49 | steps: 50 | - name: Report coveralls 51 | uses: coverallsapp/github-action@master 52 | with: 53 | github-token: ${{ secrets.GITHUB_TOKEN }} 54 | parallel-finished: true 55 | -------------------------------------------------------------------------------- /verdin/client.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from . 
import config 4 | from .api.apis import Apis 5 | from .datasource import Datasource 6 | from .pipe import Pipe 7 | from .query import OutputFormat, SqlQuery 8 | 9 | 10 | class Client: 11 | """ 12 | Tinybird HTTP client that holds the access token and provides factory methods for resources. 13 | """ 14 | 15 | def __init__(self, token: str, api: str = None): 16 | self.host = (api or config.API_URL).lstrip("/") 17 | self.token = token 18 | self._api = Apis(self.token, self.host) 19 | 20 | @property 21 | def api(self) -> Apis: 22 | """ 23 | Returns an ``Apis`` object that gives you access to the tinybird API objects. 24 | :return: An ``Apis`` object 25 | """ 26 | return self._api 27 | 28 | def pipe(self, name: str, version: int = None) -> Pipe: 29 | """ 30 | Create an object representing a pipe with the given name, e.g., 31 | "localstack_dashboard_events.json" 32 | """ 33 | return Pipe(name, token=self.token, version=version, api=self.host) 34 | 35 | def datasource(self, name: str, version: int = None) -> Datasource: 36 | """ 37 | Create an object representing a datasource with a given name. 38 | """ 39 | return Datasource(name, token=self.token, version=version, api=self.host) 40 | 41 | def sql(self, sql: str, format: Optional[OutputFormat] = None) -> SqlQuery: 42 | return SqlQuery(sql, format=format, token=self.token, api=self.host) 43 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Verdin project configuration 2 | [build-system] 3 | requires = ['hatchling'] 4 | build-backend = "hatchling.build" 5 | 6 | [project] 7 | name = "verdin" 8 | authors = [ 9 | { name = "Thomas Rausch", email = "info@localstack.cloud" } 10 | ] 11 | description = "A Python SDK for Tinybird" 12 | readme = "README.md" 13 | license = "Apache-2.0" 14 | classifiers = [ 15 | "Development Status :: 4 - Beta", 16 | "Operating System :: OS Independent", 17 | "Programming Language :: Python :: 3", 18 | "Programming Language :: Python :: 3.10", 19 | "Programming Language :: Python :: 3.11", 20 | "Programming Language :: Python :: 3.12", 21 | "Programming Language :: Python :: 3.13", 22 | "Topic :: System :: Networking", 23 | "Topic :: Software Development :: Libraries", 24 | "Topic :: Utilities" 25 | ] 26 | requires-python = ">=3.10" 27 | dynamic = ["version"] 28 | 29 | [project.urls] 30 | Repository = "https://github.com/localstack/verdin" 31 | 32 | [project.optional-dependencies] 33 | dev = [ 34 | "pytest>=6.2.4", 35 | "ruff==0.9.1", 36 | "pytest_httpserver>=1.0.1", 37 | "coverage[toml]>=5.0", 38 | "pytest-cov>=2.7.1", 39 | "coveralls", 40 | "tinybird", 41 | ] 42 | 43 | [tool.hatch.version] 44 | path = "verdin/__init__.py" 45 | 46 | [tool.ruff] 47 | line-length = 100 48 | target-version = "py310" 49 | 50 | [tool.coverage.run] 51 | relative_files = true 52 | source = [ 53 | "verdin/" 54 | ] 55 | 56 | [tool.coverage.report] 57 | exclude_lines = [ 58 | "if __name__ == .__main__.:", 59 | "raise NotImplementedError", 60 | "return NotImplemented", 61 | "def __repr__", 62 | ] 63 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import uuid 3 | from typing import Callable 4 | 5 | 6 | def retry( 7 | fn: Callable, 8 | args: tuple = None, 9 | kwargs: dict = None, 10 | retries: int = 3, 11 | interval: float = 1, 12 | ): 13 | """ 14 | Retries the 
execution of a function ``fn`` for a specified number of attempts (``retries``) with a delay 15 | between attempts (``interval``). If all attempts fail, a ``TimeoutError`` is raised indicating the final 16 | error encountered. 17 | 18 | :param fn: The callable function to be executed. 19 | :param args: A tuple of positional arguments to pass to the ``fn``. Defaults to an empty tuple if not provided. 20 | :param kwargs: A dictionary of keyword arguments to pass to the ``fn``. Defaults to an empty dictionary if not 21 | provided. 22 | :param retries: The number of retry attempts before raising a ``TimeoutError``. Defaults to 3. 23 | :param interval: The time (in seconds) to wait between each retry attempt. Defaults to 1.0 seconds. 24 | :return: The result returned by successfully calling the ``fn`` with the specified ``args`` and ``kwargs``. 25 | Returns `None` only if no successful result is obtained after all retry attempts. 26 | """ 27 | args = args or () 28 | kwargs = kwargs or {} 29 | 30 | for i in range(retries): 31 | try: 32 | return fn(*args, **kwargs) 33 | except Exception as e: 34 | if i == retries - 1: 35 | raise TimeoutError(f"Gave up after {retries} retries, final error: {e}") from e 36 | else: 37 | time.sleep(interval) 38 | continue 39 | 40 | return None 41 | 42 | 43 | def short_id() -> str: 44 | return str(uuid.uuid4())[-8:] 45 | -------------------------------------------------------------------------------- /tests/integration/test_datasource.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | LOG = logging.getLogger(__name__) 5 | 6 | 7 | class TestDatasource: 8 | def test_append_ndjson_query_truncate(self, client): 9 | ds = client.datasource("simple") 10 | ds.truncate() 11 | 12 | ds.append_ndjson( 13 | [ 14 | { 15 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 16 | "Timestamp": "2024-01-23T10:30:00.123456", 17 | "Key": "foo", 18 | "Value": "bar", 19 | }, 20 | { 21 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 22 | "Timestamp": "2024-02-23T11:45:00.234567", 23 | "Key": "baz", 24 | "Value": "ed", 25 | }, 26 | ] 27 | ) 28 | 29 | query = client.sql("SELECT * FROM simple") 30 | response = query.json() 31 | assert response.data == [ 32 | { 33 | "id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 34 | "timestamp": "2024-01-23 10:30:00.123456", 35 | "key": "foo", 36 | "value": "bar", 37 | }, 38 | { 39 | "id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 40 | "timestamp": "2024-02-23 11:45:00.234567", 41 | "key": "baz", 42 | "value": "ed", 43 | }, 44 | ] 45 | 46 | query = client.sql("SELECT count(*) as cnt FROM simple") 47 | response = query.json() 48 | assert response.data == [{"cnt": 2}] 49 | 50 | # remove all records from the table 51 | ds.truncate() 52 | 53 | # check that the table is empty 54 | query = client.sql("SELECT count(*) as cnt FROM simple") 55 | response = query.json() 56 | assert response.data == [{"cnt": 0}] 57 | 58 | query = client.sql("SELECT * FROM simple") 59 | response = query.json() 60 | assert response.data == [] 61 | -------------------------------------------------------------------------------- /tests/test_query.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pytest_httpserver import HTTPServer 3 | from werkzeug import Response 4 | 5 | from verdin.query import QueryError, SqlQuery 6 | 7 | _mock_json_response = """{ 8 | "meta": [ 9 | { 10 | "name": "VendorID", 11 | "type": "Int32" 12 | }, 13 | { 14 | "name": "tpep_pickup_datetime", 15 | 
"type": "DateTime" 16 | }, 17 | { 18 | "name": "passenger_count", 19 | "type": "Int32" 20 | } 21 | ], 22 | "data": [ 23 | { 24 | "VendorID": 2, 25 | "tpep_pickup_datetime": "2001-01-05 11:45:23", 26 | "passenger_count": 5 27 | }, 28 | { 29 | "VendorID": 2, 30 | "tpep_pickup_datetime": "2002-12-31 23:01:55", 31 | "passenger_count": 3 32 | } 33 | ], 34 | "rows": 2, 35 | "rows_before_limit_at_least": 4, 36 | "statistics": 37 | { 38 | "elapsed": 0.00091042, 39 | "rows_read": 4, 40 | "bytes_read": 296 41 | } 42 | }""" 43 | 44 | 45 | def test_json(httpserver: HTTPServer): 46 | def handler(request): 47 | return Response(_mock_json_response, 200) 48 | 49 | httpserver.expect_request( 50 | "/v0/sql", query_string={"q": "select * from mytable FORMAT JSON"} 51 | ).respond_with_handler(handler) 52 | 53 | query = SqlQuery("select * from mytable", token="12345", api=httpserver.url_for("/")) 54 | 55 | response = query.json() 56 | 57 | assert response.meta[0] == {"name": "VendorID", "type": "Int32"} 58 | assert len(response.data) == 2 59 | 60 | 61 | def test_json_error(httpserver: HTTPServer): 62 | def handler(request): 63 | return Response('{"error": "invalid datasource"}', 403) 64 | 65 | httpserver.expect_request( 66 | "/v0/sql", query_string={"q": "select * from mytable FORMAT JSON"} 67 | ).respond_with_handler(handler) 68 | 69 | query = SqlQuery("select * from mytable", token="12345", api=httpserver.url_for("/")) 70 | 71 | with pytest.raises(QueryError) as e: 72 | query.json() 73 | e.match("403") 74 | e.match("invalid datasource") 75 | -------------------------------------------------------------------------------- /tests/integration/test_events.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tests.utils import retry 4 | 5 | 6 | class TestEventsApi: 7 | @pytest.mark.parametrize("compress", [True, False]) 8 | def test_events(self, client, compress): 9 | events = client.api.events 10 | 11 | records = [ 12 | { 13 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 14 | "Timestamp": "2024-01-23T10:30:00.123456", 15 | "Key": "foo", 16 | "Value": "bar", 17 | }, 18 | { 19 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 20 | "Timestamp": "2024-02-23T11:45:00.234567", 21 | "Key": "baz", 22 | "Value": "ed", 23 | }, 24 | ] 25 | 26 | response = events.send("simple", records, compress=compress) 27 | 28 | assert response.successful_rows == 2 29 | assert response.quarantined_rows == 0 30 | 31 | def _wait_for_count(cnt: int): 32 | query = client.sql("SELECT count(*) as cnt FROM simple") 33 | assert query.json().data == [{"cnt": cnt}] 34 | 35 | retry(_wait_for_count, args=(2,)) 36 | 37 | def test_events_wait(self, client): 38 | events = client.api.events 39 | 40 | records = [ 41 | { 42 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 43 | "Timestamp": "2024-01-23T10:30:00.123456", 44 | "Key": "foo", 45 | "Value": "bar", 46 | }, 47 | { 48 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 49 | "Timestamp": "2024-02-23T11:45:00.234567", 50 | "Key": "baz", 51 | "Value": "ed", 52 | }, 53 | ] 54 | 55 | response = events.send("simple", records, wait=True) 56 | 57 | assert response.successful_rows == 2 58 | assert response.quarantined_rows == 0 59 | 60 | query = client.sql("SELECT count(*) as cnt FROM simple") 61 | assert query.json().data == [{"cnt": 2}] 62 | -------------------------------------------------------------------------------- /tests/test_datasource.py: -------------------------------------------------------------------------------- 1 | from 
pytest_httpserver import HTTPServer 2 | from werkzeug import Response 3 | 4 | from verdin.datasource import Datasource, FileDatasource 5 | 6 | 7 | class TestDatasource: 8 | def test_to_csv(self): 9 | records = [["a", "1", "{}"], ["b", "2", '{"foo":"bar","baz":"ed"}']] 10 | 11 | csv = Datasource.to_csv(records) 12 | 13 | assert csv == """a,1,{}\nb,2,"{""foo"":""bar"",""baz"":""ed""}"\n""" 14 | 15 | def test_to_csv_with_delimiter(self): 16 | records = [["a", "1", "{}"], ["b", "2", '{"foo":"bar","baz":"ed"}']] 17 | 18 | csv = Datasource.to_csv(records, delimiter=";") 19 | 20 | assert csv == """a;1;{}\nb;2;"{""foo"":""bar"",""baz"":""ed""}"\n""" 21 | 22 | def test_append(self, httpserver: HTTPServer): 23 | ds = Datasource("mydatasource", "123456", api=httpserver.url_for("/")) 24 | 25 | expected_data = '''a,1,{}\nb,2,"{""foo"":""bar"",""baz"":""ed""}"''' 26 | 27 | def handler(request): 28 | actual_data = request.data.decode() 29 | assert expected_data in actual_data 30 | return Response("", 200) 31 | 32 | httpserver.expect_request( 33 | "/v0/datasources", 34 | query_string={ 35 | "name": "mydatasource", 36 | "mode": "append", 37 | "dialect_delimiter": ",", 38 | "format": "csv", 39 | }, 40 | ).respond_with_handler(handler) 41 | 42 | response = ds.append([["a", "1", "{}"], ["b", "2", '{"foo":"bar","baz":"ed"}']]) 43 | httpserver.check() 44 | assert response.ok 45 | 46 | 47 | class TestFileDatasource: 48 | def test_append(self, tmp_path): 49 | file_path = tmp_path / "myfile.csv" 50 | ds = FileDatasource(str(file_path)) 51 | 52 | records = [["a", "1", "{}"], ["b", "2", '{"foo":"bar","baz":"ed"}']] 53 | ds.append(records) 54 | 55 | records = [["c", "3", "{}"]] 56 | ds.append(records) 57 | 58 | expected = """a,1,{}\nb,2,"{""foo"":""bar"",""baz"":""ed""}"\nc,3,{}\n""" 59 | actual = file_path.read_text() 60 | 61 | assert actual == expected 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.iml 3 | *~ 4 | 5 | # General 6 | .DS_Store 7 | .AppleDouble 8 | .LSOverride 9 | 10 | # Icon must end with two \r 11 | Icon 12 | 13 | # Thumbnails 14 | ._* 15 | 16 | # Files that might appear in the root of a volume 17 | .DocumentRevisions-V100 18 | .fseventsd 19 | .Spotlight-V100 20 | .TemporaryItems 21 | .Trashes 22 | .VolumeIcon.icns 23 | .com.apple.timemachine.donotpresent 24 | 25 | # Directories potentially created on remote AFP share 26 | .AppleDB 27 | .AppleDesktop 28 | Network Trash Folder 29 | Temporary Items 30 | .apdisk 31 | 32 | # Byte-compiled / optimized / DLL files 33 | __pycache__/ 34 | *.py[cod] 35 | *$py.class 36 | 37 | # C extensions 38 | *.so 39 | 40 | # Distribution / packaging 41 | .Python 42 | build/ 43 | develop-eggs/ 44 | dist/ 45 | downloads/ 46 | eggs/ 47 | .eggs/ 48 | lib/ 49 | lib64/ 50 | parts/ 51 | sdist/ 52 | var/ 53 | wheels/ 54 | *.egg-info/ 55 | .installed.cfg 56 | *.egg 57 | MANIFEST 58 | 59 | # PyInstaller 60 | # Usually these files are written by a python script from a template 61 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
62 | *.manifest 63 | *.spec 64 | 65 | # Installer logs 66 | pip-log.txt 67 | pip-delete-this-directory.txt 68 | 69 | # Unit test / coverage reports 70 | htmlcov/ 71 | .tox/ 72 | .coverage 73 | .coverage.* 74 | .cache 75 | nosetests.xml 76 | coverage.xml 77 | *.cover 78 | .hypothesis/ 79 | 80 | # Translations 81 | *.mo 82 | *.pot 83 | 84 | # Django stuff: 85 | *.log 86 | .static_storage/ 87 | .media/ 88 | local_settings.py 89 | 90 | # Flask stuff: 91 | instance/ 92 | .webassets-cache 93 | 94 | # Scrapy stuff: 95 | .scrapy 96 | 97 | # Sphinx documentation 98 | docs/_build/ 99 | 100 | # PyBuilder 101 | target/ 102 | 103 | # Jupyter Notebook 104 | .ipynb_checkpoints 105 | 106 | # pyenv 107 | .python-version 108 | 109 | # celery beat schedule file 110 | celerybeat-schedule 111 | 112 | # SageMath parsed files 113 | *.sage.py 114 | 115 | # Environments 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | -------------------------------------------------------------------------------- /tests/integration/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import pytest 5 | 6 | from verdin.api import ApiError 7 | from verdin.client import Client 8 | from verdin.test.cli import TinybirdCli 9 | from verdin.test.container import TinybirdLocalContainer 10 | 11 | # os.environ["SKIP_TINYBIRD_LOCAL_START"] = "1" 12 | 13 | 14 | def _is_skip_tinybird_local_start() -> bool: 15 | """ 16 | Set SKIP_TINYBIRD_LOCAL_START=1 if you have a tb local container running already with the project deployed. This 17 | allows faster iterations. 18 | """ 19 | return os.environ.get("SKIP_TINYBIRD_LOCAL_START") in ["1", "true", "True", True] 20 | 21 | 22 | @pytest.fixture(scope="session") 23 | def client(tinybird_local_container) -> Client: 24 | return tinybird_local_container.client() 25 | 26 | 27 | @pytest.fixture(scope="session") 28 | def cli(tinybird_local_container) -> TinybirdCli: 29 | project_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "project")) 30 | 31 | return TinybirdCli( 32 | host=tinybird_local_container.url, 33 | local=True, 34 | cwd=project_dir, 35 | ) 36 | 37 | 38 | @pytest.fixture(scope="session", autouse=True) 39 | def tinybird_local_container(): 40 | """ 41 | Starts a tinybird local container in the background and waits until it becomes available. 
42 | """ 43 | project_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "project")) 44 | 45 | container = TinybirdLocalContainer(cwd=project_dir) 46 | 47 | if not _is_skip_tinybird_local_start(): 48 | container.start() 49 | 50 | container.wait_is_up() 51 | 52 | yield container 53 | 54 | # cleanup 55 | if not _is_skip_tinybird_local_start(): 56 | container.stop() 57 | 58 | 59 | @pytest.fixture(scope="session", autouse=True) 60 | def deployed_project(cli): 61 | if _is_skip_tinybird_local_start(): 62 | yield 63 | return 64 | 65 | time.sleep(5) 66 | cli.deploy(wait=True, auto=True) 67 | yield 68 | 69 | 70 | @pytest.fixture(autouse=True) 71 | def _truncate_datasource(client): 72 | # make sure to truncate "simple" datasource and its quarantine table before and after each test 73 | 74 | client.api.datasources.truncate("simple") 75 | try: 76 | # also truncate the quarantine table if it exists 77 | client.api.datasources.truncate("simple_quarantine") 78 | except ApiError: 79 | pass 80 | 81 | yield 82 | client.api.datasources.truncate("simple") 83 | 84 | try: 85 | client.api.datasources.truncate("simple_quarantine") 86 | except ApiError: 87 | pass 88 | -------------------------------------------------------------------------------- /verdin/api/base.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | class Api: 5 | DEFAULT_HOST = "https://api.tinybird.co" 6 | 7 | host: str 8 | token: str 9 | 10 | def __init__(self, token: str, host: str = None): 11 | self.token = token 12 | self.host = host or Api.DEFAULT_HOST 13 | 14 | 15 | class ApiResponse: 16 | _response: requests.Response 17 | 18 | def __init__(self, response: requests.Response): 19 | self._response = response 20 | self._json: dict | None = None # cache for json response 21 | 22 | @property 23 | def text(self) -> str: 24 | """ 25 | Returns the body of the HTTP response as a string. 26 | 27 | :return: The response body as a string. 28 | """ 29 | return self._response.text 30 | 31 | @property 32 | def content(self) -> bytes: 33 | """ 34 | Returns the body of the HTTP response as bytes. 35 | 36 | :return: The response body as a bytes. 37 | """ 38 | return self._response.content 39 | 40 | @property 41 | def json(self) -> dict: 42 | """ 43 | Parses the JSON response and returns a dictionary. It caches the result so that later calls to this method 44 | do not parse the response every time. 45 | 46 | :return: The parsed JSON response. 47 | """ 48 | if self._json: 49 | return self._json 50 | 51 | self._json = self._response.json() 52 | return self._json 53 | 54 | 55 | class ApiError(Exception): 56 | """ 57 | Exception that represents a non-200 HTTP response from the API. 58 | """ 59 | 60 | _response: requests.Response 61 | 62 | def __init__(self, response: requests.Response): 63 | self._response = response 64 | super().__init__(self._render_message(response)) 65 | 66 | def _render_message(self, response: requests.Response) -> str: 67 | error = None 68 | documentation = None 69 | 70 | if response.headers.get("Content-Type").startswith("application/json"): 71 | doc = response.json() 72 | error = doc.get("error") 73 | documentation = doc.get("documentation") 74 | 75 | if not error: 76 | error = response.text 77 | 78 | error.rstrip(".") 79 | 80 | message = f"API Error ({response.status_code}): {error}." 
81 | 82 | if documentation: 83 | message += f" Documentation: {documentation}" 84 | 85 | return message 86 | 87 | @property 88 | def status_code(self) -> int: 89 | return self._response.status_code 90 | 91 | @property 92 | def text(self) -> str: 93 | return self._response.text 94 | 95 | @property 96 | def json(self) -> dict: 97 | return self._response.json() 98 | -------------------------------------------------------------------------------- /verdin/test/container.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | 4 | import requests 5 | 6 | from verdin.test.cli import TinybirdCli 7 | from verdin.client import Client 8 | 9 | 10 | class TinybirdLocalContainer: 11 | def __init__(self, cwd: str = None): 12 | """ 13 | Creates a new TinybirdLocalContainer instance. 14 | 15 | :param cwd: The current working directory to use for the tinybird local container. 16 | """ 17 | self.cwd = cwd 18 | self.url = "http://localhost:7181" 19 | self.proc: None | subprocess.Popen = None 20 | 21 | def start(self): 22 | """ 23 | Start the tinybird local container in a background process. 24 | """ 25 | cli = TinybirdCli(cwd=self.cwd, local=True) 26 | self.proc = cli.local_start(daemon=True, skip_new_version=True) 27 | 28 | def client(self) -> Client: 29 | """ 30 | Returns a tinybird Client that connects to this container with admin privileges. 31 | 32 | :return: Tinybird Client 33 | """ 34 | cli = TinybirdCli(host=self.url, cwd=self.cwd, local=True) 35 | 36 | cli_tokens = cli.token_ls() 37 | 38 | # I'm not really sure why this is needed, but when we use a token returned by the /tokens api, the 39 | # client cannot find datasources created through ``tb deploy``. 40 | token_to_use = None 41 | for token in cli_tokens: 42 | if token.name == "admin local_testing@tinybird.co": 43 | token_to_use = token.token 44 | break 45 | 46 | return Client( 47 | token=token_to_use, 48 | api=self.url, 49 | ) 50 | 51 | def wait_is_up(self, timeout: int = 120): 52 | """ 53 | Wait for the container to become available by querying the tokens endpoint. 54 | 55 | :param timeout: Timeout in seconds 56 | :raises TimeoutError: If the container does not appear within the timeout 57 | """ 58 | # Wait for the service to become available 59 | start_time = time.time() 60 | while time.time() - start_time < timeout: 61 | try: 62 | response = requests.get(f"{self.url}/tokens") 63 | if response.status_code == 200: 64 | break 65 | except requests.RequestException: 66 | pass 67 | time.sleep(1) 68 | else: 69 | raise TimeoutError("Tinybird container failed to start within timeout") 70 | 71 | def stop(self): 72 | """ 73 | Stops and removes the tinybird local container. 
74 | """ 75 | cli = TinybirdCli(cwd=self.cwd, local=True) 76 | cli.local_stop() 77 | 78 | if self.proc: 79 | self.proc.kill() 80 | self.proc = None 81 | 82 | cli.local_remove() 83 | -------------------------------------------------------------------------------- /verdin/api/events.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import logging 4 | 5 | import requests 6 | 7 | from .base import Api, ApiError, ApiResponse 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | 12 | class EventsResponse(ApiResponse): 13 | @property 14 | def successful_rows(self) -> int: 15 | return self.json.get("successful_rows") 16 | 17 | @property 18 | def quarantined_rows(self) -> int: 19 | return self.json.get("quarantined_rows") 20 | 21 | 22 | class EventsApi(Api): 23 | endpoint: str = "/v0/events" 24 | 25 | session: requests.Session 26 | 27 | def __init__(self, token: str, host: str = None): 28 | super().__init__(token, host) 29 | 30 | self.session = requests.Session() 31 | if self.token: 32 | self.session.headers.update({"Authorization": f"Bearer {self.token}"}) 33 | 34 | def send( 35 | self, 36 | name: str, 37 | records: list[dict], 38 | wait: bool = False, 39 | json_encoder: type = None, 40 | compress: bool = False, 41 | ) -> EventsResponse: 42 | """ 43 | Makes a POST request to ``/v0/events?name=`` with NDJSON encoded data. 44 | 45 | :param name: Name or ID of the target Data Source to append data to it 46 | :param records: List of JSON records to append. Records will be converted to NDJSON using ``json.dumps`` 47 | :param wait: 'false' by default. Set to 'true' to wait until the write is acknowledged by the database. 48 | Enabling this flag makes it possible to retry on database errors, but it introduces additional latency. 49 | It's recommended to enable it in use cases in which data loss avoidance is critical. Disable it otherwise. 50 | :param json_encoder: The JSON Encoder class passed to ``json.dumps``. Defaults to ``json.JSONEncoder``. 51 | :param compress: Whether to compress the data using gzip. Defaults to False. 
52 | 53 | :return: The EventsResponse 54 | :raises ApiError: If the request failed 55 | """ 56 | url = f"{self.host}{self.endpoint}" 57 | 58 | docs = [json.dumps(doc, cls=json_encoder) for doc in records] 59 | data = "\n".join(docs) 60 | 61 | params = {"name": name} 62 | if wait: 63 | params["wait"] = "true" 64 | 65 | LOG.debug("sending %d ndjson records to %s via %s", len(records), name, url) 66 | 67 | headers = {"Content-Type": "application/x-ndjson"} 68 | 69 | if compress: 70 | headers["Content-Encoding"] = "gzip" 71 | data = gzip.compress(data.encode("utf-8")) 72 | 73 | response = self.session.post( 74 | url=url, 75 | params=params, 76 | headers=headers, 77 | data=data, 78 | ) 79 | 80 | if not response.ok: 81 | raise ApiError(response) 82 | 83 | return EventsResponse(response) 84 | -------------------------------------------------------------------------------- /verdin/api/tokens.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict 2 | 3 | import requests 4 | 5 | from verdin.api import ApiResponse 6 | from verdin.api.base import Api, ApiError 7 | 8 | 9 | class TokenNotFoundError(ApiError): 10 | """Specific ApiError representing a 404 Not Found when token names are given.""" 11 | 12 | 13 | class Scope(TypedDict): 14 | type: str 15 | resource: str | None 16 | filter: str | None 17 | 18 | 19 | class TokenInfo(TypedDict): 20 | id: str 21 | token: str 22 | scopes: list[Scope] 23 | name: str 24 | description: str | None 25 | origin: dict | None 26 | host: str 27 | is_internal: bool 28 | 29 | 30 | class ListTokensResponse(ApiResponse): 31 | @property 32 | def tokens(self) -> list[TokenInfo]: 33 | return self.json.get("tokens", []) 34 | 35 | 36 | class GetTokenInfoResponse(ApiResponse): 37 | @property 38 | def info(self) -> TokenInfo: 39 | return self.json 40 | 41 | 42 | class TokensApi(Api): 43 | """ 44 | Tokens API client. 45 | 46 | TODO: The following APIs are not yet implemented (note that some workspaces only allow resource modification 47 | through deployments anyway) 48 | - Create a new Token: Static or JWT (POST /v0/tokens) 49 | - Refresh a static token (POST /v0/tokens/:name/refresh) 50 | - Delete a Token (DELETE /v0/tokens/:name) 51 | - Modify a Token (PUT /v0/tokens/:name) 52 | """ 53 | 54 | endpoint: str = "/v0/tokens" 55 | 56 | session: requests.Session 57 | 58 | def __init__(self, token: str, host: str = None): 59 | super().__init__(token, host) 60 | 61 | self.session = requests.Session() 62 | if self.token: 63 | self.session.headers.update({"Authorization": f"Bearer {self.token}"}) 64 | 65 | def get_information(self, token: str): 66 | """ 67 | Fetches information about a particular Static Token. Makes a GET request to ``/v0/tokens/:name``. If the token 68 | doesn't exist, a 403 may be returned ("Not enough permissions to get information about this token"). 69 | 70 | :param token: The token identifier. 71 | :return: A ``GetTokenInfoResponse`` object. 72 | """ 73 | response = self.session.request( 74 | method="GET", 75 | url=f"{self.host}{self.endpoint}/{token}", 76 | ) 77 | 78 | if not response.ok: 79 | raise ApiError(response) 80 | 81 | return GetTokenInfoResponse(response) 82 | 83 | def list(self) -> ListTokensResponse: 84 | """ 85 | Retrieves all workspace Static Tokens. Makes a GET request to ``/v0/tokens``. 86 | 87 | :return: A ``ListTokensResponse`` object. 
88 | """ 89 | response = self.session.request( 90 | method="GET", 91 | url=f"{self.host}{self.endpoint}", 92 | ) 93 | 94 | if not response.ok: 95 | raise ApiError(response) 96 | 97 | return ListTokensResponse(response) 98 | -------------------------------------------------------------------------------- /tests/integration/test_variables.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import time 3 | 4 | import pytest 5 | 6 | from tests.utils import short_id 7 | from verdin.api.variables import VariableNotFoundError 8 | 9 | 10 | class TestVariables: 11 | def test_integration(self, client): 12 | # E2E test for variables API 13 | variable_name = f"test_variable_{short_id()}" 14 | variable_value = "test_value" 15 | 16 | # List variables and make sure the variable_name is not in the list 17 | response = client.api.variables.list() 18 | variable_names = [var["name"] for var in response.variables] 19 | assert variable_name not in variable_names 20 | 21 | # Make sure an API Error with 404 is raised when getting a non-existent variable 22 | with pytest.raises(VariableNotFoundError) as e: 23 | client.api.variables.get(variable_name) 24 | assert e.value.status_code == 404 25 | 26 | # Create the variable 27 | create_response = client.api.variables.create(name=variable_name, value=variable_value) 28 | assert create_response.variable["name"] == variable_name 29 | assert create_response.variable["type"] == "secret" 30 | 31 | # List again and check that it's there 32 | list_response = client.api.variables.list() 33 | variable_names = [var["name"] for var in list_response.variables] 34 | assert variable_name in variable_names 35 | 36 | # Get the variable and assert the response 37 | get_response = client.api.variables.get(variable_name) 38 | assert get_response.variable["name"] == variable_name 39 | assert get_response.variable["type"] == "secret" 40 | 41 | # delete the variable and check again 42 | response = client.api.variables.delete(variable_name) 43 | assert response.ok 44 | 45 | with pytest.raises(VariableNotFoundError) as e: 46 | client.api.variables.get(variable_name) 47 | 48 | def test_get_non_existing_variable(self, client): 49 | with pytest.raises(VariableNotFoundError) as e: 50 | client.api.variables.get("non_existing_variable") 51 | 52 | assert e.match("Not found") 53 | assert e.value.status_code == 404 54 | 55 | def test_delete_non_existing_variable(self, client): 56 | with pytest.raises(VariableNotFoundError) as e: 57 | client.api.variables.delete("non_existing_variable") 58 | 59 | assert e.match("Variable not found") 60 | assert e.value.status_code == 404 61 | 62 | def test_update_non_existing_variable(self, client): 63 | with pytest.raises(VariableNotFoundError) as e: 64 | client.api.variables.update("non_existing_variable", value="foo") 65 | 66 | assert e.match("Variable not found") 67 | assert e.value.status_code == 404 68 | 69 | def test_update_variable(self, client): 70 | variable_name = f"test_variable_{short_id()}" 71 | variable_value = "test_value" 72 | 73 | response = client.api.variables.create(name=variable_name, value=variable_value) 74 | 75 | assert datetime.fromisoformat(response.variable["created_at"]).replace( 76 | microsecond=0 77 | ) == datetime.fromisoformat(response.variable["updated_at"]).replace(microsecond=0) 78 | 79 | time.sleep(1) 80 | 81 | response = client.api.variables.update(name=variable_name, value=variable_value + "1") 82 | assert response.variable["created_at"] != 
response.variable["updated_at"] 83 | 84 | client.api.variables.delete(variable_name) 85 | -------------------------------------------------------------------------------- /tests/test_worker.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import time 3 | from queue import Queue 4 | 5 | import requests 6 | 7 | from verdin.datasource import Datasource 8 | from verdin.worker import QueuingDatasourceAppender 9 | 10 | 11 | class QueueingDatasource(Datasource): 12 | def __init__(self, name, queue=None): 13 | super().__init__(name, None) 14 | self.queue = queue or Queue() 15 | 16 | def append(self, records) -> requests.Response: 17 | if records: 18 | self.queue.put(records) 19 | 20 | response = requests.Response() 21 | response.status_code = 200 22 | return response 23 | 24 | 25 | class TestQueuingDatasourceAppender: 26 | def test_batching(self): 27 | source = Queue() 28 | destination = QueueingDatasource("datasource") 29 | 30 | appender = QueuingDatasourceAppender(source, destination) 31 | appender.min_interval = 0 32 | 33 | source.put(("a", 1)) 34 | source.put(("b", 2)) 35 | source.put(("c", 3)) 36 | 37 | thread = threading.Thread(target=appender.run) 38 | thread.start() 39 | 40 | batch = destination.queue.get(timeout=1) 41 | assert len(batch) == 3 42 | assert batch[0] == ("a", 1) 43 | assert batch[1] == ("b", 2) 44 | assert batch[2] == ("c", 3) 45 | 46 | source.put(("d", 4)) 47 | 48 | batch = destination.queue.get(timeout=1) 49 | assert len(batch) == 1 50 | assert batch[0] == ("d", 4) 51 | 52 | appender.close() 53 | thread.join(timeout=2) 54 | assert appender.stopped.is_set() 55 | 56 | def test_stop_while_running(self): 57 | # instrument the queue 58 | source = Queue() 59 | destination = QueueingDatasource("datasource") 60 | appender = QueuingDatasourceAppender(source, destination) 61 | appender.min_interval = 0 62 | 63 | thread = threading.Thread(target=appender.run) 64 | thread.start() 65 | time.sleep(0.2) 66 | 67 | appender.close() 68 | thread.join(timeout=2) 69 | assert appender.stopped.is_set() 70 | 71 | def test_retry(self): 72 | class MockQueueingDatasource(QueueingDatasource): 73 | first_call = True 74 | 75 | def append(self, records) -> requests.Response: 76 | if self.first_call: 77 | self.first_call = False 78 | 79 | response = requests.Response() 80 | response.status_code = 429 81 | response.headers["Retry-After"] = "1" 82 | return response 83 | 84 | return super().append(records) 85 | 86 | source = Queue() 87 | destination = MockQueueingDatasource("datasource") 88 | appender = QueuingDatasourceAppender(source, destination) 89 | appender.min_interval = 0 90 | appender.wait_after_rate_limit = 0.5 91 | 92 | source.put(("a", 1)) 93 | source.put(("b", 2)) 94 | 95 | thread = threading.Thread(target=appender.run) 96 | thread.start() 97 | time.sleep(0.5) 98 | 99 | # should not be batched because we're still retrying with the previous batch 100 | source.put(("c", 3)) 101 | 102 | batch = destination.queue.get(timeout=5) 103 | assert len(batch) == 2 104 | 105 | batch = destination.queue.get(timeout=5) 106 | assert len(batch) == 1 107 | assert batch[0] == ("c", 3) 108 | 109 | appender.close() 110 | thread.join(timeout=5) 111 | assert appender.stopped.is_set() 112 | -------------------------------------------------------------------------------- /verdin/query.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import logging 3 | from typing import Any, Optional, TypedDict 4 | 5 | 
import requests 6 | 7 | from . import config 8 | from .api import ApiError 9 | from .api.query import QueryApi 10 | 11 | LOG = logging.getLogger(__name__) 12 | 13 | 14 | class OutputFormat(enum.Enum): 15 | # https://docs.tinybird.co/api-reference/query-api.html#id6 16 | CSV = "CSV" 17 | CSVWithNames = "CSVWithNames" 18 | JSON = "JSON" 19 | TSV = "TSV" 20 | TSVWithNames = "TSVWithNames" 21 | PrettyCompact = "PrettyCompact" 22 | JSONEachRow = "JSONEachRow" 23 | 24 | 25 | class QueryMetadata(TypedDict): 26 | name: str 27 | type: str 28 | 29 | 30 | class Statistics(TypedDict): 31 | elapsed: float 32 | rows_read: int 33 | bytes_read: int 34 | 35 | 36 | JsonData = dict[str, Any] 37 | QueryJsonData = list[dict[str, Any]] 38 | 39 | 40 | class JsonResult(TypedDict): 41 | meta: list[QueryMetadata] 42 | data: QueryJsonData 43 | rows: int 44 | statistics: Statistics 45 | 46 | 47 | class QueryJsonResult: 48 | response: requests.Response 49 | result: JsonResult 50 | 51 | def __init__(self, response: requests.Response): 52 | self.response = response 53 | self.result = response.json() 54 | 55 | @property 56 | def empty(self): 57 | """ 58 | A property to check if the data in the result is empty. 59 | 60 | This property evaluates whether the "data" field within the "result" 61 | attribute is empty. 62 | 63 | :return: Returns True if the "data" field in "result" is missing or empty, 64 | otherwise returns False. 65 | """ 66 | return not self.result.get("data") 67 | 68 | @property 69 | def meta(self) -> list[QueryMetadata]: 70 | """ 71 | Returns the QueryMetadata from the query, which includes attributes and their types. 72 | 73 | :return: The QueryMetadata 74 | """ 75 | return self.result.get("meta") 76 | 77 | @property 78 | def data(self) -> QueryJsonData: 79 | """ 80 | Returns the data from the query, which is a list of dictionaries representing the rows of the query result. 81 | 82 | :return: The QueryJsonData 83 | """ 84 | return self.result.get("data") 85 | 86 | 87 | class QueryError(Exception): 88 | def __init__(self, response: requests.Response) -> None: 89 | self.response = response 90 | msg = response.text 91 | try: 92 | doc = response.json() 93 | if doc["error"]: 94 | msg = doc["error"] 95 | except Exception: 96 | pass 97 | super().__init__(f"{response.status_code}: {msg}") 98 | 99 | 100 | class SqlQuery: 101 | """ 102 | Tinybird SQL Query. https://docs.tinybird.co/api-reference/query-api.html#get--v0-sql 103 | """ 104 | 105 | endpoint: str = "/v0/sql" 106 | 107 | sql: str 108 | format: Optional[OutputFormat] 109 | 110 | def __init__( 111 | self, sql: str, token, format: Optional[OutputFormat] = None, api: str = None 112 | ) -> None: 113 | self.sql = sql 114 | self.format = format or OutputFormat.JSON 115 | self.token = token 116 | host = (api or config.API_URL).rstrip("/") 117 | self.api = host + self.endpoint 118 | self._query_api = QueryApi(token=token, host=host) 119 | 120 | def get(self, format: Optional[OutputFormat] = None) -> requests.Response: 121 | """ 122 | Runs the query and returns the response. 123 | 124 | TODO: replicate tinybird API concepts instead of returning Response 125 | 126 | :param format: Overwrite the default output format set in the constructor. 
127 | :return: the HTTP response 128 | """ 129 | 130 | LOG.debug( 131 | "querying %s with query: %s", 132 | self.api, 133 | self.sql, 134 | ) 135 | 136 | try: 137 | response = self._query_api.query( 138 | self.sql, 139 | format=(format or self.format).value, 140 | ) 141 | return response._response 142 | except ApiError as e: 143 | raise QueryError(response=e._response) from e 144 | 145 | def json(self) -> QueryJsonResult: 146 | """ 147 | Runs the query and returns the result in JSON output format. 148 | 149 | :return: A QueryJsonResult containing the result of the query. 150 | """ 151 | response = self.get(OutputFormat.JSON) 152 | 153 | return QueryJsonResult(response) 154 | -------------------------------------------------------------------------------- /tests/integration/test_query.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | class TestQueryApi: 5 | @pytest.fixture(autouse=True) 6 | def _put_records(self, client): 7 | client.api.events.send( 8 | "simple", 9 | wait=True, 10 | records=[ 11 | { 12 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 13 | "Timestamp": "2024-01-23T10:30:00.123456", 14 | "Key": "foo", 15 | "Value": "bar", 16 | }, 17 | { 18 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 19 | "Timestamp": "2024-02-23T11:45:00.234567", 20 | "Key": "baz", 21 | "Value": "ed", 22 | }, 23 | ], 24 | ) 25 | 26 | def test_query_datasource_json(self, client): 27 | response = client.api.query.query("SELECT key, value FROM simple ORDER BY `key` ASC") 28 | 29 | assert response.data == [{"key": "baz", "value": "ed"}, {"key": "foo", "value": "bar"}] 30 | assert response.meta == [ 31 | {"name": "key", "type": "String"}, 32 | {"name": "value", "type": "String"}, 33 | ] 34 | assert response.rows == 2 35 | assert response.statistics["rows_read"] == 2 36 | 37 | def test_query_pipe(self, client): 38 | response = client.api.query.query("SELECT * FROM simple_kv ORDER BY `key` ASC") 39 | 40 | assert response.data == [{"key": "baz", "value": "ed"}, {"key": "foo", "value": "bar"}] 41 | assert response.meta == [ 42 | {"name": "key", "type": "String"}, 43 | {"name": "value", "type": "String"}, 44 | ] 45 | assert response.rows == 2 46 | assert response.statistics["rows_read"] == 2 47 | 48 | def test_query_pipe_parameters(self, client): 49 | response = client.api.query.query( 50 | "SELECT key, value FROM simple_pipe", parameters={"key": "foo"} 51 | ) 52 | 53 | assert response.data == [{"key": "foo", "value": "bar"}] 54 | assert response.meta == [ 55 | {"name": "key", "type": "String"}, 56 | {"name": "value", "type": "String"}, 57 | ] 58 | assert response.rows == 1 59 | assert response.statistics["rows_read"] == 2 60 | 61 | def test_query_pipeline_json(self, client): 62 | response = client.api.query.query( 63 | "SELECT * FROM _ ORDER BY `key` ASC", pipeline="simple_kv" 64 | ) 65 | 66 | assert response.data == [{"key": "baz", "value": "ed"}, {"key": "foo", "value": "bar"}] 67 | assert response.meta == [ 68 | {"name": "key", "type": "String"}, 69 | {"name": "value", "type": "String"}, 70 | ] 71 | assert response.rows == 2 72 | assert response.statistics["rows_read"] == 2 73 | 74 | def test_query_csv(self, client): 75 | response = client.api.query.query( 76 | "SELECT key, value FROM simple ORDER BY `key` ASC", format="CSV" 77 | ) 78 | 79 | assert response.text == '"baz","ed"\n"foo","bar"\n' 80 | 81 | def test_query_csv_with_names(self, client): 82 | response = client.api.query.query( 83 | "SELECT key, value FROM simple ORDER BY `key` ASC", 
format="CSVWithNames" 84 | ) 85 | 86 | assert ( 87 | response.text 88 | == '"key","value"\n"baz","ed"\n"foo","bar"\n' 89 | != '"baz","ed"\n"foo","bar"\n' 90 | ) 91 | # CSV with names can be parsed as data! 92 | assert response.data == [{"key": "baz", "value": "ed"}, {"key": "foo", "value": "bar"}] 93 | 94 | def test_query_tsv(self, client): 95 | response = client.api.query.query( 96 | "SELECT key, value FROM simple ORDER BY `key` ASC", format="TSV" 97 | ) 98 | 99 | assert response.text == "baz\ted\nfoo\tbar\n" 100 | 101 | def test_query_tsv_with_names(self, client): 102 | response = client.api.query.query( 103 | "SELECT key, value FROM simple ORDER BY `key` ASC", format="TSVWithNames" 104 | ) 105 | 106 | assert response.text == "key\tvalue\nbaz\ted\nfoo\tbar\n" 107 | assert response.data == [{"key": "baz", "value": "ed"}, {"key": "foo", "value": "bar"}] 108 | 109 | def test_query_ndjson(self, client): 110 | response = client.api.query.query( 111 | "SELECT key, value FROM simple ORDER BY `key` ASC", format="JSONEachRow" 112 | ) 113 | 114 | assert ( 115 | response.text 116 | == '{"key":"baz","value":"ed"}\n{"key":"foo","value":"bar"}\n' 117 | != '"key","value"\n"baz","ed"\n"foo","bar"\n' 118 | ) 119 | # CSV with names can be parsed as data! 120 | assert response.data == [{"key": "baz", "value": "ed"}, {"key": "foo", "value": "bar"}] 121 | -------------------------------------------------------------------------------- /verdin/test/cli.py: -------------------------------------------------------------------------------- 1 | """Wrapper around the Tinybird CLI to make available the main commands programmatically.""" 2 | 3 | import dataclasses 4 | import logging 5 | import os 6 | import re 7 | import subprocess 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | 12 | @dataclasses.dataclass 13 | class Token: 14 | id: str 15 | name: str 16 | token: str 17 | 18 | 19 | class CliError(Exception): 20 | def __init__(self, output: str, orig: subprocess.SubprocessError) -> None: 21 | super().__init__(output) 22 | self.orig = orig 23 | 24 | 25 | class TinybirdCli: 26 | """Interface around the Tinybird CLI""" 27 | 28 | def __init__(self, host: str = None, token: str = None, cwd: str = None, local: bool = False): 29 | self.host = host 30 | self.token = token 31 | self.cwd = cwd 32 | self.local = local 33 | 34 | def _env(self) -> dict: 35 | """ 36 | Returns a dictionary of environment variables to be used when calling tb CLI commands. 37 | """ 38 | _env = dict(os.environ) 39 | 40 | if self.host: 41 | _env["TB_HOST"] = self.host 42 | if self.token: 43 | _env["TB_TOKEN"] = self.token 44 | 45 | return _env 46 | 47 | def _get_base_args(self) -> list[str]: 48 | args = ["tb"] 49 | if self.local: 50 | args.append("--local") 51 | return args 52 | 53 | def token_ls(self) -> list[Token]: 54 | """ 55 | List all tokens. 
56 | 57 | :return: List of Token instances 58 | """ 59 | args = [*self._get_base_args(), "token", "ls"] 60 | 61 | output = subprocess.check_output( 62 | args, 63 | encoding="utf-8", 64 | cwd=self.cwd, 65 | env=self._env(), 66 | ) 67 | """ 68 | output looks like this (unfortunately --output=json doesn't work) 69 | 70 | ** Tokens: 71 | -------------------------------------------------------------------------------- 72 | id: 63678691-7e28-4f2d-8ef7-243ab19ad7cb 73 | name: workspace admin token 74 | token: p.eyJ1IjogIjU2ZThhYmMzLWRjNmYtNDcyYi05Yzg1LTdkZjFiZmUyNjU5YyIsICJpZCI6ICI2MzY3ODY5MS03ZTI4LTRmMmQtOGVmNy0yNDNhYjE5YWQ3Y2IiLCAiaG9zdCI6ICJsb2NhbCJ9.4gzsbiG1cnrIDUfHTxfQd0ZN57YkiOKEIyvuTlnLiaM 75 | -------------------------------------------------------------------------------- 76 | id: 489c8ca1-195b-4383-a388-d84068ff1b2c 77 | name: admin local_testing@tinybird.co 78 | token: p.eyJ1IjogIjU2ZThhYmMzLWRjNmYtNDcyYi05Yzg1LTdkZjFiZmUyNjU5YyIsICJpZCI6ICI0ODljOGNhMS0xOTViLTQzODMtYTM4OC1kODQwNjhmZjFiMmMiLCAiaG9zdCI6ICJsb2NhbCJ9.MmcBjRTCg6dX53sWsZAv6QzHRHKxwu-pEWkqx8opLHA 79 | -------------------------------------------------------------------------------- 80 | """ 81 | tokens = [] 82 | current_token = {} 83 | 84 | for line in output.splitlines(): 85 | # remove color codes 86 | line = re.sub(r"\x1b\[[0-9;]*m", "", line) 87 | line = line.strip() 88 | if line.startswith("id: "): 89 | current_token = {} 90 | current_token["id"] = line[4:] 91 | elif line.startswith("name: "): 92 | current_token["name"] = line[6:] 93 | elif line.startswith("token: "): 94 | current_token["token"] = line[7:] 95 | tokens.append(Token(**current_token)) 96 | 97 | return tokens 98 | 99 | def local_start( 100 | self, daemon: bool = False, skip_new_version: bool = False, volumes_path: str = None 101 | ) -> subprocess.Popen: 102 | """ 103 | Run ``tb local start`` and return the subprocess. 104 | """ 105 | args = ["tb", "local", "start"] 106 | if daemon: 107 | args.append("-d") 108 | if skip_new_version: 109 | args.append("--skip-new-version") 110 | if volumes_path: 111 | args.append("--volumes-path") 112 | args.append(volumes_path) 113 | 114 | return subprocess.Popen(args, cwd=self.cwd, env=self._env()) 115 | 116 | def local_stop(self): 117 | """ 118 | Run ``tb local stop``. 119 | """ 120 | subprocess.check_output(["tb", "local", "stop"]) 121 | 122 | def local_remove(self): 123 | """ 124 | Run ``tb local remove``. 125 | """ 126 | subprocess.check_output( 127 | ["tb", "local", "remove"], 128 | input=b"y\n", 129 | ) 130 | 131 | def deploy( 132 | self, wait: bool = False, auto: bool = False, allow_destructive_operations: bool = False 133 | ): 134 | args = [*self._get_base_args(), "deploy"] 135 | 136 | if wait: 137 | args.append("--wait") 138 | if auto: 139 | args.append("--auto") 140 | if allow_destructive_operations: 141 | args.append("--allow-destructive-operations") 142 | 143 | try: 144 | output = subprocess.check_output( 145 | args, 146 | encoding="utf-8", 147 | cwd=self.cwd, 148 | env=self._env(), 149 | ) 150 | except subprocess.CalledProcessError as e: 151 | raise CliError(f"Failed to deploy project:\n{e.output}", e) from e 152 | 153 | return output 154 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Verdin 2 | ====== 3 | 4 |

5 | PyPI Version 6 | CI Status 7 | Coverage Status 8 | PyPI License 9 | Code style: black 10 |

11 | 12 | Verdin is a [tiny bird](https://en.wikipedia.org/wiki/Verdin), and also a [Tinybird](https://tinybird.co) SDK for Python. 13 | 14 | Install 15 | ------- 16 | 17 | pip install verdin 18 | 19 | Requirements 20 | ------------ 21 | 22 | Python 3.10+ 23 | 24 | Usage 25 | ----- 26 | 27 | ### Run an SQL Query 28 | 29 | ```python 30 | # the tinybird module exposes all important tinybird concepts 31 | from verdin import tinybird 32 | 33 | client = tinybird.Client("p.mytoken") 34 | query = client.sql("select * from my_datasource__v0") 35 | 36 | # run the query with `FORMAT JSON` and receive a QueryJsonResult 37 | response: tinybird.QueryJsonResult = query.json() 38 | 39 | # print records returned from the pipe 40 | print(response.data) 41 | ``` 42 | 43 | You can also run, e.g., `query.get(format=OutputFormat.CSV)` to get the raw response with CSV data. 44 | 45 | ### Query a Pipe 46 | 47 | ```python 48 | from verdin import tinybird 49 | 50 | client = tinybird.Client("p.mytoken") 51 | pipe = client.pipe("my_pipe") 52 | 53 | # query the pipe using dynamic parameters 54 | response: tinybird.PipeJsonResponse = pipe.query({"key": "val"}) 55 | 56 | # print records returned from the pipe 57 | print(response.data) 58 | ``` 59 | 60 | ### Append to a data source 61 | 62 | ```python 63 | from verdin import tinybird 64 | 65 | client = tinybird.Client("p.mytoken") 66 | 67 | # will access my_datasource__v0 68 | datasource = client.datasource("my_datasource", version=0) 69 | 70 | # query the pipe using dynamic parameters 71 | datasource.append([ 72 | ("col1-row1", "col2-row1"), 73 | ("col1-row2", "col2-row2"), 74 | ]) 75 | ``` 76 | 77 | ### Append to a data source using high-frequency ingest 78 | 79 | The `DataSource` object also gives you access to `/v0/events`, which is the high-frequency ingest, to append data. 80 | Use the `send_events` method and pass JSON serializable documents to it. 81 | 82 | ```python 83 | datasource.send_events(records=[ 84 | {"key": "val1"}, 85 | {"key": "val2"}, 86 | ... 87 | ]) 88 | ``` 89 | 90 | ### Queue and batch records into a DataSource 91 | 92 | Verdin provides a way to queue and batch data continuously: 93 | 94 | ```python 95 | from queue import Queue 96 | from threading import Thread 97 | 98 | from verdin import tinybird 99 | from verdin.worker import QueuingDatasourceAppender 100 | 101 | client = tinybird.Client("p.mytoken") 102 | 103 | records = Queue() 104 | 105 | appender = QueuingDatasourceAppender(records, client.datasource("my_datasource")) 106 | Thread(target=appender.run).start() 107 | 108 | # appender will regularly read batches of data from the queue and append them 109 | # to the datasource. the appender respects rate limiting. 110 | 111 | records.put(("col1-row1", "col2-row1")) 112 | records.put(("col1-row2", "col2-row2")) 113 | ``` 114 | 115 | ### API access 116 | 117 | The DataSource and Pipes objects presented so far are high-level abstractions that provide a convenience Python API 118 | to deal with the most common use cases. Verdin also provides more low-level access to APIs via `client.api`. 
119 | The following APIs are available: 120 | 121 | * `/v0/datasources`: `client.api.datasources` 122 | * `/v0/events`: `client.api.events` 123 | * `/v0/pipes`: `client.api.pipes` 124 | * `/v0/sql`: `client.api.query` 125 | * `/v0/tokens`: `client.api.tokens` 126 | * `/v0/variables`: `client.api.variables` 127 | 128 | Note that for some (datasources, pipes, tokens), manipulation operations are not implemented as they are typically done 129 | through tb deployments and not through the API. 130 | 131 | Also note that API clients do not take care of retries or rate limiting. The caller is expected to handle fault 132 | tolerance. 133 | 134 | #### Example (Querying a pipe) 135 | 136 | You can query a pipe through the pipes API as follows: 137 | 138 | ```python 139 | from verdin import tinybird 140 | 141 | client = tinybird.Client(...) 142 | 143 | response = client.api.pipes.query( 144 | "my_pipe", 145 | parameters={"my_param": "..."}, 146 | query="SELECT * FROM _ LIMIT 10", 147 | ) 148 | 149 | for record in response.data: 150 | # each record is a dictionary 151 | ... 152 | ``` 153 | 154 | #### Example (High-frequency ingest) 155 | 156 | You can use the HFI endpoint `/v0/events` through the `events` api. As records, you can pass a list of JSON serializable 157 | documents. 158 | 159 | ```python 160 | from verdin import tinybird 161 | 162 | client = tinybird.Client(...) 163 | 164 | response = client.api.events.send("my_datasource", records=[ 165 | {"id": "...", "value": "..."}, 166 | ... 167 | ]) 168 | assert response.quarantined_rows == 0 169 | ``` 170 | 171 | Develop 172 | ------- 173 | 174 | Create the virtual environment, install dependencies, and run tests 175 | 176 | make venv 177 | make test 178 | 179 | Run the code formatter 180 | 181 | make format 182 | 183 | Upload the pypi package using twine 184 | 185 | make upload 186 | -------------------------------------------------------------------------------- /verdin/api/variables.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict, Literal 2 | 3 | import requests 4 | 5 | from verdin.api.base import Api, ApiError, ApiResponse 6 | 7 | 8 | class VariableNotFoundError(ApiError): 9 | """ 10 | Specific ApiError representing a 404 Not Found when variable names are given. 11 | """ 12 | 13 | 14 | class VariableInfo(TypedDict): 15 | """ 16 | A variable info object. Example:: 17 | 18 | { 19 | "name": "test_password", 20 | "type": "secret", 21 | "created_at": "2024-06-21T10:27:57", 22 | "updated_at": "2024-06-21T10:27:57", 23 | "edited_by": "token: 'admin token'" 24 | } 25 | """ 26 | 27 | name: str 28 | type: str 29 | created_at: str 30 | updated_at: str 31 | edited_by: str 32 | 33 | 34 | class CreateVariableResponse(ApiResponse): 35 | @property 36 | def variable(self) -> VariableInfo: 37 | """ 38 | Returns the variable information. 39 | """ 40 | return self.json 41 | 42 | 43 | class UpdateVariableResponse(ApiResponse): 44 | @property 45 | def variable(self) -> VariableInfo: 46 | """ 47 | Returns the variable information. 48 | """ 49 | return self.json 50 | 51 | 52 | class DeleteVariableResponse(ApiResponse): 53 | @property 54 | def ok(self) -> bool: 55 | """ 56 | Returns whether the operation was successful. 57 | """ 58 | return self.json.get("ok", False) 59 | 60 | 61 | class ListVariablesResponse(ApiResponse): 62 | @property 63 | def variables(self) -> list[VariableInfo]: 64 | """ 65 | Returns the list of variables. 
66 | """ 67 | return self.json.get("variables", []) 68 | 69 | 70 | class GetVariableResponse(ApiResponse): 71 | @property 72 | def variable(self) -> VariableInfo: 73 | """ 74 | Returns the variable information. 75 | """ 76 | return self.json 77 | 78 | 79 | class VariablesApi(Api): 80 | """ 81 | ``/v0/variables`` API client. 82 | 83 | This API allows you to create, update, delete, and list environment variables 84 | that can be used in Pipes in a Workspace. 85 | """ 86 | 87 | endpoint: str = "/v0/variables" 88 | 89 | session: requests.Session 90 | 91 | def __init__(self, token: str, host: str = None): 92 | super().__init__(token, host) 93 | 94 | self.session = requests.Session() 95 | if self.token: 96 | self.session.headers.update({"Authorization": f"Bearer {self.token}"}) 97 | 98 | def create( 99 | self, 100 | name: str, 101 | value: str, 102 | type: Literal["secret"] = "secret", 103 | ) -> CreateVariableResponse: 104 | """ 105 | Creates a new environment variable. 106 | 107 | :param name: The name of the variable. 108 | :param value: The value of the variable. 109 | :param type: The type of the variable. Defaults to 'secret'. 110 | :return: A ``CreateVariableResponse`` object. 111 | """ 112 | data = { 113 | "name": name, 114 | "value": value, 115 | "type": type, 116 | } 117 | 118 | response = self.session.request( 119 | method="POST", 120 | url=f"{self.host}{self.endpoint}", 121 | data=data, 122 | ) 123 | 124 | if not response.ok: 125 | raise ApiError(response) 126 | 127 | return CreateVariableResponse(response) 128 | 129 | def delete(self, name: str) -> DeleteVariableResponse: 130 | """ 131 | Deletes an environment variable. 132 | 133 | :param name: The name of the variable to delete. 134 | :return: A ``DeleteVariableResponse`` object. 135 | """ 136 | response = self.session.request( 137 | method="DELETE", 138 | url=f"{self.host}{self.endpoint}/{name}", 139 | ) 140 | 141 | if response.status_code == 404: 142 | raise VariableNotFoundError(response) 143 | 144 | if not response.ok: 145 | raise ApiError(response) 146 | 147 | return DeleteVariableResponse(response) 148 | 149 | def update( 150 | self, 151 | name: str, 152 | value: str, 153 | ) -> UpdateVariableResponse: 154 | """ 155 | Updates an environment variable. 156 | 157 | :param name: The name of the variable to update. 158 | :param value: The new value of the variable. 159 | :return: A ``UpdateVariableResponse`` object. 160 | """ 161 | data = { 162 | "value": value, 163 | } 164 | 165 | response = self.session.request( 166 | method="PUT", 167 | url=f"{self.host}{self.endpoint}/{name}", 168 | data=data, 169 | ) 170 | 171 | if response.status_code == 404: 172 | raise VariableNotFoundError(response) 173 | 174 | if not response.ok: 175 | raise ApiError(response) 176 | 177 | return UpdateVariableResponse(response) 178 | 179 | def list(self) -> ListVariablesResponse: 180 | """ 181 | Lists all environment variables. 182 | 183 | :return: A ``ListVariablesResponse`` object. 184 | """ 185 | response = self.session.request( 186 | method="GET", 187 | url=f"{self.host}{self.endpoint}", 188 | ) 189 | 190 | if not response.ok: 191 | raise ApiError(response) 192 | 193 | return ListVariablesResponse(response) 194 | 195 | def get(self, name: str) -> GetVariableResponse: 196 | """ 197 | Gets information about a specific environment variable. 198 | 199 | :param name: The name of the variable to get. 200 | :return: A ``GetVariableResponse`` object. 
201 | """ 202 | response = self.session.request( 203 | method="GET", 204 | url=f"{self.host}{self.endpoint}/{name}", 205 | ) 206 | 207 | if response.status_code == 404: 208 | raise VariableNotFoundError(response) 209 | 210 | if not response.ok: 211 | raise ApiError(response) 212 | 213 | return GetVariableResponse(response) 214 | -------------------------------------------------------------------------------- /tests/integration/test_datasources.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from verdin.api import ApiError 6 | from verdin.api.datasources import DataSourceNotFoundError 7 | from tests.utils import retry 8 | 9 | 10 | class TestDataSourcesApi: 11 | def test_list(self, client): 12 | response = client.api.datasources.list() 13 | 14 | assert len(response.datasources) >= 1 15 | 16 | # find "simple" datasource in the list of data sources 17 | ds = None 18 | for datasource in response.datasources: 19 | if datasource["name"] == "simple": 20 | ds = datasource 21 | break 22 | 23 | assert ds 24 | 25 | # smoke tests some attributes 26 | assert ds["engine"]["engine"] == "MergeTree" 27 | assert "simple_kv" in [x["name"] for x in ds["used_by"]] 28 | 29 | def test_get_information(self, client): 30 | response = client.api.datasources.get_information("simple") 31 | 32 | # smoke tests some attributes 33 | assert response.info["name"] == "simple" 34 | assert response.info["engine"]["engine"] == "MergeTree" 35 | assert "simple_kv" in [x["name"] for x in response.info["used_by"]] 36 | 37 | def test_get_information_on_non_existing_datasource(self, client): 38 | with pytest.raises(DataSourceNotFoundError) as e: 39 | client.api.datasources.get_information("non_existing_datasource") 40 | 41 | e.match('Data Source "non_existing_datasource" does not exist') 42 | assert e.value.status_code == 404 43 | 44 | def test_truncate(self, client): 45 | ds = client.datasource("simple") 46 | ds.append_ndjson( 47 | [ 48 | { 49 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 50 | "Timestamp": "2024-01-23T10:30:00.123456", 51 | "Key": "foo", 52 | "Value": "bar", 53 | }, 54 | { 55 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 56 | "Timestamp": "2024-02-23T11:45:00.234567", 57 | "Key": "baz", 58 | "Value": "ed", 59 | }, 60 | ] 61 | ) 62 | 63 | def _wait_for_count(cnt: int): 64 | query = client.sql("SELECT count(*) as cnt FROM simple") 65 | assert query.json().data == [{"cnt": cnt}] 66 | 67 | retry(_wait_for_count, args=(2,)) 68 | 69 | client.api.datasources.truncate("simple") 70 | 71 | retry(_wait_for_count, args=(0,)) 72 | 73 | def test_append_to_non_existing_data_source(self, client): 74 | with pytest.raises(ApiError) as e: 75 | client.api.datasources.append("non_existing_datasource", "foo,bar\n") 76 | 77 | # this is odd behavior, but currently, this raises a 403, with the error 78 | # "Adding or modifying data sources to this workspace can only be done via deployments" 79 | # due to the way tinybird behaves (apparently it doesn't check mode=append) 80 | 81 | assert e.value.status_code == 403 82 | 83 | def test_append_csv(self, client): 84 | ds = client.api.datasources 85 | 86 | data = "5b6859d2-e060-40a4-949a-7e7fab8e3207,2024-01-23T10:30:00.123456,foo,bar\n" 87 | data += "af49ffce-559c-426e-9787-ddb08628b547,2024-02-23T11:45:00.234567,baz,ed" 88 | 89 | response = ds.append("simple", data) 90 | assert not response.error 91 | assert response.quarantine_rows == 0 92 | assert response.invalid_lines == 0 93 | assert 
response.datasource["name"] == "simple" 94 | 95 | assert client.sql("SELECT * FROM simple").json().data == [ 96 | { 97 | "id": "5b6859d2-e060-40a4-949a-7e7fab8e3207", 98 | "timestamp": "2024-01-23 10:30:00.123456", 99 | "key": "foo", 100 | "value": "bar", 101 | }, 102 | { 103 | "id": "af49ffce-559c-426e-9787-ddb08628b547", 104 | "timestamp": "2024-02-23 11:45:00.234567", 105 | "key": "baz", 106 | "value": "ed", 107 | }, 108 | ] 109 | 110 | def test_append_csv_with_invalid_data(self, client): 111 | ds = client.api.datasources 112 | 113 | data = "5b6859d2-e060-40a4-949a-7e7fab8e3207,2024-01-23T10:30:00.123456,foo,bar\n" 114 | data += "af49ffce-559c-426e-9787-ddb08628b5472024-02-23T11:45:00.234567,baz,ed" # error in this line 115 | 116 | response = ds.append("simple", data) 117 | assert ( 118 | response.error 119 | == "There was an error with file contents: 1 row in quarantine and 1 invalid line." 120 | ) 121 | assert response.invalid_lines == 1 122 | assert response.quarantine_rows == 1 123 | 124 | def test_append_ndjson(self, client): 125 | ds = client.api.datasources 126 | ds.truncate("simple") 127 | 128 | records = [ 129 | { 130 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 131 | "Timestamp": "2024-01-23T10:30:00.123456", 132 | "Key": "foo", 133 | "Value": "bar", 134 | }, 135 | { 136 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 137 | "Timestamp": "2024-02-23T11:45:00.234567", 138 | "Key": "baz", 139 | "Value": "ed", 140 | }, 141 | ] 142 | 143 | def _data(): 144 | for r in records: 145 | yield json.dumps(r) + "\n" 146 | 147 | response = ds.append("simple", _data(), format="ndjson") 148 | assert not response.error 149 | assert response.quarantine_rows == 0 150 | assert response.invalid_lines == 0 151 | assert response.datasource["name"] == "simple" 152 | 153 | assert client.sql("SELECT * FROM simple").json().data == [ 154 | { 155 | "id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 156 | "timestamp": "2024-01-23 10:30:00.123456", 157 | "key": "foo", 158 | "value": "bar", 159 | }, 160 | { 161 | "id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 162 | "timestamp": "2024-02-23 11:45:00.234567", 163 | "key": "baz", 164 | "value": "ed", 165 | }, 166 | ] 167 | -------------------------------------------------------------------------------- /verdin/pipe.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, Iterator, Optional 3 | 4 | import requests 5 | 6 | from . import config 7 | from .api import ApiError 8 | from .api.pipes import PipesApi 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | PipeMetadata = list[tuple[str, str]] 13 | PipeJsonData = list[dict[str, Any]] 14 | 15 | 16 | class PipeError(Exception): 17 | """ 18 | Wrapper of the HTTP response returned by a Pipe query if the HTTP response is not a 2XX code. 19 | """ 20 | 21 | response: requests.Response 22 | 23 | def __init__(self, response: requests.Response) -> None: 24 | self.response = response 25 | self.json: dict = response.json() 26 | super().__init__(self.description) 27 | 28 | @property 29 | def description(self) -> str: 30 | return self.json.get("error") 31 | 32 | 33 | class PipeJsonResponse: 34 | """ 35 | Wrapper of the HTTP response returned by a Pipe query. 
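    A typical access pattern (sketch; assumes ``response`` was obtained from
    ``Pipe.query`` or ``Pipe.sql``)::

        if not response.empty:
            column_names = [name for name, _type in response.meta]
            for row in response.data:  # each row is a dict keyed by column name
                ...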
36 | """ 37 | 38 | response: requests.Response 39 | result: dict 40 | 41 | def __init__(self, response: requests.Response): 42 | self.response = response 43 | self.result = response.json() 44 | 45 | @property 46 | def empty(self) -> bool: 47 | """ 48 | A property to check if the data in the result is empty. 49 | 50 | This property evaluates whether the "data" field within the "result" 51 | attribute is empty. 52 | 53 | :return: Returns True if the "data" field in "result" is missing or empty, 54 | otherwise returns False. 55 | """ 56 | return not self.result.get("data") 57 | 58 | @property 59 | def meta(self) -> PipeMetadata: 60 | """ 61 | Returns the PipeMetadata from the query, which includes attributes and their types. 62 | 63 | :return: The PipeMetadata 64 | """ 65 | return [(t["name"], t["type"]) for t in self.result.get("meta", [])] 66 | 67 | @property 68 | def data(self) -> PipeJsonData: 69 | """ 70 | Returns the data from the query, which is a list of dictionaries representing the rows of the query result. 71 | 72 | :return: The PipeJsonData 73 | """ 74 | return self.result.get("data") 75 | 76 | 77 | PipePageIterator = Iterator[PipeJsonResponse] 78 | 79 | 80 | class PagedPipeQuery(PipePageIterator): 81 | # TODO: allow passing of custom parameters 82 | 83 | pipe: "Pipe" 84 | 85 | def __init__(self, pipe: "Pipe", page_size: int = 50, start_at: int = 0): 86 | self.pipe = pipe 87 | self.limit = page_size 88 | self.offset = start_at 89 | 90 | def __iter__(self): 91 | return self 92 | 93 | def __next__(self): 94 | sql = f"SELECT * FROM _ LIMIT {self.limit} OFFSET {self.offset}" 95 | response = self.pipe.sql(sql) 96 | if response.empty: 97 | raise StopIteration() 98 | self.offset += self.limit 99 | return response 100 | 101 | 102 | class Pipe: 103 | """ 104 | Model abstraction of a tinybird Pipe. 105 | 106 | TODO: implement csv mode 107 | """ 108 | 109 | endpoint: str = "/v0/pipes" 110 | 111 | name: str 112 | version: Optional[int] 113 | resource: str 114 | 115 | def __init__(self, name, token, version: int = None, api: str = None) -> None: 116 | super().__init__() 117 | self.name = name 118 | self.token = token 119 | self.version = version 120 | self.resource = (api or config.API_URL).rstrip("/") + self.endpoint 121 | 122 | self._pipes_api = PipesApi(token, host=(api or config.API_URL).rstrip("/")) 123 | 124 | @property 125 | def canonical_name(self) -> str: 126 | """ 127 | Returns the name of the pipe that can be queried. If a version is specified, the name will be suffixed with 128 | ``__v``. Otherwise, this just returns the name. Note that versions are discouraged in the current 129 | tinybird workflows. 130 | 131 | :return: The canonical name of the pipe that can be used in queries 132 | """ 133 | if self.version is not None: 134 | return f"{self.name}__v{self.version}" 135 | else: 136 | return self.name 137 | 138 | @property 139 | def pipe_url(self) -> str: 140 | """ 141 | Returns the API URL of this pipe. It's something like ``https://api.tinybird.co/v0/pipes/my_pipe.json``. 142 | 143 | :return: The Pipe API URL 144 | """ 145 | return self.resource + "/" + self.canonical_name + ".json" 146 | 147 | def query(self, params: dict[str, str] = None) -> PipeJsonResponse: 148 | """ 149 | Query the pipe endpoint using the given dynamic parameters. Note that the pipe needs to be exposed as an 150 | endpoint. 
151 | 152 | See: https://www.tinybird.co/docs/forward/work-with-data/query-parameters#use-pipes-api-endpoints-with-dynamic-parameters 153 | 154 | :param params: The dynamic parameters of the pipe and the values for your query 155 | :return: a PipeJsonResponse containing the result of the query 156 | """ 157 | try: 158 | response = self._pipes_api.query( 159 | self.canonical_name, 160 | parameters=params, 161 | format="json", 162 | ) 163 | return PipeJsonResponse(response._response) 164 | except ApiError as e: 165 | raise PipeError(e._response) 166 | 167 | def pages(self, page_size: int = 50, start_at: int = 0) -> PipePageIterator: 168 | """ 169 | Returns an iterator over the pipe's data pages. Each page contains ``page_size`` records. 170 | 171 | TODO: currently we don't support dynamic parameters with paged queries 172 | 173 | :param page_size: The size of each page (default 50) 174 | :param start_at: The offset at which to start (default 0) 175 | :return: 176 | """ 177 | return PagedPipeQuery(pipe=self, page_size=page_size, start_at=start_at) 178 | 179 | def sql(self, query: str) -> PipeJsonResponse: 180 | """ 181 | Run an SQL query against the pipe. For example: 182 | 183 | pipe.sql("select count() from _") 184 | 185 | See https://docs.tinybird.co/api-reference/query-api.html 186 | 187 | :param query: The SQL query to run 188 | :return: The result of the query 189 | """ 190 | try: 191 | response = self._pipes_api.query(self.canonical_name, query=query, format="json") 192 | return PipeJsonResponse(response._response) 193 | except ApiError as e: 194 | raise PipeError(e._response) 195 | 196 | def __str__(self): 197 | return f"Pipe({self.canonical_name})" 198 | 199 | def __repr__(self): 200 | return self.__str__() 201 | -------------------------------------------------------------------------------- /verdin/datasource.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import io 3 | import json 4 | import logging 5 | import os 6 | from typing import Any, Dict, List, Optional, Tuple, Union, TYPE_CHECKING 7 | 8 | import requests 9 | 10 | from . import config 11 | from .api.datasources import DataSourcesApi 12 | from .api.events import EventsApi, EventsResponse 13 | 14 | if TYPE_CHECKING: 15 | from _typeshed import SupportsWrite 16 | 17 | LOG = logging.getLogger(__name__) 18 | 19 | Record = Union[Tuple, List[Any]] 20 | Records = List[Record] 21 | 22 | 23 | def to_csv(records: Records, **kwargs) -> str: 24 | """ 25 | Convert the given records to CSV using a CSV writer, and return them as a single string. 26 | 27 | :param records: The records to convert to CSV. 28 | :param kwargs: Args to be passed to ``csv.writer``. 29 | :return: A string representing the CSV 30 | """ 31 | output = io.StringIO() 32 | write_csv(output, records, **kwargs) 33 | return output.getvalue() 34 | 35 | 36 | def write_csv(file: "SupportsWrite[str]", records: Records, **kwargs): 37 | """ 38 | Converts the given records to CSV and writes them to the given file. 39 | 40 | :param file: The file passed to the CSV writer. 41 | :param records: The records to convert to CSV. 42 | :param kwargs: Args to be passed to ``csv.writer``. 
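    For example, writing two records into an in-memory buffer (values are illustrative)::

        buf = io.StringIO()
        write_csv(buf, [("col1-row1", "col2-row1"), ("col1-row2", "col2-row2")])
        assert buf.getvalue() == "col1-row1,col2-row1\ncol1-row2,col2-row2\n"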
43 | """ 44 | # TODO: do proper type conversion here to optimize for CSV input 45 | # see: https://guides.tinybird.co/guide/fine-tuning-csvs-for-fast-ingestion 46 | 47 | if "delimiter" in kwargs: 48 | if kwargs["delimiter"] is None: 49 | del kwargs["delimiter"] 50 | 51 | writer = csv.writer(file, quoting=csv.QUOTE_MINIMAL, lineterminator="\n", **kwargs) 52 | 53 | for record in records: 54 | writer.writerow(record) 55 | 56 | 57 | class Datasource: 58 | """ 59 | Abstract tinybird datasource. 60 | """ 61 | 62 | endpoint: str = "/v0/datasources" 63 | 64 | name: str 65 | version: Optional[int] 66 | 67 | def __init__(self, name, token, version: int = None, api: str = None) -> None: 68 | self.name = name 69 | self.token = token 70 | self.version = version 71 | self._host = (api or config.API_URL).rstrip("/") 72 | self.api = self._host + self.endpoint 73 | 74 | # API clients used to make the actual API calls 75 | self._events_api = EventsApi(token, self._host) 76 | self._datasources_api = DataSourcesApi(token, self._host) 77 | 78 | @property 79 | def canonical_name(self) -> str: 80 | """ 81 | Returns the name of the table that can be queried. If a version is specified, the name will be suffixed with 82 | ``__v``. Otherwise, this just returns the name. Note that versions are discouraged in the current 83 | tinybird workflows. 84 | 85 | :return: The canonical name of the table that can be used in queries 86 | """ 87 | if self.version is not None: 88 | return f"{self.name}__v{self.version}" 89 | else: 90 | return self.name 91 | 92 | def send_events( 93 | self, records: list[dict], wait: bool = False, json_encoder: type = None 94 | ) -> EventsResponse: 95 | """ 96 | Uses the ``/v0/events`` API endpoint to send JSON data to the datasource. 97 | 98 | :param records: List of JSON records to append. Records will be converted to NDJSON using ``json.dumps`` 99 | :param wait: 'false' by default. Set to 'true' to wait until the write is acknowledged by the database. 100 | Enabling this flag makes it possible to retry on database errors, but it introduces additional latency. 101 | It's recommended to enable it in use cases in which data loss avoidance is critical. Disable it otherwise. 102 | :param json_encoder: The JSON Encoder class passed to ``json.dumps``. Defaults to ``json.JSONEncoder``. 103 | :return: The EventsResponse from the ``EventsApi``. 104 | :raises ApiError: If the request failed 105 | """ 106 | return self._events_api.send( 107 | self.canonical_name, records=records, wait=wait, json_encoder=json_encoder 108 | ) 109 | 110 | def append(self, records: Records, *args, **kwargs) -> requests.Response: 111 | """Calls ``append_csv``.""" 112 | # TODO: replicate tinybird API concepts instead of returning Response 113 | return self.append_csv(records, *args, **kwargs) 114 | 115 | def append_csv(self, records: Records, delimiter: str = ",") -> requests.Response: 116 | """ 117 | Makes a POST request to the datasource using mode=append with CSV data. This appends data to the table. 118 | 119 | :param records: List of records to append. They will be converted to CSV using the provided delimiter. 
120 | :param delimiter: Optional delimiter (defaults to ",") 121 | :return: The HTTP response 122 | """ 123 | 124 | data = self.to_csv(records, delimiter=delimiter) 125 | 126 | LOG.debug( 127 | "appending %d csv records to %s via %s", 128 | len(records), 129 | self, 130 | self.api, 131 | ) 132 | 133 | response = self._datasources_api.append( 134 | name=self.canonical_name, 135 | dialect_delimiter=delimiter, 136 | format="csv", 137 | data=data, 138 | ) 139 | 140 | return response._response 141 | 142 | def append_ndjson(self, records: List[Dict]) -> requests.Response: 143 | """ 144 | Makes a POST request to the datasource using mode=append with ndjson data. This appends data to the table. 145 | 146 | :param records: List of JSON records to append. They will be converted to NDJSON using ``json.dumps`` 147 | :return: The HTTP response 148 | """ 149 | 150 | def _ndjson_iterator(): 151 | for record in records: 152 | yield json.dumps(record) + "\n" 153 | 154 | LOG.debug( 155 | "appending %d ndjson records to %s via %s", 156 | len(records), 157 | self, 158 | self.api, 159 | ) 160 | response = self._datasources_api.append( 161 | name=self.canonical_name, 162 | format="ndjson", 163 | data=_ndjson_iterator(), 164 | ) 165 | 166 | return response._response 167 | 168 | def truncate(self): 169 | """ 170 | Truncate the datasource which removes all records in the table. 171 | """ 172 | self._datasources_api.truncate(name=self.canonical_name) 173 | 174 | @staticmethod 175 | def to_csv(records: List[List[Any]], **kwargs): 176 | return to_csv(records, **kwargs) 177 | 178 | def __str__(self): 179 | return f"Datasource({self.canonical_name})" 180 | 181 | def __repr__(self): 182 | return self.__str__() 183 | 184 | 185 | class FileDatasource(Datasource): 186 | """ 187 | Datasource that writes into a file, used for testing and development purposes. 
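    Example (sketch; the path is arbitrary)::

        ds = FileDatasource("/tmp/records.csv")
        ds.append([("col1-row1", "col2-row1")])
        print(ds.readlines())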
188 | """ 189 | 190 | def __init__(self, path: str): 191 | name = os.path.basename(path) 192 | super().__init__(name, None) 193 | self.path = path 194 | 195 | def append_csv(self, records: Records, *args, **kwargs) -> requests.Response: 196 | if records: 197 | with open(self.path, "a") as fd: 198 | write_csv(fd, records) 199 | 200 | response = requests.Response() 201 | response.status_code = 200 202 | return response 203 | 204 | def append_ndjson(self, records: List[Dict]) -> requests.Response: 205 | raise NotImplementedError 206 | 207 | def readlines(self) -> List[str]: 208 | with open(self.path, "r") as fd: 209 | return fd.readlines() 210 | 211 | def truncate(self): 212 | raise NotImplementedError 213 | -------------------------------------------------------------------------------- /tests/integration/test_pipes.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from verdin.api.pipes import PipeNotFoundError 4 | 5 | 6 | class TestPipesApi: 7 | def test_list(self, client): 8 | response = client.api.pipes.list( 9 | attrs=["id", "name"], 10 | node_attrs=[], 11 | ) 12 | 13 | for pipe in response.pipes: 14 | assert {"id", "name", "url"} == set(pipe.keys()) 15 | assert pipe["id"] is not None 16 | assert pipe["name"] is not None 17 | 18 | assert "simple_kv" in [pipe["name"] for pipe in response.pipes] 19 | assert "simple_pipe" in [pipe["name"] for pipe in response.pipes] 20 | 21 | def test_query_json(self, client): 22 | client.api.events.send( 23 | "simple", 24 | wait=True, 25 | records=[ 26 | { 27 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 28 | "Timestamp": "2024-01-23T10:30:00.123456", 29 | "Key": "foo", 30 | "Value": "bar", 31 | }, 32 | { 33 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 34 | "Timestamp": "2024-02-23T11:45:00.234567", 35 | "Key": "baz", 36 | "Value": "ed", 37 | }, 38 | ], 39 | ) 40 | 41 | response = client.api.pipes.query("simple_kv", format="json") 42 | assert response.data == [{"key": "baz", "value": "ed"}, {"key": "foo", "value": "bar"}] 43 | assert response.meta == [ 44 | {"name": "key", "type": "String"}, 45 | {"name": "value", "type": "String"}, 46 | ] 47 | assert response.rows == 2 48 | assert response.statistics["rows_read"] == 2 49 | 50 | @pytest.mark.parametrize("format", ["csv", "ndjson", "json"]) 51 | def test_query_formats(self, client, format): 52 | client.api.events.send( 53 | "simple", 54 | wait=True, 55 | records=[ 56 | { 57 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 58 | "Timestamp": "2024-01-23T10:30:00.123456", 59 | "Key": "foo", 60 | "Value": "bar", 61 | }, 62 | { 63 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 64 | "Timestamp": "2024-02-23T11:45:00.234567", 65 | "Key": "baz", 66 | "Value": "ed", 67 | }, 68 | { 69 | "Id": "2b84e03e-dbcf-4141-9656-94ff8ac8c036", 70 | "Timestamp": "2024-03-23T11:45:00.345678", 71 | "Key": "format", 72 | "Value": format, 73 | }, 74 | ], 75 | ) 76 | 77 | response = client.api.pipes.query("simple_kv", format=format) 78 | assert response.data == [ 79 | {"key": "baz", "value": "ed"}, 80 | {"key": "foo", "value": "bar"}, 81 | {"key": "format", "value": format}, 82 | ] 83 | 84 | def test_query_with_params(self, client): 85 | client.api.events.send( 86 | "simple", 87 | wait=True, 88 | records=[ 89 | { 90 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 91 | "Timestamp": "2024-01-23T10:30:00.123456", 92 | "Key": "foo", 93 | "Value": "bar", 94 | }, 95 | { 96 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 97 | "Timestamp": "2024-02-23T11:45:00.234567", 98 | 
"Key": "baz", 99 | "Value": "ed", 100 | }, 101 | { 102 | "Id": "2b84e03e-dbcf-4141-9656-94ff8ac8c036", 103 | "Timestamp": "2024-03-23T11:45:00.345678", 104 | "Key": "foo", 105 | "Value": "bar2", 106 | }, 107 | ], 108 | ) 109 | 110 | response = client.api.pipes.query("simple_pipe", parameters={"key": "foo"}) 111 | assert response.data == [ 112 | { 113 | "id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 114 | "key": "foo", 115 | "timestamp": "2024-01-23 10:30:00.123456", 116 | "value": "bar", 117 | }, 118 | { 119 | "id": "2b84e03e-dbcf-4141-9656-94ff8ac8c036", 120 | "key": "foo", 121 | "timestamp": "2024-03-23 11:45:00.345678", 122 | "value": "bar2", 123 | }, 124 | ] 125 | 126 | response = client.api.pipes.query("simple_pipe", parameters={"key": "does not exist"}) 127 | assert response.data == [] 128 | 129 | def test_query_with_sql(self, client): 130 | client.api.events.send( 131 | "simple", 132 | wait=True, 133 | records=[ 134 | { 135 | "Id": "e7f2af3e-99d1-4d4f-8a8c-d6aee4ab89b0", 136 | "Timestamp": "2024-01-23T10:30:00.123456", 137 | "Key": "foo", 138 | "Value": "bar", 139 | }, 140 | { 141 | "Id": "d7792957-21d8-46e6-a4e0-188eb36e2758", 142 | "Timestamp": "2024-02-23T11:45:00.234567", 143 | "Key": "baz", 144 | "Value": "ed", 145 | }, 146 | { 147 | "Id": "2b84e03e-dbcf-4141-9656-94ff8ac8c036", 148 | "Timestamp": "2024-03-23T11:45:00.345678", 149 | "Key": "foo", 150 | "Value": "bar2", 151 | }, 152 | ], 153 | ) 154 | 155 | response = client.api.pipes.query( 156 | "simple_pipe", 157 | query="SELECT timestamp,key,value FROM _ ORDER BY `value` DESC", 158 | parameters={"key": "foo"}, 159 | ) 160 | 161 | assert response.data == [ 162 | { 163 | "timestamp": "2024-03-23 11:45:00.345678", 164 | "key": "foo", 165 | "value": "bar2", 166 | }, 167 | { 168 | "timestamp": "2024-01-23 10:30:00.123456", 169 | "key": "foo", 170 | "value": "bar", 171 | }, 172 | ] 173 | 174 | def test_query_with_sql_too_long(self, client): 175 | chars = "a" * 6000 176 | # ``chars`` ends up in both the query and parameters, making the total body ~12k, but the query <8k, which is a 177 | # requirement. 
178 | 179 | response = client.api.pipes.query( 180 | "simple_pipe", 181 | query=f"SELECT * FROM _ WHERE `value` = '{chars}'", 182 | parameters={"key": chars}, 183 | ) 184 | 185 | # ofcourse the query is nonsense and returns nothing 186 | assert response.data == [] 187 | 188 | def test_query_with_non_existing_pipe(self, client): 189 | with pytest.raises(PipeNotFoundError) as e: 190 | client.api.pipes.query("non_existent_pipe") 191 | 192 | assert e.match("The pipe 'non_existent_pipe' does not exist") 193 | assert e.value.status_code == 404 194 | 195 | def test_get_information(self, client): 196 | response = client.api.pipes.get_information("simple_kv") 197 | assert response.info["name"] == "simple_kv" 198 | assert response.info["type"] == "endpoint" 199 | 200 | # check that it also works with the pipe's ID 201 | response = client.api.pipes.get_information(response.info["id"]) 202 | assert response.info["name"] == "simple_kv" 203 | assert response.info["type"] == "endpoint" 204 | 205 | def test_get_information_non_existing_pipe(self, client): 206 | with pytest.raises(PipeNotFoundError) as e: 207 | client.api.pipes.get_information("non_existent_pipe") 208 | 209 | assert e.match("Pipe 'non_existent_pipe' not found") 210 | assert e.value.status_code == 404 211 | -------------------------------------------------------------------------------- /verdin/worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains a worker that reads batches of records from a Queue and appends them to a Datasource. It provides 3 | an opinionated way to ingest data into tinybird from a python process. Note this worker does not use the ``/v0/events`` 4 | API, but instead uses the datasource's append functionality, which has higher rate limits. 5 | """ 6 | 7 | import logging 8 | import multiprocessing 9 | import time 10 | from queue import Empty, Queue 11 | from typing import Optional, Tuple 12 | 13 | import requests 14 | 15 | from .datasource import Datasource, Records 16 | 17 | LOG = logging.getLogger(__name__) 18 | 19 | 20 | class StopWorker(Exception): 21 | """ 22 | An exception that indicates to stop the QueueingDatasourceAppender worker. 23 | """ 24 | 25 | marker = "__STOP__" 26 | 27 | batch: Optional[Records] 28 | 29 | def __init__(self, batch: Records = None): 30 | self.batch = batch 31 | 32 | 33 | class QueuingDatasourceAppender: 34 | """ 35 | A QueuingDatasourceAppender reads batches of records from a source Queue and appends the batches to a data 36 | source. Once rate limited, it waits for the instructed amount of time (or if that is not specified, 37 | whatever default_retry_after is set to), before appending again. 38 | 39 | See https://docs.tinybird.co/api-reference/api-reference.html#limits-title 40 | 41 | TODO: synchronize multiple appenders through a RateLimiter concurrency structure: data source share rate limiting 42 | across account or workspace (not sure), so running multiple separate queuing datasource appenders will lead to 43 | excessive rate limiting. 
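    A typical wiring (sketch; ``client`` is a ``tinybird.Client`` created elsewhere,
    and the datasource name is a placeholder)::

        from queue import Queue
        from threading import Thread

        records = Queue()
        appender = QueuingDatasourceAppender(records, client.datasource("my_datasource"))
        Thread(target=appender.run).start()

        records.put(("col1-row1", "col2-row1"))
        appender.close()  # enqueues the stop marker so run() flushes pending records and exits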
44 | """ 45 | 46 | default_retry_after: float = 12 47 | wait_after_rate_limit: float = 12 48 | 49 | source: Queue 50 | destination: Datasource 51 | min_interval: float 52 | 53 | def __init__(self, source: Queue, destination: Datasource, min_interval: float = 5) -> None: 54 | """ 55 | :param source: a queue that buffers records to be appended to the datasource 56 | :param destination: the datasource to append to 57 | :param min_interval: the minimal time to wait between batches 58 | """ 59 | super().__init__() 60 | self.source = source 61 | self.destination = destination 62 | self.stopped = multiprocessing.Event() 63 | self.min_interval = min_interval 64 | 65 | def close(self): 66 | if self.stopped.is_set(): 67 | return 68 | self.stopped.set() 69 | self.source.put_nowait(StopWorker.marker) 70 | 71 | def run(self): 72 | try: 73 | while not self.stopped.is_set(): 74 | try: 75 | then = time.time() 76 | batch, error = self._do_next_batch() 77 | 78 | if error is not None: 79 | # TODO: make sure the batch is not dropped on error. however, if the batch is 80 | # not appendable (for example because of errors in the data), then we need to 81 | # make sure the batch is either dropped, or we try to find the record that 82 | # causes the error. 83 | 84 | raise error 85 | 86 | duration = time.time() - then 87 | LOG.debug("processing batch took %.2f", duration) 88 | if self.min_interval: 89 | self.stopped.wait(self.min_interval) 90 | 91 | except StopWorker as e: 92 | LOG.info("indicated worker shutdown, trying to flush batch") 93 | if e.batch: 94 | self._retry_batch(e.batch, max_retries=2) 95 | return 96 | 97 | except Exception: 98 | LOG.exception("exception while processing batch, events will be dropped") 99 | finally: 100 | LOG.info( 101 | "shutting down DatasourceQueueWorker, %d elements remaining", 102 | self.source.qsize(), 103 | ) 104 | 105 | def _get_batch(self, n=None) -> Records: 106 | """ 107 | Reads the next batch from the queue. 108 | 109 | :param n: the maximum number of items to batch (default is entire queue) 110 | :return: the items from the queue as Batch 111 | :raises StopWorker if the StopWorker.marker was retrieved from the queue 112 | """ 113 | q = self.source 114 | item = q.get() 115 | 116 | if item == StopWorker.marker: 117 | raise StopWorker() 118 | 119 | result = [item] # block until we have at least one item 120 | 121 | if not n: 122 | n = q.qsize() 123 | 124 | try: 125 | while len(result) <= n: 126 | item = q.get(block=False) 127 | 128 | if item == StopWorker.marker: 129 | raise StopWorker(result) 130 | 131 | result.append(item) 132 | except Empty: 133 | pass 134 | 135 | return result 136 | 137 | def _parse_retry_seconds(self, response: requests.Response) -> float: 138 | retry = response.headers.get("Retry-After") 139 | if retry: 140 | try: 141 | return float(retry) + 0.5 142 | except ValueError as e: 143 | LOG.error("error while parsing Retry-After value '%s': %s", retry, e) 144 | 145 | return self.default_retry_after 146 | 147 | def _retry_batch(self, batch, max_retries=10) -> Tuple[requests.Response, bool]: 148 | """ 149 | Tries to append the given batch to the datasource for max_retries amount of times. It 150 | only retries if the request was rate limited, and waits for a certain amount of time 151 | afterwards. 
152 | 153 | :param batch: a list of records to append to the datasource 154 | :param max_retries: max number of retries (defaults to 10) 155 | :return: a tuple with the last response and a boolean flag indicating whether the request 156 | was rate-limited 157 | """ 158 | limited = False 159 | response = None 160 | 161 | for _ in range(max_retries): 162 | response = self.destination.append(batch) 163 | 164 | if response.ok: 165 | return response, limited 166 | 167 | if response.status_code == 429: 168 | wait = self._parse_retry_seconds(response) 169 | limited = True 170 | LOG.debug( 171 | "rate limited by API, keeping %d records safe for %d seconds: %s", 172 | len(batch), 173 | wait, 174 | response.text, 175 | ) 176 | time.sleep(wait) 177 | continue 178 | 179 | LOG.warning( 180 | "unhandled error %d: %s while appending to datasource, dropping batch", 181 | response.status_code, 182 | response.text, 183 | ) 184 | return response, limited 185 | 186 | return response, limited 187 | 188 | def _do_next_batch(self) -> Tuple[Records, Optional[Exception]]: 189 | batch = self._get_batch() 190 | 191 | try: 192 | LOG.debug( 193 | "processing batch of size %d into datasource %s", 194 | len(batch), 195 | self.destination.name, 196 | ) 197 | 198 | response, limited = self._retry_batch(batch) 199 | 200 | if limited: 201 | # if the request was rate-limited, we'll try again after X-Ratelimit-Reset, or the 202 | # wait_after_rate_limit value if it is set 203 | try: 204 | if self.wait_after_rate_limit: 205 | wait = self.wait_after_rate_limit 206 | else: 207 | wait = float(response.headers.get("X-Ratelimit-Reset", 0)) 208 | 209 | LOG.info( 210 | "waiting %d second until rate-limit window resets before batching again", 211 | wait, 212 | ) 213 | time.sleep(wait) 214 | except ValueError: 215 | LOG.exception("error while parsing X-Ratelimit-Reset value") 216 | 217 | except Exception as e: 218 | return batch, e 219 | 220 | return batch, None 221 | -------------------------------------------------------------------------------- /verdin/api/query.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | from typing import Literal, Optional, TypedDict, Any 4 | 5 | import requests 6 | 7 | from verdin.api import ApiResponse 8 | from verdin.api.base import Api, ApiError 9 | 10 | QueryOutputFormat = Literal[ 11 | "CSV", 12 | "CSVWithNames", 13 | "JSON", 14 | "TSV", 15 | "TSVWithNames", 16 | "PrettyCompact", 17 | "JSONEachRow", 18 | "Parquet", 19 | "Prometheus", 20 | ] 21 | """See https://www.tinybird.co/docs/api-reference/query-api#id10 22 | 23 | +---------------+--------------------------------------------------+ 24 | | Format | Description | 25 | +===============|==================================================+ 26 | | CSV | CSV without header | 27 | +---------------+--------------------------------------------------+ 28 | | CSVWithNames | CSV with header | 29 | +---------------+--------------------------------------------------+ 30 | | JSON | JSON including data, statistics and schema info | 31 | +---------------+--------------------------------------------------+ 32 | | TSV | TSV without header | 33 | +---------------+--------------------------------------------------+ 34 | | TSVWithNames | TSV with header | 35 | +---------------+--------------------------------------------------+ 36 | | PrettyCompact | Formatted table | 37 | +---------------+--------------------------------------------------+ 38 | | JSONEachRow | Newline-delimited JSON values (NDJSON) | 
39 | +---------------+--------------------------------------------------+ 40 | | Parquet | Apache Parquet | 41 | +---------------+--------------------------------------------------+ 42 | | Prometheus | Prometheus text-based format | 43 | +---------------+--------------------------------------------------+ 44 | """ 45 | 46 | 47 | class QueryStatistics(TypedDict): 48 | bytes_read: int 49 | elapsed: float 50 | rows_read: int 51 | 52 | 53 | class QueryMetadata(TypedDict): 54 | name: str 55 | type: str 56 | 57 | 58 | QueryData = list[dict[str, Any]] 59 | 60 | 61 | class QueryResponse(ApiResponse): 62 | @property 63 | def data(self) -> QueryData: 64 | raise NotImplementedError 65 | 66 | 67 | class QueryJsonResponse(QueryResponse): 68 | @property 69 | def data(self) -> QueryData: 70 | """ 71 | Returns the data returned by the query, which is a list of dictionaries representing the records in rows. 72 | 73 | :return: List of records. 74 | """ 75 | return self.json.get("data", []) 76 | 77 | @property 78 | def meta(self) -> list[QueryMetadata]: 79 | """ 80 | Returns the QueryMetadata from the query, which includes attributes and their types. 81 | 82 | :return: The QueryMetadata 83 | """ 84 | return self.json.get("meta", []) 85 | 86 | @property 87 | def rows(self) -> int: 88 | """ 89 | Returns the number of rows returned by the query. 90 | 91 | :return: The number of rows returned by the query. 92 | """ 93 | return self.json.get("rows") 94 | 95 | @property 96 | def statistics(self) -> QueryStatistics: 97 | """ 98 | Returns the query statistics, which include the number of bytes read, the number of rows read, and the elapsed. 99 | :return: The QueryStatistics objects. 100 | """ 101 | return self.json.get("statistics", {}) 102 | 103 | @property 104 | def empty(self) -> bool: 105 | """ 106 | A property to check if the data in the result is empty. 107 | 108 | This property evaluates whether the "data" field within the "result" 109 | attribute is empty. 110 | 111 | :return: Returns True if the "data" field in "result" is missing or empty, 112 | otherwise returns False. 113 | """ 114 | return not self.json.get("data") 115 | 116 | 117 | class QueryNdjsonResponse(QueryResponse): 118 | @property 119 | def data(self) -> list[dict]: 120 | """Parses the CSV response body into a list of dictionaries.""" 121 | for line in self.text.splitlines(): 122 | json.loads(line) 123 | return [json.loads(line) for line in self.text.strip().splitlines()] 124 | 125 | 126 | class QueryCsvResponse(QueryResponse): 127 | def __init__(self, response: requests.Response, delimiter: str = ","): 128 | super().__init__(response) 129 | self.delimiter = delimiter 130 | 131 | @property 132 | def data(self) -> list[dict]: 133 | """Parses the CSV response body into a list of dictionaries.""" 134 | reader = csv.DictReader( 135 | self.text.splitlines(), 136 | delimiter=self.delimiter, 137 | ) 138 | return list(reader) 139 | 140 | 141 | class QueryApi(Api): 142 | """ 143 | The Query API allows you to query your Pipes and Data Sources inside Tinybird as if you were running SQL statements 144 | against a standard database. 145 | 146 | See https://www.tinybird.co/docs/api-reference/query-api. 
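    A short sketch (token, datasource name, and queries are placeholders)::

        api = QueryApi(token="p.mytoken")

        result = api.query("SELECT count() AS cnt FROM my_datasource")
        print(result.rows, result.data)  # JSON output format by default

        csv_result = api.query("SELECT * FROM my_datasource", format="CSVWithNames")
        print(csv_result.data)  # parsed into dicts via csv.DictReader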
147 | """ 148 | 149 | endpoint: str = "/v0/sql" 150 | 151 | session: requests.Session 152 | 153 | def __init__(self, token: str, host: str = None): 154 | super().__init__(token, host) 155 | 156 | self.session = requests.Session() 157 | if self.token: 158 | self.session.headers.update({"Authorization": f"Bearer {self.token}"}) 159 | 160 | def query( 161 | self, 162 | query: str, 163 | pipeline: str = None, 164 | parameters: dict[str, str] = None, 165 | output_format_json_quote_64bit_integers: bool = False, 166 | output_format_json_quote_denormals: bool = False, 167 | output_format_parquet_string_as_string: bool = False, 168 | format: QueryOutputFormat = "JSON", 169 | ) -> QueryResponse | QueryJsonResponse | QueryNdjsonResponse | QueryCsvResponse: 170 | """ 171 | Executes a SQL query using the engine. As a response, it gives you the query metadata, the resulting data and 172 | some performance statistics. 173 | 174 | The return type will depend on the desired ``format``. For the following formats, we return special response 175 | objects that contain the parsed data: 176 | * ``JSON``: ``QueryJsonResponse`` (default) 177 | * ``CSVWithNames``: QueryCsvResponse 178 | * ``TSVWithNames``: QueryCsvResponse 179 | * ``JSONEachRow``: ``QueryNdjsonResponse`` 180 | 181 | For all other formats, we return a generic ``QueryResponse`` object, that allows you to access the raw response 182 | body via ``response.text`` (str) or ``response.content`` (bytes). 183 | 184 | :param query: The SQL query 185 | :param pipeline: (Optional) The name of the pipe. It allows writing a query like 'SELECT * FROM _' where '_' is 186 | a placeholder for the 'pipeline' parameter 187 | :param parameters: Additional query parameters 188 | :param output_format_json_quote_64bit_integers: (Optional) Controls quoting of 64-bit or bigger integers (like 189 | UInt64 or Int128) when they are output in a JSON format. Such integers are enclosed in quotes by default. 190 | This behavior is compatible with most JavaScript implementations. Possible values: False — Integers are 191 | output without quotes. True — Integers are enclosed in quotes. Default value is False 192 | :param output_format_json_quote_denormals: (Optional) Controls representation of inf and nan on the UI instead 193 | of null e.g when dividing by 0 - inf and when there is no representation of a number in Javascript - nan. 194 | Default value is false 195 | :param output_format_parquet_string_as_string: (Optional) Use Parquet String type instead of Binary for String 196 | columns. Possible values: False - disabled, True - enabled. Default value is False 197 | :param format: Output format of the query results (defaults to JSON) 198 | :return: QueryResponse object containing the query results 199 | """ 200 | 201 | query = _sql_with_format(query, format) 202 | 203 | data: dict[str, str | int] = dict(parameters) if parameters else {} 204 | if query: 205 | data["q"] = query 206 | if pipeline: 207 | data["pipeline"] = pipeline 208 | if output_format_json_quote_64bit_integers: 209 | data["output_format_json_quote_64bit_integers"] = 1 210 | if output_format_json_quote_denormals: 211 | data["output_format_json_quote_denormals"] = 1 212 | if output_format_parquet_string_as_string: 213 | data["output_format_parquet_string_as_string"] = 1 214 | 215 | # if the query is too large, the web server (nginx) will respond with "414 Request-URI Too Large". it seems 216 | # this limit is around 8kb, so if it's too large, we use a POST request instead. 217 | qsize = 1 # include the "?" 
character 218 | for k, v in data.items(): 219 | qsize += len(k) + len(v) + 2 # include the ``&`` and ``=`` character 220 | 221 | if qsize > 8192 or parameters: 222 | response = self.session.request( 223 | method="POST", 224 | url=f"{self.host}{self.endpoint}", 225 | data=data, 226 | ) 227 | else: 228 | response = self.session.request( 229 | method="GET", 230 | url=f"{self.host}{self.endpoint}", 231 | params=data, 232 | ) 233 | 234 | if not response.ok: 235 | raise ApiError(response) 236 | 237 | # format-specific response objects 238 | if format == "JSON": 239 | return QueryJsonResponse(response) 240 | if format == "CSVWithNames": 241 | return QueryCsvResponse(response) 242 | if format == "TSVWithNames": 243 | return QueryCsvResponse(response, delimiter="\t") 244 | if format == "JSONEachRow": 245 | return QueryNdjsonResponse(response) 246 | 247 | return QueryResponse(response) 248 | 249 | 250 | def _sql_with_format(sql, output_format: Optional[QueryOutputFormat] = None) -> str: 251 | """ 252 | Returns a formatted SQL query with the given output format. If no output format is specified, the query is 253 | returned as is. 254 | 255 | :param output_format: The output format to use (suffixes ``FORMAT `` to the query) 256 | :return: An SQL string 257 | """ 258 | # TODO: handle potentially already existing FORMAT string 259 | if not output_format: 260 | return sql 261 | return sql + f" FORMAT {output_format}" 262 | -------------------------------------------------------------------------------- /verdin/api/pipes.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | from typing import TypedDict, Literal 4 | 5 | import requests 6 | 7 | from .base import Api, ApiResponse, ApiError 8 | 9 | PipeOutputFormat = Literal["csv", "json", "ndjson", "parquet", "prometheus"] 10 | 11 | 12 | class PipeNotFoundError(ApiError): 13 | """ 14 | Specific ApiError representing a 404 Not Found when pipe names are given. 15 | """ 16 | 17 | 18 | class PipeNode(TypedDict): 19 | id: str 20 | name: str 21 | sql: str 22 | deployment_suffix: str | None 23 | description: str | None 24 | materialized: bool | None 25 | cluster: str | None 26 | tags: dict 27 | created_at: str 28 | updated_at: str 29 | version: int 30 | project: str | None 31 | result: str | None 32 | ignore_sql_errors: bool 33 | node_type: str 34 | dependencies: list[str] | None 35 | params: list | None 36 | 37 | 38 | class PipeListInfo(TypedDict): 39 | id: str 40 | name: str 41 | description: str 42 | endpoint: str 43 | created_at: str 44 | updated_at: str 45 | parent: str | None 46 | nodes: list[PipeNode] 47 | url: str 48 | 49 | 50 | class PipeInfo(TypedDict): 51 | """ 52 | A document returned by the pipe information endpoint. 
53 | 
54 |         {
55 |             "content": "VERSION 0\n\nDESCRIPTION >\n    Endpoint to select unique ...",
56 |             "created_at": "2025-12-17 13:18:09.799374",
57 |             "description": "Endpoint to select unique key/value pairs from simple",
58 |             "edited_by": null,
59 |             "endpoint": "t_54dffae578ef47238fd51e9849f79a1f",
60 |             "id": "t_c50152ced57b46b99acf14930b9c6906",
61 |             "last_commit": {
62 |                 "content_sha": "",
63 |                 "path": "",
64 |                 "status": "None"
65 |             },
66 |             "name": "simple_kv",
67 |             "nodes": [
68 |                 {
69 |                     "cluster": null,
70 |                     "created_at": "2025-12-17 13:18:09.799385",
71 |                     "dependencies": [
72 |                         "simple"
73 |                     ],
74 |                     "deployment_suffix": "",
75 |                     "description": null,
76 |                     "id": "t_54dffae578ef47238fd51e9849f79a1f",
77 |                     "ignore_sql_errors": false,
78 |                     "materialized": null,
79 |                     "name": "endpoint",
80 |                     "node_type": "endpoint",
81 |                     "params": [],
82 |                     "project": null,
83 |                     "result": null,
84 |                     "sql": "%\n    SELECT key, value\n    FROM simple\n    ...",
85 |                     "tags": {},
86 |                     "updated_at": "2025-12-17 13:18:09.799385",
87 |                     "version": 0
88 |                 }
89 |             ],
90 |             "parent": null,
91 |             "path": "endpoints/simple_kv.pipe",
92 |             "type": "endpoint",
93 |             "updated_at": "2025-12-17 13:18:09.799394",
94 |             "url": "http://localhost:8001/v0/pipes/simple_kv.json",
95 |             "workspace_id": "2244743a-d384-478f-a9f5-ea4848c56427"
96 |         }
97 |     """
98 | 
99 |     content: str
100 |     created_at: str
101 |     description: str
102 |     edited_by: str | None
103 |     endpoint: str
104 |     id: str
105 |     last_commit: dict
106 |     name: str
107 |     nodes: list[PipeNode]
108 |     parent: str | None
109 |     path: str
110 |     type: str
111 |     updated_at: str
112 |     url: str
113 |     workspace_id: str
114 | 
115 | 
116 | class ListPipesResponse(ApiResponse):
117 |     @property
118 |     def pipes(self) -> list[PipeListInfo]:
119 |         return self.json.get("pipes", [])
120 | 
121 | 
122 | class GetPipeInformationResponse(ApiResponse):
123 |     @property
124 |     def info(self) -> PipeInfo:
125 |         return self.json
126 | 
127 | 
128 | class QueryPipeResponse(ApiResponse):
129 |     @property
130 |     def data(self) -> list[dict]:
131 |         raise NotImplementedError
132 | 
133 | 
134 | class QueryPipeJsonResponse(QueryPipeResponse):
135 |     @property
136 |     def data(self) -> list[dict]:
137 |         return self.json.get("data", [])
138 | 
139 |     @property
140 |     def meta(self) -> list[dict]:
141 |         return self.json.get("meta", [])
142 | 
143 |     @property
144 |     def rows(self) -> int:
145 |         return self.json.get("rows")
146 | 
147 |     @property
148 |     def statistics(self) -> dict:
149 |         return self.json.get("statistics", {})
150 | 
151 | 
152 | class QueryPipeNdjsonResponse(QueryPipeResponse):
153 |     @property
154 |     def data(self) -> list[dict]:
155 |         """Parses the NDJSON response body into a list of dictionaries."""
156 |         return [
157 |             json.loads(line)
158 |             for line in self.text.strip().splitlines()
159 |         ]
160 | 
161 | 
162 | class QueryPipeCsvResponse(QueryPipeResponse):
163 |     @property
164 |     def data(self) -> list[dict]:
165 |         """Parses the CSV response body into a list of dictionaries."""
166 |         reader = csv.DictReader(self.text.splitlines())
167 |         return list(reader)
168 | 
169 | 
170 | class PipesApi(Api):
171 |     """
172 |     Pipes API.
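    A short usage sketch (illustrative; the token is a placeholder and ``simple_kv`` is the sample pipe from the
    integration-test project)::

        pipes = PipesApi(token="p.ey...")
        result = pipes.query("simple_kv")  # defaults to format="json"
        rows = result.data  # parsed list of dict records
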
See https://www.tinybird.co/docs/api-reference/pipe-api 173 | 174 | TODO: missing APIs to implement 175 | * Creating a new pipe (POST /v0/pipes) 176 | * Append a node to a pipe (POST /v0/pipes/:name/nodes) 177 | * Delete a node from a pipe (DELETE /v0/pipes/:name/nodes/:node_id) 178 | * Update a node in a pipe (PUT /v0/pipes/:name/nodes/:node_id) 179 | * Delete a pipe (DELETE /v0/pipes/:name) 180 | * Change a pipe's metadata (PUT /v0/pipes/:name) 181 | * Explain a pipe (GET /v0/pipes/:name/explain) 182 | """ 183 | 184 | endpoint: str = "/v0/pipes" 185 | 186 | session: requests.Session 187 | 188 | def __init__(self, token: str, host: str = None): 189 | super().__init__(token, host) 190 | 191 | self.session = requests.Session() 192 | if self.token: 193 | self.session.headers.update({"Authorization": f"Bearer {self.token}"}) 194 | 195 | def list( 196 | self, 197 | dependencies: bool = False, 198 | attrs: list[str] = None, 199 | node_attrs: list[str] = None, 200 | ) -> ListPipesResponse: 201 | """ 202 | Get a list of pipes in your account. Makes a GET request to ``/v0/pipes`` endpoint, which returns a list of 203 | pipes. 204 | 205 | :param dependencies: Include dependent data sources and pipes, default is false 206 | :param attrs: List of pipe attributes to return (e.g. '["name","description"]') 207 | :param node_attrs: List of node attributes to return (e.g. '["id","name"]') 208 | :return: A ``ListPipesResponse`` object 209 | """ 210 | params = {} 211 | if dependencies: 212 | params["dependencies"] = "true" 213 | if attrs: 214 | params["attrs"] = ",".join(attrs) 215 | if node_attrs: 216 | params["node_attrs"] = ",".join(node_attrs) 217 | 218 | response = self.session.request( 219 | method="GET", 220 | url=f"{self.host}{self.endpoint}", 221 | params=params, 222 | ) 223 | 224 | if not response.ok: 225 | raise ApiError(response) 226 | 227 | return ListPipesResponse(response) 228 | 229 | def query( 230 | self, 231 | name: str, 232 | query: str = None, 233 | parameters: dict[str, str] = None, 234 | format: PipeOutputFormat = "json", 235 | ) -> QueryPipeResponse | QueryPipeJsonResponse | QueryPipeNdjsonResponse | QueryPipeCsvResponse: 236 | """ 237 | Query the Pipe. Makes a GET request to ``/v0/pipes/.`` endpoint, which returns the query result. 238 | The return value depends on the format parameter. Currently, parquet and prometheus formats are only supported 239 | as raw outputs. For all others you can call ``response.data`` and receive a list of dictionary records. 240 | 241 | When using an additional SQL query (through the ``query`` parameter) for the Pipe, you can use the 242 | ``_`` shortcut, which refers to your Pipe name. You can pass both ``parameters`` and ``query``. 243 | 244 | :param name: The name of the pipe to query. 245 | :param query: Optional query to execute against the pipe. 246 | :param parameters: The dynamic parameters passed to the pipe. 247 | :param format: The output format (default: json). 248 | :return: A ``QueryPipeResponse`` object that is specific to the output format. 249 | """ 250 | 251 | params = dict(parameters) if parameters else {} 252 | if query: 253 | params["q"] = query 254 | 255 | # if the query is too large, the web server (nginx) will respond with "414 Request-URI Too Large". it seems 256 | # this limit is around 8kb, so if it's too large, we use a POST request instead. 257 | qsize = 1 # include the "?" 
character 258 | for k, v in params.items(): 259 | qsize += len(k) + len(v) + 2 # include the ``&`` and ``=`` character 260 | 261 | if qsize > 8192: 262 | response = self.session.request( 263 | method="POST", 264 | url=f"{self.host}{self.endpoint}/{name}.{format}", 265 | data=params, 266 | ) 267 | else: 268 | response = self.session.request( 269 | method="GET", 270 | url=f"{self.host}{self.endpoint}/{name}.{format}", 271 | params=params, 272 | ) 273 | 274 | if response.status_code == 404: 275 | raise PipeNotFoundError(response) 276 | 277 | if not response.ok: 278 | raise ApiError(response) 279 | 280 | # format-specific response objects 281 | if format == "json": 282 | return QueryPipeJsonResponse(response) 283 | if format == "ndjson": 284 | return QueryPipeNdjsonResponse(response) 285 | if format == "csv": 286 | return QueryPipeCsvResponse(response) 287 | 288 | # prometheus and parquet formats are currently only supported as raw outputs 289 | 290 | return QueryPipeResponse(response) 291 | 292 | def get_information(self, name: str) -> GetPipeInformationResponse: 293 | """ 294 | Makes a GET request to ``/v0/pipes/`` endpoint, which returns the pipe information. 295 | See: https://www.tinybird.co/docs/api-reference/pipe-api#get--v0-pipes-(.+\.pipe) 296 | 297 | :param name: The name or ID of the pipe. 298 | :return: A ``GetPipeInformationResponse`` object 299 | """ 300 | response = self.session.request( 301 | method="GET", 302 | url=f"{self.host}{self.endpoint}/{name}", 303 | ) 304 | 305 | if response.status_code == 404: 306 | raise PipeNotFoundError(response) 307 | 308 | if not response.ok: 309 | raise ApiError(response) 310 | 311 | return GetPipeInformationResponse(response) 312 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /verdin/api/datasources.py: -------------------------------------------------------------------------------- 1 | from typing import Literal, Iterable, TypedDict 2 | 3 | import requests 4 | 5 | from .base import Api, ApiError, ApiResponse 6 | 7 | 8 | class DataSourceNotFoundError(ApiError): 9 | """ 10 | Specific ApiError representing a 404 Not Found when database names are given. 11 | """ 12 | 13 | 14 | class DataSourceInfo(TypedDict): 15 | """ 16 | A data source info object. 
Example:: 17 | 18 | { 19 | "cluster": "tinybird", 20 | "created_at": "2025-12-17 13:18:09.799040", 21 | "description": "Simple Key-Value Data Source", 22 | "engine": { 23 | "engine": "MergeTree", 24 | "engine_full": "MergeTree ORDER BY tuple()", 25 | "sorting_key": "tuple()" 26 | }, 27 | "errors_discarded_at": null, 28 | "headers": {}, 29 | "id": "t_e1ea6e1e32004989af509b034b0987c1", 30 | "indexes": [], 31 | "last_commit": { 32 | "content_sha": "", 33 | "path": "", 34 | "status": "ok" 35 | }, 36 | "name": "simple", 37 | "new_columns_detected": false, 38 | "project": null, 39 | "replicated": false, 40 | "schema": { 41 | "columns": [ 42 | { 43 | "auto": false, 44 | "codec": null, 45 | "default_value": null, 46 | "jsonpath": "$.Id", 47 | "name": "id", 48 | "normalized_name": "id", 49 | "nullable": false, 50 | "type": "UUID" 51 | }, 52 | { 53 | "auto": false, 54 | "codec": null, 55 | "default_value": null, 56 | "jsonpath": "$.Timestamp", 57 | "name": "timestamp", 58 | "normalized_name": "timestamp", 59 | "nullable": false, 60 | "type": "DateTime64(6)" 61 | }, 62 | { 63 | "auto": false, 64 | "codec": null, 65 | "default_value": null, 66 | "jsonpath": "$.Key", 67 | "name": "key", 68 | "normalized_name": "key", 69 | "nullable": false, 70 | "type": "String" 71 | }, 72 | { 73 | "auto": false, 74 | "codec": null, 75 | "default_value": null, 76 | "jsonpath": "$.Value", 77 | "name": "value", 78 | "normalized_name": "value", 79 | "nullable": false, 80 | "type": "String" 81 | } 82 | ], 83 | "sql_schema": "`id` UUID `json:$.Id`, `timestamp` DateTime64(6) `json:$.Timestamp`, `key` String `json:$.Key`, `value` String `json:$.Value`" 84 | }, 85 | "shared_with": [], 86 | "statistics": { 87 | "bytes": 0, 88 | "row_count": 0 89 | }, 90 | "tags": {}, 91 | "type": "ndjson", 92 | "updated_at": "2025-12-17 13:18:09.799040", 93 | "used_by": [ 94 | { 95 | "id": "t_c50152ced57b46b99acf14930b9c6906", 96 | "name": "simple_kv" 97 | } 98 | ], 99 | "version": 0 100 | } 101 | """ 102 | 103 | cluster: str 104 | created_at: str 105 | description: str 106 | engine: dict 107 | errors_discarded_at: str | None 108 | headers: dict 109 | id: str 110 | indexes: list 111 | last_commit: dict 112 | name: str 113 | new_columns_detected: bool 114 | project: str | None 115 | replicated: bool 116 | schema: dict 117 | shared_with: list 118 | statistics: dict 119 | tags: dict 120 | type: str 121 | updated_at: str 122 | used_by: list[dict] 123 | version: int 124 | 125 | 126 | class DataSourceAppendInfo(TypedDict): 127 | """Information about a data source returned when appending to the data source.""" 128 | 129 | cluster: str 130 | created_at: str 131 | description: str 132 | engine: dict # TODO: {'engine': 'MergeTree', 'sorting_key': 'tuple()'} 133 | errors_discarded_at: str | None 134 | headers: dict 135 | id: str 136 | last_commit: dict # TODO: {'content_sha': '', 'path': '', 'status': 'ok'} 137 | name: str 138 | project: str | None 139 | replicated: bool 140 | shared_with: list # TODO 141 | tags: dict 142 | type: str 143 | updated_at: str 144 | used_by: list # TODO 145 | version: int 146 | 147 | 148 | class ListDataSourcesResponse(ApiResponse): 149 | @property 150 | def datasources(self) -> list[DataSourceInfo]: 151 | return self.json.get("datasources", []) 152 | 153 | 154 | class AppendDataResponse(ApiResponse): 155 | @property 156 | def datasource(self) -> DataSourceAppendInfo: 157 | return self.json.get("datasource", {}) 158 | 159 | @property 160 | def import_id(self) -> str: 161 | return self.json.get("import_id") 162 | 163 | 
@property 164 | def invalid_lines(self) -> int: 165 | return self.json.get("invalid_lines") 166 | 167 | @property 168 | def quarantine_rows(self) -> int: 169 | return self.json.get("quarantine_rows") 170 | 171 | @property 172 | def error(self) -> str | None: 173 | error = self.json.get("error") 174 | if not error: 175 | return None 176 | return error 177 | 178 | 179 | class GetDataSourceInformationResponse(ApiResponse): 180 | @property 181 | def info(self) -> DataSourceInfo: 182 | """ 183 | Returns the data source information. 184 | 185 | Example:: 186 | 187 | { 188 | "id": "t_bd1c62b5e67142bd9bf9a7f113a2b6ea", 189 | "name": "datasource_name", 190 | "statistics": { 191 | "bytes": 430833, 192 | "row_count": 3980 193 | }, 194 | "used_by": [{ 195 | "id": "t_efdc62b5e67142bd9bf9a7f113a34353", 196 | "name": "pipe_using_datasource_name" 197 | }] 198 | "updated_at": "2018-09-07 23:50:32.322461", 199 | "created_at": "2018-11-28 23:50:32.322461", 200 | "type": "csv" 201 | } 202 | 203 | """ 204 | return self.json 205 | 206 | 207 | class DataSourcesApi(Api): 208 | """ 209 | ``/v0/datasources`` API client. 210 | 211 | TODO: missing APIs: 212 | * Creating data sources (POST /v0/datasources with mode=create) 213 | * Replacing data sources (POST /v0/datasources with mode=replace) 214 | * Alter data source (POST /v0/datasources/:name/alter) 215 | * Delete data (POST /v0/datasources/:name/delete) 216 | * Drop data source (DELETE /v0/datasources/:name) 217 | * Update data source attributes (PUT /v0/datasources/:name) 218 | """ 219 | 220 | endpoint: str = "/v0/datasources" 221 | 222 | session: requests.Session 223 | 224 | def __init__(self, token: str, host: str = None): 225 | super().__init__(token, host) 226 | 227 | self.session = requests.Session() 228 | if self.token: 229 | self.session.headers.update({"Authorization": f"Bearer {self.token}"}) 230 | 231 | def append( 232 | self, 233 | name: str, 234 | data: str | bytes | Iterable[bytes] | Iterable[str], 235 | dialect_delimiter: str = None, 236 | dialect_new_line: str = None, 237 | dialect_escapechar: str = None, 238 | progress: bool = False, 239 | format: Literal["csv", "ndjson", "parquet"] = None, 240 | ) -> AppendDataResponse: 241 | """ 242 | Makes a POST request to ``/v0/datasources`` endpoint with mode=append, which appends data to the datasource. 243 | 244 | The data is expected to already be encoded in the format specified by the format parameter. You can pass 245 | generators or other iterables as data. For example:: 246 | 247 | records = [...] # some list of dicts 248 | 249 | def _data(): 250 | # creates an NDJSON stream 251 | for r in records: 252 | yield json.dumps(r) + "\\n" 253 | 254 | response = ds.append("my_table", _data(), format="ndjson") 255 | 256 | :param name: Name of the data source to append data to. 257 | :param data: Data to append. 258 | :param dialect_delimiter: The one-character string separating the fields. We try to guess the delimiter based 259 | on the CSV contents using some statistics, but sometimes we fail to identify the correct one. If you know 260 | your CSV’s field delimiter, you can use this parameter to explicitly define it. 261 | :param dialect_new_line: The one- or two-character string separating the records. We try to guess the delimiter 262 | based on the CSV contents using some statistics, but sometimes we fail to identify the correct one. If you 263 | know your CSV’s record delimiter, you can use this parameter to explicitly define it. 
264 | :param dialect_escapechar: The escapechar removes any special meaning from the following character. This is 265 | useful if the CSV does not use double quotes to encapsulate a column but uses double quotes in the content 266 | of a column and it is escaped with, e.g. a backslash. 267 | :param progress: When using true and sending the data in the request body, Tinybird will return block status 268 | while loading using Line-delimited JSON. TODO: currently not supported 269 | :param format: Default: csv. Indicates the format of the data to be ingested in the Data Source. By default is 270 | csv and you should specify format=ndjson for NDJSON format, and format=parquet for Parquet files. 271 | :return: A ``AppendDataResponse`` object. 272 | """ 273 | if progress: 274 | raise NotImplementedError 275 | 276 | params = { 277 | "mode": "append", 278 | "name": name, 279 | } 280 | 281 | if dialect_delimiter: 282 | params["dialect_delimiter"] = dialect_delimiter 283 | if dialect_new_line: 284 | params["dialect_new_line"] = dialect_new_line 285 | if dialect_escapechar: 286 | params["dialect_escapechar"] = dialect_escapechar 287 | if format: 288 | params["format"] = format 289 | 290 | headers = {} 291 | if format == "csv": 292 | headers["Content-Type"] = "text/html; charset=utf-8" 293 | if format == "ndjson": 294 | headers["Content-Type"] = "application/x-ndjson; charset=utf-8" 295 | 296 | response = self.session.request( 297 | method="POST", 298 | url=f"{self.host}{self.endpoint}", 299 | params=params, 300 | headers=headers, 301 | data=data, 302 | ) 303 | 304 | if not response.ok: 305 | raise ApiError(response) 306 | 307 | return AppendDataResponse(response) 308 | 309 | def list(self) -> ListDataSourcesResponse: 310 | """ 311 | Makes a GET request to ``/v0/datasources`` endpoint, which returns a list of datasources. 312 | 313 | :return: A ``ListDataSourcesResponse`` object 314 | """ 315 | response = self.session.request( 316 | method="GET", 317 | url=f"{self.host}{self.endpoint}", 318 | ) 319 | 320 | if not response.ok: 321 | raise ApiError(response) 322 | 323 | return ListDataSourcesResponse(response) 324 | 325 | def get_information(self, name: str) -> GetDataSourceInformationResponse: 326 | """ 327 | Makes a GET request to ``/v0/datasources/`` endpoint, which returns information about the datasource. 328 | 329 | :param name: The name of the datasource to get information about. 330 | :return: A ``GetDataSourceInformationResponse`` 331 | """ 332 | response = self.session.request( 333 | method="GET", 334 | url=f"{self.host}{self.endpoint}/{name}", 335 | ) 336 | 337 | if response.status_code == 404: 338 | raise DataSourceNotFoundError(response) 339 | 340 | if not response.ok: 341 | raise ApiError(response) 342 | 343 | return GetDataSourceInformationResponse(response) 344 | 345 | def truncate(self, name: str): 346 | """ 347 | Makes a POST request to ``/v0/datasources/:name/truncate``, which truncates the datasource. 348 | 349 | :param name: The name of the datasource to truncate. 
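
        Example (illustrative; the token is a placeholder and ``simple`` is the sample datasource from the
        integration-test project)::

            ds = DataSourcesApi(token="p.ey...")
            ds.truncate("simple")  # removes all rows; raises DataSourceNotFoundError if the datasource does not exist
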
350 | """ 351 | response = self.session.request( 352 | method="POST", 353 | url=f"{self.host}{self.endpoint}/{name}/truncate", 354 | ) 355 | 356 | if response.status_code == 404: 357 | raise DataSourceNotFoundError(response) 358 | 359 | if not response.ok: 360 | raise ApiError(response) 361 | -------------------------------------------------------------------------------- /tests/integration/project/CLAUDE.md: -------------------------------------------------------------------------------- 1 | 2 | # Tinybird CLI rules 3 | 4 | ## Commands 5 | You have commands at your disposal to develop a tinybird project: 6 | - tb build: to build the project locally and check it works. 7 | - tb deployment create --wait --auto: to create a deployment and promote it automatically 8 | - tb test run: to run existing tests 9 | - tb endpoint url : to get the url of an endpoint, token included. 10 | - tb endpoint data : to get the data of an endpoint. You can pass parameters to the endpoint like this: tb endpoint data --param1 value1 --param2 value2 11 | - tb token ls: to list all the tokens 12 | There are other commands that you can use, but these are the most common ones. Run `tb -h` to see all the commands if needed. 13 | When you need to work with resources or data in cloud, add always the --cloud flag before the command. Example: tb --cloud datasource ls 14 | 15 | ## Development instructions 16 | - When asking to create a tinybird data project, if the needed folders are not already created, use the following structure: 17 | ├── connections 18 | ├── copies 19 | ├── sinks 20 | ├── datasources 21 | ├── endpoints 22 | ├── fixtures 23 | ├── materializations 24 | ├── pipes 25 | └── tests 26 | - The local development server will be available at http://localhost:7181. Even if some response uses another base url, use always http://localhost:7181. 27 | - After every change in your .datasource, .pipe or .ndjson files, run `tb build` to build the project locally. 28 | - When you need to ingest data locally in a datasource, create a .ndjson file with the same name of the datasource and the data you want and run `tb build` so the data is ingested. 29 | - The format of the generated api endpoint urls is: http://localhost:7181/v0/pipe/.json?token= 30 | - Before running the tests, remember to have the project built with `tb build` with the latest changes. 31 | 32 | When asking for ingesting data, adding data or appending data do the following depending on the environment you want to work with: 33 | 34 | ## Ingestion instructions 35 | - When building locally, create a .ndjson file with the data you want to ingest and do `tb build` to ingest the data in the build env. 36 | - We call `cloud` the production environment. 37 | - When appending data in cloud, use `tb --cloud datasource append ` 38 | - When you have a response that says “there are rows in quarantine”, do `tb [--cloud] datasource data _quarantine` to understand what is the problem. 39 | 40 | ## .datasource file instructions 41 | Follow these instructions when creating or updating .datasource files: 42 | 43 | 44 | - Content cannot be empty. 45 | - The datasource names must be unique. 46 | - No indentation is allowed for property names: DESCRIPTION, SCHEMA, ENGINE, ENGINE_PARTITION_KEY, ENGINE_SORTING_KEY, etc. 47 | - Use MergeTree engine by default. 48 | - Use AggregatingMergeTree engine when the datasource is the target of a materialized pipe. 49 | - Use always json paths to define the schema. 
Example: `user_id` String `json:$.user_id`, 50 | - Array columns are supported with a special syntax. Example: `items` Array(String) `json:$.items[:]` 51 | - If the datasource is using an S3 or GCS connection, they need to set IMPORT_CONNECTION_NAME, IMPORT_BUCKET_URI and IMPORT_SCHEDULE (GCS @on-demand only, S3 supports @auto too) 52 | - If the datasource is using a Kafka connection, they need to set KAFKA_CONNECTION_NAME as the name of the .connection file, KAFKA_TOPIC topic_name and KAFKA_GROUP_ID as the group id for the datasource 53 | - Unless the user asks for them, do not include ENGINE_PARTITION_KEY and ENGINE_PRIMARY_KEY. 54 | - DateTime64 type without precision is not supported. Use DateTime64(3) instead. 55 | 56 | 57 | 58 | ## .pipe file instructions 59 | Follow these instructions when creating or updating .pipe files: 60 | 61 | Follow these instructions when creating or updating any type of .pipe file: 62 | 63 | - The pipe names must be unique. 64 | - Nodes do NOT use the same name as the Pipe they belong to. So if the pipe name is "my_pipe", the nodes must be named different like "my_pipe_node_1", "my_pipe_node_2", etc. 65 | - Node names MUST be different from the resource names in the project. 66 | - No indentation is allowed for property names: DESCRIPTION, NODE, SQL, TYPE, etc. 67 | - Allowed TYPE values are: endpoint, copy, materialized, sink. 68 | - Add always the output node in the TYPE section or in the last node of the pipe. 69 | 70 | 71 | 72 | 73 | - The SQL query must be a valid ClickHouse SQL query that mixes ClickHouse syntax and Tinybird templating syntax (Tornado templating language under the hood). 74 | - SQL queries with parameters must start with "%" character and a newline on top of every query to be able to use the parameters. Examples: 75 | 76 | SELECT * FROM events WHERE session_id={{String(my_param, "default_value")}} 77 | 78 | 79 | % 80 | SELECT * FROM events WHERE session_id={{String(my_param, "default_value")}} 81 | 82 | - The Parameter functions like this one {{String(my_param_name,default_value)}} can be one of the following: String, DateTime, Date, Float32, Float64, Int, Integer, UInt8, UInt16, UInt32, UInt64, UInt128, UInt256, Int8, Int16, Int32, Int64, Int128, Int256 83 | - Parameter names must be different from column names. Pass always the param name and a default value to the function. 84 | - Use ALWAYS hardcoded values for default values for parameters. 85 | - Code inside the template {{template_expression}} follows the rules of Tornado templating language so no module is allowed to be imported. So for example you can't use now() as default value for a DateTime parameter. You need an if else block like this: 86 | 87 | AND timestamp BETWEEN {DateTime(start_date, now() - interval 30 day)} AND {DateTime(end_date, now())} 88 | 89 | 90 | {%if not defined(start_date)%} 91 | timestamp BETWEEN now() - interval 30 day 92 | {%else%} 93 | timestamp BETWEEN {{DateTime(start_date)}} 94 | {%end%} 95 | {%if not defined(end_date)%} 96 | AND now() 97 | {%else%} 98 | AND {{DateTime(end_date)}} 99 | {%end%} 100 | 101 | - Parameters must not be quoted. 102 | - When you use defined function with a paremeter inside, do NOT add quotes around the parameter: 103 | {% if defined('my_param') %} 104 | {% if defined(my_param) %} 105 | - Use datasource names as table names when doing SELECT statements. 106 | - Do not use pipe names as table names. 
107 | - The available datasource names to use in the SQL are the ones present in the existing_resources section or the ones you will create. 108 | - Use node names as table names only when nodes are present in the same file. 109 | - Do not reference the current node name in the SQL. 110 | - SQL queries only accept SELECT statements with conditions, aggregations, joins, etc. 111 | - Do NOT use CREATE TABLE, INSERT INTO, CREATE DATABASE, etc. 112 | - Use ONLY SELECT statements in the SQL section. 113 | - INSERT INTO is not supported in SQL section. 114 | - ClickHouse functions supported are: 115 | - General functions supported are: ['BLAKE3', 'CAST', 'CHARACTER_LENGTH', 'CHAR_LENGTH', 'CRC32', 'CRC32IEEE', 'CRC64', 'DATABASE', 'DATE', 'DATE_DIFF', 'DATE_FORMAT', 'DATE_TRUNC', 'DAY', 'DAYOFMONTH', 'DAYOFWEEK', 'DAYOFYEAR', 'FORMAT_BYTES', 'FQDN', 'FROM_BASE64', 'FROM_DAYS', 'FROM_UNIXTIME', 'HOUR', 'INET6_ATON', 'INET6_NTOA', 'INET_ATON', 'INET_NTOA', 'IPv4CIDRToRange', 'IPv4NumToString', 'IPv4NumToStringClassC', 'IPv4StringToNum', 'IPv4StringToNumOrDefault', 'IPv4StringToNumOrNull', 'IPv4ToIPv6', 'IPv6CIDRToRange', 'IPv6NumToString', 'IPv6StringToNum', 'IPv6StringToNumOrDefault', 'IPv6StringToNumOrNull', 'JSONArrayLength', 'JSONExtract', 'JSONExtractArrayRaw', 'JSONExtractBool', 'JSONExtractFloat', 'JSONExtractInt', 'JSONExtractKeys', 'JSONExtractKeysAndValues', 'JSONExtractKeysAndValuesRaw', 'JSONExtractRaw', 'JSONExtractString', 'JSONExtractUInt', 'JSONHas', 'JSONKey', 'JSONLength', 'JSONRemoveDynamoDBAnnotations', 'JSONType', 'JSON_ARRAY_LENGTH', 'JSON_EXISTS', 'JSON_QUERY', 'JSON_VALUE', 'L1Distance', 'L1Norm', 'L1Normalize', 'L2Distance', 'L2Norm', 'L2Normalize', 'L2SquaredDistance', 'L2SquaredNorm', 'LAST_DAY', 'LinfDistance', 'LinfNorm', 'LinfNormalize', 'LpDistance', 'LpNorm', 'LpNormalize', 'MACNumToString', 'MACStringToNum', 'MACStringToOUI', 'MAP_FROM_ARRAYS', 'MD4', 'MD5', 'MILLISECOND', 'MINUTE', 'MONTH', 'OCTET_LENGTH', 'QUARTER', 'REGEXP_EXTRACT', 'REGEXP_MATCHES', 'REGEXP_REPLACE', 'SCHEMA', 'SECOND', 'SHA1', 'SHA224', 'SHA256', 'SHA384', 'SHA512', 'SHA512_256', 'SUBSTRING_INDEX', 'SVG', 'TIMESTAMP_DIFF', 'TO_BASE64', 'TO_DAYS', 'TO_UNIXTIME', 'ULIDStringToDateTime', 'URLHash', 'URLHierarchy', 'URLPathHierarchy', 'UTCTimestamp', 'UTC_timestamp', 'UUIDNumToString', 'UUIDStringToNum', 'UUIDToNum', 'UUIDv7ToDateTime', 'YEAR', 'YYYYMMDDToDate', 'YYYYMMDDToDate32', 'YYYYMMDDhhmmssToDateTime', 'YYYYMMDDhhmmssToDateTime64'] 116 | - Character insensitive functions supported are: ['cast', 'character_length', 'char_length', 'crc32', 'crc32ieee', 'crc64', 'database', 'date', 'date_format', 'date_trunc', 'day', 'dayofmonth', 'dayofweek', 'dayofyear', 'format_bytes', 'fqdn', 'from_base64', 'from_days', 'from_unixtime', 'hour', 'inet6_aton', 'inet6_ntoa', 'inet_aton', 'inet_ntoa', 'json_array_length', 'last_day', 'millisecond', 'minute', 'month', 'octet_length', 'quarter', 'regexp_extract', 'regexp_matches', 'regexp_replace', 'schema', 'second', 'substring_index', 'to_base64', 'to_days', 'to_unixtime', 'utctimestamp', 'utc_timestamp', 'year'] 117 | - Aggregate functions supported are: ['BIT_AND', 'BIT_OR', 'BIT_XOR', 'COVAR_POP', 'COVAR_SAMP', 'STD', 'STDDEV_POP', 'STDDEV_SAMP', 'VAR_POP', 'VAR_SAMP', 'aggThrow', 'analysisOfVariance', 'anova', 'any', 'anyHeavy', 'anyLast', 'anyLast_respect_nulls', 'any_respect_nulls', 'any_value', 'any_value_respect_nulls', 'approx_top_count', 'approx_top_k', 'approx_top_sum', 'argMax', 'argMin', 'array_agg', 'array_concat_agg', 'avg', 'avgWeighted', 
'boundingRatio', 'categoricalInformationValue', 'contingency', 'corr', 'corrMatrix', 'corrStable', 'count', 'covarPop', 'covarPopMatrix', 'covarPopStable', 'covarSamp', 'covarSampMatrix', 'covarSampStable', 'cramersV', 'cramersVBiasCorrected', 'deltaSum', 'deltaSumTimestamp', 'dense_rank', 'entropy', 'exponentialMovingAverage', 'exponentialTimeDecayedAvg', 'exponentialTimeDecayedCount', 'exponentialTimeDecayedMax', 'exponentialTimeDecayedSum', 'first_value', 'first_value_respect_nulls', 'flameGraph', 'groupArray', 'groupArrayInsertAt', 'groupArrayIntersect', 'groupArrayLast', 'groupArrayMovingAvg', 'groupArrayMovingSum', 'groupArraySample', 'groupArraySorted', 'groupBitAnd', 'groupBitOr', 'groupBitXor', 'groupBitmap', 'groupBitmapAnd', 'groupBitmapOr', 'groupBitmapXor', 'groupUniqArray', 'histogram', 'intervalLengthSum', 'kolmogorovSmirnovTest', 'kurtPop', 'kurtSamp', 'lagInFrame', 'largestTriangleThreeBuckets', 'last_value', 'last_value_respect_nulls', 'leadInFrame', 'lttb', 'mannWhitneyUTest', 'max', 'maxIntersections', 'maxIntersectionsPosition', 'maxMappedArrays', 'meanZTest', 'median', 'medianBFloat16', 'medianBFloat16Weighted', 'medianDD', 'medianDeterministic', 'medianExact', 'medianExactHigh', 'medianExactLow', 'medianExactWeighted', 'medianGK', 'medianInterpolatedWeighted', 'medianTDigest', 'medianTDigestWeighted', 'medianTiming', 'medianTimingWeighted', 'min', 'minMappedArrays', 'nonNegativeDerivative', 'nothing', 'nothingNull', 'nothingUInt64', 'nth_value', 'ntile', 'quantile', 'quantileBFloat16', 'quantileBFloat16Weighted', 'quantileDD', 'quantileDeterministic', 'quantileExact', 'quantileExactExclusive', 'quantileExactHigh', 'quantileExactInclusive', 'quantileExactLow', 'quantileExactWeighted', 'quantileGK', 'quantileInterpolatedWeighted', 'quantileTDigest', 'quantileTDigestWeighted', 'quantileTiming', 'quantileTimingWeighted', 'quantiles', 'quantilesBFloat16', 'quantilesBFloat16Weighted', 'quantilesDD', 'quantilesDeterministic', 'quantilesExact', 'quantilesExactExclusive', 'quantilesExactHigh', 'quantilesExactInclusive', 'quantilesExactLow', 'quantilesExactWeighted', 'quantilesGK', 'quantilesInterpolatedWeighted', 'quantilesTDigest', 'quantilesTDigestWeighted', 'quantilesTiming', 'quantilesTimingWeighted', 'rank', 'rankCorr', 'retention', 'row_number', 'sequenceCount', 'sequenceMatch', 'sequenceNextNode', 'simpleLinearRegression', 'singleValueOrNull', 'skewPop', 'skewSamp', 'sparkBar', 'sparkbar', 'stddevPop', 'stddevPopStable', 'stddevSamp', 'stddevSampStable', 'stochasticLinearRegression', 'stochasticLogisticRegression', 'studentTTest', 'sum', 'sumCount', 'sumKahan', 'sumMapFiltered', 'sumMapFilteredWithOverflow', 'sumMapWithOverflow', 'sumMappedArrays', 'sumWithOverflow', 'theilsU', 'topK', 'topKWeighted', 'uniq', 'uniqCombined', 'uniqCombined64', 'uniqExact', 'uniqHLL12', 'uniqTheta', 'uniqUpTo', 'varPop', 'varPopStable', 'varSamp', 'varSampStable', 'welchTTest', 'windowFunnel'] 118 | - How to use ClickHouse supported functions: 119 | - When using functions try always ClickHouse functions first, then SQL functions. 120 | - Do not use any ClickHouse function that is not present in the list of general functions, character insensitive functions and aggregate functions. 121 | - If the function is not present in the list, the sql query will fail, so avoid at all costs to use any function that is not present in the list. 122 | - When aliasing a column, use first the column name and then the alias. 123 | - General functions and aggregate functions are case sensitive. 
124 | - Character insensitive functions are case insensitive. 125 | - Parameters are never quoted in any case. 126 | - Use the following syntax in the SQL section for the iceberg table function: iceberg('s3://bucket/path/to/table', {{tb_secret('aws_access_key_id')}}, {{tb_secret('aws_secret_access_key')}}) 127 | - Use the following syntax in the SQL section for the postgres table function: postgresql('host:port', 'database', 'table', {{tb_secret('db_username')}}, {{tb_secret('db_password')}}), 'schema') 128 | 129 | 130 | 131 | 132 | DESCRIPTION > 133 | Some meaningful description of the datasource 134 | 135 | SCHEMA > 136 | `column_name_1` clickhouse_tinybird_compatible_data_type `json:$.column_name_1`, 137 | `column_name_2` clickhouse_tinybird_compatible_data_type `json:$.column_name_2`, 138 | ... 139 | `column_name_n` clickhouse_tinybird_compatible_data_type `json:$.column_name_n` 140 | 141 | ENGINE "MergeTree" 142 | ENGINE_PARTITION_KEY "partition_key" 143 | ENGINE_SORTING_KEY "sorting_key_1, sorting_key_2, ..." 144 | 145 | 146 | 147 | 148 | DESCRIPTION > 149 | Some meaningful description of the pipe 150 | 151 | NODE node_1 152 | SQL > 153 | [sql query using clickhouse syntax and tinybird templating syntax and starting always with SELECT or % 154 | SELECT] 155 | TYPE endpoint 156 | 157 | 158 | 159 | 160 | 161 | - Do not create copy pipes by default, unless the user asks for it. 162 | - Copy pipes should be created in the /copies folder. 163 | - In a .pipe file you can define how to export the result of a Pipe to a Data Source, optionally with a schedule. 164 | - Do not include COPY_SCHEDULE in the .pipe file unless is specifically requested by the user. 165 | - COPY_SCHEDULE is a cron expression that defines the schedule of the copy pipe. 166 | - COPY_SCHEDULE is optional and if not provided, the copy pipe will be executed only once. 167 | - TARGET_DATASOURCE is the name of the Data Source to export the result to. 168 | - TYPE COPY is the type of the pipe and it is mandatory for copy pipes. 169 | - If the copy pipe uses parameters, you must include the % character and a newline on top of every query to be able to use the parameters. 170 | - The content of the .pipe file must follow this format: 171 | DESCRIPTION Copy Pipe to export sales hour every hour to the sales_hour_copy Data Source 172 | 173 | NODE daily_sales 174 | SQL > 175 | % 176 | SELECT toStartOfDay(starting_date) day, country, sum(sales) as total_sales 177 | FROM teams 178 | WHERE 179 | day BETWEEN toStartOfDay(now()) - interval 1 day AND toStartOfDay(now()) 180 | and country = {{ String(country, 'US')}} 181 | GROUP BY day, country 182 | 183 | TYPE COPY 184 | TARGET_DATASOURCE sales_hour_copy 185 | COPY_SCHEDULE 0 * * * * 186 | 187 | 188 | 189 | 190 | - Do not create materialized pipes by default, unless the user asks for it. 191 | - Materialized pipes should be created in the /materializations folder. 192 | - In a .pipe file you can define how to materialize each row ingested in the earliest Data Source in the Pipe query to a materialized Data Source. Materialization happens at ingest. 193 | - DATASOURCE: Required when TYPE is MATERIALIZED. Sets the target Data Source for materialized nodes. 194 | - TYPE MATERIALIZED is the type of the pipe and it is mandatory for materialized pipes. 195 | - The content of the .pipe file must follow the materialized_pipe_content format. 196 | - Use State modifier for the aggregated columns in the pipe. 
197 | 198 | 199 | NODE daily_sales 200 | SQL > 201 | SELECT toStartOfDay(starting_date) day, country, sumState(sales) as total_sales 202 | FROM teams 203 | GROUP BY day, country 204 | 205 | TYPE MATERIALIZED 206 | DATASOURCE sales_by_hour 207 | 208 | 209 | - The target datasource of a materialized pipe must have an AggregatingMergeTree engine. 210 | - Use AggregateFunction for the aggregated columns in the pipe. 211 | - Pipes using a materialized data source must use the Merge modifier in the SQL query for the aggregated columns. Example: sumMerge(total_sales) 212 | - Put all dimensions in the ENGINE_SORTING_KEY, sorted from least to most cardinality. 213 | 214 | 215 | SCHEMA > 216 | `total_sales` AggregateFunction(sum, Float64), 217 | `sales_count` AggregateFunction(count, UInt64), 218 | `column_name_2` AggregateFunction(avg, Float64), 219 | `dimension_1` String, 220 | `dimension_2` String, 221 | ... 222 | `date` DateTime 223 | 224 | ENGINE "AggregatingMergeTree" 225 | ENGINE_PARTITION_KEY "toYYYYMM(date)" 226 | ENGINE_SORTING_KEY "date, dimension_1, dimension_2, ..." 227 | 228 | 229 | 230 | 231 | - Do not create sink pipes by default, unless the user asks for it. 232 | - Sink pipes should be created in the /sinks folder. 233 | - In a .pipe file you can define how to export the result of a Pipe to an external system, optionally with a schedule. 234 | - Valid external systems are Kafka, S3, GCS. 235 | - Sink pipes depend on a connection, if no connection is provided, search for an existing connection that suits the request. If none, create a new connection. 236 | - Do not include EXPORT_SCHEDULE in the .pipe file unless is specifically requested by the user. 237 | - EXPORT_SCHEDULE is a cron expression that defines the schedule of the sink pipe. 238 | - EXPORT_SCHEDULE is optional and if not provided, the sink pipe will be executed only once. 239 | - EXPORT_CONNECTION_NAME is the name of the connection used to export. 240 | - TYPE SINK is the type of the pipe and it is mandatory for sink pipes. 241 | - If the sink pipe uses parameters, you must include the % character and a newline on top of every query to be able to use the parameters. 242 | - The content of the .pipe file must follow this format: 243 | DESCRIPTION Sink Pipe to export sales hour every hour using my_connection 244 | 245 | NODE daily_sales 246 | SQL > 247 | % 248 | SELECT toStartOfDay(starting_date) day, country, sum(sales) as total_sales 249 | FROM teams 250 | WHERE 251 | day BETWEEN toStartOfDay(now()) - interval 1 day AND toStartOfDay(now()) 252 | and country = {{ String(country, 'US')}} 253 | GROUP BY day, country 254 | 255 | TYPE sink 256 | EXPORT_CONNECTION_NAME "my_connection" 257 | EXPORT_BUCKET_URI "s3://tinybird-sinks" 258 | EXPORT_FILE_TEMPLATE "daily_prices" 259 | EXPORT_SCHEDULE "*/5 * * * *" 260 | EXPORT_FORMAT "csv" 261 | EXPORT_COMPRESSION "gz" 262 | EXPORT_STRATEGY "truncate" 263 | 264 | 265 | 266 | 267 | - Content cannot be empty. 268 | - The connection names must be unique. 
269 | - No indentation is allowed for property names 270 | - We support kafka, gcs and s3 connections for now 271 | 272 | 273 | 274 | 275 | TYPE kafka 276 | KAFKA_BOOTSTRAP_SERVERS {{ tb_secret("PRODUCTION_KAFKA_SERVERS", "localhost:9092") }} 277 | KAFKA_SECURITY_PROTOCOL SASL_SSL 278 | KAFKA_SASL_MECHANISM PLAIN 279 | KAFKA_KEY {{ tb_secret("PRODUCTION_KAFKA_USERNAME", "") }} 280 | KAFKA_SECRET {{ tb_secret("PRODUCTION_KAFKA_PASSWORD", "") }} 281 | 282 | 283 | 284 | 285 | TYPE gcs 286 | GCS_SERVICE_ACCOUNT_CREDENTIALS_JSON {{ tb_secret("PRODUCTION_GCS_SERVICE_ACCOUNT_CREDENTIALS_JSON", "") }} 287 | 288 | 289 | 290 | 291 | TYPE gcs 292 | GCS_HMAC_ACCESS_ID {{ tb_secret("gcs_hmac_access_id") }} 293 | GCS_HMAC_SECRET {{ tb_secret("gcs_hmac_secret") }} 294 | 295 | 296 | 297 | 298 | TYPE s3 299 | S3_REGION {{ tb_secret("PRODUCTION_S3_REGION", "") }} 300 | S3_ARN {{ tb_secret("PRODUCTION_S3_ARN", "") }} 301 | 302 | 303 | 304 | ## .test file instructions 305 | Follow these instructions when creating or updating .yaml files for tests: 306 | 307 | - The test file name must match the name of the pipe it is testing. 308 | - Every scenario name must be unique inside the test file. 309 | - When looking for the parameters available, you will find them in the pipes in the following format: {{{{String(my_param_name, default_value)}}}}. 310 | - If there are no parameters, you can omit parameters and generate a single test. 311 | - The format of the parameters is the following: param1=value1¶m2=value2¶m3=value3 312 | - If some parameters are provided by the user and you need to use them, preserve in the same format as they were provided, like case sensitive 313 | - Test as many scenarios as possible. 314 | - The format of the test file is the following: 315 | 316 | - name: kpis_single_day 317 | description: Test hourly granularity for a single day 318 | parameters: date_from=2024-01-01&date_to=2024-01-01 319 | expected_result: | 320 | {"date":"2024-01-01 00:00:00","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 321 | {"date":"2024-01-01 01:00:00","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 322 | 323 | - name: kpis_date_range 324 | description: Test daily granularity for a date range 325 | parameters: date_from=2024-01-01&date_to=2024-01-31 326 | expected_result: | 327 | {"date":"2024-01-01","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 328 | {"date":"2024-01-02","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 329 | 330 | - name: kpis_default_range 331 | description: Test default behavior without date parameters (last 7 days) 332 | parameters: '' 333 | expected_result: | 334 | {"date":"2025-01-10","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 335 | {"date":"2025-01-11","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 336 | 337 | - name: kpis_fixed_time 338 | description: Test with fixed timestamp for consistent testing 339 | parameters: fixed_time=2024-01-15T12:00:00 340 | expected_result: '' 341 | 342 | - name: kpis_single_day 343 | description: Test single day with hourly granularity 344 | parameters: date_from=2024-01-01&date_to=2024-01-01 345 | expected_result: | 346 | {"date":"2024-01-01 00:00:00","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 347 | {"date":"2024-01-01 01:00:00","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 348 | 349 | 350 | 351 | 352 | ## Deployment instructions 353 | Follow these instructions when evolving a datasource schema: 354 | 355 | - 
When you make schema changes that are incompatible with the old schema, you must use a forward query in your data source. Forward queries are necessary when introducing breaking changes. Otherwise, your deployment will fail due to a schema mismatch. 356 | - Forward queries translate the old schema to a new one that you define in the .datasource file. This helps you evolve your schema while continuing to ingest data. 357 | Follow these steps to evolve your schema using a forward query: 358 | - Edit the .datasource file to add a forward query. 359 | - Run tb deploy --check to validate the deployment before creating it. 360 | - Deploy and promote your changes in Tinybird Cloud using {base_command} --cloud deploy. 361 | 362 | SCHEMA > 363 | `timestamp` DateTime `json:$.timestamp`, 364 | `session_id` UUID `json:$.session_id`, 365 | `action` String `json:$.action`, 366 | `version` String `json:$.version`, 367 | `payload` String `json:$.payload` 368 | 369 | FORWARD_QUERY > 370 | select timestamp, toUUID(session_id) as session_id, action, version, payload 371 | 372 | 373 | 374 | -------------------------------------------------------------------------------- /tests/integration/project/.cursorrules: -------------------------------------------------------------------------------- 1 | 2 | You are an expert in SQL and Tinybird. Follow these instructions when working with .datasource and .pipe files: 3 | 4 | 5 | You have commands at your disposal to develop a tinybird project: 6 | - tb build: to build the project locally and check it works. 7 | - tb deployment create --wait --auto: to create a deployment and promote it automatically 8 | - tb test run: to run existing tests 9 | - tb endpoint url : to get the url of an endpoint, token included. 10 | - tb endpoint data : to get the data of an endpoint. You can pass parameters to the endpoint like this: tb endpoint data --param1 value1 --param2 value2 11 | - tb token ls: to list all the tokens 12 | There are other commands that you can use, but these are the most common ones. Run `tb -h` to see all the commands if needed. 13 | When you need to work with resources or data in cloud, add always the --cloud flag before the command. Example: tb --cloud datasource ls 14 | 15 | 16 | - When asking to create a tinybird data project, if the needed folders are not already created, use the following structure: 17 | ├── connections 18 | ├── copies 19 | ├── sinks 20 | ├── datasources 21 | ├── endpoints 22 | ├── fixtures 23 | ├── materializations 24 | ├── pipes 25 | └── tests 26 | - The local development server will be available at http://localhost:7181. Even if some response uses another base url, use always http://localhost:7181. 27 | - After every change in your .datasource, .pipe or .ndjson files, run `tb build` to build the project locally. 28 | - When you need to ingest data locally in a datasource, create a .ndjson file with the same name of the datasource and the data you want and run `tb build` so the data is ingested. 29 | - The format of the generated api endpoint urls is: http://localhost:7181/v0/pipe/.json?token= 30 | - Before running the tests, remember to have the project built with `tb build` with the latest changes. 31 | 32 | When asking for ingesting data, adding data or appending data do the following depending on the environment you want to work with: 33 | 34 | - When building locally, create a .ndjson file with the data you want to ingest and do `tb build` to ingest the data in the build env. 35 | - We call `cloud` the production environment. 
36 | - When appending data in cloud, use `tb --cloud datasource append <datasource_name> <file_or_url>` 37 | - When you have a response that says “there are rows in quarantine”, run `tb [--cloud] datasource data <datasource_name>_quarantine` to understand what the problem is. 38 | 39 | 40 | Follow these instructions when creating or updating .datasource files: 41 | 42 | 43 | - Content cannot be empty. 44 | - The datasource names must be unique. 45 | - No indentation is allowed for property names: DESCRIPTION, SCHEMA, ENGINE, ENGINE_PARTITION_KEY, ENGINE_SORTING_KEY, etc. 46 | - Use MergeTree engine by default. 47 | - Use AggregatingMergeTree engine when the datasource is the target of a materialized pipe. 48 | - Always use JSON paths to define the schema. Example: `user_id` String `json:$.user_id`, 49 | - Array columns are supported with a special syntax. Example: `items` Array(String) `json:$.items[:]` 50 | - If the datasource is using an S3 or GCS connection, you need to set IMPORT_CONNECTION_NAME, IMPORT_BUCKET_URI and IMPORT_SCHEDULE (GCS @on-demand only, S3 supports @auto too) 51 | - If the datasource is using a Kafka connection, you need to set KAFKA_CONNECTION_NAME as the name of the .connection file, KAFKA_TOPIC as the topic name and KAFKA_GROUP_ID as the group id for the datasource 52 | - Unless the user asks for them, do not include ENGINE_PARTITION_KEY and ENGINE_PRIMARY_KEY. 53 | - DateTime64 type without precision is not supported. Use DateTime64(3) instead. 54 | 55 | 56 | 57 | 58 | 59 | Follow these instructions when creating or updating .pipe files: 60 | 61 | Follow these instructions when creating or updating any type of .pipe file: 62 | 63 | - The pipe names must be unique. 64 | - Nodes do NOT use the same name as the Pipe they belong to. So if the pipe name is "my_pipe", the nodes must be named differently, e.g. "my_pipe_node_1", "my_pipe_node_2", etc. 65 | - Node names MUST be different from the resource names in the project. 66 | - No indentation is allowed for property names: DESCRIPTION, NODE, SQL, TYPE, etc. 67 | - Allowed TYPE values are: endpoint, copy, materialized, sink. 68 | - Always add the output node in the TYPE section or in the last node of the pipe. 69 | 70 | 71 | 72 | 73 | - The SQL query must be a valid ClickHouse SQL query that mixes ClickHouse syntax and Tinybird templating syntax (Tornado templating language under the hood). 74 | - SQL queries with parameters must start with the "%" character and a newline on top of every query to be able to use the parameters. Examples: 75 | Incorrect (missing the "%" line): 76 | SELECT * FROM events WHERE session_id={{String(my_param, "default_value")}} 77 | 78 | Correct: 79 | % 80 | SELECT * FROM events WHERE session_id={{String(my_param, "default_value")}} 81 | 82 | - The Parameter functions like this one {{String(my_param_name,default_value)}} can be one of the following: String, DateTime, Date, Float32, Float64, Int, Integer, UInt8, UInt16, UInt32, UInt64, UInt128, UInt256, Int8, Int16, Int32, Int64, Int128, Int256 83 | - Parameter names must be different from column names. Always pass the param name and a default value to the function. 84 | - ALWAYS use hardcoded values for parameter default values. 85 | - Code inside the template {{template_expression}} follows the rules of the Tornado templating language, so no module is allowed to be imported. For example, you can't use now() as the default value for a DateTime parameter.
You need an if/else block like this: 86 | Incorrect (now() used as a default value): 87 | AND timestamp BETWEEN {{DateTime(start_date, now() - interval 30 day)}} AND {{DateTime(end_date, now())}} 88 | 89 | Correct: 90 | {%if not defined(start_date)%} 91 | timestamp BETWEEN now() - interval 30 day 92 | {%else%} 93 | timestamp BETWEEN {{DateTime(start_date)}} 94 | {%end%} 95 | {%if not defined(end_date)%} 96 | AND now() 97 | {%else%} 98 | AND {{DateTime(end_date)}} 99 | {%end%} 100 | 101 | - Parameters must not be quoted. 102 | - When you use the defined function with a parameter inside, do NOT add quotes around the parameter: 103 | Incorrect: {% if defined('my_param') %} 104 | Correct: {% if defined(my_param) %} 105 | - Use datasource names as table names when doing SELECT statements. 106 | - Do not use pipe names as table names. 107 | - The available datasource names to use in the SQL are the ones present in the existing_resources section or the ones you will create. 108 | - Use node names as table names only when nodes are present in the same file. 109 | - Do not reference the current node name in the SQL. 110 | - SQL queries only accept SELECT statements with conditions, aggregations, joins, etc. 111 | - Do NOT use CREATE TABLE, INSERT INTO, CREATE DATABASE, etc. 112 | - Use ONLY SELECT statements in the SQL section. 113 | - INSERT INTO is not supported in the SQL section. 114 | - ClickHouse functions supported are: 115 | - General functions supported are: ['BLAKE3', 'CAST', 'CHARACTER_LENGTH', 'CHAR_LENGTH', 'CRC32', 'CRC32IEEE', 'CRC64', 'DATABASE', 'DATE', 'DATE_DIFF', 'DATE_FORMAT', 'DATE_TRUNC', 'DAY', 'DAYOFMONTH', 'DAYOFWEEK', 'DAYOFYEAR', 'FORMAT_BYTES', 'FQDN', 'FROM_BASE64', 'FROM_DAYS', 'FROM_UNIXTIME', 'HOUR', 'INET6_ATON', 'INET6_NTOA', 'INET_ATON', 'INET_NTOA', 'IPv4CIDRToRange', 'IPv4NumToString', 'IPv4NumToStringClassC', 'IPv4StringToNum', 'IPv4StringToNumOrDefault', 'IPv4StringToNumOrNull', 'IPv4ToIPv6', 'IPv6CIDRToRange', 'IPv6NumToString', 'IPv6StringToNum', 'IPv6StringToNumOrDefault', 'IPv6StringToNumOrNull', 'JSONArrayLength', 'JSONExtract', 'JSONExtractArrayRaw', 'JSONExtractBool', 'JSONExtractFloat', 'JSONExtractInt', 'JSONExtractKeys', 'JSONExtractKeysAndValues', 'JSONExtractKeysAndValuesRaw', 'JSONExtractRaw', 'JSONExtractString', 'JSONExtractUInt', 'JSONHas', 'JSONKey', 'JSONLength', 'JSONRemoveDynamoDBAnnotations', 'JSONType', 'JSON_ARRAY_LENGTH', 'JSON_EXISTS', 'JSON_QUERY', 'JSON_VALUE', 'L1Distance', 'L1Norm', 'L1Normalize', 'L2Distance', 'L2Norm', 'L2Normalize', 'L2SquaredDistance', 'L2SquaredNorm', 'LAST_DAY', 'LinfDistance', 'LinfNorm', 'LinfNormalize', 'LpDistance', 'LpNorm', 'LpNormalize', 'MACNumToString', 'MACStringToNum', 'MACStringToOUI', 'MAP_FROM_ARRAYS', 'MD4', 'MD5', 'MILLISECOND', 'MINUTE', 'MONTH', 'OCTET_LENGTH', 'QUARTER', 'REGEXP_EXTRACT', 'REGEXP_MATCHES', 'REGEXP_REPLACE', 'SCHEMA', 'SECOND', 'SHA1', 'SHA224', 'SHA256', 'SHA384', 'SHA512', 'SHA512_256', 'SUBSTRING_INDEX', 'SVG', 'TIMESTAMP_DIFF', 'TO_BASE64', 'TO_DAYS', 'TO_UNIXTIME', 'ULIDStringToDateTime', 'URLHash', 'URLHierarchy', 'URLPathHierarchy', 'UTCTimestamp', 'UTC_timestamp', 'UUIDNumToString', 'UUIDStringToNum', 'UUIDToNum', 'UUIDv7ToDateTime', 'YEAR', 'YYYYMMDDToDate', 'YYYYMMDDToDate32', 'YYYYMMDDhhmmssToDateTime', 'YYYYMMDDhhmmssToDateTime64'] 116 | - Character insensitive functions supported are: ['cast', 'character_length', 'char_length', 'crc32', 'crc32ieee', 'crc64', 'database', 'date', 'date_format', 'date_trunc', 'day', 'dayofmonth', 'dayofweek', 'dayofyear', 'format_bytes', 'fqdn', 'from_base64', 'from_days', 'from_unixtime', 'hour', 'inet6_aton',
'inet6_ntoa', 'inet_aton', 'inet_ntoa', 'json_array_length', 'last_day', 'millisecond', 'minute', 'month', 'octet_length', 'quarter', 'regexp_extract', 'regexp_matches', 'regexp_replace', 'schema', 'second', 'substring_index', 'to_base64', 'to_days', 'to_unixtime', 'utctimestamp', 'utc_timestamp', 'year'] 117 | - Aggregate functions supported are: ['BIT_AND', 'BIT_OR', 'BIT_XOR', 'COVAR_POP', 'COVAR_SAMP', 'STD', 'STDDEV_POP', 'STDDEV_SAMP', 'VAR_POP', 'VAR_SAMP', 'aggThrow', 'analysisOfVariance', 'anova', 'any', 'anyHeavy', 'anyLast', 'anyLast_respect_nulls', 'any_respect_nulls', 'any_value', 'any_value_respect_nulls', 'approx_top_count', 'approx_top_k', 'approx_top_sum', 'argMax', 'argMin', 'array_agg', 'array_concat_agg', 'avg', 'avgWeighted', 'boundingRatio', 'categoricalInformationValue', 'contingency', 'corr', 'corrMatrix', 'corrStable', 'count', 'covarPop', 'covarPopMatrix', 'covarPopStable', 'covarSamp', 'covarSampMatrix', 'covarSampStable', 'cramersV', 'cramersVBiasCorrected', 'deltaSum', 'deltaSumTimestamp', 'dense_rank', 'entropy', 'exponentialMovingAverage', 'exponentialTimeDecayedAvg', 'exponentialTimeDecayedCount', 'exponentialTimeDecayedMax', 'exponentialTimeDecayedSum', 'first_value', 'first_value_respect_nulls', 'flameGraph', 'groupArray', 'groupArrayInsertAt', 'groupArrayIntersect', 'groupArrayLast', 'groupArrayMovingAvg', 'groupArrayMovingSum', 'groupArraySample', 'groupArraySorted', 'groupBitAnd', 'groupBitOr', 'groupBitXor', 'groupBitmap', 'groupBitmapAnd', 'groupBitmapOr', 'groupBitmapXor', 'groupUniqArray', 'histogram', 'intervalLengthSum', 'kolmogorovSmirnovTest', 'kurtPop', 'kurtSamp', 'lagInFrame', 'largestTriangleThreeBuckets', 'last_value', 'last_value_respect_nulls', 'leadInFrame', 'lttb', 'mannWhitneyUTest', 'max', 'maxIntersections', 'maxIntersectionsPosition', 'maxMappedArrays', 'meanZTest', 'median', 'medianBFloat16', 'medianBFloat16Weighted', 'medianDD', 'medianDeterministic', 'medianExact', 'medianExactHigh', 'medianExactLow', 'medianExactWeighted', 'medianGK', 'medianInterpolatedWeighted', 'medianTDigest', 'medianTDigestWeighted', 'medianTiming', 'medianTimingWeighted', 'min', 'minMappedArrays', 'nonNegativeDerivative', 'nothing', 'nothingNull', 'nothingUInt64', 'nth_value', 'ntile', 'quantile', 'quantileBFloat16', 'quantileBFloat16Weighted', 'quantileDD', 'quantileDeterministic', 'quantileExact', 'quantileExactExclusive', 'quantileExactHigh', 'quantileExactInclusive', 'quantileExactLow', 'quantileExactWeighted', 'quantileGK', 'quantileInterpolatedWeighted', 'quantileTDigest', 'quantileTDigestWeighted', 'quantileTiming', 'quantileTimingWeighted', 'quantiles', 'quantilesBFloat16', 'quantilesBFloat16Weighted', 'quantilesDD', 'quantilesDeterministic', 'quantilesExact', 'quantilesExactExclusive', 'quantilesExactHigh', 'quantilesExactInclusive', 'quantilesExactLow', 'quantilesExactWeighted', 'quantilesGK', 'quantilesInterpolatedWeighted', 'quantilesTDigest', 'quantilesTDigestWeighted', 'quantilesTiming', 'quantilesTimingWeighted', 'rank', 'rankCorr', 'retention', 'row_number', 'sequenceCount', 'sequenceMatch', 'sequenceNextNode', 'simpleLinearRegression', 'singleValueOrNull', 'skewPop', 'skewSamp', 'sparkBar', 'sparkbar', 'stddevPop', 'stddevPopStable', 'stddevSamp', 'stddevSampStable', 'stochasticLinearRegression', 'stochasticLogisticRegression', 'studentTTest', 'sum', 'sumCount', 'sumKahan', 'sumMapFiltered', 'sumMapFilteredWithOverflow', 'sumMapWithOverflow', 'sumMappedArrays', 'sumWithOverflow', 'theilsU', 'topK', 'topKWeighted', 'uniq', 'uniqCombined', 
'uniqCombined64', 'uniqExact', 'uniqHLL12', 'uniqTheta', 'uniqUpTo', 'varPop', 'varPopStable', 'varSamp', 'varSampStable', 'welchTTest', 'windowFunnel'] 118 | - How to use ClickHouse supported functions: 119 | - When using functions, always try ClickHouse functions first, then SQL functions. 120 | - Do not use any ClickHouse function that is not present in the list of general functions, character insensitive functions and aggregate functions. 121 | - If the function is not present in the list, the SQL query will fail, so avoid at all costs using any function that is not present in the list. 122 | - When aliasing a column, put the column name first and then the alias. 123 | - General functions and aggregate functions are case sensitive. 124 | - Character insensitive functions are case insensitive. 125 | - Parameters are never quoted in any case. 126 | - Use the following syntax in the SQL section for the iceberg table function: iceberg('s3://bucket/path/to/table', {{tb_secret('aws_access_key_id')}}, {{tb_secret('aws_secret_access_key')}}) 127 | - Use the following syntax in the SQL section for the postgres table function: postgresql('host:port', 'database', 'table', {{tb_secret('db_username')}}, {{tb_secret('db_password')}}, 'schema') 128 | 129 | 130 | 131 | 132 | DESCRIPTION > 133 | Some meaningful description of the datasource 134 | 135 | SCHEMA > 136 | `column_name_1` clickhouse_tinybird_compatible_data_type `json:$.column_name_1`, 137 | `column_name_2` clickhouse_tinybird_compatible_data_type `json:$.column_name_2`, 138 | ... 139 | `column_name_n` clickhouse_tinybird_compatible_data_type `json:$.column_name_n` 140 | 141 | ENGINE "MergeTree" 142 | ENGINE_PARTITION_KEY "partition_key" 143 | ENGINE_SORTING_KEY "sorting_key_1, sorting_key_2, ..." 144 | 145 | 146 | 147 | 148 | DESCRIPTION > 149 | Some meaningful description of the pipe 150 | 151 | NODE node_1 152 | SQL > 153 | [sql query using clickhouse syntax and tinybird templating syntax, always starting with SELECT or with % 154 | SELECT] 155 | TYPE endpoint 156 | 157 | 158 | 159 | 160 | 161 | - Do not create copy pipes by default, unless the user asks for it. 162 | - Copy pipes should be created in the /copies folder. 163 | - In a .pipe file you can define how to export the result of a Pipe to a Data Source, optionally with a schedule. 164 | - Do not include COPY_SCHEDULE in the .pipe file unless it is specifically requested by the user. 165 | - COPY_SCHEDULE is a cron expression that defines the schedule of the copy pipe. 166 | - COPY_SCHEDULE is optional and, if not provided, the copy pipe will be executed only once. 167 | - TARGET_DATASOURCE is the name of the Data Source to export the result to. 168 | - TYPE COPY is the type of the pipe and it is mandatory for copy pipes. 169 | - If the copy pipe uses parameters, you must include the % character and a newline on top of every query to be able to use the parameters, as sketched below.
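As a minimal sketch of that parameter rule (node, column and parameter names here are hypothetical; the full required format follows next), the SQL section of a copy node simply gains the % line on top:

NODE filtered_events_copy_node
SQL >
    %
    SELECT timestamp, session_id, action
    FROM events
    WHERE action = {{String(action_filter, 'page_view')}}

The TYPE COPY, TARGET_DATASOURCE and optional COPY_SCHEDULE properties are unaffected by the % line.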
170 | - The content of the .pipe file must follow this format: 171 | DESCRIPTION Copy Pipe to export hourly sales every hour to the sales_hour_copy Data Source 172 | 173 | NODE daily_sales 174 | SQL > 175 | % 176 | SELECT toStartOfDay(starting_date) day, country, sum(sales) as total_sales 177 | FROM teams 178 | WHERE 179 | day BETWEEN toStartOfDay(now()) - interval 1 day AND toStartOfDay(now()) 180 | and country = {{ String(country, 'US')}} 181 | GROUP BY day, country 182 | 183 | TYPE COPY 184 | TARGET_DATASOURCE sales_hour_copy 185 | COPY_SCHEDULE 0 * * * * 186 | 187 | 188 | 189 | 190 | - Do not create materialized pipes by default, unless the user asks for it. 191 | - Materialized pipes should be created in the /materializations folder. 192 | - In a .pipe file you can define how each row ingested into the earliest Data Source of the Pipe's query is materialized into a target Data Source. Materialization happens at ingest time. 193 | - DATASOURCE: Required when TYPE is MATERIALIZED. Sets the target Data Source for materialized nodes. 194 | - TYPE MATERIALIZED is the type of the pipe and it is mandatory for materialized pipes. 195 | - The content of the .pipe file must follow the materialized_pipe_content format. 196 | - Use the State modifier for the aggregated columns in the pipe. 197 | 198 | 199 | NODE daily_sales 200 | SQL > 201 | SELECT toStartOfDay(starting_date) day, country, sumState(sales) as total_sales 202 | FROM teams 203 | GROUP BY day, country 204 | 205 | TYPE MATERIALIZED 206 | DATASOURCE sales_by_hour 207 | 208 | 209 | - The target datasource of a materialized pipe must have an AggregatingMergeTree engine. 210 | - Use AggregateFunction for the aggregated columns in the pipe. 211 | - Pipes using a materialized data source must use the Merge modifier in the SQL query for the aggregated columns. Example: sumMerge(total_sales) (a fuller endpoint sketch appears a bit further below). 212 | - Put all dimensions in the ENGINE_SORTING_KEY, sorted from lowest to highest cardinality. 213 | 214 | 215 | SCHEMA > 216 | `total_sales` AggregateFunction(sum, Float64), 217 | `sales_count` AggregateFunction(count, UInt64), 218 | `column_name_2` AggregateFunction(avg, Float64), 219 | `dimension_1` String, 220 | `dimension_2` String, 221 | ... 222 | `date` DateTime 223 | 224 | ENGINE "AggregatingMergeTree" 225 | ENGINE_PARTITION_KEY "toYYYYMM(date)" 226 | ENGINE_SORTING_KEY "date, dimension_1, dimension_2, ..." 227 | 228 | 229 | 230 | 231 | - Do not create sink pipes by default, unless the user asks for it. 232 | - Sink pipes should be created in the /sinks folder. 233 | - In a .pipe file you can define how to export the result of a Pipe to an external system, optionally with a schedule. 234 | - Valid external systems are Kafka, S3, GCS. 235 | - Sink pipes depend on a connection; if no connection is provided, search for an existing connection that suits the request. If none exists, create a new connection. 236 | - Do not include EXPORT_SCHEDULE in the .pipe file unless it is specifically requested by the user. 237 | - EXPORT_SCHEDULE is a cron expression that defines the schedule of the sink pipe. 238 | - EXPORT_SCHEDULE is optional and, if not provided, the sink pipe will be executed only once. 239 | - EXPORT_CONNECTION_NAME is the name of the connection used to export. 240 | - TYPE SINK is the type of the pipe and it is mandatory for sink pipes. 241 | - If the sink pipe uses parameters, you must include the % character and a newline on top of every query to be able to use the parameters.
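Complementing the Merge rule from the materialized pipe instructions above, a minimal, hypothetical endpoint sketch reading from the sales_by_hour target (column names reuse the materialized example; adjust to your schema) could look like this:

DESCRIPTION >
    Hypothetical endpoint reading a materialized target with the -Merge combinator

NODE sales_report_node
SQL >
    SELECT day, country, sumMerge(total_sales) as total_sales
    FROM sales_by_hour
    GROUP BY day, country

TYPE endpoint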
242 | - The content of the .pipe file must follow this format: 243 | DESCRIPTION Sink Pipe to export hourly sales every hour using my_connection 244 | 245 | NODE daily_sales 246 | SQL > 247 | % 248 | SELECT toStartOfDay(starting_date) day, country, sum(sales) as total_sales 249 | FROM teams 250 | WHERE 251 | day BETWEEN toStartOfDay(now()) - interval 1 day AND toStartOfDay(now()) 252 | and country = {{ String(country, 'US')}} 253 | GROUP BY day, country 254 | 255 | TYPE sink 256 | EXPORT_CONNECTION_NAME "my_connection" 257 | EXPORT_BUCKET_URI "s3://tinybird-sinks" 258 | EXPORT_FILE_TEMPLATE "daily_prices" 259 | EXPORT_SCHEDULE "*/5 * * * *" 260 | EXPORT_FORMAT "csv" 261 | EXPORT_COMPRESSION "gz" 262 | EXPORT_STRATEGY "truncate" 263 | 264 | 265 | 266 | 267 | - Content cannot be empty. 268 | - The connection names must be unique. 269 | - No indentation is allowed for property names 270 | - We support kafka, gcs and s3 connections for now 271 | 272 | 273 | 274 | 275 | TYPE kafka 276 | KAFKA_BOOTSTRAP_SERVERS {{ tb_secret("PRODUCTION_KAFKA_SERVERS", "localhost:9092") }} 277 | KAFKA_SECURITY_PROTOCOL SASL_SSL 278 | KAFKA_SASL_MECHANISM PLAIN 279 | KAFKA_KEY {{ tb_secret("PRODUCTION_KAFKA_USERNAME", "") }} 280 | KAFKA_SECRET {{ tb_secret("PRODUCTION_KAFKA_PASSWORD", "") }} 281 | 282 | 283 | 284 | 285 | TYPE gcs 286 | GCS_SERVICE_ACCOUNT_CREDENTIALS_JSON {{ tb_secret("PRODUCTION_GCS_SERVICE_ACCOUNT_CREDENTIALS_JSON", "") }} 287 | 288 | 289 | 290 | 291 | TYPE gcs 292 | GCS_HMAC_ACCESS_ID {{ tb_secret("gcs_hmac_access_id") }} 293 | GCS_HMAC_SECRET {{ tb_secret("gcs_hmac_secret") }} 294 | 295 | 296 | 297 | 298 | TYPE s3 299 | S3_REGION {{ tb_secret("PRODUCTION_S3_REGION", "") }} 300 | S3_ARN {{ tb_secret("PRODUCTION_S3_ARN", "") }} 301 | 302 | 303 | 304 | 305 | Follow these instructions when creating or updating .yaml files for tests: 306 | 307 | - The test file name must match the name of the pipe it is testing. 308 | - Every scenario name must be unique inside the test file. 309 | - When looking for the available parameters, you will find them in the pipes in the following format: {{String(my_param_name, default_value)}}. 310 | - If there are no parameters, you can omit parameters and generate a single test. 311 | - The format of the parameters is the following: param1=value1&param2=value2&param3=value3 (see the scenario sketch below). 312 | - If some parameters are provided by the user and you need to use them, preserve them in the same format as they were provided, including case. 313 | - Test as many scenarios as possible.
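For example, a hypothetical mapping from pipe parameters to a scenario: if a pipe defines {{String(action_filter, 'page_view')}} and {{Date(date_from, '2024-01-01')}}, one scenario entry could be:

- name: events_by_action_click
  description: Test filtering by the click action from a given date
  parameters: action_filter=click&date_from=2024-01-02
  expected_result: ''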
314 | - The format of the test file is the following: 315 | 316 | - name: kpis_single_day 317 | description: Test hourly granularity for a single day 318 | parameters: date_from=2024-01-01&date_to=2024-01-01 319 | expected_result: | 320 | {"date":"2024-01-01 00:00:00","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 321 | {"date":"2024-01-01 01:00:00","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 322 | 323 | - name: kpis_date_range 324 | description: Test daily granularity for a date range 325 | parameters: date_from=2024-01-01&date_to=2024-01-31 326 | expected_result: | 327 | {"date":"2024-01-01","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 328 | {"date":"2024-01-02","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 329 | 330 | - name: kpis_default_range 331 | description: Test default behavior without date parameters (last 7 days) 332 | parameters: '' 333 | expected_result: | 334 | {"date":"2025-01-10","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 335 | {"date":"2025-01-11","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 336 | 337 | - name: kpis_fixed_time 338 | description: Test with fixed timestamp for consistent testing 339 | parameters: fixed_time=2024-01-15T12:00:00 340 | expected_result: '' 341 | 342 | - name: kpis_single_day_hourly 343 | description: Test single day with hourly granularity 344 | parameters: date_from=2024-01-01&date_to=2024-01-01 345 | expected_result: | 346 | {"date":"2024-01-01 00:00:00","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 347 | {"date":"2024-01-01 01:00:00","visits":0,"pageviews":0,"bounce_rate":null,"avg_session_sec":0} 348 | 349 | 350 | 351 | 352 | 353 | Follow these instructions when evolving a datasource schema: 354 | 355 | - When you make schema changes that are incompatible with the old schema, you must use a forward query in your data source. Forward queries are necessary when introducing breaking changes. Otherwise, your deployment will fail due to a schema mismatch. 356 | - Forward queries translate the old schema to a new one that you define in the .datasource file. This helps you evolve your schema while continuing to ingest data. 357 | Follow these steps to evolve your schema using a forward query: 358 | - Edit the .datasource file to add a forward query. 359 | - Run tb deploy --check to validate the deployment before creating it. 360 | - Deploy and promote your changes in Tinybird Cloud using tb --cloud deploy. 361 | 362 | SCHEMA > 363 | `timestamp` DateTime `json:$.timestamp`, 364 | `session_id` UUID `json:$.session_id`, 365 | `action` String `json:$.action`, 366 | `version` String `json:$.version`, 367 | `payload` String `json:$.payload` 368 | 369 | FORWARD_QUERY > 370 | select timestamp, toUUID(session_id) as session_id, action, version, payload 371 | 372 | 373 | 374 | 375 | --------------------------------------------------------------------------------